diff --git a/build/autotools/common.am b/build/autotools/common.am
index 291ed6d0..947e8634 100644
--- a/build/autotools/common.am
+++ b/build/autotools/common.am
@@ -6,6 +6,7 @@
diff --git a/src/bullet/Bullet-C-Api.h b/src/bullet/Bullet-C-Api.h
deleted file mode 100644
index f27a17d5..00000000
--- a/src/bullet/Bullet-C-Api.h
+++ /dev/null
@@ -1,176 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-	Draft high-level generic physics C-API. For low-level access, use the physics SDK native API's.
-	Work in progress, functionality will be added on demand.
-	If possible, use the richer Bullet C++ API, by including "btBulletDynamicsCommon.h"
-#ifndef BULLET_C_API_H
-#define BULLET_C_API_H
-#define PL_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name
-typedef double	plReal;
-typedef float	plReal;
-typedef plReal	plVector3[3];
-typedef plReal	plQuaternion[4];
-#ifdef __cplusplus
-extern "C" { 
-/**	Particular physics SDK (C-API) */
-	PL_DECLARE_HANDLE(plPhysicsSdkHandle);
-/** 	Dynamics world, belonging to some physics SDK (C-API)*/
-	PL_DECLARE_HANDLE(plDynamicsWorldHandle);
-/** Rigid Body that can be part of a Dynamics World (C-API)*/	
-	PL_DECLARE_HANDLE(plRigidBodyHandle);
-/** 	Collision Shape/Geometry, property of a Rigid Body (C-API)*/
-	PL_DECLARE_HANDLE(plCollisionShapeHandle);
-/** Constraint for Rigid Bodies (C-API)*/
-	PL_DECLARE_HANDLE(plConstraintHandle);
-/** Triangle Mesh interface (C-API)*/
-	PL_DECLARE_HANDLE(plMeshInterfaceHandle);
-/** Broadphase Scene/Proxy Handles (C-API)*/
-	PL_DECLARE_HANDLE(plCollisionBroadphaseHandle);
-	PL_DECLARE_HANDLE(plBroadphaseProxyHandle);
-	PL_DECLARE_HANDLE(plCollisionWorldHandle);
-	Create and Delete a Physics SDK	
-	extern	plPhysicsSdkHandle	plNewBulletSdk(void); //this could be also another sdk, like ODE, PhysX etc.
-	extern	void		plDeletePhysicsSdk(plPhysicsSdkHandle	physicsSdk);
-/** Collision World, not strictly necessary, you can also just create a Dynamics World with Rigid Bodies which internally manages the Collision World with Collision Objects */
-	typedef void(*btBroadphaseCallback)(void* clientData, void* object1,void* object2);
-	extern plCollisionBroadphaseHandle	plCreateSapBroadphase(btBroadphaseCallback beginCallback,btBroadphaseCallback endCallback);
-	extern void	plDestroyBroadphase(plCollisionBroadphaseHandle bp);
-	extern 	plBroadphaseProxyHandle plCreateProxy(plCollisionBroadphaseHandle bp, void* clientData, plReal minX,plReal minY,plReal minZ, plReal maxX,plReal maxY, plReal maxZ);
-	extern void plDestroyProxy(plCollisionBroadphaseHandle bp, plBroadphaseProxyHandle proxyHandle);
-	extern void plSetBoundingBox(plBroadphaseProxyHandle proxyHandle, plReal minX,plReal minY,plReal minZ, plReal maxX,plReal maxY, plReal maxZ);
-/* todo: add pair cache support with queries like add/remove/find pair */
-	extern plCollisionWorldHandle plCreateCollisionWorld(plPhysicsSdkHandle physicsSdk);
-/* todo: add/remove objects */
-/* Dynamics World */
-	extern  plDynamicsWorldHandle plCreateDynamicsWorld(plPhysicsSdkHandle physicsSdk);
-	extern  void           plDeleteDynamicsWorld(plDynamicsWorldHandle world);
-	extern	void	plStepSimulation(plDynamicsWorldHandle,	plReal	timeStep);
-	extern  void plAddRigidBody(plDynamicsWorldHandle world, plRigidBodyHandle object);
-	extern  void plRemoveRigidBody(plDynamicsWorldHandle world, plRigidBodyHandle object);
-/* Rigid Body  */
-	extern  plRigidBodyHandle plCreateRigidBody(	void* user_data,  float mass, plCollisionShapeHandle cshape );
-	extern  void plDeleteRigidBody(plRigidBodyHandle body);
-/* Collision Shape definition */
-	extern  plCollisionShapeHandle plNewSphereShape(plReal radius);
-	extern  plCollisionShapeHandle plNewBoxShape(plReal x, plReal y, plReal z);
-	extern  plCollisionShapeHandle plNewCapsuleShape(plReal radius, plReal height);	
-	extern  plCollisionShapeHandle plNewConeShape(plReal radius, plReal height);
-	extern  plCollisionShapeHandle plNewCylinderShape(plReal radius, plReal height);
-	extern	plCollisionShapeHandle plNewCompoundShape(void);
-	extern	void	plAddChildShape(plCollisionShapeHandle compoundShape,plCollisionShapeHandle childShape, plVector3 childPos,plQuaternion childOrn);
-	extern  void plDeleteShape(plCollisionShapeHandle shape);
-	/* Convex Meshes */
-	extern  plCollisionShapeHandle plNewConvexHullShape(void);
-	extern  void		plAddVertex(plCollisionShapeHandle convexHull, plReal x,plReal y,plReal z);
-/* Concave static triangle meshes */
-	extern  plMeshInterfaceHandle		   plNewMeshInterface(void);
-	extern  void		plAddTriangle(plMeshInterfaceHandle meshHandle, plVector3 v0,plVector3 v1,plVector3 v2);
-	extern  plCollisionShapeHandle plNewStaticTriangleMeshShape(plMeshInterfaceHandle);
-	extern  void plSetScaling(plCollisionShapeHandle shape, plVector3 scaling);
-/* SOLID has Response Callback/Table/Management */
-/* PhysX has Triggers, User Callbacks and filtering */
-/* ODE has the typedef void dNearCallback (void *data, dGeomID o1, dGeomID o2); */
-/*	typedef void plUpdatedPositionCallback(void* userData, plRigidBodyHandle	rbHandle, plVector3 pos); */
-/*	typedef void plUpdatedOrientationCallback(void* userData, plRigidBodyHandle	rbHandle, plQuaternion orientation); */
-	/* get world transform */
-	extern void	plGetOpenGLMatrix(plRigidBodyHandle object, plReal* matrix);
-	extern void	plGetPosition(plRigidBodyHandle object,plVector3 position);
-	extern void plGetOrientation(plRigidBodyHandle object,plQuaternion orientation);
-	/* set world transform (position/orientation) */
-	extern  void plSetPosition(plRigidBodyHandle object, const plVector3 position);
-	extern  void plSetOrientation(plRigidBodyHandle object, const plQuaternion orientation);
-	extern	void plSetEuler(plReal yaw,plReal pitch,plReal roll, plQuaternion orient);
-	extern	void plSetOpenGLMatrix(plRigidBodyHandle object, plReal* matrix);
-	typedef struct plRayCastResult {
-		plRigidBodyHandle		m_body;  
-		plCollisionShapeHandle	m_shape; 		
-		plVector3				m_positionWorld; 		
-		plVector3				m_normalWorld;
-	} plRayCastResult;
-	extern  int plRayCast(plDynamicsWorldHandle world, const plVector3 rayStart, const plVector3 rayEnd, plRayCastResult res);
-	/* Sweep API */
-	/* extern  plRigidBodyHandle plObjectCast(plDynamicsWorldHandle world, const plVector3 rayStart, const plVector3 rayEnd, plVector3 hitpoint, plVector3 normal); */
-	/* Continuous Collision Detection API */
-	// needed for source/blender/blenkernel/intern/collision.c
-	double plNearestPoints(float p1[3], float p2[3], float p3[3], float q1[3], float q2[3], float q3[3], float *pa, float *pb, float normal[3]);
-#ifdef __cplusplus
-#endif //BULLET_C_API_H
diff --git a/src/bullet/Bullet3Collision/BroadPhaseCollision/b3BroadphaseCallback.h b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3BroadphaseCallback.h
new file mode 100644
index 00000000..1bc56cf8
--- /dev/null
+++ b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3BroadphaseCallback.h
@@ -0,0 +1,40 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Vector3.h"
+struct b3BroadphaseProxy;
+struct	b3BroadphaseAabbCallback // Abstract callback interface: process() is handed one broadphase proxy per call.
+	virtual ~b3BroadphaseAabbCallback() {} // virtual, so implementations can be destroyed through a base pointer
+	virtual bool	process(const b3BroadphaseProxy* proxy) = 0; // pure virtual; meaning of the bool result is not defined here - TODO confirm with callers
+struct	b3BroadphaseRayCallback : public b3BroadphaseAabbCallback // AABB callback extended with cached per-ray values.
+	///added some cached data to accelerate ray-AABB tests
+	b3Vector3		m_rayDirectionInverse; // presumably the component-wise reciprocal of the ray direction - TODO confirm
+	unsigned int	m_signs[3]; // presumably one sign flag per axis of the ray direction - TODO confirm
+	b3Scalar		m_lambda_max; // presumably the largest ray parameter to consider - TODO confirm
+	virtual ~b3BroadphaseRayCallback() {}
diff --git a/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.cpp b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.cpp
new file mode 100644
index 00000000..16991bc0
--- /dev/null
+++ b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.cpp
@@ -0,0 +1,1295 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///b3DynamicBvh implementation by Nathanael Presson
+#include "b3DynamicBvh.h"
+typedef b3AlignedObjectArray<b3DbvtNode*>			b3NodeArray;
+typedef b3AlignedObjectArray<const b3DbvtNode*>	b3ConstNodeArray;
+struct b3DbvtNodeEnumerator : b3DynamicBvh::ICollide // Collector policy: records every node passed to Process().
+	b3ConstNodeArray	nodes; // visited nodes, in the order Process() was called
+	void Process(const b3DbvtNode* n) { nodes.push_back(n); } // appends the node; nothing is filtered
+static B3_DBVT_INLINE int			b3IndexOf(const b3DbvtNode* node) // Returns 1 if node is its parent's childs[1], otherwise 0.
+	return(node->parent->childs[1]==node); // node->parent is dereferenced without a null check, so node must have a parent
+static B3_DBVT_INLINE b3DbvtVolume	b3Merge(	const b3DbvtVolume& a, // By-value wrapper: returns what the three-argument b3Merge(a,b,res) writes into res.
+									  const b3DbvtVolume& b)
+	B3_ATTRIBUTE_ALIGNED16(char locals[sizeof(b3DbvtAabbMm)]); // variant 1: aligned raw buffer that backs `res` on the next line
+	b3DbvtVolume&	res=*(b3DbvtVolume*)locals;
+		b3DbvtVolume	res; // NOTE(review): second declaration of `res`; looks like the alternative preprocessor branch whose #if/#else/#endif lines are not visible here - confirm
+	b3Merge(a,b,res);
+	return(res);
+// volume+edge lengths
+static B3_DBVT_INLINE b3Scalar		b3Size(const b3DbvtVolume& a) // Size metric of a volume: product of its three edge lengths plus their sum.
+	const b3Vector3	edges=a.Lengths();
+	return(	edges.x*edges.y*edges.z+ // product term
+		edges.x+edges.y+edges.z); // sum term
+static void						b3GetMaxDepth(const b3DbvtNode* node,int depth,int& maxdepth) // Raises maxdepth to the largest `depth` reached at a non-internal node below `node`.
+	if(node->isinternal())
+	{
+		b3GetMaxDepth(node->childs[0],depth+1,maxdepth); // each child sits one level deeper
+		b3GetMaxDepth(node->childs[1],depth+1,maxdepth);
+	} else maxdepth=b3Max(maxdepth,depth); // non-internal node: keep the larger of the running maximum and this depth
+static B3_DBVT_INLINE void			b3DeleteNode(	b3DynamicBvh* pdbvt, // Frees the node cached in pdbvt->m_free and caches `node` there instead.
+										   b3DbvtNode* node)
+	b3AlignedFree(pdbvt->m_free); // releases the previously cached node; m_free can be 0 here (clear() and b3CreateNode() both reset it)
+	pdbvt->m_free=node; // `node` itself is not freed yet; b3CreateNode() may hand it out again
+static void						b3RecurseDeleteNode(	b3DynamicBvh* pdbvt, // Deletes the subtree rooted at `node`, children first, via b3DeleteNode().
+												  b3DbvtNode* node)
+	if(!node->isleaf())
+	{
+		b3RecurseDeleteNode(pdbvt,node->childs[0]); // children are deleted before their parent
+		b3RecurseDeleteNode(pdbvt,node->childs[1]);
+	}
+	if(node==pdbvt->m_root) pdbvt->m_root=0; // the tree must not keep pointing at a deleted root
+	b3DeleteNode(pdbvt,node);
+static B3_DBVT_INLINE b3DbvtNode*	b3CreateNode(	b3DynamicBvh* pdbvt, // Returns a node with parent and data set and childs[1] cleared, reusing pdbvt->m_free when one is cached.
+										   b3DbvtNode* parent,
+										   void* data)
+	b3DbvtNode*	node;
+	if(pdbvt->m_free)
+	{ node=pdbvt->m_free;pdbvt->m_free=0; } // reuse the cached node and empty the one-slot cache
+	else
+	{ node=new(b3AlignedAlloc(sizeof(b3DbvtNode),16)) b3DbvtNode(); } // placement-new into storage from b3AlignedAlloc; the 16 is presumably the alignment - TODO confirm
+	node->parent	=	parent;
+	node->data		=	data;
+	node->childs[1]	=	0; // only childs[1] is reset here; the volume is left for the caller to fill in
+	return(node);
+static B3_DBVT_INLINE b3DbvtNode*	b3CreateNode(	b3DynamicBvh* pdbvt, // Overload: creates a node as above and copies `volume` into it.
+										   b3DbvtNode* parent,
+										   const b3DbvtVolume& volume,
+										   void* data)
+	b3DbvtNode*	node=b3CreateNode(pdbvt,parent,data); // delegates allocation/reuse to the three-argument overload
+	node->volume=volume;
+	return(node);
+static B3_DBVT_INLINE b3DbvtNode*	b3CreateNode(	b3DynamicBvh* pdbvt, // Overload: creates a node whose volume is the merge of volume0 and volume1.
+										   b3DbvtNode* parent,
+										   const b3DbvtVolume& volume0,
+										   const b3DbvtVolume& volume1,
+										   void* data)
+	b3DbvtNode*	node=b3CreateNode(pdbvt,parent,data); // delegates allocation/reuse to the three-argument overload
+	b3Merge(volume0,volume1,node->volume); // merged result is written straight into the new node
+	return(node);
+static void						b3InsertLeaf(	b3DynamicBvh* pdbvt, // Inserts `leaf` into the tree, descending from `root` to find the existing leaf it will be paired with.
+										   b3DbvtNode* root,
+										   b3DbvtNode* leaf)
+	if(!pdbvt->m_root)
+	{
+		pdbvt->m_root	=	leaf; // empty tree: the leaf becomes the root (`root` is not read on this path)
+		leaf->parent	=	0;
+	}
+	else
+	{
+		if(!root->isleaf())
+		{
+			do	{ // walk down, following at each internal node the child index returned by b3Select
+				root=root->childs[b3Select(	leaf->volume,
+					root->childs[0]->volume,
+					root->childs[1]->volume)];
+			} while(!root->isleaf());
+		}
+		b3DbvtNode*	prev=root->parent; // parent of the leaf reached above (0 if that leaf is the tree root)
+		b3DbvtNode*	node=b3CreateNode(pdbvt,prev,leaf->volume,root->volume,0); // new internal node whose volume merges the new leaf and the reached leaf
+		if(prev)
+		{
+			prev->childs[b3IndexOf(root)]	=	node; // splice the new node into the slot the reached leaf occupied
+			node->childs[0]				=	root;root->parent=node;
+			node->childs[1]				=	leaf;leaf->parent=node;
+			do	{ // refit ancestors upward, stopping at the first one that already contains its child's volume
+				if(!prev->volume.Contain(node->volume))
+					b3Merge(prev->childs[0]->volume,prev->childs[1]->volume,prev->volume);
+				else
+					break;
+				node=prev;
+			} while(0!=(prev=node->parent));
+		}
+		else
+		{
+			node->childs[0]	=	root;root->parent=node; // the reached leaf was the root: the new internal node takes over as root
+			node->childs[1]	=	leaf;leaf->parent=node;
+			pdbvt->m_root	=	node;
+		}
+	}
+static b3DbvtNode*				b3RemoveLeaf(	b3DynamicBvh* pdbvt, // Unlinks `leaf` from the tree (the leaf itself is not deleted) and returns a node to restart insertion from, or 0 if the tree became empty.
+										   b3DbvtNode* leaf)
+	if(leaf==pdbvt->m_root)
+	{
+		pdbvt->m_root=0; // the leaf was the only node
+		return(0);
+	}
+	else
+	{
+		b3DbvtNode*	parent=leaf->parent;
+		b3DbvtNode*	prev=parent->parent; // grandparent; 0 when parent is the root
+		b3DbvtNode*	sibling=parent->childs[1-b3IndexOf(leaf)];			
+		if(prev)
+		{
+			prev->childs[b3IndexOf(parent)]=sibling; // the sibling takes the parent's place under the grandparent
+			sibling->parent=prev;
+			b3DeleteNode(pdbvt,parent); // the now-unused internal node goes to the free-node cache
+			while(prev)
+			{
+				const b3DbvtVolume	pb=prev->volume; // copy taken before the refit so a change can be detected
+				b3Merge(prev->childs[0]->volume,prev->childs[1]->volume,prev->volume);
+				if(b3NotEqual(pb,prev->volume))
+				{
+					prev=prev->parent; // volume changed: keep refitting upward
+				} else break;
+			}
+			return(prev?prev:pdbvt->m_root); // first ancestor whose volume did not change, or the root if the refit reached the top
+		}
+		else
+		{								
+			pdbvt->m_root=sibling; // parent was the root: the sibling becomes the new root
+			sibling->parent=0;
+			b3DeleteNode(pdbvt,parent);
+			return(pdbvt->m_root);
+		}			
+	}
+static void						b3FetchLeaves(b3DynamicBvh* pdbvt, // Appends the nodes at the bottom of the subtree under `root` to `leaves`, deleting the internal nodes it walks through.
+											b3DbvtNode* root,
+											b3NodeArray& leaves,
+											int depth=-1)
+	if(root->isinternal()&&depth) // depth==0 stops the descent; the default -1 is non-zero and only moves away from 0 as it is decremented
+	{
+		b3FetchLeaves(pdbvt,root->childs[0],leaves,depth-1);
+		b3FetchLeaves(pdbvt,root->childs[1],leaves,depth-1);
+		b3DeleteNode(pdbvt,root); // the internal node is discarded once both subtrees are collected
+	}
+	else
+	{
+		leaves.push_back(root); // a non-internal node, or an internal node reached with depth==0, is kept as-is
+	}
+static void						b3Split(	const b3NodeArray& leaves, // Partitions `leaves` into `left`/`right` by the sign of dot(axis, center - org).
+									  b3NodeArray& left,
+									  b3NodeArray& right,
+									  const b3Vector3& org,
+									  const b3Vector3& axis)
+	left.resize(0); // any previous contents of both outputs are dropped
+	right.resize(0);
+	for(int i=0,ni=leaves.size();i<ni;++i)
+	{
+		if(b3Dot(axis,leaves[i]->volume.Center()-org)<0)
+			left.push_back(leaves[i]); // strictly negative projection
+		else
+			right.push_back(leaves[i]); // zero or positive projection
+	}
+static b3DbvtVolume				b3Bounds(	const b3NodeArray& leaves) // Returns the merge of the volumes of all nodes in `leaves`; reads leaves[0] unconditionally, so the array must not be empty.
+	B3_ATTRIBUTE_ALIGNED16(char	locals[sizeof(b3DbvtVolume)]); // variant 1: aligned raw buffer that backs `volume` on the next line
+	b3DbvtVolume&	volume=*(b3DbvtVolume*)locals;
+	volume=leaves[0]->volume;
+	b3DbvtVolume volume=leaves[0]->volume; // NOTE(review): second declaration of `volume`; looks like the alternative preprocessor branch whose #if/#else/#endif lines are not visible here - confirm
+	for(int i=1,ni=leaves.size();i<ni;++i)
+	{
+		b3Merge(volume,leaves[i]->volume,volume); // accumulate in place
+	}
+	return(volume);
+static void						b3BottomUp(	b3DynamicBvh* pdbvt, // Builds a tree over `leaves` by repeatedly pairing the two entries whose merged volume has the smallest b3Size(); on return leaves[0] is the subtree root.
+										 b3NodeArray& leaves)
+	while(leaves.size()>1)
+	{
+		b3Scalar	minsize=B3_INFINITY;
+		int			minidx[2]={-1,-1};
+		for(int i=0;i<leaves.size();++i) // exhaustive search over all unordered pairs (i<j)
+		{
+			for(int j=i+1;j<leaves.size();++j)
+			{
+				const b3Scalar	sz=b3Size(b3Merge(leaves[i]->volume,leaves[j]->volume));
+				if(sz<minsize)
+				{
+					minsize		=	sz;
+					minidx[0]	=	i;
+					minidx[1]	=	j;
+				}
+			}
+		}
+		b3DbvtNode*	n[]	=	{leaves[minidx[0]],leaves[minidx[1]]};
+		b3DbvtNode*	p	=	b3CreateNode(pdbvt,0,n[0]->volume,n[1]->volume,0); // new parent; its own parent is 0 until a later round adopts it
+		p->childs[0]		=	n[0];
+		p->childs[1]		=	n[1];
+		n[0]->parent		=	p;
+		n[1]->parent		=	p;
+		leaves[minidx[0]]	=	p; // the parent replaces the first member of the pair in the work list
+		leaves.swap(minidx[1],leaves.size()-1); // the second member is moved to the back and dropped
+		leaves.pop_back();
+	}
+static b3DbvtNode*			b3TopDown(b3DynamicBvh* pdbvt, // Builds a tree over `leaves` by recursive splitting and returns its root; sets of at most bu_treshold entries are handed to b3BottomUp(). Reads leaves[0], so `leaves` must not be empty.
+									b3NodeArray& leaves,
+									int bu_treshold)
+	static const b3Vector3	axis[]={b3MakeVector3(1,0,0), // the three candidate split axes
+		b3MakeVector3(0,1,0),
+		b3MakeVector3(0,0,1)};
+	if(leaves.size()>1)
+	{
+		if(leaves.size()>bu_treshold)
+		{
+			const b3DbvtVolume	vol=b3Bounds(leaves);
+			const b3Vector3			org=vol.Center(); // split planes pass through the centre of the overall bounds
+			b3NodeArray				sets[2];
+			int						bestaxis=-1;
+			int						bestmidp=leaves.size();
+			int						splitcount[3][2]={{0,0},{0,0},{0,0}}; // per axis: number of leaves on each side
+			int i;
+			for( i=0;i<leaves.size();++i)
+			{
+				const b3Vector3	x=leaves[i]->volume.Center()-org;
+				for(int j=0;j<3;++j)
+				{
+					++splitcount[j][b3Dot(x,axis[j])>0?1:0];
+				}
+			}
+			for( i=0;i<3;++i)
+			{
+				if((splitcount[i][0]>0)&&(splitcount[i][1]>0)) // only axes that put something on both sides qualify
+				{
+					const int	midp=(int)b3Fabs(b3Scalar(splitcount[i][0]-splitcount[i][1])); // imbalance between the two sides
+					if(midp<bestmidp)
+					{
+						bestaxis=i; // keep the most balanced axis
+						bestmidp=midp;
+					}
+				}
+			}
+			if(bestaxis>=0)
+			{
+				sets[0].reserve(splitcount[bestaxis][0]);
+				sets[1].reserve(splitcount[bestaxis][1]);
+				b3Split(leaves,sets[0],sets[1],org,axis[bestaxis]);
+			}
+			else
+			{
+				sets[0].reserve(leaves.size()/2+1); // no axis separates the leaves: alternate them between the two sets
+				sets[1].reserve(leaves.size()/2);
+				for(int i=0,ni=leaves.size();i<ni;++i)
+				{
+					sets[i&1].push_back(leaves[i]);
+				}
+			}
+			b3DbvtNode*	node=b3CreateNode(pdbvt,0,vol,0); // internal node covering the whole set
+			node->childs[0]=b3TopDown(pdbvt,sets[0],bu_treshold);
+			node->childs[1]=b3TopDown(pdbvt,sets[1],bu_treshold);
+			node->childs[0]->parent=node;
+			node->childs[1]->parent=node;
+			return(node);
+		}
+		else
+		{
+			b3BottomUp(pdbvt,leaves); // small set: build bottom-up; the result is left in leaves[0]
+			return(leaves[0]);
+		}
+	}
+	return(leaves[0]); // a single entry is its own subtree
+static B3_DBVT_INLINE b3DbvtNode*	b3Sort(b3DbvtNode* n,b3DbvtNode*& r) // If n's parent has a greater address than n, exchanges the two in the tree and returns the former parent; otherwise returns n unchanged. `r` is the root pointer to update.
+	b3DbvtNode*	p=n->parent;
+	b3Assert(n->isinternal());
+	if(p>n) // raw pointer comparison; false when p is 0, so a root node is never moved
+	{
+		const int		i=b3IndexOf(n); // slot n occupies under p
+		const int		j=1-i; // the other slot
+		b3DbvtNode*	s=p->childs[j]; // n's sibling
+		b3DbvtNode*	q=p->parent; // grandparent, 0 if p is the root
+		b3Assert(n==p->childs[i]);
+		if(q) q->childs[b3IndexOf(p)]=n; else r=n; // n takes p's place under q, or becomes the root
+		s->parent=n;
+		p->parent=n;
+		n->parent=q;
+		p->childs[0]=n->childs[0]; // p inherits n's former children
+		p->childs[1]=n->childs[1];
+		n->childs[0]->parent=p;
+		n->childs[1]->parent=p;
+		n->childs[i]=p; // p goes into the slot n used to occupy, the sibling keeps its side
+		n->childs[j]=s;
+		b3Swap(p->volume,n->volume); // volumes stay with the tree positions rather than with the node objects
+		return(p);
+	}
+	return(n);
+#if 0
+static B3_DBVT_INLINE b3DbvtNode*	walkup(b3DbvtNode* n,int count) // Follows parent links from n up to `count` times; returns 0 if the chain ends first. Sits after an `#if 0`, so it is presumably compiled out.
+	while(n&&(count--)) n=n->parent; // stops on a null node or when count reaches 0; a negative count never reaches 0
+	return(n);
+// Api
+	m_root		=	0; // NOTE(review): the enclosing signature lines are not visible; these five assignments are presumably the constructor body - confirm
+	m_free		=	0; // no cached free node
+	m_lkhd		=	-1; // same value clear() resets it to; update(leaf,volume) treats a negative value as "reinsert from the root"
+	m_leaves	=	0;
+	m_opath		=	0;
+	clear(); // NOTE(review): presumably the destructor body (signature not visible) - confirm
+void			b3DynamicBvh::clear() // Deletes every node of the tree and resets the free-node cache, lookahead, traversal stack and optimisation path.
+	if(m_root)	
+		b3RecurseDeleteNode(this,m_root); // also sets m_root to 0 when it reaches the root
+	b3AlignedFree(m_free); // releases the last node parked in the one-slot cache by b3DeleteNode()
+	m_free=0;
+	m_lkhd		=	-1;
+	m_stkStack.clear();
+	m_opath		=	0; // NOTE(review): m_leaves is not reset in the lines visible here - confirm whether that is intended
+void			b3DynamicBvh::optimizeBottomUp() // Rebuilds the whole tree from its leaves with b3BottomUp(); does nothing on an empty tree.
+	if(m_root)
+	{
+		b3NodeArray leaves;
+		leaves.reserve(m_leaves);
+		b3FetchLeaves(this,m_root,leaves); // collects the leaves and discards the existing internal nodes
+		b3BottomUp(this,leaves);
+		m_root=leaves[0]; // b3BottomUp() leaves the new root in slot 0
+	}
+void			b3DynamicBvh::optimizeTopDown(int bu_treshold) // Rebuilds the whole tree from its leaves with b3TopDown(); bu_treshold is forwarded unchanged. Does nothing on an empty tree.
+	if(m_root)
+	{
+		b3NodeArray	leaves;
+		leaves.reserve(m_leaves);
+		b3FetchLeaves(this,m_root,leaves); // collects the leaves and discards the existing internal nodes
+		m_root=b3TopDown(this,leaves,bu_treshold);
+	}
+void			b3DynamicBvh::optimizeIncremental(int passes) // Runs `passes` incremental passes; each one walks a root-to-leaf path chosen by the bits of m_opath, applies b3Sort() along it and re-inserts the leaf reached.
+	if(passes<0) passes=m_leaves; // a negative count means one pass per leaf
+	if(m_root&&(passes>0))
+	{
+		do	{
+			b3DbvtNode*		node=m_root;
+			unsigned	bit=0;
+			while(node->isinternal())
+			{
+				node=b3Sort(node,m_root)->childs[(m_opath>>bit)&1]; // bit `bit` of m_opath picks the child to descend into
+				bit=(bit+1)&(sizeof(unsigned)*8-1); // wrap the bit index at the width of unsigned
+			}
+			update(node); // one-argument call; relies on a default for `lookahead` declared outside this file - TODO confirm
+			++m_opath; // the next pass follows a different path
+		} while(--passes);
+	}
+b3DbvtNode*	b3DynamicBvh::insert(const b3DbvtVolume& volume,void* data) // Creates a leaf holding `volume` and `data`, inserts it starting from the root, and returns it.
+	b3DbvtNode*	leaf=b3CreateNode(this,0,volume,data);
+	b3InsertLeaf(this,m_root,leaf);
+	++m_leaves;
+	return(leaf);
+void			b3DynamicBvh::update(b3DbvtNode* leaf,int lookahead) // Removes `leaf` and re-inserts it with its current volume, starting the descent up to `lookahead` levels above the node b3RemoveLeaf() returned.
+	b3DbvtNode*	root=b3RemoveLeaf(this,leaf);
+	if(root)
+	{
+		if(lookahead>=0)
+		{
+			for(int i=0;(i<lookahead)&&root->parent;++i) // climb at most `lookahead` levels, stopping at the tree root
+			{
+				root=root->parent;
+			}
+		} else root=m_root; // negative lookahead: always re-insert from the tree root
+	}
+	b3InsertLeaf(this,root,leaf); // root is 0 only when the tree became empty, in which case b3InsertLeaf() does not read it
+void			b3DynamicBvh::update(b3DbvtNode* leaf,b3DbvtVolume& volume) // Removes `leaf`, assigns it `volume`, and re-inserts it using the member lookahead m_lkhd.
+	b3DbvtNode*	root=b3RemoveLeaf(this,leaf);
+	if(root)
+	{
+		if(m_lkhd>=0)
+		{
+			for(int i=0;(i<m_lkhd)&&root->parent;++i) // climb at most m_lkhd levels, stopping at the tree root
+			{
+				root=root->parent;
+			}
+		} else root=m_root; // negative m_lkhd: always re-insert from the tree root
+	}
+	leaf->volume=volume; // the new volume is applied while the leaf is detached
+	b3InsertLeaf(this,root,leaf);
+bool			b3DynamicBvh::update(b3DbvtNode* leaf,b3DbvtVolume& volume,const b3Vector3& velocity,b3Scalar margin) // Updates the leaf only if its stored volume no longer contains `volume`; returns whether an update happened. `volume` is modified in place.
+	if(leaf->volume.Contain(volume)) return(false); // still inside the stored volume: nothing to do
+	volume.Expand(b3MakeVector3(margin,margin,margin)); // same margin on all three axes
+	volume.SignedExpand(velocity);
+	update(leaf,volume);
+	return(true);
+bool			b3DynamicBvh::update(b3DbvtNode* leaf,b3DbvtVolume& volume,const b3Vector3& velocity) // As the margin overload, but the new volume is only SignedExpand()ed by `velocity`. `volume` is modified in place.
+	if(leaf->volume.Contain(volume)) return(false); // still inside the stored volume: nothing to do
+	volume.SignedExpand(velocity);
+	update(leaf,volume);
+	return(true);
+bool			b3DynamicBvh::update(b3DbvtNode* leaf,b3DbvtVolume& volume,b3Scalar margin) // As the velocity overloads, but the new volume is only Expand()ed by `margin` on each axis. `volume` is modified in place.
+	if(leaf->volume.Contain(volume)) return(false); // still inside the stored volume: nothing to do
+	volume.Expand(b3MakeVector3(margin,margin,margin));
+	update(leaf,volume);
+	return(true);
+void			b3DynamicBvh::remove(b3DbvtNode* leaf) // Unlinks `leaf` from the tree, hands it to b3DeleteNode(), and decrements the leaf count.
+	b3RemoveLeaf(this,leaf); // return value (re-insertion hint) is not needed here
+	b3DeleteNode(this,leaf); // parks the leaf in the free-node cache; the pointer must not be used afterwards
+	--m_leaves;
+void			b3DynamicBvh::write(IWriter* iwriter) const // Enumerates all nodes and reports each to `iwriter` with its index plus the indices of its parent and, for internal nodes, its children.
+	b3DbvtNodeEnumerator	nodes;
+	nodes.nodes.reserve(m_leaves*2);
+	enumNodes(m_root,nodes); // fills nodes.nodes; the enumeration order is defined by enumNodes, which is not visible here
+	iwriter->Prepare(m_root,nodes.nodes.size());
+	for(int i=0;i<nodes.nodes.size();++i)
+	{
+		const b3DbvtNode* n=nodes.nodes[i];
+		int			p=-1; // parent index; stays -1 for a node without a parent
+		if(n->parent) p=nodes.nodes.findLinearSearch(n->parent); // linear lookup per node
+		if(n->isinternal())
+		{
+			const int	c0=nodes.nodes.findLinearSearch(n->childs[0]);
+			const int	c1=nodes.nodes.findLinearSearch(n->childs[1]);
+			iwriter->WriteNode(n,i,p,c0,c1);
+		}
+		else
+		{
+			iwriter->WriteLeaf(n,i,p);
+		}	
+	}
+void			b3DynamicBvh::clone(b3DynamicBvh& dest,IClone* iclone) const // Clears `dest` and rebuilds a copy of this tree in it, calling iclone->CloneLeaf() on every copied leaf.
+	dest.clear();
+	if(m_root!=0)
+	{	
+		b3AlignedObjectArray<sStkCLN>	stack; // explicit stack of (source node, already-created destination parent)
+		stack.reserve(m_leaves);
+		stack.push_back(sStkCLN(m_root,0));
+		do	{
+			const int		i=stack.size()-1; // index of the top entry; its low bit picks the child slot below
+			const sStkCLN	e=stack[i];
+			b3DbvtNode*			n=b3CreateNode(&dest,e.parent,e.node->volume,e.node->data);
+			stack.pop_back();
+			if(e.parent!=0)
+				e.parent->childs[i&1]=n; // the two children of a node were pushed at consecutive indices, so they land in different slots
+			else
+				dest.m_root=n; // only the first entry has no parent
+			if(e.node->isinternal())
+			{
+				stack.push_back(sStkCLN(e.node->childs[0],n));
+				stack.push_back(sStkCLN(e.node->childs[1],n));
+			}
+			else
+			{
+				iclone->CloneLeaf(n); // iclone is dereferenced without a null check
+			}
+		} while(stack.size()>0);
+	}
+int				b3DynamicBvh::maxdepth(const b3DbvtNode* node) // Returns the maximum depth below `node`, counting `node` itself as depth 1; returns 0 for a null node.
+	int	depth=0;
+	if(node) b3GetMaxDepth(node,1,depth);
+	return(depth);
+int				b3DynamicBvh::countLeaves(const b3DbvtNode* node) // Returns the number of non-internal nodes in the subtree under `node`; `node` is dereferenced without a null check.
+	if(node->isinternal())
+		return(countLeaves(node->childs[0])+countLeaves(node->childs[1]));
+	else
+		return(1);
+void			b3DynamicBvh::extractLeaves(const b3DbvtNode* node,b3AlignedObjectArray<const b3DbvtNode*>& leaves) // Appends every non-internal node under `node` to `leaves`, childs[0] subtree first; the tree is not modified.
+	if(node->isinternal())
+	{
+		extractLeaves(node->childs[0],leaves);
+		extractLeaves(node->childs[1],leaves);
+	}
+	else
+	{
+		leaves.push_back(node); // existing contents of `leaves` are kept
+	}	
+#include <stdio.h>
+#include <stdlib.h>
+/Ox /Ob2 /Oi /Ot /I "." /I "..\.." /I "..\..\src" /D "NDEBUG" /D "_LIB" /D "_WINDOWS" /D "_CRT_SECURE_NO_DEPRECATE" /D "_CRT_NONSTDC_NO_DEPRECATE" /D "WIN32"
+/GF /FD /MT /GS- /Gy /arch:SSE2 /Zc:wchar_t- /Fp"..\..\out\release8\build\libbulletcollision\libbulletcollision.pch"
+/W3 /nologo /c /Wp64 /Zi /errorReport:prompt
+Benchmarking dbvt...
+World scale: 100.000000
+Extents base: 1.000000
+Extents range: 4.000000
+Leaves: 8192
+sizeof(b3DbvtVolume): 32 bytes
+sizeof(b3DbvtNode):   44 bytes
+[1] b3DbvtVolume intersections: 3499 ms (-1%)
+[2] b3DbvtVolume merges: 1934 ms (0%)
+[3] b3DynamicBvh::collideTT: 5485 ms (-21%)
+[4] b3DynamicBvh::collideTT self: 2814 ms (-20%)
+[5] b3DynamicBvh::collideTT xform: 7379 ms (-1%)
+[6] b3DynamicBvh::collideTT xform,self: 7270 ms (-2%)
+[7] b3DynamicBvh::rayTest: 6314 ms (0%),(332143 r/s)
+[8] insert/remove: 2093 ms (0%),(1001983 ir/s)
+[9] updates (teleport): 1879 ms (-3%),(1116100 u/s)
+[10] updates (jitter): 1244 ms (-4%),(1685813 u/s)
+[11] optimize (incremental): 2514 ms (0%),(1668000 o/s)
+[12] b3DbvtVolume notequal: 3659 ms (0%)
+[13] culling(OCL+fullsort): 2218 ms (0%),(461 t/s)
+[14] culling(OCL+qsort): 3688 ms (5%),(2221 t/s)
+[15] culling(KDOP+qsort): 1139 ms (-1%),(7192 t/s)
+[16] insert/remove batch(256): 5092 ms (0%),(823704 bir/s)
+[17] b3DbvtVolume select: 3419 ms (0%)
+// Helper policies and random-input generators used by b3DynamicBvh::benchmark().
+struct b3DbvtBenchmark
+	// Policy that only counts callbacks. The depth overload can additionally
+	// check that depths arrive in non-decreasing order (m_checksort).
+	struct NilPolicy : b3DynamicBvh::ICollide
+	{
+		NilPolicy() : m_pcount(0),m_depth(-B3_INFINITY),m_checksort(true)		{}
+		void	Process(const b3DbvtNode*,const b3DbvtNode*)				{ ++m_pcount; }
+		void	Process(const b3DbvtNode*)									{ ++m_pcount; }
+		void	Process(const b3DbvtNode*,b3Scalar depth)
+		{
+			++m_pcount;
+			if(m_checksort)
+			{ if(depth>=m_depth) m_depth=depth; else printf("wrong depth: %f (should be >= %f)\r\n",depth,m_depth); }
+		}
+		int			m_pcount;		// number of Process() calls seen so far
+		b3Scalar	m_depth;		// largest depth accepted so far
+		bool		m_checksort;	// when true, report depths that go backwards
+	};
+	// Policy that records every (leaf, depth) pair it is given in m_nodes.
+	struct P14 : b3DynamicBvh::ICollide
+	{
+		struct Node
+		{
+			const b3DbvtNode*	leaf;
+			b3Scalar			depth;
+		};
+		void Process(const b3DbvtNode* leaf,b3Scalar depth)
+		{
+			Node	n;
+			n.leaf	=	leaf;
+			n.depth	=	depth;
+			// Store the record: previously `n` was built and discarded, so
+			// m_nodes stayed empty no matter how many leaves were reported.
+			m_nodes.push_back(n);
+		}
+		// Three-way comparison on depth: +1 if a is shallower, -1 if deeper, 0 if equal.
+		static int sortfnc(const Node& a,const Node& b)
+		{
+			if(a.depth<b.depth) return(+1);
+			if(a.depth>b.depth) return(-1);
+			return(0);
+		}
+		b3AlignedObjectArray<Node>		m_nodes;
+	};
+	// Policy that records every leaf it is given, using the projection of the
+	// leaf's volume centre onto m_axis as its depth.
+	struct P15 : b3DynamicBvh::ICollide
+	{
+		struct Node
+		{
+			const b3DbvtNode*	leaf;
+			b3Scalar			depth;
+		};
+		void Process(const b3DbvtNode* leaf)
+		{
+			Node	n;
+			n.leaf	=	leaf;
+			// b3Dot is the dot-product helper the rest of this file uses.
+			n.depth	=	b3Dot(leaf->volume.Center(),m_axis);
+			// Store the record (it was previously discarded, see P14::Process).
+			m_nodes.push_back(n);
+		}
+		// Three-way comparison on depth: +1 if a is shallower, -1 if deeper, 0 if equal.
+		static int sortfnc(const Node& a,const Node& b)
+		{
+			if(a.depth<b.depth) return(+1);
+			if(a.depth>b.depth) return(-1);
+			return(0);
+		}
+		b3AlignedObjectArray<Node>		m_nodes;
+		b3Vector3						m_axis;	// projection axis; must be set by the caller before use
+	};
+	// Pseudo-random scalar in [0,1], driven by rand().
+	static b3Scalar			RandUnit()
+	{
+		return(rand()/(b3Scalar)RAND_MAX);
+	}
+	// Vector whose three components are independent RandUnit() values.
+	static b3Vector3		RandVector3()
+	{
+		return(b3Vector3(RandUnit(),RandUnit(),RandUnit()));
+	}
+	// Vector with each component in [-cs/2,+cs/2].
+	static b3Vector3		RandVector3(b3Scalar cs)
+	{
+		return(RandVector3()*cs-b3Vector3(cs,cs,cs)/2);
+	}
+	// Random volume built with b3DbvtVolume::FromCE from RandVector3(cs) and
+	// a per-axis value of eb plus a random amount in [0,es].
+	static b3DbvtVolume	RandVolume(b3Scalar cs,b3Scalar eb,b3Scalar es)
+	{
+		return(b3DbvtVolume::FromCE(RandVector3(cs),b3Vector3(eb,eb,eb)+RandVector3()*es));
+	}
+	// Random transform: origin from RandVector3(cs), rotation from three
+	// random angles in [0,2*pi], normalised.
+	static b3Transform		RandTransform(b3Scalar cs)
+	{
+		b3Transform	t;
+		t.setOrigin(RandVector3(cs));
+		t.setRotation(b3Quaternion(RandUnit()*B3_PI*2,RandUnit()*B3_PI*2,RandUnit()*B3_PI*2).normalized());
+		return(t);
+	}
+	// Clears dbvt and fills it with `leaves` random volumes carrying null user data.
+	static void				RandTree(b3Scalar cs,b3Scalar eb,b3Scalar es,int leaves,b3DynamicBvh& dbvt)
+	{
+		dbvt.clear();
+		for(int i=0;i<leaves;++i)
+		{
+			dbvt.insert(RandVolume(cs,eb,es),0);
+		}
+	}
+void			b3DynamicBvh::benchmark()
+	static const b3Scalar	cfgVolumeCenterScale		=	100;
+	static const b3Scalar	cfgVolumeExentsBase			=	1;
+	static const b3Scalar	cfgVolumeExentsScale		=	4;
+	static const int		cfgLeaves					=	8192;
+	static const bool		cfgEnable					=	true;
+	//[1] b3DbvtVolume intersections
+	bool					cfgBenchmark1_Enable		=	cfgEnable;
+	static const int		cfgBenchmark1_Iterations	=	8;
+	static const int		cfgBenchmark1_Reference		=	3499;
+	//[2] b3DbvtVolume merges
+	bool					cfgBenchmark2_Enable		=	cfgEnable;
+	static const int		cfgBenchmark2_Iterations	=	4;
+	static const int		cfgBenchmark2_Reference		=	1945;
+	//[3] b3DynamicBvh::collideTT
+	bool					cfgBenchmark3_Enable		=	cfgEnable;
+	static const int		cfgBenchmark3_Iterations	=	512;
+	static const int		cfgBenchmark3_Reference		=	5485;
+	//[4] b3DynamicBvh::collideTT self
+	bool					cfgBenchmark4_Enable		=	cfgEnable;
+	static const int		cfgBenchmark4_Iterations	=	512;
+	static const int		cfgBenchmark4_Reference		=	2814;
+	//[5] b3DynamicBvh::collideTT xform
+	bool					cfgBenchmark5_Enable		=	cfgEnable;
+	static const int		cfgBenchmark5_Iterations	=	512;
+	static const b3Scalar	cfgBenchmark5_OffsetScale	=	2;
+	static const int		cfgBenchmark5_Reference		=	7379;
+	//[6] b3DynamicBvh::collideTT xform,self
+	bool					cfgBenchmark6_Enable		=	cfgEnable;
+	static const int		cfgBenchmark6_Iterations	=	512;
+	static const b3Scalar	cfgBenchmark6_OffsetScale	=	2;
+	static const int		cfgBenchmark6_Reference		=	7270;
+	//[7] b3DynamicBvh::rayTest
+	bool					cfgBenchmark7_Enable		=	cfgEnable;
+	static const int		cfgBenchmark7_Passes		=	32;
+	static const int		cfgBenchmark7_Iterations	=	65536;
+	static const int		cfgBenchmark7_Reference		=	6307;
+	//[8] insert/remove
+	bool					cfgBenchmark8_Enable		=	cfgEnable;
+	static const int		cfgBenchmark8_Passes		=	32;
+	static const int		cfgBenchmark8_Iterations	=	65536;
+	static const int		cfgBenchmark8_Reference		=	2105;
+	//[9] updates (teleport)
+	bool					cfgBenchmark9_Enable		=	cfgEnable;
+	static const int		cfgBenchmark9_Passes		=	32;
+	static const int		cfgBenchmark9_Iterations	=	65536;
+	static const int		cfgBenchmark9_Reference		=	1879;
+	//[10] updates (jitter)
+	bool					cfgBenchmark10_Enable		=	cfgEnable;
+	static const b3Scalar	cfgBenchmark10_Scale		=	cfgVolumeCenterScale/10000;
+	static const int		cfgBenchmark10_Passes		=	32;
+	static const int		cfgBenchmark10_Iterations	=	65536;
+	static const int		cfgBenchmark10_Reference	=	1244;
+	//[11] optimize (incremental)
+	bool					cfgBenchmark11_Enable		=	cfgEnable;
+	static const int		cfgBenchmark11_Passes		=	64;
+	static const int		cfgBenchmark11_Iterations	=	65536;
+	static const int		cfgBenchmark11_Reference	=	2510;
+	//[12] b3DbvtVolume notequal
+	bool					cfgBenchmark12_Enable		=	cfgEnable;
+	static const int		cfgBenchmark12_Iterations	=	32;
+	static const int		cfgBenchmark12_Reference	=	3677;
+	//[13] culling(OCL+fullsort)
+	bool					cfgBenchmark13_Enable		=	cfgEnable;
+	static const int		cfgBenchmark13_Iterations	=	1024;
+	static const int		cfgBenchmark13_Reference	=	2231;
+	//[14] culling(OCL+qsort)
+	bool					cfgBenchmark14_Enable		=	cfgEnable;
+	static const int		cfgBenchmark14_Iterations	=	8192;
+	static const int		cfgBenchmark14_Reference	=	3500;
+	//[15] culling(KDOP+qsort)
+	bool					cfgBenchmark15_Enable		=	cfgEnable;
+	static const int		cfgBenchmark15_Iterations	=	8192;
+	static const int		cfgBenchmark15_Reference	=	1151;
+	//[16] insert/remove batch
+	bool					cfgBenchmark16_Enable		=	cfgEnable;
+	static const int		cfgBenchmark16_BatchCount	=	256;
+	static const int		cfgBenchmark16_Passes		=	16384;
+	static const int		cfgBenchmark16_Reference	=	5138;
+	//[17] select
+	bool					cfgBenchmark17_Enable		=	cfgEnable;
+	static const int		cfgBenchmark17_Iterations	=	4;
+	static const int		cfgBenchmark17_Reference	=	3390;
+	b3Clock					wallclock;
+	printf("Benchmarking dbvt...\r\n");
+	printf("\tWorld scale: %f\r\n",cfgVolumeCenterScale);
+	printf("\tExtents base: %f\r\n",cfgVolumeExentsBase);
+	printf("\tExtents range: %f\r\n",cfgVolumeExentsScale);
+	printf("\tLeaves: %u\r\n",cfgLeaves);
+	printf("\tsizeof(b3DbvtVolume): %u bytes\r\n",sizeof(b3DbvtVolume));
+	printf("\tsizeof(b3DbvtNode):   %u bytes\r\n",sizeof(b3DbvtNode));
+	if(cfgBenchmark1_Enable)
+	{// Benchmark 1: times the box/box overlap test over every ordered pair of random volumes
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3AlignedObjectArray<b3DbvtVolume>	volumes;
+		b3AlignedObjectArray<bool>			results;
+		volumes.resize(cfgLeaves);
+		results.resize(cfgLeaves);
+		for(int i=0;i<cfgLeaves;++i)
+		{
+			volumes[i]=b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale);
+		}
+		printf("[1] b3DbvtVolume intersections: ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark1_Iterations;++i)
+		{
+			for(int j=0;j<cfgLeaves;++j)
+			{
+				for(int k=0;k<cfgLeaves;++k)
+				{
+					// b3DynamicBvh.h declares the overlap test as b3Intersect(); the unprefixed name is not declared for b3DbvtVolume.
+					results[k]=b3Intersect(volumes[j],volumes[k]);
+				}
+			}
+		}
+		const int time=(int)wallclock.getTimeMilliseconds();
+		// Second figure: difference from the reference time, as a percentage of the measured time.
+		printf("%u ms (%i%%)\r\n",time,(time-cfgBenchmark1_Reference)*100/time);
+	}
+	if(cfgBenchmark2_Enable)
+	{// Benchmark 2: times merging every ordered pair of random volumes into a result box
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3AlignedObjectArray<b3DbvtVolume>	volumes;
+		b3AlignedObjectArray<b3DbvtVolume>	results;
+		volumes.resize(cfgLeaves);
+		results.resize(cfgLeaves);
+		for(int i=0;i<cfgLeaves;++i)
+		{
+			volumes[i]=b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale);
+		}
+		printf("[2] b3DbvtVolume merges: ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark2_Iterations;++i)
+		{
+			for(int j=0;j<cfgLeaves;++j)
+			{
+				for(int k=0;k<cfgLeaves;++k)
+				{
+					// b3DynamicBvh.h declares the merge as b3Merge(a,b,r); the unprefixed name is not declared for b3DbvtVolume.
+					b3Merge(volumes[j],volumes[k],results[k]);
+				}
+			}
+		}
+		const int time=(int)wallclock.getTimeMilliseconds();
+		// Second figure: difference from the reference time, as a percentage of the measured time.
+		printf("%u ms (%i%%)\r\n",time,(time-cfgBenchmark2_Reference)*100/time);
+	}
+	if(cfgBenchmark3_Enable)
+	{// Benchmark 3: times tree-vs-tree collision between two independently built random trees
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh						dbvt[2];
+		b3DbvtBenchmark::NilPolicy	policy;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt[0]);
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt[1]);
+		dbvt[0].optimizeTopDown();
+		dbvt[1].optimizeTopDown();
+		printf("[3] b3DynamicBvh::collideTT: ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark3_Iterations;++i)
+		{
+			// collideTT is declared as a non-static member of b3DynamicBvh, so it has to be invoked on an object.
+			dbvt[0].collideTT(dbvt[0].m_root,dbvt[1].m_root,policy);
+		}
+		const int time=(int)wallclock.getTimeMilliseconds();
+		// Second figure: difference from the reference time, as a percentage of the measured time.
+		printf("%u ms (%i%%)\r\n",time,(time-cfgBenchmark3_Reference)*100/time);
+	}
+	if(cfgBenchmark4_Enable)
+	{// Benchmark 4: times tree-vs-tree collision of one random tree against itself
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh						dbvt;
+		b3DbvtBenchmark::NilPolicy	policy;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		printf("[4] b3DynamicBvh::collideTT self: ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark4_Iterations;++i)
+		{
+			// collideTT is declared as a non-static member of b3DynamicBvh, so it has to be invoked on an object.
+			dbvt.collideTT(dbvt.m_root,dbvt.m_root,policy);
+		}
+		const int time=(int)wallclock.getTimeMilliseconds();
+		// Second figure: difference from the reference time, as a percentage of the measured time.
+		printf("%u ms (%i%%)\r\n",time,(time-cfgBenchmark4_Reference)*100/time);
+	}
+	if(cfgBenchmark5_Enable)
+	{// Benchmark 5: tree-vs-tree collision between two random trees, with a different random transform per iteration
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh								dbvt[2];
+		b3AlignedObjectArray<b3Transform>	transforms;
+		b3DbvtBenchmark::NilPolicy			policy;
+		transforms.resize(cfgBenchmark5_Iterations);	// one transform per timed iteration, generated before the clock starts
+		for(int i=0;i<transforms.size();++i)
+		{
+			transforms[i]=b3DbvtBenchmark::RandTransform(cfgVolumeCenterScale*cfgBenchmark5_OffsetScale);
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt[0]);
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt[1]);
+		dbvt[0].optimizeTopDown();
+		dbvt[1].optimizeTopDown();
+		printf("[5] b3DynamicBvh::collideTT xform: ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark5_Iterations;++i)
+		{
+			b3DynamicBvh::collideTT(dbvt[0].m_root,dbvt[1].m_root,transforms[i],policy);	// NOTE(review): b3DynamicBvh.h lists the xform overloads after an "#if 0" and declares collideTT without "static" - confirm this call still compiles
+		}
+		const int time=(int)wallclock.getTimeMilliseconds();
+		printf("%u ms (%i%%)\r\n",time,(time-cfgBenchmark5_Reference)*100/time);	// second figure: difference from the reference, as a percentage of the measured time
+	}
+	if(cfgBenchmark6_Enable)
+	{// Benchmark 6: tree-vs-tree collision of one random tree against itself, with a different random transform per iteration
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh								dbvt;
+		b3AlignedObjectArray<b3Transform>	transforms;
+		b3DbvtBenchmark::NilPolicy			policy;
+		transforms.resize(cfgBenchmark6_Iterations);	// one transform per timed iteration, generated before the clock starts
+		for(int i=0;i<transforms.size();++i)
+		{
+			transforms[i]=b3DbvtBenchmark::RandTransform(cfgVolumeCenterScale*cfgBenchmark6_OffsetScale);
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		printf("[6] b3DynamicBvh::collideTT xform,self: ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark6_Iterations;++i)
+		{
+			b3DynamicBvh::collideTT(dbvt.m_root,dbvt.m_root,transforms[i],policy);		// NOTE(review): b3DynamicBvh.h lists the xform overloads after an "#if 0" and declares collideTT without "static" - confirm this call still compiles
+		}
+		const int time=(int)wallclock.getTimeMilliseconds();
+		printf("%u ms (%i%%)\r\n",time,(time-cfgBenchmark6_Reference)*100/time);	// second figure: difference from the reference, as a percentage of the measured time
+	}
+	if(cfgBenchmark7_Enable)
+	{// Benchmark 7: ray tests against one random tree, repeating the same set of rays for several passes
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh								dbvt;
+		b3AlignedObjectArray<b3Vector3>		rayorg;
+		b3AlignedObjectArray<b3Vector3>		raydir;
+		b3DbvtBenchmark::NilPolicy			policy;
+		rayorg.resize(cfgBenchmark7_Iterations);
+		raydir.resize(cfgBenchmark7_Iterations);
+		for(int i=0;i<rayorg.size();++i)
+		{
+			rayorg[i]=b3DbvtBenchmark::RandVector3(cfgVolumeCenterScale*2);	// origin and direction are both drawn with the same scale
+			raydir[i]=b3DbvtBenchmark::RandVector3(cfgVolumeCenterScale*2);
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		printf("[7] b3DynamicBvh::rayTest: ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark7_Passes;++i)
+		{
+			for(int j=0;j<cfgBenchmark7_Iterations;++j)
+			{
+				b3DynamicBvh::rayTest(dbvt.m_root,rayorg[j],rayorg[j]+raydir[j],policy);	// ray runs from the origin to origin+direction
+			}
+		}
+		const int	time=(int)wallclock.getTimeMilliseconds();
+		unsigned	rays=cfgBenchmark7_Passes*cfgBenchmark7_Iterations;	// total number of rays cast while the clock was running
+		printf("%u ms (%i%%),(%u r/s)\r\n",time,(time-cfgBenchmark7_Reference)*100/time,(rays*1000)/time);	// elapsed ms, difference from reference as % of measured time, rays per second
+	}
+	if(cfgBenchmark8_Enable)
+	{// Benchmark 8: inserts a random volume into a prebuilt tree and removes it again straight away
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh								dbvt;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		printf("[8] insert/remove: ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark8_Passes;++i)
+		{
+			for(int j=0;j<cfgBenchmark8_Iterations;++j)
+			{
+				dbvt.remove(dbvt.insert(b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale),0));	// the leaf returned by insert() is passed directly to remove(); the data pointer is 0
+			}
+		}
+		const int	time=(int)wallclock.getTimeMilliseconds();
+		const int	ir=cfgBenchmark8_Passes*cfgBenchmark8_Iterations;	// total number of insert/remove pairs timed
+		printf("%u ms (%i%%),(%u ir/s)\r\n",time,(time-cfgBenchmark8_Reference)*100/time,ir*1000/time);	// elapsed ms, difference from reference as % of measured time, pairs per second
+	}
+	if(cfgBenchmark9_Enable)
+	{// Benchmark 9: replaces the volume of a randomly chosen leaf with a brand-new random volume ("teleport")
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh										dbvt;
+		b3AlignedObjectArray<const b3DbvtNode*>	leaves;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		dbvt.extractLeaves(dbvt.m_root,leaves);
+		printf("[9] updates (teleport): ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark9_Passes;++i)
+		{
+			for(int j=0;j<cfgBenchmark9_Iterations;++j)
+			{
+				// update() takes the volume by non-const reference, which a temporary cannot bind to in standard C++.
+				// Naming both operands also fixes the order of the two random draws (leaf first, then volume).
+				b3DbvtNode*		l=const_cast<b3DbvtNode*>(leaves[rand()%cfgLeaves]);
+				b3DbvtVolume	v=b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale);
+				dbvt.update(l,v);
+			}
+		}
+		const int	time=(int)wallclock.getTimeMilliseconds();
+		const int	up=cfgBenchmark9_Passes*cfgBenchmark9_Iterations;	// total number of updates timed
+		// Elapsed ms, difference from the reference as a percentage of the measured time, updates per second.
+		printf("%u ms (%i%%),(%u u/s)\r\n",time,(time-cfgBenchmark9_Reference)*100/time,up*1000/time);
+	}
+	if(cfgBenchmark10_Enable)
+	{// Benchmark 10: shifts the volume of a randomly chosen leaf by a small precomputed offset ("jitter")
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh										dbvt;
+		b3AlignedObjectArray<const b3DbvtNode*>	leaves;
+		b3AlignedObjectArray<b3Vector3>				vectors;
+		vectors.resize(cfgBenchmark10_Iterations);
+		for(int i=0;i<vectors.size();++i)
+		{
+			vectors[i]=(b3DbvtBenchmark::RandVector3()*2-b3Vector3(1,1,1))*cfgBenchmark10_Scale;	// NOTE(review): b3DynamicBvh.h builds vectors with b3MakeVector3(); confirm b3Vector3 has a three-argument constructor
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		dbvt.extractLeaves(dbvt.m_root,leaves);
+		printf("[10] updates (jitter): ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark10_Passes;++i)
+		{
+			for(int j=0;j<cfgBenchmark10_Iterations;++j)
+			{			
+				const b3Vector3&	d=vectors[j];	// offset applied to both corners of the leaf volume
+				b3DbvtNode*		l=const_cast<b3DbvtNode*>(leaves[rand()%cfgLeaves]);
+				b3DbvtVolume		v=b3DbvtVolume::FromMM(l->volume.Mins()+d,l->volume.Maxs()+d);	// same box, translated by d
+				dbvt.update(l,v);
+			}
+		}
+		const int	time=(int)wallclock.getTimeMilliseconds();
+		const int	up=cfgBenchmark10_Passes*cfgBenchmark10_Iterations;	// total number of updates timed
+		printf("%u ms (%i%%),(%u u/s)\r\n",time,(time-cfgBenchmark10_Reference)*100/time,up*1000/time);	// elapsed ms, difference from reference as % of measured time, updates per second
+	}
+	if(cfgBenchmark11_Enable)
+	{// Benchmark 11: times repeated incremental optimisation of a prebuilt tree
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh										dbvt;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		printf("[11] optimize (incremental): ");
+		wallclock.reset();	
+		for(int i=0;i<cfgBenchmark11_Passes;++i)
+		{
+			dbvt.optimizeIncremental(cfgBenchmark11_Iterations);
+		}
+		const int	time=(int)wallclock.getTimeMilliseconds();
+		const int	op=cfgBenchmark11_Passes*cfgBenchmark11_Iterations;	// total number of optimisation steps requested
+		// Scale in floating point, as benchmark 16 does: "op/time*1000" divided first and so
+		// rounded the rate down to a multiple of 1000, while "op*1000" could overflow int.
+		printf("%u ms (%i%%),(%u o/s)\r\n",time,(time-cfgBenchmark11_Reference)*100/time,int(op*1000.0/time));
+	}
+	if(cfgBenchmark12_Enable)
+	{// Benchmark 12: times the box inequality test over every ordered pair of random volumes
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3AlignedObjectArray<b3DbvtVolume>	volumes;
+		b3AlignedObjectArray<bool>				results;
+		volumes.resize(cfgLeaves);
+		results.resize(cfgLeaves);
+		for(int i=0;i<cfgLeaves;++i)
+		{
+			volumes[i]=b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale);
+		}
+		printf("[12] b3DbvtVolume notequal: ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark12_Iterations;++i)
+		{
+			for(int j=0;j<cfgLeaves;++j)
+			{
+				for(int k=0;k<cfgLeaves;++k)
+				{
+					// b3DynamicBvh.h declares the comparison as b3NotEqual(); the unprefixed name is not declared for b3DbvtVolume.
+					results[k]=b3NotEqual(volumes[j],volumes[k]);
+				}
+			}
+		}
+		const int time=(int)wallclock.getTimeMilliseconds();
+		// Second figure: difference from the reference time, as a percentage of the measured time.
+		printf("%u ms (%i%%)\r\n",time,(time-cfgBenchmark12_Reference)*100/time);
+	}
+	if(cfgBenchmark13_Enable)
+	{// Benchmark 13: collideOCL with its default full sort, one random unit vector per iteration
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh								dbvt;
+		b3AlignedObjectArray<b3Vector3>		vectors;
+		b3DbvtBenchmark::NilPolicy			policy;
+		vectors.resize(cfgBenchmark13_Iterations);
+		for(int i=0;i<vectors.size();++i)
+		{
+			vectors[i]=(b3DbvtBenchmark::RandVector3()*2-b3Vector3(1,1,1)).normalized();	// NOTE(review): b3DynamicBvh.h builds vectors with b3MakeVector3(); confirm b3Vector3 has a three-argument constructor
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		printf("[13] culling(OCL+fullsort): ");
+		wallclock.reset();	
+		for(int i=0;i<cfgBenchmark13_Iterations;++i)
+		{
+			static const b3Scalar	offset=0;
+			policy.m_depth=-B3_INFINITY;	// reset before every query
+			dbvt.collideOCL(dbvt.m_root,&vectors[i],&offset,vectors[i],1,policy);	// one plane (normal vectors[i], offset 0); the same vector is the sort axis
+		}
+		const int	time=(int)wallclock.getTimeMilliseconds();
+		const int	t=cfgBenchmark13_Iterations;	// number of queries timed
+		printf("%u ms (%i%%),(%u t/s)\r\n",time,(time-cfgBenchmark13_Reference)*100/time,(t*1000)/time);	// elapsed ms, difference from reference as % of measured time, queries per second
+	}
+	if(cfgBenchmark14_Enable)
+	{// Benchmark 14: collideOCL with fullsort disabled, followed by a quickSort of policy.m_nodes
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh								dbvt;
+		b3AlignedObjectArray<b3Vector3>		vectors;
+		b3DbvtBenchmark::P14				policy;
+		vectors.resize(cfgBenchmark14_Iterations);
+		for(int i=0;i<vectors.size();++i)
+		{
+			vectors[i]=(b3DbvtBenchmark::RandVector3()*2-b3Vector3(1,1,1)).normalized();	// NOTE(review): b3DynamicBvh.h builds vectors with b3MakeVector3(); confirm b3Vector3 has a three-argument constructor
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		policy.m_nodes.reserve(cfgLeaves);	// reserved up front, before the clock starts
+		printf("[14] culling(OCL+qsort): ");
+		wallclock.reset();	
+		for(int i=0;i<cfgBenchmark14_Iterations;++i)
+		{
+			static const b3Scalar	offset=0;
+			policy.m_nodes.resize(0);	// emptied before every query
+			dbvt.collideOCL(dbvt.m_root,&vectors[i],&offset,vectors[i],1,policy,false);	// trailing false is the fullsort argument
+			policy.m_nodes.quickSort(b3DbvtBenchmark::P14::sortfnc);
+		}
+		const int	time=(int)wallclock.getTimeMilliseconds();
+		const int	t=cfgBenchmark14_Iterations;	// number of queries timed
+		printf("%u ms (%i%%),(%u t/s)\r\n",time,(time-cfgBenchmark14_Reference)*100/time,(t*1000)/time);	// elapsed ms, difference from reference as % of measured time, queries per second
+	}
+	if(cfgBenchmark15_Enable)
+	{// Benchmark 15: collideKDOP against a single plane, followed by a quickSort of policy.m_nodes
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh								dbvt;
+		b3AlignedObjectArray<b3Vector3>		vectors;
+		b3DbvtBenchmark::P15				policy;
+		vectors.resize(cfgBenchmark15_Iterations);
+		for(int i=0;i<vectors.size();++i)
+		{
+			vectors[i]=(b3DbvtBenchmark::RandVector3()*2-b3Vector3(1,1,1)).normalized();	// NOTE(review): b3DynamicBvh.h builds vectors with b3MakeVector3(); confirm b3Vector3 has a three-argument constructor
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		policy.m_nodes.reserve(cfgLeaves);	// reserved up front, before the clock starts
+		printf("[15] culling(KDOP+qsort): ");
+		wallclock.reset();	
+		for(int i=0;i<cfgBenchmark15_Iterations;++i)
+		{
+			static const b3Scalar	offset=0;
+			policy.m_nodes.resize(0);	// emptied before every query
+			policy.m_axis=vectors[i];	// the policy is given the same vector that is used as the plane normal
+			dbvt.collideKDOP(dbvt.m_root,&vectors[i],&offset,1,policy);
+			policy.m_nodes.quickSort(b3DbvtBenchmark::P15::sortfnc);
+		}
+		const int	time=(int)wallclock.getTimeMilliseconds();
+		const int	t=cfgBenchmark15_Iterations;	// number of queries timed
+		printf("%u ms (%i%%),(%u t/s)\r\n",time,(time-cfgBenchmark15_Reference)*100/time,(t*1000)/time);	// elapsed ms, difference from reference as % of measured time, queries per second
+	}
+	if(cfgBenchmark16_Enable)
+	{// Benchmark 16: inserts a whole batch of random volumes, then removes the batch again, pass after pass
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3DynamicBvh								dbvt;
+		b3AlignedObjectArray<b3DbvtNode*>	batch;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale,cfgLeaves,dbvt);
+		dbvt.optimizeTopDown();
+		batch.reserve(cfgBenchmark16_BatchCount);	// reserved up front, before the clock starts
+		printf("[16] insert/remove batch(%u): ",cfgBenchmark16_BatchCount);
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark16_Passes;++i)
+		{
+			for(int j=0;j<cfgBenchmark16_BatchCount;++j)
+			{
+				batch.push_back(dbvt.insert(b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale),0));	// remember each inserted leaf so it can be removed below
+			}
+			for(int j=0;j<cfgBenchmark16_BatchCount;++j)
+			{
+				dbvt.remove(batch[j]);
+			}
+			batch.resize(0);	// empty the list for the next pass
+		}
+		const int	time=(int)wallclock.getTimeMilliseconds();
+		const int	ir=cfgBenchmark16_Passes*cfgBenchmark16_BatchCount;	// total number of insert/remove pairs timed
+		printf("%u ms (%i%%),(%u bir/s)\r\n",time,(time-cfgBenchmark16_Reference)*100/time,int(ir*1000.0/time));	// the rate is scaled in floating point before converting back to int
+	}
+	if(cfgBenchmark17_Enable)
+	{// Benchmark 17: times the closest-of-two selection, visiting the volumes through a shuffled index table
+		srand(380843);	// constant seed: every run starts from the same rand() sequence
+		b3AlignedObjectArray<b3DbvtVolume>	volumes;
+		b3AlignedObjectArray<int>			results;
+		b3AlignedObjectArray<int>			indices;
+		volumes.resize(cfgLeaves);
+		results.resize(cfgLeaves);
+		indices.resize(cfgLeaves);
+		for(int i=0;i<cfgLeaves;++i)
+		{
+			indices[i]=i;
+			volumes[i]=b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale,cfgVolumeExentsBase,cfgVolumeExentsScale);
+		}
+		// Shuffle the index table by swapping each slot with a randomly chosen one.
+		for(int i=0;i<cfgLeaves;++i)
+		{
+			b3Swap(indices[i],indices[rand()%cfgLeaves]);
+		}
+		printf("[17] b3DbvtVolume select: ");
+		wallclock.reset();
+		for(int i=0;i<cfgBenchmark17_Iterations;++i)
+		{
+			for(int j=0;j<cfgLeaves;++j)
+			{
+				for(int k=0;k<cfgLeaves;++k)
+				{
+					const int idx=indices[k];
+					// b3DynamicBvh.h declares the selection as b3Select(o,a,b); the unprefixed name is not declared for b3DbvtVolume.
+					results[idx]=b3Select(volumes[idx],volumes[j],volumes[k]);
+				}
+			}
+		}
+		const int time=(int)wallclock.getTimeMilliseconds();
+		// Second figure: difference from the reference time, as a percentage of the measured time.
+		printf("%u ms (%i%%)\r\n",time,(time-cfgBenchmark17_Reference)*100/time);
+	}
+	printf("\r\n\r\n");
diff --git a/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.h b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.h
new file mode 100644
index 00000000..ce9941ef
--- /dev/null
+++ b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.h
@@ -0,0 +1,1268 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///b3DynamicBvh implementation by Nathanael Presson
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+// Compile time configuration
+// Implementation profiles
+#define B3_DBVT_IMPL_GENERIC		0	// Generic implementation	
+#define B3_DBVT_IMPL_SSE			1	// SSE
+// Template implementation of ICollide
+#ifdef _WIN32
+#if (defined (_MSC_VER) && _MSC_VER >= 1400)
+#define	B3_DBVT_USE_TEMPLATE		1
+#define	B3_DBVT_USE_TEMPLATE		0
+#define	B3_DBVT_USE_TEMPLATE		0
+// Use only intrinsics instead of inline asm
+// Using memmove for collideOCL
+#define B3_DBVT_USE_MEMMOVE		1
+// Enable benchmarking code
+// Inlining
+// Specific methods implementation
+//SSE gives errors on a MSVC 7.1
+#if defined (B3_USE_SSE) //&& defined (_WIN32)
+#include <emmintrin.h>
+// Auto config and checks
+#define	B3_DBVT_VIRTUAL
+#define B3_DBVT_VIRTUAL_DTOR(a)
+#define B3_DBVT_PREFIX					template <typename T>
+#define B3_DBVT_IPOLICY				T& policy
+#define B3_DBVT_CHECKTYPE				static const ICollide&	typechecker=*(T*)1;(void)typechecker;
+#define	B3_DBVT_VIRTUAL_DTOR(a)		virtual ~a() {}
+#define B3_DBVT_VIRTUAL				virtual
+#define B3_DBVT_PREFIX
+#define B3_DBVT_IPOLICY				ICollide& policy
+#if !defined( __CELLOS_LV2__) && !defined(__MWERKS__)
+#include <memory.h>
+#include <string.h>
+#error "B3_DBVT_USE_TEMPLATE undefined"
+#error "B3_DBVT_USE_MEMMOVE undefined"
+#error "B3_DBVT_ENABLE_BENCHMARK undefined"
+#error "B3_DBVT_SELECT_IMPL undefined"
+#error "B3_DBVT_MERGE_IMPL undefined"
+#ifndef B3_DBVT_INT0_IMPL
+#error "B3_DBVT_INT0_IMPL undefined"
+// Default volumes
+/* b3DbvtAabbMm			*/ 
+struct	b3DbvtAabbMm	// Axis-aligned box held as two corners: mi (minimum) and mx (maximum).
+	B3_DBVT_INLINE b3Vector3			Center() const	{ return((mi+mx)/2); }	// midpoint of mi and mx
+	B3_DBVT_INLINE b3Vector3			Lengths() const	{ return(mx-mi); }	// full size along each axis
+	B3_DBVT_INLINE b3Vector3			Extents() const	{ return((mx-mi)/2); }	// half size along each axis
+	B3_DBVT_INLINE const b3Vector3&	Mins() const	{ return(mi); }	// read-only minimum corner
+	B3_DBVT_INLINE const b3Vector3&	Maxs() const	{ return(mx); }	// read-only maximum corner
+	static inline b3DbvtAabbMm		FromCE(const b3Vector3& c,const b3Vector3& e);	// from centre c and half-extents e
+	static inline b3DbvtAabbMm		FromCR(const b3Vector3& c,b3Scalar r);	// from centre c, with r as the half-extent on every axis
+	static inline b3DbvtAabbMm		FromMM(const b3Vector3& mi,const b3Vector3& mx);	// from the two corners directly
+	static inline b3DbvtAabbMm		FromPoints(const b3Vector3* pts,int n);	// bounds of n points; pts[0] is read unconditionally
+	static inline b3DbvtAabbMm		FromPoints(const b3Vector3** ppts,int n);	// same, through an array of pointers
+	B3_DBVT_INLINE void				Expand(const b3Vector3& e);	// subtracts e from mi and adds e to mx
+	B3_DBVT_INLINE void				SignedExpand(const b3Vector3& e);	// per axis: adds e to mx when e>0, otherwise adds e to mi
+	B3_DBVT_INLINE bool				Contain(const b3DbvtAabbMm& a) const;	// true when a lies inside this box (shared faces count as inside)
+	B3_DBVT_INLINE int					Classify(const b3Vector3& n,b3Scalar o,int s) const;	// -1, +1 or 0 from testing two opposite corners against dot(n,p)+o
+	B3_DBVT_INLINE b3Scalar			ProjectMinimum(const b3Vector3& v,unsigned signs) const;	// dot product of v with the corner selected by the low three bits of signs
+	B3_DBVT_INLINE friend bool			b3Intersect(	const b3DbvtAabbMm& a,
+		const b3DbvtAabbMm& b);	// box/box overlap test
+	B3_DBVT_INLINE friend bool			b3Intersect(	const b3DbvtAabbMm& a,
+		const b3Vector3& b);	// point-in-box test
+	B3_DBVT_INLINE friend b3Scalar		b3Proximity(	const b3DbvtAabbMm& a,
+		const b3DbvtAabbMm& b);	// sum of absolute per-axis differences of (mi+mx)
+	B3_DBVT_INLINE friend int			b3Select(		const b3DbvtAabbMm& o,
+		const b3DbvtAabbMm& a,
+		const b3DbvtAabbMm& b);	// 0 when a is nearer to o by b3Proximity, otherwise 1
+	B3_DBVT_INLINE friend void			b3Merge(		const b3DbvtAabbMm& a,
+		const b3DbvtAabbMm& b,
+		b3DbvtAabbMm& r);	// r becomes the box enclosing both a and b
+	B3_DBVT_INLINE friend bool			b3NotEqual(	const b3DbvtAabbMm& a,
+		const b3DbvtAabbMm& b);	// true when any x/y/z corner component differs
+    B3_DBVT_INLINE b3Vector3&	tMins()	{ return(mi); }	// writable minimum corner
+	B3_DBVT_INLINE b3Vector3&	tMaxs()	{ return(mx); }	// writable maximum corner
+	B3_DBVT_INLINE void				AddSpan(const b3Vector3& d,b3Scalar& smi,b3Scalar& smx) const;	// accumulates into smi/smx; the caller's starting values are kept
+	b3Vector3	mi,mx;	// minimum and maximum corner
+// Types	
+typedef	b3DbvtAabbMm	b3DbvtVolume;
+/* b3DbvtNode				*/ 
+struct	b3DbvtNode	// One tree node: a bounding volume, a parent link, and either two children or a payload.
+	b3DbvtVolume	volume;
+	b3DbvtNode*		parent;
+	B3_DBVT_INLINE bool	isleaf() const		{ return(childs[1]==0); }	// a node whose second child slot is null is a leaf
+	B3_DBVT_INLINE bool	isinternal() const	{ return(!isleaf()); }
+	union	// the three members share storage: data/dataAsInt overlay childs[0]
+	{
+		b3DbvtNode*	childs[2];
+		void*	data;
+		int		dataAsInt;
+	};
+///The b3DynamicBvh class implements a fast dynamic bounding volume tree based on axis aligned bounding boxes (aabb tree).
+///This b3DynamicBvh is used for soft body collision detection and for the b3DynamicBvhBroadphase. It has a fast insert, remove and update of nodes.
+///Unlike the b3QuantizedBvh, nodes can be dynamically moved around, which allows for change in topology of the underlying data structure.
+struct	b3DynamicBvh
+	/* Stack element	*/ 
+	struct	sStkNN	// Stack entry holding a pair of nodes; collideTT pushes and pops these.
+	{
+		const b3DbvtNode*	a;
+		const b3DbvtNode*	b;
+		sStkNN() {}	// leaves a and b uninitialised
+		sStkNN(const b3DbvtNode* na,const b3DbvtNode* nb) : a(na),b(nb) {}
+	};
+	struct	sStkNP	// Stack entry holding one node and a mask.
+	{
+		const b3DbvtNode*	node;
+		int			mask;	// stored as int although the constructor receives it as unsigned
+		sStkNP(const b3DbvtNode* n,unsigned m) : node(n),mask(m) {}	// no default constructor is declared for this entry type
+	};
+	struct	sStkNPS	// Stack entry holding one node, a mask and a scalar; nearest() compares the scalar.
+	{
+		const b3DbvtNode*	node;
+		int			mask;	// stored as int although the constructor receives it as unsigned
+		b3Scalar	value;
+		sStkNPS() {}	// leaves all members uninitialised
+		sStkNPS(const b3DbvtNode* n,unsigned m,b3Scalar v) : node(n),mask(m),value(v) {}
+	};
+	struct	sStkCLN	// Stack entry pairing a read-only node with a writable parent node (presumably used by clone(); confirm in the .cpp).
+	{
+		const b3DbvtNode*	node;
+		b3DbvtNode*		parent;
+		sStkCLN(const b3DbvtNode* n,b3DbvtNode* p) : node(n),parent(p) {}
+	};
+	// Policies/Interfaces
+	/* ICollide	*/ 
+	struct	ICollide	// Callback set handed to the traversal functions as "policy"; every default does nothing or returns true.
+	{		
+			B3_DBVT_VIRTUAL void	Process(const b3DbvtNode*,const b3DbvtNode*)		{}	// node-pair callback; default is a no-op
+		B3_DBVT_VIRTUAL void	Process(const b3DbvtNode*)					{}	// single-node callback (enumNodes/enumLeaves call this); default is a no-op
+		B3_DBVT_VIRTUAL void	Process(const b3DbvtNode* n,b3Scalar)			{ Process(n); }	// default drops the scalar and forwards to the single-node overload
+		B3_DBVT_VIRTUAL bool	Descent(const b3DbvtNode*)					{ return(true); }	// default always answers true
+		B3_DBVT_VIRTUAL bool	AllLeaves(const b3DbvtNode*)					{ return(true); }	// default always answers true
+	};
+	/* IWriter	*/ 
+	struct	IWriter	// Abstract sink passed to write(); all three callbacks must be implemented.
+	{
+		virtual ~IWriter() {}
+		virtual void		Prepare(const b3DbvtNode* root,int numnodes)=0;	// receives the root and a node count
+		virtual void		WriteNode(const b3DbvtNode*,int index,int parent,int child0,int child1)=0;	// receives a node with its own, parent and child indices
+		virtual void		WriteLeaf(const b3DbvtNode*,int index,int parent)=0;	// receives a leaf with its own and parent indices
+	};
+	/* IClone	*/ 
+	struct	IClone	// Optional hook passed to clone(); the pointer defaults to 0 there.
+	{
+		virtual ~IClone()	{}
+		virtual void		CloneLeaf(b3DbvtNode*) {}	// default is a no-op
+	};
+	// Constants
+	enum	{
+	};
+	// Fields
+	b3DbvtNode*		m_root;
+	b3DbvtNode*		m_free;
+	int				m_lkhd;
+	int				m_leaves;
+	unsigned		m_opath;
+	b3AlignedObjectArray<sStkNN>	m_stkStack;
+	mutable b3AlignedObjectArray<const b3DbvtNode*>	m_rayTestStack;
+	// Methods
+	b3DynamicBvh();
+	~b3DynamicBvh();
+	void			clear();
+	bool			empty() const { return(0==m_root); }
+	void			optimizeBottomUp();
+	void			optimizeTopDown(int bu_treshold=128);
+	void			optimizeIncremental(int passes);
+	b3DbvtNode*		insert(const b3DbvtVolume& box,void* data);
+	void			update(b3DbvtNode* leaf,int lookahead=-1);
+	void			update(b3DbvtNode* leaf,b3DbvtVolume& volume);
+	bool			update(b3DbvtNode* leaf,b3DbvtVolume& volume,const b3Vector3& velocity,b3Scalar margin);
+	bool			update(b3DbvtNode* leaf,b3DbvtVolume& volume,const b3Vector3& velocity);
+	bool			update(b3DbvtNode* leaf,b3DbvtVolume& volume,b3Scalar margin);	
+	void			remove(b3DbvtNode* leaf);
+	void			write(IWriter* iwriter) const;
+	void			clone(b3DynamicBvh& dest,IClone* iclone=0) const;
+	static int		maxdepth(const b3DbvtNode* node);
+	static int		countLeaves(const b3DbvtNode* node);
+	static void		extractLeaves(const b3DbvtNode* node,b3AlignedObjectArray<const b3DbvtNode*>& leaves);
+	static void		benchmark();
+	static void		benchmark(){}
+	// B3_DBVT_IPOLICY must support ICollide policy/interface
+		static void		enumNodes(	const b3DbvtNode* root,
+		static void		enumLeaves(	const b3DbvtNode* root,
+		void		collideTT(	const b3DbvtNode* root0,
+		const b3DbvtNode* root1,
+		void		collideTTpersistentStack(	const b3DbvtNode* root0,
+		  const b3DbvtNode* root1,
+#if 0
+		void		collideTT(	const b3DbvtNode* root0,
+		const b3DbvtNode* root1,
+		const b3Transform& xform,
+		void		collideTT(	const b3DbvtNode* root0,
+		const b3Transform& xform0,
+		const b3DbvtNode* root1,
+		const b3Transform& xform1,
+		void		collideTV(	const b3DbvtNode* root,
+		const b3DbvtVolume& volume,
+		B3_DBVT_IPOLICY) const;
+	///rayTest is a re-entrant ray test, and can be called in parallel as long as the b3AlignedAlloc is thread-safe (uses locking etc)
+	///rayTest is slower than rayTestInternal, because it builds a local stack, using memory allocations, and it recomputes signs/rayDirectionInverses each time
+		static void		rayTest(	const b3DbvtNode* root,
+		const b3Vector3& rayFrom,
+		const b3Vector3& rayTo,
+	///rayTestInternal is faster than rayTest, because it uses a persistent stack (to reduce dynamic memory allocations to a minimum) and it uses precomputed signs/rayInverseDirections
+	///rayTestInternal is used by b3DynamicBvhBroadphase to accelerate world ray casts
+		void		rayTestInternal(	const b3DbvtNode* root,
+								const b3Vector3& rayFrom,
+								const b3Vector3& rayTo,
+								const b3Vector3& rayDirectionInverse,
+								unsigned int signs[3],
+								b3Scalar lambda_max,
+								const b3Vector3& aabbMin,
+								const b3Vector3& aabbMax,
+								B3_DBVT_IPOLICY) const;
+		static void		collideKDOP(const b3DbvtNode* root,
+		const b3Vector3* normals,
+		const b3Scalar* offsets,
+		int count,
+		static void		collideOCL(	const b3DbvtNode* root,
+		const b3Vector3* normals,
+		const b3Scalar* offsets,
+		const b3Vector3& sortaxis,
+		int count,								
+		bool fullsort=true);
+		static void		collideTU(	const b3DbvtNode* root,
+	// Helpers	
+	static B3_DBVT_INLINE int	nearest(const int* i,const b3DynamicBvh::sStkNPS* a,b3Scalar v,int l,int h)
+	{
+		// Bisects the half-open range [l,h) of the index table i. Entries are read as a[i[k]].value:
+		// the lower bound moves past the midpoint while that value is >= v, otherwise the upper
+		// bound drops to the midpoint. The final upper bound is returned.
+		int	lo=l;
+		int	hi=h;
+		while(lo<hi)
+		{
+			const int	mid=(lo+hi)>>1;
+			const bool	keepRight=(a[i[mid]].value>=v);
+			if(keepRight)
+			{
+				lo=mid+1;
+			}
+			else
+			{
+				hi=mid;
+			}
+		}
+		return(hi);
+	}
+	static B3_DBVT_INLINE int	allocate(	b3AlignedObjectArray<int>& ifree,
+		b3AlignedObjectArray<sStkNPS>& stock,
+		const sStkNPS& value)
+	{
+		// Stores value in stock and returns the index it was stored at.
+		// Without a recycled slot the pool grows and the new last index is returned.
+		if(!(ifree.size()>0))
+		{
+			const int	appended=stock.size();
+			stock.push_back(value);
+			return(appended);
+		}
+		// Otherwise the most recently freed slot is taken off ifree and overwritten.
+		const int	recycled=ifree[ifree.size()-1];
+		ifree.pop_back();
+		stock[recycled]=value;
+		return(recycled);
+	}
+	//
+	b3DynamicBvh(const b3DynamicBvh&)	{}	
+// Inline's
+inline b3DbvtAabbMm			b3DbvtAabbMm::FromCE(const b3Vector3& c,const b3Vector3& e)	// Builds a box from centre c and half-extents e.
+	b3DbvtAabbMm box;
+	box.mi=c-e;box.mx=c+e;	// corners are c-e and c+e
+	return(box);
+inline b3DbvtAabbMm			b3DbvtAabbMm::FromCR(const b3Vector3& c,b3Scalar r)	// Builds a box from centre c, using r as the half-extent on every axis.
+	return(FromCE(c,b3MakeVector3(r,r,r)));
+inline b3DbvtAabbMm			b3DbvtAabbMm::FromMM(const b3Vector3& mi,const b3Vector3& mx)	// Builds a box directly from the two corners; they are copied without any ordering check.
+	b3DbvtAabbMm box;
+	box.mi=mi;box.mx=mx;
+	return(box);
+inline b3DbvtAabbMm			b3DbvtAabbMm::FromPoints(const b3Vector3* pts,int n)	// Bounds of the n points in pts.
+	b3DbvtAabbMm box;
+	box.mi=box.mx=pts[0];	// read even when n is 0, so the caller must supply at least one point
+	for(int i=1;i<n;++i)
+	{
+		box.mi.setMin(pts[i]);
+		box.mx.setMax(pts[i]);
+	}
+	return(box);
+inline b3DbvtAabbMm			b3DbvtAabbMm::FromPoints(const b3Vector3** ppts,int n)	// Bounds of the n points reached through the pointer array ppts.
+	b3DbvtAabbMm box;
+	box.mi=box.mx=*ppts[0];	// dereferenced even when n is 0, so the caller must supply at least one valid pointer
+	for(int i=1;i<n;++i)
+	{
+		box.mi.setMin(*ppts[i]);
+		box.mx.setMax(*ppts[i]);
+	}
+	return(box);
+B3_DBVT_INLINE void		b3DbvtAabbMm::Expand(const b3Vector3& e)	// Moves both corners outward by e.
+	mi-=e;mx+=e;
+B3_DBVT_INLINE void		b3DbvtAabbMm::SignedExpand(const b3Vector3& e)	// Adds e to one corner per axis: to mx for a positive component, to mi otherwise.
+	if(e.x>0) mx.setX(mx.x+e[0]); else mi.setX(mi.x+e[0]);
+	if(e.y>0) mx.setY(mx.y+e[1]); else mi.setY(mi.y+e[1]);
+	if(e.z>0) mx.setZ(mx.z+e[2]); else mi.setZ(mi.z+e[2]);
+B3_DBVT_INLINE bool		b3DbvtAabbMm::Contain(const b3DbvtAabbMm& a) const	// True when a lies entirely inside this box; equal faces count as contained.
+	return(	(mi.x<=a.mi.x)&&
+		(mi.y<=a.mi.y)&&
+		(mi.z<=a.mi.z)&&
+		(mx.x>=a.mx.x)&&
+		(mx.y>=a.mx.y)&&
+		(mx.z>=a.mx.z));
+B3_DBVT_INLINE int		b3DbvtAabbMm::Classify(const b3Vector3& n,b3Scalar o,int s) const	// Tests two opposite corners against dot(n,p)+o and returns -1, +1 or 0.
+	b3Vector3			pi,px;	// px: corner chosen by s; pi: the diagonally opposite corner
+	switch(s)	// bit 0/1/2 of s picks mx instead of mi for the x/y/z component of px; there is no default, so s outside 0..7 leaves px and pi unset
+	{
+	case	(0+0+0):	px=b3MakeVector3(mi.x,mi.y,mi.z);
+		pi=b3MakeVector3(mx.x,mx.y,mx.z);break;
+	case	(1+0+0):	px=b3MakeVector3(mx.x,mi.y,mi.z);
+		pi=b3MakeVector3(mi.x,mx.y,mx.z);break;
+	case	(0+2+0):	px=b3MakeVector3(mi.x,mx.y,mi.z);
+		pi=b3MakeVector3(mx.x,mi.y,mx.z);break;
+	case	(1+2+0):	px=b3MakeVector3(mx.x,mx.y,mi.z);
+		pi=b3MakeVector3(mi.x,mi.y,mx.z);break;
+	case	(0+0+4):	px=b3MakeVector3(mi.x,mi.y,mx.z);
+		pi=b3MakeVector3(mx.x,mx.y,mi.z);break;
+	case	(1+0+4):	px=b3MakeVector3(mx.x,mi.y,mx.z);
+		pi=b3MakeVector3(mi.x,mx.y,mi.z);break;
+	case	(0+2+4):	px=b3MakeVector3(mi.x,mx.y,mx.z);
+		pi=b3MakeVector3(mx.x,mi.y,mi.z);break;
+	case	(1+2+4):	px=b3MakeVector3(mx.x,mx.y,mx.z);
+		pi=b3MakeVector3(mi.x,mi.y,mi.z);break;
+	}
+	if((b3Dot(n,px)+o)<0)		return(-1);	// the selected corner is on the negative side
+	if((b3Dot(n,pi)+o)>=0)	return(+1);	// the opposite corner is on the non-negative side
+	return(0);	// neither test succeeded
+B3_DBVT_INLINE b3Scalar	b3DbvtAabbMm::ProjectMinimum(const b3Vector3& v,unsigned signs) const	// Dot product of v with the corner selected by the low three bits of signs.
+	const b3Vector3*	b[]={&mx,&mi};	// index 0 picks mx, index 1 picks mi
+	const b3Vector3		p = b3MakeVector3(	b[(signs>>0)&1]->x,
+		b[(signs>>1)&1]->y,
+		b[(signs>>2)&1]->z);	// bit 0/1/2 chooses the source of the x/y/z component
+	return(b3Dot(p,v));
+B3_DBVT_INLINE void		b3DbvtAabbMm::AddSpan(const b3Vector3& d,b3Scalar& smi,b3Scalar& smx) const	// Adds the box's smallest and largest projection onto d to smi and smx; neither is reset first.
+	for(int i=0;i<3;++i)
+	{
+		if(d[i]<0)	// a negative component swaps which corner contributes to the minimum
+		{ smi+=mx[i]*d[i];smx+=mi[i]*d[i]; }
+		else
+		{ smi+=mi[i]*d[i];smx+=mx[i]*d[i]; }
+	}
+B3_DBVT_INLINE bool		b3Intersect(	const b3DbvtAabbMm& a,
+								  const b3DbvtAabbMm& b)	// Box/box overlap test; an SSE variant and a scalar variant follow (the preprocessor lines choosing between them are not visible here).
+	const __m128	rt(_mm_or_ps(	_mm_cmplt_ps(_mm_load_ps(b.mx),_mm_load_ps(a.mi)),
+		_mm_cmplt_ps(_mm_load_ps(a.mx),_mm_load_ps(b.mi))));	// a lane is set where b.mx<a.mi or a.mx<b.mi, i.e. the boxes are separated on that axis
+#if defined (_WIN32)
+	const __int32*	pu((const __int32*)&rt);
+    const int*	pu((const int*)&rt);
+	return((pu[0]|pu[1]|pu[2])==0);	// overlap when none of the x/y/z lanes is set; the fourth lane is ignored
+	return(	(a.mi.x<=b.mx.x)&&
+		(a.mx.x>=b.mi.x)&&
+		(a.mi.y<=b.mx.y)&&
+		(a.mx.y>=b.mi.y)&&
+		(a.mi.z<=b.mx.z)&&		
+		(a.mx.z>=b.mi.z));	// scalar variant: the intervals overlap on all three axes, touching counts as overlap
+B3_DBVT_INLINE bool		b3Intersect(	const b3DbvtAabbMm& a,
+								  const b3Vector3& b)	// True when point b lies inside box a; points on the boundary count as inside.
+	return(	(b.x>=a.mi.x)&&
+		(b.y>=a.mi.y)&&
+		(b.z>=a.mi.z)&&
+		(b.x<=a.mx.x)&&
+		(b.y<=a.mx.y)&&
+		(b.z<=a.mx.z));
+B3_DBVT_INLINE b3Scalar	b3Proximity(	const b3DbvtAabbMm& a,
+								  const b3DbvtAabbMm& b)	// Sum of absolute per-axis differences between the two boxes' corner sums.
+	const b3Vector3	d=(a.mi+a.mx)-(b.mi+b.mx);	// mi+mx is twice the centre, so d is twice the centre-to-centre offset
+	return(b3Fabs(d.x)+b3Fabs(d.y)+b3Fabs(d.z));
+B3_DBVT_INLINE int			b3Select(	const b3DbvtAabbMm& o,
+							   const b3DbvtAabbMm& a,
+							   const b3DbvtAabbMm& b)	// Returns 0 when a is strictly nearer to o than b by b3Proximity, otherwise 1. Intrinsic, inline-asm and scalar variants follow (the preprocessor lines choosing between them are not visible here).
+#if defined (_WIN32)
+	static B3_ATTRIBUTE_ALIGNED16(const unsigned __int32)	mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x00000000};	// clears the sign bit of x,y,z; the fourth lane is zeroed (as in the non-Windows mask) so the unused w component cannot leak into the horizontal sum
+    static B3_ATTRIBUTE_ALIGNED16(const unsigned int)	mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x00000000 /*0x7fffffff*/};
+	///@todo: the intrinsic version is 11% slower
+	union b3SSEUnion ///NOTE: if we use more intrinsics, move b3SSEUnion into the LinearMath directory
+	{
+	   __m128		ssereg;
+	   float		floats[4];
+	   int			ints[4];
+	};
+	__m128	omi(_mm_load_ps(o.mi));
+	omi=_mm_add_ps(omi,_mm_load_ps(o.mx));	// o.mi+o.mx
+	__m128	ami(_mm_load_ps(a.mi));
+	ami=_mm_add_ps(ami,_mm_load_ps(a.mx));
+	ami=_mm_sub_ps(ami,omi);	// (a.mi+a.mx)-(o.mi+o.mx)
+	ami=_mm_and_ps(ami,_mm_load_ps((const float*)mask));	// per-lane absolute value, fourth lane forced to zero
+	__m128	bmi(_mm_load_ps(b.mi));
+	bmi=_mm_add_ps(bmi,_mm_load_ps(b.mx));
+	bmi=_mm_sub_ps(bmi,omi);	// (b.mi+b.mx)-(o.mi+o.mx)
+	bmi=_mm_and_ps(bmi,_mm_load_ps((const float*)mask));
+	__m128	t0(_mm_movehl_ps(ami,ami));
+	ami=_mm_add_ps(ami,t0);	// lanes 0/1 now hold x+z and y+w
+	ami=_mm_add_ss(ami,_mm_shuffle_ps(ami,ami,1));	// lane 0 = x+y+z+w, the horizontal sum for a
+	__m128 t1(_mm_movehl_ps(bmi,bmi));
+	bmi=_mm_add_ps(bmi,t1);
+	bmi=_mm_add_ss(bmi,_mm_shuffle_ps(bmi,bmi,1));	// lane 0 = horizontal sum for b
+	b3SSEUnion tmp;
+	tmp.ssereg = _mm_cmple_ss(bmi,ami);	// all bits set in lane 0 when b's sum <= a's sum
+	return tmp.ints[0]&1;
+	B3_ATTRIBUTE_ALIGNED16(__int32	r[1]);
+	__asm
+	{
+		mov		eax,o
+			mov		ecx,a
+			mov		edx,b
+			movaps	xmm0,[eax]
+		movaps	xmm5,mask
+			addps	xmm0,[eax+16]	
+		movaps	xmm1,[ecx]
+		movaps	xmm2,[edx]
+		addps	xmm1,[ecx+16]
+		addps	xmm2,[edx+16]
+		subps	xmm1,xmm0
+			subps	xmm2,xmm0
+			andps	xmm1,xmm5
+			andps	xmm2,xmm5
+			movhlps	xmm3,xmm1
+			movhlps	xmm4,xmm2
+			addps	xmm1,xmm3
+			addps	xmm2,xmm4
+			pshufd	xmm3,xmm1,1
+			pshufd	xmm4,xmm2,1
+			addss	xmm1,xmm3
+			addss	xmm2,xmm4
+			cmpless	xmm2,xmm1
+			movss	r,xmm2
+	}
+	return(r[0]&1);
+	return(b3Proximity(o,a)<b3Proximity(o,b)?0:1);	// scalar variant: sums x,y,z only
+B3_DBVT_INLINE void		b3Merge(	const b3DbvtAabbMm& a,
+							  const b3DbvtAabbMm& b,
+							  b3DbvtAabbMm& r)	// Writes the box enclosing both a and b into r; an SSE variant and a scalar variant follow (the preprocessor lines choosing between them are not visible here).
+	__m128	ami(_mm_load_ps(a.mi));
+	__m128	amx(_mm_load_ps(a.mx));
+	__m128	bmi(_mm_load_ps(b.mi));
+	__m128	bmx(_mm_load_ps(b.mx));
+	ami=_mm_min_ps(ami,bmi);	// per-lane minimum of the two minimum corners
+	amx=_mm_max_ps(amx,bmx);	// per-lane maximum of the two maximum corners
+	_mm_store_ps(r.mi,ami);
+	_mm_store_ps(r.mx,amx);
+	for(int i=0;i<3;++i)	// scalar variant: same result computed one axis at a time
+	{
+		if(a.mi[i]<b.mi[i]) r.mi[i]=a.mi[i]; else r.mi[i]=b.mi[i];
+		if(a.mx[i]>b.mx[i]) r.mx[i]=a.mx[i]; else r.mx[i]=b.mx[i];
+	}
+B3_DBVT_INLINE bool		b3NotEqual(	const b3DbvtAabbMm& a,
+								 const b3DbvtAabbMm& b)	// True when any x/y/z component of either corner differs (exact comparison).
+	return(	(a.mi.x!=b.mi.x)||
+		(a.mi.y!=b.mi.y)||
+		(a.mi.z!=b.mi.z)||
+		(a.mx.x!=b.mx.x)||
+		(a.mx.y!=b.mx.y)||
+		(a.mx.z!=b.mx.z));
+// Inline member function definitions
+// Visits every node of the subtree rooted at `root`: calls
+// policy.Process(root) first, then recurses into both children when the node
+// is internal.
+inline void		b3DynamicBvh::enumNodes(	const b3DbvtNode* root,
+								  B3_DBVT_IPOLICY)
+		policy.Process(root);
+	if(root->isinternal())
+	{
+		enumNodes(root->childs[0],policy);
+		enumNodes(root->childs[1],policy);
+	}
+// Visits only the leaves of the subtree rooted at `root`: internal nodes are
+// recursed into, non-internal nodes are handed to policy.Process.
+inline void		b3DynamicBvh::enumLeaves(	const b3DbvtNode* root,
+								   B3_DBVT_IPOLICY)
+		if(root->isinternal())
+		{
+			enumLeaves(root->childs[0],policy);
+			enumLeaves(root->childs[1],policy);
+		}
+		else
+		{
+			policy.Process(root);
+		}
+// Tree-versus-tree overlap query. Walks pairs of nodes with an explicit
+// stack (no recursion), starting from (root0,root1), and calls
+// policy.Process(a,b) for every pair of non-internal nodes whose volumes
+// intersect. Does nothing when either root is null.
+inline void		b3DynamicBvh::collideTT(	const b3DbvtNode* root0,
+								  const b3DbvtNode* root1,
+								  B3_DBVT_IPOLICY)
+		if(root0&&root1)
+		{
+			int								depth=1;
+			int								treshold=B3_DOUBLE_STACKSIZE-4;
+			b3AlignedObjectArray<sStkNN>	stkStack;
+			stkStack.resize(B3_DOUBLE_STACKSIZE);
+			stkStack[0]=sStkNN(root0,root1);
+			do	{		
+				sStkNN	p=stkStack[--depth];
+				// Grow the stack by doubling; the threshold keeps room for the
+				// up-to-four entries pushed per iteration.
+				if(depth>treshold)
+				{
+					stkStack.resize(stkStack.size()*2);
+					treshold=stkStack.size()-4;
+				}
+				if(p.a==p.b)
+				{
+					// Same node on both sides: expand into the two self-pairs
+					// and the single cross pair of its children.
+					if(p.a->isinternal())
+					{
+						stkStack[depth++]=sStkNN(p.a->childs[0],p.a->childs[0]);
+						stkStack[depth++]=sStkNN(p.a->childs[1],p.a->childs[1]);
+						stkStack[depth++]=sStkNN(p.a->childs[0],p.a->childs[1]);
+					}
+				}
+				else if(b3Intersect(p.a->volume,p.b->volume))
+				{
+					if(p.a->isinternal())
+					{
+						if(p.b->isinternal())
+						{
+							// Both internal: all four child combinations.
+							stkStack[depth++]=sStkNN(p.a->childs[0],p.b->childs[0]);
+							stkStack[depth++]=sStkNN(p.a->childs[1],p.b->childs[0]);
+							stkStack[depth++]=sStkNN(p.a->childs[0],p.b->childs[1]);
+							stkStack[depth++]=sStkNN(p.a->childs[1],p.b->childs[1]);
+						}
+						else
+						{
+							stkStack[depth++]=sStkNN(p.a->childs[0],p.b);
+							stkStack[depth++]=sStkNN(p.a->childs[1],p.b);
+						}
+					}
+					else
+					{
+						if(p.b->isinternal())
+						{
+							stkStack[depth++]=sStkNN(p.a,p.b->childs[0]);
+							stkStack[depth++]=sStkNN(p.a,p.b->childs[1]);
+						}
+						else
+						{
+							// Two non-internal nodes with intersecting volumes.
+							policy.Process(p.a,p.b);
+						}
+					}
+				}
+			} while(depth);
+		}
+// Same traversal as the two-root collideTT, but the pair stack is the member
+// array m_stkStack instead of a local array, so the storage is reused
+// between calls. Each call resizes it to B3_DOUBLE_STACKSIZE first.
+inline void		b3DynamicBvh::collideTTpersistentStack(	const b3DbvtNode* root0,
+								  const b3DbvtNode* root1,
+								  B3_DBVT_IPOLICY)
+		if(root0&&root1)
+		{
+			int								depth=1;
+			int								treshold=B3_DOUBLE_STACKSIZE-4;
+			m_stkStack.resize(B3_DOUBLE_STACKSIZE);
+			m_stkStack[0]=sStkNN(root0,root1);
+			do	{		
+				sStkNN	p=m_stkStack[--depth];
+				// Double the stack when close to full (up to four pushes follow).
+				if(depth>treshold)
+				{
+					m_stkStack.resize(m_stkStack.size()*2);
+					treshold=m_stkStack.size()-4;
+				}
+				if(p.a==p.b)
+				{
+					// Self pair: expand to child self-pairs and the cross pair.
+					if(p.a->isinternal())
+					{
+						m_stkStack[depth++]=sStkNN(p.a->childs[0],p.a->childs[0]);
+						m_stkStack[depth++]=sStkNN(p.a->childs[1],p.a->childs[1]);
+						m_stkStack[depth++]=sStkNN(p.a->childs[0],p.a->childs[1]);
+					}
+				}
+				else if(b3Intersect(p.a->volume,p.b->volume))
+				{
+					if(p.a->isinternal())
+					{
+						if(p.b->isinternal())
+						{
+							m_stkStack[depth++]=sStkNN(p.a->childs[0],p.b->childs[0]);
+							m_stkStack[depth++]=sStkNN(p.a->childs[1],p.b->childs[0]);
+							m_stkStack[depth++]=sStkNN(p.a->childs[0],p.b->childs[1]);
+							m_stkStack[depth++]=sStkNN(p.a->childs[1],p.b->childs[1]);
+						}
+						else
+						{
+							m_stkStack[depth++]=sStkNN(p.a->childs[0],p.b);
+							m_stkStack[depth++]=sStkNN(p.a->childs[1],p.b);
+						}
+					}
+					else
+					{
+						if(p.b->isinternal())
+						{
+							m_stkStack[depth++]=sStkNN(p.a,p.b->childs[0]);
+							m_stkStack[depth++]=sStkNN(p.a,p.b->childs[1]);
+						}
+						else
+						{
+							// Intersecting pair of non-internal nodes.
+							policy.Process(p.a,p.b);
+						}
+					}
+				}
+			} while(depth);
+		}
+#if 0
+// Tree-versus-tree query where node volumes are compared through `xform`
+// (passed as a third argument to b3Intersect). Unlike the untransformed
+// overload there is no special case for p.a==p.b.
+// NOTE(review): this definition follows an `#if 0`; the matching `#endif` is
+// not visible in this chunk, so the exact extent of the disabled region
+// should be confirmed against the full file.
+inline void		b3DynamicBvh::collideTT(	const b3DbvtNode* root0,
+								  const b3DbvtNode* root1,
+								  const b3Transform& xform,
+								  B3_DBVT_IPOLICY)
+		if(root0&&root1)
+		{
+			int								depth=1;
+			int								treshold=B3_DOUBLE_STACKSIZE-4;
+			b3AlignedObjectArray<sStkNN>	stkStack;
+			stkStack.resize(B3_DOUBLE_STACKSIZE);
+			stkStack[0]=sStkNN(root0,root1);
+			do	{
+				sStkNN	p=stkStack[--depth];
+				if(b3Intersect(p.a->volume,p.b->volume,xform))
+				{
+					// Stack growth is only checked for pairs that will push.
+					if(depth>treshold)
+					{
+						stkStack.resize(stkStack.size()*2);
+						treshold=stkStack.size()-4;
+					}
+					if(p.a->isinternal())
+					{
+						if(p.b->isinternal())
+						{					
+							stkStack[depth++]=sStkNN(p.a->childs[0],p.b->childs[0]);
+							stkStack[depth++]=sStkNN(p.a->childs[1],p.b->childs[0]);
+							stkStack[depth++]=sStkNN(p.a->childs[0],p.b->childs[1]);
+							stkStack[depth++]=sStkNN(p.a->childs[1],p.b->childs[1]);
+						}
+						else
+						{
+							stkStack[depth++]=sStkNN(p.a->childs[0],p.b);
+							stkStack[depth++]=sStkNN(p.a->childs[1],p.b);
+						}
+					}
+					else
+					{
+						if(p.b->isinternal())
+						{
+							stkStack[depth++]=sStkNN(p.a,p.b->childs[0]);
+							stkStack[depth++]=sStkNN(p.a,p.b->childs[1]);
+						}
+						else
+						{
+							policy.Process(p.a,p.b);
+						}
+					}
+				}
+			} while(depth);
+		}
+// Convenience overload for two separately transformed trees: combines the
+// transforms as xform0.inverse()*xform1 and forwards to the single-transform
+// collideTT overload.
+inline void		b3DynamicBvh::collideTT(	const b3DbvtNode* root0,
+								  const b3Transform& xform0,
+								  const b3DbvtNode* root1,
+								  const b3Transform& xform1,
+								  B3_DBVT_IPOLICY)
+	const b3Transform	xform=xform0.inverse()*xform1;
+	collideTT(root0,root1,xform,policy);
+// Tree-versus-volume query: calls policy.Process(n) for every non-internal
+// node under `root` whose volume intersects `vol`. Uses a local explicit
+// stack; does nothing when `root` is null.
+inline void		b3DynamicBvh::collideTV(	const b3DbvtNode* root,
+								  const b3DbvtVolume& vol,
+								  B3_DBVT_IPOLICY) const
+		if(root)
+		{
+			// Aligned local copy of the query volume.
+			B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)		volume(vol);
+			b3AlignedObjectArray<const b3DbvtNode*>	stack;
+			stack.resize(0);
+			stack.reserve(B3_SIMPLE_STACKSIZE);
+			stack.push_back(root);
+			do	{
+				const b3DbvtNode*	n=stack[stack.size()-1];
+				stack.pop_back();
+				if(b3Intersect(n->volume,volume))
+				{
+					if(n->isinternal())
+					{
+						stack.push_back(n->childs[0]);
+						stack.push_back(n->childs[1]);
+					}
+					else
+					{
+						policy.Process(n);
+					}
+				}
+			} while(stack.size()>0);
+		}
+// Ray query using values precomputed by the caller (inverse direction, sign
+// flags, lambda_max). For each visited node the bounds handed to b3RayAabb2
+// are (node min - aabbMax) and (node max - aabbMin). policy.Process(node) is
+// called for every non-internal node that passes the test. The traversal
+// stack is the member m_rayTestStack.
+inline void		b3DynamicBvh::rayTestInternal(	const b3DbvtNode* root,
+								const b3Vector3& rayFrom,
+								const b3Vector3& rayTo,
+								const b3Vector3& rayDirectionInverse,
+								unsigned int signs[3],
+								b3Scalar lambda_max,
+								const b3Vector3& aabbMin,
+								const b3Vector3& aabbMax,
+								B3_DBVT_IPOLICY) const
+        // rayTo is not used by this function; the cast silences the warning.
+        (void) rayTo;
+	if(root)
+	{
+		int								depth=1;
+		int								treshold=B3_DOUBLE_STACKSIZE-2;
+		b3AlignedObjectArray<const b3DbvtNode*>&	stack = m_rayTestStack;
+		stack.resize(B3_DOUBLE_STACKSIZE);
+		stack[0]=root;
+		b3Vector3 bounds[2];
+		do	
+		{
+			const b3DbvtNode*	node=stack[--depth];
+			bounds[0] = node->volume.Mins()-aabbMax;
+			bounds[1] = node->volume.Maxs()-aabbMin;
+			b3Scalar tmin=1.f,lambda_min=0.f;
+			unsigned int result1=false;
+			result1 = b3RayAabb2(rayFrom,rayDirectionInverse,signs,bounds,tmin,lambda_min,lambda_max);
+			if(result1)
+			{
+				if(node->isinternal())
+				{
+					// Two children are pushed, so keep two free slots.
+					if(depth>treshold)
+					{
+						stack.resize(stack.size()*2);
+						treshold=stack.size()-2;
+					}
+					stack[depth++]=node->childs[0];
+					stack[depth++]=node->childs[1];
+				}
+				else
+				{
+					policy.Process(node);
+				}
+			}
+		} while(depth);
+	}
+// Ray query from rayFrom to rayTo over the subtree at `root`. Derives the
+// normalised direction, its per-axis inverse (B3_LARGE_FLOAT for a zero
+// component), the sign flags and lambda_max, then walks the tree with a
+// local explicit stack and calls policy.Process(node) on every non-internal
+// node that b3RayAabb2 reports as hit. Does nothing when `root` is null.
+inline void		b3DynamicBvh::rayTest(	const b3DbvtNode* root,
+								const b3Vector3& rayFrom,
+								const b3Vector3& rayTo,
+								B3_DBVT_IPOLICY)
+		if(root)
+		{
+			b3Vector3 rayDir = (rayTo-rayFrom);
+			rayDir.normalize ();
+			///what about division by zero? --> just set rayDirection[i] to INF/B3_LARGE_FLOAT
+			b3Vector3 rayDirectionInverse;
+			rayDirectionInverse[0] = rayDir[0] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[0];
+			rayDirectionInverse[1] = rayDir[1] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[1];
+			rayDirectionInverse[2] = rayDir[2] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[2];
+			unsigned int signs[3] = { rayDirectionInverse[0] < 0.0, rayDirectionInverse[1] < 0.0, rayDirectionInverse[2] < 0.0};
+			// Projection of the segment onto its own unit direction.
+			b3Scalar lambda_max = rayDir.dot(rayTo-rayFrom);
+			b3Vector3 resultNormal;
+			b3AlignedObjectArray<const b3DbvtNode*>	stack;
+			int								depth=1;
+			int								treshold=B3_DOUBLE_STACKSIZE-2;
+			stack.resize(B3_DOUBLE_STACKSIZE);
+			stack[0]=root;
+			b3Vector3 bounds[2];
+			do	{
+				const b3DbvtNode*	node=stack[--depth];
+				bounds[0] = node->volume.Mins();
+				bounds[1] = node->volume.Maxs();
+				b3Scalar tmin=1.f,lambda_min=0.f;
+				unsigned int result1 = b3RayAabb2(rayFrom,rayDirectionInverse,signs,bounds,tmin,lambda_min,lambda_max);
+				// NOTE(review): the next three statements cross-check against
+				// b3RayAabb and end at an #endif for TEST_BTRAY_AABB2; the
+				// opening conditional is not visible in this chunk.
+				b3Scalar param=1.f;
+				bool result2 = b3RayAabb(rayFrom,rayTo,node->volume.Mins(),node->volume.Maxs(),param,resultNormal);
+				b3Assert(result1 == result2);
+#endif //TEST_BTRAY_AABB2
+				if(result1)
+				{
+					if(node->isinternal())
+					{
+						if(depth>treshold)
+						{
+							stack.resize(stack.size()*2);
+							treshold=stack.size()-2;
+						}
+						stack[depth++]=node->childs[0];
+						stack[depth++]=node->childs[1];
+					}
+					else
+					{
+						policy.Process(node);
+					}
+				}
+			} while(depth);
+		}
+// Culls the tree against `count` planes given as normals[i]/offsets[i].
+// Each stack entry carries a bit mask of planes for which Classify already
+// returned +1; those planes are not re-tested for descendants. A Classify
+// result of -1 for any plane discards the node. For surviving nodes that
+// are either not internal or have every plane bit set, enumLeaves is run on
+// the node when policy.AllLeaves(node) returns true.
+inline void		b3DynamicBvh::collideKDOP(const b3DbvtNode* root,
+									const b3Vector3* normals,
+									const b3Scalar* offsets,
+									int count,
+									B3_DBVT_IPOLICY)
+		if(root)
+		{
+			// Mask value with one bit set per plane.
+			const int						inside=(1<<count)-1;
+			b3AlignedObjectArray<sStkNP>	stack;
+			int								signs[sizeof(unsigned)*8];
+			b3Assert(count<int (sizeof(signs)/sizeof(signs[0])));
+			// 3-bit code per plane: bit set when the normal component is >= 0.
+			for(int i=0;i<count;++i)
+			{
+				signs[i]=	((normals[i].x>=0)?1:0)+
+					((normals[i].y>=0)?2:0)+
+					((normals[i].z>=0)?4:0);
+			}
+			stack.reserve(B3_SIMPLE_STACKSIZE);
+			stack.push_back(sStkNP(root,0));
+			do	{
+				sStkNP	se=stack[stack.size()-1];
+				bool	out=false;
+				stack.pop_back();
+				// j is the mask bit that corresponds to plane i.
+				for(int i=0,j=1;(!out)&&(i<count);++i,j<<=1)
+				{
+					if(0==(se.mask&j))
+					{
+						const int	side=se.node->volume.Classify(normals[i],offsets[i],signs[i]);
+						switch(side)
+						{
+						case	-1:	out=true;break;
+						case	+1:	se.mask|=j;break;
+						}
+					}
+				}
+				if(!out)
+				{
+					if((se.mask!=inside)&&(se.node->isinternal()))
+					{
+						stack.push_back(sStkNP(se.node->childs[0],se.mask));
+						stack.push_back(sStkNP(se.node->childs[1],se.mask));
+					}
+					else
+					{
+						if(policy.AllLeaves(se.node)) enumLeaves(se.node,policy);
+					}
+				}
+			} while(stack.size());
+		}
+// Plane-culled traversal that can additionally keep the work stack ordered
+// by each node's ProjectMinimum value along `sortaxis` (when `fsort` is
+// true). Entries live in `stock`; `stack` holds indices into `stock` and
+// `ifree` collects indices that are released after popping. Nodes for which
+// Classify returns -1 on any untested plane are skipped; for the rest,
+// policy.Descent(node) gates expansion and policy.Process(node,value) is
+// called on non-internal nodes.
+// NOTE(review): in each "Insert" step a memmove and an equivalent shifting
+// for-loop appear back to back; presumably a preprocessor conditional that
+// is not visible in this chunk selects one of them - confirm.
+inline void		b3DynamicBvh::collideOCL(	const b3DbvtNode* root,
+								   const b3Vector3* normals,
+								   const b3Scalar* offsets,
+								   const b3Vector3& sortaxis,
+								   int count,
+								   B3_DBVT_IPOLICY,
+								   bool fsort)
+		if(root)
+		{
+			// 3-bit sign code of the sort axis, passed to ProjectMinimum.
+			const unsigned					srtsgns=(sortaxis[0]>=0?1:0)+
+				(sortaxis[1]>=0?2:0)+
+				(sortaxis[2]>=0?4:0);
+			const int						inside=(1<<count)-1;
+			b3AlignedObjectArray<sStkNPS>	stock;
+			b3AlignedObjectArray<int>		ifree;
+			b3AlignedObjectArray<int>		stack;
+			int								signs[sizeof(unsigned)*8];
+			b3Assert(count<int (sizeof(signs)/sizeof(signs[0])));
+			for(int i=0;i<count;++i)
+			{
+				signs[i]=	((normals[i].x>=0)?1:0)+
+					((normals[i].y>=0)?2:0)+
+					((normals[i].z>=0)?4:0);
+			}
+			stock.reserve(B3_SIMPLE_STACKSIZE);
+			stack.reserve(B3_SIMPLE_STACKSIZE);
+			ifree.reserve(B3_SIMPLE_STACKSIZE);
+			stack.push_back(allocate(ifree,stock,sStkNPS(root,0,root->volume.ProjectMinimum(sortaxis,srtsgns))));
+			do	{
+				const int	id=stack[stack.size()-1];
+				sStkNPS		se=stock[id];
+				// The entry was copied above, so its slot can be released now.
+				stack.pop_back();ifree.push_back(id);
+				if(se.mask!=inside)
+				{
+					bool	out=false;
+					for(int i=0,j=1;(!out)&&(i<count);++i,j<<=1)
+					{
+						if(0==(se.mask&j))
+						{
+							const int	side=se.node->volume.Classify(normals[i],offsets[i],signs[i]);
+							switch(side)
+							{
+							case	-1:	out=true;break;
+							case	+1:	se.mask|=j;break;
+							}
+						}
+					}
+					if(out) continue;
+				}
+				if(policy.Descent(se.node))
+				{
+					if(se.node->isinternal())
+					{
+						const b3DbvtNode* pns[]={	se.node->childs[0],se.node->childs[1]};
+						sStkNPS		nes[]={	sStkNPS(pns[0],se.mask,pns[0]->volume.ProjectMinimum(sortaxis,srtsgns)),
+							sStkNPS(pns[1],se.mask,pns[1]->volume.ProjectMinimum(sortaxis,srtsgns))};
+						// q is the index of the child with the larger value
+						// (index 0 on ties); that child is inserted first.
+						const int	q=nes[0].value<nes[1].value?1:0;				
+						int			j=stack.size();
+						if(fsort&&(j>0))
+						{
+							/* Insert 0	*/ 
+							j=nearest(&stack[0],&stock[0],nes[q].value,0,stack.size());
+							stack.push_back(0);
+							memmove(&stack[j+1],&stack[j],sizeof(int)*(stack.size()-j-1));
+							for(int k=stack.size()-1;k>j;--k) stack[k]=stack[k-1];
+							stack[j]=allocate(ifree,stock,nes[q]);
+							/* Insert 1	*/ 
+							j=nearest(&stack[0],&stock[0],nes[1-q].value,j,stack.size());
+							stack.push_back(0);
+							memmove(&stack[j+1],&stack[j],sizeof(int)*(stack.size()-j-1));
+							for(int k=stack.size()-1;k>j;--k) stack[k]=stack[k-1];
+							stack[j]=allocate(ifree,stock,nes[1-q]);
+						}
+						else
+						{
+							stack.push_back(allocate(ifree,stock,nes[q]));
+							stack.push_back(allocate(ifree,stock,nes[1-q]));
+						}
+					}
+					else
+					{
+						policy.Process(se.node,se.value);
+					}
+				}
+			} while(stack.size());
+		}
+// Policy-driven traversal: policy.Descent(n) decides whether a node is
+// followed; followed internal nodes have both children pushed, followed
+// non-internal nodes are passed to policy.Process(n). No volume test is
+// performed here. Does nothing when `root` is null.
+inline void		b3DynamicBvh::collideTU(	const b3DbvtNode* root,
+								  B3_DBVT_IPOLICY)
+		if(root)
+		{
+			b3AlignedObjectArray<const b3DbvtNode*>	stack;
+			stack.reserve(B3_SIMPLE_STACKSIZE);
+			stack.push_back(root);
+			do	{
+				const b3DbvtNode*	n=stack[stack.size()-1];
+				stack.pop_back();
+				if(policy.Descent(n))
+				{
+					if(n->isinternal())
+					{ stack.push_back(n->childs[0]);stack.push_back(n->childs[1]); }
+					else
+					{ policy.Process(n); }
+				}
+			} while(stack.size()>0);
+		}
+// PP Cleanup
+#undef B3_DBVT_PREFIX
+#undef B3_DBVT_IMPL_SSE
+#undef B3_DBVT_INT0_IMPL
diff --git a/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.cpp b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.cpp
new file mode 100644
index 00000000..ca08429a
--- /dev/null
+++ b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.cpp
@@ -0,0 +1,804 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///b3DynamicBvhBroadphase implementation by Nathanael Presson
+#include "b3DynamicBvhBroadphase.h"
+#include "b3OverlappingPair.h"
+// Profiling
+#include <stdio.h>
+// Scoped timer: the constructor records clock.getTimeMicroseconds() and the
+// destructor adds the microseconds elapsed since then to the referenced
+// counter. Keeps pointers to both arguments, so they must outlive the scope.
+struct	b3ProfileScope
+	__forceinline b3ProfileScope(b3Clock& clock,unsigned long& value) :
+	m_clock(&clock),m_value(&value),m_base(clock.getTimeMicroseconds())
+	{
+	}
+	__forceinline ~b3ProfileScope()
+	{
+		(*m_value)+=m_clock->getTimeMicroseconds()-m_base;
+	}
+	b3Clock*		m_clock;	// clock that is sampled at both ends
+	unsigned long*	m_value;	// accumulator that receives the elapsed time
+	unsigned long	m_base;		// timestamp taken at construction
+#define	b3SPC(_value_)	b3ProfileScope	spc_scope(m_clock,_value_)
+#define	b3SPC(_value_)
+// Helpers
+// Inserts `item` at the head of the doubly linked list `list` (links[0] is
+// the previous element, links[1] the next) and makes `item` the new head.
+// Despite the name, the element is placed in front of the existing head.
+template <typename T>
+static inline void	b3ListAppend(T* item,T*& list)
+	item->links[0]=0;
+	item->links[1]=list;
+	if(list) list->links[0]=item;
+	list=item;
+// Unlinks `item` from the doubly linked list `list`. When `item` has no
+// predecessor it is the head, so `list` is advanced to its successor.
+// The links of `item` itself are left untouched.
+template <typename T>
+static inline void	b3ListRemove(T* item,T*& list)
+	if(item->links[0]) item->links[0]->links[1]=item->links[1]; else list=item->links[1];
+	if(item->links[1]) item->links[1]->links[0]=item->links[0];
+// Returns the number of elements reachable from `root` by following
+// links[1]; returns 0 for a null `root`.
+template <typename T>
+static inline int	b3ListCount(T* root)
+	int	n=0;
+	while(root) { ++n;root=root->links[1]; }
+	return(n);
+// Resets `value` by assigning from a function-local static object of a type
+// derived from T that is declared without an initialiser.
+// NOTE(review): presumably this relies on static storage being
+// zero-initialised to clear every field - confirm for the T types used.
+template <typename T>
+static inline void	b3Clear(T& value)
+	static const struct ZeroDummy : T {} zerodummy;
+	value=zerodummy;
+// Colliders
+/* Tree collider	*/ 
+// Collision policy that records overlapping leaf pairs in the owning
+// broadphase's pair cache.
+struct	b3DbvtTreeCollider : b3DynamicBvh::ICollide
+	b3DynamicBvhBroadphase*	pbp;	// broadphase whose pair cache is updated
+	b3DbvtProxy*		proxy;		// proxy used by the single-node Process overload
+	b3DbvtTreeCollider(b3DynamicBvhBroadphase* p) : pbp(p) {}
+	// Adds the pair (na,nb) to the pair cache unless both are the same node.
+	void	Process(const b3DbvtNode* na,const b3DbvtNode* nb)
+	{
+		if(na!=nb)
+		{
+			b3DbvtProxy*	pa=(b3DbvtProxy*)na->data;
+			b3DbvtProxy*	pb=(b3DbvtProxy*)nb->data;
+			// Order the pair so the proxy with the smaller m_uniqueId is first.
+			if(pa->m_uniqueId>pb->m_uniqueId) 
+				b3Swap(pa,pb);
+			pbp->m_paircache->addOverlappingPair(pa->getUid(),pb->getUid());
+			++pbp->m_newpairs;
+		}
+	}
+	// Pairs node n with the leaf of `proxy`, which must be set beforehand.
+	void	Process(const b3DbvtNode* n)
+	{
+		Process(n,proxy->leaf);
+	}
+// b3DynamicBvhBroadphase
+// Constructs the broadphase: sets the tuning counters to their defaults,
+// uses the supplied pair cache or allocates a b3HashedOverlappingPairCache
+// with b3AlignedAlloc when `paircache` is null, clears all stage list heads
+// and sizes m_proxies to `proxyCapacity`. m_releasepaircache remembers
+// whether the cache was allocated here.
+b3DynamicBvhBroadphase::b3DynamicBvhBroadphase(int proxyCapacity, b3OverlappingPairCache* paircache)
+	m_deferedcollide	=	false;
+	m_needcleanup		=	true;
+	m_releasepaircache	=	(paircache!=0)?false:true;
+	m_prediction		=	0;
+	m_stageCurrent		=	0;
+	m_fixedleft			=	0;
+	m_fupdates			=	1;
+	m_dupdates			=	0;
+	m_cupdates			=	10;
+	m_newpairs			=	1;
+	m_updates_call		=	0;
+	m_updates_done		=	0;
+	m_updates_ratio		=	0;
+	m_paircache			=	paircache? paircache	: new(b3AlignedAlloc(sizeof(b3HashedOverlappingPairCache),16)) b3HashedOverlappingPairCache();
+	m_pid				=	0;
+	m_cid				=	0;
+	// Indices 0..STAGECOUNT inclusive are used (note the <=).
+	for(int i=0;i<=STAGECOUNT;++i)
+	{
+		m_stageRoots[i]=0;
+	}
+	b3Clear(m_profiling);
+	m_proxies.resize(proxyCapacity);
+	// NOTE(review): the statements below destroy and free an owned pair
+	// cache; they look like a destructor body whose signature line is not
+	// visible in this chunk - confirm against the full file.
+	if(m_releasepaircache) 
+	{
+		m_paircache->~b3OverlappingPairCache();
+		b3AlignedFree(m_paircache);
+	}
+// Creates the proxy for `objectId` in place inside m_proxies[objectId]
+// (placement new), inserts its AABB as a leaf into m_sets[0], links it into
+// the current stage list and, unless collision is deferred, immediately
+// collides the new AABB against both trees. Returns the proxy.
+// `objectId` is used as an index without a range check here.
+b3BroadphaseProxy*				b3DynamicBvhBroadphase::createProxy(	const b3Vector3& aabbMin,
+															  const b3Vector3& aabbMax,
+															  int objectId,
+															  void* userPtr,
+															  short int collisionFilterGroup,
+															  short int collisionFilterMask)
+	b3DbvtProxy* mem = &m_proxies[objectId];
+	b3DbvtProxy*		proxy=new(mem) b3DbvtProxy(	aabbMin,aabbMax,userPtr,
+		collisionFilterGroup,
+		collisionFilterMask);
+	b3DbvtAabbMm aabb = b3DbvtVolume::FromMM(aabbMin,aabbMax);
+	//bproxy->aabb			=	b3DbvtVolume::FromMM(aabbMin,aabbMax);
+	proxy->stage		=	m_stageCurrent;
+	proxy->m_uniqueId	=	objectId;
+	proxy->leaf			=	m_sets[0].insert(aabb,proxy);
+	b3ListAppend(proxy,m_stageRoots[m_stageCurrent]);
+	if(!m_deferedcollide)
+	{
+		b3DbvtTreeCollider	collider(this);
+		collider.proxy=proxy;
+		m_sets[0].collideTV(m_sets[0].m_root,aabb,collider);
+		m_sets[1].collideTV(m_sets[1].m_root,aabb,collider);
+	}
+	return(proxy);
+// Removes the proxy's leaf from the tree it lives in (m_sets[1] when its
+// stage equals STAGECOUNT, otherwise m_sets[0]), unlinks it from its stage
+// list, drops every cached pair that contains it and requests a cleanup.
+void							b3DynamicBvhBroadphase::destroyProxy(	b3BroadphaseProxy* absproxy,
+															   b3Dispatcher* dispatcher)
+	b3DbvtProxy*	proxy=(b3DbvtProxy*)absproxy;
+	if(proxy->stage==STAGECOUNT)
+		m_sets[1].remove(proxy->leaf);
+	else
+		m_sets[0].remove(proxy->leaf);
+	b3ListRemove(proxy,m_stageRoots[proxy->stage]);
+	m_paircache->removeOverlappingPairsContainingProxy(proxy->getUid(),dispatcher);
+	m_needcleanup=true;
+// Copies the stored AABB of the proxy at index `objectId` into
+// aabbMin/aabbMax. The index is not range-checked here.
+void	b3DynamicBvhBroadphase::getAabb(int objectId,b3Vector3& aabbMin, b3Vector3& aabbMax ) const
+	const b3DbvtProxy*						proxy=&m_proxies[objectId];
+	aabbMin = proxy->m_aabbMin;
+	aabbMax = proxy->m_aabbMax;
+// Copies the stored AABB of `absproxy` (cast to b3DbvtProxy) into
+// aabbMin/aabbMax.
+void	b3DynamicBvhBroadphase::getAabb(b3BroadphaseProxy* absproxy,b3Vector3& aabbMin, b3Vector3& aabbMax ) const
+	b3DbvtProxy*						proxy=(b3DbvtProxy*)absproxy;
+	aabbMin = proxy->m_aabbMin;
+	aabbMax = proxy->m_aabbMax;
+// Adapter policy: forwards each leaf reported by the tree ray query to the
+// user's b3BroadphaseRayCallback as a b3DbvtProxy.
+struct	BroadphaseRayTester : b3DynamicBvh::ICollide
+	b3BroadphaseRayCallback& m_rayCallback;	// held by reference, not owned
+	BroadphaseRayTester(b3BroadphaseRayCallback& orgCallback)
+		:m_rayCallback(orgCallback)
+	{
+	}
+	void					Process(const b3DbvtNode* leaf)
+	{
+		b3DbvtProxy*	proxy=(b3DbvtProxy*)leaf->data;
+		m_rayCallback.process(proxy);
+	}
+// Runs the ray query against both trees (m_sets[0], then m_sets[1]) using
+// the direction inverse, sign flags and lambda_max stored in `rayCallback`,
+// and reports every hit leaf through rayCallback.process.
+void	b3DynamicBvhBroadphase::rayTest(const b3Vector3& rayFrom,const b3Vector3& rayTo, b3BroadphaseRayCallback& rayCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax)
+	BroadphaseRayTester callback(rayCallback);
+	m_sets[0].rayTestInternal(	m_sets[0].m_root,
+		rayFrom,
+		rayTo,
+		rayCallback.m_rayDirectionInverse,
+		rayCallback.m_signs,
+		rayCallback.m_lambda_max,
+		aabbMin,
+		aabbMax,
+		callback);
+	m_sets[1].rayTestInternal(	m_sets[1].m_root,
+		rayFrom,
+		rayTo,
+		rayCallback.m_rayDirectionInverse,
+		rayCallback.m_signs,
+		rayCallback.m_lambda_max,
+		aabbMin,
+		aabbMax,
+		callback);
+// Adapter policy: forwards each leaf reported by the tree volume query to
+// the user's b3BroadphaseAabbCallback as a b3DbvtProxy.
+struct	BroadphaseAabbTester : b3DynamicBvh::ICollide
+	b3BroadphaseAabbCallback& m_aabbCallback;	// held by reference, not owned
+	BroadphaseAabbTester(b3BroadphaseAabbCallback& orgCallback)
+		:m_aabbCallback(orgCallback)
+	{
+	}
+	void					Process(const b3DbvtNode* leaf)
+	{
+		b3DbvtProxy*	proxy=(b3DbvtProxy*)leaf->data;
+		m_aabbCallback.process(proxy);
+	}
+// Reports, through aabbCallback.process, every proxy in either tree whose
+// leaf volume intersects the box [aabbMin, aabbMax].
+void	b3DynamicBvhBroadphase::aabbTest(const b3Vector3& aabbMin,const b3Vector3& aabbMax,b3BroadphaseAabbCallback& aabbCallback)
+	BroadphaseAabbTester callback(aabbCallback);
+	const B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)	bounds=b3DbvtVolume::FromMM(aabbMin,aabbMax);
+		//process all children, that overlap with  the given AABB bounds
+	m_sets[0].collideTV(m_sets[0].m_root,bounds,callback);
+	m_sets[1].collideTV(m_sets[1].m_root,bounds,callback);
+// Updates the AABB of the proxy at index `objectId`. Nothing happens when
+// the new box equals the leaf's current volume. Otherwise the leaf is moved
+// from the fixed tree (m_sets[1]) into the dynamic tree (m_sets[0]), or
+// updated in place in the dynamic tree; the proxy is re-linked into the
+// current stage list and, when a re-collide is needed and collision is not
+// deferred, the leaf is collided against both trees.
+// NOTE(review): two alternative m_sets[0].update(...) calls appear inside
+// the `if (` after `#ifdef B3_DBVT_BP_MARGIN`; the #else/#endif lines are
+// not visible in this chunk - confirm against the full file.
+void							b3DynamicBvhBroadphase::setAabb(int objectId,
+														  const b3Vector3& aabbMin,
+														  const b3Vector3& aabbMax,
+														  b3Dispatcher* /*dispatcher*/)
+	b3DbvtProxy*						proxy=&m_proxies[objectId];
+//	b3DbvtProxy*						proxy=(b3DbvtProxy*)absproxy;
+	B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)	aabb=b3DbvtVolume::FromMM(aabbMin,aabbMax);
+	if(b3NotEqual(aabb,proxy->leaf->volume))
+	{
+		bool	docollide=false;
+		if(proxy->stage==STAGECOUNT)
+		{/* fixed -> dynamic set	*/ 
+			m_sets[1].remove(proxy->leaf);
+			proxy->leaf=m_sets[0].insert(aabb,proxy);
+			docollide=true;
+		}
+		else
+		{/* dynamic set				*/ 
+			++m_updates_call;
+			if(b3Intersect(proxy->leaf->volume,aabb))
+			{/* Moving				*/ 
+				// Predicted motion: half the old extents scaled by
+				// m_prediction, signed per axis like the min-corner delta.
+				const b3Vector3	delta=aabbMin-proxy->m_aabbMin;
+				b3Vector3		velocity(((proxy->m_aabbMax-proxy->m_aabbMin)/2)*m_prediction);
+				if(delta[0]<0) velocity[0]=-velocity[0];
+				if(delta[1]<0) velocity[1]=-velocity[1];
+				if(delta[2]<0) velocity[2]=-velocity[2];
+				if	(
+#ifdef B3_DBVT_BP_MARGIN				
+					m_sets[0].update(proxy->leaf,aabb,velocity,B3_DBVT_BP_MARGIN)
+					m_sets[0].update(proxy->leaf,aabb,velocity)
+					)
+				{
+					++m_updates_done;
+					docollide=true;
+				}
+			}
+			else
+			{/* Teleporting			*/ 
+				m_sets[0].update(proxy->leaf,aabb);
+				++m_updates_done;
+				docollide=true;
+			}	
+		}
+		// Move the proxy to the list of the current stage.
+		b3ListRemove(proxy,m_stageRoots[proxy->stage]);
+		proxy->m_aabbMin = aabbMin;
+		proxy->m_aabbMax = aabbMax;
+		proxy->stage	=	m_stageCurrent;
+		b3ListAppend(proxy,m_stageRoots[m_stageCurrent]);
+		if(docollide)
+		{
+			m_needcleanup=true;
+			if(!m_deferedcollide)
+			{
+				b3DbvtTreeCollider	collider(this);
+				m_sets[1].collideTTpersistentStack(m_sets[1].m_root,proxy->leaf,collider);
+				m_sets[0].collideTTpersistentStack(m_sets[0].m_root,proxy->leaf,collider);
+			}
+		}	
+	}
+// Like setAabb but unconditional: there is no equality check and no motion
+// prediction. The leaf is always re-inserted (fixed tree -> dynamic tree) or
+// updated with the plain update(leaf,aabb) call, the proxy is re-linked
+// into the current stage list, and a re-collide is always requested.
+void							b3DynamicBvhBroadphase::setAabbForceUpdate(		b3BroadphaseProxy* absproxy,
+														  const b3Vector3& aabbMin,
+														  const b3Vector3& aabbMax,
+														  b3Dispatcher* /*dispatcher*/)
+	b3DbvtProxy*						proxy=(b3DbvtProxy*)absproxy;
+	B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)	aabb=b3DbvtVolume::FromMM(aabbMin,aabbMax);
+	bool	docollide=false;
+	if(proxy->stage==STAGECOUNT)
+	{/* fixed -> dynamic set	*/ 
+		m_sets[1].remove(proxy->leaf);
+		proxy->leaf=m_sets[0].insert(aabb,proxy);
+		docollide=true;
+	}
+	else
+	{/* dynamic set				*/ 
+		++m_updates_call;
+		/* Teleporting			*/ 
+		m_sets[0].update(proxy->leaf,aabb);
+		++m_updates_done;
+		docollide=true;
+	}
+	// Move the proxy to the list of the current stage.
+	b3ListRemove(proxy,m_stageRoots[proxy->stage]);
+	proxy->m_aabbMin = aabbMin;
+	proxy->m_aabbMax = aabbMax;
+	proxy->stage	=	m_stageCurrent;
+	b3ListAppend(proxy,m_stageRoots[m_stageCurrent]);
+	if(docollide)
+	{
+		m_needcleanup=true;
+		if(!m_deferedcollide)
+		{
+			b3DbvtTreeCollider	collider(this);
+			m_sets[1].collideTTpersistentStack(m_sets[1].m_root,proxy->leaf,collider);
+			m_sets[0].collideTTpersistentStack(m_sets[0].m_root,proxy->leaf,collider);
+		}
+	}	
+// Runs one broadphase step: collide(), an optional profiling report every
+// B3_DBVT_BP_PROFILING_RATE calls, then performDeferredRemoval().
+// NOTE(review): the profiling report is presumably wrapped in a
+// profiling-only preprocessor conditional that is not visible in this chunk.
+// The profiling counters are `unsigned long` (they are bound to
+// `unsigned long&` by b3SPC), so they are printed with %lu through explicit
+// casts; passing them to %u is a format/argument mismatch.
+void							b3DynamicBvhBroadphase::calculateOverlappingPairs(b3Dispatcher* dispatcher)
+	collide(dispatcher);
+	if(0==(m_pid%B3_DBVT_BP_PROFILING_RATE))
+	{	
+		printf("fixed(%u) dynamics(%u) pairs(%u)\r\n",m_sets[1].m_leaves,m_sets[0].m_leaves,m_paircache->getNumOverlappingPairs());
+		unsigned int	total=m_profiling.m_total;
+		// Avoid dividing by zero in the percentage computations below.
+		if(total<=0) total=1;
+		printf("ddcollide: %lu%% (%luus)\r\n",(unsigned long)((50+m_profiling.m_ddcollide*100)/total),(unsigned long)(m_profiling.m_ddcollide/B3_DBVT_BP_PROFILING_RATE));
+		printf("fdcollide: %lu%% (%luus)\r\n",(unsigned long)((50+m_profiling.m_fdcollide*100)/total),(unsigned long)(m_profiling.m_fdcollide/B3_DBVT_BP_PROFILING_RATE));
+		printf("cleanup:   %lu%% (%luus)\r\n",(unsigned long)((50+m_profiling.m_cleanup*100)/total),(unsigned long)(m_profiling.m_cleanup/B3_DBVT_BP_PROFILING_RATE));
+		printf("total:     %uus\r\n",total/B3_DBVT_BP_PROFILING_RATE);
+		const unsigned long	sum=m_profiling.m_ddcollide+
+			m_profiling.m_fdcollide+
+			m_profiling.m_cleanup;
+		printf("leaked: %lu%% (%luus)\r\n",(unsigned long)(100-((50+sum*100)/total)),(unsigned long)((total-sum)/B3_DBVT_BP_PROFILING_RATE));
+		printf("job counts: %lu%%\r\n",(unsigned long)((m_profiling.m_jobcount*100)/((m_sets[0].m_leaves+m_sets[1].m_leaves)*B3_DBVT_BP_PROFILING_RATE)));
+		b3Clear(m_profiling);
+		m_clock.reset();
+	}
+	performDeferredRemoval(dispatcher);
+// When the pair cache uses deferred removal: sorts the pair array, marks
+// duplicates and pairs whose leaf volumes no longer intersect as invalid
+// (x = y = -1, after cleanOverlappingPair), sorts again and shrinks the
+// array by the number of invalidated pairs. Does nothing otherwise.
+void b3DynamicBvhBroadphase::performDeferredRemoval(b3Dispatcher* dispatcher)
+	if (m_paircache->hasDeferredRemoval())
+	{
+		b3BroadphasePairArray&	overlappingPairArray = m_paircache->getOverlappingPairArray();
+		//perform a sort, to find duplicates and to sort 'invalid' pairs to the end
+		overlappingPairArray.quickSort(b3BroadphasePairSortPredicate());
+		int invalidPair = 0;
+		int i;
+		b3BroadphasePair previousPair = b3MakeBroadphasePair(-1,-1);
+		for (i=0;i<overlappingPairArray.size();i++)
+		{
+			b3BroadphasePair& pair = overlappingPairArray[i];
+			// After sorting, a duplicate is equal to its predecessor.
+			bool isDuplicate = (pair == previousPair);
+			previousPair = pair;
+			bool needsRemoval = false;
+			if (!isDuplicate)
+			{
+				//important to perform AABB check that is consistent with the broadphase
+				b3DbvtProxy*		pa=&m_proxies[pair.x];
+				b3DbvtProxy*		pb=&m_proxies[pair.y];
+				bool hasOverlap = b3Intersect(pa->leaf->volume,pb->leaf->volume);
+				if (hasOverlap)
+				{
+					needsRemoval = false;
+				} else
+				{
+					needsRemoval = true;
+				}
+			} else
+			{
+				//remove duplicate
+				needsRemoval = true;
+				//should have no algorithm
+			}
+			if (needsRemoval)
+			{
+				m_paircache->cleanOverlappingPair(pair,dispatcher);
+				pair.x = -1;
+				pair.y = -1;
+				invalidPair++;
+			} 
+		}
+		//perform a sort, to sort 'invalid' pairs to the end
+		overlappingPairArray.quickSort(b3BroadphasePairSortPredicate());
+		overlappingPairArray.resize(overlappingPairArray.size() - invalidPair);
+	}
+// One broadphase step: incrementally optimises both trees, advances the
+// stage counter and moves every proxy of the new current stage from the
+// dynamic tree (m_sets[0]) to the fixed tree (m_sets[1]), performs the
+// deferred tree-versus-tree collisions, incrementally removes cached pairs
+// whose volumes no longer intersect, and refreshes the update statistics.
+// NOTE(review): the block comment that starts with the printf lines below
+// has no visible terminator in this chunk, and several statements in this
+// function look like they belong to preprocessor-disabled regions whose
+// #if/#endif lines are not visible - confirm against the full file.
+void							b3DynamicBvhBroadphase::collide(b3Dispatcher* dispatcher)
+	/*printf("---------------------------------------------------------\n");
+	printf("m_sets[0].m_leaves=%d\n",m_sets[0].m_leaves);
+	printf("m_sets[1].m_leaves=%d\n",m_sets[1].m_leaves);
+	printf("numPairs = %d\n",getOverlappingPairCache()->getNumOverlappingPairs());
+	{
+		int i;
+		for (i=0;i<getOverlappingPairCache()->getNumOverlappingPairs();i++)
+		{
+			printf("pair[%d]=(%d,%d),",i,getOverlappingPairCache()->getOverlappingPairArray()[i].m_pProxy0->getUid(),
+				getOverlappingPairCache()->getOverlappingPairArray()[i].m_pProxy1->getUid());
+		}
+		printf("\n");
+	}
+	b3SPC(m_profiling.m_total);
+	/* optimize				*/ 
+	m_sets[0].optimizeIncremental(1+(m_sets[0].m_leaves*m_dupdates)/100);
+	if(m_fixedleft)
+	{
+		const int count=1+(m_sets[1].m_leaves*m_fupdates)/100;
+		m_sets[1].optimizeIncremental(1+(m_sets[1].m_leaves*m_fupdates)/100);
+		m_fixedleft=b3Max<int>(0,m_fixedleft-count);
+	}
+	/* dynamic -> fixed set	*/ 
+	m_stageCurrent=(m_stageCurrent+1)%STAGECOUNT;
+	b3DbvtProxy*	current=m_stageRoots[m_stageCurrent];
+	if(current)
+	{
+		b3DbvtTreeCollider	collider(this);
+		do	{
+			// Fetch the successor before the proxy is re-linked.
+			b3DbvtProxy*	next=current->links[1];
+			b3ListRemove(current,m_stageRoots[current->stage]);
+			b3ListAppend(current,m_stageRoots[STAGECOUNT]);
+			m_paircache->removeOverlappingPairsContainingProxy(current,dispatcher);
+			collider.proxy=current;
+			b3DynamicBvh::collideTV(m_sets[0].m_root,current->aabb,collider);
+			b3DynamicBvh::collideTV(m_sets[1].m_root,current->aabb,collider);
+			m_sets[0].remove(current->leaf);
+			B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)	curAabb=b3DbvtVolume::FromMM(current->m_aabbMin,current->m_aabbMax);
+			current->leaf	=	m_sets[1].insert(curAabb,current);
+			current->stage	=	STAGECOUNT;	
+			current			=	next;
+		} while(current);
+		m_fixedleft=m_sets[1].m_leaves;
+		m_needcleanup=true;
+	}
+	/* collide dynamics		*/ 
+	{
+		b3DbvtTreeCollider	collider(this);
+		if(m_deferedcollide)
+		{
+			b3SPC(m_profiling.m_fdcollide);
+			m_sets[0].collideTTpersistentStack(m_sets[0].m_root,m_sets[1].m_root,collider);
+		}
+		if(m_deferedcollide)
+		{
+			b3SPC(m_profiling.m_ddcollide);
+			m_sets[0].collideTTpersistentStack(m_sets[0].m_root,m_sets[0].m_root,collider);
+		}
+	}
+	/* clean up				*/ 
+	if(m_needcleanup)
+	{
+		b3SPC(m_profiling.m_cleanup);
+		b3BroadphasePairArray&	pairs=m_paircache->getOverlappingPairArray();
+		if(pairs.size()>0)
+		{
+			// Check at most ni pairs per step, starting at cursor m_cid.
+			int			ni=b3Min(pairs.size(),b3Max<int>(m_newpairs,(pairs.size()*m_cupdates)/100));
+			for(int i=0;i<ni;++i)
+			{
+				b3BroadphasePair&	p=pairs[(m_cid+i)%pairs.size()];
+				b3DbvtProxy*		pa=&m_proxies[p.x];
+				b3DbvtProxy*		pb=&m_proxies[p.y];
+				if(!b3Intersect(pa->leaf->volume,pb->leaf->volume))
+				{
+					if(pa->m_uniqueId>pb->m_uniqueId) 
+						b3Swap(pa,pb);
+					m_paircache->removeOverlappingPair(pa->getUid(),pb->getUid(),dispatcher);
+					// Re-visit the same index and shorten the budget after
+					// a removal.
+					--ni;--i;
+				}
+			}
+			if(pairs.size()>0) m_cid=(m_cid+ni)%pairs.size(); else m_cid=0;
+		}
+	}
+	++m_pid;
+	m_newpairs=1;
+	m_needcleanup=false;
+	if(m_updates_call>0)
+	{ m_updates_ratio=m_updates_done/(b3Scalar)m_updates_call; }
+	else
+	{ m_updates_ratio=0; }
+	// Halve both counters each step so older updates weigh less.
+	m_updates_done/=2;
+	m_updates_call/=2;
+// Calls optimizeTopDown() on both trees (m_sets[0] and m_sets[1]).
+void							b3DynamicBvhBroadphase::optimize()
+	m_sets[0].optimizeTopDown();
+	m_sets[1].optimizeTopDown();
+// Returns the pair cache used by this broadphase (mutable access).
+b3OverlappingPairCache*			b3DynamicBvhBroadphase::getOverlappingPairCache()
+	return(m_paircache);
+// Returns the pair cache used by this broadphase (read-only access).
+const b3OverlappingPairCache*	b3DynamicBvhBroadphase::getOverlappingPairCache() const
+	return(m_paircache);
+// Returns the bounds of everything in the broadphase: the merged root
+// volumes when both trees are non-empty, the root volume of the only
+// non-empty tree otherwise, or the volume built by
+// FromCR(b3MakeVector3(0,0,0),0) when both trees are empty.
+void							b3DynamicBvhBroadphase::getBroadphaseAabb(b3Vector3& aabbMin,b3Vector3& aabbMax) const
+	B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)	bounds;
+	if(!m_sets[0].empty())
+		if(!m_sets[1].empty())	b3Merge(	m_sets[0].m_root->volume,
+			m_sets[1].m_root->volume,bounds);
+		else
+			bounds=m_sets[0].m_root->volume;
+	else if(!m_sets[1].empty())	bounds=m_sets[1].m_root->volume;
+	else
+		bounds=b3DbvtVolume::FromCR(b3MakeVector3(0,0,0),0);
+	aabbMin=bounds.Mins();
+	aabbMax=bounds.Maxs();
+// Restores the initial state, but only when neither tree holds any leaf:
+// clears both trees, resets the tuning counters and cursors to the values
+// set by the constructor and empties all stage list heads. m_prediction and
+// the pair cache are not touched. `dispatcher` is not used here.
+void b3DynamicBvhBroadphase::resetPool(b3Dispatcher* dispatcher)
+	int totalObjects = m_sets[0].m_leaves + m_sets[1].m_leaves;
+	if (!totalObjects)
+	{
+		//reset internal dynamic tree data structures
+		m_sets[0].clear();
+		m_sets[1].clear();
+		m_deferedcollide	=	false;
+		m_needcleanup		=	true;
+		m_stageCurrent		=	0;
+		m_fixedleft			=	0;
+		m_fupdates			=	1;
+		m_dupdates			=	0;
+		m_cupdates			=	10;
+		m_newpairs			=	1;
+		m_updates_call		=	0;
+		m_updates_done		=	0;
+		m_updates_ratio		=	0;
+		m_pid				=	0;
+		m_cid				=	0;
+		for(int i=0;i<=STAGECOUNT;++i)
+		{
+			m_stageRoots[i]=0;
+		}
+	}
+// Statistics hook; no statements of its body are visible in this chunk.
+void							b3DynamicBvhBroadphase::printStats()
+/// Helper types and utilities used by b3DynamicBvhBroadphase::benchmark().
+struct	b3BroadphaseBenchmark
+	/// Parameters describing one benchmark run.
+	struct	Experiment
+	{
+		const char*			name;			// label printed in the report
+		int					object_count;	// number of objects created
+		int					update_count;	// percentage of object_count moved per iteration
+		int					spawn_count;	// percentage of object_count (only printed by benchmark())
+		int					iterations;		// number of timed update iterations
+		b3Scalar			speed;			// time increment applied by Object::update
+		b3Scalar			amplitude;		// scale of the motion in Object::update
+	};
+	/// One moving box registered with the broadphase under test.
+	struct	Object
+	{
+		b3Vector3			center;
+		b3Vector3			extents;
+		b3BroadphaseProxy*	proxy;
+		b3Scalar			time;
+		/// Advances time by speed, recomputes center from sine/cosine terms scaled by
+		/// amplitude and pushes the new box (center -/+ extents) to the broadphase.
+		void				update(b3Scalar speed,b3Scalar amplitude,b3BroadphaseInterface* pbi)
+		{
+			time		+=	speed;
+			center[0]	=	b3Cos(time*(b3Scalar)2.17)*amplitude+
+				b3Sin(time)*amplitude/2;
+			center[1]	=	b3Cos(time*(b3Scalar)1.38)*amplitude+
+				b3Sin(time)*amplitude;
+			center[2]	=	b3Sin(time*(b3Scalar)0.777)*amplitude;
+			pbi->setAabb(proxy,center-extents,center+extents,0);
+		}
+	};
+	/// Returns rand() reduced to the inclusive range [0, range].
+	static int		UnsignedRand(int range=RAND_MAX-1)	{ return(rand()%(range+1)); }
+	/// Returns a pseudo-random value in [0, 1] with a granularity of 1/16384.
+	static b3Scalar	UnitRand()							{ return(UnsignedRand(16384)/(b3Scalar)16384); }
+	/// Prints the elapsed time of clock c in microseconds and rounded milliseconds and, when
+	/// count is non-zero, the rate count/seconds.
+	static void		OutputTime(const char* name,b3Clock& c,unsigned count=0)
+	{
+		const unsigned long	us=c.getTimeMicroseconds();
+		const unsigned long	ms=(us+500)/1000;
+		const b3Scalar		sec=us/(b3Scalar)(1000*1000);
+		// us and ms are unsigned long, so they need %lu; %u mismatches the argument type.
+		if(count>0)
+			printf("%s : %lu us (%lu ms), %.2f/s\r\n",name,us,ms,count/sec);
+		else
+			printf("%s : %lu us (%lu ms)\r\n",name,us,ms);
+	}
+/// Runs the built-in timing experiments against the broadphase pbi and prints the results.
+/// Each experiment creates its objects, moves all of them once, then for every iteration moves
+/// the first update_count objects and recomputes the overlapping pairs, and finally destroys
+/// everything again. The random seed is fixed so every run sees the same object layout.
+void							b3DynamicBvhBroadphase::benchmark(b3BroadphaseInterface* pbi)
+	typedef b3BroadphaseBenchmark::Experiment	Experiment;
+	typedef b3BroadphaseBenchmark::Object		Object;
+	static const Experiment		experiments[]=
+	{
+		{"1024o.10%",1024,10,0,8192,(b3Scalar)0.005,(b3Scalar)100},
+		/*{"4096o.10%",4096,10,0,8192,(b3Scalar)0.005,(b3Scalar)100},
+		{"8192o.10%",8192,10,0,8192,(b3Scalar)0.005,(b3Scalar)100},*/
+	};
+	static const int			nexperiments=sizeof(experiments)/sizeof(experiments[0]);
+	b3AlignedObjectArray<Object*>	objects;
+	b3Clock							timer;
+	for(int iexp=0;iexp<nexperiments;++iexp)
+	{
+		const Experiment&	cfg=experiments[iexp];
+		const int			object_count=cfg.object_count;
+		// update_count and spawn_count are stored as percentages of object_count.
+		const int			update_count=(object_count*cfg.update_count)/100;
+		const int			spawn_count=(object_count*cfg.spawn_count)/100;
+		const b3Scalar		speed=cfg.speed;
+		const b3Scalar		amplitude=cfg.amplitude;
+		printf("Experiment #%u '%s':\r\n",iexp,cfg.name);
+		printf("\tObjects: %u\r\n",object_count);
+		printf("\tUpdate: %u\r\n",update_count);
+		printf("\tSpawn: %u\r\n",spawn_count);
+		printf("\tSpeed: %f\r\n",speed);
+		printf("\tAmplitude: %f\r\n",amplitude);
+		srand(180673);
+		// Phase 1: create the objects and register them with the broadphase.
+		timer.reset();
+		objects.reserve(object_count);
+		for(int n=0;n<object_count;++n)
+		{
+			Object*	obj=new b3BroadphaseBenchmark::Object();
+			// Random draws happen in the order center[0..2], extents[0..2], time.
+			for(int axis=0;axis<3;++axis)
+				obj->center[axis]=b3BroadphaseBenchmark::UnitRand()*50;
+			for(int axis=0;axis<3;++axis)
+				obj->extents[axis]=b3BroadphaseBenchmark::UnitRand()*2+2;
+			obj->time=b3BroadphaseBenchmark::UnitRand()*2000;
+			obj->proxy=pbi->createProxy(obj->center-obj->extents,obj->center+obj->extents,0,obj,1,1,0,0);
+			objects.push_back(obj);
+		}
+		b3BroadphaseBenchmark::OutputTime("\tInitialization",timer);
+		// Phase 2: move every object once.
+		timer.reset();
+		for(int n=0;n<objects.size();++n)
+			objects[n]->update(speed,amplitude,pbi);
+		b3BroadphaseBenchmark::OutputTime("\tFirst update",timer);
+		// Phase 3: repeatedly move the first update_count objects and refresh the pairs.
+		timer.reset();
+		for(int iter=0;iter<cfg.iterations;++iter)
+		{
+			for(int n=0;n<update_count;++n)
+				objects[n]->update(speed,amplitude,pbi);
+			pbi->calculateOverlappingPairs(0);
+		}
+		b3BroadphaseBenchmark::OutputTime("\tUpdate",timer,cfg.iterations);
+		// Phase 4: unregister and free every object.
+		timer.reset();
+		for(int n=0;n<objects.size();++n)
+		{
+			Object*	obj=objects[n];
+			pbi->destroyProxy(obj->proxy,0);
+			delete obj;
+		}
+		objects.resize(0);
+		b3BroadphaseBenchmark::OutputTime("\tRelease",timer);
+	}
+/*void							b3DynamicBvhBroadphase::benchmark(b3BroadphaseInterface*)
+#undef	b3SPC
diff --git a/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h
new file mode 100644
index 00000000..74e6ef04
--- /dev/null
+++ b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h
@@ -0,0 +1,208 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///b3DynamicBvhBroadphase implementation by Nathanael Presson
+#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.h"
+#include "Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "b3BroadphaseCallback.h"
+// Compile time config
+#define	B3_DBVT_BP_PROFILE					0
+//#define B3_DBVT_BP_SORTPAIRS				1
+#define B3_DBVT_BP_MARGIN					(b3Scalar)0.05
+/// Per-object record kept by a broadphase: client pointer, collision filter bits, unique id and
+/// an axis-aligned bounding box.
+B3_ATTRIBUTE_ALIGNED16(struct) b3BroadphaseProxy
+	/// Optional filtering bits used to cull potential collisions.
+	enum CollisionFilterGroups
+	{
+		DefaultFilter	= 1 << 0,
+		StaticFilter	= 1 << 1,
+		KinematicFilter	= 1 << 2,
+		DebrisFilter	= 1 << 3,
+		SensorTrigger	= 1 << 4,
+		CharacterFilter	= 1 << 5,
+		AllFilter		= -1	// every bit set, i.e. all of the filters above combined
+	};
+	// Usually the client b3CollisionObject or rigid body class.
+	void*	m_clientObject;
+	short int m_collisionFilterGroup;
+	short int m_collisionFilterMask;
+	void*	m_multiSapParentProxy;
+	// Introduced for the pair cache; could be removed by deriving it from the address offset.
+	int			m_uniqueId;
+	b3Vector3	m_aabbMin;
+	b3Vector3	m_aabbMax;
+	/// Returns m_uniqueId.
+	B3_FORCE_INLINE int getUid() const
+	{
+		return m_uniqueId;
+	}
+	/// Used for memory pools. Only the two pointer members are zeroed; the filter bits,
+	/// m_uniqueId and the AABB are not assigned here.
+	b3BroadphaseProxy()
+		:m_clientObject(0),
+		m_multiSapParentProxy(0)
+	{
+	}
+	/// Stores the AABB, client pointer, filter bits and optional parent proxy.
+	/// m_uniqueId is not assigned by this constructor.
+	b3BroadphaseProxy(const b3Vector3& aabbMin,const b3Vector3& aabbMax,void* userPtr,short int collisionFilterGroup, short int collisionFilterMask,void* multiSapParentProxy=0)
+		:m_clientObject(userPtr),
+		m_collisionFilterGroup(collisionFilterGroup),
+		m_collisionFilterMask(collisionFilterMask),
+		m_multiSapParentProxy(multiSapParentProxy),
+		m_aabbMin(aabbMin),
+		m_aabbMax(aabbMax)
+	{
+	}
+// b3DbvtProxy
+/// Broadphase proxy extended with the bookkeeping b3DynamicBvhBroadphase needs.
+struct b3DbvtProxy : b3BroadphaseProxy
+	//b3DbvtAabbMm	aabb;
+	b3DbvtNode*		leaf;		// tree node whose volume represents this proxy
+	b3DbvtProxy*	links[2];	// NOTE(review): presumably neighbours in a stage list - confirm
+	int				stage;		// NOTE(review): presumably an index into m_stageRoots - confirm
+	/// Leaves every field unassigned.
+	explicit b3DbvtProxy() {}
+	/// Forwards all arguments to b3BroadphaseProxy and clears both links;
+	/// leaf and stage are not assigned here.
+	b3DbvtProxy(const b3Vector3& aabbMin,const b3Vector3& aabbMax,void* userPtr,short int collisionFilterGroup, short int collisionFilterMask)
+		:b3BroadphaseProxy(aabbMin,aabbMax,userPtr,collisionFilterGroup,collisionFilterMask)
+	{
+		links[0]=0;
+		links[1]=0;
+	}
+typedef b3AlignedObjectArray<b3DbvtProxy*>	b3DbvtProxyArray;
+///The b3DynamicBvhBroadphase implements a broadphase using two dynamic AABB bounding volume hierarchies/trees (see b3DynamicBvh).
+///One tree is used for static/non-moving objects, and another tree is used for dynamic objects. Objects can move from one tree to the other.
+///This is a very fast broadphase, especially for very dynamic worlds where many objects are moving. Its insert/add and remove of objects is generally faster than the sweep and prune broadphases b3AxisSweep3 and b332BitAxisSweep3.
+struct	b3DynamicBvhBroadphase 
+	/* Config		*/ 
+	enum	{
+		DYNAMIC_SET			=	0,	/* Dynamic set index	*/ 
+		FIXED_SET			=	1,	/* Fixed set index		*/ 
+		STAGECOUNT			=	2	/* Number of stages		*/ 
+	};
+	/* Fields		*/ 
+	b3DynamicBvh					m_sets[2];					// Dbvt sets
+	b3DbvtProxy*			m_stageRoots[STAGECOUNT+1];	// Stages list
+	b3AlignedObjectArray<b3DbvtProxy>	m_proxies;
+	b3OverlappingPairCache*	m_paircache;				// Pair cache
+	b3Scalar				m_prediction;				// Velocity prediction
+	int						m_stageCurrent;				// Current stage
+	int						m_fupdates;					// % of fixed updates per frame
+	int						m_dupdates;					// % of dynamic updates per frame
+	int						m_cupdates;					// % of cleanup updates per frame
+	int						m_newpairs;					// Number of pairs created
+	int						m_fixedleft;				// Fixed optimization left
+	unsigned				m_updates_call;				// Number of updates call
+	unsigned				m_updates_done;				// Number of updates done
+	b3Scalar				m_updates_ratio;			// m_updates_done/m_updates_call
+	int						m_pid;						// Parse id
+	int						m_cid;						// Cleanup index
+	bool					m_releasepaircache;			// Release pair cache on delete
+	bool					m_deferedcollide;			// Defere dynamic/static collision to collide call
+	bool					m_needcleanup;				// Need to run cleanup?
+	b3Clock					m_clock;
+	struct	{
+		unsigned long		m_total;
+		unsigned long		m_ddcollide;
+		unsigned long		m_fdcollide;
+		unsigned long		m_cleanup;
+		unsigned long		m_jobcount;
+	}				m_profiling;
+	/* Methods		*/ 
+	b3DynamicBvhBroadphase(int proxyCapacity, b3OverlappingPairCache* paircache=0);
+	~b3DynamicBvhBroadphase();
+	void							collide(b3Dispatcher* dispatcher);
+	void							optimize();
+	/* b3BroadphaseInterface Implementation	*/
+	b3BroadphaseProxy*				createProxy(const b3Vector3& aabbMin,const b3Vector3& aabbMax,int objectIndex,void* userPtr,short int collisionFilterGroup,short int collisionFilterMask);
+	virtual void					destroyProxy(b3BroadphaseProxy* proxy,b3Dispatcher* dispatcher);
+	virtual void					setAabb(int objectId,const b3Vector3& aabbMin,const b3Vector3& aabbMax,b3Dispatcher* dispatcher);
+	virtual void					rayTest(const b3Vector3& rayFrom,const b3Vector3& rayTo, b3BroadphaseRayCallback& rayCallback, const b3Vector3& aabbMin=b3MakeVector3(0,0,0), const b3Vector3& aabbMax = b3MakeVector3(0,0,0));
+	virtual void					aabbTest(const b3Vector3& aabbMin, const b3Vector3& aabbMax, b3BroadphaseAabbCallback& callback);
+	//virtual void					getAabb(b3BroadphaseProxy* proxy,b3Vector3& aabbMin, b3Vector3& aabbMax ) const;
+	virtual void					getAabb(int objectId,b3Vector3& aabbMin, b3Vector3& aabbMax ) const;
+	virtual	void					calculateOverlappingPairs(b3Dispatcher* dispatcher=0);
+	virtual	b3OverlappingPairCache*	getOverlappingPairCache();
+	virtual	const b3OverlappingPairCache*	getOverlappingPairCache() const;
+	virtual	void					getBroadphaseAabb(b3Vector3& aabbMin,b3Vector3& aabbMax) const;
+	virtual	void					printStats();
+	///reset broadphase internal structures, to ensure determinism/reproducability
+	virtual void resetPool(b3Dispatcher* dispatcher);
+	void	performDeferredRemoval(b3Dispatcher* dispatcher);
+	void	setVelocityPrediction(b3Scalar prediction)
+	{
+		m_prediction = prediction;
+	}
+	b3Scalar getVelocityPrediction() const
+	{
+		return m_prediction;
+	}
+	///this setAabbForceUpdate is similar to setAabb but always forces the aabb update. 
+	///it is not part of the b3BroadphaseInterface but specific to b3DynamicBvhBroadphase.
+	///it bypasses certain optimizations that prevent aabb updates (when the aabb shrinks), see
+	///http://code.google.com/p/bullet/issues/detail?id=223
+	void							setAabbForceUpdate(		b3BroadphaseProxy* absproxy,const b3Vector3& aabbMin,const b3Vector3& aabbMax,b3Dispatcher* /*dispatcher*/);
+	//static void						benchmark(b3BroadphaseInterface*);
diff --git a/src/bullet/Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h
new file mode 100644
index 00000000..39bf27de
--- /dev/null
+++ b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h
@@ -0,0 +1,72 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/shared/b3Int4.h"
+#define B3_NEW_PAIR_MARKER -1
+typedef b3Int4 b3BroadphasePair;
+/// Builds a pair whose x holds the smaller and y the larger of the two ids, with z and w set
+/// to B3_NEW_PAIR_MARKER.
+inline b3Int4 b3MakeBroadphasePair(int xx,int yy)
+	const bool ascending = xx < yy;
+	b3Int4 pair;
+	pair.x = ascending ? xx : yy;
+	pair.y = ascending ? yy : xx;
+	pair.z = B3_NEW_PAIR_MARKER;
+	pair.w = B3_NEW_PAIR_MARKER;
+	return pair;
+/*struct b3BroadphasePair : public b3Int4
+	explicit b3BroadphasePair(){}
+/// Ordering functor for pairs: descending by x, ties broken by descending y.
+class b3BroadphasePairSortPredicate
+	public:
+		bool operator() ( const b3BroadphasePair& a, const b3BroadphasePair& b ) const
+		{
+			if (a.x != b.x)
+			{
+				return a.x > b.x;
+			}
+			return a.y > b.y;
+		}
+/// Two pairs are equal when both x and y match; z and w are not compared.
+B3_FORCE_INLINE bool operator==(const b3BroadphasePair& a, const b3BroadphasePair& b) 
+	 const bool sameX = (a.x == b.x);
+	 const bool sameY = (a.y == b.y);
+	 return sameX && sameY;
diff --git a/src/bullet/Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.cpp b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.cpp
new file mode 100644
index 00000000..b5061939
--- /dev/null
+++ b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.cpp
@@ -0,0 +1,638 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3OverlappingPairCache.h"
+//#include "b3Dispatcher.h"
+//#include "b3CollisionAlgorithm.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+#include <stdio.h>
+int	b3g_overlappingPairs = 0;
+int b3g_removePairs =0;
+int b3g_addedPairs =0;
+int b3g_findPairs =0;
+	m_overlapFilterCallback(0),
+	m_blockedForChanges(false)
+	int initialAllocatedSize= 2;
+	m_overlappingPairArray.reserve(initialAllocatedSize);
+	growTables();
+/// Hook invoked before a pair is removed or when a proxy is cleaned from its pairs.
+/// It currently does nothing: the code releasing a per-pair collision algorithm is commented out below.
+void	b3HashedOverlappingPairCache::cleanOverlappingPair(b3BroadphasePair& pair,b3Dispatcher* dispatcher)
+/*	if (pair.m_algorithm)
+	{
+		{
+			pair.m_algorithm->~b3CollisionAlgorithm();
+			dispatcher->freeCollisionAlgorithm(pair.m_algorithm);
+			pair.m_algorithm=0;
+		}
+	}
+	*/
+/// Calls cleanOverlappingPair() on every cached pair that references the given proxy id.
+/// No pair is removed, because the visitor always answers false.
+void	b3HashedOverlappingPairCache::cleanProxyFromPairs(int proxy,b3Dispatcher* dispatcher)
+	class	CleanPairCallback : public b3OverlapCallback
+	{
+		int						m_targetProxy;
+		b3OverlappingPairCache*	m_cache;
+		b3Dispatcher*			m_pairDispatcher;
+	public:
+		CleanPairCallback(int targetProxy,b3OverlappingPairCache* cache,b3Dispatcher* pairDispatcher)
+			:m_targetProxy(targetProxy),
+			m_cache(cache),
+			m_pairDispatcher(pairDispatcher)
+		{
+		}
+		virtual	bool	processOverlap(b3BroadphasePair& pair)
+		{
+			const bool touchesProxy=(pair.x == m_targetProxy) || (pair.y == m_targetProxy);
+			if (touchesProxy)
+			{
+				m_cache->cleanOverlappingPair(pair,m_pairDispatcher);
+			}
+			// false: keep the pair in the cache
+			return false;
+		}
+	};
+	CleanPairCallback visitor(proxy,this,dispatcher);
+	processAllOverlappingPairs(&visitor,dispatcher);
+/// Removes every cached pair that references the given proxy id.
+void	b3HashedOverlappingPairCache::removeOverlappingPairsContainingProxy(int proxy,b3Dispatcher* dispatcher)
+	class	RemovePairCallback : public b3OverlapCallback
+	{
+		int m_staleProxy;
+	public:
+		RemovePairCallback(int staleProxy)
+			:m_staleProxy(staleProxy)
+		{
+		}
+		// Answering true asks processAllOverlappingPairs to delete the pair.
+		virtual	bool	processOverlap(b3BroadphasePair& pair)
+		{
+			if (pair.x == m_staleProxy)
+				return true;
+			return pair.y == m_staleProxy;
+		}
+	};
+	RemovePairCallback visitor(proxy);
+	processAllOverlappingPairs(&visitor,dispatcher);
+b3BroadphasePair* b3HashedOverlappingPairCache::findPair(int proxy0, int proxy1)
+	b3g_findPairs++;
+	if(proxy0 >proxy1) 
+		b3Swap(proxy0,proxy1);
+	int proxyId1 = proxy0;
+	int proxyId2 = proxy1;
+	/*if (proxyId1 > proxyId2) 
+		b3Swap(proxyId1, proxyId2);*/
+	int hash = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1), static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity()-1));
+	if (hash >= m_hashTable.size())
+	{
+		return NULL;
+	}
+	int index = m_hashTable[hash];
+	while (index != B3_NULL_PAIR && equalsPair(m_overlappingPairArray[index], proxyId1, proxyId2) == false)
+	{
+		index = m_next[index];
+	}
+	if (index == B3_NULL_PAIR)
+	{
+		return NULL;
+	}
+	b3Assert(index < m_overlappingPairArray.size());
+	return &m_overlappingPairArray[index];
+//#include <stdio.h>
+void	b3HashedOverlappingPairCache::growTables()
+	int newCapacity = m_overlappingPairArray.capacity();
+	if (m_hashTable.size() < newCapacity)
+	{
+		//grow hashtable and next table
+		int curHashtableSize = m_hashTable.size();
+		m_hashTable.resize(newCapacity);
+		m_next.resize(newCapacity);
+		int i;
+		for (i= 0; i < newCapacity; ++i)
+		{
+			m_hashTable[i] = B3_NULL_PAIR;
+		}
+		for (i = 0; i < newCapacity; ++i)
+		{
+			m_next[i] = B3_NULL_PAIR;
+		}
+		for(i=0;i<curHashtableSize;i++)
+		{
+			const b3BroadphasePair& pair = m_overlappingPairArray[i];
+			int proxyId1 = pair.x;
+			int proxyId2 = pair.y;
+			/*if (proxyId1 > proxyId2) 
+				b3Swap(proxyId1, proxyId2);*/
+			int	hashValue = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1),static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity()-1));	// New hash value with new mask
+			m_next[i] = m_hashTable[hashValue];
+			m_hashTable[hashValue] = i;
+		}
+	}
+/// Returns the cached pair for the two proxy ids, inserting it at the end of the pair array
+/// first when it is not cached yet. The ids may be given in either order.
+b3BroadphasePair* b3HashedOverlappingPairCache::internalAddPair(int proxy0, int proxy1)
+	if(proxy0>proxy1) 
+		b3Swap(proxy0,proxy1);
+	const unsigned int lowKey = static_cast<unsigned int>(proxy0);
+	const unsigned int highKey = static_cast<unsigned int>(proxy1);
+	int	bucket = static_cast<int>(getHash(lowKey,highKey) & (m_overlappingPairArray.capacity()-1));
+	b3BroadphasePair* existing = internalFindPair(proxy0, proxy1, bucket);
+	if (existing != NULL)
+	{
+		return existing;
+	}
+	const int newIndex = m_overlappingPairArray.size();
+	const int capacityBefore = m_overlappingPairArray.capacity();
+	b3BroadphasePair* added = &m_overlappingPairArray.expandNonInitializing();
+	// This is where an actual pair is added; the 'ghost' pair callback hook is disabled here.
+	const int capacityAfter = m_overlappingPairArray.capacity();
+	if (capacityBefore < capacityAfter)
+	{
+		growTables();
+		// The mask changed with the capacity, so the bucket has to be recomputed.
+		bucket = static_cast<int>(getHash(lowKey,highKey) & (m_overlappingPairArray.capacity()-1));
+	}
+	*added = b3MakeBroadphasePair(proxy0,proxy1);
+	// Push the new pair at the head of its bucket's chain.
+	m_next[newIndex] = m_hashTable[bucket];
+	m_hashTable[bucket] = newIndex;
+	return added;
+/// Removes the pair for the two proxy ids (given in either order) from the cache.
+/// The pair is unlinked from its hash chain, then the last pair of the array is moved into the
+/// freed slot and re-linked so the pair array stays dense. Always returns 0, also when the pair
+/// is not cached.
+void* b3HashedOverlappingPairCache::removeOverlappingPair(int proxy0, int proxy1,b3Dispatcher* dispatcher)
+	b3g_removePairs++;
+	// Pairs are keyed with the smaller id first.
+	if(proxy0>proxy1) 
+		b3Swap(proxy0,proxy1);
+	int proxyId1 = proxy0;
+	int proxyId2 = proxy1;
+	/*if (proxyId1 > proxyId2) 
+		b3Swap(proxyId1, proxyId2);*/
+	int	hash = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1),static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity()-1));
+	b3BroadphasePair* pair = internalFindPair(proxy0, proxy1, hash);
+	if (pair == NULL)
+	{
+		return 0;
+	}
+	cleanOverlappingPair(*pair,dispatcher);
+	// Index of the pair inside the pair array, derived from its address.
+	int pairIndex = int(pair - &m_overlappingPairArray[0]);
+	b3Assert(pairIndex < m_overlappingPairArray.size());
+	// Remove the pair from the hash table.
+	int index = m_hashTable[hash];
+	b3Assert(index != B3_NULL_PAIR);
+	int previous = B3_NULL_PAIR;
+	while (index != pairIndex)
+	{
+		previous = index;
+		index = m_next[index];
+	}
+	// Unlink: either bypass the pair in the chain or, if it was the head, advance the bucket.
+	if (previous != B3_NULL_PAIR)
+	{
+		b3Assert(m_next[previous] == pairIndex);
+		m_next[previous] = m_next[pairIndex];
+	}
+	else
+	{
+		m_hashTable[hash] = m_next[pairIndex];
+	}
+	// We now move the last pair into spot of the
+	// pair being removed. We need to fix the hash
+	// table indices to support the move.
+	int lastPairIndex = m_overlappingPairArray.size() - 1;
+	//if (m_ghostPairCallback)
+	//	m_ghostPairCallback->removeOverlappingPair(proxy0, proxy1,dispatcher);
+	// If the removed pair is the last pair, we are done.
+	if (lastPairIndex == pairIndex)
+	{
+		m_overlappingPairArray.pop_back();
+		return 0;
+	}
+	// Remove the last pair from the hash table.
+	const b3BroadphasePair* last = &m_overlappingPairArray[lastPairIndex];
+		/* missing swap here too, Nat. */ 
+	int lastHash = static_cast<int>(getHash(static_cast<unsigned int>(last->x), static_cast<unsigned int>(last->y)) & (m_overlappingPairArray.capacity()-1));
+	index = m_hashTable[lastHash];
+	b3Assert(index != B3_NULL_PAIR);
+	previous = B3_NULL_PAIR;
+	while (index != lastPairIndex)
+	{
+		previous = index;
+		index = m_next[index];
+	}
+	if (previous != B3_NULL_PAIR)
+	{
+		b3Assert(m_next[previous] == lastPairIndex);
+		m_next[previous] = m_next[lastPairIndex];
+	}
+	else
+	{
+		m_hashTable[lastHash] = m_next[lastPairIndex];
+	}
+	// Copy the last pair into the remove pair's spot.
+	m_overlappingPairArray[pairIndex] = m_overlappingPairArray[lastPairIndex];
+	// Insert the last pair into the hash table
+	m_next[pairIndex] = m_hashTable[lastHash];
+	m_hashTable[lastHash] = pairIndex;
+	m_overlappingPairArray.pop_back();
+	return 0;
+//#include <stdio.h>
+/// Visits every cached pair with the callback and removes the pairs for which it returns true.
+void	b3HashedOverlappingPairCache::processAllOverlappingPairs(b3OverlapCallback* callback,b3Dispatcher* dispatcher)
+	int cursor = 0;
+	while (cursor < m_overlappingPairArray.size())
+	{
+		b3BroadphasePair* current = &m_overlappingPairArray[cursor];
+		if (!callback->processOverlap(*current))
+		{
+			++cursor;
+			continue;
+		}
+		// Removal moves the last pair into this slot, so the cursor stays where it is.
+		removeOverlappingPair(current->x,current->y,dispatcher);
+		b3g_overlappingPairs--;
+	}
+void	b3HashedOverlappingPairCache::sortOverlappingPairs(b3Dispatcher* dispatcher)
+	///need to keep hashmap in sync with pair address, so rebuild all
+	b3BroadphasePairArray tmpPairs;
+	int i;
+	for (i=0;i<m_overlappingPairArray.size();i++)
+	{
+		tmpPairs.push_back(m_overlappingPairArray[i]);
+	}
+	for (i=0;i<tmpPairs.size();i++)
+	{
+		removeOverlappingPair(tmpPairs[i].x,tmpPairs[i].y,dispatcher);
+	}
+	for (i = 0; i < m_next.size(); i++)
+	{
+		m_next[i] = B3_NULL_PAIR;
+	}
+	tmpPairs.quickSort(b3BroadphasePairSortPredicate());
+	for (i=0;i<tmpPairs.size();i++)
+	{
+		addOverlappingPair(tmpPairs[i].x ,tmpPairs[i].y);
+	}
+/// Removes the pair for the two proxy ids from the cache right away, unless deferred removal
+/// is enabled (then nothing happens here). The pair is located by linear search, swapped with
+/// the last stored pair and popped off the array.
+/// @return always 0
+void*	b3SortedOverlappingPairCache::removeOverlappingPair(int proxy0,int proxy1, b3Dispatcher* dispatcher )
+	if (!hasDeferredRemoval())
+	{
+		b3BroadphasePair findPair = b3MakeBroadphasePair(proxy0,proxy1);
+		int findIndex = m_overlappingPairArray.findLinearSearch(findPair);
+		if (findIndex < m_overlappingPairArray.size())
+		{
+			b3g_overlappingPairs--;
+			b3BroadphasePair& pair = m_overlappingPairArray[findIndex];
+			cleanOverlappingPair(pair,dispatcher);
+			//if (m_ghostPairCallback)
+			//	m_ghostPairCallback->removeOverlappingPair(proxy0, proxy1,dispatcher);
+			// Swap with the last *stored* pair (size()-1), which pop_back() then discards.
+			// Using capacity()-1 would swap with unused storage whenever capacity()>size(),
+			// so pop_back() would drop a valid pair and leave garbage at findIndex.
+			m_overlappingPairArray.swap(findIndex,m_overlappingPairArray.size()-1);
+			m_overlappingPairArray.pop_back();
+			return 0;
+		}
+	}
+	return 0;
+/// Appends a pair for the two proxy ids to the cache when needsBroadphaseCollision() accepts it.
+/// No duplicate check is made here.
+/// @return pointer to the new pair, or 0 when the pair was filtered out
+b3BroadphasePair*	b3SortedOverlappingPairCache::addOverlappingPair(int proxy0,int proxy1)
+	// A proxy must never be paired with itself.
+	b3Assert(proxy0 != proxy1);
+	const bool wanted = needsBroadphaseCollision(proxy0,proxy1);
+	if (!wanted)
+	{
+		return 0;
+	}
+	b3BroadphasePair& slot = m_overlappingPairArray.expandNonInitializing();
+	slot = b3MakeBroadphasePair(proxy0,proxy1);
+	b3g_overlappingPairs++;
+	b3g_addedPairs++;
+	return &slot;
+///this findPair becomes really slow. Either sort the list to speedup the query, or
+///use a different solution. It is mainly used for Removing overlapping pairs. Removal could be delayed.
+///we could keep a linked list in each proxy, and store pair in one of the proxies (with lowest memory address)
+///Also we can use a 2D bitmap, which can be useful for a future GPU implementation
+ b3BroadphasePair*	b3SortedOverlappingPairCache::findPair(int proxy0,int proxy1)
+	// Pairs rejected by the filter are never stored, so they cannot be found either.
+	if (!needsBroadphaseCollision(proxy0,proxy1))
+		return 0;
+	const b3BroadphasePair wanted = b3MakeBroadphasePair(proxy0,proxy1);
+	const int position = m_overlappingPairArray.findLinearSearch(wanted);
+	// A position equal to size() means the linear search found nothing.
+	if (position >= m_overlappingPairArray.size())
+	{
+		return 0;
+	}
+	return &m_overlappingPairArray[position];
+//#include <stdio.h>
+/// Visits every cached pair with the callback and removes the pairs for which it returns true.
+void	b3SortedOverlappingPairCache::processAllOverlappingPairs(b3OverlapCallback* callback,b3Dispatcher* dispatcher)
+	int cursor = 0;
+	while (cursor < m_overlappingPairArray.size())
+	{
+		b3BroadphasePair& current = m_overlappingPairArray[cursor];
+		if (!callback->processOverlap(current))
+		{
+			++cursor;
+			continue;
+		}
+		cleanOverlappingPair(current,dispatcher);
+		// Mark the pair invalid, move it to the end and drop it; the pair swapped into this
+		// slot is examined on the next pass because the cursor does not advance.
+		current.x = -1;
+		current.y = -1;
+		m_overlappingPairArray.swap(cursor,m_overlappingPairArray.size()-1);
+		m_overlappingPairArray.pop_back();
+		b3g_overlappingPairs--;
+	}
+	m_blockedForChanges(false),
+	m_hasDeferredRemoval(true),
+	m_overlapFilterCallback(0)
+	int initialAllocatedSize= 2;
+	m_overlappingPairArray.reserve(initialAllocatedSize);
+/// Hook invoked before a pair is removed or when a proxy is cleaned from its pairs.
+/// It currently does nothing: the code releasing a per-pair collision algorithm is commented out below.
+void	b3SortedOverlappingPairCache::cleanOverlappingPair(b3BroadphasePair& pair,b3Dispatcher* dispatcher)
+/*	if (pair.m_algorithm)
+	{
+		{
+			pair.m_algorithm->~b3CollisionAlgorithm();
+			dispatcher->freeCollisionAlgorithm(pair.m_algorithm);
+			pair.m_algorithm=0;
+			b3g_removePairs--;
+		}
+	}
+	*/
+/// Calls cleanOverlappingPair() on every cached pair that references the given proxy id.
+/// No pair is removed, because the visitor always answers false.
+void	b3SortedOverlappingPairCache::cleanProxyFromPairs(int proxy,b3Dispatcher* dispatcher)
+	class	CleanPairCallback : public b3OverlapCallback
+	{
+		int						m_targetProxy;
+		b3OverlappingPairCache*	m_cache;
+		b3Dispatcher*			m_pairDispatcher;
+	public:
+		CleanPairCallback(int targetProxy,b3OverlappingPairCache* cache,b3Dispatcher* pairDispatcher)
+			:m_targetProxy(targetProxy),
+			m_cache(cache),
+			m_pairDispatcher(pairDispatcher)
+		{
+		}
+		virtual	bool	processOverlap(b3BroadphasePair& pair)
+		{
+			const bool touchesProxy=(pair.x == m_targetProxy) || (pair.y == m_targetProxy);
+			if (touchesProxy)
+			{
+				m_cache->cleanOverlappingPair(pair,m_pairDispatcher);
+			}
+			// false: keep the pair in the cache
+			return false;
+		}
+	};
+	CleanPairCallback visitor(proxy,this,dispatcher);
+	processAllOverlappingPairs(&visitor,dispatcher);
+/// Removes every cached pair that references the given proxy id.
+void	b3SortedOverlappingPairCache::removeOverlappingPairsContainingProxy(int proxy,b3Dispatcher* dispatcher)
+	class	RemovePairCallback : public b3OverlapCallback
+	{
+		int m_staleProxy;
+	public:
+		RemovePairCallback(int staleProxy)
+			:m_staleProxy(staleProxy)
+		{
+		}
+		// Answering true asks processAllOverlappingPairs to delete the pair.
+		virtual	bool	processOverlap(b3BroadphasePair& pair)
+		{
+			if (pair.x == m_staleProxy)
+				return true;
+			return pair.y == m_staleProxy;
+		}
+	};
+	RemovePairCallback visitor(proxy);
+	processAllOverlappingPairs(&visitor,dispatcher);
+/// Intentionally does nothing; the dispatcher argument is unused.
+void	b3SortedOverlappingPairCache::sortOverlappingPairs(b3Dispatcher* dispatcher)
+	//should already be sorted
diff --git a/src/bullet/Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.h b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.h
new file mode 100644
index 00000000..ae0799fb
--- /dev/null
+++ b/src/bullet/Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.h
@@ -0,0 +1,474 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/shared/b3Int2.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+class b3Dispatcher;
+#include "b3OverlappingPair.h"
+typedef b3AlignedObjectArray<b3BroadphasePair>	b3BroadphasePairArray;	// array type used by the pair caches below to store pairs
+struct	b3OverlapCallback	// visitor interface passed to processAllOverlappingPairs
+	virtual ~b3OverlapCallback()
+	{}
+	// Called with a cached pair; return true to request deletion of that pair.
+	virtual bool	processOverlap(b3BroadphasePair& pair) = 0;
+struct b3OverlapFilterCallback	// optional user filter consulted by needsBroadphaseCollision in the caches below
+	virtual ~b3OverlapFilterCallback()
+	{}
+	// Return true when the pair (proxy0, proxy1) needs collision; returning false makes addOverlappingPair skip it.
+	virtual bool	needBroadphaseCollision(int proxy0,int proxy1) const = 0;
+extern int b3g_removePairs;	// global counter, defined in another translation unit
+extern int b3g_addedPairs;	// global counter; incremented by b3HashedOverlappingPairCache::addOverlappingPair
+extern int b3g_findPairs;	// global counter, defined in another translation unit
+const int B3_NULL_PAIR=0xffffffff;	// end-of-chain sentinel for the hash table; 0xffffffff does not fit in int and converts to -1 on two's-complement targets
+///The b3OverlappingPairCache provides an interface for overlapping pair management (add, remove, storage), used by the b3BroadphaseInterface broadphases.
+///The b3HashedOverlappingPairCache and b3SortedOverlappingPairCache classes are two implementations.
+class b3OverlappingPairCache 
+	virtual ~b3OverlappingPairCache() {} // this is needed so we can get to the derived class destructor
+	virtual b3BroadphasePair*	getOverlappingPairArrayPtr() = 0;	// raw pointer to the pair storage
+	virtual const b3BroadphasePair*	getOverlappingPairArrayPtr() const = 0;
+	virtual b3BroadphasePairArray&	getOverlappingPairArray() = 0;	// the pair storage as an array object
+	virtual	void	cleanOverlappingPair(b3BroadphasePair& pair,b3Dispatcher* dispatcher) = 0;
+	virtual int getNumOverlappingPairs() const = 0;
+	virtual void	cleanProxyFromPairs(int proxy,b3Dispatcher* dispatcher) = 0;
+	virtual	void setOverlapFilterCallback(b3OverlapFilterCallback* callback) = 0;
+	virtual void	processAllOverlappingPairs(b3OverlapCallback*,b3Dispatcher* dispatcher) = 0;	// visits pairs; a callback returning true requests deletion
+	virtual b3BroadphasePair* findPair(int proxy0, int proxy1) = 0;
+	virtual bool	hasDeferredRemoval() = 0;
+	//virtual	void	setInternalGhostPairCallback(b3OverlappingPairCallback* ghostPairCallback)=0;
+	virtual b3BroadphasePair* 	addOverlappingPair(int proxy0,int proxy1)=0;
+	virtual void*	removeOverlappingPair(int proxy0,int proxy1,b3Dispatcher* dispatcher)=0;
+	virtual void	removeOverlappingPairsContainingProxy(int /*proxy0*/,b3Dispatcher* /*dispatcher*/)=0;
+	virtual void	sortOverlappingPairs(b3Dispatcher* dispatcher) = 0;
+/// Hash-space based Pair Cache, thanks to Erin Catto, Box2D, http://www.box2d.org, and Pierre Terdiman, Codercorner, http://codercorner.com
+class b3HashedOverlappingPairCache : public b3OverlappingPairCache
+	b3BroadphasePairArray	m_overlappingPairArray;	// pair storage; hash chains index into it
+	b3OverlapFilterCallback* m_overlapFilterCallback;	// optional user filter; when null every pair is accepted
+	bool		m_blockedForChanges;
+	b3HashedOverlappingPairCache();
+	virtual ~b3HashedOverlappingPairCache();
+	virtual void	removeOverlappingPairsContainingProxy(int proxy,b3Dispatcher* dispatcher);
+	virtual void*	removeOverlappingPair(int proxy0,int proxy1,b3Dispatcher* dispatcher);
+	B3_FORCE_INLINE bool needsBroadphaseCollision(int proxy0,int proxy1) const	// defers to the filter callback if set, otherwise returns true
+	{
+		if (m_overlapFilterCallback)
+			return m_overlapFilterCallback->needBroadphaseCollision(proxy0,proxy1);
+		bool collides = true;//(proxy0->m_collisionFilterGroup & proxy1->m_collisionFilterMask) != 0;
+		//collides = collides && (proxy1->m_collisionFilterGroup & proxy0->m_collisionFilterMask);
+		return collides;
+	}
+	// Add a pair and return the new pair. If the pair already exists,
+	// no new pair is created and the old one is returned.
+	virtual b3BroadphasePair* 	addOverlappingPair(int proxy0,int proxy1)
+	{
+		b3g_addedPairs++;	// counted even when the filter rejects the pair below
+		if (!needsBroadphaseCollision(proxy0,proxy1))
+			return 0;	// filtered out: nothing is stored
+		return internalAddPair(proxy0,proxy1);
+	}
+	void	cleanProxyFromPairs(int proxy,b3Dispatcher* dispatcher);
+	virtual void	processAllOverlappingPairs(b3OverlapCallback*,b3Dispatcher* dispatcher);
+	virtual b3BroadphasePair*	getOverlappingPairArrayPtr()
+	{
+		return &m_overlappingPairArray[0];	// NOTE(review): indexes element 0 even when the array is empty - confirm callers check the pair count first
+	}
+	const b3BroadphasePair*	getOverlappingPairArrayPtr() const
+	{
+		return &m_overlappingPairArray[0];
+	}
+	b3BroadphasePairArray&	getOverlappingPairArray()
+	{
+		return m_overlappingPairArray;
+	}
+	const b3BroadphasePairArray&	getOverlappingPairArray() const
+	{
+		return m_overlappingPairArray;
+	}
+	void	cleanOverlappingPair(b3BroadphasePair& pair,b3Dispatcher* dispatcher);
+	b3BroadphasePair* findPair(int proxy0, int proxy1);
+	int GetCount() const { return m_overlappingPairArray.size(); }	// same value as getNumOverlappingPairs()
+//	b3BroadphasePair* GetPairs() { return m_pairs; }
+	b3OverlapFilterCallback* getOverlapFilterCallback()
+	{
+		return m_overlapFilterCallback;
+	}
+	void setOverlapFilterCallback(b3OverlapFilterCallback* callback)
+	{
+		m_overlapFilterCallback = callback;
+	}
+	int	getNumOverlappingPairs() const
+	{
+		return m_overlappingPairArray.size();
+	}
+	b3BroadphasePair* 	internalAddPair(int proxy0,int proxy1);
+	void	growTables();
+	B3_FORCE_INLINE bool equalsPair(const b3BroadphasePair& pair, int proxyId1, int proxyId2)	// order-sensitive: (x,y) must equal (proxyId1,proxyId2)
+	{	
+		return pair.x == proxyId1 && pair.y  == proxyId2;
+	}
+	/*
+	// Thomas Wang's hash, see: http://www.concentric.net/~Ttwang/tech/inthash.htm
+	// This assumes proxyId1 and proxyId2 are 16-bit.
+	B3_FORCE_INLINE int getHash(int proxyId1, int proxyId2)
+	{
+		int key = (proxyId2 << 16) | proxyId1;
+		key = ~key + (key << 15);
+		key = key ^ (key >> 12);
+		key = key + (key << 2);
+		key = key ^ (key >> 4);
+		key = key * 2057;
+		key = key ^ (key >> 16);
+		return key;
+	}
+	*/
+	B3_FORCE_INLINE	unsigned int getHash(unsigned int proxyId1, unsigned int proxyId2)	// mixes both ids into one 32-bit hash; callers reduce it to a table index
+	{
+		unsigned int key = proxyId1 | (proxyId2 << 16);	// mix in unsigned arithmetic: wrap-around is well defined, unlike signed overflow/shift
+		// Thomas Wang's hash
+		key += ~(key << 15);
+		key ^=  (key >> 10);
+		key +=  (key << 3);
+		key ^=  (key >> 6);
+		key += ~(key << 11);
+		key ^=  (key >> 16);
+		return key;
+	}
+	B3_FORCE_INLINE b3BroadphasePair* internalFindPair(int proxy0, int proxy1, int hash)	// 'hash' is used directly as an index into m_hashTable
+	{
+		int proxyId1 = proxy0;
+		int proxyId2 = proxy1;
+		#if 0 // wrong: 'equalsPair' compares unsorted uids, so the ids must not be swapped here. Nat.
+		if (proxyId1 > proxyId2) 
+			b3Swap(proxyId1, proxyId2);
+		#endif
+		int index = m_hashTable[hash];	// head of the chain for this bucket
+		while( index != B3_NULL_PAIR && equalsPair(m_overlappingPairArray[index], proxyId1, proxyId2) == false)
+		{
+			index = m_next[index];	// follow the chain to the next pair in the same bucket
+		}
+		if ( index == B3_NULL_PAIR )
+		{
+			return NULL;	// chain exhausted without a match
+		}
+		b3Assert(index < m_overlappingPairArray.size());
+		return &m_overlappingPairArray[index];
+	}
+	virtual bool	hasDeferredRemoval()
+	{
+		return false;
+	}
+/*	virtual	void	setInternalGhostPairCallback(b3OverlappingPairCallback* ghostPairCallback)
+	{
+		m_ghostPairCallback = ghostPairCallback;
+	}
+	*/
+	virtual void	sortOverlappingPairs(b3Dispatcher* dispatcher);
+	b3AlignedObjectArray<int>	m_hashTable;	// bucket -> index of first pair in the chain, or B3_NULL_PAIR
+	b3AlignedObjectArray<int>	m_next;	// pair index -> index of next pair in the same chain, or B3_NULL_PAIR
+//	b3OverlappingPairCallback*	m_ghostPairCallback;
+///b3SortedOverlappingPairCache maintains the objects with overlapping AABB
+///Typically managed by the Broadphase, Axis3Sweep or b3SimpleBroadphase
+class	b3SortedOverlappingPairCache : public b3OverlappingPairCache
+	protected:
+		//avoid brute-force finding all the time
+		b3BroadphasePairArray	m_overlappingPairArray;
+		//during the dispatch, check that user doesn't destroy/create proxy
+		bool		m_blockedForChanges;
+		///by default, do the removal during the pair traversal
+		bool		m_hasDeferredRemoval;
+		//if set, use the callback instead of the built in filter in needBroadphaseCollision
+		b3OverlapFilterCallback* m_overlapFilterCallback;
+//		b3OverlappingPairCallback*	m_ghostPairCallback;
+	public:
+		b3SortedOverlappingPairCache();	
+		virtual ~b3SortedOverlappingPairCache();
+		virtual void	processAllOverlappingPairs(b3OverlapCallback*,b3Dispatcher* dispatcher);
+		void*	removeOverlappingPair(int proxy0,int proxy1,b3Dispatcher* dispatcher);
+		void	cleanOverlappingPair(b3BroadphasePair& pair,b3Dispatcher* dispatcher);
+		b3BroadphasePair*	addOverlappingPair(int proxy0,int proxy1);
+		b3BroadphasePair*	findPair(int proxy0,int proxy1);
+		void	cleanProxyFromPairs(int proxy,b3Dispatcher* dispatcher);
+		virtual void	removeOverlappingPairsContainingProxy(int proxy,b3Dispatcher* dispatcher);
+		inline bool needsBroadphaseCollision(int proxy0,int proxy1) const	// defers to the filter callback if set, otherwise returns true
+		{
+			if (m_overlapFilterCallback)
+				return m_overlapFilterCallback->needBroadphaseCollision(proxy0,proxy1);
+			bool collides = true;//(proxy0->m_collisionFilterGroup & proxy1->m_collisionFilterMask) != 0;
+			//collides = collides && (proxy1->m_collisionFilterGroup & proxy0->m_collisionFilterMask);
+			return collides;
+		}
+		b3BroadphasePairArray&	getOverlappingPairArray()
+		{
+			return m_overlappingPairArray;
+		}
+		const b3BroadphasePairArray&	getOverlappingPairArray() const
+		{
+			return m_overlappingPairArray;
+		}
+		b3BroadphasePair*	getOverlappingPairArrayPtr()
+		{
+			return &m_overlappingPairArray[0];	// NOTE(review): indexes element 0 even when the array is empty - confirm callers check the pair count first
+		}
+		const b3BroadphasePair*	getOverlappingPairArrayPtr() const
+		{
+			return &m_overlappingPairArray[0];
+		}
+		int	getNumOverlappingPairs() const
+		{
+			return m_overlappingPairArray.size();
+		}
+		b3OverlapFilterCallback* getOverlapFilterCallback()
+		{
+			return m_overlapFilterCallback;
+		}
+		void setOverlapFilterCallback(b3OverlapFilterCallback* callback)
+		{
+			m_overlapFilterCallback = callback;
+		}
+		virtual bool	hasDeferredRemoval()
+		{
+			return m_hasDeferredRemoval;	// reports the flag stored in this cache
+		}
+/*		virtual	void	setInternalGhostPairCallback(b3OverlappingPairCallback* ghostPairCallback)
+		{
+			m_ghostPairCallback = ghostPairCallback;
+		}
+		*/
+		virtual void	sortOverlappingPairs(b3Dispatcher* dispatcher);
+///b3NullPairCache skips add/removal of overlapping pairs. Useful for benchmarking and unit testing.
+class b3NullPairCache : public b3OverlappingPairCache	// every mutating operation is a no-op and every query reports "nothing"
+	b3BroadphasePairArray	m_overlappingPairArray;	// no method of this class adds to it
+	virtual b3BroadphasePair*	getOverlappingPairArrayPtr()
+	{
+		return &m_overlappingPairArray[0];	// NOTE(review): indexes element 0 of an array this class never fills - confirm this accessor is not called
+	}
+	const b3BroadphasePair*	getOverlappingPairArrayPtr() const
+	{
+		return &m_overlappingPairArray[0];
+	}
+	b3BroadphasePairArray&	getOverlappingPairArray()
+	{
+		return m_overlappingPairArray;
+	}
+	virtual	void	cleanOverlappingPair(b3BroadphasePair& /*pair*/,b3Dispatcher* /*dispatcher*/)
+	{
+	}
+	virtual int getNumOverlappingPairs() const
+	{
+		return 0;	// always reports an empty cache
+	}
+	virtual void	cleanProxyFromPairs(int /*proxy*/,b3Dispatcher* /*dispatcher*/)
+	{
+	}
+	virtual	void setOverlapFilterCallback(b3OverlapFilterCallback* /*callback*/)
+	{
+	}
+	virtual void	processAllOverlappingPairs(b3OverlapCallback*,b3Dispatcher* /*dispatcher*/)
+	{
+	}
+	virtual b3BroadphasePair* findPair(int /*proxy0*/, int /*proxy1*/)
+	{
+		return 0;	// no pair is ever found
+	}
+	virtual bool	hasDeferredRemoval()
+	{
+		return true;
+	}
+//	virtual	void	setInternalGhostPairCallback(b3OverlappingPairCallback* /* ghostPairCallback */)
+//	{
+//	}
+	virtual b3BroadphasePair*	addOverlappingPair(int /*proxy0*/,int /*proxy1*/)
+	{
+		return 0;	// nothing is stored
+	}
+	virtual void*	removeOverlappingPair(int /*proxy0*/,int /*proxy1*/,b3Dispatcher* /*dispatcher*/)
+	{
+		return 0;
+	}
+	virtual void	removeOverlappingPairsContainingProxy(int /*proxy0*/,b3Dispatcher* /*dispatcher*/)
+	{
+	}
+	virtual void	sortOverlappingPairs(b3Dispatcher* dispatcher)
+	{
+        (void) dispatcher;	// silences the unused-parameter warning
+	}
diff --git a/src/bullet/Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h b/src/bullet/Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h
new file mode 100644
index 00000000..7f9bf990
--- /dev/null
+++ b/src/bullet/Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h
@@ -0,0 +1,59 @@
+#ifndef B3_AABB_H
+#define B3_AABB_H
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Common/shared/b3Mat3x3.h"
+typedef struct b3Aabb b3Aabb_t;	// C-style alias for the struct below
+struct b3Aabb	// min and max corners, each stored as a union of three views
+	union
+	{
+		float m_min[4];	// min corner as a float array
+		b3Float4 m_minVec;	// same storage viewed as a b3Float4
+		int m_minIndices[4];	// same storage reinterpreted as ints
+	};
+	union
+	{
+		float	m_max[4];	// max corner as a float array
+		b3Float4 m_maxVec;	// same storage viewed as a b3Float4
+		int m_signedMaxIndices[4];	// same storage reinterpreted as ints
+	};
+inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,	// writes the AABB of a local box after applying pos/orn, grown by 'margin'
+						b3Float4ConstArg pos,
+						b3QuatConstArg orn,
+						b3Float4* aabbMinOut,b3Float4* aabbMaxOut)
+		b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);
+		localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);	// margin is added to x, y and z; w gets 0
+		b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);
+		b3Mat3x3 m;
+		m = b3QuatGetRotationMatrix(orn);
+		b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);	// absolute-value matrix built from the rotation matrix
+		b3Float4 center = b3TransformPoint(localCenter,pos,orn);
+		b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),	// per-axis extent: half extents dotted with each row of abs_b
+										 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),
+										 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),
+										 0.f);
+		*aabbMinOut = center-extent;
+		*aabbMaxOut = center+extent;
+/// Conservative overlap test between two AABBs: false as soon as they are separated along x, y or z; touching boxes count as overlapping.
+inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,
+								b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)
+	const bool separatedOnX = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x);
+	const bool separatedOnZ = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z);
+	const bool separatedOnY = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y);
+	return !(separatedOnX || separatedOnZ || separatedOnY);
+#endif //B3_AABB_H
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3Config.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3Config.h
new file mode 100644
index 00000000..e23fe11a
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3Config.h
@@ -0,0 +1,41 @@
+#ifndef B3_CONFIG_H
+#define B3_CONFIG_H
+struct	b3Config	// bundle of integer limits; the constructor below fills in the defaults
+	int	m_maxConvexBodies;
+	int	m_maxConvexShapes;
+	int	m_maxBroadphasePairs;
+	int m_maxContactCapacity;
+	int m_compoundPairCapacity;
+	int m_maxVerticesPerFace;
+	int m_maxFacesPerShape;
+	int	m_maxConvexVertices;
+	int m_maxConvexIndices;
+	int m_maxConvexUniqueEdges;
+	int	m_maxCompoundChildShapes;
+	int m_maxTriConvexPairCapacity;
+	b3Config()
+		:m_maxConvexBodies(32*1024),
+		m_maxVerticesPerFace(64),
+		m_maxFacesPerShape(12),
+		m_maxConvexVertices(8192),
+		m_maxConvexIndices(81920),
+		m_maxConvexUniqueEdges(8192),
+		m_maxCompoundChildShapes(8192),
+		m_maxTriConvexPairCapacity(256*1024)
+	{
+		m_maxConvexShapes = m_maxConvexBodies;	// defaults to the body limit
+		m_maxBroadphasePairs = 16*m_maxConvexBodies;	// 16 pairs per body
+		m_maxContactCapacity = m_maxBroadphasePairs;	// defaults to the pair limit
+		m_compoundPairCapacity = 1024*1024;
+	}
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3Contact4.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3Contact4.h
new file mode 100644
index 00000000..fb251656
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3Contact4.h
@@ -0,0 +1,46 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_CONTACT4_H
+#define B3_CONTACT4_H
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+B3_ATTRIBUTE_ALIGNED16(struct) b3Contact4 : public b3Contact4Data	// accessor layer over the inherited b3Contact4Data fields
+	int getBodyA()const {return abs(m_bodyAPtrAndSignBit);}	// body index is the magnitude of the stored value
+	int getBodyB()const {return abs(m_bodyBPtrAndSignBit);}
+	bool isBodyAFixed()const { return m_bodyAPtrAndSignBit<0;}	// a negative stored value marks the body as fixed
+	bool isBodyBFixed()const { return m_bodyBPtrAndSignBit<0;}
+	//	todo. make it safer
+	int& getBatchIdx() { return m_batchIdx; }	// mutable reference: callers can assign through it
+	const int& getBatchIdx() const { return m_batchIdx; }
+	float getRestituitionCoeff() const { return ((float)m_restituitionCoeffCmp/(float)0xffff); }	// decodes the stored value by dividing by 0xffff
+	void setRestituitionCoeff( float c ) { b3Assert( c >= 0.f && c <= 1.f ); m_restituitionCoeffCmp = (unsigned short)(c*0xffff); }	// c must be in [0,1]; stored as c*0xffff truncated to unsigned short
+	float getFrictionCoeff() const { return ((float)m_frictionCoeffCmp/(float)0xffff); }	// same encoding as the restitution coefficient
+	void setFrictionCoeff( float c ) { b3Assert( c >= 0.f && c <= 1.f ); m_frictionCoeffCmp = (unsigned short)(c*0xffff); }
+	//float& getNPoints() { return m_worldNormal[3]; }
+	int getNPoints() const { return (int) m_worldNormalOnB.w; }	// point count is read from the w component of the normal
+	float getPenetration(int idx) const { return m_worldPosB[idx].w; }	// penetration is read from the w component of point idx
+	bool isInvalid() const { return (getBodyA()==0 || getBodyB()==0); }	// a contact whose A or B index is 0 is reported as invalid
+#endif //B3_CONTACT4_H
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.cpp b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.cpp
new file mode 100644
index 00000000..55706fa6
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.cpp
@@ -0,0 +1,520 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "b3ConvexUtility.h"
+#include "Bullet3Geometry/b3ConvexHullComputer.h"
+#include "Bullet3Geometry/b3GrahamScan2dConvexHull.h"
+#include "Bullet3Common/b3Quaternion.h"
+#include "Bullet3Common/b3HashMap.h"
+bool	b3ConvexUtility::initializePolyhedralFeatures(const b3Vector3* orgVertices, int numPoints, bool mergeCoplanarTriangles)	// fills m_vertices and m_faces from the hull of the points, then calls initialize(); mergeCoplanarTriangles is not read in the lines visible here
+	b3ConvexHullComputer conv;
+	conv.compute(&orgVertices[0].getX(), sizeof(b3Vector3),numPoints,0.f,0.f);	// run the hull computer over the input points
+	b3AlignedObjectArray<b3Vector3> faceNormals;
+	int numFaces = conv.faces.size();
+	faceNormals.resize(numFaces);
+	b3ConvexHullComputer* convexUtil = &conv;
+	b3AlignedObjectArray<b3MyFace>	tmpFaces;	// one face per hull face, before merging
+	tmpFaces.resize(numFaces);
+	int numVertices = convexUtil->vertices.size();
+	m_vertices.resize(numVertices);
+	for (int p=0;p<numVertices;p++)
+	{
+		m_vertices[p] = convexUtil->vertices[p];	// copy hull vertices into this object
+	}
+	for (int i=0;i<numFaces;i++)
+	{
+		int face = convexUtil->faces[i];
+		//printf("face=%d\n",face);
+		const b3ConvexHullComputer::Edge*  firstEdge = &convexUtil->edges[face];
+		const b3ConvexHullComputer::Edge*  edge = firstEdge;
+		b3Vector3 edges[3];	// only the first two entries are ever written
+		int numEdges = 0;
+		//compute face normals
+		do
+		{
+			int src = edge->getSourceVertex();
+			tmpFaces[i].m_indices.push_back(src);	// record the face's vertex loop in traversal order
+			int targ = edge->getTargetVertex();
+			b3Vector3 wa = convexUtil->vertices[src];
+			b3Vector3 wb = convexUtil->vertices[targ];
+			b3Vector3 newEdge = wb-wa;
+			newEdge.normalize();
+			if (numEdges<2)
+				edges[numEdges++] = newEdge;	// keep the first two edge directions for the normal
+			edge = edge->getNextEdgeOfFace();
+		} while (edge!=firstEdge);
+		b3Scalar planeEq = 1e30f;	// running minimum of vertex·normal, starts very large
+		if (numEdges==2)
+		{
+			faceNormals[i] = edges[0].cross(edges[1]);
+			faceNormals[i].normalize();
+			tmpFaces[i].m_plane[0] = faceNormals[i].getX();
+			tmpFaces[i].m_plane[1] = faceNormals[i].getY();
+			tmpFaces[i].m_plane[2] = faceNormals[i].getZ();
+			tmpFaces[i].m_plane[3] = planeEq;	// placeholder; overwritten after the vertex loop below
+		}
+		else
+		{
+			b3Assert(0);//degenerate?
+			faceNormals[i].setZero();
+		}
+		for (int v=0;v<tmpFaces[i].m_indices.size();v++)
+		{
+			b3Scalar eq = m_vertices[tmpFaces[i].m_indices[v]].dot(faceNormals[i]);
+			if (planeEq>eq)
+			{
+				planeEq=eq;
+			}
+		}
+		tmpFaces[i].m_plane[3] = -planeEq;	// plane offset is the negated minimum projection of the face's vertices
+	}
+	//merge coplanar faces and copy them to m_polyhedron
+	b3Scalar faceWeldThreshold= 0.999f;	// faces whose normals have a dot product above this are grouped
+	b3AlignedObjectArray<int> todoFaces;
+	for (int i=0;i<tmpFaces.size();i++)
+		todoFaces.push_back(i);
+	while (todoFaces.size())
+	{
+		b3AlignedObjectArray<int> coplanarFaceGroup;
+		int refFace = todoFaces[todoFaces.size()-1];	// take the last pending face as the group's reference
+		coplanarFaceGroup.push_back(refFace);
+		b3MyFace& faceA = tmpFaces[refFace];
+		todoFaces.pop_back();
+		b3Vector3 faceNormalA = b3MakeVector3(faceA.m_plane[0],faceA.m_plane[1],faceA.m_plane[2]);
+		for (int j=todoFaces.size()-1;j>=0;j--)	// walk backwards because entries are removed during the scan
+		{
+			int i = todoFaces[j];
+			b3MyFace& faceB = tmpFaces[i];
+			b3Vector3 faceNormalB = b3MakeVector3(faceB.m_plane[0],faceB.m_plane[1],faceB.m_plane[2]);
+			if (faceNormalA.dot(faceNormalB)>faceWeldThreshold)
+			{
+				coplanarFaceGroup.push_back(i);
+				todoFaces.remove(i);
+			}
+		}
+		bool did_merge = false;
+		if (coplanarFaceGroup.size()>1)
+		{
+			//do the merge: use Graham Scan 2d convex hull
+			b3AlignedObjectArray<b3GrahamVector3> orgpoints;	// unique vertices of all faces in the group
+			b3Vector3 averageFaceNormal = b3MakeVector3(0,0,0);
+			for (int i=0;i<coplanarFaceGroup.size();i++)
+			{
+//				m_polyhedron->m_faces.push_back(tmpFaces[coplanarFaceGroup[i]]);
+				b3MyFace& face = tmpFaces[coplanarFaceGroup[i]];
+				b3Vector3 faceNormal = b3MakeVector3(face.m_plane[0],face.m_plane[1],face.m_plane[2]);
+				averageFaceNormal+=faceNormal;
+				for (int f=0;f<face.m_indices.size();f++)
+				{
+					int orgIndex = face.m_indices[f];
+					b3Vector3 pt = m_vertices[orgIndex];
+					bool found = false;
+					for (int i=0;i<orgpoints.size();i++)	// this 'i' shadows the group-loop 'i' for the duration of the search
+					{
+						//if ((orgpoints[i].m_orgIndex == orgIndex) || ((rotatedPt-orgpoints[i]).length2()<0.0001))
+						if (orgpoints[i].m_orgIndex == orgIndex)
+						{
+							found=true;
+							break;
+						}
+					}
+					if (!found)
+						orgpoints.push_back(b3GrahamVector3(pt,orgIndex));
+				}
+			}
+			b3MyFace combinedFace;
+			for (int i=0;i<4;i++)
+				combinedFace.m_plane[i] = tmpFaces[coplanarFaceGroup[0]].m_plane[i];	// the merged face reuses the reference face's plane
+			b3AlignedObjectArray<b3GrahamVector3> hull;
+			averageFaceNormal.normalize();
+			b3GrahamScanConvexHull2D(orgpoints,hull,averageFaceNormal);
+			for (int i=0;i<hull.size();i++)
+			{
+				combinedFace.m_indices.push_back(hull[i].m_orgIndex);
+				for(int k = 0; k < orgpoints.size(); k++) 
+				{
+					if(orgpoints[k].m_orgIndex == hull[i].m_orgIndex) 
+					{
+						orgpoints[k].m_orgIndex = -1; // invalidate...
+						break;
+					}
+				}
+			}
+			// are there rejected vertices?
+			bool reject_merge = false;
+			for(int i = 0; i < orgpoints.size(); i++) {
+				if(orgpoints[i].m_orgIndex == -1)
+					continue; // this is in the hull...
+				// this vertex is rejected -- is anybody else using this vertex?
+				for(int j = 0; j < tmpFaces.size(); j++) {
+					b3MyFace& face = tmpFaces[j];
+					// is this a face of the current coplanar group?
+					bool is_in_current_group = false;
+					for(int k = 0; k < coplanarFaceGroup.size(); k++) {
+						if(coplanarFaceGroup[k] == j) {
+							is_in_current_group = true;
+							break;
+						}
+					}
+					if(is_in_current_group) // ignore this face...
+						continue;
+					// does this face use this rejected vertex?
+					for(int v = 0; v < face.m_indices.size(); v++) {
+						if(face.m_indices[v] == orgpoints[i].m_orgIndex) {
+							// this rejected vertex is used in another face -- reject merge
+							reject_merge = true;
+							break;
+						}
+					}
+					if(reject_merge)
+						break;
+				}
+				if(reject_merge)
+					break;
+			}
+			if (!reject_merge)
+			{
+				// do this merge!
+				did_merge = true;
+				m_faces.push_back(combinedFace);
+			}
+		}
+		if(!did_merge)
+		{
+			for (int i=0;i<coplanarFaceGroup.size();i++)	// no merge: emit every face of the group unchanged
+			{
+				b3MyFace face = tmpFaces[coplanarFaceGroup[i]];
+				m_faces.push_back(face);
+			}
+		} 
+	}
+	initialize();	// derive edges, centre, radius and extents from the faces just built
+	return true;	// the visible code has no failure path
+inline bool IsAlmostZero(const b3Vector3& v)	// true when no component magnitude exceeds 1e-6
+	const bool anyComponentLarge = fabsf(v.getX())>1e-6 || fabsf(v.getY())>1e-6 || fabsf(v.getZ())>1e-6;
+	return !anyComponentLarge;
+struct b3InternalVertexPair	// order-independent pair of vertex indices, used as the hash-map key in initialize()
+	b3InternalVertexPair(short int v0,short int v1)	// NOTE(review): indices are narrowed to short int - confirm vertex counts stay within its range
+		:m_v0(v0),
+		m_v1(v1)
+	{
+		if (m_v1>m_v0)
+			b3Swap(m_v0,m_v1);	// normalise so that m_v0 >= m_v1, making (a,b) and (b,a) the same key
+	}
+	short int m_v0;
+	short int m_v1;
+	int getHash() const
+	{
+		return m_v0+(m_v1<<16);	// combine both indices into one int
+	}
+	bool equals(const b3InternalVertexPair& other) const
+	{
+		return m_v0==other.m_v0 && m_v1==other.m_v1;
+	}
+struct b3InternalEdge	// the (up to) two face indices recorded for one edge in initialize()
+	b3InternalEdge()
+		:m_face0(-1),
+		m_face1(-1)
+	{
+	}
+	short int m_face0;	// -1 until a face is recorded
+	short int m_face1;	// -1 until a second face is recorded
+bool b3ConvexUtility::testContainment() const	// true when all 8 corners of the box (m_localCenter +/- m_extents) satisfy every face plane
+	for(int p=0;p<8;p++)
+	{
+		b3Vector3 LocalPt;	// corner p: one sign combination of the three extents
+		if(p==0)		LocalPt = m_localCenter + b3Vector3(m_extents[0], m_extents[1], m_extents[2]);
+		else if(p==1)	LocalPt = m_localCenter + b3Vector3(m_extents[0], m_extents[1], -m_extents[2]);
+		else if(p==2)	LocalPt = m_localCenter + b3Vector3(m_extents[0], -m_extents[1], m_extents[2]);
+		else if(p==3)	LocalPt = m_localCenter + b3Vector3(m_extents[0], -m_extents[1], -m_extents[2]);
+		else if(p==4)	LocalPt = m_localCenter + b3Vector3(-m_extents[0], m_extents[1], m_extents[2]);
+		else if(p==5)	LocalPt = m_localCenter + b3Vector3(-m_extents[0], m_extents[1], -m_extents[2]);
+		else if(p==6)	LocalPt = m_localCenter + b3Vector3(-m_extents[0], -m_extents[1], m_extents[2]);
+		else if(p==7)	LocalPt = m_localCenter + b3Vector3(-m_extents[0], -m_extents[1], -m_extents[2]);
+		for(int i=0;i<m_faces.size();i++)
+		{
+			const b3Vector3 Normal(m_faces[i].m_plane[0], m_faces[i].m_plane[1], m_faces[i].m_plane[2]);
+			const b3Scalar d = LocalPt.dot(Normal) + m_faces[i].m_plane[3];	// signed plane value of the corner for face i
+			if(d>0.0f)
+				return false;	// a positive value fails the test
+		}
+	}
+	return true;
+void	b3ConvexUtility::initialize()	// derives m_uniqueEdges, m_localCenter, m_radius, mC, mE and m_extents from m_faces / m_vertices
+	b3HashMap<b3InternalVertexPair,b3InternalEdge> edges;	// vertex pair -> faces recorded for that edge
+	b3Scalar TotalArea = 0.0f;
+	m_localCenter.setValue(0, 0, 0);
+	for(int i=0;i<m_faces.size();i++)	// pass 1: collect unique edge directions and edge-to-face records
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		int NbTris = numVertices;
+		for(int j=0;j<NbTris;j++)
+		{
+			int k = (j+1)%numVertices;	// next vertex in the loop, wrapping to 0
+			b3InternalVertexPair vp(m_faces[i].m_indices[j],m_faces[i].m_indices[k]);
+			b3InternalEdge* edptr = edges.find(vp);
+			b3Vector3 edge = m_vertices[vp.m_v1]-m_vertices[vp.m_v0];
+			edge.normalize();
+			bool found = false;
+			b3Vector3 diff,diff2;
+			for (int p=0;p<m_uniqueEdges.size();p++)
+			{
+				diff = m_uniqueEdges[p]-edge;	// near zero when the directions match
+				diff2 = m_uniqueEdges[p]+edge;	// near zero when the directions are opposite
+			//	if ((diff.length2()==0.f) || 
+				//	(diff2.length2()==0.f))
+				if (IsAlmostZero(diff) || 
+				IsAlmostZero(diff2))
+				{
+					found = true;
+					break;
+				}
+			}
+			if (!found)
+			{
+				m_uniqueEdges.push_back(edge);
+			}
+			if (edptr)
+			{
+					//TBD: figure out why I added this assert
+//				b3Assert(edptr->m_face0>=0);
+	//			b3Assert(edptr->m_face1<0);
+				edptr->m_face1 = i;	// edge seen before: record this face as the second one
+			} else
+			{
+				b3InternalEdge ed;
+				ed.m_face0 = i;	// first time this edge is seen
+				edges.insert(vp,ed);
+			}
+		}
+	}
+	for(int i=0;i<m_faces.size();i++)	// pass 2: store, per face edge, the face on the other side
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		m_faces[i].m_connectedFaces.resize(numVertices);
+		for(int j=0;j<numVertices;j++)
+		{
+			int k = (j+1)%numVertices;
+			b3InternalVertexPair vp(m_faces[i].m_indices[j],m_faces[i].m_indices[k]);
+			b3InternalEdge* edptr = edges.find(vp);
+			b3Assert(edptr);
+			b3Assert(edptr->m_face0>=0);
+			b3Assert(edptr->m_face1>=0);
+			int connectedFace = (edptr->m_face0==i)?edptr->m_face1:edptr->m_face0;
+			m_faces[i].m_connectedFaces[j] = connectedFace;
+		}
+	}
+	for(int i=0;i<m_faces.size();i++)	// pass 3: area-weighted centre from a triangle fan per face
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		int NbTris = numVertices-2;
+		const b3Vector3& p0 = m_vertices[m_faces[i].m_indices[0]];	// fan apex
+		for(int j=1;j<=NbTris;j++)
+		{
+			int k = (j+1)%numVertices;
+			const b3Vector3& p1 = m_vertices[m_faces[i].m_indices[j]];
+			const b3Vector3& p2 = m_vertices[m_faces[i].m_indices[k]];
+			b3Scalar Area = ((p0 - p1).cross(p0 - p2)).length() * 0.5f;
+			b3Vector3 Center = (p0+p1+p2)/3.0f;
+			m_localCenter += Area * Center;
+			TotalArea += Area;
+		}
+	}
+	m_localCenter /= TotalArea;	// NOTE(review): TotalArea stays 0 when there are no faces or only zero-area triangles - confirm callers never reach this in that state
+	if(1)
+	{
+		m_radius = FLT_MAX;
+		for(int i=0;i<m_faces.size();i++)	// m_radius = smallest absolute plane distance from the centre
+		{
+			const b3Vector3 Normal(m_faces[i].m_plane[0], m_faces[i].m_plane[1], m_faces[i].m_plane[2]);
+			const b3Scalar dist = b3Fabs(m_localCenter.dot(Normal) + m_faces[i].m_plane[3]);
+			if(dist<m_radius)
+				m_radius = dist;
+		}
+		b3Scalar MinX = FLT_MAX;
+		b3Scalar MinY = FLT_MAX;
+		b3Scalar MinZ = FLT_MAX;
+		b3Scalar MaxX = -FLT_MAX;
+		b3Scalar MaxY = -FLT_MAX;
+		b3Scalar MaxZ = -FLT_MAX;
+		for(int i=0; i<m_vertices.size(); i++)	// axis-aligned bounds of all vertices
+		{
+			const b3Vector3& pt = m_vertices[i];
+			if(pt.getX()<MinX)	MinX = pt.getX();
+			if(pt.getX()>MaxX)	MaxX = pt.getX();
+			if(pt.getY()<MinY)	MinY = pt.getY();
+			if(pt.getY()>MaxY)	MaxY = pt.getY();
+			if(pt.getZ()<MinZ)	MinZ = pt.getZ();
+			if(pt.getZ()>MaxZ)	MaxZ = pt.getZ();
+		}
+		mC.setValue(MaxX+MinX, MaxY+MinY, MaxZ+MinZ);	// sum of the bounds (not halved)
+		mE.setValue(MaxX-MinX, MaxY-MinY, MaxZ-MinZ);	// full size of the bounds; halved where used below
+//		const b3Scalar r = m_radius / sqrtf(2.0f);
+		const b3Scalar r = m_radius / sqrtf(3.0f);
+		const int LargestExtent = mE.maxAxis();
+		const b3Scalar Step = (mE[LargestExtent]*0.5f - r)/1024.0f;	// shrink increment for the largest axis
+		m_extents[0] = m_extents[1] = m_extents[2] = r;
+		m_extents[LargestExtent] = mE[LargestExtent]*0.5f;	// start with the largest axis at half the bounds size
+		bool FoundBox = false;
+		for(int j=0;j<1024;j++)	// shrink the largest axis until testContainment passes, at most 1024 steps
+		{
+			if(testContainment())
+			{
+				FoundBox = true;
+				break;
+			}
+			m_extents[LargestExtent] -= Step;
+		}
+		if(!FoundBox)
+		{
+			m_extents[0] = m_extents[1] = m_extents[2] = r;	// fall back to a cube with half-size r
+		}
+		else
+		{
+			// Refine the box
+			const b3Scalar Step = (m_radius - r)/1024.0f;	// shadows the outer Step; growth increment for the other two axes
+			const int e0 = (1<<LargestExtent) & 3;	// next axis cyclically: 0->1, 1->2, 2->0
+			const int e1 = (1<<e0) & 3;	// the remaining axis
+			for(int j=0;j<1024;j++)
+			{
+				const b3Scalar Saved0 = m_extents[e0];
+				const b3Scalar Saved1 = m_extents[e1];
+				m_extents[e0] += Step;
+				m_extents[e1] += Step;
+				if(!testContainment())
+				{
+					m_extents[e0] = Saved0;	// undo the step that failed the test and stop
+					m_extents[e1] = Saved1;
+					break;
+				}
+			}
+		}
+	}
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h
new file mode 100644
index 00000000..86c4151f
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h
@@ -0,0 +1,62 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Transform.h"
+struct b3MyFace // One face of a convex hull: an index list plus a 4-component plane.
+	b3AlignedObjectArray<int>	m_indices; // per-face index list (presumably into b3ConvexUtility::m_vertices - confirm)
+	b3Scalar	m_plane[4]; // plane coefficients; element [3] is the constant term added to a normal dot product
+B3_ATTRIBUTE_ALIGNED16(class) b3ConvexUtility // Holds a convex hull's vertices, faces and unique edges plus bounds derived from them.
+	public:
+	b3Vector3		m_localCenter; // b3CpuNarrowPhase::registerConvexHullShape stores the average of m_vertices here
+	b3Vector3		m_extents; // box extents that the .cpp adjusts step by step using testContainment()
+	b3Vector3		mC; // per-axis (max + min) over m_vertices
+	b3Vector3		mE; // per-axis (max - min) over m_vertices
+	b3Scalar		m_radius; // the .cpp lowers this to the smallest absolute plane distance measured from m_localCenter
+	b3AlignedObjectArray<b3Vector3>	m_vertices;
+	b3AlignedObjectArray<b3MyFace>	m_faces;
+	b3AlignedObjectArray<b3Vector3> m_uniqueEdges;
+	b3ConvexUtility()
+	{
+	}
+	virtual ~b3ConvexUtility();
+	bool	initializePolyhedralFeatures(const b3Vector3* orgVertices, int numVertices, bool mergeCoplanarTriangles=true); // b3CpuNarrowPhase calls this with the scaled input vertices before registering the hull
+	void	initialize();
+	bool testContainment() const; // used as the accept/reject test while the .cpp searches for m_extents
\ No newline at end of file
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.cpp b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.cpp
new file mode 100644
index 00000000..c3134b2c
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.cpp
@@ -0,0 +1,323 @@
+#include "b3CpuNarrowPhase.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ContactConvexConvexSAT.h"
+struct b3CpuNarrowPhaseInternalData // Private state of b3CpuNarrowPhase, reached through its m_data pointer.
+	b3AlignedObjectArray<b3Aabb> m_localShapeAABBCPU; // one local-space AABB is appended per successfully registered hull
+	b3AlignedObjectArray<b3Collidable>	m_collidablesCPU; // grown by allocateCollidable(), indexed by collidable index
+	b3AlignedObjectArray<b3ConvexUtility*> m_convexData; // raw pointers recorded by registerConvexHullShapeInternal; NOTE(review): the float* overload deletes its utility afterwards, so entries may dangle - confirm ownership
+	b3Config m_config; // copy of the configuration passed to the constructor
+	b3AlignedObjectArray<b3ConvexPolyhedronData> m_convexPolyhedra; // one entry per registered hull; holds offsets into the flat arrays below
+	b3AlignedObjectArray<b3Vector3> m_uniqueEdges; // unique edges of all hulls, concatenated
+	b3AlignedObjectArray<b3Vector3> m_convexVertices; // vertices of all hulls, concatenated
+	b3AlignedObjectArray<int> m_convexIndices; // face index lists of all hulls, concatenated
+	b3AlignedObjectArray<b3GpuFace> m_convexFaces; // faces of all hulls, concatenated
+	b3AlignedObjectArray<b3Contact4Data> m_contacts; // filled by computeContacts()
+	int	m_numAcceleratedShapes; // number of hulls registered so far; also the next shape index handed out
+const b3AlignedObjectArray<b3Contact4Data>& b3CpuNarrowPhase::getContacts() const // Read-only access to the contact array that computeContacts() resizes and fills.
+	return m_data->m_contacts;
+b3Collidable& b3CpuNarrowPhase::getCollidableCpu(int collidableIndex) // Mutable access to one collidable; the index is used as given, with no range check here.
+	return m_data->m_collidablesCPU[collidableIndex];
+const b3Collidable& b3CpuNarrowPhase::getCollidableCpu(int collidableIndex) const // Read-only counterpart of the overload above; the index is used as given, with no range check here.
+	return m_data->m_collidablesCPU[collidableIndex];
+b3CpuNarrowPhase::b3CpuNarrowPhase(const struct b3Config& config) // Allocates the internal data block, copies the configuration into it and starts with zero registered shapes.
+	m_data = new b3CpuNarrowPhaseInternalData;
+	m_data->m_config = config;
+	m_data->m_numAcceleratedShapes = 0;
+	delete m_data; // NOTE(review): presumably the destructor body; its signature line is not present in this chunk
+void b3CpuNarrowPhase::computeContacts(b3AlignedObjectArray<b3Int4>& pairs, b3AlignedObjectArray<b3Aabb>& aabbsWorldSpace, b3AlignedObjectArray<b3RigidBodyData>& bodies) // Generates contacts for the given body pairs. Of the shape combinations tested below only hull-vs-hull does any work; aabbsWorldSpace is not read in this body.
+	int nPairs = pairs.size();
+	int numContacts = 0;
+	int maxContactCapacity = m_data->m_config.m_maxContactCapacity;
+	m_data->m_contacts.resize(maxContactCapacity); // grow to full capacity first; trimmed to numContacts after the loop
+	for (int i=0;i<nPairs;i++)
+	{
+		int bodyIndexA = pairs[i].x; // x and y of a pair hold the two body indices
+		int bodyIndexB = pairs[i].y;
+		int collidableIndexA = bodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = bodies[bodyIndexB].m_collidableIdx;
+		if (m_data->m_collidablesCPU[collidableIndexA].m_shapeType == SHAPE_SPHERE &&
+			m_data->m_collidablesCPU[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{ // sphere vs hull: no-op, the call below is commented out
+//			computeContactSphereConvex(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&bodies[0],
+//				&m_data->m_collidablesCPU[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+		}
+		if (m_data->m_collidablesCPU[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&
+			m_data->m_collidablesCPU[collidableIndexB].m_shapeType == SHAPE_SPHERE)
+		{ // hull vs sphere: no-op
+//			computeContactSphereConvex(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&bodies[0],
+//				&m_data->m_collidablesCPU[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+			//printf("convex-sphere\n");
+		}
+		if (m_data->m_collidablesCPU[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&
+			m_data->m_collidablesCPU[collidableIndexB].m_shapeType == SHAPE_PLANE)
+		{ // hull vs plane: no-op
+//			computeContactPlaneConvex(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&bodies[0],
+//			&m_data->m_collidablesCPU[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("convex-plane\n");
+		}
+		if (m_data->m_collidablesCPU[collidableIndexA].m_shapeType == SHAPE_PLANE &&
+			m_data->m_collidablesCPU[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{ // plane vs hull: no-op
+//			computeContactPlaneConvex(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&bodies[0],
+//			&m_data->m_collidablesCPU[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("plane-convex\n");
+		}
+			if (m_data->m_collidablesCPU[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS &&
+			m_data->m_collidablesCPU[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{ // compound vs compound: no-op
+//			computeContactCompoundCompound(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&bodies[0],
+//			&m_data->m_collidablesCPU[0],&hostConvexData[0],&cpuChildShapes[0], hostAabbsWorldSpace,hostAabbsLocalSpace,hostVertices,hostUniqueEdges,hostIndices,hostFaces,&hostContacts[0],
+//			nContacts,maxContactCapacity,treeNodesCPU,subTreesCPU,bvhInfoCPU);	
+//			printf("convex-plane\n");
+		}
+				if (m_data->m_collidablesCPU[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS &&
+			m_data->m_collidablesCPU[collidableIndexB].m_shapeType == SHAPE_PLANE)
+		{ // compound vs plane: no-op
+//			computeContactPlaneCompound(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&bodies[0],
+//			&m_data->m_collidablesCPU[0],&hostConvexData[0],&cpuChildShapes[0], &hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("convex-plane\n");
+		}
+		if (m_data->m_collidablesCPU[collidableIndexA].m_shapeType == SHAPE_PLANE &&
+			m_data->m_collidablesCPU[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{ // plane vs compound: no-op
+//			computeContactPlaneCompound(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&bodies[0],
+//			&m_data->m_collidablesCPU[0],&hostConvexData[0],&cpuChildShapes[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("plane-convex\n");
+		}
+		if (m_data->m_collidablesCPU[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&
+			m_data->m_collidablesCPU[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{ // hull vs hull: the only combination that produces contacts here
+			//printf("pairs[i].z=%d\n",pairs[i].z);
+			//int contactIndex = computeContactConvexConvex2(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,bodies,
+			//		m_data->m_collidablesCPU,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts);
+			int contactIndex = b3ContactConvexConvexSAT(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,bodies,
+				m_data->m_collidablesCPU,m_data->m_convexPolyhedra,m_data->m_convexVertices,m_data->m_uniqueEdges,m_data->m_convexIndices,m_data->m_convexFaces,m_data->m_contacts,numContacts,maxContactCapacity); // numContacts is presumably advanced by the callee - confirm against its signature
+			if (contactIndex>=0)
+			{
+				pairs[i].z = contactIndex; // a non-negative result is recorded on the pair
+			}
+//			printf("plane-convex\n");
+		}
+	}
+	m_data->m_contacts.resize(numContacts); // shrink from capacity to the final numContacts value
+// Registers the convex hull described by utilPtr and returns its collidable index,
+// or the negative value from allocateCollidable() when no slot is available.
+// Side effects: overwrites utilPtr->m_localCenter with the average of its vertices
+// and, once a shape index has been assigned, appends the hull's local-space AABB.
+int	b3CpuNarrowPhase::registerConvexHullShape(b3ConvexUtility* utilPtr)
+	int collidableIndex = allocateCollidable();
+	if (collidableIndex<0)
+		return collidableIndex;
+	b3Collidable& col = m_data->m_collidablesCPU[collidableIndex];
+	col.m_shapeType = SHAPE_CONVEX_HULL;
+	col.m_shapeIndex = -1;
+	{
+		b3Vector3 localCenter=b3MakeVector3(0,0,0);
+		const int numVerts = utilPtr->m_vertices.size();
+		for (int i=0;i<numVerts;i++)
+			localCenter+=utilPtr->m_vertices[i];
+		// An empty hull (reachable through the float* overload with numVertices==0) used to
+		// scale by 1.f/0 here, which turned the stored centre into NaN. Keep the origin instead.
+		if (numVerts>0)
+			localCenter*= (1.f/numVerts);
+		utilPtr->m_localCenter = localCenter;
+		col.m_shapeIndex = registerConvexHullShapeInternal(utilPtr,col);
+	}
+	if (col.m_shapeIndex>=0)
+	{
+		b3Aabb aabb;
+		// Start from an inverted box so the first vertex initialises both corners.
+		b3Vector3 myAabbMin=b3MakeVector3(1e30f,1e30f,1e30f);
+		b3Vector3 myAabbMax=b3MakeVector3(-1e30f,-1e30f,-1e30f);
+		for (int i=0;i<utilPtr->m_vertices.size();i++)
+		{
+			myAabbMin.setMin(utilPtr->m_vertices[i]);
+			myAabbMax.setMax(utilPtr->m_vertices[i]);
+		}
+		aabb.m_min[0] = myAabbMin[0];
+		aabb.m_min[1] = myAabbMin[1];
+		aabb.m_min[2] = myAabbMin[2];
+		aabb.m_minIndices[3] = 0;
+		aabb.m_max[0] = myAabbMax[0];
+		aabb.m_max[1] = myAabbMax[1];
+		aabb.m_max[2] = myAabbMax[2];
+		aabb.m_signedMaxIndices[3] = 0;
+		m_data->m_localShapeAABBCPU.push_back(aabb);
+	}
+	return collidableIndex;
+int	b3CpuNarrowPhase::allocateCollidable() // Appends one collidable slot and returns its index, or -1 once m_maxConvexShapes slots exist.
+	int curSize = m_data->m_collidablesCPU.size();
+	if (curSize<m_data->m_config.m_maxConvexShapes)
+	{
+		m_data->m_collidablesCPU.expand();
+		return curSize; // the index of the new slot is the size before expanding
+	}
+	else
+	{
+		b3Error("allocateCollidable out-of-range %d\n",m_data->m_config.m_maxConvexShapes);
+	}
+	return -1;
+int	b3CpuNarrowPhase::registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling) // Builds a hull from a strided float array (3 floats read per vertex, each multiplied by scaling[0..2]) and registers it; returns the collidable index.
+	b3AlignedObjectArray<b3Vector3> verts;
+	unsigned char* vts = (unsigned char*) vertices; // byte pointer so the stride can be applied in bytes
+	for (int i=0;i<numVertices;i++)
+	{
+		float* vertex = (float*) &vts[i*strideInBytes];
+		verts.push_back(b3MakeVector3(vertex[0]*scaling[0],vertex[1]*scaling[1],vertex[2]*scaling[2])); // scaling is dereferenced unconditionally, so it must not be null
+	}
+	b3ConvexUtility* utilPtr = new b3ConvexUtility();
+	bool merge = true;
+	if (numVertices)
+	{
+		utilPtr->initializePolyhedralFeatures(&verts[0],verts.size(),merge);
+	}
+	int collidableIndex = registerConvexHullShape(utilPtr); // still called when numVertices is 0, i.e. with an uninitialised utility
+	delete utilPtr; // NOTE(review): registerConvexHullShapeInternal stored this pointer in m_convexData, which now dangles - confirm nothing dereferences it
+	return collidableIndex;
+int b3CpuNarrowPhase::registerConvexHullShapeInternal(b3ConvexUtility* convexPtr,b3Collidable& col) // Copies the hull's edges, faces, indices and vertices into the shared flat arrays and returns the new shape index. The col parameter is not used in this body.
+	m_data->m_convexData.resize(m_data->m_numAcceleratedShapes+1);
+	m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes+1);
+	b3ConvexPolyhedronData& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1); // the entry just added
+	convex.mC = convexPtr->mC;
+	convex.mE = convexPtr->mE;
+	convex.m_extents= convexPtr->m_extents;
+	convex.m_localCenter = convexPtr->m_localCenter;
+	convex.m_radius = convexPtr->m_radius;
+	convex.m_numUniqueEdges = convexPtr->m_uniqueEdges.size();
+	int edgeOffset = m_data->m_uniqueEdges.size(); // this hull's edges start at the current end of the shared array
+	convex.m_uniqueEdgesOffset = edgeOffset;
+	m_data->m_uniqueEdges.resize(edgeOffset+convex.m_numUniqueEdges);
+	//convex data here
+	int i;
+	for ( i=0;i<convexPtr->m_uniqueEdges.size();i++)
+	{
+		m_data->m_uniqueEdges[edgeOffset+i] = convexPtr->m_uniqueEdges[i];
+	}
+	int faceOffset = m_data->m_convexFaces.size();
+	convex.m_faceOffset = faceOffset;
+	convex.m_numFaces = convexPtr->m_faces.size();
+	m_data->m_convexFaces.resize(faceOffset+convex.m_numFaces);
+	for (i=0;i<convexPtr->m_faces.size();i++)
+	{
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane = b3MakeVector3(convexPtr->m_faces[i].m_plane[0],
+																			convexPtr->m_faces[i].m_plane[1],
+																			convexPtr->m_faces[i].m_plane[2],
+																			convexPtr->m_faces[i].m_plane[3]);
+		int indexOffset = m_data->m_convexIndices.size(); // each face appends its own index run and remembers where it starts
+		int numIndices = convexPtr->m_faces[i].m_indices.size();
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_numIndices = numIndices;
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_indexOffset = indexOffset;
+		m_data->m_convexIndices.resize(indexOffset+numIndices);
+		for (int p=0;p<numIndices;p++)
+		{
+			m_data->m_convexIndices[indexOffset+p] = convexPtr->m_faces[i].m_indices[p];
+		}
+	}
+	convex.m_numVertices = convexPtr->m_vertices.size();
+	int vertexOffset = m_data->m_convexVertices.size();
+	convex.m_vertexOffset =vertexOffset;
+	m_data->m_convexVertices.resize(vertexOffset+convex.m_numVertices);
+	for (int i=0;i<convexPtr->m_vertices.size();i++) // this i shadows the function-scope i declared above
+	{
+		m_data->m_convexVertices[vertexOffset+i] = convexPtr->m_vertices[i];
+	}
+	(m_data->m_convexData)[m_data->m_numAcceleratedShapes] = convexPtr; // stores the caller's pointer without taking a copy
+	return m_data->m_numAcceleratedShapes++; // returns the index just used, then advances the counter
+const b3Aabb& b3CpuNarrowPhase::getLocalSpaceAabb(int collidableIndex) const // Returns the stored local-space AABB at collidableIndex. NOTE(review): AABBs are appended only after a successful hull registration, so this assumes AABB and collidable indices stay in step - confirm.
+	return m_data->m_localShapeAABBCPU[collidableIndex];
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.h
new file mode 100644
index 00000000..528be334
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.h
@@ -0,0 +1,105 @@
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+class b3CpuNarrowPhase // CPU narrow phase: registers collision shapes and computes contacts for body pairs. Braces and access specifiers are not present in this chunk.
+	struct b3CpuNarrowPhaseInternalData*	m_data; // allocated in the constructor; defined in the .cpp
+	int m_acceleratedCompanionShapeIndex;
+	int m_planeBodyIndex;
+	int	m_static0Index; // returned by getStatic0Index()
+	int registerConvexHullShapeInternal(class b3ConvexUtility* convexPtr,b3Collidable& col); // copies hull data into the shared arrays and returns the shape index
+	int registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling);
+	b3CpuNarrowPhase(const struct b3Config& config);
+	virtual ~b3CpuNarrowPhase(void);
+	int		registerSphereShape(float radius);
+	int		registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
+	int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
+	int registerFace(const b3Vector3& faceNormal, float faceConstant);
+	int	registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling);
+	//do they need to be merged?
+	int	registerConvexHullShape(b3ConvexUtility* utilPtr); // returns the collidable index, negative when no slot is available
+	int	registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling); // strided float input; builds a temporary b3ConvexUtility and forwards to the overload above
+	//int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax,bool writeToGpu);
+	void setObjectTransform(const float* position, const float* orientation , int bodyIndex);
+	void	writeAllBodiesToGpu();
+	void  reset();
+	void	readbackAllBodiesToCpu();
+	bool	getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const;
+	void setObjectTransformCpu(float* position, float* orientation , int bodyIndex);
+	void setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex);
+	//virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects);
+	virtual void computeContacts(b3AlignedObjectArray<b3Int4>& pairs, b3AlignedObjectArray<b3Aabb>& aabbsWorldSpace, b3AlignedObjectArray<b3RigidBodyData>& bodies); // fills the array returned by getContacts() and writes contact indices into pairs[i].z
+	const struct b3RigidBodyData* getBodiesCpu() const;
+	//struct b3RigidBodyData* getBodiesCpu();
+	int	getNumBodiesGpu() const;
+	int	getNumBodyInertiasGpu() const;
+	const struct b3Collidable* getCollidablesCpu() const;
+	int		getNumCollidablesGpu() const;
+	/*const struct b3Contact4* getContactsCPU() const;
+	int	getNumContactsGpu() const;
+	*/
+	const b3AlignedObjectArray<b3Contact4Data>& getContacts() const;
+	int getNumRigidBodies() const;
+	int allocateCollidable(); // returns the new collidable index, or -1 when the configured maximum is reached
+	int getStatic0Index() const
+	{
+		return m_static0Index;
+	}
+	b3Collidable& getCollidableCpu(int collidableIndex);
+	const b3Collidable& getCollidableCpu(int collidableIndex) const;
+	const b3CpuNarrowPhaseInternalData*	getInternalData() const
+	{
+			return m_data;
+	}
+	const struct b3Aabb& getLocalSpaceAabb(int collidableIndex) const;
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h
new file mode 100644
index 00000000..fba8bd07
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h
@@ -0,0 +1,24 @@
+#ifndef B3_RAYCAST_INFO_H
+#define B3_RAYCAST_INFO_H
+#include "Bullet3Common/b3Vector3.h"
+B3_ATTRIBUTE_ALIGNED16(struct) b3RayInfo // Ray description as two points; presumably the start and end of the ray - confirm with the raycaster.
+	b3Vector3 m_from;
+	b3Vector3 m_to;
+B3_ATTRIBUTE_ALIGNED16(struct) b3RayHit // Result record for one ray; field semantics are not established in this chunk.
+		b3Scalar	m_hitFraction; // presumably the parametric position of the hit along the ray - TODO confirm
+		int	m_hitBody;
+		int	m_hitResult1;
+		int	m_hitResult2;
+		b3Vector3 m_hitPoint;
+		b3Vector3 m_hitNormal;
+#endif //B3_RAYCAST_INFO_H
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h
new file mode 100644
index 00000000..d58f7180
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h
@@ -0,0 +1,30 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_RIGID_BODY_CL
+#define B3_RIGID_BODY_CL
+#include "Bullet3Common/b3Scalar.h"
+#include "Bullet3Common/b3Matrix3x3.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+inline float	b3GetInvMass(const b3RigidBodyData& body) // Accessor returning the body's m_invMass field unchanged.
+		return body.m_invMass;
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h
new file mode 100644
index 00000000..8788ccbb
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h
@@ -0,0 +1,20 @@
+typedef struct b3BvhSubtreeInfoData b3BvhSubtreeInfoData_t;
+struct b3BvhSubtreeInfoData // Header for one BVH subtree: quantized bounds plus the node range it covers.
+	//12 bytes
+	unsigned short int	m_quantizedAabbMin[3]; // tested against the quantized query AABB in b3BvhTraversal
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes, points to the root of the subtree
+	int			m_rootNodeIndex;
+	//4 bytes
+	int			m_subtreeSize; // traversal visits nodes in [m_rootNodeIndex, m_rootNodeIndex + m_subtreeSize)
+	int			m_padding[3];
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3BvhTraversal.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3BvhTraversal.h
new file mode 100644
index 00000000..2618da24
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3BvhTraversal.h
@@ -0,0 +1,126 @@
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
+// work-in-progress
+void   b3BvhTraversal( __global const b3Int4* pairs, // Processes pair `id`: when body A is a concave trimesh, walks its quantized BVH against body B's AABB and appends one output pair per overlapping leaf triangle.
+									__global const b3RigidBodyData* rigidBodies, 
+									__global const b3Collidable* collidables,
+									__global b3Aabb* aabbs,
+									__global b3Int4* concavePairsOut,
+									__global volatile int* numConcavePairsOut, // incremented atomically even when the write itself is skipped for lack of capacity
+									__global const b3BvhSubtreeInfo* subtreeHeadersRoot,
+									__global const b3QuantizedBvhNode* quantizedNodesRoot,
+									__global const b3BvhInfo* bvhInfos,
+									int numPairs, // not read in the visible body
+									int maxNumConcavePairsCapacity,
+									int id)
+	int bodyIndexA = pairs[id].x;
+	int bodyIndexB = pairs[id].y;
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	//once the broadphase avoids static-static pairs, we can remove this test
+	if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+	{
+		return;
+	}
+	if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) // only A may be the trimesh; the reversed order is not handled here
+		return;
+	int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+	if (shapeTypeB!=SHAPE_CONVEX_HULL &&
+		shapeTypeB!=SHAPE_SPHERE	&&
+		) // NOTE(review): the last operand of this condition appears to be missing from this chunk
+		return;
+	b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes]; // m_numChildShapes shares a union with m_bvhIndex in b3Collidable
+	b3Float4	bvhAabbMin = bvhInfo.m_aabbMin;
+	b3Float4	bvhAabbMax = bvhInfo.m_aabbMax;
+	b3Float4	bvhQuantization = bvhInfo.m_quantization;
+	int numSubtreeHeaders = bvhInfo.m_numSubTrees;
+	__global const b3BvhSubtreeInfoData* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];
+	__global const b3QuantizedBvhNodeData* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];
+	unsigned short int quantizedQueryAabbMin[3];
+	unsigned short int quantizedQueryAabbMax[3];
+	b3QuantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_minVec,false,bvhAabbMin, bvhAabbMax,bvhQuantization); // quantize B's AABB into this BVH's quantization space
+	b3QuantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_maxVec,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);
+	for (int i=0;i<numSubtreeHeaders;i++)
+	{
+		b3BvhSubtreeInfoData subtree = subtreeHeaders[i];
+		int overlap = b3TestQuantizedAabbAgainstQuantizedAabbSlow(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+		if (overlap != 0)
+		{
+			int startNodeIndex = subtree.m_rootNodeIndex;
+			int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;
+			int curIndex = startNodeIndex;
+			int escapeIndex;
+			int isLeafNode;
+			int aabbOverlap;
+			while (curIndex < endNodeIndex) // stackless walk over the subtree's node range
+			{
+				b3QuantizedBvhNodeData rootNode = quantizedNodes[curIndex];
+				aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabbSlow(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);
+				isLeafNode = b3IsLeaf(&rootNode);
+				if (aabbOverlap)
+				{
+					if (isLeafNode)
+					{
+						int triangleIndex = b3GetTriangleIndex(&rootNode);
+						{ // NOTE(review): the matching `else` below suggests an `if` line is missing before this brace in this chunk
+								int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+								int pairIdx = b3AtomicAdd (numConcavePairsOut,numChildrenB); // reserve one output slot per child of B
+								for (int b=0;b<numChildrenB;b++)
+								{
+									if ((pairIdx+b)<maxNumConcavePairsCapacity)
+									{
+										int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+										b3Int4 newPair = b3MakeInt4(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);
+										concavePairsOut[pairIdx+b] = newPair;
+									}
+								}
+						} else
+						{
+							int pairIdx = b3AtomicInc(numConcavePairsOut);
+							if (pairIdx<maxNumConcavePairsCapacity)
+							{
+								b3Int4 newPair = b3MakeInt4(bodyIndexA,bodyIndexB,triangleIndex,0);
+								concavePairsOut[pairIdx] = newPair;
+							}
+						}
+					} 
+					curIndex++; // overlapping node: continue with the next node in order
+				} else
+				{
+					if (isLeafNode)
+					{
+						curIndex++;
+					} else
+					{
+						escapeIndex = b3GetEscapeIndex(&rootNode);
+						curIndex += escapeIndex; // non-overlapping internal node: jump ahead by its escape index
+					}
+				}
+			}
+		}
+	}
\ No newline at end of file
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ClipFaces.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ClipFaces.h
new file mode 100644
index 00000000..b5633ecd
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ClipFaces.h
@@ -0,0 +1,188 @@
+#ifndef B3_CLIP_FACES_H
+#define B3_CLIP_FACES_H
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+inline b3Float4 b3Lerp3(b3Float4ConstArg a,b3Float4ConstArg b, float  t) // Linear interpolation a + (b - a) * t on x, y and z; the fourth component of the result is set to 0.
+	return b3MakeFloat4(	a.x + (b.x - a.x) * t,
+						a.y + (b.y - a.y) * t,
+						a.z + (b.z - a.z) * t,
+						0.f);
+// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut
+int clipFaceGlobal(__global const b3Float4* pVtxIn, int numVertsIn, b3Float4ConstArg planeNormalWS,float planeEqWS, __global b3Float4* ppVtxOut) // Walks the polygon edge by edge, keeping vertices with negative signed distance to the plane and inserting the crossing point where an edge changes side.
+	int ve;
+	float ds, de; // signed plane distances of the current edge's start and end vertex
+	int numVertsOut = 0;
+    //double-check next test
+    //	if (numVertsIn < 2)
+    //		return 0;
+	b3Float4 firstVertex=pVtxIn[numVertsIn-1]; // start from the closing edge (last vertex to first); reads pVtxIn[-1] when numVertsIn is 0 because the guard above is commented out
+	b3Float4 endVertex = pVtxIn[0];
+	ds = b3Dot(planeNormalWS,firstVertex)+planeEqWS;
+	for (ve = 0; ve < numVertsIn; ve++)
+	{
+		endVertex=pVtxIn[ve];
+		de = b3Dot(planeNormalWS,endVertex)+planeEqWS;
+		if (ds<0)
+		{
+			if (de<0)
+			{
+				// Start < 0, end < 0, so output endVertex
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+			else
+			{
+				// Start < 0, end >= 0, so output intersection
+				ppVtxOut[numVertsOut++] = b3Lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) ); // ds/(ds-de) is the fraction along the edge at which the distance reaches zero
+			}
+		}
+		else
+		{
+			if (de<0)
+			{
+				// Start >= 0, end < 0 so output intersection and end
+				ppVtxOut[numVertsOut++] = b3Lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+		}
+		firstVertex = endVertex; // the end of this edge is the start of the next
+		ds = de;
+	}
+	return numVertsOut;
+__kernel void   clipFacesAndFindContactsKernel(    __global const b3Float4* separatingNormals, // For one pair: clips face B's polygon against the side planes of face A, then keeps clipped points whose depth to A's plane is at most maxDist. separatingNormals is not read in this body.
+                                                   __global const int* hasSeparatingAxis,
+                                                   __global b3Int4* clippingFacesOut,
+                                                   __global b3Float4* worldVertsA1,
+                                                   __global b3Float4* worldNormalsA1,
+                                                   __global b3Float4* worldVertsB1,
+                                                   __global b3Float4* worldVertsB2,
+                                                    int vertexFaceCapacity,
+															int pairIndex
+                                                   )
+//    int i = get_global_id(0);
+	//int pairIndex = i;
+	int i = pairIndex;
+	float minDist = -1e30f; // depths at or below this value are clamped to it
+	float maxDist = 0.02f; // clipped points deeper than this (more positive) are dropped
+//	if (i<numPairs)
+	{
+		if (hasSeparatingAxis[i])
+		{
+//			int bodyIndexA = pairs[i].x;
+	//		int bodyIndexB = pairs[i].y;
+            int numLocalContactsOut = 0;
+            int capacityWorldVertsB2 = vertexFaceCapacity; // each pair owns a slice of vertexFaceCapacity entries in the vertex buffers
+            __global b3Float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];
+            __global b3Float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];
+            {
+                __global b3Int4* clippingFaces = clippingFacesOut;
+                int closestFaceA = clippingFaces[pairIndex].x;
+                int closestFaceB = clippingFaces[pairIndex].y;
+                int numVertsInA = clippingFaces[pairIndex].z;
+                int numVertsInB = clippingFaces[pairIndex].w;
+                int numVertsOut = 0;
+                if (closestFaceA>=0)
+                {
+                    // clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+                    for(int e0=0;e0<numVertsInA;e0++)
+                    {
+                        const b3Float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];
+                        const b3Float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];
+                        const b3Float4 WorldEdge0 = aw - bw;
+                        b3Float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];
+                        b3Float4 planeNormalWS1 = -b3Cross(WorldEdge0,worldPlaneAnormal1); // side plane through this edge, built from the edge direction and face A's normal
+                        b3Float4 worldA1 = aw;
+                        float planeEqWS1 = -b3Dot(worldA1,planeNormalWS1);
+                        b3Float4 planeNormalWS = planeNormalWS1;
+                        float planeEqWS=planeEqWS1;
+                        numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);
+                        __global b3Float4* tmp = pVtxOut; // ping-pong: this pass's output is the next pass's input
+                        pVtxOut = pVtxIn;
+                        pVtxIn = tmp;
+                        numVertsInB = numVertsOut;
+                        numVertsOut = 0;
+                    }
+                    b3Float4 planeNormalWS = worldNormalsA1[pairIndex];
+                    float planeEqWS=-b3Dot(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);
+                    for (int i=0;i<numVertsInB;i++) // this i shadows the outer i
+                    {
+                        float depth = b3Dot(planeNormalWS,pVtxIn[i])+planeEqWS;
+                        if (depth <=minDist)
+                        {
+                            depth = minDist;
+                        }
+						static float maxDepth = 0.f; // NOTE(review): looks like debug instrumentation; myframecount is not declared anywhere in this chunk - presumably an enclosing preprocessor guard is missing
+						if (depth < maxDepth)
+						{
+							maxDepth = depth;
+							if (maxDepth < -10)
+							{
+								printf("error at framecount %d?\n",myframecount);
+							}
+							printf("maxDepth = %f\n", maxDepth);
+						}
+                        if (depth <=maxDist)
+                        {
+                            b3Float4 pointInWorld = pVtxIn[i];
+                            pVtxOut[numLocalContactsOut++] = b3MakeFloat4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth); // the depth is stored in the fourth component
+                        }
+                    }
+                }
+                clippingFaces[pairIndex].w =numLocalContactsOut; // w is overwritten with the number of contact points kept
+            }
+            for (int i=0;i<numLocalContactsOut;i++)
+                pVtxIn[i] = pVtxOut[i]; // copy the kept points into the other buffer as well
+		}//		if (hasSeparatingAxis[i])
+	}//	if (i<numPairs)
+#endif //B3_CLIP_FACES_H
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h
new file mode 100644
index 00000000..77cdc7b7
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h
@@ -0,0 +1,76 @@
+#ifndef B3_COLLIDABLE_H
+#define B3_COLLIDABLE_H
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Common/shared/b3Quat.h"
+enum b3ShapeTypes
+typedef struct b3Collidable b3Collidable_t;
+struct b3Collidable // Per-shape record; the unions let one slot carry different data depending on the shape type.
+	union {
+		int m_numChildShapes; // read as a child count for shape B in b3BvhTraversal
+		int m_bvhIndex; // same storage; b3BvhTraversal indexes bvhInfos through this slot for concave trimeshes
+	};
+	union
+	{
+		float m_radius;
+		int	m_compoundBvhIndex;
+	};
+	int m_shapeType; // compared against the SHAPE_* constants
+	union
+	{
+		int m_shapeIndex; // for convex hulls: the index returned by registerConvexHullShapeInternal (-1 until assigned)
+		float m_height;
+	};
+typedef struct b3GpuChildShape b3GpuChildShape_t;
+struct b3GpuChildShape // One child of a compound shape: its position and orientation plus type-dependent data in unions.
+	b3Float4	m_childPosition;
+	b3Quat		m_childOrientation;
+	union
+	{
+		int			m_shapeIndex;//used for SHAPE_COMPOUND_OF_CONVEX_HULLS
+		int			m_capsuleAxis;
+	};
+	union 
+	{
+		float		m_radius;//used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES
+		int			m_numChildShapes;//used for compound shape
+	};
+	union 
+	{
+		float		m_height;//used for childshape of SHAPE_COMPOUND_OF_CAPSULES
+		int	m_collidableShapeIndex;
+	};
+	int			m_shapeType; // which union members are meaningful depends on this value
+struct b3CompoundOverlappingPair // Overlapping pair record carrying a child shape index for each body in addition to the two body indices.
+	int m_bodyIndexA;
+	int m_bodyIndexB;
+//	int	m_pairType;
+	int m_childShapeIndexA;
+	int m_childShapeIndexB;
+#endif //B3_COLLIDABLE_H
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h
new file mode 100644
index 00000000..dfd45cc5
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h
@@ -0,0 +1,40 @@
+#ifndef B3_CONTACT4DATA_H
+#define B3_CONTACT4DATA_H
+#include "Bullet3Common/shared/b3Float4.h"
+typedef  struct b3Contact4Data b3Contact4Data_t; // alias so the record can be named without the struct keyword
+struct b3Contact4Data // contact record holding up to four points, one shared normal and the indices of the two bodies
+	b3Float4	m_worldPosB[4]; // contact points; the writers in this patch store the penetration depth in each point's w
+//	b3Float4	m_localPosA[4];
+//	b3Float4	m_localPosB[4];
+	b3Float4	m_worldNormalOnB;	//	xyz: contact normal, w: number of valid points stored as a float (m_nPoints)
+	unsigned short  m_restituitionCoeffCmp;
+	unsigned short  m_frictionCoeffCmp; // NOTE(review): looks like a coefficient packed into 16 bits - confirm the scale
+	int m_batchIdx;
+	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr; the writers in this patch store the body index, negated when the body's m_invMass is 0
+	int m_bodyBPtrAndSignBit;
+	int	m_childIndexA;
+	int	m_childIndexB;
+	int m_unused1;
+	int m_unused2;
+inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact) // returns the point count kept in the w component of the contact normal
+	return (int)contact->m_worldNormalOnB.w; // the count is stored as a float, so it is truncated back to int here
+inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints) // stores the point count in the w component of the contact normal
+	contact->m_worldNormalOnB.w = (float)numPoints; // overwrites only w; the normal's x, y and z are left untouched
+#endif //B3_CONTACT4DATA_H
\ No newline at end of file
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ContactConvexConvexSAT.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ContactConvexConvexSAT.h
new file mode 100644
index 00000000..65b33390
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ContactConvexConvexSAT.h
@@ -0,0 +1,523 @@
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3FindSeparatingAxis.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ReduceContacts.h"
+#define B3_MAX_VERTS 1024
+inline b3Float4 b3Lerp3(const b3Float4& a,const b3Float4& b, float  t) // linear interpolation a + (b-a)*t on x, y and z; t=0 gives a, t=1 gives b
+	return b3MakeVector3(	a.x + (b.x - a.x) * t,
+						a.y + (b.y - a.y) * t,
+						a.z + (b.z - a.z) * t,
+						0.f); // the w of the inputs is ignored; 0 is passed as the fourth argument
+// Clips the polygon pVtxIn[0..numVertsIn) against a plane and keeps the part where dot(planeNormalWS,v)+planeEqWS < 0. The result is written to ppVtxOut and its vertex count is returned; fewer than 2 input vertices returns 0.
+inline int b3ClipFace(const b3Float4* pVtxIn, int numVertsIn, b3Float4& planeNormalWS,float planeEqWS, b3Float4* ppVtxOut) // no capacity is passed for ppVtxOut, so the caller must size it
+	int ve;
+	float ds, de; // signed plane distances of the current edge's start and end vertex
+	int numVertsOut = 0;
+	if (numVertsIn < 2)
+		return 0;
+	b3Float4 firstVertex=pVtxIn[numVertsIn-1]; // begin with the closing edge: last vertex -> vertex 0
+	b3Float4 endVertex = pVtxIn[0];
+	ds = b3Dot3F4(planeNormalWS,firstVertex)+planeEqWS;
+	for (ve = 0; ve < numVertsIn; ve++)
+	{
+		endVertex=pVtxIn[ve];
+		de = b3Dot3F4(planeNormalWS,endVertex)+planeEqWS;
+		if (ds<0)
+		{
+			if (de<0)
+			{
+				// Start < 0, end < 0, so output endVertex
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+			else
+			{
+				// Start < 0, end >= 0, so output intersection
+				ppVtxOut[numVertsOut++] = b3Lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) ); // ds/(ds-de) is the edge parameter where the signed distance reaches 0
+			}
+		}
+		else
+		{
+			if (de<0)
+			{
+				// Start >= 0, end < 0 so output intersection and end
+				ppVtxOut[numVertsOut++] = b3Lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+		}
+		firstVertex = endVertex; // the end of this edge is the start of the next one
+		ds = de;
+	}
+	return numVertsOut;
+// Clips the world-space polygon worldVertsB1[0..numWorldVertsB1) against one face of hull A and emits contact points.
+// The face of A is the one whose world-space normal has the smallest dot product with separatingNormal.
+// The polygon is clipped against the side planes built from each edge of that face, ping-ponging between
+// worldVertsB1 and worldVertsB2 (both buffers are overwritten). Surviving vertices whose signed distance to the
+// face plane is <= maxDist are written to contactsOut as (x,y,z,depth), with depth clamped from below to minDist.
+// Returns the number of contacts written (at most contactCapacity); returns 0 when hull A has no faces.
+// capacityWorldVertsB2 is not checked in the visible code.
+inline int b3ClipFaceAgainstHull(const b3Float4& separatingNormal, const b3ConvexPolyhedronData* hullA,  
+	const b3Float4& posA, const b3Quaternion& ornA, b3Float4* worldVertsB1, int numWorldVertsB1,
+	b3Float4* worldVertsB2, int capacityWorldVertsB2,
+	const float minDist, float maxDist,
+	const b3AlignedObjectArray<b3Float4>& verticesA,	const b3AlignedObjectArray<b3GpuFace>& facesA,	const b3AlignedObjectArray<int>& indicesA,
+	//const b3Float4* verticesB,	const b3GpuFace* facesB,	const int* indicesB,
+	b3Float4* contactsOut,
+	int contactCapacity)
+	int numContactsOut = 0;
+	b3Float4* pVtxIn = worldVertsB1;
+	b3Float4* pVtxOut = worldVertsB2;
+	int numVertsIn = numWorldVertsB1;
+	int numVertsOut = 0;
+	int closestFaceA=-1;
+	{
+		// pick the face of A whose world normal is most opposed to the separating normal (smallest dot product)
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const b3Float4 Normal = b3MakeVector3(
+				facesA[hullA->m_faceOffset+face].m_plane.x, 
+				facesA[hullA->m_faceOffset+face].m_plane.y, 
+				facesA[hullA->m_faceOffset+face].m_plane.z,0.f);
+			const b3Float4 faceANormalWS = b3QuatRotate(ornA,Normal);
+			float d = b3Dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	if (closestFaceA<0)
+		return numContactsOut;
+	b3GpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];
+	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	int numContacts = numWorldVertsB1;
+	int numVerticesA = polyA.m_numIndices;
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+		const b3Float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];
+		const b3Float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];
+		const b3Float4 edge0 = a - b;
+		const b3Float4 WorldEdge0 = b3QuatRotate(ornA,edge0);
+		b3Float4 planeNormalA = b3MakeFloat4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+		b3Float4 worldPlaneAnormal1 = b3QuatRotate(ornA,planeNormalA);
+		// side plane through edge (a,b): its normal is perpendicular to both the edge and the face normal
+		b3Float4 planeNormalWS1 = -b3Cross3(WorldEdge0,worldPlaneAnormal1);
+		b3Float4 worldA1 = b3TransformPoint(a,posA,ornA);
+		float planeEqWS1 = -b3Dot3F4(worldA1,planeNormalWS1);
+		b3Float4 planeNormalWS = planeNormalWS1;
+		float planeEqWS=planeEqWS1;
+		//clip face
+		//clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);
+		numVertsOut = b3ClipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);
+		//btSwap(pVtxIn,pVtxOut);
+		// swap the buffers so the clipped polygon becomes the input of the next side plane
+		b3Float4* tmp = pVtxOut;
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsIn = numVertsOut;
+		numVertsOut = 0;
+	}
+	// only keep points that are behind the witness face
+	{
+		b3Float4 localPlaneNormal  = b3MakeFloat4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+		float localPlaneEq = polyA.m_plane.w;
+		b3Float4 planeNormalWS = b3QuatRotate(ornA,localPlaneNormal);
+		float planeEqWS=localPlaneEq-b3Dot3F4(planeNormalWS,posA);
+		for (int i=0;i<numVertsIn;i++)
+		{
+			float depth = b3Dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;
+			if (depth <=minDist)
+			{
+				depth = minDist;
+			}
+			if (numContactsOut<contactCapacity)
+			{
+				if (depth <=maxDist)
+				{
+					b3Float4 pointInWorld = pVtxIn[i];
+					//resultOut.addContactPoint(separatingNormal,point,depth);
+					contactsOut[numContactsOut++] = b3MakeVector3(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);
+					//printf("depth=%f\n",depth);
+				}
+			} else
+			{
+				// format string fixed: the second specifier used to read "%df", which appended a stray 'f' to the capacity
+				b3Error("exceeding contact capacity (%d,%d)\n", numContactsOut,contactCapacity);
+			}
+		}
+	}
+	return numContactsOut;
+// Generates contact points between two convex hulls for a given separating normal.
+// Picks the face of hull B whose world-space normal has the largest dot product with separatingNormal,
+// transforms that face's vertices into world space (written to worldVertsB1) and hands the polygon to
+// b3ClipFaceAgainstHull, which clips it against hull A and fills contactsOut.
+// worldVertsB1 and worldVertsB2 are scratch buffers holding capacityWorldVerts entries each.
+// Returns the number of contacts written; returns 0 when hull B has no faces.
+// The visible code prints the faces of hull B with printf on the first call only (guarded by a function-local static).
+inline int	b3ClipHullAgainstHull(const b3Float4& separatingNormal, 
+	const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, 
+	const b3Float4& posA, const b3Quaternion& ornA,const b3Float4& posB, const b3Quaternion& ornB, 
+	b3Float4* worldVertsB1, b3Float4* worldVertsB2, int capacityWorldVerts,
+	const float minDist, float maxDist,
+	const b3AlignedObjectArray<b3Float4>& verticesA,	const b3AlignedObjectArray<b3GpuFace>& facesA,	const b3AlignedObjectArray<int>& indicesA,
+	const b3AlignedObjectArray<b3Float4>& verticesB,	const b3AlignedObjectArray<b3GpuFace>& facesB,	const b3AlignedObjectArray<int>& indicesB,
+	b3Float4*	contactsOut,
+	int contactCapacity)
+	int numContactsOut = 0;
+	int numWorldVertsB1= 0;
+	B3_PROFILE("clipHullAgainstHull");
+	float curMaxDist=maxDist;
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+	{
+		//B3_PROFILE("closestFaceB");
+		if (hullB.m_numFaces!=1)
+		{
+			//printf("wtf\n");
+		}
+		static bool once = true;
+		//printf("separatingNormal=%f,%f,%f\n",separatingNormal.x,separatingNormal.y,separatingNormal.z);
+		for(int face=0;face<hullB.m_numFaces;face++)
+		{
+			if (once)
+				printf("face %d\n",face);
+			const b3GpuFace* faceB = &facesB[hullB.m_faceOffset+face];
+			if (once)
+			{
+				for (int i=0;i<faceB->m_numIndices;i++)
+				{
+					b3Float4 vert = verticesB[hullB.m_vertexOffset+indicesB[faceB->m_indexOffset+i]];
+					printf("vert[%d] = %f,%f,%f\n",i,vert.x,vert.y,vert.z);
+				}
+			}
+			//if (facesB[hullB.m_faceOffset+face].m_numIndices>2)
+			{
+				const b3Float4 Normal = b3MakeVector3(facesB[hullB.m_faceOffset+face].m_plane.x, 
+					facesB[hullB.m_faceOffset+face].m_plane.y, facesB[hullB.m_faceOffset+face].m_plane.z,0.f);
+				const b3Float4 WorldNormal = b3QuatRotate(ornB, Normal);
+				if (once)
+					printf("faceNormal = %f,%f,%f\n",Normal.x,Normal.y,Normal.z);
+				float d = b3Dot3F4(WorldNormal,separatingNormal);
+				if (d > dmax)
+				{
+					dmax = d;
+					closestFaceB = face;
+				}
+			}
+		}
+		once = false;
+	}
+	b3Assert(closestFaceB>=0);
+	// Leave before facesB is indexed: with asserts compiled out, a hull without faces would otherwise read entry m_faceOffset-1.
+	if (closestFaceB<0)
+		return numContactsOut;
+	{
+		//B3_PROFILE("worldVertsB1");
+		const b3GpuFace& polyB = facesB[hullB.m_faceOffset+closestFaceB];
+		const int numVertices = polyB.m_numIndices;
+		b3Assert(numVertices<=capacityWorldVerts);
+		// never write past the scratch buffer, even if the face has more vertices than it can hold
+		for(int e0=0;e0<numVertices && numWorldVertsB1<capacityWorldVerts;e0++)
+		{
+			const b3Float4& b = verticesB[hullB.m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+			worldVertsB1[numWorldVertsB1++] = b3TransformPoint(b,posB,ornB);
+		}
+	}
+	{
+		//B3_PROFILE("clipFaceAgainstHull");
+		numContactsOut = b3ClipFaceAgainstHull((b3Float4&)separatingNormal, &hullA, 
+				posA,ornA,
+				worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,
+				verticesA,				facesA,				indicesA,
+				contactsOut,contactCapacity);
+	}
+	return numContactsOut;
+inline int b3ClipHullHullSingle( // clips hull B against hull A along sepNormalWorldSpace, reduces the result to a few points and appends one contact record; returns its index or -1
+			int bodyIndexA, int bodyIndexB,
+										 const b3Float4& posA,
+										 const b3Quaternion& ornA,
+										 const b3Float4& posB,
+										 const b3Quaternion& ornB,
+			int collidableIndexA, int collidableIndexB,
+			const b3AlignedObjectArray<b3RigidBodyData>* bodyBuf, 
+			b3AlignedObjectArray<b3Contact4Data>* globalContactOut, 
+			int& nContacts, // in/out: current number of contacts, incremented when a record is appended
+			const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataA,
+			const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataB,
+			const b3AlignedObjectArray<b3Vector3>& verticesA, 
+			const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, 
+			const b3AlignedObjectArray<b3GpuFace>& facesA,
+			const b3AlignedObjectArray<int>& indicesA,
+			const b3AlignedObjectArray<b3Vector3>& verticesB,
+			const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB,
+			const b3AlignedObjectArray<b3GpuFace>& facesB,
+			const b3AlignedObjectArray<int>& indicesB,
+			const b3AlignedObjectArray<b3Collidable>& hostCollidablesA,
+			const b3AlignedObjectArray<b3Collidable>& hostCollidablesB,
+			const b3Vector3& sepNormalWorldSpace,
+			int maxContactCapacity			)
+	int contactIndex = -1; // stays -1 when no contact is produced or the output array is full
+	b3ConvexPolyhedronData hullA, hullB; // assigned below but not read again in the visible code
+    b3Collidable colA = hostCollidablesA[collidableIndexA];
+    hullA = hostConvexDataA[colA.m_shapeIndex];
+    //printf("numvertsA = %d\n",hullA.m_numVertices);
+    b3Collidable colB = hostCollidablesB[collidableIndexB];
+    hullB = hostConvexDataB[colB.m_shapeIndex];
+    //printf("numvertsB = %d\n",hullB.m_numVertices);
+	b3Float4 contactsOut[B3_MAX_VERTS]; // stack buffer receiving the unreduced contact points
+	int localContactCapacity = B3_MAX_VERTS;
+#ifdef _WIN32
+	b3Assert(_finite(bodyBuf->at(bodyIndexA).m_pos.x));
+	b3Assert(_finite(bodyBuf->at(bodyIndexB).m_pos.x));
+	{
+		b3Float4 worldVertsB1[B3_MAX_VERTS]; // scratch polygon buffers used by the clipping call below
+		b3Float4 worldVertsB2[B3_MAX_VERTS];
+		int capacityWorldVerts = B3_MAX_VERTS;
+		b3Float4 hostNormal = b3MakeFloat4(sepNormalWorldSpace.x,sepNormalWorldSpace.y,sepNormalWorldSpace.z,0.f);
+		int shapeA = hostCollidablesA[collidableIndexA].m_shapeIndex;
+		int shapeB = hostCollidablesB[collidableIndexB].m_shapeIndex;
+		b3Scalar minDist = -1; // passed to b3ClipHullAgainstHull as the depth clamp
+		b3Scalar maxDist = 0.; // passed to b3ClipHullAgainstHull as the depth cutoff
+		b3Transform trA,trB;
+		{
+		//B3_PROFILE("b3TransformPoint computation");
+		//trA.setIdentity();
+		trA.setOrigin(b3MakeVector3(posA.x,posA.y,posA.z));
+		trA.setRotation(b3Quaternion(ornA.x,ornA.y,ornA.z,ornA.w));
+		//trB.setIdentity();
+		trB.setOrigin(b3MakeVector3(posB.x,posB.y,posB.z));
+		trB.setRotation(b3Quaternion(ornB.x,ornB.y,ornB.z,ornB.w));
+		}
+		b3Quaternion trAorn = trA.getRotation();
+        b3Quaternion trBorn = trB.getRotation();
+		int numContactsOut = b3ClipHullAgainstHull(hostNormal, 
+						hostConvexDataA.at(shapeA), 
+						hostConvexDataB.at(shapeB),
+								(b3Float4&)trA.getOrigin(), (b3Quaternion&)trAorn,
+								(b3Float4&)trB.getOrigin(), (b3Quaternion&)trBorn,
+								worldVertsB1,worldVertsB2,capacityWorldVerts,
+								minDist, maxDist,
+								verticesA,	facesA,indicesA,
+								verticesB,	facesB,indicesB,
+								contactsOut,localContactCapacity);
+		if (numContactsOut>0)
+		{
+			B3_PROFILE("overlap");
+			b3Float4 normalOnSurfaceB = (b3Float4&)hostNormal;
+//			b3Float4 centerOut;
+			b3Int4 contactIdx; // indices into contactsOut of the points to keep; pre-filled with 0..3
+			contactIdx.x = 0;
+			contactIdx.y = 1;
+			contactIdx.z = 2;
+			contactIdx.w = 3;
+			int numPoints = 0;
+			{
+				B3_PROFILE("extractManifold");
+				numPoints = b3ReduceContacts(contactsOut, numContactsOut, normalOnSurfaceB,  &contactIdx); // NOTE(review): presumably returns at most 4, since m_worldPosB has four slots - confirm
+			}
+			b3Assert(numPoints);
+			if (nContacts<maxContactCapacity)
+			{
+				contactIndex = nContacts;
+				globalContactOut->expand(); // grows the output array by one element; the new element is then addressed as at(nContacts)
+				b3Contact4Data& contact = globalContactOut->at(nContacts);
+				contact.m_batchIdx = 0;//i;
+				contact.m_bodyAPtrAndSignBit = (bodyBuf->at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA; // index is negated when m_invMass is 0
+				contact.m_bodyBPtrAndSignBit = (bodyBuf->at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB;
+				contact.m_frictionCoeffCmp = 45874; // NOTE(review): presumably 0.7 scaled to 16 bits (0.7*65535 is about 45874) - confirm
+				contact.m_restituitionCoeffCmp = 0;
+				float distance = 0.f; // not read in the visible code
+				for (int p=0;p<numPoints;p++)
+				{
+					contact.m_worldPosB[p] = contactsOut[contactIdx.s[p]];//check if it is actually on B
+					contact.m_worldNormalOnB = normalOnSurfaceB; // same value on every iteration
+				}
+				//printf("bodyIndexA %d,bodyIndexB %d,normal=%f,%f,%f numPoints %d\n",bodyIndexA,bodyIndexB,normalOnSurfaceB.x,normalOnSurfaceB.y,normalOnSurfaceB.z,numPoints);
+				contact.m_worldNormalOnB.w = (b3Scalar)numPoints; // the point count travels in the normal's w
+				nContacts++;
+			} else
+			{
+				b3Error("Error: exceeding contact capacity (%d/%d)\n", nContacts,maxContactCapacity);
+			}
+		}
+	}
+	return contactIndex;
+inline int b3ContactConvexConvexSAT( // runs b3FindSeparatingAxis for one body pair and, when it returns true, b3ClipHullHullSingle; returns the appended contact index or -1
+																int pairIndex,
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																const b3AlignedObjectArray<b3RigidBodyData>& rigidBodies, 
+																const b3AlignedObjectArray<b3Collidable>& collidables,
+																const b3AlignedObjectArray<b3ConvexPolyhedronData>& convexShapes,
+																const b3AlignedObjectArray<b3Float4>& convexVertices,
+																const b3AlignedObjectArray<b3Float4>& uniqueEdges,
+																const b3AlignedObjectArray<int>& convexIndices,
+																const b3AlignedObjectArray<b3GpuFace>& faces,
+																b3AlignedObjectArray<b3Contact4Data>& globalContactsOut,
+																int& nGlobalContactsOut, // in/out contact counter, forwarded to b3ClipHullHullSingle
+																int maxContactCapacity)
+	int contactIndex = -1;
+	b3Float4 posA = rigidBodies[bodyIndexA].m_pos;
+	b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat;
+	b3Float4 posB = rigidBodies[bodyIndexB].m_pos;
+	b3Quaternion ornB = rigidBodies[bodyIndexB].m_quat;
+	b3ConvexPolyhedronData hullA, hullB;
+	b3Float4 sepNormalWorldSpace; // filled by b3FindSeparatingAxis below
+    b3Collidable colA = collidables[collidableIndexA];
+    hullA = convexShapes[colA.m_shapeIndex];
+    //printf("numvertsA = %d\n",hullA.m_numVertices);
+    b3Collidable colB = collidables[collidableIndexB];
+    hullB = convexShapes[colB.m_shapeIndex];
+    //printf("numvertsB = %d\n",hullB.m_numVertices);
+//	b3Float4 contactsOut[B3_MAX_VERTS];
+	int contactCapacity = B3_MAX_VERTS; // not read in the visible code
+	int numContactsOut=0; // not read in the visible code
+#ifdef _WIN32
+	b3Assert(_finite(rigidBodies[bodyIndexA].m_pos.x));
+	b3Assert(_finite(rigidBodies[bodyIndexB].m_pos.x));
+		bool foundSepAxis = b3FindSeparatingAxis(hullA,hullB,
+							posA,
+							ornA,
+							posB,
+							ornB,
+							convexVertices,uniqueEdges,faces,convexIndices, // both hulls read from the same shared arrays, so they are passed twice
+							convexVertices,uniqueEdges,faces,convexIndices,
+							sepNormalWorldSpace
+							);
+	if (foundSepAxis)
+	{
+		contactIndex = b3ClipHullHullSingle(
+			bodyIndexA, bodyIndexB,
+						   posA,ornA,
+						   posB,ornB,
+			collidableIndexA, collidableIndexB,
+			&rigidBodies, 
+			&globalContactsOut,
+			nGlobalContactsOut,
+			convexShapes,
+			convexShapes,
+			convexVertices, 
+			uniqueEdges, 
+			faces,
+			convexIndices,
+			convexVertices,
+			uniqueEdges,
+			faces,
+			convexIndices,
+			collidables,
+			collidables,
+			sepNormalWorldSpace,
+			maxContactCapacity);
+	}
+	return contactIndex;
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ContactSphereSphere.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ContactSphereSphere.h
new file mode 100644
index 00000000..a3fa8228
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ContactSphereSphere.h
@@ -0,0 +1,162 @@
+void	computeContactSphereConvex(int pairIndex, // tests the sphere of body A against the convex shape of body B and appends at most one contact to globalContactsOut
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																const b3RigidBodyData* rigidBodies, 
+																const b3Collidable* collidables,
+																const b3ConvexPolyhedronData* convexShapes,
+																const b3Vector3* convexVertices,
+																const int* convexIndices,
+																const b3GpuFace* faces,
+																b3Contact4* globalContactsOut,
+																int& nGlobalContactsOut, // in/out: incremented when a contact is appended
+																int maxContactCapacity)
+	float radius = collidables[collidableIndexA].m_radius;
+	float4 spherePos1 = rigidBodies[bodyIndexA].m_pos;
+	b3Quaternion sphereOrn = rigidBodies[bodyIndexA].m_quat; // not read in the visible code
+	float4 pos = rigidBodies[bodyIndexB].m_pos;
+	b3Quaternion quat = rigidBodies[bodyIndexB].m_quat;
+	b3Transform tr; // transform built from body B's position and orientation
+	tr.setIdentity();
+	tr.setOrigin(pos);
+	tr.setRotation(quat);
+	b3Transform trInv = tr.inverse();
+	float4 spherePos = trInv(spherePos1); // sphere centre after applying the inverse of body B's transform
+	int collidableIndex = rigidBodies[bodyIndexB].m_collidableIdx; // looked up from the body; the collidableIndexB parameter is not used in the visible code
+	int shapeIndex = collidables[collidableIndex].m_shapeIndex;
+	int numFaces = convexShapes[shapeIndex].m_numFaces;
+	float4 closestPnt = b3MakeVector3(0, 0, 0, 0);
+	float4 hitNormalWorld = b3MakeVector3(0, 0, 0, 0); // not read in the visible code
+	float minDist = -1000000.f; // TODO: What is the largest/smallest float?
+	bool bCollide = true;
+	int region = -1; // records which branch produced the current best candidate (1, 2 or 3); only referenced by commented-out printf
+	float4 localHitNormal;
+	for ( int f = 0; f < numFaces; f++ )
+	{
+		b3GpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];
+		float4 planeEqn;
+		float4 localPlaneNormal = b3MakeVector3(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+		float4 n1 = localPlaneNormal;//quatRotate(quat,localPlaneNormal);
+		planeEqn = n1;
+		planeEqn[3] = face.m_plane.w;
+		float4 pntReturn;
+		float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn); // NOTE(review): pntReturn is presumably the projection of spherePos onto the plane - confirm in the helper
+		if ( dist > radius)
+		{
+			bCollide = false; // the centre is farther than one radius in front of this face: give up on the pair
+			break;
+		}
+		if ( dist > 0 )
+		{
+			//might hit an edge or vertex
+			b3Vector3 out;
+			bool isInPoly = IsPointInPolygon(spherePos,
+					&face,
+					&convexVertices[convexShapes[shapeIndex].m_vertexOffset],
+					convexIndices,
+                    &out); // NOTE(review): out is presumably the closest point on the polygon when the test fails - confirm in the helper
+			if (isInPoly)
+			{
+				if (dist>minDist)
+				{
+					minDist = dist;
+					closestPnt = pntReturn;
+					localHitNormal = planeEqn;
+					region=1;
+				}
+			} else
+			{
+				b3Vector3 tmp = spherePos-out;
+				b3Scalar l2 = tmp.length2();
+				if (l2<radius*radius)
+				{
+					dist  = b3Sqrt(l2);
+					if (dist>minDist)
+					{
+						minDist = dist;
+						closestPnt = out;
+						localHitNormal = tmp/dist; // unit vector from out towards the sphere centre
+						region=2;
+					}
+				} else
+				{
+					bCollide = false;
+					break;
+				}
+			}
+		}
+		else
+		{
+			if ( dist > minDist )
+			{
+				minDist = dist; // keep the largest signed distance seen so far
+				closestPnt = pntReturn;
+				localHitNormal = planeEqn;
+				region=3;
+			}
+		}
+	}
+	static int numChecks = 0; // call counter; not read in the visible code
+	numChecks++;
+	if (bCollide && minDist > -10000) // minDist starts at -1000000, so this also rejects the case where no face updated it
+	{
+		float4 normalOnSurfaceB1 = tr.getBasis()*localHitNormal;//-hitNormalWorld;
+		float4 pOnB1 = tr(closestPnt);
+		//printf("dist ,%f,",minDist);
+		float actualDepth = minDist-radius; // negative when the sphere penetrates
+		if (actualDepth<0)
+		{
+		//printf("actualDepth = ,%f,", actualDepth);
+		//printf("normalOnSurfaceB1 = ,%f,%f,%f,", normalOnSurfaceB1.x,normalOnSurfaceB1.y,normalOnSurfaceB1.z);
+		//printf("region=,%d,\n", region);
+		pOnB1[3] = actualDepth; // the depth is stored in the contact point's fourth component
+		int dstIdx;
+//    dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx );
+		if (nGlobalContactsOut < maxContactCapacity)
+		{
+			dstIdx=nGlobalContactsOut;
+			nGlobalContactsOut++;
+			b3Contact4* c = &globalContactsOut[dstIdx];
+			c->m_worldNormalOnB = normalOnSurfaceB1;
+			c->setFrictionCoeff(0.7);
+			c->setRestituitionCoeff(0.f);
+			c->m_batchIdx = pairIndex;
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; // index is negated when m_invMass is 0
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+			c->m_worldPosB[0] = pOnB1;
+			int numPoints = 1;
+			c->m_worldNormalOnB.w = (b3Scalar)numPoints; // the point count travels in the normal's w
+		}//if (dstIdx < numPairs)
+		}
+	}//if (hasCollision)
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h
new file mode 100644
index 00000000..5c5f4e29
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h
@@ -0,0 +1,40 @@
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Common/shared/b3Quat.h"
+typedef struct b3GpuFace b3GpuFace_t; // alias so the record can be named without the struct keyword
+struct b3GpuFace // one polygonal face of a convex hull
+	b3Float4 m_plane; // xyz is read as the face normal and w as the plane constant by the code in this patch
+	int m_indexOffset; // position of the face's first entry in the shared index array
+	int m_numIndices; // number of index entries, i.e. vertices, of the face
+	int m_unusedPadding1;
+	int m_unusedPadding2;
+typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t; // alias so the record can be named without the struct keyword
+struct b3ConvexPolyhedronData // convex hull descriptor: offsets and counts into shared face, vertex and unique-edge arrays
+	b3Float4		m_localCenter;
+	b3Float4		m_extents;
+	b3Float4		mC; // NOTE(review): meaning is not visible in this excerpt - confirm against the code that fills it
+	b3Float4		mE; // NOTE(review): meaning is not visible in this excerpt - confirm against the code that fills it
+	float			m_radius;
+	int	m_faceOffset; // index of the hull's first face in the shared face array
+	int m_numFaces;
+	int	m_numVertices;
+	int m_vertexOffset; // index of the hull's first vertex in the shared vertex array
+	int	m_uniqueEdgesOffset; // index of the hull's first entry in the shared unique-edge array
+	int	m_numUniqueEdges;
+	int m_unused;
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h
new file mode 100644
index 00000000..5f301d54
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h
@@ -0,0 +1,832 @@
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+inline void b3Project(__global const b3ConvexPolyhedronData* hull,  b3Float4ConstArg pos, b3QuatConstArg orn, // projects the hull's vertices onto *dir and returns the covered interval in min[0] and max[0]
+const b3Float4* dir, __global const b3Float4* vertices, float* min, float* max)
+	min[0] = FLT_MAX;
+	max[0] = -FLT_MAX;
+	int numVerts = hull->m_numVertices;
+	const b3Float4 localDir = b3QuatRotate(b3QuatInverse(orn),*dir); // rotate the direction by the inverse orientation instead of rotating every vertex
+	float offset = b3Dot(pos,*dir); // contribution of the hull's position, added to both bounds at the end
+	for(int i=0;i<numVerts;i++)
+	{
+		float dp = b3Dot(vertices[hull->m_vertexOffset+i],localDir);
+		if(dp < min[0])	
+			min[0] = dp;
+		if(dp > max[0])	
+			max[0] = dp;
+	}
+	if(min[0]>max[0]) // true when the loop never ran (numVerts <= 0): the bounds are still FLT_MAX and -FLT_MAX
+	{
+		float tmp = min[0];
+		min[0] = max[0];
+		max[0] = tmp;
+	}
+	min[0] += offset;
+	max[0] += offset;
+inline bool b3TestSepAxis(const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB, // returns false when the two hulls' projections onto *sep_axis do not overlap; otherwise stores the overlap in *depth and returns true
+	b3Float4ConstArg posA,b3QuatConstArg ornA,
+	b3Float4ConstArg posB,b3QuatConstArg ornB,
+	b3Float4* sep_axis, const b3Float4* verticesA, __global const b3Float4* verticesB,float* depth)
+	float Min0,Max0; // projection interval of hull A
+	float Min1,Max1; // projection interval of hull B
+	b3Project(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);
+	b3Project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);
+	if(Max0<Min1 || Max1<Min0)
+		return false; // *depth is left untouched on this path
+	float d0 = Max0 - Min1;
+	float d1 = Max1 - Min0;
+	*depth = d0<d1 ? d0:d1; // the smaller of the two overlaps
+	return true;
+bool b3FindSeparatingAxis(	const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB, // tests the face normals of hull A as candidate axes; returns false as soon as one separates the hulls, else true with *sep and *dmin updated
+	b3Float4ConstArg posA1,
+	b3QuatConstArg ornA,
+	b3Float4ConstArg posB1,
+	b3QuatConstArg ornB,
+	b3Float4ConstArg DeltaC2, // direction used only to orient candidate axes and the final *sep
+	const b3Float4* verticesA, 
+	const b3Float4* uniqueEdgesA, 
+	const b3GpuFace* facesA,
+	const int*  indicesA,
+	__global const b3Float4* verticesB, 
+	__global const b3Float4* uniqueEdgesB, 
+	__global const b3GpuFace* facesB,
+	__global const int*  indicesB,
+	b3Float4* sep, // in/out: replaced when an axis with a smaller overlap than *dmin is found
+	float* dmin) // in/out: smallest overlap so far; the caller must initialise it
+	b3Float4 posA = posA1;
+	posA.w = 0.f; // clear w on the local copies of the positions
+	b3Float4 posB = posB1;
+	posB.w = 0.f;
+	static int maxFaceVertex = 0; // largest face*vertex product seen so far; a new maximum is printed with printf
+	int curFaceVertexAB = hullA->m_numFaces*hullB->m_numVertices;
+	curFaceVertexAB+= hullB->m_numFaces*hullA->m_numVertices;
+	if (curFaceVertexAB>maxFaceVertex)
+	{
+		maxFaceVertex = curFaceVertexAB;
+		printf("curFaceVertexAB = %d\n",curFaceVertexAB);
+		printf("hullA->m_numFaces = %d\n",hullA->m_numFaces);
+		printf("hullA->m_numVertices = %d\n",hullA->m_numVertices);
+		printf("hullB->m_numVertices = %d\n",hullB->m_numVertices);
+	}
+	int curPlaneTests=0; // counts tested axes; not read in the visible code
+	{
+		int numFacesA = hullA->m_numFaces;
+		// Test normals from hullA
+		for(int i=0;i<numFacesA;i++)
+		{
+			const b3Float4 normal = facesA[hullA->m_faceOffset+i].m_plane;
+			b3Float4 faceANormalWS = b3QuatRotate(ornA,normal);
+			if (b3Dot(DeltaC2,faceANormalWS)<0)
+				faceANormalWS*=-1.f; // flip so the axis does not point against DeltaC2
+			curPlaneTests++;
+			float d;
+			if(!b3TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))
+				return false;
+			if(d<*dmin)
+			{
+				*dmin = d;
+				*sep = faceANormalWS;
+			}
+		}
+	}
+	if((b3Dot(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep); // final orientation: dot(DeltaC2, *sep) ends up >= 0
+	}
+	return true;
+b3Vector3 unitSphere162[]= // direction table scanned by b3FindSeparatingAxisEdgeEdge when searchAllEdgeEdge is false; only one entry is visible in this excerpt
+	b3MakeVector3(0.000000,-1.000000,0.000000),
+bool b3FindSeparatingAxisEdgeEdge(	const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB, // tests edge-cross-edge axes (or the unitSphere162 table) and returns false as soon as one separates the hulls, else true with *sep and *dmin updated
+	b3Float4ConstArg posA1,
+	b3QuatConstArg ornA,
+	b3Float4ConstArg posB1,
+	b3QuatConstArg ornB,
+	b3Float4ConstArg DeltaC2, // direction used only to orient candidate axes and the final *sep
+	const b3Float4* verticesA, 
+	const b3Float4* uniqueEdgesA, 
+	const b3GpuFace* facesA,
+	const int*  indicesA,
+	__global const b3Float4* verticesB, 
+	__global const b3Float4* uniqueEdgesB, 
+	__global const b3GpuFace* facesB,
+	__global const int*  indicesB,
+		b3Float4* sep, // in/out: replaced when an axis with a smaller overlap than *dmin is found
+	float* dmin, // in/out: smallest overlap so far; the caller must initialise it
+	bool searchAllEdgeEdge) // true: test every pair of unique edges; false: test the fixed directions in unitSphere162
+	b3Float4 posA = posA1;
+	posA.w = 0.f;
+	b3Float4 posB = posB1;
+	posB.w = 0.f;
+	int curPlaneTests=0; // not read in the visible code
+	int curEdgeEdge = 0; // counts edge pairs; not read in the visible code
+	// Test edges
+	static int maxEdgeTests = 0; // largest edge-pair count seen so far; a new maximum is printed with printf
+	int curEdgeTests = hullA->m_numUniqueEdges * hullB->m_numUniqueEdges;
+	if (curEdgeTests >maxEdgeTests )
+	{
+		maxEdgeTests  = curEdgeTests ;
+		printf("maxEdgeTests = %d\n",maxEdgeTests );
+		printf("hullA->m_numUniqueEdges = %d\n",hullA->m_numUniqueEdges);
+		printf("hullB->m_numUniqueEdges = %d\n",hullB->m_numUniqueEdges);
+	}
+	if (searchAllEdgeEdge)
+	{
+		for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)
+		{
+			const b3Float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];
+			b3Float4 edge0World = b3QuatRotate(ornA,edge0);
+			for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)
+			{
+				const b3Float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];
+				b3Float4 edge1World = b3QuatRotate(ornB,edge1);
+				b3Float4 crossje = b3Cross(edge0World,edge1World); // candidate axis perpendicular to both edges
+				curEdgeEdge++;
+				if(!b3IsAlmostZero(crossje)) // skip edge pairs whose cross product is almost zero
+				{
+					crossje = b3Normalized(crossje);
+					if (b3Dot(DeltaC2,crossje)<0)
+						crossje *= -1.f;
+					float dist;
+					bool result = true; // not read in the visible code
+					{
+						float Min0,Max0;
+						float Min1,Max1;
+						b3Project(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);
+						b3Project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);
+						if(Max0<Min1 || Max1<Min0)
+							return false; // the projections do not overlap: this axis separates the hulls
+						float d0 = Max0 - Min1;
+						float d1 = Max1 - Min0;
+						dist = d0<d1 ? d0:d1;
+						result = true;
+					}
+					if(dist<*dmin)
+					{
+						*dmin = dist;
+						*sep = crossje;
+					}
+				}
+			}
+		}
+	} else
+	{
+		int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3);
+		//printf("numDirections =%d\n",numDirections );
+		for(int i=0;i<numDirections;i++)
+		{
+			b3Float4 crossje = unitSphere162[i]; // table direction used as is, without the DeltaC2 flip applied in the other branch
+			{
+				//if (b3Dot(DeltaC2,crossje)>0)
+				{
+					float dist;
+					bool result = true; // not read in the visible code
+					{
+						float Min0,Max0;
+						float Min1,Max1;
+						b3Project(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);
+						b3Project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);
+						if(Max0<Min1 || Max1<Min0)
+							return false;
+						float d0 = Max0 - Min1;
+						float d1 = Max1 - Min0;
+						dist = d0<d1 ? d0:d1;
+						result = true;
+					}
+					if(dist<*dmin)
+					{
+						*dmin = dist;
+						*sep = crossje;
+					}
+				}
+			}
+		}
+	}
+	if((b3Dot(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep); // final orientation: dot(DeltaC2, *sep) ends up >= 0
+	}
+	return true;
+inline int	b3FindClippingFaces(b3Float4ConstArg separatingNormal, // selects one face on each hull for a pair, writes their world-space vertices into the pair's slots and records the choice in clippingFaces[pairIndex]
+                      __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,
+                      b3Float4ConstArg posA, b3QuatConstArg ornA,b3Float4ConstArg posB, b3QuatConstArg ornB,
+                       __global b3Float4* worldVertsA1,
+                      __global b3Float4* worldNormalsA1,
+                      __global b3Float4* worldVertsB1,
+                      int capacityWorldVerts, // stride between consecutive pairs' slots in worldVertsA1 and worldVertsB1
+                      const float minDist, float maxDist,
+                      __global const b3Float4* verticesA,
+                      __global const b3GpuFace_t* facesA,
+                      __global const int* indicesA,
+						__global const b3Float4* verticesB,
+                      __global const b3GpuFace_t* facesB,
+                      __global const int* indicesB,
+                      __global b3Int4* clippingFaces, int pairIndex)
+	int numContactsOut = 0; // never modified below, so the function returns 0 in the visible code
+	int numWorldVertsB1= 0;
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+	{
+		for(int face=0;face<hullB->m_numFaces;face++) // pick the face of B whose world normal has the largest dot product with separatingNormal
+		{
+			const b3Float4 Normal = b3MakeFloat4(facesB[hullB->m_faceOffset+face].m_plane.x,
+                                              facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);
+			const b3Float4 WorldNormal = b3QuatRotate(ornB, Normal);
+			float d = b3Dot(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+	{
+		const b3GpuFace_t polyB = facesB[hullB->m_faceOffset+closestFaceB]; // NOTE(review): closestFaceB is still -1 here when hull B has no faces - no guard is visible
+		const int numVertices = polyB.m_numIndices;
+		for(int e0=0;e0<numVertices;e0++) // NOTE(review): numVertices is not checked against capacityWorldVerts
+		{
+			const b3Float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+			worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = b3TransformPoint(b,posB,ornB);
+		}
+	}
+    int closestFaceA=-1;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++) // pick the face of A whose world normal has the smallest dot product with separatingNormal
+		{
+			const b3Float4 Normal = b3MakeFloat4(
+                                              facesA[hullA->m_faceOffset+face].m_plane.x,
+                                              facesA[hullA->m_faceOffset+face].m_plane.y,
+                                              facesA[hullA->m_faceOffset+face].m_plane.z,
+                                              0.f);
+			const b3Float4 faceANormalWS = b3QuatRotate(ornA,Normal);
+			float d = b3Dot(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+                worldNormalsA1[pairIndex] = faceANormalWS; // ends up holding the normal of the finally selected face
+			}
+		}
+	}
+    int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices; // NOTE(review): closestFaceA is still -1 here when hull A has no faces - no guard is visible
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+        const b3Float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];
+        worldVertsA1[pairIndex*capacityWorldVerts+e0] = b3TransformPoint(a, posA,ornA);
+    }
+    clippingFaces[pairIndex].x = closestFaceA; // x,y: selected face indices of A and B
+    clippingFaces[pairIndex].y = closestFaceB;
+    clippingFaces[pairIndex].z = numVerticesA; // z,w: vertex counts written for A and B
+    clippingFaces[pairIndex].w = numWorldVertsB1;
+	return numContactsOut;
+__kernel void   b3FindConcaveSeparatingAxisKernel( __global b3Int4* concavePairs, // Separating-axis test for ONE concave pair: a triangle of shape A against the convex shape (or compound child) of body B. concavePairs[i]: x/y = body indices, z = face index in shape A, w = compound child index (set to -1 here when the pair is inactive).
+																					__global const b3RigidBodyData* rigidBodies,
+																					__global const b3Collidable* collidables,
+																					__global const b3ConvexPolyhedronData* convexShapes, 
+																					__global const b3Float4* vertices,
+																					__global const b3Float4* uniqueEdges,
+																					__global const b3GpuFace* faces,
+																					__global const int* indices,
+																					__global const b3GpuChildShape* gpuChildShapes,
+																					__global b3Aabb* aabbs,
+																					__global b3Float4* concaveSeparatingNormalsOut, // out: separating axis per pair, with dmin stored in .w
+																					__global b3Int4* clippingFacesOut,
+																					__global b3Vector3* worldVertsA1Out,
+																					__global b3Vector3* worldNormalsA1Out,
+																					__global b3Vector3* worldVertsB1Out,
+																					__global int* hasSeparatingNormals, // out: 1 when all three axis tests below returned true, else 0
+																					int vertexFaceCapacity,
+																					int numConcavePairs, // only referenced by the commented-out get_global_id path
+																					int pairIdx // index of the pair processed by this call
+																					)
+	int i = pairIdx; // the pair index comes from the argument; the get_global_id variant below is commented out
+/*	int i = get_global_id(0);
+	if (i>=numConcavePairs)
+		return;
+	int pairIdx = i;
+	*/
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&& // body B must be a convex hull or a compound of convex hulls
+		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)
+	{
+		concavePairs[pairIdx].w = -1; // unsupported shape type: mark the pair inactive and stop
+		return;
+	}
+	hasSeparatingNormals[i] = 0;
+	int numFacesA = convexShapes[shapeIndexA].m_numFaces; // not read again in this function
+	int numActualConcaveConvexTests = 0;
+	int f = concavePairs[i].z; // face (triangle) index inside shape A
+	bool overlap = false;
+	b3ConvexPolyhedronData convexPolyhedronA; // temporary polyhedron describing the single triangle
+	//add 3 vertices of the triangle
+	convexPolyhedronA.m_numVertices = 3;
+	convexPolyhedronA.m_vertexOffset = 0;
+	b3Float4	localCenter = b3MakeFloat4(0.f,0.f,0.f,0.f);
+	b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+	b3Aabb triAabb;
+	triAabb.m_minVec = b3MakeFloat4(1e30f,1e30f,1e30f,0.f);
+	triAabb.m_maxVec = b3MakeFloat4(-1e30f,-1e30f,-1e30f,0.f);
+	b3Float4 verticesA[3];
+	for (int i=0;i<3;i++) // note: this loop index shadows the outer i
+	{
+		int index = indices[face.m_indexOffset+i];
+		b3Float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+		verticesA[i] = vert;
+		localCenter += vert; // accumulated here, divided by 3 further down
+		triAabb.m_minVec = b3MinFloat4(triAabb.m_minVec,vert);		
+		triAabb.m_maxVec = b3MaxFloat4(triAabb.m_maxVec,vert);		
+	}
+	overlap = true; // cleared below if the boxes are disjoint on any axis. NOTE(review): triAabb is built from the raw shape-A vertices (posA/ornA not applied) - confirm aabbs[] is expressed in the same space
+	overlap = (triAabb.m_minVec.x > aabbs[bodyIndexB].m_maxVec.x || triAabb.m_maxVec.x < aabbs[bodyIndexB].m_minVec.x) ? false : overlap;
+	overlap = (triAabb.m_minVec.z > aabbs[bodyIndexB].m_maxVec.z || triAabb.m_maxVec.z < aabbs[bodyIndexB].m_minVec.z) ? false : overlap;
+	overlap = (triAabb.m_minVec.y > aabbs[bodyIndexB].m_maxVec.y || triAabb.m_maxVec.y < aabbs[bodyIndexB].m_minVec.y) ? false : overlap;
+	if (overlap)
+	{
+		float dmin = FLT_MAX;
+		int hasSeparatingAxis=5; // placeholder value; overwritten before it is read
+		b3Float4 sepAxis=b3MakeFloat4(1,2,3,4); // placeholder value; filled in through &sepAxis by the axis tests
+		int localCC=0;
+		numActualConcaveConvexTests++;
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		b3Float4 uniqueEdgesA[3];
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+		convexPolyhedronA.m_faceOffset = 0;
+		b3Float4 normal = b3MakeFloat4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+		int indicesA[3+3+2+2+2]; // two 3-index triangle faces plus three 2-index edge faces
+		int curUsedIndices=0;
+		int fidx=0;
+		//front side of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices; // NOTE(review): facesA has no declaration in the visible lines - confirm it is declared in the full file
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back side of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2; // reversed winding for the back face
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			float c = b3Dot(normal,verticesA[0]); // plane constant for the flipped normal, taken from a triangle vertex
+			float c1 = -face.m_plane.w; // computed but never used
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		bool addEdgePlanes = true; // always true here, so the edge faces are always added
+		if (addEdgePlanes)
+		{
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int i=0;i<numVertices;i++) // one 2-index face per triangle edge (i, prevVertex)
+			{
+				b3Float4 v0 = verticesA[i];
+				b3Float4 v1 = verticesA[prevVertex];
+				b3Float4 edgeNormal = b3Normalized(b3Cross(normal,v1-v0)); // perpendicular to both the triangle normal and the edge
+				float c = -b3Dot(edgeNormal,v0);
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=i;
+				indicesA[curUsedIndices++]=prevVertex;
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = B3_TRIANGLE_NUM_CONVEX_FACES;
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f); // average of the three triangle vertices
+		b3Float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		b3Float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat;
+		b3Quaternion ornB =rigidBodies[bodyIndexB].m_quat;
+		///////////////////
+		///compound shape support
+		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			int compoundChild = concavePairs[pairIdx].w; // .w is used directly as an index into gpuChildShapes
+			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			b3Float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			b3Quaternion childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			b3Float4 newPosB = b3TransformPoint(childPosB,posB,ornB); // compose the child transform with body B's transform
+			b3Quaternion newOrnB = b3QuatMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex; // from here on, B refers to the child hull
+		}
+		//////////////////
+		b3Float4 c0local = convexPolyhedronA.m_localCenter;
+		b3Float4 c0 = b3TransformPoint(c0local, posA, ornA);
+		b3Float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		b3Float4 c1 = b3TransformPoint(c1local,posB,ornB);
+		const b3Float4 DeltaC2 = c0 - c1; // vector between the two transformed centers
+		bool sepA = b3FindSeparatingAxis(	&convexPolyhedronA, &convexShapes[shapeIndexB], // first pass: triangle as first hull, B as second
+												posA,ornA,
+												posB,ornB,
+												DeltaC2,
+												verticesA,uniqueEdgesA,facesA,indicesA,
+												vertices,uniqueEdges,faces,indices,
+												&sepAxis,&dmin);
+		hasSeparatingAxis = 4;
+		if (!sepA)
+		{
+			hasSeparatingAxis = 0;
+		} else
+		{
+			bool sepB = b3FindSeparatingAxis(	&convexShapes[shapeIndexB],&convexPolyhedronA, // second pass: roles of A and B swapped
+												posB,ornB,
+												posA,ornA,
+												DeltaC2,
+												vertices,uniqueEdges,faces,indices,
+												verticesA,uniqueEdgesA,facesA,indicesA,
+												&sepAxis,&dmin);
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			} else
+			{
+				bool sepEE = b3FindSeparatingAxisEdgeEdge(	&convexPolyhedronA, &convexShapes[shapeIndexB], // third pass: edge-edge axes
+															posA,ornA,
+															posB,ornB,
+															DeltaC2,
+															verticesA,uniqueEdgesA,facesA,indicesA,
+															vertices,uniqueEdges,faces,indices,
+															&sepAxis,&dmin,true);
+				if (!sepEE)
+				{
+					hasSeparatingAxis = 0;
+				} else
+				{
+					hasSeparatingAxis = 1; // only reached when all three tests returned true
+				}
+			}
+		}	
+		if (hasSeparatingAxis)
+		{
+			hasSeparatingNormals[i]=1;
+			sepAxis.w = dmin; // dmin is packed into the w component of the stored axis
+			concaveSeparatingNormalsOut[pairIdx]=sepAxis;
+			//now compute clipping faces A and B, and world-space clipping vertices A and B...
+			float minDist = -1e30f;
+			float maxDist = 0.02f;
+			b3FindClippingFaces(sepAxis,
+                     &convexPolyhedronA,
+					 &convexShapes[shapeIndexB],
+					 posA,ornA,
+					 posB,ornB,
+                       worldVertsA1Out,
+                      worldNormalsA1Out,
+                      worldVertsB1Out,
+					  vertexFaceCapacity,
+                      minDist, maxDist,
+                      verticesA,
+                      facesA,
+                      indicesA,
+					  vertices,
+                      faces,
+                      indices,
+                      clippingFacesOut, pairIdx);
+		} else
+		{	
+			//mark this pair as in-active
+			concavePairs[pairIdx].w = -1;
+		}
+	}
+	else
+	{	
+		//mark this pair as in-active
+		concavePairs[pairIdx].w = -1;
+	}
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3FindSeparatingAxis.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3FindSeparatingAxis.h
new file mode 100644
index 00000000..d7fde050
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3FindSeparatingAxis.h
@@ -0,0 +1,206 @@
+inline void b3ProjectAxis(const b3ConvexPolyhedronData& hull,  const b3Float4& pos, const b3Quaternion& orn, const b3Float4& dir, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar& min, b3Scalar& max) // Projects the hull's vertices onto dir and writes the resulting interval to min/max.
+	min = FLT_MAX;
+	max = -FLT_MAX;
+	int numVerts = hull.m_numVertices;
+	const b3Float4 localDir = b3QuatRotate(orn.inverse(),dir); // rotate dir by the inverse orientation so the stored vertices can be used untransformed
+	b3Scalar offset = b3Dot3F4(pos,dir); // contribution of the position, added to both bounds at the end
+	for(int i=0;i<numVerts;i++)
+	{
+		//b3Vector3 pt = trans * vertices[m_vertexOffset+i];
+		//b3Scalar dp = pt.dot(dir);
+		b3Scalar dp = b3Dot3F4((b3Float4&)vertices[hull.m_vertexOffset+i],localDir); // the vertex is read in place; no per-iteration copy is needed
+		//b3Assert(dp==dpL);
+		if(dp < min)	min = dp;
+		if(dp > max)	max = dp;
+	}
+	if(min>max) // only possible when no vertex updated the bounds (numVerts <= 0)
+	{
+		b3Scalar tmp = min;
+		min = max;
+		max = tmp;
+	}
+	min += offset;
+	max += offset;
+inline bool b3TestSepAxis(const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, // Projects both hulls on sep_axis; returns false when the intervals are disjoint, otherwise true with the overlap written to depth.
+	const b3Float4& posA,const b3Quaternion& ornA,
+	const b3Float4& posB,const b3Quaternion& ornB,
+	const b3Float4& sep_axis, const b3AlignedObjectArray<b3Vector3>& verticesA,const b3AlignedObjectArray<b3Vector3>& verticesB,b3Scalar& depth)
+	b3Scalar Min0,Max0;
+	b3Scalar Min1,Max1;
+	b3ProjectAxis(hullA,posA,ornA,sep_axis,verticesA, Min0, Max0);
+	b3ProjectAxis(hullB,posB,ornB, sep_axis,verticesB, Min1, Max1);
+	if(Max0<Min1 || Max1<Min0) // intervals do not overlap; depth is left untouched
+		return false;
+	b3Scalar d0 = Max0 - Min1;
+	b3Assert(d0>=0.0f);
+	b3Scalar d1 = Max1 - Min0;
+	b3Assert(d1>=0.0f);
+	depth = d0<d1 ? d0:d1; // the smaller of the two overlap extents
+	return true;
+inline bool b3FindSeparatingAxis(	const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, // Tests face normals of A, face normals of B and pairwise edge cross products; returns false as soon as b3TestSepAxis fails on an axis, otherwise true with the smallest-overlap axis written to sep.
+	const b3Float4& posA1,
+	const b3Quaternion& ornA,
+	const b3Float4& posB1,
+	const b3Quaternion& ornB,
+	const b3AlignedObjectArray<b3Vector3>& verticesA,
+	const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, 
+	const b3AlignedObjectArray<b3GpuFace>& facesA,
+	const b3AlignedObjectArray<int>& indicesA, // not referenced in the visible body
+	const b3AlignedObjectArray<b3Vector3>& verticesB, 
+	const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB, 
+	const b3AlignedObjectArray<b3GpuFace>& facesB,
+	const b3AlignedObjectArray<int>& indicesB, // not referenced in the visible body
+	b3Vector3& sep) // out: axis with the smallest overlap found; only meaningful when true is returned
+	B3_PROFILE("findSeparatingAxis");
+	b3Float4 posA = posA1; // local copies so that w can be zeroed
+	posA.w = 0.f;
+	b3Float4 posB = posB1;
+	posB.w = 0.f;
+	b3Float4 c0local = (b3Float4&)hullA.m_localCenter;
+	b3Float4 c0 = b3TransformPoint(c0local, posA, ornA);
+	b3Float4 c1local = (b3Float4&)hullB.m_localCenter;
+	b3Float4 c1 = b3TransformPoint(c1local,posB,ornB);
+	const b3Float4 deltaC2 = c0 - c1; // vector between the transformed hull centers; used to orient every candidate axis
+	b3Scalar dmin = FLT_MAX;
+	int curPlaneTests=0;
+	int numFacesA = hullA.m_numFaces;
+	// Test normals from hullA
+	for(int i=0;i<numFacesA;i++)
+	{
+		const b3Float4& normal = (b3Float4&)facesA[hullA.m_faceOffset+i].m_plane;
+		b3Float4 faceANormalWS = b3QuatRotate(ornA,normal);
+		if (b3Dot3F4(deltaC2,faceANormalWS)<0) // flip so the axis has a non-negative dot product with deltaC2
+			faceANormalWS*=-1.f;
+		curPlaneTests++;
+		gExpectedNbTests++; // NOTE(review): this line and the two below use names (transA, transB, DeltaC2, g*) not defined in the visible function - presumably inside a preprocessor guard missing from this view; confirm
+		if(gUseInternalObject && !TestInternalObjects(transA,transB, DeltaC2, faceANormalWS, hullA, hullB, dmin))
+			continue;
+		gActualNbTests++;
+		b3Scalar d;
+		if(!b3TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,faceANormalWS, verticesA, verticesB,d))
+			return false;
+		if(d<dmin) // keep the axis with the smallest overlap so far
+		{
+			dmin = d;
+			sep = (b3Vector3&)faceANormalWS;
+		}
+	}
+	int numFacesB = hullB.m_numFaces;
+	// Test normals from hullB
+	for(int i=0;i<numFacesB;i++)
+	{
+		b3Float4 normal = (b3Float4&)facesB[hullB.m_faceOffset+i].m_plane;
+		b3Float4 WorldNormal = b3QuatRotate(ornB, normal);
+		if (b3Dot3F4(deltaC2,WorldNormal)<0)
+		{
+			WorldNormal*=-1.f;
+		}
+		curPlaneTests++;
+		gExpectedNbTests++; // NOTE(review): same undefined-name caveat as in the hullA loop
+		if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, WorldNormal, hullA, hullB, dmin))
+			continue;
+		gActualNbTests++;
+		b3Scalar d;
+		if(!b3TestSepAxis(hullA, hullB,posA,ornA,posB,ornB,WorldNormal,verticesA,verticesB,d))
+			return false;
+		if(d<dmin)
+		{
+			dmin = d;
+			sep = (b3Vector3&)WorldNormal;
+		}
+	}
+//	b3Vector3 edgeAstart,edgeAend,edgeBstart,edgeBend;
+	int curEdgeEdge = 0;
+	// Test edges
+	for(int e0=0;e0<hullA.m_numUniqueEdges;e0++)
+	{
+		const b3Float4& edge0 = (b3Float4&) uniqueEdgesA[hullA.m_uniqueEdgesOffset+e0];
+		b3Float4 edge0World = b3QuatRotate(ornA,(b3Float4&)edge0);
+		for(int e1=0;e1<hullB.m_numUniqueEdges;e1++)
+		{
+			const b3Vector3 edge1 = uniqueEdgesB[hullB.m_uniqueEdgesOffset+e1];
+			b3Float4 edge1World = b3QuatRotate(ornB,(b3Float4&)edge1);
+			b3Float4 crossje = b3Cross3(edge0World,edge1World); // candidate axis perpendicular to both edges
+			curEdgeEdge++;
+			if(!b3IsAlmostZero((b3Vector3&)crossje)) // skip edge pairs whose cross product is almost zero
+			{
+				crossje = b3FastNormalized3(crossje);
+				if (b3Dot3F4(deltaC2,crossje)<0)
+					crossje*=-1.f;
+				gExpectedNbTests++; // NOTE(review): same undefined-name caveat as above; 'Cross' is also not defined in the visible function
+				if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, Cross, hullA, hullB, dmin))
+					continue;
+				gActualNbTests++;
+				b3Scalar dist;
+				if(!b3TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,crossje, verticesA,verticesB,dist))
+					return false;
+				if(dist<dmin)
+				{
+					dmin = dist;
+					sep = (b3Vector3&)crossje;
+				}
+			}
+		}
+	}
+	if((b3Dot3F4(-deltaC2,(b3Float4&)sep))>0.0f) // final orientation: negate sep when it has a positive dot product with -deltaC2
+		sep = -sep;
+	return true;
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h
new file mode 100644
index 00000000..083b0b5e
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h
@@ -0,0 +1,920 @@
+ * ---------------------------------
+ * Copyright (c)2012 Daniel Fiser <danfis@danfis.cz>
+ *
+ *  This file was ported from mpr.c file, part of libccd.
+ *  The Minkowski Portal Refinement implementation was ported 
+ *  to OpenCL by Erwin Coumans for the Bullet 3 Physics library.
+ *  at http://github.com/erwincoumans/bullet3
+ *
+ *  Distributed under the OSI-approved BSD License (the "License");
+ *  see <http://www.opensource.org/licenses/bsd-license.php>.
+ *  This software is distributed WITHOUT ANY WARRANTY; without even the
+ *  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *  See the License for more information.
+ */
+#include "Bullet3Common/shared/b3PlatformDefinitions.h"
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#ifdef __cplusplus
+#define B3_MPR_SQRT sqrtf
+#define B3_MPR_SQRT sqrt
+#define B3_MPR_FMIN(x, y) ((x) < (y) ? (x) : (y))
+#define B3_MPR_FABS fabs
+#define B3_MPR_TOLERANCE 1E-6f
+#define B3_MPR_MAX_ITERATIONS 1000
+struct _b3MprSupport_t // One support sample: v is computed as v1 - v2 by the functions that fill this struct.
+    b3Float4 v;  //!< Support point in minkowski sum
+    b3Float4 v1; //!< Support point in obj1
+    b3Float4 v2; //!< Support point in obj2
+typedef struct _b3MprSupport_t b3MprSupport_t;
+struct _b3MprSimplex_t // Fixed-capacity simplex of up to four support points; the current size is last + 1.
+    b3MprSupport_t ps[4];
+    int last; //!< index of last added point
+typedef struct _b3MprSimplex_t b3MprSimplex_t;
+inline b3MprSupport_t* b3MprSimplexPointW(b3MprSimplex_t *s, int idx) // Writable pointer to simplex point idx; idx is not range-checked.
+    return &s->ps[idx];
+inline void b3MprSimplexSetSize(b3MprSimplex_t *s, int size) // Sets the number of points in use; stored as the index of the last point.
+    s->last = size - 1;
+inline int b3MprSimplexSize(const b3MprSimplex_t *s) // Number of points in use (last + 1).
+    return s->last + 1;
+inline const b3MprSupport_t* b3MprSimplexPoint(const b3MprSimplex_t* s, int idx) // Read-only pointer to simplex point idx.
+    // no bounds check is performed on idx
+    return &s->ps[idx];
+inline void b3MprSupportCopy(b3MprSupport_t *d, const b3MprSupport_t *s) // Copies the whole support record *s into *d.
+    *d = *s;
+inline void b3MprSimplexSet(b3MprSimplex_t *s, size_t pos, const b3MprSupport_t *a) // Overwrites simplex point pos with *a; the stored size is not changed.
+    b3MprSupportCopy(s->ps + pos, a);
+inline void b3MprSimplexSwap(b3MprSimplex_t *s, size_t pos1, size_t pos2) // Exchanges simplex points pos1 and pos2 through a temporary copy.
+    b3MprSupport_t supp;
+    b3MprSupportCopy(&supp, &s->ps[pos1]);
+    b3MprSupportCopy(&s->ps[pos1], &s->ps[pos2]);
+    b3MprSupportCopy(&s->ps[pos2], &supp);
+inline int b3MprIsZero(float val) // Non-zero when |val| is below FLT_EPSILON.
+    return B3_MPR_FABS(val) < FLT_EPSILON;
+inline int b3MprEq(float _a, float _b) // Approximate equality: absolute difference below FLT_EPSILON, or below FLT_EPSILON scaled by the larger magnitude.
+    float ab;
+    float a, b;
+    ab = B3_MPR_FABS(_a - _b);
+    if (B3_MPR_FABS(ab) < FLT_EPSILON) // absolute test (ab is already non-negative here)
+        return 1;
+    a = B3_MPR_FABS(_a);
+    b = B3_MPR_FABS(_b);
+    if (b > a){ // relative test against whichever magnitude is larger
+        return ab < FLT_EPSILON * b;
+    }else{
+        return ab < FLT_EPSILON * a;
+    }
+inline int b3MprVec3Eq(const b3Float4* a, const b3Float4 *b) // Component-wise b3MprEq on x, y and z; w is not compared.
+    return b3MprEq((*a).x, (*b).x)
+            && b3MprEq((*a).y, (*b).y)
+            && b3MprEq((*a).z, (*b).z);
+inline b3Float4 b3LocalGetSupportVertex(b3Float4ConstArg supportVec,__global const b3ConvexPolyhedronData_t* hull, 	b3ConstArray(b3Float4) verticesA) // Returns the hull vertex selected by b3MaxDot for supportVec, or a zero vector when the hull has no vertices.
+	b3Float4 supVec = b3MakeFloat4(0,0,0,0); // returned only when the hull is empty
+	float maxDot = -B3_LARGE_FLOAT;
+    if( 0 < hull->m_numVertices )
+    {
+        const b3Float4 scaled = supportVec;
+		int index = b3MaxDot(scaled, &verticesA[hull->m_vertexOffset], hull->m_numVertices, &maxDot); // index is relative to the hull's first vertex
+        return verticesA[hull->m_vertexOffset+index];
+    }
+    return supVec;
+B3_STATIC void b3MprConvexSupport(int pairIndex,int bodyIndex,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, // Writes to *outp the support vertex of body bodyIndex for direction *_dir, transformed by the body's position/orientation. pairIndex and sepAxis are not used in this body.
+													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
+													b3ConstArray(b3Collidable_t)				cpuCollidables,
+													b3ConstArray(b3Float4)					cpuVertices,
+													__global b3Float4* sepAxis,
+														const b3Float4* _dir, b3Float4* outp, int logme)
+	//dir is in worldspace, move to local space
+	b3Float4 pos = cpuBodyBuf[bodyIndex].m_pos;
+	b3Quat orn = cpuBodyBuf[bodyIndex].m_quat;
+	b3Float4 dir = b3MakeFloat4((*_dir).x,(*_dir).y,(*_dir).z,0.f); // w forced to 0 before rotating
+	const b3Float4 localDir = b3QuatRotate(b3QuatInverse(orn),dir);
+	//find local support vertex
+	int colIndex = cpuBodyBuf[bodyIndex].m_collidableIdx;
+	b3Assert(cpuCollidables[colIndex].m_shapeType==SHAPE_CONVEX_HULL);
+	__global const b3ConvexPolyhedronData_t* hull = &cpuConvexData[cpuCollidables[colIndex].m_shapeIndex];
+	b3Float4 pInA = b3MakeFloat4(0,0,0,0); // zero fallback: the logme branch leaves pInA untouched for an empty hull, matching what b3LocalGetSupportVertex returns in that case
+	if (logme)
+	{
+		float maxDot = -B3_LARGE_FLOAT;
+		if( 0 < hull->m_numVertices )
+		{
+			const b3Float4 scaled = localDir;
+			int index = b3MaxDot(scaled, &cpuVertices[hull->m_vertexOffset], hull->m_numVertices, &maxDot);
+			pInA = cpuVertices[hull->m_vertexOffset+index];
+		}
+	} else
+	{
+		pInA = b3LocalGetSupportVertex(localDir,hull,cpuVertices);
+	}
+	//move vertex to world space
+	*outp = b3TransformPoint(pInA,pos,orn);
+inline void b3MprSupport(int pairIndex,int bodyIndexA, int bodyIndexB,   b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, // Fills *supp: v1 = support of body A along *_dir, v2 = support of body B along -*_dir, v = v1 - v2.
+													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
+													b3ConstArray(b3Collidable_t)				cpuCollidables,
+													b3ConstArray(b3Float4)					cpuVertices,
+													__global b3Float4* sepAxis,
+													const b3Float4* _dir, b3MprSupport_t *supp)
+    b3Float4 dir;
+	dir = *_dir;
+	b3MprConvexSupport(pairIndex,bodyIndexA,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v1,0);
+    dir = *_dir*-1.f; // body B is sampled in the opposite direction
+	b3MprConvexSupport(pairIndex,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v2,0);
+    supp->v = supp->v1 - supp->v2;
+inline void b3FindOrigin(int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, b3MprSupport_t *center) // Fills *center from the two body positions: v1 = pos A, v2 = pos B, v = v1 - v2.
+    center->v1 = cpuBodyBuf[bodyIndexA].m_pos;
+	center->v2 = cpuBodyBuf[bodyIndexB].m_pos;
+    center->v = center->v1 - center->v2;
+inline void b3MprVec3Set(b3Float4 *v, float x, float y, float z) // Sets *v to (x, y, z) and zeroes w.
+	(*v).x = x;
+	(*v).y = y;
+	(*v).z = z;
+	(*v).w = 0.f;
+inline void b3MprVec3Add(b3Float4 *v, const b3Float4 *w) // In-place *v += *w on x, y and z only; w is left unchanged.
+    (*v).x += (*w).x;
+    (*v).y += (*w).y;
+    (*v).z += (*w).z;
+inline void b3MprVec3Copy(b3Float4 *v, const b3Float4 *w) // Copies all four components of *w into *v.
+    *v = *w;
+inline void b3MprVec3Scale(b3Float4 *d, float k) // In-place multiplication of *d by the scalar k.
+    *d *= k;
+inline float b3MprVec3Dot(const b3Float4 *a, const b3Float4 *b) // Returns b3Dot3F4(*a, *b).
+    float dot;
+	dot = b3Dot3F4(*a,*b);
+    return dot;
+inline float b3MprVec3Len2(const b3Float4 *v) // Squared length: the dot product of *v with itself.
+    return b3MprVec3Dot(v, v);
+inline void b3MprVec3Normalize(b3Float4 *d) // Scales *d in place by 1/sqrt(squared length). NOTE(review): there is no guard for a zero-length input - confirm callers never pass one.
+    float k = 1.f / B3_MPR_SQRT(b3MprVec3Len2(d));
+    b3MprVec3Scale(d, k);
+inline void b3MprVec3Cross(b3Float4 *d, const b3Float4 *a, const b3Float4 *b) // Writes b3Cross3(*a, *b) to *d.
+	*d = b3Cross3(*a,*b);
+inline void b3MprVec3Sub2(b3Float4 *d, const b3Float4 *v, const b3Float4 *w) // Writes *v - *w to *d.
+	*d = *v - *w;
+inline void b3PortalDir(const b3MprSimplex_t *portal, b3Float4 *dir) // Writes to *dir the normalized cross product (v2 - v1) x (v3 - v1) of portal points 1, 2, 3.
+    b3Float4 v2v1, v3v1;
+    b3MprVec3Sub2(&v2v1, &b3MprSimplexPoint(portal, 2)->v,
+                       &b3MprSimplexPoint(portal, 1)->v);
+    b3MprVec3Sub2(&v3v1, &b3MprSimplexPoint(portal, 3)->v,
+                       &b3MprSimplexPoint(portal, 1)->v);
+    b3MprVec3Cross(dir, &v2v1, &v3v1);
+    b3MprVec3Normalize(dir);
+inline int portalEncapsulesOrigin(const b3MprSimplex_t *portal, // Non-zero when dot(dir, v1) is approximately zero or positive.
+                                       const b3Float4 *dir)
+    float dot;
+    dot = b3MprVec3Dot(dir, &b3MprSimplexPoint(portal, 1)->v);
+    return b3MprIsZero(dot) || dot > 0.f;
+inline int portalReachTolerance(const b3MprSimplex_t *portal, // Non-zero when the new support v4 lies no more than about B3_MPR_TOLERANCE beyond the nearest of v1..v3 along dir.
+                                     const b3MprSupport_t *v4,
+                                     const b3Float4 *dir)
+    float dv1, dv2, dv3, dv4;
+    float dot1, dot2, dot3;
+    // find the smallest dot product of dir and {v4-v1, v4-v2, v4-v3}
+    dv1 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, dir);
+    dv2 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, dir);
+    dv3 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, dir);
+    dv4 = b3MprVec3Dot(&v4->v, dir);
+    dot1 = dv4 - dv1;
+    dot2 = dv4 - dv2;
+    dot3 = dv4 - dv3;
+    dot1 = B3_MPR_FMIN(dot1, dot2); // dot1 is reused to hold the running minimum
+    dot1 = B3_MPR_FMIN(dot1, dot3);
+    return b3MprEq(dot1, B3_MPR_TOLERANCE) || dot1 < B3_MPR_TOLERANCE;
+inline int portalCanEncapsuleOrigin(const b3MprSimplex_t *portal,   // Non-zero when dot(v4, dir) is approximately zero or positive; portal itself is not read.
+                                         const b3MprSupport_t *v4,
+                                         const b3Float4 *dir)
+    float dot;
+    dot = b3MprVec3Dot(&v4->v, dir);
+    return b3MprIsZero(dot) || dot > 0.f;
+inline void b3ExpandPortal(b3MprSimplex_t *portal, // Replaces one of portal points 1..3 with v4; the slot is chosen from the signs of dot(vi, v4 x v0).
+                              const b3MprSupport_t *v4)
+    float dot;
+    b3Float4 v4v0;
+    b3MprVec3Cross(&v4v0, &v4->v, &b3MprSimplexPoint(portal, 0)->v); // v4 x v0, reused for every sign test below
+    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &v4v0);
+    if (dot > 0.f){
+        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &v4v0);
+        if (dot > 0.f){
+            b3MprSimplexSet(portal, 1, v4);
+        }else{
+            b3MprSimplexSet(portal, 3, v4);
+        }
+    }else{
+        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &v4v0);
+        if (dot > 0.f){
+            b3MprSimplexSet(portal, 2, v4);
+        }else{
+            b3MprSimplexSet(portal, 1, v4);
+        }
+    }
+B3_STATIC int b3DiscoverPortal(int pairIndex, int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, // Builds the initial 4-point portal. Returns 0 when a full portal was built, 1 when the origin lies on v1, 2 when it lies on the v0-v1 segment, -1 when a support point shows the origin is outside.
+													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
+													b3ConstArray(b3Collidable_t)				cpuCollidables,
+													b3ConstArray(b3Float4)					cpuVertices,
+													__global b3Float4* sepAxis, // only forwarded to b3MprSupport
+													__global int*	hasSepAxis, // not referenced in this body
+													b3MprSimplex_t *portal)
+    b3Float4 dir, va, vb;
+    float dot;
+    int cont;
+    // vertex 0 is center of portal
+    b3FindOrigin(bodyIndexA,bodyIndexB,cpuBodyBuf, b3MprSimplexPointW(portal, 0));
+    // vertex 0 is center of portal
+    b3MprSimplexSetSize(portal, 1);
+	b3Float4 zero = b3MakeFloat4(0,0,0,0);
+	b3Float4* b3mpr_vec3_origin = &zero; // local stand-in for a global origin constant
+    if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 0)->v, b3mpr_vec3_origin)){
+        // Portal's center lies on origin (0,0,0) => we know that objects
+        // intersect but we would need to know penetration info.
+        // So move center little bit...
+        b3MprVec3Set(&va, FLT_EPSILON * 10.f, 0.f, 0.f);
+        b3MprVec3Add(&b3MprSimplexPointW(portal, 0)->v, &va);
+    }
+    // vertex 1 = support in direction of origin
+    b3MprVec3Copy(&dir, &b3MprSimplexPoint(portal, 0)->v);
+    b3MprVec3Scale(&dir, -1.f);
+    b3MprVec3Normalize(&dir);
+    b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 1));
+    b3MprSimplexSetSize(portal, 2);
+    // test if origin isn't outside of v1
+    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &dir);
+    if (b3MprIsZero(dot) || dot < 0.f)
+        return -1;
+    // vertex 2
+    b3MprVec3Cross(&dir, &b3MprSimplexPoint(portal, 0)->v,
+                       &b3MprSimplexPoint(portal, 1)->v);
+    if (b3MprIsZero(b3MprVec3Len2(&dir))){ // v0 x v1 is (almost) zero: the two-point special cases below apply
+        if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 1)->v, b3mpr_vec3_origin)){
+            // origin lies on v1
+            return 1;
+        }else{
+            // origin lies on v0-v1 segment
+            return 2;
+        }
+    }
+    b3MprVec3Normalize(&dir);
+	 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 2));
+    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &dir);
+    if (b3MprIsZero(dot) || dot < 0.f)
+        return -1;
+    b3MprSimplexSetSize(portal, 3);
+    // vertex 3 direction
+    b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,
+                     &b3MprSimplexPoint(portal, 0)->v);
+    b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,
+                     &b3MprSimplexPoint(portal, 0)->v);
+    b3MprVec3Cross(&dir, &va, &vb);
+    b3MprVec3Normalize(&dir);
+    // it is better to form portal faces to be oriented "outside" origin
+    dot = b3MprVec3Dot(&dir, &b3MprSimplexPoint(portal, 0)->v);
+    if (dot > 0.f){
+        b3MprSimplexSwap(portal, 1, 2);
+        b3MprVec3Scale(&dir, -1.f);
+    }
+    while (b3MprSimplexSize(portal) < 4){ // repeats until a candidate v3 passes both origin-side tests
+		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 3));
+        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &dir);
+        if (b3MprIsZero(dot) || dot < 0.f)
+            return -1;
+        cont = 0; // set to 1 when v3 replaced v1 or v2 and a new v3 must be searched
+        // test if origin is outside (v1, v0, v3) - set v2 as v3 and
+        // continue
+        b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 1)->v,
+                          &b3MprSimplexPoint(portal, 3)->v);
+        dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);
+        if (dot < 0.f && !b3MprIsZero(dot)){
+            b3MprSimplexSet(portal, 2, b3MprSimplexPoint(portal, 3));
+            cont = 1;
+        }
+        if (!cont){
+            // test if origin is outside (v3, v0, v2) - set v1 as v3 and
+            // continue
+            b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 3)->v,
+                              &b3MprSimplexPoint(portal, 2)->v);
+            dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);
+            if (dot < 0.f && !b3MprIsZero(dot)){
+                b3MprSimplexSet(portal, 1, b3MprSimplexPoint(portal, 3));
+                cont = 1;
+            }
+        }
+        if (cont){ // recompute the search direction from the updated v0, v1, v2
+            b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,
+                             &b3MprSimplexPoint(portal, 0)->v);
+            b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,
+                             &b3MprSimplexPoint(portal, 0)->v);
+            b3MprVec3Cross(&dir, &va, &vb);
+            b3MprVec3Normalize(&dir);
+        }else{
+            b3MprSimplexSetSize(portal, 4); // accepting v3 ends the loop
+        }
+    }
+    return 0;
+B3_STATIC int b3RefinePortal(int pairIndex,int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, // Iteratively expands the portal. Returns 0 once portalEncapsulesOrigin succeeds; returns -1 when expansion cannot continue or after B3_MPR_MAX_ITERATIONS iterations.
+													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
+													b3ConstArray(b3Collidable_t)				cpuCollidables,
+													b3ConstArray(b3Float4)					cpuVertices,
+													__global b3Float4* sepAxis,
+													b3MprSimplex_t *portal)
+    b3Float4 dir;
+    b3MprSupport_t v4;
+	for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++) // bounded replacement for the unbounded loop kept in the comment below
+    //while (1)
+	{
+        // compute direction outside the portal (from v0 through v1,v2,v3
+        // face)
+        b3PortalDir(portal, &dir);
+        // test if origin is inside the portal
+        if (portalEncapsulesOrigin(portal, &dir))
+            return 0;
+        // get next support point
+		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);
+        // test if v4 can expand portal to contain origin and if portal
+        // expanding doesn't reach given tolerance
+        if (!portalCanEncapsuleOrigin(portal, &v4, &dir)
+                || portalReachTolerance(portal, &v4, &dir))
+		{
+            return -1;
+        }
+        // v1-v2-v3 triangle must be rearranged to face outside Minkowski
+        // difference (direction from v0).
+        b3ExpandPortal(portal, &v4);
+    }
+    return -1; // iteration budget exhausted
+B3_STATIC void b3FindPos(const b3MprSimplex_t *portal, b3Float4 *pos) // Writes to *pos the midpoint of two weighted averages: one over the portal's v1 points, one over its v2 points, both using the weights b[0..3].
+	b3Float4 zero = b3MakeFloat4(0,0,0,0);
+	b3Float4* b3mpr_vec3_origin = &zero; // local stand-in for a global origin constant
+    b3Float4 dir;
+    size_t i;
+    float b[4], sum, inv;
+    b3Float4 vec, p1, p2;
+    b3PortalDir(portal, &dir);
+    // use barycentric coordinates of tetrahedron to find origin
+    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,
+                       &b3MprSimplexPoint(portal, 2)->v);
+    b[0] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);
+    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,
+                       &b3MprSimplexPoint(portal, 2)->v);
+    b[1] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);
+    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 0)->v,
+                       &b3MprSimplexPoint(portal, 1)->v);
+    b[2] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);
+    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,
+                       &b3MprSimplexPoint(portal, 1)->v);
+    b[3] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);
+	sum = b[0] + b[1] + b[2] + b[3];
+    if (b3MprIsZero(sum) || sum < 0.f){ // fallback: drop v0 (b[0] = 0) and weight v1..v3 using the portal direction instead
+		b[0] = 0.f;
+        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,
+                           &b3MprSimplexPoint(portal, 3)->v);
+        b[1] = b3MprVec3Dot(&vec, &dir);
+        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,
+                           &b3MprSimplexPoint(portal, 1)->v);
+        b[2] = b3MprVec3Dot(&vec, &dir);
+        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,
+                           &b3MprSimplexPoint(portal, 2)->v);
+        b[3] = b3MprVec3Dot(&vec, &dir);
+		sum = b[1] + b[2] + b[3];
+	}
+	inv = 1.f / sum; // NOTE(review): sum is not re-checked after the fallback - confirm it cannot be zero here
+    b3MprVec3Copy(&p1, b3mpr_vec3_origin);
+    b3MprVec3Copy(&p2, b3mpr_vec3_origin);
+    for (i = 0; i < 4; i++){ // accumulate the b[i]-weighted sums of the v1 points (p1) and v2 points (p2)
+        b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v1);
+        b3MprVec3Scale(&vec, b[i]);
+        b3MprVec3Add(&p1, &vec);
+        b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v2);
+        b3MprVec3Scale(&vec, b[i]);
+        b3MprVec3Add(&p2, &vec);
+    }
+    b3MprVec3Scale(&p1, inv);
+    b3MprVec3Scale(&p2, inv);
+    b3MprVec3Copy(pos, &p1);
+    b3MprVec3Add(pos, &p2);
+    b3MprVec3Scale(pos, 0.5); // midpoint of p1 and p2
+inline float b3MprVec3Dist2(const b3Float4 *a, const b3Float4 *b) // Squared distance between points a and b.
+    b3Float4 ab; // scratch for the difference of a and b
+    b3MprVec3Sub2(&ab, a, b);
+    return b3MprVec3Len2(&ab); // squared length of the difference; callers apply B3_MPR_SQRT when they need the plain distance
+inline float _b3MprVec3PointSegmentDist2(const b3Float4 *P, // Squared distance from point P to segment [x0, b].
+                                                  const b3Float4 *x0, // first end point of the segment
+                                                  const b3Float4 *b, // second end point of the segment
+                                                  b3Float4 *witness) // optional out: closest point on the segment (may be NULL)
+    // The computation comes from solving the equation of the segment:
+    //      S(t) = x0 + t.d
+    //          where - x0 is initial point of segment
+    //                - d is direction of segment from x0 (|d| > 0)
+    //                - t belongs to <0, 1> interval
+    // 
+    // Then, the distance from a segment to some point P can be expressed:
+    //      D(t) = |x0 + t.d - P|^2
+    //          which is distance from any point on segment. Minimization
+    //          of this function brings distance from P to segment.
+    // Minimization of D(t) leads to a simple quadratic equation whose
+    // solution is straightforward.
+    //
+    // Bonus of this method is witness point for free.
+    float dist, t;
+    b3Float4 d, a;
+    // direction of segment
+    b3MprVec3Sub2(&d, b, x0);
+    // precompute vector from P to x0
+    b3MprVec3Sub2(&a, x0, P);
+    t  = -1.f * b3MprVec3Dot(&a, &d);
+    t /= b3MprVec3Len2(&d); // NOTE(review): no guard for a zero-length segment (b == x0); the header comment requires |d| > 0
+    if (t < 0.f || b3MprIsZero(t)){ // closest point is the x0 end
+        dist = b3MprVec3Dist2(x0, P);
+        if (witness)
+            b3MprVec3Copy(witness, x0);
+    }else if (t > 1.f || b3MprEq(t, 1.f)){ // closest point is the b end
+        dist = b3MprVec3Dist2(b, P);
+        if (witness)
+            b3MprVec3Copy(witness, b);
+    }else{ // closest point lies strictly inside the segment
+        if (witness){
+            b3MprVec3Copy(witness, &d);
+            b3MprVec3Scale(witness, t);
+            b3MprVec3Add(witness, x0); // witness = x0 + t.d
+            dist = b3MprVec3Dist2(witness, P);
+        }else{
+            // recycling variables
+            b3MprVec3Scale(&d, t);
+            b3MprVec3Add(&d, &a); // d = t.d + (x0 - P), i.e. S(t) - P
+            dist = b3MprVec3Len2(&d);
+        }
+    }
+    return dist; // squared distance
+inline float b3MprVec3PointTriDist2(const b3Float4 *P, // Squared distance from point P to triangle (x0, B, C).
+                                const b3Float4 *x0, const b3Float4 *B,
+                                const b3Float4 *C,
+                                b3Float4 *witness) // optional out: closest point on the triangle (may be NULL)
+    // Computation comes from analytic expression for triangle (x0, B, C)
+    //      T(s, t) = x0 + s.d1 + t.d2, where d1 = B - x0 and d2 = C - x0.
+    // Then equation for distance is:
+    //      D(s, t) = | T(s, t) - P |^2
+    // This leads to minimization of quadratic function of two variables.
+    // The solution is taken only if s is between 0 and 1, t is
+    // between 0 and 1 and t + s < 1, otherwise distance from segment is
+    // computed.
+    b3Float4 d1, d2, a;
+    float u, v, w, p, q, r;
+    float s, t, dist, dist2;
+    b3Float4 witness2; // scratch witness for the edge tests below
+    b3MprVec3Sub2(&d1, B, x0);
+    b3MprVec3Sub2(&d2, C, x0);
+    b3MprVec3Sub2(&a, x0, P);
+    u = b3MprVec3Dot(&a, &a);
+    v = b3MprVec3Dot(&d1, &d1);
+    w = b3MprVec3Dot(&d2, &d2);
+    p = b3MprVec3Dot(&a, &d1);
+    q = b3MprVec3Dot(&a, &d2);
+    r = b3MprVec3Dot(&d1, &d2);
+    s = (q * r - w * p) / (w * v - r * r); // NOTE(review): denominator is not checked; it is zero when d1 and d2 are parallel or zero
+    t = (-s * r - q) / w; // NOTE(review): w is not checked; it is zero when C == x0
+    if ((b3MprIsZero(s) || s > 0.f)
+            && (b3MprEq(s, 1.f) || s < 1.f)
+            && (b3MprIsZero(t) || t > 0.f)
+            && (b3MprEq(t, 1.f) || t < 1.f)
+            && (b3MprEq(t + s, 1.f) || t + s < 1.f)){ // minimiser lies inside the triangle (boundaries included)
+        if (witness){
+            b3MprVec3Scale(&d1, s);
+            b3MprVec3Scale(&d2, t);
+            b3MprVec3Copy(witness, x0);
+            b3MprVec3Add(witness, &d1);
+            b3MprVec3Add(witness, &d2); // witness = x0 + s.d1 + t.d2
+            dist = b3MprVec3Dist2(witness, P);
+        }else{
+            dist  = s * s * v; // expanded |a + s.d1 + t.d2|^2 using the dot products above
+            dist += t * t * w;
+            dist += 2.f * s * t * r;
+            dist += 2.f * s * p;
+            dist += 2.f * t * q;
+            dist += u;
+        }
+    }else{ // otherwise take the minimum over the three edges
+        dist = _b3MprVec3PointSegmentDist2(P, x0, B, witness); // edge x0-B writes straight into the caller's witness (NULL is accepted)
+        dist2 = _b3MprVec3PointSegmentDist2(P, x0, C, &witness2);
+        if (dist2 < dist){
+            dist = dist2;
+            if (witness)
+                b3MprVec3Copy(witness, &witness2);
+        }
+        dist2 = _b3MprVec3PointSegmentDist2(P, B, C, &witness2);
+        if (dist2 < dist){
+            dist = dist2;
+            if (witness)
+                b3MprVec3Copy(witness, &witness2);
+        }
+    }
+    return dist; // squared distance
+B3_STATIC void b3FindPenetr(int pairIndex,int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, // Expands the portal until tolerance or the iteration cap is reached, then writes *depth, *pdir and *pos.
+													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
+													b3ConstArray(b3Collidable_t)				cpuCollidables,
+													b3ConstArray(b3Float4)					cpuVertices,
+													__global b3Float4* sepAxis,
+                       b3MprSimplex_t *portal, // in/out: portal simplex, expanded in place
+                       float *depth, b3Float4 *pdir, b3Float4 *pos) // out: penetration depth, normalized direction, contact position
+    b3Float4 dir;
+    b3MprSupport_t v4;
+    unsigned long iterations; // 1-based pass counter; equals i + 1 inside the loop below
+	b3Float4 zero = b3MakeFloat4(0,0,0,0);
+	b3Float4* b3mpr_vec3_origin = &zero;
+    iterations = 1UL;
+	for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)
+    //while (1)
+	{
+        // compute portal direction and obtain next support point
+        b3PortalDir(portal, &dir);
+		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);
+        // reached tolerance -> find penetration info
+        if (portalReachTolerance(portal, &v4, &dir)
+                || iterations ==B3_MPR_MAX_ITERATIONS) // on the last pass iterations equals the cap, so the outputs are always written before the loop ends
+		{
+            *depth = b3MprVec3PointTriDist2(b3mpr_vec3_origin,&b3MprSimplexPoint(portal, 1)->v,&b3MprSimplexPoint(portal, 2)->v,&b3MprSimplexPoint(portal, 3)->v,pdir); // pdir receives the point of triangle (1,2,3) closest to the origin
+            *depth = B3_MPR_SQRT(*depth); // the helper returns a squared distance
+			if (b3MprIsZero((*pdir).x) && b3MprIsZero((*pdir).y) && b3MprIsZero((*pdir).z))
+			{
+				*pdir = dir; // closest point is (near) the origin: fall back to the portal direction before normalizing
+			} 
+			b3MprVec3Normalize(pdir);
+            // barycentric coordinates:
+            b3FindPos(portal, pos);
+            return;
+        }
+        b3ExpandPortal(portal, &v4);
+        iterations++;
+    }
+B3_STATIC void b3FindPenetrTouch(b3MprSimplex_t *portal,float *depth, b3Float4 *dir, b3Float4 *pos) // Touching contact: writes zero depth, a zero direction and the midpoint of support point 1's v1 and v2.
+    // Touching contact on portal's v1 - so depth is zero and direction
+    // is unimportant and pos can be guessed
+    *depth = 0.f;
+    b3Float4 zero = b3MakeFloat4(0,0,0,0);
+	b3Float4* b3mpr_vec3_origin = &zero;
+	b3MprVec3Copy(dir, b3mpr_vec3_origin); // dir is set to the zero vector (not normalized)
+    b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);
+    b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);
+    b3MprVec3Scale(pos, 0.5); // pos = (v1 + v2) / 2 of simplex point 1
+B3_STATIC void b3FindPenetrSegment(b3MprSimplex_t *portal, // Origin-on-segment case: derives depth, direction and position from simplex point 1 only.
+                              float *depth, b3Float4 *dir, b3Float4 *pos) // all three are outputs
+    // Origin lies on v0-v1 segment.
+    // Depth is distance to v1, direction also and position must be
+    // computed
+    b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);
+    b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);
+    b3MprVec3Scale(pos, 0.5f); // pos = (v1 + v2) / 2 of simplex point 1
+    b3MprVec3Copy(dir, &b3MprSimplexPoint(portal, 1)->v);
+    *depth = B3_MPR_SQRT(b3MprVec3Len2(dir)); // depth = length of simplex point 1's v, taken before normalization
+    b3MprVec3Normalize(dir);
+inline int b3MprPenetration(int pairIndex, int bodyIndexA, int bodyIndexB, // Runs portal discovery, refinement and penetration extraction for one pair; returns 0 when outputs were written, -1 otherwise.
+					b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,
+					b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
+					b3ConstArray(b3Collidable_t)	cpuCollidables,
+					b3ConstArray(b3Float4)	cpuVertices,
+					__global b3Float4* sepAxis, // in/out: sepAxis[pairIndex] is overwritten with -*dirOut in the full-penetration case
+					__global int*	hasSepAxis, // in/out: hasSepAxis[pairIndex] is cleared first and set to 1 only in the full-penetration case
+					float *depthOut, b3Float4* dirOut, b3Float4* posOut) // out: depth, direction and position of the contact
+	 b3MprSimplex_t portal;
+//	if (!hasSepAxis[pairIndex])
+	//	return -1;
+	hasSepAxis[pairIndex] = 0;
+	 int res;
+    // Phase 1: Portal discovery
+    res = b3DiscoverPortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,hasSepAxis, &portal);
+	//sepAxis[pairIndex] = *pdir;//or -dir?
+	switch (res) // 0: portal found, 1: touching contact, 2: origin on segment, anything else: no collision
+	{
+	case 0:
+		{
+			// Phase 2: Portal refinement
+			res = b3RefinePortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal);
+			if (res < 0)
+				return -1; // refinement failed; hasSepAxis[pairIndex] stays 0
+			// Phase 3. Penetration info
+			b3FindPenetr(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal, depthOut, dirOut, posOut);
+			hasSepAxis[pairIndex] = 1;
+			sepAxis[pairIndex] = -*dirOut; // stored axis is the negated penetration direction
+			break;
+		}
+	case 1:
+		{
+			 // Touching contact on portal's v1.
+			b3FindPenetrTouch(&portal, depthOut, dirOut, posOut);
+			break;
+		}
+	case 2:
+		{
+			b3FindPenetrSegment( &portal, depthOut, dirOut, posOut);
+			break;
+		}
+	default:
+		{
+			hasSepAxis[pairIndex]=0;
+			//if (res < 0)
+			//{
+				// Origin isn't inside portal - no collision.
+				return -1;
+			//}
+		}
+	};
+	return 0;
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3NewContactReduction.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3NewContactReduction.h
new file mode 100644
index 00000000..718222eb
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3NewContactReduction.h
@@ -0,0 +1,196 @@
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#define GET_NPOINTS(x) (x).m_worldNormalOnB.w
+int b3ExtractManifoldSequentialGlobal(__global const b3Float4* p, int nPoints, b3Float4ConstArg nearNormal, b3Int4* contactIdx) // Picks up to four point indices from p into contactIdx[0]; returns the number of contacts to keep.
+	if( nPoints == 0 )
+        return 0;
+    if (nPoints <=4)
+        return nPoints; // contactIdx is left untouched; the caller's initial indices apply
+    if (nPoints >64)
+        nPoints = 64; // only the first 64 points are considered
+	b3Float4 center = b3MakeFloat4(0,0,0,0);
+	{
+		for (int i=0;i<nPoints;i++)
+			center += p[i];
+		center /= (float)nPoints; // mean of the considered points
+	}
+	//	sample 4 directions
+    b3Float4 aVector = p[0] - center;
+    b3Float4 u = b3Cross( nearNormal, aVector );
+    b3Float4 v = b3Cross( nearNormal, u );
+    u = b3Normalized( u ); // NOTE(review): no guard for a zero cross product (p[0] at the center or parallel to nearNormal)
+    v = b3Normalized( v );
+    //keep point with deepest penetration
+    float minW= FLT_MAX;
+    int minIndex=-1;
+    b3Float4 maxDots; // despite the name, each lane tracks the smallest dot product seen so far
+    maxDots.x = FLT_MIN; // NOTE(review): FLT_MIN is the smallest positive float, so only dots below ~0 are ever accepted
+    maxDots.y = FLT_MIN;
+    maxDots.z = FLT_MIN;
+    maxDots.w = FLT_MIN;
+    //	idx, distance
+    for(int ie = 0; ie<nPoints; ie++ )
+    {
+        if (p[ie].w<minW)
+        {
+            minW = p[ie].w;
+            minIndex=ie; // index of the smallest w (the deepest point, per the comment above)
+        }
+        float f;
+        b3Float4 r = p[ie]-center;
+        f = b3Dot( u, r );
+        if (f<maxDots.x)
+        {
+            maxDots.x = f;
+            contactIdx[0].x = ie; // minimum along u, i.e. the extreme point towards -u
+        }
+        f = b3Dot( -u, r );
+        if (f<maxDots.y)
+        {
+            maxDots.y = f;
+            contactIdx[0].y = ie; // extreme point towards +u
+        }
+        f = b3Dot( v, r );
+        if (f<maxDots.z)
+        {
+            maxDots.z = f;
+            contactIdx[0].z = ie; // extreme point towards -v
+        }
+        f = b3Dot( -v, r );
+        if (f<maxDots.w)
+        {
+            maxDots.w = f;
+            contactIdx[0].w = ie; // extreme point towards +v
+        }
+    }
+    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)
+    {
+        //replace the first contact with minimum (todo: replace contact with least penetration)
+        contactIdx[0].x = minIndex;
+    }
+    return 4; // more than four input points always reduce to exactly four
+__kernel void   b3NewContactReductionKernel( __global b3Int4* pairs, // Reduces the clipped points of one pair to at most four and appends one b3Contact4Data record.
+                                                   __global const b3RigidBodyData_t* rigidBodies,
+                                                   __global const b3Float4* separatingNormals,
+                                                   __global const int* hasSeparatingAxis,
+                                                   __global struct b3Contact4Data* globalContactsOut,
+                                                   __global b3Int4* clippingFaces, // .w of entry pairIndex is read as the number of clipped points
+                                                   __global b3Float4* worldVertsB2, // points of pair k start at k*vertexFaceCapacity
+                                                   volatile __global int* nGlobalContactsOut, // running count of emitted contacts
+                                                   int vertexFaceCapacity,
+												   int contactCapacity,
+                                                   int numPairs,
+												   int pairIndex // explicit pair index; replaces get_global_id(0) in this version
+                                                   )
+//    int i = get_global_id(0);
+	//int pairIndex = i;
+	int i = pairIndex;
+    b3Int4 contactIdx;
+    contactIdx=b3MakeInt4(0,1,2,3); // default selection, used as-is when there are four or fewer points
+	if (i<numPairs)
+	{
+		if (hasSeparatingAxis[i])
+		{
+			int nPoints = clippingFaces[pairIndex].w;
+            if (nPoints>0)
+            {
+                 __global b3Float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity];
+                b3Float4 normal = -separatingNormals[i];
+                int nReducedContacts = b3ExtractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);
+                int dstIdx;
+                dstIdx = b3AtomicInc( nGlobalContactsOut); // presumably returns the pre-increment value; NOTE(review): the counter stays incremented even when dstIdx >= contactCapacity
+//#if 0
+                b3Assert(dstIdx < contactCapacity);
+				if (dstIdx < contactCapacity)
+				{
+					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+					c->m_worldNormalOnB = -normal; // equals separatingNormals[i] again
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // hard-coded restitution 0 and friction 0.7, scaled by 0xffff
+					c->m_batchIdx = pairIndex;
+					int bodyA = pairs[pairIndex].x;
+					int bodyB = pairs[pairIndex].y;
+					pairs[pairIndex].w = dstIdx; // remember where this pair's contact record was written
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; // a negated index marks a body whose m_invMass is zero
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+                    c->m_childIndexA =-1;
+					c->m_childIndexB =-1;
+                    switch (nReducedContacts) // cases fall through on purpose: count k copies points k-1 .. 0
+                    {
+                        case 4:
+                            c->m_worldPosB[3] = pointsIn[contactIdx.w];
+                        case 3:
+                            c->m_worldPosB[2] = pointsIn[contactIdx.z];
+                        case 2:
+                            c->m_worldPosB[1] = pointsIn[contactIdx.y];
+                        case 1:
+                            c->m_worldPosB[0] = pointsIn[contactIdx.x];
+                        default:
+                        {
+                        }
+                    };
+					GET_NPOINTS(*c) = nReducedContacts; // the point count is stored in m_worldNormalOnB.w (see GET_NPOINTS)
+                 }
+			}//		if (numContactsOut>0)
+		}//		if (hasSeparatingAxis[i])
+	}//	if (i<numPairs)
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h
new file mode 100644
index 00000000..3661e43c
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h
@@ -0,0 +1,90 @@
+#include "Bullet3Common/shared/b3Float4.h"
+#define B3_MAX_NUM_PARTS_IN_BITS 10
+///b3QuantizedBvhNodeData is a compressed aabb node, 16 bytes.
+///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
+typedef struct b3QuantizedBvhNodeData b3QuantizedBvhNodeData_t;
+struct b3QuantizedBvhNodeData
+	//12 bytes
+	unsigned short int	m_quantizedAabbMin[3]; // quantized AABB minimum, one 16-bit value per axis
+	unsigned short int	m_quantizedAabbMax[3]; // quantized AABB maximum, one 16-bit value per axis
+	//4 bytes
+	int	m_escapeIndexOrTriangleIndex; // >= 0: leaf, holds the triangle index; negative: internal node, holds the negated escape index
+inline int	b3GetTriangleIndex(const b3QuantizedBvhNodeData* rootNode) // Returns the low (31 - B3_MAX_NUM_PARTS_IN_BITS) bits of the node's index field.
+	unsigned int x=0;
+	unsigned int y = (~(x&0))<<(31-B3_MAX_NUM_PARTS_IN_BITS); // all-ones shifted left: mask covering the upper bits
+	// Get only the lower bits where the triangle index is stored
+	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));
+inline int b3IsLeaf(const b3QuantizedBvhNodeData* rootNode) // Returns 1 for a leaf node, 0 for an internal node.
+	//skipindex is negative (internal node), triangleindex >=0 (leafnode)
+	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;
+inline int b3GetEscapeIndex(const b3QuantizedBvhNodeData* rootNode) // Returns the negated index field; meaningful for internal nodes, which store a negative value.
+	return -rootNode->m_escapeIndexOrTriangleIndex;
+inline void b3QuantizeWithClamp(unsigned short* out, b3Float4ConstArg point2,int isMax, b3Float4ConstArg bvhAabbMin, b3Float4ConstArg bvhAabbMax, b3Float4ConstArg bvhQuantization) // Clamps point2 into the BVH bounds and writes its three quantized 16-bit coordinates to out[0..2].
+	b3Float4 clampedPoint = b3MaxFloat4(point2,bvhAabbMin);
+	clampedPoint = b3MinFloat4 (clampedPoint, bvhAabbMax);
+	b3Float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization; // offset from the BVH minimum, scaled per axis
+	if (isMax)
+	{
+		out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1)); // maximum: add one, then force the low bit on (odd value)
+		out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1)); // NOTE(review): assumes v + 1 still fits in 16 bits; confirm against how bvhQuantization is built
+		out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));
+	} else
+	{
+		out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe)); // minimum: truncate, then clear the low bit (even value)
+		out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));
+		out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));
+	}
+inline int b3TestQuantizedAabbAgainstQuantizedAabbSlow( // Returns 1 when the two quantized boxes overlap on all three axes (equal bounds count as overlap), else 0.
+								const unsigned short int* aabbMin1, // three values per pointer, indexed 0..2
+								const unsigned short int* aabbMax1,
+								const unsigned short int* aabbMin2,
+								const unsigned short int* aabbMax2)
+	//int overlap = 1;
+	if (aabbMin1[0] > aabbMax2[0]) // early out on the first separated axis
+		return 0;
+	if (aabbMax1[0] < aabbMin2[0])
+		return 0;
+	if (aabbMin1[1] > aabbMax2[1])
+		return 0;
+	if (aabbMax1[1] < aabbMin2[1])
+		return 0;
+	if (aabbMin1[2] > aabbMax2[2])
+		return 0;
+	if (aabbMax1[2] < aabbMin2[2])
+		return 0;
+	return 1;
+	//overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;
+	//overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;
+	//overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;
+	//return overlap;
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ReduceContacts.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ReduceContacts.h
new file mode 100644
index 00000000..35b51970
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3ReduceContacts.h
@@ -0,0 +1,97 @@
+inline int b3ReduceContacts(const b3Float4* p, int nPoints, const b3Float4& nearNormal, b3Int4* contactIdx) // Picks up to four point indices from p into contactIdx[0]; returns the number of contacts to keep.
+	if( nPoints == 0 )
+        return 0;
+    if (nPoints <=4)
+        return nPoints; // contactIdx is left untouched; the caller's initial indices apply
+    if (nPoints >64)
+        nPoints = 64; // only the first 64 points are considered
+	b3Float4 center = b3MakeFloat4(0,0,0,0);
+	{
+		for (int i=0;i<nPoints;i++)
+			center += p[i];
+		center /= (float)nPoints; // mean of the considered points
+	}
+	//	sample 4 directions
+    b3Float4 aVector = p[0] - center;
+    b3Float4 u = b3Cross3( nearNormal, aVector );
+    b3Float4 v = b3Cross3( nearNormal, u );
+    u = b3FastNormalized3( u ); // NOTE(review): no guard for a zero cross product (p[0] at the center or parallel to nearNormal)
+    v = b3FastNormalized3( v );
+    //keep point with deepest penetration
+    float minW= FLT_MAX;
+    int minIndex=-1;
+    b3Float4 maxDots; // despite the name, each lane tracks the smallest dot product seen so far
+    maxDots.x = FLT_MIN; // NOTE(review): FLT_MIN is the smallest positive float, so only dots below ~0 are ever accepted
+    maxDots.y = FLT_MIN;
+    maxDots.z = FLT_MIN;
+    maxDots.w = FLT_MIN;
+    //	idx, distance
+    for(int ie = 0; ie<nPoints; ie++ )
+    {
+        if (p[ie].w<minW)
+        {
+            minW = p[ie].w;
+            minIndex=ie; // index of the smallest w (the deepest point, per the comment above)
+        }
+        float f;
+        b3Float4 r = p[ie]-center;
+        f = b3Dot3F4( u, r );
+        if (f<maxDots.x)
+        {
+            maxDots.x = f;
+            contactIdx[0].x = ie; // minimum along u, i.e. the extreme point towards -u
+        }
+        f = b3Dot3F4( -u, r );
+        if (f<maxDots.y)
+        {
+            maxDots.y = f;
+            contactIdx[0].y = ie; // extreme point towards +u
+        }
+        f = b3Dot3F4( v, r );
+        if (f<maxDots.z)
+        {
+            maxDots.z = f;
+            contactIdx[0].z = ie; // extreme point towards -v
+        }
+        f = b3Dot3F4( -v, r );
+        if (f<maxDots.w)
+        {
+            maxDots.w = f;
+            contactIdx[0].w = ie; // extreme point towards +v
+        }
+    }
+    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)
+    {
+        //replace the first contact with minimum (todo: replace contact with least penetration)
+        contactIdx[0].x = minIndex;
+    }
+    return 4; // more than four input points always reduce to exactly four
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h
new file mode 100644
index 00000000..50632c87
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h
@@ -0,0 +1,34 @@
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Common/shared/b3Quat.h"
+#include "Bullet3Common/shared/b3Mat3x3.h"
+typedef struct b3RigidBodyData b3RigidBodyData_t;
+struct b3RigidBodyData
+	b3Float4				m_pos; // position; passed with m_quat to b3TransformAabb2 when world AABBs are updated
+	b3Quat					m_quat; // orientation quaternion
+	b3Float4				m_linVel; // presumably linear velocity - TODO confirm against the integrator
+	b3Float4				m_angVel; // presumably angular velocity - TODO confirm against the integrator
+	int 					m_collidableIdx; // index into the collidables array
+	float 				m_invMass; // callers in this patch test it against zero to flag the body (negated body index, AABB index flag)
+	float 				m_restituitionCoeff;
+	float 				m_frictionCoeff;
+typedef struct b3InertiaData b3InertiaData_t;
+struct b3InertiaData
+	b3Mat3x3 m_invInertiaWorld; // presumably the inverse inertia tensor in world space - TODO confirm
+	b3Mat3x3 m_initInvInertia; // presumably the initial (local-space) inverse inertia tensor - TODO confirm
\ No newline at end of file
diff --git a/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h
new file mode 100644
index 00000000..8d40d19a
--- /dev/null
+++ b/src/bullet/Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h
@@ -0,0 +1,40 @@
+#ifndef B3_UPDATE_AABBS_H
+#define B3_UPDATE_AABBS_H
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+void b3ComputeWorldAabb(  int bodyId, __global const b3RigidBodyData_t* bodies, __global const  b3Collidable_t* collidables, __global const  b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs) // Recomputes worldAabbs[bodyId] from the body's pose and its local-space AABB; does nothing when the collidable has no shape.
+	__global const b3RigidBodyData_t* body = &bodies[bodyId];
+	b3Float4 position = body->m_pos;
+	b3Quat	orientation = body->m_quat;
+	int collidableIndex = body->m_collidableIdx;
+	int shapeIndex = collidables[collidableIndex].m_shapeIndex;
+	if (shapeIndex>=0) // a negative shape index leaves worldAabbs[bodyId] untouched
+	{
+		b3Aabb_t localAabb = localShapeAABB[collidableIndex];
+		b3Aabb_t worldAabb;
+		b3Float4 aabbAMinOut,aabbAMaxOut;	
+		float margin = 0.f;
+		b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&aabbAMinOut,&aabbAMaxOut);
+		worldAabb.m_minVec =aabbAMinOut;
+		worldAabb.m_minIndices[3] = bodyId; // written after m_minVec: the fourth lane carries the body id
+		worldAabb.m_maxVec = aabbAMaxOut;
+		worldAabb.m_signedMaxIndices[3] = body->m_invMass==0.f? 0 : 1; // 0 when m_invMass is zero, else 1; `body` already points at bodies[bodyId], so it must not be indexed by bodyId again
+		worldAabbs[bodyId] = worldAabb;
+	}
+#endif //B3_UPDATE_AABBS_H
diff --git a/src/bullet/Bullet3Common/b3AlignedAllocator.cpp b/src/bullet/Bullet3Common/b3AlignedAllocator.cpp
new file mode 100644
index 00000000..b98e2b4d
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3AlignedAllocator.cpp
@@ -0,0 +1,181 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3AlignedAllocator.h"
+int b3g_numAlignedAllocs = 0;
+int b3g_numAlignedFree = 0;
+int b3g_totalBytesAlignedAllocs = 0;//detect memory leaks
+static void *b3AllocDefault(size_t size) // Default unaligned allocator: forwards to malloc.
+	return malloc(size);
+static void b3FreeDefault(void *ptr) // Default unaligned deallocator: forwards to free.
+	free(ptr);
+static b3AllocFunc* b3s_allocFunc = b3AllocDefault;
+static b3FreeFunc* b3s_freeFunc = b3FreeDefault;
+#include <malloc.h>
+static void *b3AlignedAllocDefault(size_t size, int alignment) // Variant that forwards to _aligned_malloc; its preprocessor guard is not visible in this chunk.
+	return _aligned_malloc(size, (size_t)alignment);
+static void b3AlignedFreeDefault(void *ptr) // Counterpart of the _aligned_malloc variant: forwards to _aligned_free.
+	_aligned_free(ptr);
+#elif defined(__CELLOS_LV2__)
+#include <stdlib.h>
+static inline void *b3AlignedAllocDefault(size_t size, int alignment) // __CELLOS_LV2__ variant: forwards to memalign (alignment first, then size).
+	return memalign(alignment, size);
+static inline void b3AlignedFreeDefault(void *ptr) // __CELLOS_LV2__ variant: memalign blocks are released with plain free.
+	free(ptr);
+static inline void *b3AlignedAllocDefault(size_t size, int alignment) // Generic variant: over-allocates through b3s_allocFunc and returns an aligned pointer inside the block.
+  void *ret;
+  char *real;
+  real = (char *)b3s_allocFunc(size + sizeof(void *) + (alignment-1)); // payload + one pointer-sized header + worst-case alignment padding
+  if (real) {
+	ret = b3AlignPointer(real + sizeof(void *),alignment);
+    *((void **)(ret)-1) = (void *)(real); // stash the raw pointer just below the aligned address for the matching free
+  } else {
+    ret = (void *)(real); // allocation failed: return NULL
+  }
+  return (ret);
+static inline void b3AlignedFreeDefault(void *ptr) // Generic variant: recovers the raw pointer stored below ptr and frees it; NULL is ignored.
+  void* real;
+  if (ptr) {
+    real = *((void **)(ptr)-1); // raw pointer written by the generic b3AlignedAllocDefault
+    b3s_freeFunc(real);
+  }
+static b3AlignedAllocFunc* b3s_alignedAllocFunc = b3AlignedAllocDefault;
+static b3AlignedFreeFunc* b3s_alignedFreeFunc = b3AlignedFreeDefault;
+void b3AlignedAllocSetCustomAligned(b3AlignedAllocFunc *allocFunc, b3AlignedFreeFunc *freeFunc) // Installs custom aligned alloc/free hooks; a NULL argument restores the corresponding default.
+  b3s_alignedAllocFunc = allocFunc ? allocFunc : b3AlignedAllocDefault;
+  b3s_alignedFreeFunc = freeFunc ? freeFunc : b3AlignedFreeDefault;
+void b3AlignedAllocSetCustom(b3AllocFunc *allocFunc, b3FreeFunc *freeFunc) // Installs custom unaligned alloc/free hooks; a NULL argument restores the corresponding default.
+  b3s_allocFunc = allocFunc ? allocFunc : b3AllocDefault;
+  b3s_freeFunc = freeFunc ? freeFunc : b3FreeDefault;
+//this generic allocator provides the total allocated number of bytes
+#include <stdio.h>
+void*   b3AlignedAllocInternal  (size_t size, int alignment,int line,char* filename) // Instrumented allocator: tracks counts/bytes, logs the call site, returns an aligned block or NULL.
+ void *ret;
+ char *real;
+ b3g_totalBytesAlignedAllocs += size;
+ b3g_numAlignedAllocs++;
+ real = (char *)b3s_allocFunc(size + 2*sizeof(void *) + (alignment-1)); // payload + two pointer-sized header slots + worst-case alignment padding
+ if (real) {
+   ret = (void*) b3AlignPointer(real + 2*sizeof(void *), alignment);
+   *((void **)(ret)-1) = (void *)(real); // header slot 1: raw pointer for b3s_freeFunc
+       *((int*)((void **)(ret)-2)) = (int)size; // header slot 2: size, addressed in pointer-sized steps so it cannot overlap slot 1 when pointers are wider than int
+ } else {
+   ret = (void *)(real);// allocation failed: return NULL
+ }
+ b3Printf("allocation#%d at address %p, from %s,line %d, size %d\n",b3g_numAlignedAllocs,(void*)real, filename,line,(int)size); // %p for the pointer and an int cast for size keep the varargs well-formed
+ int* ptr = (int*)ret;
+ if (ptr && size >= sizeof(int)) *ptr = 12; // debug marker in the first int of the block; skipped for NULL or blocks too small to hold it
+ return (ret);
+void    b3AlignedFreeInternal   (void* ptr,int line,char* filename) // Instrumented free: undoes the byte accounting, logs the call site and releases the raw block; NULL is only logged.
+ void* real;
+ b3g_numAlignedFree++;
+ if (ptr) {
+   real = *((void **)(ptr)-1); // header slot 1 written by the instrumented allocator above
+       int size = *((int*)((void **)(ptr)-2)); // header slot 2, same addressing as on the allocation side
+       b3g_totalBytesAlignedAllocs -= size;
+	   b3Printf("free #%d at address %p, from %s,line %d, size %d\n",b3g_numAlignedFree,real, filename,line,size);
+   b3s_freeFunc(real);
+ } else
+ {
+	 b3Printf("NULL ptr\n");
+ }
+void*	b3AlignedAllocInternal	(size_t size, int alignment) // Non-instrumented entry point: counts the call and forwards to the installed aligned allocator.
+	b3g_numAlignedAllocs++; // incremented before the allocation, so failed allocations are counted too
+	void* ptr;
+	ptr = b3s_alignedAllocFunc(size, alignment);
+//	b3Printf("b3AlignedAllocInternal %d, %x\n",size,ptr);
+	return ptr;
+void	b3AlignedFreeInternal	(void* ptr) // Non-instrumented entry point: ignores NULL, otherwise counts the call and forwards to the installed aligned free.
+	if (!ptr)
+	{
+		return; // NULL frees are not counted
+	}
+	b3g_numAlignedFree++;
+//	b3Printf("b3AlignedFreeInternal %x\n",ptr);
+	b3s_alignedFreeFunc(ptr);
diff --git a/src/bullet/Bullet3Common/b3AlignedAllocator.h b/src/bullet/Bullet3Common/b3AlignedAllocator.h
new file mode 100644
index 00000000..be418bd5
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3AlignedAllocator.h
@@ -0,0 +1,107 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///we probably replace this with our own aligned memory allocator
+///so we replace _aligned_malloc and _aligned_free with our own
+///that is better portable and more predictable
+#include "b3Scalar.h"
+#define b3AlignedAlloc(a,b) \
+		b3AlignedAllocInternal(a,b,__LINE__,__FILE__)
+#define b3AlignedFree(ptr) \
+		b3AlignedFreeInternal(ptr,__LINE__,__FILE__)
+void*	b3AlignedAllocInternal	(size_t size, int alignment,int line,char* filename);
+void	b3AlignedFreeInternal	(void* ptr,int line,char* filename);
+	void*	b3AlignedAllocInternal	(size_t size, int alignment);
+	void	b3AlignedFreeInternal	(void* ptr);
+	#define b3AlignedAlloc(size,alignment) b3AlignedAllocInternal(size,alignment)
+	#define b3AlignedFree(ptr) b3AlignedFreeInternal(ptr)
+typedef int	btSizeType;
+typedef void *(b3AlignedAllocFunc)(size_t size, int alignment);
+typedef void (b3AlignedFreeFunc)(void *memblock);
+typedef void *(b3AllocFunc)(size_t size);
+typedef void (b3FreeFunc)(void *memblock);
+///The developer can let all Bullet memory allocations go through a custom memory allocator, using b3AlignedAllocSetCustom
+void b3AlignedAllocSetCustom(b3AllocFunc *allocFunc, b3FreeFunc *freeFunc);
+///If the developer has already an custom aligned allocator, then b3AlignedAllocSetCustomAligned can be used. The default aligned allocator pre-allocates extra memory using the non-aligned allocator, and instruments it.
+void b3AlignedAllocSetCustomAligned(b3AlignedAllocFunc *allocFunc, b3AlignedFreeFunc *freeFunc);
+///The b3AlignedAllocator is a portable class for aligned memory allocations.
+///Default implementations for unaligned and aligned allocations can be overridden by a custom allocator using b3AlignedAllocSetCustom and b3AlignedAllocSetCustomAligned.
+template < typename T , unsigned Alignment > // T: element type; Alignment: byte alignment passed to b3AlignedAlloc
+class b3AlignedAllocator {
+	typedef b3AlignedAllocator< T , Alignment > self_type;
+	//just going down a list:
+	b3AlignedAllocator() {}
+	/*
+	b3AlignedAllocator( const self_type & ) {}
+	*/
+	template < typename Other >
+	b3AlignedAllocator( const b3AlignedAllocator< Other , Alignment > & ) {} // converting constructor; there is no state to copy
+	typedef const T*         const_pointer;
+	typedef const T&         const_reference;
+	typedef T*               pointer;
+	typedef T&               reference;
+	typedef T                value_type;
+	pointer       address   ( reference        ref ) const                           { return &ref; }
+	const_pointer address   ( const_reference  ref ) const                           { return &ref; }
+	pointer       allocate  ( btSizeType        n   , const_pointer *      hint = 0 ) { // raw storage for n elements; hint is ignored and no constructors run
+		(void)hint;
+		return reinterpret_cast< pointer >(b3AlignedAlloc( sizeof(value_type) * n , Alignment ));
+	}
+	void          construct ( pointer          ptr , const value_type &   value    ) { new (ptr) value_type( value ); } // copy-constructs value in place at ptr
+	void          deallocate( pointer          ptr ) { // releases storage obtained from allocate; no destructors run
+		b3AlignedFree( reinterpret_cast< void * >( ptr ) );
+	}
+	void          destroy   ( pointer          ptr )                                 { ptr->~value_type(); } // runs the destructor only; storage is kept
+	template < typename O > struct rebind {
+		typedef b3AlignedAllocator< O , Alignment > other; // same alignment, different element type
+	};
+	template < typename O >
+	self_type & operator=( const b3AlignedAllocator< O , Alignment > & ) { return *this; } // nothing to assign
+	friend bool operator==( const self_type & , const self_type & ) { return true; } // always equal: the class declares no data members
diff --git a/src/bullet/Bullet3Common/b3AlignedObjectArray.h b/src/bullet/Bullet3Common/b3AlignedObjectArray.h
new file mode 100644
index 00000000..cc510fc4
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3AlignedObjectArray.h
@@ -0,0 +1,517 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_OBJECT_ARRAY__
+#define B3_OBJECT_ARRAY__
+#include "b3Scalar.h" // has definitions like B3_FORCE_INLINE
+#include "b3AlignedAllocator.h"
+///If the platform doesn't support placement new, you can disable B3_USE_PLACEMENT_NEW;
+///the b3AlignedObjectArray then doesn't support objects with virtual methods or non-trivial constructors/destructors.
+///You can enable B3_USE_MEMCPY; swapping elements in the array will then use memcpy instead of operator=.
+///See the discussion here: http://continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1231
+//#define B3_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in <memory.h> or <string.h> or otherwise...
+#define B3_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful
+#ifdef B3_USE_MEMCPY
+#include <memory.h>
+#include <string.h>
+#endif //B3_USE_MEMCPY
+#include <new> //for placement new
+///The b3AlignedObjectArray template class implements a subset of the std::vector interface.
+///It was developed to replace std::vector in order to avoid portability issues, including STL alignment issues when storing SIMD/SSE data.
+template <typename T> 
+//template <class T> 
+class b3AlignedObjectArray
+	b3AlignedAllocator<T , 16>	m_allocator;
+	int					m_size;
+	int					m_capacity;
+	T*					m_data;
+	//PCK: added this line
+	bool				m_ownsMemory;
+	B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T> &other) // copy-assignment: delegates to copyFromArray(other)
+	{
+		copyFromArray(other);
+		return *this; // returning *this allows chained assignment
+	}
+		B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T> &other); // NOTE(review): declaration-only duplicate of the operator defined above; presumably the alternative used when B3_ALLOW_ARRAY_COPY_OPERATOR is not defined -- confirm the guarding #ifdef/#else/#endif lines are intact
+		/// Growth policy: a size of zero grows to one slot, any other size is doubled.
+		B3_FORCE_INLINE	int	allocSize(int size)
+		{
+			if (size == 0)
+			{
+				return 1;
+			}
+			return size*2;
+		}
+		/// Copy-construct elements [start, end) of this array into dest.
+		/// dest must point to raw (unconstructed) storage reaching at least index end-1;
+		/// every element lands at the same index it has in this array.
+		B3_FORCE_INLINE	void	copy(int start,int end, T* dest) const
+		{
+			for (int i=start;i<end;++i)
+			{
+				//placement new only: the former extra "dest[i] = m_data[i]" sat outside the
+				//unbraced loop and therefore touched index `end`, one past the copied range
+				new (&dest[i]) T(m_data[i]);
+			}
+		}
+		/// Put the members into the empty state: no storage, zero size, zero capacity.
+		B3_FORCE_INLINE	void	init()
+		{
+			m_data = 0;
+			m_size = 0;
+			m_capacity = 0;
+			//PCK: storage is flagged as owned by the array
+			m_ownsMemory = true;
+		}
+		/// Run the destructor of every element in [first, last); the storage itself is untouched.
+		B3_FORCE_INLINE	void	destroy(int first,int last)
+		{
+			int idx = first;
+			while (idx < last)
+			{
+				m_data[idx].~T();
+				++idx;
+			}
+		}
+		/// Ask the aligned allocator for storage for `size` elements; a request of zero yields a null pointer.
+		B3_FORCE_INLINE	void* allocate(int size)
+		{
+			if (!size)
+			{
+				return 0;
+			}
+			return m_allocator.allocate(size);
+		}
+		/// Let go of the element storage. The block is handed back to the allocator only when
+		/// m_ownsMemory is set; in either case m_data is reset to null afterwards.
+		B3_FORCE_INLINE	void	deallocate()
+		{
+			if (!m_data)
+			{
+				return;
+			}
+			//PCK: storage that is not owned is never freed here
+			if (m_ownsMemory)
+			{
+				m_allocator.deallocate(m_data);
+			}
+			m_data = 0;
+		}
+	public:
+		/// Construct an empty array.
+		b3AlignedObjectArray() { init(); }
+		/// Tear the array down through clear().
+		~b3AlignedObjectArray() { clear(); }
+		///Generally it is best to avoid using the copy constructor of an b3AlignedObjectArray, and use a (const) reference to the array instead.
+		///Deep copy: storage for otherArray.size() elements is reserved and every element is copy-constructed exactly once into it.
+		b3AlignedObjectArray(const b3AlignedObjectArray& otherArray)
+		{
+			init();
+			const int otherSize = otherArray.size();
+			//reserve raw storage instead of calling resize(): resize() fills the slots with
+			//constructed copies of a default T, which copy() then overwrote via placement new
+			//without ever destroying them
+			reserve(otherSize);
+			otherArray.copy(0, otherSize, m_data);
+			m_size = otherSize;
+		}
+		/// Number of elements currently held (m_size), as opposed to the reserved capacity.
+		B3_FORCE_INLINE	int size() const { return m_size; }
+		/// Read-only access to element n; the index is checked with b3Assert only.
+		B3_FORCE_INLINE const T& at(int n) const
+		{
+			b3Assert(n>=0);
+			b3Assert(n<size());
+			return *(m_data + n);
+		}
+		/// Mutable access to element n; the index is checked with b3Assert only.
+		B3_FORCE_INLINE T& at(int n)
+		{
+			b3Assert(n>=0);
+			b3Assert(n<size());
+			return *(m_data + n);
+		}
+		/// Read-only subscript; same checks and result as at(n).
+		B3_FORCE_INLINE const T& operator[](int n) const
+		{
+			b3Assert(n>=0);
+			b3Assert(n<size());
+			return *(m_data + n);
+		}
+		/// Mutable subscript; same checks and result as at(n).
+		B3_FORCE_INLINE T& operator[](int n)
+		{
+			b3Assert(n>=0);
+			b3Assert(n<size());
+			return *(m_data + n);
+		}
+		///Destroy all elements, release the storage and return to the empty state. Prefer array.resize(0) when the memory will be reused, to reduce the overhead of run-time memory (de)allocations.
+		B3_FORCE_INLINE	void	clear()
+		{
+			const int count = size();
+			destroy(0,count);
+			deallocate();
+			init();
+		}
+		/// Drop the last element and run its destructor; the array must not be empty (asserted).
+		B3_FORCE_INLINE	void	pop_back()
+		{
+			b3Assert(m_size>0);
+			const int last = m_size - 1;
+			m_size = last;
+			m_data[last].~T();
+		}
+		///Change the element count without constructing or destroying anything.
+		///Growing reserves room for newsize elements but leaves the added slots uninitialized;
+		///shrinking only lowers the count (no destructor is called, no memory is freed).
+		B3_FORCE_INLINE	void	resizeNoInitialize(int newsize)
+		{
+			if (newsize > size())
+			{
+				//make room; the new slots are left uninitialized
+				reserve(newsize);
+			}
+			m_size = newsize;
+		}
+		///Change the number of elements. When growing, every added element is copy-constructed from fillData;
+		///when shrinking, the surplus elements are destroyed but the memory is kept.
+		B3_FORCE_INLINE	void	resize(int newsize, const T& fillData=T())
+		{
+			const int oldSize = size();
+			if (newsize >= oldSize)
+			{
+				if (newsize > oldSize)
+				{
+					reserve(newsize);
+				}
+				int slot = oldSize;
+				while (slot < newsize)
+				{
+					new ( &m_data[slot]) T(fillData);
+					++slot;
+				}
+			}
+			else
+			{
+				int victim = newsize;
+				while (victim < oldSize)
+				{
+					m_data[victim].~T();
+					++victim;
+				}
+			}
+			m_size = newsize;
+		}
+		/// Append one slot, growing the storage when it is full, and return a reference to it.
+		/// No constructor is run for the new slot.
+		B3_FORCE_INLINE	T&  expandNonInitializing( )
+		{
+			const int slot = size();
+			if( capacity() == slot )
+			{
+				reserve( allocSize(slot) );
+			}
+			++m_size;
+			return m_data[slot];
+		}
+		/// Append one element copy-constructed from fillValue, growing the storage when it is full,
+		/// and return a reference to the new element.
+		B3_FORCE_INLINE	T&  expand( const T& fillValue=T())
+		{
+			const int slot = size();
+			if( capacity() == slot )
+			{
+				reserve( allocSize(slot) );
+			}
+			++m_size;
+			//in-place new: constructs into the array's storage, no heap allocation of its own
+			new (&m_data[slot]) T(fillValue);
+			return m_data[slot];
+		}
+		/// Append a copy of _Val, growing the storage (see allocSize) when the array is full.
+		B3_FORCE_INLINE	void push_back(const T& _Val)
+		{
+			const int sz = size();
+			if( sz == capacity() )
+			{
+				reserve( allocSize(sz) );
+			}
+			//copy-construct in place; the former follow-up "m_data[size()] = _Val" merely
+			//re-assigned the element that had just been constructed from the same value
+			new ( &m_data[m_size] ) T(_Val);
+			m_size++;
+		}
+		/// Number of elements the current storage can hold (m_capacity); see size() and reserve().
+		B3_FORCE_INLINE	int capacity() const { return m_capacity; }
+		B3_FORCE_INLINE	void reserve(int _Count) // make sure the storage can hold at least _Count elements; does nothing when capacity() is already sufficient
+		{	// determine new minimum length of allocated storage
+			if (capacity() < _Count)
+			{	// not enough room, reallocate
+				T*	s = (T*)allocate(_Count);
+				b3Assert(s);
+				if (s==0)
+				{
+					b3Error("b3AlignedObjectArray reserve out-of-memory\n");
+					_Count=0; // allocation failed: the array ends up with no storage, capacity 0 and size 0
+					m_size=0; // NOTE(review): size is zeroed before destroy() below, so on this path the existing elements' destructors are not run
+				}
+				copy(0, size(), s); // copy the current elements into the new block
+				destroy(0,size()); // destroy the originals
+				deallocate(); // let go of the old block (freed only when owned)
+				//PCK: added this line
+				m_ownsMemory = true;
+				m_data = s;
+				m_capacity = _Count;
+			}
+		}
+		/// Comparison functor: orders two elements with operator<.
+		class less
+		{
+			public:
+				bool operator() ( const T& a, const T& b )
+				{
+					const bool aBeforeB = ( a < b );
+					return aBeforeB;
+				}
+		};
+		template <typename L>
+		void quickSortInternal(const L& CompareFunc,int lo, int hi) // recursive quicksort of m_data[lo..hi] (both bounds inclusive), ordered by CompareFunc
+		{
+		//  lo is the lower index, hi is the upper index
+		//  of the region of array a that is to be sorted
+			int i=lo, j=hi;
+			T x=m_data[(lo+hi)/2]; // pivot: a copy of the middle element, so swaps below cannot change it
+			//  partition
+			do
+			{    
+				while (CompareFunc(m_data[i],x)) // skip elements that already order before the pivot
+					i++; 
+				while (CompareFunc(x,m_data[j])) // skip elements that already order after the pivot
+					j--;
+				if (i<=j)
+				{
+					swap(i,j);
+					i++; j--;
+				}
+			} while (i<=j);
+			//  recursion
+			if (lo<j) 
+				quickSortInternal( CompareFunc, lo, j);
+			if (i<hi) 
+				quickSortInternal( CompareFunc, i, hi);
+		}
+		/// Sort the whole array in place with quickSortInternal, ordered by CompareFunc.
+		template <typename L>
+		void quickSort(const L& CompareFunc)
+		{
+			//don't sort 0 or 1 elements
+			const int count = size();
+			if (count <= 1)
+			{
+				return;
+			}
+			quickSortInternal(CompareFunc,0,count-1);
+		}
+		///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/ -- sift-down step: k and n are 1-based heap positions, pArr itself is indexed from 0 (hence the "- 1" offsets)
+		template <typename L>
+		void downHeap(T *pArr, int k, int n, const L& CompareFunc)
+		{
+			/*  PRE: a[k+1..N] is a heap */
+			/* POST:  a[k..N]  is a heap */
+			T temp = pArr[k - 1]; // element being sifted down; written back once its final position is known
+			/* k has child(s) */
+			while (k <= n/2) 
+			{
+				int child = 2*k;
+				if ((child < n) && CompareFunc(pArr[child - 1] , pArr[child])) // a right child exists and the left child orders before it
+				{
+					child++;
+				}
+				/* pick larger child */
+				if (CompareFunc(temp , pArr[child - 1]))
+				{
+					/* move child up */
+					pArr[k - 1] = pArr[child - 1];
+					k = child;
+				}
+				else
+				{
+					break;
+				}
+			}
+			pArr[k - 1] = temp;
+		} /*downHeap*/
+		/// Exchange the elements at index0 and index1.
+		/// With B3_USE_MEMCPY the raw bytes are swapped; otherwise a temporary copy and operator= are used.
+		void	swap(int index0,int index1)
+		{
+#ifdef B3_USE_MEMCPY
+			char	temp[sizeof(T)];
+			memcpy(temp,&m_data[index0],sizeof(T));
+			memcpy(&m_data[index0],&m_data[index1],sizeof(T));
+			memcpy(&m_data[index1],temp,sizeof(T));
+#else
+			//the two variants must be mutually exclusive: both declare `temp`
+			T temp = m_data[index0];
+			m_data[index0] = m_data[index1];
+			m_data[index1] = temp;
+#endif //B3_USE_MEMCPY
+		}
+	template <typename L>
+	void heapSort(const L& CompareFunc) // in-place heap sort of the whole array, ordered by CompareFunc
+	{
+		/* sort a[0..N-1],  N.B. 0 to N-1 */
+		int k;
+		int n = m_size;
+		for (k = n/2; k > 0; k--) // build the heap bottom-up, starting at the last position that has a child
+		{
+			downHeap(m_data, k, n, CompareFunc);
+		}
+		/* a[1..N] is now a heap */
+		while ( n>=1 ) 
+		{
+			swap(0,n-1); /* largest of a[0..n-1] */
+			n = n - 1; // the element just moved to the back is in its final place
+			/* restore a[1..i-1] heap */
+			downHeap(m_data, 1, n, CompareFunc);
+		} 
+	}
+	///non-recursive binary search, assumes the array is sorted in ascending order
+	///returns the index of an element that compares neither greater nor less than key, or size() when there is none
+	int	findBinarySearch(const T& key) const
+	{
+		int first = 0;
+		int last = size()-1;
+		//assume sorted array
+		while (first <= last) {
+			//midpoint from the distance between the bounds, so that first+last cannot overflow int
+			int mid = first + (last - first) / 2;
+			if (key > m_data[mid]) 
+				first = mid + 1;  // repeat search in top half.
+			else if (key < m_data[mid]) 
+				last = mid - 1; // repeat search in bottom half.
+			else
+				return mid;     // found it. return position
+		}
+		return size();    // failed to find key
+	}
+	/// Linear scan for the first element equal to key (operator==).
+	/// Returns its index, or size() when nothing matches.
+	int	findLinearSearch(const T& key) const
+	{
+		for (int i=0;i<size();i++)
+		{
+			if (m_data[i] == key)
+			{
+				return i;
+			}
+		}
+		return size();
+	}
+	/// Remove the first element equal to key, if there is one. The last element is swapped
+	/// into the vacated position, so the order of the remaining elements can change.
+	void	remove(const T& key)
+	{
+		const int hit = findLinearSearch(key);
+		if (hit >= size())
+		{
+			return;
+		}
+		swap( hit,size()-1);
+		pop_back();
+	}
+	//PCK: whole function
+	/// Use an externally provided buffer as element storage. The current contents are cleared
+	/// first; afterwards the array reports `size` elements and `capacity` slots, and the
+	/// buffer is flagged as not owned.
+	void initializeFromBuffer(void *buffer, int size, int capacity)
+	{
+		clear();
+		m_data = (T*)buffer;
+		m_capacity = capacity;
+		m_size = size;
+		m_ownsMemory = false;
+	}
+	/// Replace the contents of this array with copies of otherArray's elements.
+	/// Copying an array onto itself is a no-op.
+	void copyFromArray(const b3AlignedObjectArray& otherArray)
+	{
+		if (this == &otherArray)
+		{
+			return; //destroying our elements first would invalidate the source
+		}
+		const int otherSize = otherArray.size();
+		//destroy the current elements and copy-construct into raw slots; the former
+		//resize()+copy() sequence placement-new'ed over elements that were still alive
+		destroy(0, size());
+		m_size = 0;
+		reserve(otherSize);
+		otherArray.copy(0, otherSize, m_data);
+		m_size = otherSize;
+	}
+#endif //B3_OBJECT_ARRAY__
diff --git a/src/bullet/Bullet3Common/b3CommandLineArgs.h b/src/bullet/Bullet3Common/b3CommandLineArgs.h
new file mode 100644
index 00000000..eeba5450
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3CommandLineArgs.h
@@ -0,0 +1,106 @@
+ * Command-line parsing
+ ******************************************************************************/
+#include <map>
+#include <algorithm>
+#include <string>
+#include <cstring>
+#include <sstream>
+class b3CommandLineArgs // collects "--key" and "--key=value" command-line arguments into a string map
+	std::map<std::string, std::string> pairs; // parsed arguments: key (without the leading "--") -> value ("" when no '=' was given)
+	// Constructor: parses the given argument vector via addArgs
+	b3CommandLineArgs(int argc, char **argv)
+	{
+		addArgs(argc,argv);
+	}
+	void addArgs(int argc, char**argv) // parse argv[1] .. argv[argc-1]; argv[0] is skipped
+	{
+		using namespace std;
+	    for (int i = 1; i < argc; i++)
+	    {
+	        std::string arg = argv[i];
+	        if ((arg[0] != '-') || (arg[1] != '-')) { // only arguments that start with "--" are considered
+	        	continue;
+	        }
+        	std::string::size_type pos;
+		    std::string key, val;
+	        if ((pos = arg.find( '=')) == std::string::npos) {
+	        	key = std::string(arg, 2, arg.length() - 2); // "--flag": everything after the dashes is the key
+	        	val = "";
+	        } else {
+	        	key = std::string(arg, 2, pos - 2); // "--key=value": the text between the dashes and the first '='
+	        	val = std::string(arg, pos + 1, arg.length() - 1); // everything after the first '=' (the substring constructor stops at the end of arg)
+	        }
+			//only add new keys, don't replace existing
+			if(pairs.find(key) == pairs.end())
+			{
+        		pairs[key] = val;
+			}
+	    }
+	}
+	bool CheckCmdLineFlag(const char* arg_name) // true when arg_name was parsed as a key, with or without a value
+	{
+		using namespace std;
+		map<std::string, std::string>::iterator itr;
+		if ((itr = pairs.find(arg_name)) != pairs.end()) {
+			return true;
+	    }
+		return false;
+	}
+	template <typename T>
+	bool GetCmdLineArgument(const char *arg_name, T &val); // only declared here
+	int ParsedArgc() // number of distinct keys stored in pairs
+	{
+		return pairs.size();
+	}
+template <typename T>
+inline bool b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val) // looks up arg_name and, when present, extracts its stored text into val with operator>>
+	using namespace std;
+	map<std::string, std::string>::iterator itr;
+	if ((itr = pairs.find(arg_name)) != pairs.end()) {
+		istringstream strstream(itr->second);
+		strstream >> val; // the stream state is not checked: true is returned even when the text does not parse as T
+		return true;
+    }
+	return false; // key not present; val is not written
+template <>
+inline bool b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val) // char* specialization: hands back the stored text as a newly malloc'ed, 0-terminated copy
+	using namespace std;
+	map<std::string, std::string>::iterator itr;
+	if ((itr = pairs.find(arg_name)) != pairs.end()) {
+		std::string s = itr->second;
+		val = (char*) malloc(sizeof(char) * (s.length() + 1)); // nothing in this function frees the buffer, so releasing it is presumably left to the caller; NOTE(review): the malloc result is not checked before strcpy
+		std::strcpy(val, s.c_str());
+		return true;
+	} else {
+    	val = NULL; // key not present: val is reset to NULL
+	}
+	return false;
diff --git a/src/bullet/Bullet3Common/b3FileUtils.h b/src/bullet/Bullet3Common/b3FileUtils.h
new file mode 100644
index 00000000..1a331029
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3FileUtils.h
@@ -0,0 +1,138 @@
+#ifndef B3_FILE_UTILS_H
+#define B3_FILE_UTILS_H
+#include <stdio.h>
+#include "b3Scalar.h"
+#include <stddef.h>//ptrdiff_h
+#include <string.h>
+struct b3FileUtils
+	/// Nothing to initialize.
+	b3FileUtils() {}
+	/// Virtual, empty destructor.
+	virtual ~b3FileUtils() {}
+	/// Look for orgFileName exactly as given and, failing that, under a fixed list of directory
+	/// prefixes relative to the current working directory.
+	/// @param orgFileName               file name to look for
+	/// @param relativeFileName          receives the path that could be opened
+	/// @param maxRelativeFileNameMaxLen capacity of relativeFileName in bytes, including the terminating 0
+	/// @return true when a candidate path fit into relativeFileName and could be opened for reading
+	static bool findFile(const char* orgFileName, char* relativeFileName, int maxRelativeFileNameMaxLen)
+	{
+		if (!orgFileName || !relativeFileName || maxRelativeFileNameMaxLen<=0)
+		{
+			return false;
+		}
+		//the empty prefix tries the name exactly as given, before any search directory
+		const char* prefix[]={"","./","./data/","../data/","../../data/","../../../data/","../../../../data/"};
+		const int numPrefixes = sizeof(prefix)/sizeof(const char*);
+		for (int i=0;i<numPrefixes;i++)
+		{
+			//bounded formatting: the previous sprintf calls ignored maxRelativeFileNameMaxLen
+			const int written = snprintf(relativeFileName,maxRelativeFileNameMaxLen,"%s%s",prefix[i],orgFileName);
+			if (written<0 || written>=maxRelativeFileNameMaxLen)
+			{
+				continue; //the path does not fit; a truncated name must not be opened or reported
+			}
+			FILE* f = fopen(relativeFileName,"rb");
+			if (f)
+			{
+				fclose(f);
+				return true;
+			}
+		}
+		return false;
+	}
+	/// Scan name left to right for occurrences of pattern and return a pointer just past the
+	/// last one found, or name itself when pattern does not occur.
+	static const char* strip2(const char* name, const char* pattern)
+	{
+		const size_t patternLength = strlen(pattern);
+		const char* tail = name;
+		//hop over one occurrence after another; tail ends up behind the final one
+		for (;;)
+		{
+			const char* hit = strstr(tail, pattern);
+			if (!hit)
+			{
+				break;
+			}
+			tail = hit + patternLength;
+		}
+		return tail;
+	}
+	/// Copy the directory part of fileName (everything up to and including the last path
+	/// separator located by strip2) into path and 0-terminate it.
+	/// Returns the number of characters copied. When there is no directory part, or it does not
+	/// fit into maxPathLength bytes, path becomes the empty string and 0 is returned.
+	static int extractPath(const char* fileName, char* path, int maxPathLength)
+	{
+		const char* baseName = strip2(strip2(fileName, "/"), "\\");
+		ptrdiff_t dirLength = baseName-fileName;
+		b3Assert((dirLength+1)<maxPathLength);
+		const bool usable = dirLength && ((dirLength+1)<maxPathLength);
+		if (!usable)
+		{
+			dirLength = 0;
+			b3Assert(maxPathLength>0);
+			if (maxPathLength>0)
+			{
+				path[dirLength] = 0;
+			}
+			return dirLength;
+		}
+		int pos = 0;
+		while (pos<dirLength)
+		{
+			path[pos] = fileName[pos];
+			pos++;
+		}
+		path[dirLength]=0;
+		return dirLength;
+	}
+	/// Map a character in the range 'A'..'Z' to its lower-case counterpart; any other character is returned unchanged.
+	static char toLowerChar(const char t)
+	{
+		const bool isUpper = (t>=(char)'A') && (t<=(char)'Z');
+		if (!isUpper)
+		{
+			return t;
+		}
+		return t + ((char)'a' - (char)'A');
+	}
+	/// Apply toLowerChar to every character of a 0-terminated string, in place.
+	static void toLower(char* str)
+	{
+		int remaining=strlen(str);
+		char* cur = str;
+		while (remaining>0)
+		{
+			*cur = toLowerChar(*cur);
+			++cur;
+			--remaining;
+		}
+	}
+	/*static const char* strip2(const char* name, const char* pattern)
+	{
+		size_t const patlen = strlen(pattern);
+		size_t patcnt = 0;
+		const char * oriptr;
+		const char * patloc;
+		// find how many times the pattern occurs in the original string
+		for (oriptr = name; patloc = strstr(oriptr, pattern); oriptr = patloc + patlen)
+		{
+			patcnt++;
+		}
+		return oriptr;
+	}
+	*/
+#endif //B3_FILE_UTILS_H
diff --git a/src/bullet/Bullet3Common/b3HashMap.h b/src/bullet/Bullet3Common/b3HashMap.h
new file mode 100644
index 00000000..3569503f
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3HashMap.h
@@ -0,0 +1,450 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_HASH_MAP_H
+#define B3_HASH_MAP_H
+#include "b3AlignedObjectArray.h"
+///very basic hashable string implementation, compatible with b3HashMap
+struct b3HashString
+	const char* m_string; // the pointer passed to the constructor, stored as-is (the characters are not copied)
+	unsigned int	m_hash; // hash of the characters, computed once in the constructor
+	B3_FORCE_INLINE	unsigned int getHash()const
+	{
+		return m_hash;
+	}
+	b3HashString(const char* name)
+		:m_string(name)
+	{
+		/* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */
+		static const unsigned int  InitialFNV = 2166136261u;
+		static const unsigned int FNVMultiple = 16777619u;
+		/* Fowler / Noll / Vo (FNV) Hash */
+		unsigned int hash = InitialFNV;
+		for(int i = 0; m_string[i]; i++) // runs up to, not including, the terminating 0
+		{
+			hash = hash ^ (m_string[i]);       /* xor  the low 8 bits */
+			hash = hash * FNVMultiple;  /* multiply by the magic number */
+		}
+		m_hash = hash;
+	}
+	int portableStringCompare(const char* src,	const char* dst) const // compares two 0-terminated strings byte-wise as unsigned char; returns -1, 0 or 1
+	{
+			int ret = 0 ;
+			while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst) // advance while the bytes match and dst has not ended
+					++src, ++dst;
+			if ( ret < 0 )
+					ret = -1 ;
+			else if ( ret > 0 )
+					ret = 1 ;
+			return( ret );
+	}
+	bool equals(const b3HashString& other) const // equal when both hold the same pointer or the same characters; m_hash is not consulted
+	{
+		return (m_string == other.m_string) ||
+			(0==portableStringCompare(m_string,other.m_string));
+	}
+const int B3_HASH_NULL=0xffffffff; // sentinel index used by b3HashMap for "no entry" (empty bucket, end of chain, key not found)
+class b3HashInt // integer key wrapper providing the getHash()/equals() members that b3HashMap calls on its keys
+	int	m_uid; // the wrapped integer key
+	b3HashInt(int uid)	:m_uid(uid)
+	{
+	}
+	int	getUid1() const
+	{
+		return m_uid;
+	}
+	void	setUid1(int uid)
+	{
+		m_uid = uid;
+	}
+	bool equals(const b3HashInt& other) const // two keys are equal when their uids are equal
+	{
+		return getUid1() == other.getUid1();
+	}
+	//to our success
+	B3_FORCE_INLINE	unsigned int getHash()const // scrambles m_uid with the shift/add/xor sequence below
+	{
+		int key = m_uid;
+		// Thomas Wang's hash
+		key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+		return key;
+	}
+class b3HashPtr // pointer key wrapper providing the getHash()/equals() members that b3HashMap calls on its keys
+	union
+	{
+		const void*	m_pointer; // the wrapped pointer
+		int	m_hashValues[2]; // the same storage viewed as ints; read by getHash()
+	};
+	b3HashPtr(const void* ptr)
+		:m_pointer(ptr)
+	{
+	}
+	const void*	getPointer() const
+	{
+		return m_pointer;
+	}
+	bool equals(const b3HashPtr& other) const // two keys are equal when they wrap the same pointer value
+	{
+		return getPointer() == other.getPointer();
+	}
+	//to our success
+	B3_FORCE_INLINE	unsigned int getHash()const
+	{
+		const bool VOID_IS_8 = ((sizeof(void*)==8));
+		int key = VOID_IS_8? m_hashValues[0]+m_hashValues[1] : m_hashValues[0]; // 8-byte pointers: fold both int halves into one; otherwise the first int is used alone
+		// Thomas Wang's hash
+		key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+		return key;
+	}
+template <class Value>
+class b3HashKeyPtr // integer-uid key type, parameterized on Value (Value is used only in the type name here)
+        int     m_uid; // the wrapped integer key
+        b3HashKeyPtr(int uid)    :m_uid(uid)
+        {
+        }
+        int     getUid1() const
+        {
+                return m_uid;
+        }
+        bool equals(const b3HashKeyPtr<Value>& other) const // two keys are equal when their uids are equal
+        {
+                return getUid1() == other.getUid1();
+        }
+        //to our success
+        B3_FORCE_INLINE       unsigned int getHash()const // same scrambling sequence as b3HashInt::getHash
+        {
+                int key = m_uid;
+                // Thomas Wang's hash
+                key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+                return key;
+        }
+template <class Value>
+class b3HashKey // integer-uid key type, parameterized on Value (Value is used only in the type name here)
+	int	m_uid; // the wrapped integer key
+	b3HashKey(int uid)	:m_uid(uid)
+	{
+	}
+	int	getUid1() const
+	{
+		return m_uid;
+	}
+	bool equals(const b3HashKey<Value>& other) const // two keys are equal when their uids are equal
+	{
+		return getUid1() == other.getUid1();
+	}
+	//to our success
+	B3_FORCE_INLINE	unsigned int getHash()const // same scrambling sequence as b3HashInt::getHash
+	{
+		int key = m_uid;
+		// Thomas Wang's hash
+		key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+		return key;
+	}
+///The b3HashMap template class implements a generic and lightweight hashmap. Key must provide getHash() and equals(), which the methods below call.
+///A basic sample of how to use b3HashMap is located in Demos\BasicDemo\main.cpp
+template <class Key, class Value>
+class b3HashMap
+	b3AlignedObjectArray<int>		m_hashTable; // per bucket: index of the first entry in that bucket's chain, or B3_HASH_NULL
+	b3AlignedObjectArray<int>		m_next; // per entry: index of the next entry in the same chain, or B3_HASH_NULL
+	b3AlignedObjectArray<Value>		m_valueArray; // the stored values, kept densely packed (remove() moves the last entry into the gap)
+	b3AlignedObjectArray<Key>		m_keyArray; // the stored keys, parallel to m_valueArray
+	void	growTables(const Key& /*key*/) // resize m_hashTable/m_next to the value array's capacity and rebuild the bucket chains; the key argument is unused
+	{
+		int newCapacity = m_valueArray.capacity();
+		if (m_hashTable.size() < newCapacity)
+		{
+			//grow hashtable and next table
+			int curHashtableSize = m_hashTable.size();
+			m_hashTable.resize(newCapacity);
+			m_next.resize(newCapacity);
+			int i;
+			for (i= 0; i < newCapacity; ++i) // mark every bucket as empty
+			{
+				m_hashTable[i] = B3_HASH_NULL;
+			}
+			for (i = 0; i < newCapacity; ++i) // cut every chain link
+			{
+				m_next[i] = B3_HASH_NULL;
+			}
+			for(i=0;i<curHashtableSize;i++) // re-link the first curHashtableSize entries under the new mask
+			{
+				//const Value& value = m_valueArray[i];
+				//const Key& key = m_keyArray[i];
+				int	hashValue = m_keyArray[i].getHash() & (m_valueArray.capacity()-1);	// New hash value with new mask
+				m_next[i] = m_hashTable[hashValue]; // push entry i onto the front of its bucket's chain
+				m_hashTable[hashValue] = i;
+			}
+		}
+	}
+	public:
+	/// Store value under key. When an equal key is already present its value is overwritten in
+	/// place; otherwise the pair is appended and pushed onto the front of its bucket's chain.
+	void insert(const Key& key, const Value& value) {
+		//replace value if the key is already there
+		const int existing = findIndex(key);
+		if (existing != B3_HASH_NULL)
+		{
+			m_valueArray[existing]=value;
+			return;
+		}
+		const int slot = m_valueArray.size();
+		const int capacityBefore = m_valueArray.capacity();
+		m_valueArray.push_back(value);
+		m_keyArray.push_back(key);
+		if (capacityBefore < m_valueArray.capacity())
+		{
+			//the value array grew: bring the bucket tables up to the new capacity
+			growTables(key);
+		}
+		//bucket index under the current (possibly new) capacity mask
+		const int bucket = key.getHash() & (m_valueArray.capacity()-1);
+		m_next[slot] = m_hashTable[bucket];
+		m_hashTable[bucket] = slot;
+	}
+	void remove(const Key& key) { // remove the entry stored under key, if any; the last entry is moved into the freed position so the arrays stay dense
+		int hash = key.getHash() & (m_valueArray.capacity()-1);
+		int pairIndex = findIndex(key);
+		if (pairIndex ==B3_HASH_NULL)
+		{
+			return; // key not present: nothing to do
+		}
+		// Remove the pair from the hash table.
+		int index = m_hashTable[hash];
+		b3Assert(index != B3_HASH_NULL);
+		int previous = B3_HASH_NULL;
+		while (index != pairIndex) // walk the chain to find the entry's predecessor
+		{
+			previous = index;
+			index = m_next[index];
+		}
+		if (previous != B3_HASH_NULL)
+		{
+			b3Assert(m_next[previous] == pairIndex);
+			m_next[previous] = m_next[pairIndex]; // unlink from the middle or end of the chain
+		}
+		else
+		{
+			m_hashTable[hash] = m_next[pairIndex]; // the entry was the chain head
+		}
+		// We now move the last pair into spot of the
+		// pair being removed. We need to fix the hash
+		// table indices to support the move.
+		int lastPairIndex = m_valueArray.size() - 1;
+		// If the removed pair is the last pair, we are done.
+		if (lastPairIndex == pairIndex)
+		{
+			m_valueArray.pop_back();
+			m_keyArray.pop_back();
+			return;
+		}
+		// Remove the last pair from the hash table.
+		int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity()-1);
+		index = m_hashTable[lastHash];
+		b3Assert(index != B3_HASH_NULL);
+		previous = B3_HASH_NULL;
+		while (index != lastPairIndex) // same predecessor search, now for the last entry
+		{
+			previous = index;
+			index = m_next[index];
+		}
+		if (previous != B3_HASH_NULL)
+		{
+			b3Assert(m_next[previous] == lastPairIndex);
+			m_next[previous] = m_next[lastPairIndex];
+		}
+		else
+		{
+			m_hashTable[lastHash] = m_next[lastPairIndex];
+		}
+		// Copy the last pair into the remove pair's spot.
+		m_valueArray[pairIndex] = m_valueArray[lastPairIndex];
+		m_keyArray[pairIndex] = m_keyArray[lastPairIndex];
+		// Insert the last pair into the hash table
+		m_next[pairIndex] = m_hashTable[lastHash]; // re-linked at the front of its bucket under its new index
+		m_hashTable[lastHash] = pairIndex;
+		m_valueArray.pop_back();
+		m_keyArray.pop_back();
+	}
+	/// Number of key/value pairs stored.
+	int size() const { return m_valueArray.size(); }
+	/// Read-only access to the value at a position of the value array (not a key lookup).
+	const Value* getAtIndex(int index) const
+	{
+		b3Assert(index < m_valueArray.size());
+		const Value& entry = m_valueArray[index];
+		return &entry;
+	}
+	/// Mutable access to the value at a position of the value array (not a key lookup).
+	Value* getAtIndex(int index)
+	{
+		b3Assert(index < m_valueArray.size());
+		Value& entry = m_valueArray[index];
+		return &entry;
+	}
+	/// Key lookup through find(): pointer to the stored value, or NULL when the key is absent.
+	Value* operator[](const Key& key) {
+		Value* const found = find(key);
+		return found;
+	}
+	/// Look up key; returns a pointer to its value, or NULL when the key is not stored.
+	const Value*	find(const Key& key) const
+	{
+		const int index = findIndex(key);
+		if (index != B3_HASH_NULL)
+		{
+			return &m_valueArray[index];
+		}
+		return NULL;
+	}
+	/// Look up key; returns a pointer to its (mutable) value, or NULL when the key is not stored.
+	Value*	find(const Key& key)
+	{
+		const int index = findIndex(key);
+		if (index != B3_HASH_NULL)
+		{
+			return &m_valueArray[index];
+		}
+		return NULL;
+	}
+	/// Position of key in the key/value arrays, or B3_HASH_NULL when the key is not stored.
+	int	findIndex(const Key& key) const
+	{
+		const unsigned int bucket = key.getHash() & (m_valueArray.capacity()-1);
+		if (bucket >= (unsigned int)m_hashTable.size())
+		{
+			return B3_HASH_NULL;
+		}
+		//walk the bucket's chain until the key matches or the chain ends
+		int index = m_hashTable[bucket];
+		while (index != B3_HASH_NULL)
+		{
+			if (!(key.equals(m_keyArray[index]) == false))
+			{
+				break;
+			}
+			index = m_next[index];
+		}
+		return index;
+	}
+	/// Remove every entry by clearing all four internal arrays.
+	void	clear()
+	{
+		m_keyArray.clear();
+		m_valueArray.clear();
+		m_next.clear();
+		m_hashTable.clear();
+	}
+#endif //B3_HASH_MAP_H
diff --git a/src/bullet/Bullet3Common/b3Logging.cpp b/src/bullet/Bullet3Common/b3Logging.cpp
new file mode 100644
index 00000000..a8e95071
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3Logging.cpp
@@ -0,0 +1,160 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "b3Logging.h"
+#include <stdio.h>
+#include <stdarg.h>
+#ifdef _WIN32
+#include <windows.h>
+#endif //_WIN32
+void b3PrintfFuncDefault(const char* msg) // default sink installed for b3Printf output
+#ifdef _WIN32
+	OutputDebugStringA(msg); // Windows builds: send the message to the debugger output
+	printf("%s",msg);
+    //is this portable?
+    fflush(stdout);
+void b3WarningMessageFuncDefault(const char* msg) // default sink installed for b3Warning output; same statements as b3PrintfFuncDefault
+#ifdef _WIN32
+	OutputDebugStringA(msg);
+	printf("%s",msg);
+    //is this portable?
+    fflush(stdout);
+void b3ErrorMessageFuncDefault(const char* msg) // default sink installed for b3Error output; same statements as b3PrintfFuncDefault
+#ifdef _WIN32
+	OutputDebugStringA(msg);
+	printf("%s",msg);
+    //is this portable?
+    fflush(stdout);
+static b3PrintfFunc* b3s_printfFunc = b3PrintfFuncDefault; // current sink for b3OutputPrintfVarArgsInternal
+static b3WarningMessageFunc* b3s_warningMessageFunc = b3WarningMessageFuncDefault; // current sink for b3OutputWarningMessageVarArgsInternal
+static b3ErrorMessageFunc* b3s_errorMessageFunc = b3ErrorMessageFuncDefault; // current sink for b3OutputErrorMessageVarArgsInternal
+///The developer can route b3Printf output using their own implementation
+void b3SetCustomPrintfFunc(b3PrintfFunc* printfFunc)
+	b3s_printfFunc = printfFunc;
+///Install the sink for b3Warning output; the parameter type matches the declaration in b3Logging.h
+void b3SetCustomWarningMessageFunc(b3WarningMessageFunc* warningMessageFunc)
+	b3s_warningMessageFunc = warningMessageFunc;
+///Install the sink for b3Error output; the parameter type matches the declaration in b3Logging.h
+void b3SetCustomErrorMessageFunc(b3ErrorMessageFunc* errorMessageFunc)
+	b3s_errorMessageFunc = errorMessageFunc;
+//#define B3_MAX_DEBUG_STRING_LENGTH 2048
+void b3OutputPrintfVarArgsInternal(const char *str, ...) // formats str and its printf-style arguments into a local buffer and passes the text to b3s_printfFunc
+    char strDebug[B3_MAX_DEBUG_STRING_LENGTH]={0}; // zero-initialized buffer of B3_MAX_DEBUG_STRING_LENGTH bytes
+    va_list argList;
+    va_start(argList, str);
+#ifdef _MSC_VER
+    vsprintf_s(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+    vsnprintf(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList); // NOTE(review): presumably the non-MSVC alternative to the vsprintf_s call above -- confirm the #else/#endif lines are intact
+        (b3s_printfFunc)(strDebug);
+    va_end(argList);    
+void b3OutputWarningMessageVarArgsInternal(const char *str, ...) // same formatting, delivered to b3s_warningMessageFunc
+    char strDebug[B3_MAX_DEBUG_STRING_LENGTH]={0};
+    va_list argList;
+    va_start(argList, str);
+#ifdef _MSC_VER
+    vsprintf_s(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+    vsnprintf(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+        (b3s_warningMessageFunc)(strDebug);
+    va_end(argList);    
+void b3OutputErrorMessageVarArgsInternal(const char *str, ...) // same formatting, delivered to b3s_errorMessageFunc
+    char strDebug[B3_MAX_DEBUG_STRING_LENGTH]={0};
+    va_list argList;
+    va_start(argList, str);
+#ifdef _MSC_VER
+    vsprintf_s(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+    vsnprintf(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+        (b3s_errorMessageFunc)(strDebug);
+    va_end(argList);    
+void	b3EnterProfileZoneDefault(const char* name) // default enter hook; NOTE(review): no statements are visible for it here -- presumably an empty function, confirm
+void	b3LeaveProfileZoneDefault() // default leave hook; NOTE(review): no statements are visible for it here -- presumably an empty function, confirm
+static b3EnterProfileZoneFunc* b3s_enterFunc = b3EnterProfileZoneDefault; // hook currently called by b3EnterProfileZone
+static b3LeaveProfileZoneFunc* b3s_leaveFunc = b3LeaveProfileZoneDefault; // hook currently called by b3LeaveProfileZone
+void b3EnterProfileZone(const char* name) // forwards name to the installed enter hook
+	(b3s_enterFunc)(name);
+void b3LeaveProfileZone() // calls the installed leave hook
+	(b3s_leaveFunc)();
+void b3SetCustomEnterProfileZoneFunc(b3EnterProfileZoneFunc* enterFunc) // installs enterFunc as the enter hook
+	b3s_enterFunc = enterFunc;
+void b3SetCustomLeaveProfileZoneFunc(b3LeaveProfileZoneFunc* leaveFunc) // installs leaveFunc as the leave hook
+	b3s_leaveFunc = leaveFunc;
+#ifndef _MSC_VER
+#undef vsprintf_s
diff --git a/src/bullet/Bullet3Common/b3Logging.h b/src/bullet/Bullet3Common/b3Logging.h
new file mode 100644
index 00000000..b302effe
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3Logging.h
@@ -0,0 +1,77 @@
+#ifndef B3_LOGGING_H
+#define B3_LOGGING_H
+#ifdef __cplusplus
+extern "C" {
+///We add the do/while so that the statement "if (condition) b3Printf("test"); else {...}" would fail
+///You can also customize the message by uncommenting out a different line below
+#define b3Printf(...) b3OutputPrintfVarArgsInternal(__VA_ARGS__)
+//#define b3Printf(...) do {b3OutputPrintfVarArgsInternal("b3Printf[%s,%d]:",__FILE__,__LINE__);b3OutputPrintfVarArgsInternal(__VA_ARGS__); } while(0)
+//#define b3Printf b3OutputPrintfVarArgsInternal
+//#define b3Printf(...) printf(__VA_ARGS__)
+//#define b3Printf(...)
+#define b3Warning(...) do {b3OutputWarningMessageVarArgsInternal("b3Warning[%s,%d]:\n",__FILE__,__LINE__);b3OutputWarningMessageVarArgsInternal(__VA_ARGS__); }while(0)
+#define b3Error(...) do {b3OutputErrorMessageVarArgsInternal("b3Error[%s,%d]:\n",__FILE__,__LINE__);b3OutputErrorMessageVarArgsInternal(__VA_ARGS__); } while(0)
+#ifndef B3_NO_PROFILE
+void b3EnterProfileZone(const char* name);
+void b3LeaveProfileZone();
+#ifdef __cplusplus
+class	b3ProfileZone // scope guard for a profile zone: enters on construction, leaves on destruction (instantiated by the B3_PROFILE macro)
+	b3ProfileZone(const char* name)
+	{ 
+		b3EnterProfileZone( name ); 
+	}
+	~b3ProfileZone()
+	{ 
+		b3LeaveProfileZone(); 
+	}
+#define	B3_PROFILE( name )			b3ProfileZone __profile( name )
+#else //B3_NO_PROFILE
+#define	B3_PROFILE( name )
+#define b3StartProfile(a)
+#define b3StopProfile
+#endif //#ifndef B3_NO_PROFILE
+typedef void (b3PrintfFunc)(const char* msg);
+typedef void (b3WarningMessageFunc)(const char* msg);
+typedef void (b3ErrorMessageFunc)(const char* msg);
+typedef void (b3EnterProfileZoneFunc)(const char* msg);
+typedef void (b3LeaveProfileZoneFunc)();
+///The developer can route b3Printf output using their own implementation
+void b3SetCustomPrintfFunc(b3PrintfFunc* printfFunc);
+void b3SetCustomWarningMessageFunc(b3WarningMessageFunc* warningMsgFunc);
+void b3SetCustomErrorMessageFunc(b3ErrorMessageFunc* errorMsgFunc);
+///Set custom profile zone functions (zones can be nested)
+void b3SetCustomEnterProfileZoneFunc(b3EnterProfileZoneFunc* enterFunc);
+void b3SetCustomLeaveProfileZoneFunc(b3LeaveProfileZoneFunc* leaveFunc);
+///Don't use those internal functions directly, use the b3Printf or b3SetCustomPrintfFunc instead (or warning/error version)
+void b3OutputPrintfVarArgsInternal(const char *str, ...);
+void b3OutputWarningMessageVarArgsInternal(const char *str, ...);
+void b3OutputErrorMessageVarArgsInternal(const char *str, ...);
+#ifdef __cplusplus
+    }
\ No newline at end of file
diff --git a/src/bullet/Bullet3Common/b3Matrix3x3.h b/src/bullet/Bullet3Common/b3Matrix3x3.h
new file mode 100644
index 00000000..89b57cf5
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3Matrix3x3.h
@@ -0,0 +1,1362 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef	B3_MATRIX3x3_H
+#define B3_MATRIX3x3_H
+#include "b3Vector3.h"
+#include "b3Quaternion.h"
+#include <stdio.h>
+#ifdef B3_USE_SSE
+//const __m128 B3_ATTRIBUTE_ALIGNED16(b3v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f}; // XOR-ed into a vector in setRotation() to change the sign of its first element
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v1000) = {1.0f, 0.0f, 0.0f, 0.0f}; // the three rows of the identity matrix, used by setIdentity() and getIdentity()
+const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
+const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
+#define b3Matrix3x3Data	b3Matrix3x3DoubleData // NOTE(review): defined twice back to back here; presumably each line sits in one branch of a precision #if that is not visible in this excerpt - confirm
+#define b3Matrix3x3Data	b3Matrix3x3FloatData
+/**@brief The b3Matrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with b3Quaternion, b3Transform and b3Vector3.
+* Make sure to only include a pure orthogonal matrix without scaling. */
+B3_ATTRIBUTE_ALIGNED16(class) b3Matrix3x3 {
+	///Data storage for the matrix, each vector is a row of the matrix
+	b3Vector3 m_el[3];
+	/** @brief No-initialization constructor: the body is empty, so m_el is not explicitly set here */
+	b3Matrix3x3 () {}
+	//		explicit b3Matrix3x3(const b3Scalar *m) { setFromOpenGLSubMatrix(m); }
+	/**@brief Constructor from Quaternion; explicit, and forwards to setRotation(q) */
+	explicit b3Matrix3x3(const b3Quaternion& q) { setRotation(q); }
+	/*
+	template <typename b3Scalar>
+	Matrix3x3(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
+	{ 
+	setEulerYPR(yaw, pitch, roll);
+	}
+	*/
+	/** @brief Build the matrix from nine scalars given in row-major order
+	*  (xx xy xz form the first row, yx yy yz the second, zx zy zz the third) */
+	b3Matrix3x3(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz,
+		const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz,
+		const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz)
+	{
+		// Same three row assignments that setValue() performs.
+		m_el[0].setValue(xx, xy, xz);
+		m_el[1].setValue(yx, yy, yz);
+		m_el[2].setValue(zx, zy, zz);
+	}
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+	B3_FORCE_INLINE b3Matrix3x3 (const b3SimdFloat4 v0, const b3SimdFloat4 v1, const b3SimdFloat4 v2 ) // SIMD constructor: v0, v1 and v2 are stored as rows 0, 1 and 2
+	{
+        m_el[0].mVec128 = v0;
+        m_el[1].mVec128 = v1;
+        m_el[2].mVec128 = v2;
+	}
+	/** @brief Build the matrix from three row vectors
+	*  @param v0 First row
+	*  @param v1 Second row
+	*  @param v2 Third row */
+	B3_FORCE_INLINE b3Matrix3x3 (const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2 ) 
+	{
+		const b3Vector3* const rows[3] = { &v0, &v1, &v2 };
+		for (int r = 0; r < 3; ++r)
+		{
+			m_el[r] = *rows[r];
+		}
+	}
+	// Copy constructor (SIMD variant): copies each row through its mVec128 member
+	B3_FORCE_INLINE b3Matrix3x3(const b3Matrix3x3& rhs)
+	{
+		m_el[0].mVec128 = rhs.m_el[0].mVec128;
+		m_el[1].mVec128 = rhs.m_el[1].mVec128;
+		m_el[2].mVec128 = rhs.m_el[2].mVec128;
+	}
+	// Assignment operator (SIMD variant): copies each row through its mVec128 member and returns *this
+	B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& m) 
+	{
+		m_el[0].mVec128 = m.m_el[0].mVec128;
+		m_el[1].mVec128 = m.m_el[1].mVec128;
+		m_el[2].mVec128 = m.m_el[2].mVec128;
+		return *this;
+	}
+	/** @brief Copy constructor: duplicates the three row vectors of other */
+	B3_FORCE_INLINE b3Matrix3x3 (const b3Matrix3x3& other)
+	{
+		for (int row = 0; row < 3; ++row)
+		{
+			m_el[row] = other.m_el[row];
+		}
+	}
+	/** @brief Assignment operator: copies the three row vectors of other
+	*  @return Reference to this matrix */
+	B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& other)
+	{
+		for (int row = 0; row < 3; ++row)
+		{
+			m_el[row] = other.m_el[row];
+		}
+		return *this;
+	}
+	/** @brief Get a column of the matrix as a new vector
+	*  The matrix is stored by rows, so the column is gathered from element i of each row.
+	*  @param i Column number, 0 indexed
+	*  @return The column, by value */
+	B3_FORCE_INLINE b3Vector3 getColumn(int i) const
+	{
+		const b3Scalar top = m_el[0][i];
+		const b3Scalar middle = m_el[1][i];
+		const b3Scalar bottom = m_el[2][i];
+		return b3MakeVector3(top, middle, bottom);
+	}
+	/** @brief Read-only access to one row of the matrix
+	*  @param i Row number, 0 indexed; the range 0..2 is checked with b3FullAssert
+	*  @return Const reference to the stored row vector */
+	B3_FORCE_INLINE const b3Vector3& getRow(int i) const
+	{
+		b3FullAssert(0 <= i && i < 3);
+		const b3Vector3& row = m_el[i];
+		return row;
+	}
+	/** @brief Mutable access to one row of the matrix
+	*  @param i Row number, 0 indexed; the range 0..2 is checked with b3FullAssert
+	*  @return Reference to the stored row vector, so writes go straight into the matrix */
+	B3_FORCE_INLINE b3Vector3&  operator[](int i)
+	{
+		b3FullAssert(0 <= i && i < 3);
+		b3Vector3& row = m_el[i];
+		return row;
+	}
+	/** @brief Read-only indexed access to one row of the matrix
+	*  @param i Row number, 0 indexed; the range 0..2 is checked with b3FullAssert
+	*  @return Const reference to the stored row vector */
+	B3_FORCE_INLINE const b3Vector3& operator[](int i) const
+	{
+		b3FullAssert(0 <= i && i < 3);
+		const b3Vector3& row = m_el[i];
+		return row;
+	}
+	/** @brief Multiply by the target matrix on the right
+	*  @param m Rotation matrix to be applied 
+	*  @return Reference to this matrix
+	* Equivalent to this = this * m */
+	b3Matrix3x3& operator*=(const b3Matrix3x3& m); 
+	/** @brief Add the target matrix element by element
+	*  @param m matrix to be added 
+	*  @return Reference to this matrix
+	* Equivalent to this = this + m */
+	b3Matrix3x3& operator+=(const b3Matrix3x3& m); 
+	/** @brief Subtract the target matrix element by element
+	*  @param m matrix to be subtracted 
+	*  @return Reference to this matrix
+	* Equivalent to this = this - m */
+	b3Matrix3x3& operator-=(const b3Matrix3x3& m); 
+	/** @brief Set from the rotational part of a 4x4 OpenGL matrix
+	*  Row r of this matrix is read from m[r], m[r + 4] and m[r + 8], so indices up to 10 are accessed.
+	*  @param m A pointer to the beginning of the array of scalars */
+	void setFromOpenGLSubMatrix(const b3Scalar *m)
+	{
+		for (int row = 0; row < 3; ++row)
+		{
+			m_el[row].setValue(m[row], m[row + 4], m[row + 8]);
+		}
+	}
+	/** @brief Set all nine values of the matrix explicitly, in row-major order
+	*  @param xx Top left
+	*  @param xy Top middle
+	*  @param xz Top right
+	*  @param yx Middle left
+	*  @param yy Middle middle
+	*  @param yz Middle right
+	*  @param zx Bottom left
+	*  @param zy Bottom middle
+	*  @param zz Bottom right */
+	void setValue(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz, 
+		const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz, 
+		const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz)
+	{
+		b3Vector3& topRow = m_el[0];
+		topRow.setValue(xx, xy, xz);
+		b3Vector3& middleRow = m_el[1];
+		middleRow.setValue(yx, yy, yz);
+		b3Vector3& bottomRow = m_el[2];
+		bottomRow.setValue(zx, zy, zz);
+	}
+	/** @brief Set the matrix from a quaternion
+	*  Every term is scaled by 2 / q.length2(), and a zero length2 is checked with b3FullAssert.
+	*  @param q The Quaternion to match */  
+	void setRotation(const b3Quaternion& q) 
+	{
+		b3Scalar d = q.length2();
+		b3FullAssert(d != b3Scalar(0.0));
+		b3Scalar s = b3Scalar(2.0) / d; // common scale factor applied to every product below
+    #if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+        __m128	vs, Q = q.get128();
+		__m128i Qi = b3CastfTo128i(Q);
+        __m128	Y, Z;
+        __m128	V1, V2, V3;
+        __m128	V11, V21, V31;
+        __m128	NQ = _mm_xor_ps(Q, b3vMzeroMask);
+		__m128i NQi = b3CastfTo128i(NQ);
+        V1 = b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(1,0,2,3)));	// Y X Z W
+		V2 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(0,0,1,3));     // -X -X  Y  W
+        V3 = b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(2,1,0,3)));	// Z Y X W
+        V1 = _mm_xor_ps(V1, b3vMPPP);	//	change the sign of the first element
+        V11	= b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(1,1,0,3)));	// Y Y X W
+		V21 = _mm_unpackhi_ps(Q, Q);                    //  Z  Z  W  W
+		V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(0,2,0,3));	//  X  Z -X -W
+		V2 = V2 * V1;	//
+		V1 = V1 * V11;	//
+		V3 = V3 * V31;	//
+        V11 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(2,3,1,3));	//	-Z -W  Y  W
+		V11 = V11 * V21;	//
+        V21 = _mm_xor_ps(V21, b3vMPPP);	//	change the sign of the first element
+		V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(3,3,1,3));	//	 W  W -Y -W
+        V31 = _mm_xor_ps(V31, b3vMPPP);	//	change the sign of the first element
+		Y = b3CastiTo128f(_mm_shuffle_epi32 (NQi, B3_SHUFFLE(3,2,0,3)));	// -W -Z -X -W
+		Z = b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(1,0,1,3)));	//  Y  X  Y  W
+		vs = _mm_load_ss(&s);
+		V21 = V21 * Y;
+		V31 = V31 * Z;
+		V1 = V1 + V11;
+        V2 = V2 + V21;
+        V3 = V3 + V31;
+        vs = b3_splat3_ps(vs, 0);
+            //	s ready
+        V1 = V1 * vs;
+        V2 = V2 * vs;
+        V3 = V3 * vs;
+        V1 = V1 + b3v1000; // add the identity rows to the scaled products
+        V2 = V2 + b3v0100;
+        V3 = V3 + b3v0010;
+        m_el[0] = b3MakeVector3(V1); 
+        m_el[1] = b3MakeVector3(V2);
+        m_el[2] = b3MakeVector3(V3);
+    #else    
+		b3Scalar xs = q.getX() * s,   ys = q.getY() * s,   zs = q.getZ() * s;
+		b3Scalar wx = q.getW() * xs,  wy = q.getW() * ys,  wz = q.getW() * zs;
+		b3Scalar xx = q.getX() * xs,  xy = q.getX() * ys,  xz = q.getX() * zs;
+		b3Scalar yy = q.getY() * ys,  yz = q.getY() * zs,  zz = q.getZ() * zs;
+		setValue(
+            b3Scalar(1.0) - (yy + zz), xy - wz, xz + wy,
+			xy + wz, b3Scalar(1.0) - (xx + zz), yz - wx,
+			xz - wy, yz + wx, b3Scalar(1.0) - (xx + yy));
+	#endif
+    }
+	/** @brief Set the matrix from yaw, pitch and roll angles
+	*  Forwards to setEulerZYX(roll, pitch, yaw), so with that function's parameter meaning:
+	*  @param yaw Passed as eulerZ (rotation about the Z axis)
+	*  @param pitch Passed as eulerY (rotation about the Y axis)
+	*  @param roll Passed as eulerX (rotation about the X axis)
+	*/
+	void setEulerYPR(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll) 
+	{
+		setEulerZYX(roll, pitch, yaw);
+	}
+	/** @brief Set the matrix from euler angles YPR around ZYX axes
+	* @param eulerX Roll about X axis
+	* @param eulerY Pitch around Y axis
+	* @param eulerZ Yaw about Z axis
+	* 
+	* The angles produce a rotation matrix in which the euler rotations are
+	* applied in ZYX order, i.e. a vector is first rotated about X, then about Y
+	* and finally about Z.
+	**/
+	void setEulerZYX(b3Scalar eulerX,b3Scalar eulerY,b3Scalar eulerZ) { 
+		///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code
+		const b3Scalar cosX ( b3Cos(eulerX));
+		const b3Scalar sinX ( b3Sin(eulerX));
+		const b3Scalar cosY ( b3Cos(eulerY));
+		const b3Scalar sinY ( b3Sin(eulerY));
+		const b3Scalar cosZ ( b3Cos(eulerZ));
+		const b3Scalar sinZ ( b3Sin(eulerZ));
+		// Products of the X and Z terms that appear more than once below.
+		const b3Scalar cosXcosZ = cosX * cosZ;
+		const b3Scalar cosXsinZ = cosX * sinZ;
+		const b3Scalar sinXcosZ = sinX * cosZ;
+		const b3Scalar sinXsinZ = sinX * sinZ;
+		setValue(cosY * cosZ, sinY * sinXcosZ - cosXsinZ, sinY * cosXcosZ + sinXsinZ,
+			cosY * sinZ, sinY * sinXsinZ + cosXcosZ, sinY * cosXsinZ - sinXcosZ,
+			-sinY,       cosY * sinX,                 cosY * cosX);
+	}
+	/**@brief Set the matrix to the identity
+	* The SIMD build assigns the three constant identity rows; the scalar build calls setValue().
+	* NOTE(review): the directives separating the two variants are not visible in this excerpt. */
+	void setIdentity()
+	{ 
+#if (defined(B3_USE_SSE_IN_API)&& defined (B3_USE_SSE)) || defined(B3_USE_NEON)
+			m_el[0] = b3MakeVector3(b3v1000); 
+			m_el[1] = b3MakeVector3(b3v0100);
+			m_el[2] = b3MakeVector3(b3v0010);
+		setValue(b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0), 
+			b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0), 
+			b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0)); 
+	}
+	static const b3Matrix3x3&	getIdentity() // returns a reference to a function-local static identity matrix
+	{
+#if (defined(B3_USE_SSE_IN_API)&& defined (B3_USE_SSE)) || defined(B3_USE_NEON)
+        static const b3Matrix3x3 
+        identityMatrix(b3v1000, b3v0100, b3v0010); // SIMD variant: built from the constant identity rows
+		static const b3Matrix3x3 
+        identityMatrix(
+            b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0), 
+			b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0), 
+			b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0)); // scalar variant: row-major constructor
+		return identityMatrix;
+	}
+	/**@brief Fill the rotational part of an OpenGL matrix and clear the shear/perspective
+	* Writes the twelve scalars m[0]..m[11]: each group of four holds one column of this
+	* matrix followed by a zero (m[3], m[7] and m[11] are set to 0).
+	* NOTE(review): the SIMD variants store through a vector pointer cast of m; presumably m
+	* must then be 16-byte aligned - confirm against callers.
+	* @param m The array to be filled */
+	void getOpenGLSubMatrix(b3Scalar *m) const 
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+        __m128 v0 = m_el[0].mVec128;
+        __m128 v1 = m_el[1].mVec128;
+        __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
+        __m128 *vm = (__m128 *)m;
+        __m128 vT;
+        v2 = _mm_and_ps(v2, b3vFFF0fMask);  //  x2 y2 z2 0
+        vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
+        v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1
+        v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
+        v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
+        v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT)));	// z0 z1 z2 0
+        vm[0] = v0;
+        vm[1] = v1;
+        vm[2] = v2;
+#elif defined(B3_USE_NEON)
+        // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
+        static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
+        float32x4_t *vm = (float32x4_t *)m;
+        float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
+        float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
+        float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
+        float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
+        float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
+        float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
+        vm[0] = v0;
+        vm[1] = v1;
+        vm[2] = v2;
+		m[0]  = b3Scalar(m_el[0].getX()); // scalar variant: same layout, written element by element
+		m[1]  = b3Scalar(m_el[1].getX());
+		m[2]  = b3Scalar(m_el[2].getX());
+		m[3]  = b3Scalar(0.0); 
+		m[4]  = b3Scalar(m_el[0].getY());
+		m[5]  = b3Scalar(m_el[1].getY());
+		m[6]  = b3Scalar(m_el[2].getY());
+		m[7]  = b3Scalar(0.0); 
+		m[8]  = b3Scalar(m_el[0].getZ()); 
+		m[9]  = b3Scalar(m_el[1].getZ());
+		m[10] = b3Scalar(m_el[2].getZ());
+		m[11] = b3Scalar(0.0); 
+	}
+	/**@brief Get the matrix represented as a quaternion 
+	* Branches on the sign of the trace. When the trace is not positive, the index of the
+	* largest diagonal element selects which quaternion component is taken from the square root.
+	* @param q The quaternion which will be set */
+	void getRotation(b3Quaternion& q) const
+	{
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+        b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ();
+        b3Scalar s, x;
+        union {
+            b3SimdFloat4 vec;
+            b3Scalar f[4];
+        } temp; // filled with unscaled components; the common factor is applied once at the end
+        if (trace > b3Scalar(0.0)) 
+        {
+            x = trace + b3Scalar(1.0);
+            temp.f[0]=m_el[2].getY() - m_el[1].getZ();
+            temp.f[1]=m_el[0].getZ() - m_el[2].getX();
+            temp.f[2]=m_el[1].getX() - m_el[0].getY();
+            temp.f[3]=x;
+            //temp.f[3]= s * b3Scalar(0.5);
+        } 
+        else 
+        {
+            int i, j, k; // i is the index of the largest diagonal element, j and k the next two indices cyclically
+            if(m_el[0].getX() < m_el[1].getY()) 
+            { 
+                if( m_el[1].getY() < m_el[2].getZ() )
+                    { i = 2; j = 0; k = 1; }
+                else
+                    { i = 1; j = 2; k = 0; }
+            }
+            else
+            {
+                if( m_el[0].getX() < m_el[2].getZ())
+                    { i = 2; j = 0; k = 1; }
+                else
+                    { i = 0; j = 1; k = 2; }
+            }
+            x = m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0);
+            temp.f[3] = (m_el[k][j] - m_el[j][k]);
+            temp.f[j] = (m_el[j][i] + m_el[i][j]);
+            temp.f[k] = (m_el[k][i] + m_el[i][k]);
+            temp.f[i] = x;
+            //temp.f[i] = s * b3Scalar(0.5);
+        }
+        s = b3Sqrt(x);
+        q.set128(temp.vec);
+        s = b3Scalar(0.5) / s;
+        q *= s; // scale all four components by 0.5 / sqrt(x)
+		b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ(); // scalar variant starts here
+		b3Scalar temp[4];
+		if (trace > b3Scalar(0.0)) 
+		{
+			b3Scalar s = b3Sqrt(trace + b3Scalar(1.0));
+			temp[3]=(s * b3Scalar(0.5));
+			s = b3Scalar(0.5) / s;
+			temp[0]=((m_el[2].getY() - m_el[1].getZ()) * s);
+			temp[1]=((m_el[0].getZ() - m_el[2].getX()) * s);
+			temp[2]=((m_el[1].getX() - m_el[0].getY()) * s);
+		} 
+		else 
+		{
+			int i = m_el[0].getX() < m_el[1].getY() ? 
+				(m_el[1].getY() < m_el[2].getZ() ? 2 : 1) :
+				(m_el[0].getX() < m_el[2].getZ() ? 2 : 0); // index of the largest diagonal element
+			int j = (i + 1) % 3;  
+			int k = (i + 2) % 3;
+			b3Scalar s = b3Sqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0));
+			temp[i] = s * b3Scalar(0.5);
+			s = b3Scalar(0.5) / s;
+			temp[3] = (m_el[k][j] - m_el[j][k]) * s;
+			temp[j] = (m_el[j][i] + m_el[i][j]) * s;
+			temp[k] = (m_el[k][i] + m_el[i][k]) * s;
+		}
+		q.setValue(temp[0],temp[1],temp[2],temp[3]);
+	}
+	/**@brief Get the matrix represented as yaw, pitch and roll, roundtrip with setEulerYPR
+	* The angles are read from the elements that setEulerZYX() fills in, so:
+	* @param yaw Receives atan2(m[1][0], m[0][0]), the rotation around the Z axis
+	* @param pitch Receives asin(-m[2][0]), the rotation around the Y axis
+	* @param roll Receives atan2(m[2][1], m[2][2]), the rotation around the X axis
+	* When |pitch| compares exactly equal to B3_HALF_PI, yaw and roll are each shifted by B3_PI
+	* (subtracted when positive, added otherwise). */	
+	void getEulerYPR(b3Scalar& yaw, b3Scalar& pitch, b3Scalar& roll) const
+	{
+		// first use the normal calculus
+		yaw = b3Scalar(b3Atan2(m_el[1].getX(), m_el[0].getX()));
+		pitch = b3Scalar(b3Asin(-m_el[2].getX()));
+		roll = b3Scalar(b3Atan2(m_el[2].getY(), m_el[2].getZ()));
+		// on pitch = +/-HalfPI
+		if (b3Fabs(pitch)==B3_HALF_PI) // NOTE(review): exact floating-point equality; only triggers when asin returns exactly +/-B3_HALF_PI
+		{
+			if (yaw>0)
+				yaw-=B3_PI;
+			else
+				yaw+=B3_PI;
+			if (roll>0)
+				roll-=B3_PI;
+			else
+				roll+=B3_PI;
+		}
+	};
+	/**@brief Get the matrix represented as euler angles around ZYX
+	* Inverse of setEulerZYX(): the matrix is read as Rz(yaw) * Ry(pitch) * Rx(roll).
+	* @param yaw Receives the yaw, the rotation around the Z axis
+	* @param pitch Receives the pitch, the rotation around the Y axis
+	* @param roll Receives the roll, the rotation around the X axis
+	* @param solution_number Which of the two possible solutions to return: 1 selects the first,
+	* any other value selects the second
+	*
+	* When |m[2][0]| >= 1 the pitch is +/-90 degrees (gimbal lock) and only a combination of yaw
+	* and roll is determined. Yaw is then fixed at zero, the whole remaining rotation is reported
+	* as roll, and both solutions are identical.
+	*/	
+	void getEulerZYX(b3Scalar& yaw, b3Scalar& pitch, b3Scalar& roll, unsigned int solution_number = 1) const
+	{
+		struct Euler
+		{
+			b3Scalar yaw;
+			b3Scalar pitch;
+			b3Scalar roll;
+		};
+		Euler euler_out;
+		Euler euler_out2; //second solution
+		// setEulerZYX() stores -sin(pitch) in m_el[2].getX(), so |value| >= 1 means cos(pitch) == 0.
+		if (b3Fabs(m_el[2].getX()) >= 1)
+		{
+			euler_out.yaw = 0;
+			if (m_el[2].getX() < 0)  // sin(pitch) == +1: gimbal locked up
+			{
+				// With cos(pitch) == 0 and sin(pitch) == 1 the first row is
+				// (0, sin(roll - yaw), cos(roll - yaw)); yaw is fixed at 0.
+				euler_out.pitch = B3_PI / b3Scalar(2.0);
+				euler_out.roll = b3Atan2(m_el[0].getY(), m_el[0].getZ());
+			}
+			else // sin(pitch) == -1: gimbal locked down
+			{
+				// Here the first row is (0, -sin(roll + yaw), -cos(roll + yaw)).
+				euler_out.pitch = -B3_PI / b3Scalar(2.0);
+				euler_out.roll = b3Atan2(-m_el[0].getY(), -m_el[0].getZ());
+			}
+			euler_out2 = euler_out;
+		}
+		else
+		{
+			euler_out.pitch = - b3Asin(m_el[2].getX());
+			euler_out2.pitch = B3_PI - euler_out.pitch;
+			// Dividing by cos(pitch) keeps the atan2 arguments in the right quadrant
+			// for each of the two pitch candidates.
+			const b3Scalar cosPitch = b3Cos(euler_out.pitch);
+			const b3Scalar cosPitch2 = b3Cos(euler_out2.pitch);
+			euler_out.roll = b3Atan2(m_el[2].getY()/cosPitch, 
+				m_el[2].getZ()/cosPitch);
+			euler_out2.roll = b3Atan2(m_el[2].getY()/cosPitch2, 
+				m_el[2].getZ()/cosPitch2);
+			euler_out.yaw = b3Atan2(m_el[1].getX()/cosPitch, 
+				m_el[0].getX()/cosPitch);
+			euler_out2.yaw = b3Atan2(m_el[1].getX()/cosPitch2, 
+				m_el[0].getX()/cosPitch2);
+		}
+		const Euler& chosen = (solution_number == 1) ? euler_out : euler_out2;
+		yaw = chosen.yaw;
+		pitch = chosen.pitch;
+		roll = chosen.roll;
+	}
+	/**@brief Create a scaled copy of the matrix 
+	* @param s Scaling vector; each element of s scales the matching column
+	* (column 0 by s.getX(), column 1 by s.getY(), column 2 by s.getZ())
+	* @return The scaled matrix; this matrix is not modified */
+	b3Matrix3x3 scaled(const b3Vector3& s) const
+	{
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+		return b3Matrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s); // SIMD variant: presumably a component-wise product per row - confirm in b3Vector3
+		return b3Matrix3x3(
+            m_el[0].getX() * s.getX(), m_el[0].getY() * s.getY(), m_el[0].getZ() * s.getZ(),
+			m_el[1].getX() * s.getX(), m_el[1].getY() * s.getY(), m_el[1].getZ() * s.getZ(),
+			m_el[2].getX() * s.getX(), m_el[2].getY() * s.getY(), m_el[2].getZ() * s.getZ());
+	}
+	/**@brief Return the determinant of the matrix */
+	b3Scalar            determinant() const;
+	/**@brief Return the adjoint of the matrix */
+	b3Matrix3x3 adjoint() const;
+	/**@brief Return the matrix with all values non negative */
+	b3Matrix3x3 absolute() const;
+	/**@brief Return the transpose of the matrix */
+	b3Matrix3x3 transpose() const;
+	/**@brief Return the inverse of the matrix; a zero determinant is checked with b3FullAssert */
+	b3Matrix3x3 inverse() const; 
+	b3Matrix3x3 transposeTimes(const b3Matrix3x3& m) const; // transpose of this matrix multiplied by m
+	b3Matrix3x3 timesTranspose(const b3Matrix3x3& m) const; // presumably this matrix multiplied by the transpose of m - definition not visible here
+	/** @brief Dot product of the first column of this matrix with v
+	*  @param v Vector to combine with the X elements of the three rows
+	*  @return m[0][x]*v.x + m[1][x]*v.y + m[2][x]*v.z */
+	B3_FORCE_INLINE b3Scalar tdotx(const b3Vector3& v) const 
+	{
+		const b3Vector3& row0 = m_el[0];
+		const b3Vector3& row1 = m_el[1];
+		const b3Vector3& row2 = m_el[2];
+		return row0.getX() * v.getX() + row1.getX() * v.getY() + row2.getX() * v.getZ();
+	}
+	/** @brief Dot product of the second column of this matrix with v
+	*  @param v Vector to combine with the Y elements of the three rows
+	*  @return m[0][y]*v.x + m[1][y]*v.y + m[2][y]*v.z */
+	B3_FORCE_INLINE b3Scalar tdoty(const b3Vector3& v) const 
+	{
+		const b3Vector3& row0 = m_el[0];
+		const b3Vector3& row1 = m_el[1];
+		const b3Vector3& row2 = m_el[2];
+		return row0.getY() * v.getX() + row1.getY() * v.getY() + row2.getY() * v.getZ();
+	}
+	/** @brief Dot product of the third column of this matrix with v
+	*  @param v Vector to combine with the Z elements of the three rows
+	*  @return m[0][z]*v.x + m[1][z]*v.y + m[2][z]*v.z */
+	B3_FORCE_INLINE b3Scalar tdotz(const b3Vector3& v) const 
+	{
+		const b3Vector3& row0 = m_el[0];
+		const b3Vector3& row1 = m_el[1];
+		const b3Vector3& row2 = m_el[2];
+		return row0.getZ() * v.getX() + row1.getZ() * v.getY() + row2.getZ() * v.getZ();
+	}
+	/**@brief Diagonalizes this matrix in place by the Jacobi method.
+	* @param rot stores the rotation from the coordinate system in which the matrix is diagonal to the original
+	* coordinate system, i.e., old_this = rot * new_this * rot^T. It is reset to the identity first.
+	* @param threshold Relative convergence tolerance, see maxSteps
+	* @param maxSteps Upper bound on the number of Jacobi steps. The iteration stops when all off-diagonal
+	* elements are less than the threshold multiplied by the sum of the absolute values of the diagonal,
+	* or when maxSteps have been executed. 
+	* 
+	* Note that this matrix is assumed to be symmetric. 
+	*/
+	void diagonalize(b3Matrix3x3& rot, b3Scalar threshold, int maxSteps)
+	{
+		rot.setIdentity();
+		for (int step = maxSteps; step > 0; step--)
+		{
+			// find off-diagonal element [p][q] with largest magnitude
+			int p = 0;
+			int q = 1;
+			int r = 2; // r is the remaining index, different from both p and q
+			b3Scalar max = b3Fabs(m_el[0][1]);
+			b3Scalar v = b3Fabs(m_el[0][2]);
+			if (v > max)
+			{
+				q = 2;
+				r = 1;
+				max = v;
+			}
+			v = b3Fabs(m_el[1][2]);
+			if (v > max)
+			{
+				p = 1;
+				q = 2;
+				r = 0;
+				max = v;
+			}
+			b3Scalar t = threshold * (b3Fabs(m_el[0][0]) + b3Fabs(m_el[1][1]) + b3Fabs(m_el[2][2]));
+			if (max <= t)
+			{
+				if (max <= B3_EPSILON * t)
+				{
+					return;
+				}
+				step = 1; // converged within threshold: finish this rotation, then the loop ends
+			}
+			// compute Jacobi rotation J which leads to a zero for element [p][q] 
+			b3Scalar mpq = m_el[p][q];
+			b3Scalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
+			b3Scalar theta2 = theta * theta;
+			b3Scalar cos; // NOTE(review): the locals cos and sin shadow the math function names within this scope
+			b3Scalar sin;
+			if (theta2 * theta2 < b3Scalar(10 / B3_EPSILON))
+			{
+				t = (theta >= 0) ? 1 / (theta + b3Sqrt(1 + theta2))
+					: 1 / (theta - b3Sqrt(1 + theta2));
+				cos = 1 / b3Sqrt(1 + t * t);
+				sin = cos * t;
+			}
+			else
+			{
+				// approximation for large theta-value, i.e., a nearly diagonal matrix
+				t = 1 / (theta * (2 + b3Scalar(0.5) / theta2));
+				cos = 1 - b3Scalar(0.5) * t * t;
+				sin = cos * t;
+			}
+			// apply rotation to matrix (this = J^T * this * J)
+			m_el[p][q] = m_el[q][p] = 0;
+			m_el[p][p] -= t * mpq;
+			m_el[q][q] += t * mpq;
+			b3Scalar mrp = m_el[r][p];
+			b3Scalar mrq = m_el[r][q];
+			m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
+			m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;
+			// apply rotation to rot (rot = rot * J)
+			for (int i = 0; i < 3; i++)
+			{
+				b3Vector3& row = rot[i];
+				mrp = row[p];
+				mrq = row[q];
+				row[p] = cos * mrp - sin * mrq;
+				row[q] = cos * mrq + sin * mrp;
+			}
+		}
+	}
+	/**@brief Calculate a matrix cofactor as a 2x2 determinant
+	* Returns m[r1][c1] * m[r2][c2] - m[r1][c2] * m[r2][c1].
+	* @param r1 The first row to use for calculating the cofactor
+	* @param c1 The first column to use for calculating the cofactor
+	* @param r2 The second row to use for calculating the cofactor
+	* @param c2 The second column to use for calculating the cofactor
+	* See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details
+	*/
+	b3Scalar cofac(int r1, int c1, int r2, int c2) const 
+	{
+		const b3Vector3& firstRow = m_el[r1];
+		const b3Vector3& secondRow = m_el[r2];
+		return firstRow[c1] * secondRow[c2] - firstRow[c2] * secondRow[c1];
+	}
+	void	serialize(struct	b3Matrix3x3Data& dataOut) const; // b3Matrix3x3Data is a macro naming either the double or the float data struct
+	void	serializeFloat(struct	b3Matrix3x3FloatData& dataOut) const;
+	void	deSerialize(const struct	b3Matrix3x3Data& dataIn);
+	void	deSerializeFloat(const struct	b3Matrix3x3FloatData& dataIn);
+	void	deSerializeDouble(const struct	b3Matrix3x3DoubleData& dataIn);
+B3_FORCE_INLINE b3Matrix3x3& // in-place product this = this * m, with SSE, NEON and scalar variants
+b3Matrix3x3::operator*=(const b3Matrix3x3& m)
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+    __m128 rv00, rv01, rv02;
+    __m128 rv10, rv11, rv12;
+    __m128 rv20, rv21, rv22;
+    __m128 mv0, mv1, mv2;
+    rv02 = m_el[0].mVec128;
+    rv12 = m_el[1].mVec128;
+    rv22 = m_el[2].mVec128;
+    mv0 = _mm_and_ps(m[0].mVec128, b3vFFF0fMask); // rows of m after masking with b3vFFF0fMask
+    mv1 = _mm_and_ps(m[1].mVec128, b3vFFF0fMask); 
+    mv2 = _mm_and_ps(m[2].mVec128, b3vFFF0fMask); 
+    // rv0
+    rv00 = b3_splat_ps(rv02, 0);
+    rv01 = b3_splat_ps(rv02, 1);
+    rv02 = b3_splat_ps(rv02, 2);
+    rv00 = _mm_mul_ps(rv00, mv0);
+    rv01 = _mm_mul_ps(rv01, mv1);
+    rv02 = _mm_mul_ps(rv02, mv2);
+    // rv1
+    rv10 = b3_splat_ps(rv12, 0);
+    rv11 = b3_splat_ps(rv12, 1);
+    rv12 = b3_splat_ps(rv12, 2);
+    rv10 = _mm_mul_ps(rv10, mv0);
+    rv11 = _mm_mul_ps(rv11, mv1);
+    rv12 = _mm_mul_ps(rv12, mv2);
+    // rv2
+    rv20 = b3_splat_ps(rv22, 0);
+    rv21 = b3_splat_ps(rv22, 1);
+    rv22 = b3_splat_ps(rv22, 2);
+    rv20 = _mm_mul_ps(rv20, mv0);
+    rv21 = _mm_mul_ps(rv21, mv1);
+    rv22 = _mm_mul_ps(rv22, mv2);
+    rv00 = _mm_add_ps(rv00, rv01);
+    rv10 = _mm_add_ps(rv10, rv11);
+    rv20 = _mm_add_ps(rv20, rv21);
+    m_el[0].mVec128 = _mm_add_ps(rv00, rv02); // each result row is the sum of the three scaled rows of m
+    m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
+    m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
+#elif defined(B3_USE_NEON)
+    float32x4_t rv0, rv1, rv2;
+    float32x4_t v0, v1, v2;
+    float32x4_t mv0, mv1, mv2;
+    v0 = m_el[0].mVec128;
+    v1 = m_el[1].mVec128;
+    v2 = m_el[2].mVec128;
+    mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask); 
+    mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask); 
+    mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask); 
+    rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
+    rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
+    rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
+    rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
+    rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
+    rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
+    rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
+    rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
+    rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
+    m_el[0].mVec128 = rv0;
+    m_el[1].mVec128 = rv1;
+    m_el[2].mVec128 = rv2;
+	setValue( // scalar variant: element (r, c) is column c of m dotted with row r of this
+        m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
+		m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
+		m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
+	return *this;
+B3_FORCE_INLINE b3Matrix3x3& // in-place element-wise sum this = this + m
+b3Matrix3x3::operator+=(const b3Matrix3x3& m)
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+    m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128; // SIMD variant: one vector add per row
+    m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
+    m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
+	setValue( // scalar variant: nine element sums
+		m_el[0][0]+m.m_el[0][0], 
+		m_el[0][1]+m.m_el[0][1],
+		m_el[0][2]+m.m_el[0][2],
+		m_el[1][0]+m.m_el[1][0], 
+		m_el[1][1]+m.m_el[1][1],
+		m_el[1][2]+m.m_el[1][2],
+		m_el[2][0]+m.m_el[2][0], 
+		m_el[2][1]+m.m_el[2][1],
+		m_el[2][2]+m.m_el[2][2]);
+	return *this;
+B3_FORCE_INLINE b3Matrix3x3 // returns a copy of m with every element multiplied by the scalar k
+operator*(const b3Matrix3x3& m, const b3Scalar & k)
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+    __m128 vk = b3_splat_ps(_mm_load_ss((float *)&k), 0x80); // NOTE(review): 0x80 is passed as the splat lane argument - confirm how b3_splat_ps interprets it
+    return b3Matrix3x3(
+                _mm_mul_ps(m[0].mVec128, vk), 
+                _mm_mul_ps(m[1].mVec128, vk), 
+                _mm_mul_ps(m[2].mVec128, vk)); 
+#elif defined(B3_USE_NEON)
+    return b3Matrix3x3(
+                vmulq_n_f32(m[0].mVec128, k),
+                vmulq_n_f32(m[1].mVec128, k),
+                vmulq_n_f32(m[2].mVec128, k)); 
+	return b3Matrix3x3( // scalar variant
+		m[0].getX()*k,m[0].getY()*k,m[0].getZ()*k,
+		m[1].getX()*k,m[1].getY()*k,m[1].getZ()*k,
+		m[2].getX()*k,m[2].getY()*k,m[2].getZ()*k);
+B3_FORCE_INLINE b3Matrix3x3 // returns the element-wise sum m1 + m2 as a new matrix
+operator+(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+	return b3Matrix3x3( // SIMD variant: one vector add per row
+        m1[0].mVec128 + m2[0].mVec128,
+        m1[1].mVec128 + m2[1].mVec128,
+        m1[2].mVec128 + m2[2].mVec128);
+	return b3Matrix3x3( // scalar variant: nine element sums in row-major order
+        m1[0][0]+m2[0][0], 
+        m1[0][1]+m2[0][1],
+        m1[0][2]+m2[0][2],
+        m1[1][0]+m2[1][0], 
+        m1[1][1]+m2[1][1],
+        m1[1][2]+m2[1][2],
+        m1[2][0]+m2[2][0], 
+        m1[2][1]+m2[2][1],
+        m1[2][2]+m2[2][2]);
+B3_FORCE_INLINE b3Matrix3x3 // returns the element-wise difference m1 - m2 as a new matrix
+operator-(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+	return b3Matrix3x3( // SIMD variant: one vector subtract per row
+        m1[0].mVec128 - m2[0].mVec128,
+        m1[1].mVec128 - m2[1].mVec128,
+        m1[2].mVec128 - m2[2].mVec128);
+	return b3Matrix3x3( // scalar variant: nine element differences in row-major order
+        m1[0][0]-m2[0][0], 
+        m1[0][1]-m2[0][1],
+        m1[0][2]-m2[0][2],
+        m1[1][0]-m2[1][0], 
+        m1[1][1]-m2[1][1],
+        m1[1][2]-m2[1][2],
+        m1[2][0]-m2[2][0], 
+        m1[2][1]-m2[2][1],
+        m1[2][2]-m2[2][2]);
+B3_FORCE_INLINE b3Matrix3x3& // in-place element-wise difference this = this - m
+b3Matrix3x3::operator-=(const b3Matrix3x3& m)
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+    m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128; // SIMD variant: one vector subtract per row
+    m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
+    m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
+	setValue( // scalar variant: nine element differences
+	m_el[0][0]-m.m_el[0][0], 
+	m_el[0][1]-m.m_el[0][1],
+	m_el[0][2]-m.m_el[0][2],
+	m_el[1][0]-m.m_el[1][0], 
+	m_el[1][1]-m.m_el[1][1],
+	m_el[1][2]-m.m_el[1][2],
+	m_el[2][0]-m.m_el[2][0], 
+	m_el[2][1]-m.m_el[2][1],
+	m_el[2][2]-m.m_el[2][2]);
+	return *this;
+B3_FORCE_INLINE b3Scalar // determinant, computed by passing the three rows to b3Triple
+b3Matrix3x3::determinant() const
+	return b3Triple((*this)[0], (*this)[1], (*this)[2]);
+B3_FORCE_INLINE b3Matrix3x3 // returns a copy of this matrix with the absolute value of every element
+b3Matrix3x3::absolute() const
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+    return b3Matrix3x3( // SSE variant: bitwise AND of each row with b3vAbsfMask
+            _mm_and_ps(m_el[0].mVec128, b3vAbsfMask),
+            _mm_and_ps(m_el[1].mVec128, b3vAbsfMask),
+            _mm_and_ps(m_el[2].mVec128, b3vAbsfMask));
+#elif defined(B3_USE_NEON)
+    return b3Matrix3x3( // NEON variant: bitwise AND of each row with b3v3AbsMask
+            (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, b3v3AbsMask),
+            (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, b3v3AbsMask),
+            (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, b3v3AbsMask));
+	return b3Matrix3x3( // scalar variant: b3Fabs on each of the nine elements
+            b3Fabs(m_el[0].getX()), b3Fabs(m_el[0].getY()), b3Fabs(m_el[0].getZ()),
+            b3Fabs(m_el[1].getX()), b3Fabs(m_el[1].getY()), b3Fabs(m_el[1].getZ()),
+            b3Fabs(m_el[2].getX()), b3Fabs(m_el[2].getY()), b3Fabs(m_el[2].getZ()));
+B3_FORCE_INLINE b3Matrix3x3 // returns the transpose as a new matrix; this matrix is not modified
+b3Matrix3x3::transpose() const 
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+    __m128 v0 = m_el[0].mVec128;
+    __m128 v1 = m_el[1].mVec128;
+    __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
+    __m128 vT;
+    v2 = _mm_and_ps(v2, b3vFFF0fMask);  //  x2 y2 z2 0
+    vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
+    v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1
+    v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
+    v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
+    v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT)));	// z0 z1 z2 0
+    return b3Matrix3x3( v0, v1, v2 );
+#elif defined(B3_USE_NEON)
+    // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
+    static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
+    float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
+    float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
+    float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
+    float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
+    float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
+    float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
+    return b3Matrix3x3( v0, v1, v2 ); 
+	return b3Matrix3x3( m_el[0].getX(), m_el[1].getX(), m_el[2].getX(), // scalar variant: row r of the result is column r of this matrix
+                        m_el[0].getY(), m_el[1].getY(), m_el[2].getY(),
+                        m_el[0].getZ(), m_el[1].getZ(), m_el[2].getZ());
+B3_FORCE_INLINE b3Matrix3x3 // Returns the matrix of cofac() terms; these are the same nine terms inverse() scales by 1/det.
+b3Matrix3x3::adjoint() const 
+	return b3Matrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
+		cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
+		cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
+B3_FORCE_INLINE b3Matrix3x3 // Returns the inverse: every cofac() term multiplied by 1/det.
+b3Matrix3x3::inverse() const
+	b3Vector3 co = b3MakeVector3(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));	// the three terms that pair with row 0
+	b3Scalar det = (*this)[0].dot(co);	// determinant as row 0 dotted with those terms
+	b3FullAssert(det != b3Scalar(0.0));	// a zero determinant would make the division below blow up
+	b3Scalar s = b3Scalar(1.0) / det;
+	return b3Matrix3x3(co.getX() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s,
+		co.getY() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
+		co.getZ() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
+B3_FORCE_INLINE b3Matrix3x3 // Returns transpose(*this) * m without building the transpose first.
+b3Matrix3x3::transposeTimes(const b3Matrix3x3& m) const
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+    // zeros w
+//    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
+    __m128 row = m_el[0].mVec128;
+    __m128 m0 = _mm_and_ps( m.getRow(0).mVec128, b3vFFF0fMask );
+    __m128 m1 = _mm_and_ps( m.getRow(1).mVec128, b3vFFF0fMask);
+    __m128 m2 = _mm_and_ps( m.getRow(2).mVec128, b3vFFF0fMask );
+    __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));	// broadcast x of this row, scale m's row 0
+    __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));	// broadcast y
+    __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));	// broadcast z
+    row = m_el[1].mVec128;	// accumulate the contribution of row 1
+    r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
+    r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
+    r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
+    row = m_el[2].mVec128;	// accumulate the contribution of row 2
+    r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
+    r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
+    r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
+    return b3Matrix3x3( r0, r1, r2 );
+#elif defined B3_USE_NEON
+    // zeros w
+    static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
+    float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask );
+    float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask );
+    float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask );
+    float32x4_t row = m_el[0].mVec128;
+    float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0);	// scale by x of this row
+    float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1);	// scale by y
+    float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0);	// scale by z
+    row = m_el[1].mVec128;
+    r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0);
+    r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1);
+    r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0);
+    row = m_el[2].mVec128;
+    r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0);
+    r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1);
+    r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0);
+    return b3Matrix3x3( r0, r1, r2 );
+    return b3Matrix3x3(	// scalar form: each entry sums this[k] component * m[k] component over the three rows k
+		m_el[0].getX() * m[0].getX() + m_el[1].getX() * m[1].getX() + m_el[2].getX() * m[2].getX(),
+		m_el[0].getX() * m[0].getY() + m_el[1].getX() * m[1].getY() + m_el[2].getX() * m[2].getY(),
+		m_el[0].getX() * m[0].getZ() + m_el[1].getX() * m[1].getZ() + m_el[2].getX() * m[2].getZ(),
+		m_el[0].getY() * m[0].getX() + m_el[1].getY() * m[1].getX() + m_el[2].getY() * m[2].getX(),
+		m_el[0].getY() * m[0].getY() + m_el[1].getY() * m[1].getY() + m_el[2].getY() * m[2].getY(),
+		m_el[0].getY() * m[0].getZ() + m_el[1].getY() * m[1].getZ() + m_el[2].getY() * m[2].getZ(),
+		m_el[0].getZ() * m[0].getX() + m_el[1].getZ() * m[1].getX() + m_el[2].getZ() * m[2].getX(),
+		m_el[0].getZ() * m[0].getY() + m_el[1].getZ() * m[1].getY() + m_el[2].getZ() * m[2].getY(),
+		m_el[0].getZ() * m[0].getZ() + m_el[1].getZ() * m[1].getZ() + m_el[2].getZ() * m[2].getZ());
+B3_FORCE_INLINE b3Matrix3x3 // Returns (*this) * transpose(m): entry (i, j) is row i of this dotted with row j of m.
+b3Matrix3x3::timesTranspose(const b3Matrix3x3& m) const
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+    __m128 a0 = m_el[0].mVec128;
+    __m128 a1 = m_el[1].mVec128;
+    __m128 a2 = m_el[2].mVec128;
+    b3Matrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
+    __m128 mx = mT[0].mVec128;
+    __m128 my = mT[1].mVec128;
+    __m128 mz = mT[2].mVec128;
+    __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));	// x of each row of this times mT row 0
+    __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
+    __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
+    r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));	// plus y times mT row 1
+    r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
+    r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
+    r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));	// plus z times mT row 2
+    r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
+    r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
+    return b3Matrix3x3( r0, r1, r2);
+#elif defined B3_USE_NEON
+    float32x4_t a0 = m_el[0].mVec128;
+    float32x4_t a1 = m_el[1].mVec128;
+    float32x4_t a2 = m_el[2].mVec128;
+    b3Matrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
+    float32x4_t mx = mT[0].mVec128;
+    float32x4_t my = mT[1].mVec128;
+    float32x4_t mz = mT[2].mVec128;
+    float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0);	// x lane of each row
+    float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0);
+    float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0);
+    r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1);	// y lane
+    r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1);
+    r2 = vmlaq_lane_f32( r2, my, vget_low_f32(a2), 1);
+    r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0);	// z lane
+    r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0);
+    r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0);
+    return b3Matrix3x3( r0, r1, r2 );
+	return b3Matrix3x3(	// scalar form: nine row-by-row dot products
+		m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
+		m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
+		m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
+B3_FORCE_INLINE b3Vector3 // Matrix times vector: each result component is one row of m dotted with v.
+operator*(const b3Matrix3x3& m, const b3Vector3& v) 
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+    return v.dot3(m[0], m[1], m[2]);	// presumably dot3 yields the three row dot products in one call; see b3Vector3
+	return b3MakeVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));	// scalar form
+B3_FORCE_INLINE b3Vector3	// Vector times matrix: v.x * m[0] + v.y * m[1] + v.z * m[2].
+operator*(const b3Vector3& v, const b3Matrix3x3& m)
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+    const __m128 vv = v.mVec128;
+    __m128 c0 = b3_splat_ps( vv, 0);	// v.x in every lane
+    __m128 c1 = b3_splat_ps( vv, 1);	// v.y in every lane
+    __m128 c2 = b3_splat_ps( vv, 2);	// v.z in every lane
+    c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, b3vFFF0fMask) );	// mask clears w of the matrix row first
+    c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, b3vFFF0fMask) );
+    c0 = _mm_add_ps(c0, c1);
+    c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, b3vFFF0fMask) );
+    return b3MakeVector3(_mm_add_ps(c0, c2));
+#elif defined(B3_USE_NEON)
+    const float32x4_t vv = v.mVec128;
+    const float32x2_t vlo = vget_low_f32(vv);	// (v.x, v.y)
+    const float32x2_t vhi = vget_high_f32(vv);	// (v.z, v.w)
+    float32x4_t c0, c1, c2;
+    c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask);
+    c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask);
+    c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask);
+    c0 = vmulq_lane_f32(c0, vlo, 0);	// row 0 * v.x
+    c1 = vmulq_lane_f32(c1, vlo, 1);	// row 1 * v.y
+    c2 = vmulq_lane_f32(c2, vhi, 0);	// row 2 * v.z
+    c0 = vaddq_f32(c0, c1);
+    c0 = vaddq_f32(c0, c2);
+    return b3MakeVector3(c0);
+	return b3MakeVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));	// scalar form; presumably tdotx/y/z dot v with the x/y/z column of m
+B3_FORCE_INLINE b3Matrix3x3 // Matrix product m1 * m2: result row i = m1[i].x * m2[0] + m1[i].y * m2[1] + m1[i].z * m2[2].
+operator*(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+    __m128 m10 = m1[0].mVec128;  
+    __m128 m11 = m1[1].mVec128;
+    __m128 m12 = m1[2].mVec128;
+    __m128 m2v = _mm_and_ps(m2[0].mVec128, b3vFFF0fMask);	// row 0 of m2 with w cleared
+    __m128 c0 = b3_splat_ps( m10, 0);
+    __m128 c1 = b3_splat_ps( m11, 0);
+    __m128 c2 = b3_splat_ps( m12, 0);
+    c0 = _mm_mul_ps(c0, m2v);
+    c1 = _mm_mul_ps(c1, m2v);
+    c2 = _mm_mul_ps(c2, m2v);
+    m2v = _mm_and_ps(m2[1].mVec128, b3vFFF0fMask);	// row 1 of m2
+    __m128 c0_1 = b3_splat_ps( m10, 1);
+    __m128 c1_1 = b3_splat_ps( m11, 1);
+    __m128 c2_1 = b3_splat_ps( m12, 1);
+    c0_1 = _mm_mul_ps(c0_1, m2v);
+    c1_1 = _mm_mul_ps(c1_1, m2v);
+    c2_1 = _mm_mul_ps(c2_1, m2v);
+    m2v = _mm_and_ps(m2[2].mVec128, b3vFFF0fMask);	// row 2 of m2
+    c0 = _mm_add_ps(c0, c0_1);
+    c1 = _mm_add_ps(c1, c1_1);
+    c2 = _mm_add_ps(c2, c2_1);
+    m10 = b3_splat_ps( m10, 2);	// the m1 row registers are reused to hold the z-lane products
+    m11 = b3_splat_ps( m11, 2);
+    m12 = b3_splat_ps( m12, 2);
+    m10 = _mm_mul_ps(m10, m2v);
+    m11 = _mm_mul_ps(m11, m2v);
+    m12 = _mm_mul_ps(m12, m2v);
+    c0 = _mm_add_ps(c0, m10);
+    c1 = _mm_add_ps(c1, m11);
+    c2 = _mm_add_ps(c2, m12);
+    return b3Matrix3x3(c0, c1, c2);
+#elif defined(B3_USE_NEON)
+    float32x4_t rv0, rv1, rv2;
+    float32x4_t v0, v1, v2;
+    float32x4_t mv0, mv1, mv2;
+    v0 = m1[0].mVec128;
+    v1 = m1[1].mVec128;
+    v2 = m1[2].mVec128;
+    mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, b3vFFF0Mask); 
+    mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, b3vFFF0Mask); 
+    mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, b3vFFF0Mask); 
+    rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);	// m2 row 0 scaled by the x lane of each m1 row
+    rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
+    rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
+    rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);	// plus m2 row 1 scaled by the y lane
+    rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
+    rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
+    rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);	// plus m2 row 2 scaled by the z lane
+    rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
+    rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
+	return b3Matrix3x3(rv0, rv1, rv2);
+	return b3Matrix3x3(	// scalar form built from m2.tdotx/y/z applied to each row of m1
+		m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]),
+		m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]),
+		m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2]));
+B3_FORCE_INLINE b3Matrix3x3 b3MultTransposeLeft(const b3Matrix3x3& m1, const b3Matrix3x3& m2) { // Returns transpose(m1) * m2, written out element by element.
+return b3Matrix3x3(
+m1[0][0] * m2[0][0] + m1[1][0] * m2[1][0] + m1[2][0] * m2[2][0],
+m1[0][0] * m2[0][1] + m1[1][0] * m2[1][1] + m1[2][0] * m2[2][1],
+m1[0][0] * m2[0][2] + m1[1][0] * m2[1][2] + m1[2][0] * m2[2][2],
+m1[0][1] * m2[0][0] + m1[1][1] * m2[1][0] + m1[2][1] * m2[2][0],
+m1[0][1] * m2[0][1] + m1[1][1] * m2[1][1] + m1[2][1] * m2[2][1],
+m1[0][1] * m2[0][2] + m1[1][1] * m2[1][2] + m1[2][1] * m2[2][2],
+m1[0][2] * m2[0][0] + m1[1][2] * m2[1][0] + m1[2][2] * m2[2][0],
+m1[0][2] * m2[0][1] + m1[1][2] * m2[1][1] + m1[2][2] * m2[2][1],
+m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
+/**@brief Equality operator between two matrices
+* Compares the x, y and z elements of all three rows exactly; the w lane is not part of the comparison.  */
+B3_FORCE_INLINE bool operator==(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
+#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+    __m128 c0, c1, c2;
+    c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
+    c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
+    c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
+    c0 = _mm_and_ps(c0, c1);	// a lane stays all-ones only if it matched in every row
+    c0 = _mm_and_ps(c0, c2);
+    return (0x7 == _mm_movemask_ps((__m128)c0));	// compares the whole mask with 0x7, i.e. the x, y and z bits set
+	return 
+    (   m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
+		m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
+		m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2] );
+///for serialization: the three matrix rows, filled row by row by b3Matrix3x3::serializeFloat
+struct	b3Matrix3x3FloatData
+	b3Vector3FloatData m_el[3];
+///for serialization: the three matrix rows, read row by row by b3Matrix3x3::deSerializeDouble
+struct	b3Matrix3x3DoubleData
+	b3Vector3DoubleData m_el[3];
+B3_FORCE_INLINE	void	b3Matrix3x3::serialize(struct	b3Matrix3x3Data& dataOut) const	// Writes the three rows into dataOut.m_el, delegating to each row's serialize().
+	for (int i=0;i<3;i++)
+		m_el[i].serialize(dataOut.m_el[i]);
+B3_FORCE_INLINE	void	b3Matrix3x3::serializeFloat(struct	b3Matrix3x3FloatData& dataOut) const	// Writes the three rows into dataOut.m_el via each row's serializeFloat().
+	for (int i=0;i<3;i++)
+		m_el[i].serializeFloat(dataOut.m_el[i]);
+B3_FORCE_INLINE	void	b3Matrix3x3::deSerialize(const struct	b3Matrix3x3Data& dataIn)	// Overwrites the three rows from dataIn.m_el via each row's deSerialize().
+	for (int i=0;i<3;i++)
+		m_el[i].deSerialize(dataIn.m_el[i]);
+B3_FORCE_INLINE	void	b3Matrix3x3::deSerializeFloat(const struct	b3Matrix3x3FloatData& dataIn)	// Overwrites the three rows from dataIn.m_el via each row's deSerializeFloat().
+	for (int i=0;i<3;i++)
+		m_el[i].deSerializeFloat(dataIn.m_el[i]);
+B3_FORCE_INLINE	void	b3Matrix3x3::deSerializeDouble(const struct	b3Matrix3x3DoubleData& dataIn)	// Overwrites the three rows from dataIn.m_el via each row's deSerializeDouble().
+	for (int i=0;i<3;i++)
+		m_el[i].deSerializeDouble(dataIn.m_el[i]);
+#endif //B3_MATRIX3x3_H
diff --git a/src/bullet/Bullet3Common/b3MinMax.h b/src/bullet/Bullet3Common/b3MinMax.h
new file mode 100644
index 00000000..73af23a4
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3MinMax.h
@@ -0,0 +1,71 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_GEN_MINMAX_H
+#define B3_GEN_MINMAX_H
+#include "b3Scalar.h"
+template <class T>
+B3_FORCE_INLINE const T& b3Min(const T& a, const T& b) // Returns a reference to the smaller argument; b when neither is smaller.
+  return a < b ? a : b ;	// the result aliases an argument, so do not keep it past a temporary's lifetime
+template <class T>
+B3_FORCE_INLINE const T& b3Max(const T& a, const T& b) // Returns a reference to the larger argument; b when neither is larger.
+  return  a > b ? a : b;	// the result aliases an argument, so do not keep it past a temporary's lifetime
+template <class T>
+B3_FORCE_INLINE const T& b3Clamped(const T& a, const T& lb, const T& ub) // Returns lb if a < lb, ub if ub < a, otherwise a (by reference).
+	return a < lb ? lb : (ub < a ? ub : a); 	// the lower bound is tested first, so lb wins if the bounds are inverted
+template <class T>
+B3_FORCE_INLINE void b3SetMin(T& a, const T& b) // In place: a becomes b when b is strictly smaller, otherwise a is untouched.
+    if (b < a) 
+	{
+		a = b;
+	}
+template <class T>
+B3_FORCE_INLINE void b3SetMax(T& a, const T& b) // In place: a becomes b when b is strictly larger, otherwise a is untouched.
+    if (a < b) 
+	{
+		a = b;
+	}
+template <class T>
+B3_FORCE_INLINE void b3Clamp(T& a, const T& lb, const T& ub) // In place: raises a to lb or lowers it to ub; values already inside the bounds are left alone.
+	if (a < lb) 
+	{
+		a = lb; 
+	}
+	else if (ub < a) 	// only tested when a was not below lb
+	{
+		a = ub;
+	}
+#endif //B3_GEN_MINMAX_H
diff --git a/src/bullet/Bullet3Common/b3PoolAllocator.h b/src/bullet/Bullet3Common/b3PoolAllocator.h
new file mode 100644
index 00000000..2fcdcf5b
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3PoolAllocator.h
@@ -0,0 +1,121 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3Scalar.h"
+#include "b3AlignedAllocator.h"
+///The b3PoolAllocator class allows to efficiently allocate a large pool of objects, instead of dynamically allocating them separately.
+///One buffer is cut into equally sized slots; free slots are chained together through a pointer stored inside each free slot.
+class b3PoolAllocator
+	int				m_elemSize;	// size in bytes of one slot
+	int				m_maxElements;	// number of slots in the buffer
+	int				m_freeCount;	// number of slots currently on the free list
+	void*			m_firstFree;	// head of the free list
+	unsigned char*	m_pool;	// start of the buffer obtained from b3AlignedAlloc
+	/**@brief Allocate the buffer and chain every slot onto the free list.
+	 * @param elemSize Size in bytes of one slot. A free slot stores the pointer to the next
+	 *        free slot in its first bytes, so slots smaller than sizeof(void*) would overlap.
+	 * @param maxElements Number of slots. A non-positive count, or a failed allocation,
+	 *        produces an empty pool (no free slots) instead of writing outside the buffer. */
+	b3PoolAllocator(int elemSize, int maxElements)
+		:m_elemSize(elemSize),
+		m_maxElements(maxElements)
+	{
+		m_pool = (unsigned char*) b3AlignedAlloc( static_cast<unsigned int>(m_elemSize*m_maxElements),16);
+		if (!m_pool || m_maxElements <= 0)
+		{
+			// Nothing to chain: with no slots (or no buffer) the linking loop below
+			// would run past the allocation, so leave the free list empty.
+			m_firstFree = 0;
+			m_freeCount = 0;
+			return;
+		}
+		unsigned char* p = m_pool;
+		m_firstFree = p;
+		m_freeCount = m_maxElements;
+		// Point every slot except the last one at its successor.
+		for (int remaining = m_maxElements; remaining > 1; --remaining) {
+			*(void**)p = (p + m_elemSize);
+			p += m_elemSize;
+		}
+		// The last slot terminates the list.
+		*(void**)p = 0;
+	}
+	~b3PoolAllocator()	// Releases the whole buffer at once; outstanding slots are not tracked or checked.
+	{
+		b3AlignedFree( m_pool);
+	}
+	int	getFreeCount() const	// Number of slots currently on the free list.
+	{
+		return m_freeCount;
+	}
+	int getUsedCount() const	// Number of slots handed out and not yet freed (capacity minus free count).
+	{
+		return m_maxElements - m_freeCount;
+	}
+	int getMaxCount() const	// Total number of slots the pool was created with.
+	{
+		return m_maxElements;
+	}
+	/**@brief Take one slot from the pool.
+	 * @param size Requested size in bytes; only checked (by assertion) against the slot size.
+	 * @return Pointer to a free slot, or 0 when the pool has no free slot left. */
+	void*	allocate(int size)
+	{
+		// release mode fix
+		(void)size;
+		b3Assert(!size || size<=m_elemSize);
+		b3Assert(m_freeCount>0);
+		if (!m_firstFree)
+		{
+			// Exhausted pool: report failure instead of dereferencing a null free-list head
+			// in builds where the assertion above does not stop execution.
+			return 0;
+		}
+		void* result = m_firstFree;
+		m_firstFree = *(void**)m_firstFree;	// each free slot stores the next free slot
+		--m_freeCount;
+		return result;
+	}
+	/**@brief Report whether ptr lies inside the pool's address range.
+	 * Null is never valid. Only the range is checked, not alignment to a slot boundary. */
+	bool validPtr(void* ptr)
+	{
+		if (!ptr)
+		{
+			return false;
+		}
+		const unsigned char* candidate = (unsigned char*)ptr;
+		const unsigned char* poolEnd = m_pool + m_maxElements * m_elemSize;
+		return candidate >= m_pool && candidate < poolEnd;
+	}
+	/**@brief Give a slot back to the pool by pushing it onto the front of the free list.
+	 * Passing null does nothing. */
+	void	freeMemory(void* ptr)
+	{
+		if (!ptr)
+		{
+			return;
+		}
+		b3Assert((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize);
+		void** slot = (void**)ptr;
+		*slot = m_firstFree;	// the freed slot now points at the previous head
+		m_firstFree = ptr;
+		++m_freeCount;
+	}
+	int	getElementSize() const	// Size in bytes of one slot, as passed to the constructor.
+	{
+		return m_elemSize;
+	}
+	unsigned char*	getPoolAddress()	// Start of the underlying buffer (mutable access).
+	{
+		return m_pool;
+	}
+	const unsigned char*	getPoolAddress() const	// Start of the underlying buffer (read-only access).
+	{
+		return m_pool;
+	}
diff --git a/src/bullet/Bullet3Common/b3QuadWord.h b/src/bullet/Bullet3Common/b3QuadWord.h
new file mode 100644
index 00000000..65c95819
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3QuadWord.h
@@ -0,0 +1,245 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3Scalar.h"
+#include "b3MinMax.h"
+#if defined (__CELLOS_LV2) && defined (__SPU__)	// NOTE(review): spelled __CELLOS_LV2 here but __CELLOS_LV2__ in the class body below; confirm which spelling is intended
+#include <altivec.h>
+/**@brief The b3QuadWord class is base class for b3Vector3 and b3Quaternion. 
+ * It stores four b3Scalar values; in SIMD builds the same storage is also visible as one 128-bit register (mVec128).
+ * Some issues under PS3 Linux with IBM 2.1 SDK, gcc compiler prevent from using aligned quadword.
+ */
+#ifndef USE_LIBSPE2
+B3_ATTRIBUTE_ALIGNED16(class) b3QuadWord
+class b3QuadWord
+#if defined (__SPU__) && defined (__CELLOS_LV2__)
+	union {	// Cell SPU layout: vector register overlaid on the float array
+		vec_float4 mVec128;
+		b3Scalar	m_floats[4];
+	};
+	vec_float4	get128() const	// Returns the four values as one vector register.
+	{
+		return mVec128;
+	}
+#else //__CELLOS_LV2__ __SPU__
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON) 
+	union {	// SSE/NEON layout: register, float array and named x,y,z,w share the same bytes
+		b3SimdFloat4 mVec128;
+		b3Scalar	m_floats[4];
+		struct {b3Scalar x,y,z,w;};
+	};
+	B3_FORCE_INLINE	b3SimdFloat4	get128() const	// Returns the four values as one SIMD register.
+	{
+		return mVec128;
+	}
+	B3_FORCE_INLINE	void	set128(b3SimdFloat4 v128)	// Overwrites all four values from one SIMD register.
+	{
+		mVec128 = v128;
+	}
+	union	// layout without a SIMD register member: float array and named x,y,z,w only
+	{
+		b3Scalar	m_floats[4];
+		struct {b3Scalar x,y,z,w;};
+	};
+#endif // B3_USE_SSE
+#endif //__CELLOS_LV2__ __SPU__
+	public:
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+	// Construct from a SIMD register; all four lanes are copied as they are.
+	B3_FORCE_INLINE b3QuadWord(const b3SimdFloat4 vec)
+	{
+		mVec128 = vec;
+	}
+	// Copy constructor: copies the whole 128-bit register in one assignment.
+	B3_FORCE_INLINE b3QuadWord(const b3QuadWord& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+	// Assignment operator: copies the whole 128-bit register and returns *this.
+	B3_FORCE_INLINE b3QuadWord& 
+	operator=(const b3QuadWord& v) 
+	{
+		mVec128 = v.mVec128;
+		return *this;
+	}
+  /**@brief Return the x value (m_floats[0]) by const reference */
+		B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; }
+  /**@brief Return the y value (m_floats[1]) by const reference */
+		B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; }
+  /**@brief Return the z value (m_floats[2]) by const reference */
+		B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; }
+  /**@brief Set the x value (m_floats[0]); the other components are untouched */
+		B3_FORCE_INLINE void	setX(b3Scalar _x) { m_floats[0] = _x;};
+  /**@brief Set the y value (m_floats[1]); the other components are untouched */
+		B3_FORCE_INLINE void	setY(b3Scalar _y) { m_floats[1] = _y;};
+  /**@brief Set the z value (m_floats[2]); the other components are untouched */
+		B3_FORCE_INLINE void	setZ(b3Scalar _z) { m_floats[2] = _z;};
+  /**@brief Set the w value (m_floats[3]); the other components are untouched */
+		B3_FORCE_INLINE void	setW(b3Scalar _w) { m_floats[3] = _w;};
+  /**@brief Implicit conversion to a pointer to the first of the four stored values, so q[i] indexes m_floats */
+	//B3_FORCE_INLINE b3Scalar&       operator[](int i)       { return (&m_floats[0])[i];	}      
+	//B3_FORCE_INLINE const b3Scalar& operator[](int i) const { return (&m_floats[0])[i]; }
+	///operator b3Scalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons.
+	B3_FORCE_INLINE	operator       b3Scalar *()       { return &m_floats[0]; }
+	B3_FORCE_INLINE	operator const b3Scalar *() const { return &m_floats[0]; }
+	B3_FORCE_INLINE	bool	operator==(const b3QuadWord& other) const	// True when all four components, w included, compare exactly equal.
+	{
+#ifdef B3_USE_SSE
+        return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));	// 0xf: all four lane bits set
+		return ((m_floats[3]==other.m_floats[3]) && 
+                (m_floats[2]==other.m_floats[2]) && 
+                (m_floats[1]==other.m_floats[1]) && 
+                (m_floats[0]==other.m_floats[0]));
+	}
+	B3_FORCE_INLINE	bool	operator!=(const b3QuadWord& other) const	// Logical negation of operator==.
+	{
+		return !(*this == other);
+	}
+  /**@brief Set x,y,z and zero w 
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z
+   */
+		B3_FORCE_INLINE void 	setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
+		{
+			m_floats[0]=_x;
+			m_floats[1]=_y;
+			m_floats[2]=_z;
+			m_floats[3] = 0.f;	// w is always reset by the three-argument form
+		}
+/*		void getValue(b3Scalar *m) const 
+		{
+			m[0] = m_floats[0];
+			m[1] = m_floats[1];
+			m[2] = m_floats[2];
+		}
+/**@brief Set all four values 
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z
+   * @param _w Value of w
+   */
+		B3_FORCE_INLINE void	setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z,const b3Scalar& _w)
+		{
+			m_floats[0]=_x;
+			m_floats[1]=_y;
+			m_floats[2]=_z;
+			m_floats[3]=_w;
+		}
+  /**@brief No initialization constructor: the body is empty, so the four values are left unset */
+		B3_FORCE_INLINE b3QuadWord()
+		//	:m_floats[0](b3Scalar(0.)),m_floats[1](b3Scalar(0.)),m_floats[2](b3Scalar(0.)),m_floats[3](b3Scalar(0.))
+		{
+		}
+  /**@brief Construct from three components; w is set to zero
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z
+   */
+		B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)		
+		{
+			m_floats[0] = _x;
+			m_floats[1] = _y;
+			m_floats[2] = _z;
+			m_floats[3] = 0.0f;
+		}
+/**@brief Construct from all four components
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z
+   * @param _w Value of w
+   */
+		B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z,const b3Scalar& _w) 
+		{
+			m_floats[0] = _x;
+			m_floats[1] = _y;
+			m_floats[2] = _z;
+			m_floats[3] = _w;
+		}
+  /**@brief Set each element to the max of the current values and the values of another b3QuadWord
+   * All four components, w included, are processed independently.
+   * @param other The other b3QuadWord to compare with 
+   */
+		B3_FORCE_INLINE void	setMax(const b3QuadWord& other)
+		{
+        #ifdef B3_USE_SSE
+            mVec128 = _mm_max_ps(mVec128, other.mVec128);
+        #elif defined(B3_USE_NEON)
+            mVec128 = vmaxq_f32(mVec128, other.mVec128);
+        #else
+        	b3SetMax(m_floats[0], other.m_floats[0]);
+			b3SetMax(m_floats[1], other.m_floats[1]);
+			b3SetMax(m_floats[2], other.m_floats[2]);
+			b3SetMax(m_floats[3], other.m_floats[3]);
+		#endif
+        }
+  /**@brief Set each element to the min of the current values and the values of another b3QuadWord
+   * All four components, w included, are processed independently.
+   * @param other The other b3QuadWord to compare with 
+   */
+		B3_FORCE_INLINE void	setMin(const b3QuadWord& other)
+		{
+        #ifdef B3_USE_SSE
+            mVec128 = _mm_min_ps(mVec128, other.mVec128);
+        #elif defined(B3_USE_NEON)
+            mVec128 = vminq_f32(mVec128, other.mVec128);
+        #else
+        	b3SetMin(m_floats[0], other.m_floats[0]);
+			b3SetMin(m_floats[1], other.m_floats[1]);
+			b3SetMin(m_floats[2], other.m_floats[2]);
+			b3SetMin(m_floats[3], other.m_floats[3]);
+		#endif
+        }
+#endif //B3_SIMD_QUADWORD_H
diff --git a/src/bullet/Bullet3Common/b3Quaternion.h b/src/bullet/Bullet3Common/b3Quaternion.h
new file mode 100644
index 00000000..c89f2cf3
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3Quaternion.h
@@ -0,0 +1,893 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3Vector3.h"
+#include "b3QuadWord.h"
+#ifdef B3_USE_SSE
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3vOnes) = {1.0f, 1.0f, 1.0f, 1.0f};	// 1.0 in all four lanes
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3vQInv) = {-0.0f, -0.0f, -0.0f, +0.0f};	// sign bit set in x, y, z only; presumably XOR-ed in to negate the vector part
+const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3vPPPM) = {+0.0f, +0.0f, +0.0f, -0.0f};	// sign bit set in w only; operator*= XORs it in to flip the sign of the last element
+/**@brief The b3Quaternion implements quaternion to perform linear algebra rotations in combination with b3Matrix3x3, b3Vector3 and b3Transform. 
+ * The four components are stored in the b3QuadWord base in x, y, z, w order. */
+class b3Quaternion : public b3QuadWord {
+  /**@brief No initialization constructor: nothing is written, so the components are left unset */
+	b3Quaternion() {}
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))|| defined(B3_USE_NEON) 
+	// Construct from a SIMD register; all four lanes are copied as they are.
+	B3_FORCE_INLINE b3Quaternion(const b3SimdFloat4 vec)
+	{
+		mVec128 = vec;
+	}
+	// Copy constructor: copies the whole 128-bit register in one assignment.
+	B3_FORCE_INLINE b3Quaternion(const b3Quaternion& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+	// Assignment operator: copies the whole 128-bit register and returns *this.
+	B3_FORCE_INLINE b3Quaternion& 
+	operator=(const b3Quaternion& v) 
+	{
+		mVec128 = v.mVec128;
+		return *this;
+	}
+	//		template <typename b3Scalar>
+	//		explicit Quaternion(const b3Scalar *v) : Tuple4<b3Scalar>(v) {}
+  /**@brief Constructor from the four components x, y, z, w; they are stored as given, with no normalisation */
+	b3Quaternion(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w) 
+		: b3QuadWord(_x, _y, _z, _w) 
+	{
+		//b3Assert(!((_x==1.f) && (_y==0.f) && (_z==0.f) && (_w==0.f)));
+	}
+  /**@brief Axis angle Constructor; forwards to setRotation()
+   * @param _axis The axis which the rotation is around
+   * @param _angle The magnitude of the rotation around the axis (Radians) */
+	b3Quaternion(const b3Vector3& _axis, const b3Scalar& _angle) 
+	{ 
+		setRotation(_axis, _angle); 
+	}
+  /**@brief Constructor from Euler angles
+   * @param yaw Angle around Y unless B3_EULER_DEFAULT_ZYX defined then Z
+   * @param pitch Angle around X unless B3_EULER_DEFAULT_ZYX defined then Y
+   * @param roll Angle around Z unless B3_EULER_DEFAULT_ZYX defined then X
+   * NOTE(review): as the body reads here, setEuler() and setEulerZYX() both run and the
+   * ZYX result overwrites the first; the parameter docs imply a B3_EULER_DEFAULT_ZYX switch
+   * between the two calls. Confirm the preprocessor guards are present in the real header. */
+	b3Quaternion(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
+	{ 
+		setEuler(yaw, pitch, roll); 
+		setEulerZYX(yaw, pitch, roll); 
+	}
+  /**@brief Set the rotation using axis angle notation 
+   * The axis does not have to be unit length (it is divided by its length) but must not be zero.
+   * @param axis The axis around which to rotate
+   * @param _angle The magnitude of the rotation in Radians */
+	void setRotation(const b3Vector3& axis, const b3Scalar& _angle)
+	{
+		b3Scalar d = axis.length();
+		b3Assert(d != b3Scalar(0.0));	// a zero-length axis would divide by zero below
+		b3Scalar s = b3Sin(_angle * b3Scalar(0.5)) / d;	// sin(angle/2) with the axis normalisation folded in
+		setValue(axis.getX() * s, axis.getY() * s, axis.getZ() * s, 
+			b3Cos(_angle * b3Scalar(0.5)));	// w = cos(angle/2)
+	}
+  /**@brief Set the quaternion using Euler angles
+   * All four components are overwritten from the half-angle sines and cosines.
+   * @param yaw Angle around Y
+   * @param pitch Angle around X
+   * @param roll Angle around Z */
+	void setEuler(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
+	{
+		b3Scalar halfYaw = b3Scalar(yaw) * b3Scalar(0.5);  
+		b3Scalar halfPitch = b3Scalar(pitch) * b3Scalar(0.5);  
+		b3Scalar halfRoll = b3Scalar(roll) * b3Scalar(0.5);  
+		b3Scalar cosYaw = b3Cos(halfYaw);
+		b3Scalar sinYaw = b3Sin(halfYaw);
+		b3Scalar cosPitch = b3Cos(halfPitch);
+		b3Scalar sinPitch = b3Sin(halfPitch);
+		b3Scalar cosRoll = b3Cos(halfRoll);
+		b3Scalar sinRoll = b3Sin(halfRoll);
+		setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,	// x
+			cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,	// y
+			sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,	// z
+			cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);	// w
+	}
+  /**@brief Set the quaternion using euler angles 
+   * All four components are overwritten from the half-angle sines and cosines.
+   * @param yaw Angle around Z
+   * @param pitch Angle around Y
+   * @param roll Angle around X */
+	void setEulerZYX(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
+	{
+		b3Scalar halfYaw = b3Scalar(yaw) * b3Scalar(0.5);  
+		b3Scalar halfPitch = b3Scalar(pitch) * b3Scalar(0.5);  
+		b3Scalar halfRoll = b3Scalar(roll) * b3Scalar(0.5);  
+		b3Scalar cosYaw = b3Cos(halfYaw);
+		b3Scalar sinYaw = b3Sin(halfYaw);
+		b3Scalar cosPitch = b3Cos(halfPitch);
+		b3Scalar sinPitch = b3Sin(halfPitch);
+		b3Scalar cosRoll = b3Cos(halfRoll);
+		b3Scalar sinRoll = b3Sin(halfRoll);
+		setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, //x
+                         cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw, //y
+                         cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, //z
+                         cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); //w (formerly yzx)
+	}
+  /**@brief Add another quaternion to this one, component by component (x, y, z and w)
+   * @param q The quaternion to add to this one
+   * @return Reference to this quaternion */
+	B3_FORCE_INLINE	b3Quaternion& operator+=(const b3Quaternion& q)
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		mVec128 = _mm_add_ps(mVec128, q.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vaddq_f32(mVec128, q.mVec128);
+		m_floats[0] += q.getX(); 
+        m_floats[1] += q.getY(); 
+        m_floats[2] += q.getZ(); 
+        m_floats[3] += q.m_floats[3];	// w is read from m_floats directly; no getter is used for it here
+		return *this;
+	}
+  /**@brief Subtract another quaternion from this one, component by component (x, y, z and w)
+   * @param q The quaternion to subtract from this one
+   * @return Reference to this quaternion */
+	b3Quaternion& operator-=(const b3Quaternion& q) 
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		mVec128 = _mm_sub_ps(mVec128, q.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vsubq_f32(mVec128, q.mVec128);
+		m_floats[0] -= q.getX(); 
+        m_floats[1] -= q.getY(); 
+        m_floats[2] -= q.getZ(); 
+        m_floats[3] -= q.m_floats[3];	// w is read from m_floats directly; no getter is used for it here
+        return *this;
+	}
+  /**@brief Scale this quaternion: all four components are multiplied by s
+   * @param s The scalar to scale by
+   * @return Reference to this quaternion */
+	b3Quaternion& operator*=(const b3Scalar& s)
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0);	//	(S S S S)
+		mVec128 = _mm_mul_ps(mVec128, vs);
+#elif defined(B3_USE_NEON)
+		mVec128 = vmulq_n_f32(mVec128, s);
+		m_floats[0] *= s; 
+        m_floats[1] *= s; 
+        m_floats[2] *= s; 
+        m_floats[3] *= s;
+		return *this;
+	}
+  /**@brief Multiply this quaternion by q on the right
+   * @param q The other quaternion 
+   * @return Reference to this quaternion, now holding the product
+   * Equivalent to this = this * q (quaternion multiplication does not commute, so the order matters).
+   * NOTE(review): the #else/#endif lines are not visible in this excerpt; the setValue(...) call
+   * after the NEON code is presumably the non-SIMD fallback -- confirm against the full header. */
+	b3Quaternion& operator*=(const b3Quaternion& q)
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128 vQ2 = q.get128();
+		__m128 A1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(0,1,2,0));
+		__m128 B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3,3,3,0));
+		A1 = A1 * B1;
+		__m128 A2 = b3_pshufd_ps(mVec128, B3_SHUFFLE(1,2,0,1));
+		__m128 B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1));
+		A2 = A2 * B2;
+		B1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(2,0,1,2));
+		B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2));
+		B1 = B1 * B2;	//	A3 *= B3
+		mVec128 = b3_splat_ps(mVec128, 3);	//	A0
+		mVec128 = mVec128 * vQ2;	//	A0 * B0
+		A1 = A1 + A2;	//	AB12
+		mVec128 = mVec128 - B1;	//	AB03 = AB0 - AB3 
+		A1 = _mm_xor_ps(A1, b3vPPPM);	//	change sign of the last element
+		mVec128 = mVec128+ A1;	//	AB03 + AB12
+#elif defined(B3_USE_NEON)     
+        float32x4_t vQ1 = mVec128;
+        float32x4_t vQ2 = q.get128();
+        float32x4_t A0, A1, B1, A2, B2, A3, B3;
+        float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+        {
+        float32x2x2_t tmp;
+        tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+        vQ1zx = tmp.val[0];
+        tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+        vQ2zx = tmp.val[0];
+        }
+        vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+        vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+        vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+        vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+        A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
+        B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+        A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+        B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+        A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+        B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+        A1 = vmulq_f32(A1, B1);
+        A2 = vmulq_f32(A2, B2);
+        A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+        A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); //	A0 * B0
+        A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+        A0 = vsubq_f32(A0, A3);	//	AB03 = AB0 - AB3 
+        //	change the sign of the last element
+        A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);	
+        A0 = vaddq_f32(A0, A1);	//	AB03 + AB12
+        mVec128 = A0;
+		setValue(	// all four arguments are evaluated from the old components before setValue stores them
+            m_floats[3] * q.getX() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.getZ() - m_floats[2] * q.getY(),
+			m_floats[3] * q.getY() + m_floats[1] * q.m_floats[3] + m_floats[2] * q.getX() - m_floats[0] * q.getZ(),
+			m_floats[3] * q.getZ() + m_floats[2] * q.m_floats[3] + m_floats[0] * q.getY() - m_floats[1] * q.getX(),
+			m_floats[3] * q.m_floats[3] - m_floats[0] * q.getX() - m_floats[1] * q.getY() - m_floats[2] * q.getZ());
+		return *this;
+	}
+  /**@brief Return the dot product between this quaternion and another
+   * @param q The other quaternion
+   * @return x*q.x + y*q.y + z*q.z + w*q.w (all four components take part)
+   * NOTE(review): the #else/#endif lines are not visible in this excerpt; the scalar return
+   * after the NEON code is presumably the non-SIMD fallback -- confirm against the full header. */
+	b3Scalar dot(const b3Quaternion& q) const
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128	vd;
+		vd = _mm_mul_ps(mVec128, q.mVec128);	// lane-wise products
+        __m128 t = _mm_movehl_ps(vd, vd);	// bring the upper two products down to the lower two lanes
+		vd = _mm_add_ps(vd, t);	// lanes 0 and 1 now hold two partial sums
+		t = _mm_shuffle_ps(vd, vd, 0x55);	// broadcast lane 1
+		vd = _mm_add_ss(vd, t);	// lane 0 = sum of all four products
+        return _mm_cvtss_f32(vd);
+#elif defined(B3_USE_NEON)
+		float32x4_t vd = vmulq_f32(mVec128, q.mVec128);	// lane-wise products
+		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));  	// pairwise add: two partial sums
+		x = vpadd_f32(x, x);	// add the two partial sums
+		return vget_lane_f32(x, 0);
+		return  m_floats[0] * q.getX() + 
+                m_floats[1] * q.getY() + 
+                m_floats[2] * q.getZ() + 
+                m_floats[3] * q.m_floats[3];
+	}
+  /**@brief Squared length (squared norm) of the quaternion
+   * @return The dot product of this quaternion with itself; no square root is taken */
+	b3Scalar length2() const
+	{
+		const b3Quaternion& self = *this;
+		return self.dot(self);
+	}
+  /**@brief Length (norm) of the quaternion
+   * @return The square root of length2() */
+	b3Scalar length() const
+	{
+		const b3Scalar squaredNorm = length2();
+		return b3Sqrt(squaredNorm);
+	}
+  /**@brief Normalize the quaternion in place
+   * Such that x^2 + y^2 + z^2 +w^2 = 1
+   * @return Reference to this quaternion after normalization
+   * The SSE path sums the squared lanes, takes the square root, forms 1/length and multiplies every lane by it.
+   * No check for zero length is visible in this function.
+   * NOTE(review): the #else/#endif lines are not visible in this excerpt; the second return
+   * (division by length()) is presumably the non-SSE branch -- confirm against the full header. */
+	b3Quaternion& normalize() 
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128	vd;
+		vd = _mm_mul_ps(mVec128, mVec128);	// squared components
+        __m128 t = _mm_movehl_ps(vd, vd);
+		vd = _mm_add_ps(vd, t);
+		t = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, t);	// lane 0 = squared length
+		vd = _mm_sqrt_ss(vd);	// lane 0 = length
+		vd = _mm_div_ss(b3vOnes, vd);	// lane 0 = b3vOnes lane 0 / length (presumably 1 / length -- b3vOnes is defined elsewhere)
+        vd = b3_pshufd_ps(vd, 0); // splat
+		mVec128 = _mm_mul_ps(mVec128, vd);
+		return *this;
+		return *this /= length();
+	}
+  /**@brief Return a scaled version of this quaternion
+   * @param s The scale factor
+   * @return A new quaternion whose four components are this quaternion's components times s
+   * NOTE(review): the #else/#endif lines are not visible in this excerpt; the scalar return
+   * after the NEON line is presumably the non-SIMD fallback -- confirm against the full header. */
+	B3_FORCE_INLINE b3Quaternion
+	operator*(const b3Scalar& s) const
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0x00);	//	(S S S S)
+		return b3Quaternion(_mm_mul_ps(mVec128, vs));
+#elif defined(B3_USE_NEON)
+		return b3Quaternion(vmulq_n_f32(mVec128, s));
+		return b3Quaternion(getX() * s, getY() * s, getZ() * s, m_floats[3] * s);
+	}
+  /**@brief Return an inversely scaled version of this quaternion
+   * @param s The divisor; asserted (via b3Assert) to be non-zero
+   * @return A new quaternion equal to this one multiplied by 1/s */
+	b3Quaternion operator/(const b3Scalar& s) const
+	{
+		b3Assert(s != b3Scalar(0.0));
+		// Divide once, then reuse the scalar multiplication operator
+		const b3Scalar reciprocal = b3Scalar(1.0) / s;
+		return *this * reciprocal;
+	}
+  /**@brief Inversely scale this quaternion in place
+   * @param s The divisor; asserted (via b3Assert) to be non-zero
+   * @return Reference to this quaternion after multiplying it by 1/s */
+	b3Quaternion& operator/=(const b3Scalar& s) 
+	{
+		b3Assert(s != b3Scalar(0.0));
+		// Divide once, then reuse the in-place scalar multiplication
+		const b3Scalar reciprocal = b3Scalar(1.0) / s;
+		*this *= reciprocal;
+		return *this;
+	}
+  /**@brief Return a normalized copy of this quaternion; this quaternion is left unchanged
+   * @return This quaternion divided by its length() */
+	b3Quaternion normalized() const 
+	{
+		const b3Scalar norm = length();
+		return *this / norm;
+	} 
+  /**@brief Return the angle between this quaternion and the other
+   * @param q The other quaternion
+   * @return The arc cosine of the dot product divided by the product of the two lengths
+   * The product of the lengths is asserted (via b3Assert) to be non-zero. */
+	b3Scalar angle(const b3Quaternion& q) const 
+	{
+		const b3Scalar normProduct = b3Sqrt(length2() * q.length2());
+		b3Assert(normProduct != b3Scalar(0.0));
+		const b3Scalar cosine = dot(q) / normProduct;
+		return b3Acos(cosine);
+	}
+  /**@brief Return the angle of rotation represented by this quaternion
+   * @return Twice the arc cosine of the w component */
+	b3Scalar getAngle() const 
+	{
+		return b3Scalar(2.) * b3Acos(m_floats[3]);
+	}
+	/**@brief Return the axis of the rotation represented by this quaternion
+	 * @return (x, y, z) scaled by 1/sqrt(1 - w^2); the arbitrary axis (1, 0, 0) when
+	 * 1 - w^2 is below 10 * B3_EPSILON, where the scale factor would divide by (nearly) zero */
+	b3Vector3 getAxis() const
+	{
+		const b3Scalar sinSquared = 1.f - m_floats[3] * m_floats[3];
+		const b3Scalar threshold = b3Scalar(10.) * B3_EPSILON;
+		if (sinSquared < threshold)
+		{
+			// Guard against dividing by zero: any axis will do here
+			return b3MakeVector3(1.0, 0.0, 0.0);
+		}
+		const b3Scalar invSin = 1.f / b3Sqrt(sinSquared);
+		return b3MakeVector3(m_floats[0] * invSin, m_floats[1] * invSin, m_floats[2] * invSin);
+	}
+	/**@brief Return the inverse of this quaternion
+	 * @return A quaternion with x, y and z negated and w unchanged (see the scalar return below).
+	 * No division by the squared length is performed, so the result is the mathematical
+	 * inverse only when this quaternion has unit length.
+	 * NOTE(review): b3vQInv is defined elsewhere; it is presumably a sign-bit mask for x, y, z.
+	 * The #else/#endif lines are not visible in this excerpt; the scalar return is presumably
+	 * the non-SIMD fallback -- confirm against the full header. */
+	b3Quaternion inverse() const
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		return b3Quaternion(_mm_xor_ps(mVec128, b3vQInv));
+#elif defined(B3_USE_NEON)
+        return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vQInv));
+		return b3Quaternion(-m_floats[0], -m_floats[1], -m_floats[2], m_floats[3]);
+	}
+  /**@brief Return the sum of this quaternion and the other 
+   * @param q2 The other quaternion
+   * @return A new quaternion holding the component-wise sum; neither operand is modified
+   * NOTE(review): the #else/#endif lines are not visible in this excerpt; the scalar code
+   * after the NEON line is presumably the non-SIMD fallback -- confirm against the full header. */
+	B3_FORCE_INLINE b3Quaternion
+	operator+(const b3Quaternion& q2) const
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		return b3Quaternion(_mm_add_ps(mVec128, q2.mVec128));
+#elif defined(B3_USE_NEON)
+        return b3Quaternion(vaddq_f32(mVec128, q2.mVec128));
+		const b3Quaternion& q1 = *this;	// alias so the expression below reads symmetrically
+		return b3Quaternion(q1.getX() + q2.getX(), q1.getY() + q2.getY(), q1.getZ() + q2.getZ(), q1.m_floats[3] + q2.m_floats[3]);
+	}
+  /**@brief Return the difference between this quaternion and the other 
+   * @param q2 The other quaternion (the subtrahend)
+   * @return A new quaternion holding the component-wise difference this - q2; neither operand is modified
+   * NOTE(review): the #else/#endif lines are not visible in this excerpt; the scalar code
+   * after the NEON line is presumably the non-SIMD fallback -- confirm against the full header. */
+	B3_FORCE_INLINE b3Quaternion
+	operator-(const b3Quaternion& q2) const
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		return b3Quaternion(_mm_sub_ps(mVec128, q2.mVec128));
+#elif defined(B3_USE_NEON)
+        return b3Quaternion(vsubq_f32(mVec128, q2.mVec128));
+		const b3Quaternion& q1 = *this;	// alias so the expression below reads symmetrically
+		return b3Quaternion(q1.getX() - q2.getX(), q1.getY() - q2.getY(), q1.getZ() - q2.getZ(), q1.m_floats[3] - q2.m_floats[3]);
+	}
+  /**@brief Return the negative of this quaternion 
+   * This simply negates each element, w included (see the scalar return below).
+   * @return A new quaternion with all four components negated
+   * NOTE(review): b3vMzeroMask is defined elsewhere; it is presumably a sign-bit mask for all lanes.
+   * The #else/#endif lines are not visible in this excerpt; the scalar code after the NEON line
+   * is presumably the non-SIMD fallback -- confirm against the full header. */
+	B3_FORCE_INLINE b3Quaternion operator-() const
+	{
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		return b3Quaternion(_mm_xor_ps(mVec128, b3vMzeroMask));
+#elif defined(B3_USE_NEON)
+		return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vMzeroMask) );
+		const b3Quaternion& q2 = *this;
+		return b3Quaternion( - q2.getX(), - q2.getY(),  - q2.getZ(),  - q2.m_floats[3]);
+	}
+  /**@brief Of the two candidates qd and -qd, return the one farther from this quaternion
+   * @param qd The candidate quaternion
+   * @return qd when |this - qd|^2 is strictly greater than |this + qd|^2, otherwise -qd
+   * @todo document the intended use */
+	B3_FORCE_INLINE b3Quaternion farthest( const b3Quaternion& qd) const 
+	{
+		const b3Quaternion toCandidate = *this - qd;
+		const b3Quaternion toNegated = *this + qd;
+		// |this + qd| is the distance from this quaternion to -qd
+		const bool candidateIsFarther = toCandidate.dot(toCandidate) > toNegated.dot(toNegated);
+		return candidateIsFarther ? qd : (-qd);
+	}
+	/**@brief Of the two candidates qd and -qd, return the one nearer to this quaternion
+	 * @param qd The candidate quaternion
+	 * @return qd when |this - qd|^2 is strictly smaller than |this + qd|^2, otherwise -qd
+	 * @todo document the intended use */
+	B3_FORCE_INLINE b3Quaternion nearest( const b3Quaternion& qd) const 
+	{
+		const b3Quaternion toCandidate = *this - qd;
+		const b3Quaternion toNegated = *this + qd;
+		// |this + qd| is the distance from this quaternion to -qd
+		const bool candidateIsNearer = toCandidate.dot(toCandidate) < toNegated.dot(toNegated);
+		return candidateIsNearer ? qd : (-qd);
+	}
+  /**@brief Spherical linear interpolation between this quaternion and another
+   * @param q The other quaternion to interpolate with 
+   * @param t The interpolation ratio. t = 0 yields this quaternion; t = 1 yields q, or -q
+   * (the same rotation) when the dot product of the two inputs is negative.
+   * @return The interpolated quaternion. This quaternion is returned unchanged when the
+   * normalized dot product of the inputs is not strictly inside (-1, 1).
+   * Slerp interpolates assuming constant velocity.  */
+	b3Quaternion slerp(const b3Quaternion& q, const b3Scalar& t) const
+	{
+		const b3Scalar magnitude = b3Sqrt(length2() * q.length2());
+		b3Assert(magnitude > b3Scalar(0));
+		const b3Scalar product = dot(q) / magnitude;
+		// theta would be 0 (or pi), making 1/sin(theta) undefined: skip the interpolation
+		if (!(b3Fabs(product) < b3Scalar(1)))
+		{
+			return *this;
+		}
+		// Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
+		b3Scalar sign = b3Scalar(1);
+		if (product < 0)
+		{
+			sign = b3Scalar(-1);
+		}
+		const b3Scalar theta = b3Acos(sign * product);
+		const b3Scalar weightOther = b3Sin(sign * t * theta);
+		const b3Scalar invSinTheta = b3Scalar(1.0) / b3Sin(theta);
+		const b3Scalar weightThis = b3Sin((b3Scalar(1.0) - t) * theta);
+		const b3Scalar rx = (m_floats[0] * weightThis + q.getX() * weightOther) * invSinTheta;
+		const b3Scalar ry = (m_floats[1] * weightThis + q.getY() * weightOther) * invSinTheta;
+		const b3Scalar rz = (m_floats[2] * weightThis + q.getZ() * weightOther) * invSinTheta;
+		const b3Scalar rw = (m_floats[3] * weightThis + q.m_floats[3] * weightOther) * invSinTheta;
+		return b3Quaternion(rx, ry, rz, rw);
+	}
+	/**@brief Return the identity quaternion (0, 0, 0, 1)
+	 * @return Reference to a function-local static constant */
+	static const b3Quaternion&	getIdentity()
+	{
+		static const b3Quaternion kIdentity(
+			b3Scalar(0.),
+			b3Scalar(0.),
+			b3Scalar(0.),
+			b3Scalar(1.));
+		return kIdentity;
+	}
+	/**@brief Return the w component, stored at index 3 of m_floats */
+	B3_FORCE_INLINE const b3Scalar& getW() const
+	{
+		return m_floats[3];
+	}
+/**@brief Return the product of two quaternions
+ * @param q1 The left operand
+ * @param q2 The right operand
+ * @return q1 * q2; the scalar expression at the end spells out the four components of the Hamilton product
+ * NOTE(review): the function braces and the #else/#endif lines are not visible in this excerpt;
+ * the scalar return after the NEON code is presumably the non-SIMD fallback -- confirm against the full header. */
+B3_FORCE_INLINE b3Quaternion
+operator*(const b3Quaternion& q1, const b3Quaternion& q2) 
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+	__m128 vQ1 = q1.get128();
+	__m128 vQ2 = q2.get128();
+	__m128 A0, A1, B1, A2, B2;
+	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0,1,2,0)); // X Y  z x     //      vtrn
+	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3,3,3,0)); // W W  W X     // vdup vext
+	A1 = A1 * B1;
+	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1,2,0,1)); // Y Z  X Y     // vext 
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1)); // z x  Y Y     // vtrn vdup
+	A2 = A2 * B2;
+	B1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2,0,1,2)); // z x Y Z      // vtrn vext
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2)); // Y Z x z      // vext vtrn
+	B1 = B1 * B2;	//	A3 *= B3
+	A0 = b3_splat_ps(vQ1, 3);	//	A0
+	A0 = A0 * vQ2;	//	A0 * B0
+	A1 = A1 + A2;	//	AB12
+	A0 =  A0 - B1;	//	AB03 = AB0 - AB3 
+    A1 = _mm_xor_ps(A1, b3vPPPM);	//	change sign of the last element
+	A0 = A0 + A1;	//	AB03 + AB12
+	return b3Quaternion(A0);
+#elif defined(B3_USE_NEON)     
+	float32x4_t vQ1 = q1.get128();
+	float32x4_t vQ2 = q2.get128();
+	float32x4_t A0, A1, B1, A2, B2, A3, B3;
+    float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+    {
+    float32x2x2_t tmp;
+    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+    vQ1zx = tmp.val[0];
+    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+    vQ2zx = tmp.val[0];
+    }
+    vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+    A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
+    B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); //	A0 * B0
+	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+	A0 = vsubq_f32(A0, A3);	//	AB03 = AB0 - AB3 
+    //	change the sign of the last element
+    A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);	
+	A0 = vaddq_f32(A0, A1);	//	AB03 + AB12
+	return b3Quaternion(A0);
+	return b3Quaternion(
+        q1.getW() * q2.getX() + q1.getX() * q2.getW() + q1.getY() * q2.getZ() - q1.getZ() * q2.getY(),
+		q1.getW() * q2.getY() + q1.getY() * q2.getW() + q1.getZ() * q2.getX() - q1.getX() * q2.getZ(),
+		q1.getW() * q2.getZ() + q1.getZ() * q2.getW() + q1.getX() * q2.getY() - q1.getY() * q2.getX(),
+		q1.getW() * q2.getW() - q1.getX() * q2.getX() - q1.getY() * q2.getY() - q1.getZ() * q2.getZ()); 
+B3_FORCE_INLINE b3Quaternion	// Product q * w of a quaternion (left) and a vector (right); result is a quaternion
+operator*(const b3Quaternion& q, const b3Vector3& w)	// NOTE(review): function braces and #else/#endif are not visible in this excerpt -- confirm against the full header
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+	__m128 vQ1 = q.get128();
+	__m128 vQ2 = w.get128();
+	__m128 A1, B1, A2, B2, A3, B3;
+	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(3,3,3,0));
+	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(0,1,2,0));
+	A1 = A1 * B1;
+	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1,2,0,1));
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1));
+	A2 = A2 * B2;
+	A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2,0,1,2));
+	B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2));
+	A3 = A3 * B3;	//	A3 *= B3
+	A1 = A1 + A2;	//	AB12
+	A1 = _mm_xor_ps(A1, b3vPPPM);	//	change sign of the last element
+    A1 = A1 - A3;	//	AB123 = AB12 - AB3 
+	return b3Quaternion(A1);
+#elif defined(B3_USE_NEON)     
+	float32x4_t vQ1 = q.get128();
+	float32x4_t vQ2 = w.get128();
+	float32x4_t A1, B1, A2, B2, A3, B3;
+    float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;
+    vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1); 
+    {
+    float32x2x2_t tmp;
+    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+    vQ2zx = tmp.val[0];
+    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+    vQ1zx = tmp.val[0];
+    }
+    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+    A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx); // W W  W X 
+    B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);                    // X Y  z x 
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+    //	change the sign of the last element
+    A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);	
+    A1 = vsubq_f32(A1, A3);	//	AB123 = AB12 - AB3
+	return b3Quaternion(A1);
+	return b3Quaternion( 	// scalar form: w is used as a quaternion with zero scalar part (no w.getW() term appears)
+         q.getW() * w.getX() + q.getY() * w.getZ() - q.getZ() * w.getY(),
+		 q.getW() * w.getY() + q.getZ() * w.getX() - q.getX() * w.getZ(),
+		 q.getW() * w.getZ() + q.getX() * w.getY() - q.getY() * w.getX(),
+		-q.getX() * w.getX() - q.getY() * w.getY() - q.getZ() * w.getZ()); 
+B3_FORCE_INLINE b3Quaternion	// Product w * q of a vector (left) and a quaternion (right); result is a quaternion
+operator*(const b3Vector3& w, const b3Quaternion& q)	// NOTE(review): function braces and #else/#endif are not visible in this excerpt -- confirm against the full header
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+	__m128 vQ1 = w.get128();
+	__m128 vQ2 = q.get128();
+	__m128 A1, B1, A2, B2, A3, B3;
+	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0,1,2,0));  // X Y  z x
+	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3,3,3,0));  // W W  W X 
+	A1 = A1 * B1;
+	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1,2,0,1));
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1));
+	A2 = A2 *B2;
+	A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2,0,1,2));
+	B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2));
+	A3 = A3 * B3;	//	A3 *= B3
+	A1 = A1 + A2;	//	AB12
+	A1 = _mm_xor_ps(A1, b3vPPPM);	//	change sign of the last element
+	A1 = A1 - A3;	//	AB123 = AB12 - AB3 
+	return b3Quaternion(A1);
+#elif defined(B3_USE_NEON)     
+	float32x4_t vQ1 = w.get128();
+	float32x4_t vQ2 = q.get128();
+	float32x4_t  A1, B1, A2, B2, A3, B3;
+    float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+    {
+    float32x2x2_t tmp;
+    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+    vQ1zx = tmp.val[0];
+    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+    vQ2zx = tmp.val[0];
+    }
+    vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+    A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
+    B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+    //	change the sign of the last element
+    A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);	
+    A1 = vsubq_f32(A1, A3);	//	AB123 = AB12 - AB3
+	return b3Quaternion(A1);
+	return b3Quaternion( 	// scalar form: w is used as a quaternion with zero scalar part (no w.getW() term appears)
+        +w.getX() * q.getW() + w.getY() * q.getZ() - w.getZ() * q.getY(),
+		+w.getY() * q.getW() + w.getZ() * q.getX() - w.getX() * q.getZ(),
+		+w.getZ() * q.getW() + w.getX() * q.getY() - w.getY() * q.getX(),
+		-w.getX() * q.getX() - w.getY() * q.getY() - w.getZ() * q.getZ()); 
+/**@brief Calculate the dot product between two quaternions
+ * @param q1 The first quaternion
+ * @param q2 The second quaternion
+ * @return q1.dot(q2); free-function form of the member function */
+B3_FORCE_INLINE b3Scalar 
+b3Dot(const b3Quaternion& q1, const b3Quaternion& q2) 
+	return q1.dot(q2); 
+/**@brief Return the length of a quaternion
+ * @param q The quaternion to measure
+ * @return q.length(); free-function form of the member function
+ * NOTE(review): the return-type line and the function braces are not visible in this excerpt -- confirm against the full header. */
+b3Length(const b3Quaternion& q) 
+	return q.length(); 
+/**@brief Return the angle between two quaternions
+ * @param q1 The first quaternion
+ * @param q2 The second quaternion
+ * @return q1.angle(q2); free-function form of the member function
+ * NOTE(review): the return-type line and the function braces are not visible in this excerpt -- confirm against the full header. */
+b3Angle(const b3Quaternion& q1, const b3Quaternion& q2) 
+	return q1.angle(q2); 
+/**@brief Return the inverse of a quaternion
+ * @param q The quaternion to invert
+ * @return q.inverse(); free-function form of the member function */
+B3_FORCE_INLINE b3Quaternion
+b3Inverse(const b3Quaternion& q) 
+	return q.inverse();
+/**@brief Return the result of spherical linear interpolation between two quaternions 
+ * @param q1 The first quaternion
+ * @param q2 The second quaternion 
+ * @param t The ratio between q1 and q2; forwarded unchanged to q1.slerp(q2, t)
+ * @return q1.slerp(q2, t)
+ * Slerp assumes constant velocity between positions. */
+B3_FORCE_INLINE b3Quaternion
+b3Slerp(const b3Quaternion& q1, const b3Quaternion& q2, const b3Scalar& t) 
+	return q1.slerp(q2, t);
+B3_FORCE_INLINE b3Quaternion	// Named wrapper around the quaternion product operator
+b3QuatMul(const b3Quaternion& rot0, const b3Quaternion& rot1)	// rot0 is the left operand, rot1 the right
+	return rot0*rot1;
+B3_FORCE_INLINE b3Quaternion	// Free-function form of b3Quaternion::normalized()
+b3QuatNormalized(const b3Quaternion& orn)	// orn is taken by const reference and is not modified
+	return orn.normalized();
+B3_FORCE_INLINE b3Vector3 	// Rotate vector v by quaternion rotation; returns the x, y, z part of rotation * v * rotation.inverse()
+b3QuatRotate(const b3Quaternion& rotation, const b3Vector3& v) 	// NOTE(review): function braces and #else/#endif are not visible in this excerpt -- confirm against the full header
+	b3Quaternion q = rotation * v;	// quaternion-times-vector product
+	q *= rotation.inverse();	// right-multiply by the inverse of the rotation
+#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+	return b3MakeVector3(_mm_and_ps(q.get128(), b3vFFF0fMask));	// mask defined elsewhere; presumably clears the fourth lane -- confirm
+#elif defined(B3_USE_NEON)
+    return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), b3vFFF0Mask));	// mask defined elsewhere; presumably clears the fourth lane -- confirm
+	return b3MakeVector3(q.getX(),q.getY(),q.getZ());	// scalar form: keep x, y, z and drop w
+B3_FORCE_INLINE b3Quaternion 	// Quaternion built from the cross and dot products of v0 and v1 (shortest-arc rotation, per the reference cited below)
+b3ShortestArcQuat(const b3Vector3& v0, const b3Vector3& v1) // Game Programming Gems 2.10. make sure v0,v1 are normalized
+	b3Vector3 c = v0.cross(v1);
+	b3Scalar  d = v0.dot(v1);
+	if (d < -1.0 + B3_EPSILON)	// v0 and v1 point in (almost) opposite directions: s below would be ~0 and 1/s would blow up
+	{
+		b3Vector3 n,unused;
+		b3PlaneSpace1(v0,n,unused);	// helper defined elsewhere; per the comment below it supplies a vector orthogonal to v0
+		return b3Quaternion(n.getX(),n.getY(),n.getZ(),0.0f); // just pick any vector that is orthogonal to v0
+	}
+	b3Scalar  s = b3Sqrt((1.0f + d) * 2.0f);
+	b3Scalar rs = 1.0f / s;
+	return b3Quaternion(c.getX()*rs,c.getY()*rs,c.getZ()*rs,s * 0.5f);	// NOTE(review): function braces are not visible in this excerpt -- confirm against the full header
+B3_FORCE_INLINE b3Quaternion 	// Same as b3ShortestArcQuat, but normalizes both inputs first
+b3ShortestArcQuatNormalize2(b3Vector3& v0,b3Vector3& v1)	// both arguments are non-const references and are modified
+	v0.normalize();	// in-place: the caller's v0 is changed
+	v1.normalize();	// in-place: the caller's v1 is changed
+	return b3ShortestArcQuat(v0,v1);
+#endif //B3_SIMD__QUATERNION_H_
diff --git a/src/bullet/Bullet3Common/b3Random.h b/src/bullet/Bullet3Common/b3Random.h
new file mode 100644
index 00000000..dc040f15
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3Random.h
@@ -0,0 +1,50 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_GEN_RANDOM_H
+#define B3_GEN_RANDOM_H
+#include "b3Scalar.h"
+#ifdef MT19937	// when defined, the generator comes from <mt19937.h> (init_genrand / genrand_int32)
+#include <limits.h>
+#include <mt19937.h>
+B3_FORCE_INLINE void         b3Srand(unsigned int seed) { init_genrand(seed); }	// seed the generator
+B3_FORCE_INLINE unsigned int b3rand()                   { return genrand_int32(); }	// next pseudo-random value
+#include <stdlib.h>	// NOTE(review): the #else line is not visible in this excerpt; the definitions below are presumably the non-MT19937 branch -- confirm
+B3_FORCE_INLINE void         b3Srand(unsigned int seed) { srand(seed); } 	// seed the C library generator
+B3_FORCE_INLINE unsigned int b3rand()                   { return rand(); }	// next value from the C library generator
+inline b3Scalar b3RandRange(b3Scalar minRange, b3Scalar maxRange)	// Pseudo-random scalar obtained by scaling b3rand() into the span starting at minRange
+	return (b3rand() / (b3Scalar(B3_RAND_MAX) + b3Scalar(1.0))) * (maxRange - minRange) + minRange;	// NOTE(review): B3_RAND_MAX and the function braces are not visible in this excerpt; if b3rand() <= B3_RAND_MAX the fraction is in [0, 1) -- confirm
+#endif //B3_GEN_RANDOM_H
diff --git a/src/bullet/Bullet3Common/b3Scalar.h b/src/bullet/Bullet3Common/b3Scalar.h
new file mode 100644
index 00000000..60b7f1cf
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3Scalar.h
@@ -0,0 +1,661 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_SCALAR_H
+#define B3_SCALAR_H
+//Aligned data types not supported in managed code
+#pragma unmanaged
+#include <math.h>
+#include <stdlib.h>//size_t for MSVC 6.0
+#include <float.h>
+//Original repository is at http://github.com/erwincoumans/bullet3
+#define B3_BULLET_VERSION 300	// numeric version identifier
+inline int	b3GetVersion()	// NOTE(review): the function body is not visible in this excerpt; presumably returns B3_BULLET_VERSION -- confirm
+#if defined(DEBUG) || defined (_DEBUG)
+#define B3_DEBUG	// set when either DEBUG or _DEBUG is defined; tested below to choose the b3Assert definition
+#include "b3Logging.h"//for b3Error
+#ifdef _WIN32	// Windows configuration. NOTE(review): several #else/#endif lines of this section are not visible in this excerpt
+		#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300)
+			#define B3_FORCE_INLINE inline
+			#define B3_ATTRIBUTE_ALIGNED16(a) a	// expands to the bare declaration: no alignment attribute is applied here
+			#define B3_ATTRIBUTE_ALIGNED64(a) a
+			#define B3_ATTRIBUTE_ALIGNED128(a) a
+		#else
+			#pragma warning(disable : 4324) // disable padding warning
+//			#pragma warning(disable:4530) // Disable the "exception handling disabled but used in the MSVC STL" warning.
+//			#pragma warning(disable:4996) //Turn off warnings about deprecated C routines
+//			#pragma warning(disable:4786) // Disable the "debug name too long" warning
+			#define B3_FORCE_INLINE __forceinline
+			#define B3_ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a
+			#define B3_ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a
+			#define B3_ATTRIBUTE_ALIGNED128(a) __declspec (align(128)) a
+		#ifdef _XBOX
+			#define B3_USE_VMX128
+			#include <ppcintrinsics.h>
+ 			#define B3_HAVE_NATIVE_FSEL
+ 			#define b3Fsel(a,b,c) __fsel((a),(b),(c))
+		#else
+#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (B3_USE_DOUBLE_PRECISION))
+			#define B3_USE_SSE
+			#ifdef B3_USE_SSE
+			//B3_USE_SSE_IN_API is disabled under Windows by default, because 
+			//it makes it harder to integrate Bullet into your application under Windows 
+			//(structures embedding Bullet structs/classes need to be 16-byte aligned)
+			//with relatively little performance gain.
+			//If you are not embedding Bullet data in your classes, or you make sure that you align those classes on 16-byte boundaries,
+			//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
+			//#define B3_USE_SSE_IN_API
+			#endif //B3_USE_SSE
+			#include <emmintrin.h>
+		#endif//_XBOX
+		#endif //__MINGW32__
+#ifdef B3_DEBUG
+	#ifdef _MSC_VER
+		#include <stdio.h>
+		#define b3Assert(x) { if(!(x)){b3Error("Assert "__FILE__ ":%u ("#x")\n", __LINE__);__debugbreak();	}}
+	#else//_MSC_VER
+		#include <assert.h>
+		#define b3Assert assert
+	#endif//_MSC_VER
+		#define b3Assert(x)	// expands to nothing. NOTE(review): presumably the non-debug branch; its #else is not visible here
+		//b3FullAssert is optional, slows down a lot
+		#define b3FullAssert(x)
+		#define b3Likely(_c)  _c	// no branch-prediction hint: expands to the condition itself
+		#define b3Unlikely(_c) _c
+#if defined	(__CELLOS_LV2__)	// configuration used when __CELLOS_LV2__ is defined. NOTE(review): #else/#endif lines of this section are not visible in this excerpt
+		#define B3_FORCE_INLINE inline __attribute__((always_inline))
+		#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
+		#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
+		#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
+		#ifndef assert
+		#include <assert.h>
+		#endif
+#ifdef B3_DEBUG
+#ifdef __SPU__
+#include <spu_printf.h>
+#define printf spu_printf
+	#define b3Assert(x) {if(!(x)){b3Error("Assert "__FILE__ ":%u ("#x")\n", __LINE__);spu_hcmpeq(0,0);}}
+	#define b3Assert assert
+		#define b3Assert(x)	// expands to nothing. NOTE(review): presumably the non-debug branch; its #else is not visible here
+		//b3FullAssert is optional, slows down a lot
+		#define b3FullAssert(x)
+		#define b3Likely(_c)  _c	// no branch-prediction hint: expands to the condition itself
+		#define b3Unlikely(_c) _c
+#ifdef USE_LIBSPE2	// configuration used when USE_LIBSPE2 is defined. NOTE(review): #else/#endif lines of this section are not visible in this excerpt
+		#define B3_FORCE_INLINE __inline
+		#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
+		#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
+		#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
+		#ifndef assert
+		#include <assert.h>
+		#endif
+#ifdef B3_DEBUG
+		#define b3Assert assert
+		#define b3Assert(x)	// expands to nothing. NOTE(review): presumably the non-debug branch; its #else is not visible here
+		//b3FullAssert is optional, slows down a lot
+		#define b3FullAssert(x)
+		#define b3Likely(_c)   __builtin_expect((_c), 1)	// hint to the compiler that the condition is expected to be true
+		#define b3Unlikely(_c) __builtin_expect((_c), 0)	// hint to the compiler that the condition is expected to be false
+	//non-windows systems
+#if (defined (__APPLE__) && (!defined (B3_USE_DOUBLE_PRECISION)))	// Apple, single precision only
+    #if defined (__i386__) || defined (__x86_64__)
+        #define B3_USE_SSE
+		//B3_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
+		//if apps run into issues, we will disable the next line
+		#define B3_USE_SSE_IN_API
+        #ifdef B3_USE_SSE
+            // include appropriate SSE level
+            #if defined (__SSE4_1__)
+                #include <smmintrin.h>
+            #elif defined (__SSSE3__)
+                #include <tmmintrin.h>
+            #elif defined (__SSE3__)
+                #include <pmmintrin.h>
+            #else
+                #include <emmintrin.h>
+            #endif
+        #endif //B3_USE_SSE
+    #elif defined( __armv7__ )
+        #ifdef __clang__
+            #define B3_USE_NEON 1	// NEON is enabled only for clang on armv7
+            #if defined B3_USE_NEON && defined (__clang__)
+                #include <arm_neon.h>
+            #endif//B3_USE_NEON
+       #endif //__clang__
+    #endif//__arm__
+	#define B3_FORCE_INLINE inline __attribute__ ((always_inline))
+///@todo: check out alignment methods for other platforms/compilers
+	#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
+	#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
+	#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
+	#ifndef assert
+	#include <assert.h>
+	#endif
+	#if defined(DEBUG) || defined (_DEBUG)
+	 #if defined (__i386__) || defined (__x86_64__)
+	#include <stdio.h>
+	 #define b3Assert(x)\
+	{\
+	if(!(x))\
+	{\
+		b3Error("Assert %s in line %d, file %s\n",#x, __LINE__, __FILE__);\
+		asm volatile ("int3");\
+	}\
+	}
+	#else//defined (__i386__) || defined (__x86_64__)
+		#define b3Assert assert
+	#endif//defined (__i386__) || defined (__x86_64__)
+	#else//defined(DEBUG) || defined (_DEBUG)
+		#define b3Assert(x)
+	#endif//defined(DEBUG) || defined (_DEBUG)
+	//b3FullAssert is optional, slows down a lot
+	#define b3FullAssert(x)
+	#define b3Likely(_c)  _c	// no branch-prediction hint: expands to the condition itself
+	#define b3Unlikely(_c) _c
+		#define B3_FORCE_INLINE inline	// NOTE(review): second definition group; presumably the non-Apple branch whose #else is not visible in this excerpt
+		///@todo: check out alignment methods for other platforms/compilers
+		#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
+		#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
+		#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
+		///#define B3_ATTRIBUTE_ALIGNED16(a) a
+		///#define B3_ATTRIBUTE_ALIGNED64(a) a
+		///#define B3_ATTRIBUTE_ALIGNED128(a) a
+		#ifndef assert
+		#include <assert.h>
+		#endif
+#if defined(DEBUG) || defined (_DEBUG)
+		#define b3Assert assert
+		#define b3Assert(x)	// expands to nothing. NOTE(review): presumably the non-debug branch; its #else is not visible here
+		//b3FullAssert is optional, slows down a lot
+		#define b3FullAssert(x)
+		#define b3Likely(_c)  _c
+		#define b3Unlikely(_c) _c
+#endif //__APPLE__ 
+#endif // LIBSPE2
+#endif	//__CELLOS_LV2__
+///The b3Scalar type abstracts floating point numbers, to easily switch between double and single floating point precision.
+///NOTE(review): the preprocessor condition choosing between the two typedefs below is not visible in this excerpt -- confirm against the full header.
+typedef double b3Scalar;
+//this number could be bigger in double precision
+#define B3_LARGE_FLOAT 1e30
+typedef float b3Scalar;
+#define B3_LARGE_FLOAT 1e18f
+#ifdef B3_USE_SSE
+typedef __m128 b3SimdFloat4;	// four packed single-precision floats
+#if defined B3_USE_SSE_IN_API && defined (B3_USE_SSE)
+#ifdef _WIN32
+#ifndef B3_NAN
+static int b3NanMask = 0x7F800001;	// IEEE-754 single-precision bit pattern of a NaN (exponent all ones, non-zero mantissa)
+#define B3_NAN (*(float*)&b3NanMask)	// reads the int's bits as a float
+static  int b3InfinityMask = 0x7F800000;	// IEEE-754 single-precision bit pattern of +infinity
+#define B3_INFINITY_MASK (*(float*)&b3InfinityMask)	// reads the int's bits as a float
+inline __m128 operator + (const __m128 A, const __m128 B)	// lane-wise add; NOTE(review): function braces are not visible in this excerpt
+    return _mm_add_ps(A, B);
+inline __m128 operator - (const __m128 A, const __m128 B)	// lane-wise subtract
+    return _mm_sub_ps(A, B);
+inline __m128 operator * (const __m128 A, const __m128 B)	// lane-wise multiply
+    return _mm_mul_ps(A, B);
+#define b3CastfTo128i(a) (_mm_castps_si128(a))
+#define b3CastfTo128d(a) (_mm_castps_pd(a))
+#define b3CastiTo128f(a) (_mm_castsi128_ps(a))
+#define b3CastdTo128f(a) (_mm_castpd_ps(a))
+#define b3CastdTo128i(a) (_mm_castpd_si128(a))
+#define b3Assign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3)
+#define b3CastfTo128i(a) ((__m128i)(a))	// NOTE(review): second set of the same macros using plain casts; presumably the non-_WIN32 branch whose #else is not visible here
+#define b3CastfTo128d(a) ((__m128d)(a))
+#define b3CastiTo128f(a)  ((__m128) (a))
+#define b3CastdTo128f(a) ((__m128) (a))
+#define b3CastdTo128i(a) ((__m128i)(a))
+#define b3Assign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3}
+#endif //B3_USE_SSE_IN_API
+#ifdef B3_USE_NEON	// NOTE(review): the matching #endif is not visible in this excerpt
+#include <arm_neon.h>
+typedef float32x4_t b3SimdFloat4;	// four packed single-precision floats
+#define B3_NAN NAN
+#define b3Assign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}	// builds a vector from four scalars in lane order r0..r3
+   B3_FORCE_INLINE void* operator new(size_t sizeInBytes)   { return b3AlignedAlloc(sizeInBytes,16); }   \
+   B3_FORCE_INLINE void  operator delete(void* ptr)         { b3AlignedFree(ptr); }   \
+   B3_FORCE_INLINE void* operator new(size_t, void* ptr)   { return ptr; }   \
+   B3_FORCE_INLINE void  operator delete(void*, void*)      { }   \
+   B3_FORCE_INLINE void* operator new[](size_t sizeInBytes)   { return b3AlignedAlloc(sizeInBytes,16); }   \
+   B3_FORCE_INLINE void  operator delete[](void* ptr)         { b3AlignedFree(ptr); }   \
+   B3_FORCE_INLINE void* operator new[](size_t, void* ptr)   { return ptr; }   \
+   B3_FORCE_INLINE void  operator delete[](void*, void*)      { }   \
+B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar x) { return sqrt(x); } // this group forwards to the double-precision <math.h> functions
+B3_FORCE_INLINE b3Scalar b3Fabs(b3Scalar x) { return fabs(x); }
+B3_FORCE_INLINE b3Scalar b3Cos(b3Scalar x) { return cos(x); }
+B3_FORCE_INLINE b3Scalar b3Sin(b3Scalar x) { return sin(x); }
+B3_FORCE_INLINE b3Scalar b3Tan(b3Scalar x) { return tan(x); }
+B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x) { if (x<b3Scalar(-1))	x=b3Scalar(-1); if (x>b3Scalar(1))	x=b3Scalar(1); return acos(x); } // input is clamped to [-1,1] before acos
+B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x) { if (x<b3Scalar(-1))	x=b3Scalar(-1); if (x>b3Scalar(1))	x=b3Scalar(1); return asin(x); } // input is clamped to [-1,1] before asin
+B3_FORCE_INLINE b3Scalar b3Atan(b3Scalar x) { return atan(x); }
+B3_FORCE_INLINE b3Scalar b3Atan2(b3Scalar x, b3Scalar y) { return atan2(x, y); } // arguments are forwarded in order, so the parameter named x is atan2's first (y-coordinate) argument
+B3_FORCE_INLINE b3Scalar b3Exp(b3Scalar x) { return exp(x); }
+B3_FORCE_INLINE b3Scalar b3Log(b3Scalar x) { return log(x); }
+B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x,b3Scalar y) { return pow(x,y); }
+B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x,b3Scalar y) { return fmod(x,y); }
+B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar y) // square root; the body below holds two alternative implementations
+    double x, z, tempf;
+    unsigned long *tfptr = ((unsigned long *)&tempf) + 1; // points one unsigned long past the start of tempf; NOTE(review): that is inside the double only if unsigned long is 4 bytes - confirm for 64-bit targets
+	tempf = y;
+	*tfptr = (0xbfcdd90a - *tfptr)>>1; /* estimate of 1/sqrt(y) */
+	x =  tempf;
+	z =  y*b3Scalar(0.5);
+	x = (b3Scalar(1.5)*x)-(x*x)*(x*z);         /* iteration formula     */
+	x = (b3Scalar(1.5)*x)-(x*x)*(x*z); // the same refinement step is applied five times in total
+	x = (b3Scalar(1.5)*x)-(x*x)*(x*z);
+	x = (b3Scalar(1.5)*x)-(x*x)*(x*z);
+	x = (b3Scalar(1.5)*x)-(x*x)*(x*z);
+	return x*y; // x approximates 1/sqrt(y), so x*y approximates sqrt(y)
+	return sqrtf(y); // plain library path; NOTE(review): the preprocessor guard separating this from the approximation above is not visible in this chunk
+B3_FORCE_INLINE b3Scalar b3Fabs(b3Scalar x) { return fabsf(x); } // this group forwards to the single-precision (f-suffixed) <math.h> functions
+B3_FORCE_INLINE b3Scalar b3Cos(b3Scalar x) { return cosf(x); }
+B3_FORCE_INLINE b3Scalar b3Sin(b3Scalar x) { return sinf(x); }
+B3_FORCE_INLINE b3Scalar b3Tan(b3Scalar x) { return tanf(x); }
+B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x) { // arc cosine with the input clamped to [-1,1]
+	if (x<b3Scalar(-1))	
+		x=b3Scalar(-1); 
+	if (x>b3Scalar(1))	
+		x=b3Scalar(1);
+	return acosf(x); 
+B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x) { // arc sine with the input clamped to [-1,1]
+	if (x<b3Scalar(-1))	
+		x=b3Scalar(-1); 
+	if (x>b3Scalar(1))	
+		x=b3Scalar(1);
+	return asinf(x); 
+B3_FORCE_INLINE b3Scalar b3Atan(b3Scalar x) { return atanf(x); }
+B3_FORCE_INLINE b3Scalar b3Atan2(b3Scalar x, b3Scalar y) { return atan2f(x, y); } // arguments are forwarded in order, so the parameter named x is atan2f's first (y-coordinate) argument
+B3_FORCE_INLINE b3Scalar b3Exp(b3Scalar x) { return expf(x); }
+B3_FORCE_INLINE b3Scalar b3Log(b3Scalar x) { return logf(x); }
+B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x,b3Scalar y) { return powf(x,y); }
+B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x,b3Scalar y) { return fmodf(x,y); }
+#define B3_2_PI         b3Scalar(6.283185307179586232) // 2*pi; the other angle constants are derived from it
+#define B3_PI           (B3_2_PI * b3Scalar(0.5))
+#define B3_HALF_PI      (B3_2_PI * b3Scalar(0.25))
+#define B3_RADS_PER_DEG (B3_2_PI / b3Scalar(360.0)) // multiply degrees by this to get radians
+#define B3_DEGS_PER_RAD  (b3Scalar(360.0) / B3_2_PI) // multiply radians by this to get degrees
+#define B3_SQRT12 b3Scalar(0.7071067811865475244008443621048490) // square root of 1/2
+#define b3RecipSqrt(x) ((b3Scalar)(b3Scalar(1.0)/b3Sqrt(b3Scalar(x))))		/* reciprocal square root */
+#define B3_EPSILON      DBL_EPSILON // double-precision pair of limits
+#define B3_INFINITY     DBL_MAX // largest finite double, not an IEEE infinity
+#define B3_EPSILON      FLT_EPSILON // single-precision pair of limits; NOTE(review): the guard choosing between the two pairs is not visible in this chunk
+#define B3_INFINITY     FLT_MAX // largest finite float, not an IEEE infinity
+B3_FORCE_INLINE b3Scalar b3Atan2Fast(b3Scalar y, b3Scalar x) // approximate atan2(y, x) built from one division and a linear fit per half-plane
+	b3Scalar coeff_1 = B3_PI / 4.0f; // pi/4
+	b3Scalar coeff_2 = 3.0f * coeff_1; // 3*pi/4
+	b3Scalar abs_y = b3Fabs(y); // work with |y|; the sign is restored on return
+	b3Scalar angle;
+	if (x >= 0.0f) {
+		b3Scalar r = (x - abs_y) / (x + abs_y); // NOTE(review): x == 0 and y == 0 makes this 0/0
+		angle = coeff_1 - coeff_1 * r;
+	} else {
+		b3Scalar r = (x + abs_y) / (abs_y - x); // denominator is positive here because x < 0
+		angle = coeff_2 - coeff_1 * r;
+	}
+	return (y < 0.0f) ? -angle : angle; // mirror the result for negative y
+B3_FORCE_INLINE bool      b3FuzzyZero(b3Scalar x) { return b3Fabs(x) < B3_EPSILON; } // true when |x| is below B3_EPSILON
+B3_FORCE_INLINE bool	b3Equal(b3Scalar a, b3Scalar eps) { // true when a lies within [-eps, eps]
+	return (((a) <= eps) && !((a) < -eps));
+B3_FORCE_INLINE bool	b3GreaterEqual (b3Scalar a, b3Scalar eps) { // NOTE(review): despite the name this is a strict test, true when a > eps
+	return (!((a) <= eps));
+B3_FORCE_INLINE int       b3IsNegative(b3Scalar x) { // 1 when x < 0, otherwise 0
+    return x < b3Scalar(0.0) ? 1 : 0;
+B3_FORCE_INLINE b3Scalar b3Radians(b3Scalar x) { return x * B3_RADS_PER_DEG; } // degrees -> radians
+B3_FORCE_INLINE b3Scalar b3Degrees(b3Scalar x) { return x * B3_DEGS_PER_RAD; } // radians -> degrees
+#define B3_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name // declares name as a pointer to a distinct dummy struct, giving each handle its own type
+#ifndef b3Fsel // only defined when no b3Fsel macro already exists
+B3_FORCE_INLINE b3Scalar b3Fsel(b3Scalar a, b3Scalar b, b3Scalar c) // select: returns b when a >= 0, otherwise c
+	return a >= 0 ? b : c;
+#define b3Fsels(a,b,c) (b3Scalar)b3Fsel(a,b,c) // same selection with the result cast to b3Scalar
+B3_FORCE_INLINE bool b3MachineIsLittleEndian() // runtime byte-order probe: inspects the first byte of an integer holding 1
+   long int i = 1;
+   const char *p = (const char *) &i; // view the integer as raw bytes
+   if (p[0] == 1)  // Lowest address contains the least significant byte
+	   return true;
+   else
+	   return false;
+///b3Select avoids branches, which makes performance much better for consoles like Playstation 3 and XBox 360
+///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html
+B3_FORCE_INLINE unsigned b3Select(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero) 
+    // Set testNz to 0xFFFFFFFF if condition is nonzero, 0x00000000 if condition is zero
+    // Rely on positive value or'ed with its negative having sign bit on
+    // and zero value or'ed with its negative (which is still zero) having sign bit off 
+    // Use arithmetic shift right, shifting the sign bit through all 32 bits
+    unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31); // NOTE(review): relies on 32-bit int and on >> of a negative int being an arithmetic shift
+    unsigned testEqz = ~testNz; // complementary mask: all ones when condition is zero
+    return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz)); // exactly one of the two masks is all ones
+B3_FORCE_INLINE int b3Select(unsigned condition, int valueIfConditionNonZero, int valueIfConditionZero) // int overload, same mask technique
+    unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31);
+    unsigned testEqz = ~testNz; 
+    return static_cast<int>((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
+B3_FORCE_INLINE float b3Select(unsigned condition, float valueIfConditionNonZero, float valueIfConditionZero) // float overload; two alternative bodies follow
+    return (float)b3Fsel((b3Scalar)condition - b3Scalar(1.0f), valueIfConditionNonZero, valueIfConditionZero); // condition-1 is >= 0 exactly when condition is non-zero
+    return (condition != 0) ? valueIfConditionNonZero : valueIfConditionZero; // plain ternary variant; NOTE(review): the guard choosing between the two returns is not visible in this chunk
+template<typename T> B3_FORCE_INLINE void b3Swap(T& a, T& b) // exchanges a and b through a temporary copy; T must be copy-constructible and assignable
+	T tmp = a;
+	a = b;
+	b = tmp;
+//PCK: endian swapping functions
+B3_FORCE_INLINE unsigned b3SwapEndian(unsigned val) // reverses the four low-order bytes of val
+	return (((val & 0xff000000) >> 24) | ((val & 0x00ff0000) >> 8) | ((val & 0x0000ff00) << 8)  | ((val & 0x000000ff) << 24));
+B3_FORCE_INLINE unsigned short b3SwapEndian(unsigned short val) // exchanges the two bytes of val
+	return static_cast<unsigned short>(((val & 0xff00) >> 8) | ((val & 0x00ff) << 8));
+B3_FORCE_INLINE unsigned b3SwapEndian(int val) // signed overload: converts to unsigned and reuses the overload above, returning unsigned
+	return b3SwapEndian((unsigned)val);
+B3_FORCE_INLINE unsigned short b3SwapEndian(short val) // signed overload: converts to unsigned short and reuses the overload above
+	return b3SwapEndian((unsigned short) val);
+///b3SwapEndianFloat uses char pointers to swap the endianness
+////b3SwapEndianFloat/b3SwapEndianDouble will NOT return a float, because the machine might 'correct' invalid floating point values
+///Not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754. 
+///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception. 
+///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you. 
+///so instead of returning a float/double, we return an integer (float) or fill a byte buffer (double)
+B3_FORCE_INLINE unsigned int  b3SwapEndianFloat(float d) // returns the four bytes of d in reverse order, packed in an unsigned int
+    unsigned int a = 0;
+    unsigned char *dst = (unsigned char *)&a;
+    unsigned char *src = (unsigned char *)&d;
+    dst[0] = src[3]; // byte-by-byte reversal; indexes 0..3, so float and unsigned int must both be at least 4 bytes
+    dst[1] = src[2];
+    dst[2] = src[1];
+    dst[3] = src[0];
+    return a;
+// unswap using char pointers
+B3_FORCE_INLINE float b3UnswapEndianFloat(unsigned int a) // inverse of b3SwapEndianFloat: reverses the bytes of a into a float
+    float d = 0.0f;
+    unsigned char *src = (unsigned char *)&a;
+    unsigned char *dst = (unsigned char *)&d;
+    dst[0] = src[3];
+    dst[1] = src[2];
+    dst[2] = src[1];
+    dst[3] = src[0];
+    return d;
+// swap using char pointers
+B3_FORCE_INLINE void  b3SwapEndianDouble(double d, unsigned char* dst) // writes the eight bytes of d in reverse order to dst, which must hold at least 8 bytes
+    unsigned char *src = (unsigned char *)&d;
+    dst[0] = src[7];
+    dst[1] = src[6];
+    dst[2] = src[5];
+    dst[3] = src[4];
+    dst[4] = src[3];
+    dst[5] = src[2];
+    dst[6] = src[1];
+    dst[7] = src[0];
+// unswap using char pointers
+B3_FORCE_INLINE double b3UnswapEndianDouble(const unsigned char *src) // inverse of b3SwapEndianDouble: reads 8 bytes from src in reverse order into a double
+    double d = 0.0;
+    unsigned char *dst = (unsigned char *)&d;
+    dst[0] = src[7];
+    dst[1] = src[6];
+    dst[2] = src[5];
+    dst[3] = src[4];
+    dst[4] = src[3];
+    dst[5] = src[2];
+    dst[6] = src[1];
+    dst[7] = src[0];
+	return d;
+// returns normalized value in range [-B3_PI, B3_PI]
+B3_FORCE_INLINE b3Scalar b3NormalizeAngle(b3Scalar angleInRadians) 
+	angleInRadians = b3Fmod(angleInRadians, B3_2_PI); // first reduce to the open interval (-2*pi, 2*pi), keeping the sign
+	if(angleInRadians < -B3_PI)
+	{
+		return angleInRadians + B3_2_PI; // below -pi: wrap up by one full turn
+	}
+	else if(angleInRadians > B3_PI)
+	{
+		return angleInRadians - B3_2_PI; // above pi: wrap down by one full turn
+	}
+	else
+	{
+		return angleInRadians; // already inside [-pi, pi]
+	}
+///rudimentary class to provide type info
+struct b3TypedObject
+	b3TypedObject(int objectType) // stores the caller-supplied integer type tag
+		:m_objectType(objectType)
+	{
+	}
+	int	m_objectType; // the type tag; its meaning is defined by the code that constructs the object
+	inline int getObjectType() const // returns the stored type tag
+	{
+		return m_objectType;
+	}
+///align a pointer to the provided alignment, upwards
+///the mask arithmetic below is only correct when alignment is a power of two
+template <typename T>T* b3AlignPointer(T* unalignedPtr, size_t alignment)
+	struct b3ConvertPointerSizeT // local helper used to view the pointer as an integer
+	{
+		union 
+		{
+				T* ptr;
+				size_t integer; // same storage as ptr; relies on size_t being able to hold a pointer value
+		};
+	};
+    b3ConvertPointerSizeT converter;
+	const size_t bit_mask = ~(alignment - 1); // clears the low bits below the alignment
+    converter.ptr = unalignedPtr;
+	converter.integer += alignment-1; // bump past the next boundary unless already aligned
+	converter.integer &= bit_mask; // then round down to that boundary
+	return converter.ptr;
+#endif //B3_SCALAR_H
diff --git a/src/bullet/Bullet3Common/b3StackAlloc.h b/src/bullet/Bullet3Common/b3StackAlloc.h
new file mode 100644
index 00000000..de7de056
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3StackAlloc.h
@@ -0,0 +1,116 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+StackAlloc extracted from GJK-EPA collision solver by Nathanael Presson
+#ifndef B3_STACK_ALLOC
+#define B3_STACK_ALLOC
+#include "b3Scalar.h" //for b3Assert
+#include "b3AlignedAllocator.h"
+///The b3Block class is an internal structure for the b3StackAlloc memory allocator.
+struct b3Block
+	b3Block*			previous; // the block that was current when this one was opened (set in b3StackAlloc::beginBlock)
+	unsigned char*		address; // buffer position just after this header; b3StackAlloc::endBlock rewinds relative to it
+///The StackAlloc class provides some fast stack-based memory allocator (LIFO last-in first-out)
+///Memory is handed out by advancing usedsize inside one buffer; beginBlock()/endBlock() bracket a
+///group of allocations so that endBlock() can release all of them at once.
+class b3StackAlloc
+	///Construct an allocator and immediately create a backing buffer of size bytes
+	b3StackAlloc(unsigned int size)	{ ctor();create(size); }
+	///Releases the buffer; asserts (in builds where b3Assert is active) that nothing is still allocated
+	~b3StackAlloc()		{ destroy(); }
+	///(Re)create the backing buffer with room for size bytes. The allocator must be unused (see destroy()).
+	inline void		create(unsigned int size)
+	{
+		destroy();
+		data		=  (unsigned char*) b3AlignedAlloc(size,16);
+		//report zero capacity when the allocation failed, so allocate() never hands out pointers derived from a null buffer
+		totalsize	=	data ? size : 0;
+	}
+	///Free the backing buffer. Does nothing (beyond asserting) while allocations are outstanding.
+	inline void		destroy()
+	{
+		b3Assert(usedsize==0);
+		//Raise(L"StackAlloc is still in use");
+		if(usedsize==0)
+		{
+			//a child allocator does not own its buffer, so only a non-child frees it
+			if(!ischild && data)		
+				b3AlignedFree(data);
+			data				=	0;
+			//capacity must drop with the buffer, otherwise getAvailableMemory()/allocate() would still see the old size
+			totalsize			=	0;
+			usedsize			=	0;
+		}
+	}
+	///Number of bytes between the current top of the stack and the end of the buffer
+	int	getAvailableMemory() const
+	{
+		return static_cast<int>(totalsize - usedsize);
+	}
+	///Reserve size bytes at the top of the stack.
+	///@return pointer to the reserved bytes, or 0 (after asserting) when the request does not fit.
+	///As in the original check, the request must be strictly smaller than the remaining space.
+	unsigned char*			allocate(unsigned int size)
+	{
+		//compare against the remaining capacity instead of computing usedsize+size:
+		//the unsigned sum can wrap around for large requests and wrongly pass the bounds check
+		if(usedsize<totalsize && size<(totalsize-usedsize))
+		{
+			unsigned char* const	result = data+usedsize;
+			usedsize += size;
+			return(result);
+		}
+		b3Assert(0);
+		//&& (L"Not enough memory"));
+		return(0);
+	}
+	///Open a new block: a b3Block header is allocated on the stack itself and becomes the current block.
+	///@return the new block header, or 0 when there is no room left for it.
+	B3_FORCE_INLINE b3Block*		beginBlock()
+	{
+		b3Block*	pb = (b3Block*)allocate(sizeof(b3Block));
+		//allocate() returns 0 when the buffer is exhausted; do not write through a null header
+		if(!pb)
+			return(0);
+		pb->previous	=	current;
+		pb->address		=	data+usedsize;
+		current			=	pb;
+		return(pb);
+	}
+	///Close block, releasing its header and everything allocated after beginBlock() returned it.
+	///Blocks must be closed in reverse order of opening; a mismatched block is ignored (after asserting).
+	B3_FORCE_INLINE void		endBlock(b3Block* block)
+	{
+		b3Assert(block==current);
+		//Raise(L"Unmatched blocks");
+		if(block==current)
+		{
+			current		=	block->previous;
+			//block->address points just past the header, so subtracting the header size rewinds to where it started
+			usedsize	=	(unsigned int)((block->address-data)-sizeof(b3Block));
+		}
+	}
+	///Put every member into the empty state; called by the constructor before create()
+	void		ctor()
+	{
+		data		=	0;
+		totalsize	=	0;
+		usedsize	=	0;
+		current		=	0;
+		ischild		=	false;
+	}
+	unsigned char*		data;		//backing buffer (0 when none)
+	unsigned int		totalsize;	//capacity of data in bytes
+	unsigned int		usedsize;	//bytes currently handed out from the start of data
+	b3Block*	current;	//innermost open block, 0 when none
+	bool		ischild;	//when true the buffer is not freed by destroy()
+#endif //B3_STACK_ALLOC
diff --git a/src/bullet/Bullet3Common/b3Transform.h b/src/bullet/Bullet3Common/b3Transform.h
new file mode 100644
index 00000000..fa480759
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3Transform.h
@@ -0,0 +1,304 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_TRANSFORM_H
+#define B3_TRANSFORM_H
+#include "b3Matrix3x3.h"
+#define b3TransformData b3TransformDoubleData // alias to the double-precision serialization struct
+#define b3TransformData b3TransformFloatData // alias to the single-precision serialization struct; NOTE(review): the guard choosing between the two aliases is not visible in this chunk
+/**@brief The b3Transform class supports rigid transforms with only translation and rotation and no scaling/shear.
+ *It can be used in combination with b3Vector3, b3Quaternion and b3Matrix3x3 linear algebra classes. */
+B3_ATTRIBUTE_ALIGNED16(class) b3Transform {
+  ///Storage for the rotation
+	b3Matrix3x3 m_basis;
+  ///Storage for the translation
+	b3Vector3   m_origin;
+  /**@brief No initialization constructor */
+	b3Transform() {}
+  /**@brief Constructor from b3Quaternion (optional b3Vector3 )
+   * @param q Rotation from quaternion 
+   * @param c Translation from Vector (default 0,0,0) */
+	explicit B3_FORCE_INLINE b3Transform(const b3Quaternion& q, 
+		const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0))) 
+		: m_basis(q),
+		m_origin(c)
+	{}
+  /**@brief Constructor from b3Matrix3x3 (optional b3Vector3)
+   * @param b Rotation from Matrix 
+   * @param c Translation from Vector default (0,0,0)*/
+	explicit B3_FORCE_INLINE b3Transform(const b3Matrix3x3& b, 
+		const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0)))
+		: m_basis(b),
+		m_origin(c)
+	{}
+  /**@brief Copy constructor */
+	B3_FORCE_INLINE b3Transform (const b3Transform& other)
+		: m_basis(other.m_basis),
+		m_origin(other.m_origin)
+	{
+	}
+  /**@brief Assignment Operator */
+	B3_FORCE_INLINE b3Transform& operator=(const b3Transform& other)
+	{
+		m_basis = other.m_basis;
+		m_origin = other.m_origin;
+		return *this;
+	}
+  /**@brief Set the current transform as the value of the product of two transforms
+   * @param t1 Transform 1
+   * @param t2 Transform 2
+   * This = Transform1 * Transform2 */
+		B3_FORCE_INLINE void mult(const b3Transform& t1, const b3Transform& t2) {
+			m_basis = t1.m_basis * t2.m_basis;
+			m_origin = t1(t2.m_origin); // t2's origin mapped through t1 (rotation plus translation)
+		}
+/*		void multInverseLeft(const b3Transform& t1, const b3Transform& t2) {
+			b3Vector3 v = t2.m_origin - t1.m_origin;
+			m_basis = b3MultTransposeLeft(t1.m_basis, t2.m_basis);
+			m_origin = v * t1.m_basis;
+		}
+		*/
+/**@brief Return the transform of the vector */
+	B3_FORCE_INLINE b3Vector3 operator()(const b3Vector3& x) const
+	{
+        return x.dot3(m_basis[0], m_basis[1], m_basis[2]) + m_origin; // x combined with the three basis rows via dot3, then translated by the origin
+	}
+  /**@brief Return the transform of the vector */
+	B3_FORCE_INLINE b3Vector3 operator*(const b3Vector3& x) const
+	{
+		return (*this)(x); // same as operator()
+	}
+  /**@brief Return the transform of the b3Quaternion */
+	B3_FORCE_INLINE b3Quaternion operator*(const b3Quaternion& q) const
+	{
+		return getRotation() * q; // only the rotation part is applied; the origin is not involved
+	}
+  /**@brief Return the basis matrix for the rotation */
+	B3_FORCE_INLINE b3Matrix3x3&       getBasis()          { return m_basis; }
+  /**@brief Return the basis matrix for the rotation */
+	B3_FORCE_INLINE const b3Matrix3x3& getBasis()    const { return m_basis; }
+  /**@brief Return the origin vector translation */
+	B3_FORCE_INLINE b3Vector3&         getOrigin()         { return m_origin; }
+  /**@brief Return the origin vector translation */
+	B3_FORCE_INLINE const b3Vector3&   getOrigin()   const { return m_origin; }
+  /**@brief Return a quaternion representing the rotation */
+	b3Quaternion getRotation() const { 
+		b3Quaternion q;
+		m_basis.getRotation(q); // the basis matrix fills q
+		return q;
+	}
+  /**@brief Set from an array 
+   * @param m A pointer to a 15 element array (12 rotation(row major padded on the right by 1), and 3 translation */
+	void setFromOpenGLMatrix(const b3Scalar *m)
+	{
+		m_basis.setFromOpenGLSubMatrix(m);
+		m_origin.setValue(m[12],m[13],m[14]); // translation is read from elements 12..14
+	}
+  /**@brief Fill an array representation
+   * @param m A pointer to a 16 element array (12 rotation(row major padded on the right by 1), 3 translation, and a final element that is set to 1) */
+	void getOpenGLMatrix(b3Scalar *m) const 
+	{
+		m_basis.getOpenGLSubMatrix(m);
+		m[12] = m_origin.getX();
+		m[13] = m_origin.getY();
+		m[14] = m_origin.getZ();
+		m[15] = b3Scalar(1.0); // index 15 is written, so m must have room for 16 elements
+	}
+  /**@brief Set the translational element
+   * @param origin The vector to set the translation to */
+	B3_FORCE_INLINE void setOrigin(const b3Vector3& origin) 
+	{ 
+		m_origin = origin;
+	}
+	B3_FORCE_INLINE b3Vector3 invXform(const b3Vector3& inVec) const; // defined out of line after the class
+  /**@brief Set the rotational element by b3Matrix3x3 */
+	B3_FORCE_INLINE void setBasis(const b3Matrix3x3& basis)
+	{ 
+		m_basis = basis;
+	}
+  /**@brief Set the rotational element by b3Quaternion */
+	B3_FORCE_INLINE void setRotation(const b3Quaternion& q)
+	{
+		m_basis.setRotation(q);
+	}
+  /**@brief Set this transformation to the identity */
+	void setIdentity()
+	{
+		m_basis.setIdentity();
+		m_origin.setValue(b3Scalar(0.0), b3Scalar(0.0), b3Scalar(0.0));
+	}
+  /**@brief Multiply this Transform by another(this = this * another) 
+   * @param t The other transform */
+	b3Transform& operator*=(const b3Transform& t) 
+	{
+		m_origin += m_basis * t.m_origin; // must run before m_basis is updated: it needs the old rotation
+		m_basis *= t.m_basis;
+		return *this;
+	}
+  /**@brief Return the inverse of this transform */
+	b3Transform inverse() const
+	{ 
+		b3Matrix3x3 inv = m_basis.transpose(); // the transpose stands in for the inverse, which holds only for an orthonormal (pure rotation) basis
+		return b3Transform(inv, inv * -m_origin);
+	}
+  /**@brief Return the inverse of this transform times the other transform
+   * @param t The other transform 
+   * return this.inverse() * the other */
+	b3Transform inverseTimes(const b3Transform& t) const;  
+  /**@brief Return the product of this transform and the other */
+	b3Transform operator*(const b3Transform& t) const;
+  /**@brief Return an identity transform */
+	static const b3Transform&	getIdentity()
+	{
+		static const b3Transform identityTransform(b3Matrix3x3::getIdentity()); // function-local static; the origin takes the constructor default (0,0,0)
+		return identityTransform;
+	}
+	void	serialize(struct	b3TransformData& dataOut) const; // the serialization members are defined out of line after the class
+	void	serializeFloat(struct	b3TransformFloatData& dataOut) const;
+	void	deSerialize(const struct	b3TransformData& dataIn);
+	void	deSerializeDouble(const struct	b3TransformDoubleData& dataIn);
+	void	deSerializeFloat(const struct	b3TransformFloatData& dataIn);
+B3_FORCE_INLINE b3Vector3
+b3Transform::invXform(const b3Vector3& inVec) const // applies the inverse transform to a point: undo the translation, then the rotation
+	b3Vector3 v = inVec - m_origin;
+	return (m_basis.transpose() * v); // transpose used as the inverse rotation
+B3_FORCE_INLINE b3Transform 
+b3Transform::inverseTimes(const b3Transform& t) const  // declared in the class as: this.inverse() * t
+	b3Vector3 v = t.getOrigin() - m_origin; // t's origin relative to this transform's origin
+		return b3Transform(m_basis.transposeTimes(t.m_basis),
+			v * m_basis); // NOTE(review): vector-times-matrix presumably multiplies by the transposed basis - confirm against b3Matrix3x3
+B3_FORCE_INLINE b3Transform 
+b3Transform::operator*(const b3Transform& t) const // composition: rotations are multiplied and t's origin is mapped through this transform
+	return b3Transform(m_basis * t.m_basis, 
+		(*this)(t.m_origin));
+/**@brief Test if two transforms have all elements equal */
+B3_FORCE_INLINE bool operator==(const b3Transform& t1, const b3Transform& t2)
+   return ( t1.getBasis()  == t2.getBasis() &&
+            t1.getOrigin() == t2.getOrigin() ); // delegates to the matrix and vector equality operators
+///for serialization
+struct	b3TransformFloatData // single-precision serialized form of b3Transform
+	b3Matrix3x3FloatData	m_basis; // rotation part
+	b3Vector3FloatData	m_origin; // translation part
+struct	b3TransformDoubleData // double-precision serialized form of b3Transform
+	b3Matrix3x3DoubleData	m_basis; // rotation part
+	b3Vector3DoubleData	m_origin; // translation part
+B3_FORCE_INLINE	void	b3Transform::serialize(b3TransformData& dataOut) const // each function below forwards to the matching call on m_basis and m_origin
+	m_basis.serialize(dataOut.m_basis);
+	m_origin.serialize(dataOut.m_origin);
+B3_FORCE_INLINE	void	b3Transform::serializeFloat(b3TransformFloatData& dataOut) const // always writes the float layout
+	m_basis.serializeFloat(dataOut.m_basis);
+	m_origin.serializeFloat(dataOut.m_origin);
+B3_FORCE_INLINE	void	b3Transform::deSerialize(const b3TransformData& dataIn) // reads the layout that b3TransformData is aliased to
+	m_basis.deSerialize(dataIn.m_basis);
+	m_origin.deSerialize(dataIn.m_origin);
+B3_FORCE_INLINE	void	b3Transform::deSerializeFloat(const b3TransformFloatData& dataIn) // always reads the float layout
+	m_basis.deSerializeFloat(dataIn.m_basis);
+	m_origin.deSerializeFloat(dataIn.m_origin);
+B3_FORCE_INLINE	void	b3Transform::deSerializeDouble(const b3TransformDoubleData& dataIn) // always reads the double layout
+	m_basis.deSerializeDouble(dataIn.m_basis);
+	m_origin.deSerializeDouble(dataIn.m_origin);
+#endif //B3_TRANSFORM_H
diff --git a/src/bullet/Bullet3Common/b3TransformUtil.h b/src/bullet/Bullet3Common/b3TransformUtil.h
new file mode 100644
index 00000000..6ce580c1
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3TransformUtil.h
@@ -0,0 +1,228 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3Transform.h"
+B3_FORCE_INLINE b3Vector3 b3AabbSupport(const b3Vector3& halfExtents,const b3Vector3& supportDir) // picks the box corner along supportDir: per axis, -halfExtent if the direction component is negative, +halfExtent otherwise
+	return b3MakeVector3(supportDir.getX() < b3Scalar(0.0) ? -halfExtents.getX() : halfExtents.getX(),
+      supportDir.getY() < b3Scalar(0.0) ? -halfExtents.getY() : halfExtents.getY(),
+      supportDir.getZ() < b3Scalar(0.0) ? -halfExtents.getZ() : halfExtents.getZ()); 
+/// Utils related to temporal transforms
+class b3TransformUtil
+	static void integrateTransform(const b3Transform& curTrans,const b3Vector3& linvel,const b3Vector3& angvel,b3Scalar timeStep,b3Transform& predictedTransform) // advances curTrans by timeStep using the given linear and angular velocity; result goes to predictedTransform
+	{
+		predictedTransform.setOrigin(curTrans.getOrigin() + linvel * timeStep); // position: one explicit Euler step
+		b3Quaternion predictedOrn = curTrans.getRotation(); // first variant: quaternion-derivative update; NOTE(review): the preprocessor condition opening this branch is not visible in this chunk
+		predictedOrn += (angvel * predictedOrn) * (timeStep * b3Scalar(0.5));
+		predictedOrn.normalize();
+	#else
+		//Exponential map
+		//google for "Practical Parameterization of Rotations Using the Exponential Map", F. Sebastian Grassia
+		b3Vector3 axis;
+		b3Scalar	fAngle = angvel.length(); 
+		//limit the angular motion
+		if (fAngle*timeStep > B3_ANGULAR_MOTION_THRESHOLD)
+		{
+			fAngle = B3_ANGULAR_MOTION_THRESHOLD / timeStep;
+		}
+		if ( fAngle < b3Scalar(0.001) )
+		{
+			// use the Taylor expansion of sin(0.5*fAngle*timeStep)/fAngle to avoid dividing by a tiny fAngle (0.0208333 = 1/48)
+			axis   = angvel*( b3Scalar(0.5)*timeStep-(timeStep*timeStep*timeStep)*(b3Scalar(0.020833333333))*fAngle*fAngle );
+		}
+		else
+		{
+			// direct form: sin(0.5*fAngle*timeStep)/fAngle
+			axis   = angvel*( b3Sin(b3Scalar(0.5)*fAngle*timeStep)/fAngle );
+		}
+		b3Quaternion dorn (axis.getX(),axis.getY(),axis.getZ(),b3Cos( fAngle*timeStep*b3Scalar(0.5) )); // incremental rotation for this step
+		b3Quaternion orn0 = curTrans.getRotation();
+		b3Quaternion predictedOrn = dorn * orn0; // increment applied on the left of the current orientation
+		predictedOrn.normalize();
+	#endif
+		predictedTransform.setRotation(predictedOrn);
+	}
+	static void	calculateVelocityQuaternion(const b3Vector3& pos0,const b3Vector3& pos1,const b3Quaternion& orn0,const b3Quaternion& orn1,b3Scalar timeStep,b3Vector3& linVel,b3Vector3& angVel) // finite-difference velocities between two poses; timeStep is used as a divisor and must be non-zero
+	{
+		linVel = (pos1 - pos0) / timeStep;
+		b3Vector3 axis;
+		b3Scalar  angle;
+		if (orn0 != orn1)
+		{
+			calculateDiffAxisAngleQuaternion(orn0,orn1,axis,angle);
+			angVel = axis * angle / timeStep;
+		} else
+		{
+			angVel.setValue(0,0,0); // identical orientations: no angular velocity
+		}
+	}
+	static void calculateDiffAxisAngleQuaternion(const b3Quaternion& orn0,const b3Quaternion& orn1a,b3Vector3& axis,b3Scalar& angle) // axis/angle of the rotation taking orn0 to orn1a
+	{
+		b3Quaternion orn1 = orn0.nearest(orn1a);
+		b3Quaternion dorn = orn1 * orn0.inverse(); // relative rotation
+		angle = dorn.getAngle();
+		axis = b3MakeVector3(dorn.getX(),dorn.getY(),dorn.getZ());
+		axis[3] = b3Scalar(0.); // clear the fourth component of the vector
+		//check for axis length
+		b3Scalar len = axis.length2();
+		if (len < B3_EPSILON*B3_EPSILON)
+			axis = b3MakeVector3(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.)); // degenerate axis: fall back to the x axis
+		else
+			axis /= b3Sqrt(len); // normalize
+	}
+	static void	calculateVelocity(const b3Transform& transform0,const b3Transform& transform1,b3Scalar timeStep,b3Vector3& linVel,b3Vector3& angVel) // finite-difference velocities between two transforms; timeStep is used as a divisor and must be non-zero
+	{
+		linVel = (transform1.getOrigin() - transform0.getOrigin()) / timeStep;
+		b3Vector3 axis;
+		b3Scalar  angle;
+		calculateDiffAxisAngle(transform0,transform1,axis,angle);
+		angVel = axis * angle / timeStep;
+	}
+	static void calculateDiffAxisAngle(const b3Transform& transform0,const b3Transform& transform1,b3Vector3& axis,b3Scalar& angle) // axis/angle of the rotation taking transform0's basis to transform1's basis
+	{
+		b3Matrix3x3 dmat = transform1.getBasis() * transform0.getBasis().inverse(); // relative rotation as a matrix
+		b3Quaternion dorn;
+		dmat.getRotation(dorn);
+		///floating point inaccuracy can lead to w component > 1..., which presumably breaks the angle extraction below, hence the normalize
+		dorn.normalize();
+		angle = dorn.getAngle();
+		axis = b3MakeVector3(dorn.getX(),dorn.getY(),dorn.getZ());
+		axis[3] = b3Scalar(0.); // clear the fourth component of the vector
+		//check for axis length
+		b3Scalar len = axis.length2();
+		if (len < B3_EPSILON*B3_EPSILON)
+			axis = b3MakeVector3(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.)); // degenerate axis: fall back to the x axis
+		else
+			axis /= b3Sqrt(len); // normalize
+	}
+///The b3ConvexSeparatingDistanceUtil can help speed up convex collision detection 
+///by conservatively updating a cached separating distance/vector instead of re-calculating the closest distance
+class	b3ConvexSeparatingDistanceUtil
+	b3Quaternion	m_ornA; // orientation of A at the last update/init
+	b3Quaternion	m_ornB; // orientation of B at the last update/init
+	b3Vector3	m_posA; // position of A at the last update/init
+	b3Vector3	m_posB; // position of B at the last update/init
+	b3Vector3	m_separatingNormal; // set by initSeparatingDistance when the distance is positive
+	b3Scalar	m_boundingRadiusA;
+	b3Scalar	m_boundingRadiusB;
+	b3Scalar	m_separatingDistance; // cached distance; only ever reduced by updateSeparatingDistance
+	b3ConvexSeparatingDistanceUtil(b3Scalar	boundingRadiusA,b3Scalar	boundingRadiusB) // the pose members are not set here; they are first written by initSeparatingDistance/updateSeparatingDistance
+		:m_boundingRadiusA(boundingRadiusA),
+		m_boundingRadiusB(boundingRadiusB),
+		m_separatingDistance(0.f)
+	{
+	}
+	b3Scalar	getConservativeSeparatingDistance() // returns the cached distance as last initialised/updated
+	{
+		return m_separatingDistance;
+	}
+	void	updateSeparatingDistance(const b3Transform& transA,const b3Transform& transB) // shrinks the cached distance by a bound on the motion since the previous call, then stores the new poses
+	{
+		const b3Vector3& toPosA = transA.getOrigin();
+		const b3Vector3& toPosB = transB.getOrigin();
+		b3Quaternion toOrnA = transA.getRotation();
+		b3Quaternion toOrnB = transB.getRotation();
+		if (m_separatingDistance>0.f)
+		{
+			b3Vector3 linVelA,angVelA,linVelB,angVelB;
+			b3TransformUtil::calculateVelocityQuaternion(m_posA,toPosA,m_ornA,toOrnA,b3Scalar(1.),linVelA,angVelA); // timeStep of 1 makes the velocities equal to the per-update displacement
+			b3TransformUtil::calculateVelocityQuaternion(m_posB,toPosB,m_ornB,toOrnB,b3Scalar(1.),linVelB,angVelB);
+			b3Scalar maxAngularProjectedVelocity = angVelA.length() * m_boundingRadiusA + angVelB.length() * m_boundingRadiusB; // rotation contribution bounded by angular speed times bounding radius
+			b3Vector3 relLinVel = (linVelB-linVelA);
+			b3Scalar relLinVelocLength = relLinVel.dot(m_separatingNormal); // relative linear motion along the separating normal
+			if (relLinVelocLength<0.f)
+			{
+				relLinVelocLength = 0.f; // negative projections are clamped so they never increase the cached distance
+			}
+			b3Scalar	projectedMotion = maxAngularProjectedVelocity +relLinVelocLength;
+			m_separatingDistance -= projectedMotion;
+		}
+		m_posA = toPosA;
+		m_posB = toPosB;
+		m_ornA = toOrnA;
+		m_ornB = toOrnB;
+	}
+	void	initSeparatingDistance(const b3Vector3& separatingVector,b3Scalar separatingDistance,const b3Transform& transA,const b3Transform& transB) // stores a freshly computed distance; normal and poses are recorded only when the distance is positive
+	{
+		m_separatingDistance = separatingDistance;
+		if (m_separatingDistance>0.f)
+		{
+			m_separatingNormal = separatingVector;
+			const b3Vector3& toPosA = transA.getOrigin();
+			const b3Vector3& toPosB = transB.getOrigin();
+			b3Quaternion toOrnA = transA.getRotation();
+			b3Quaternion toOrnB = transB.getRotation();
+			m_posA = toPosA;
+			m_posB = toPosB;
+			m_ornA = toOrnA;
+			m_ornB = toOrnB;
+		}
+	}
diff --git a/src/bullet/Bullet3Common/b3Vector3.cpp b/src/bullet/Bullet3Common/b3Vector3.cpp
new file mode 100644
index 00000000..5f5ac4ac
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3Vector3.cpp
@@ -0,0 +1,1631 @@
+ Copyright (c) 2011-213 Apple Inc. http://bulletphysics.org
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from the use of this software.
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+ 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+ This source version has been altered.
+ */
+#if defined (_WIN32) || defined (__i386__)
+#define B3_USE_SSE_IN_API   // NOTE(review): presumably consumed by b3Vector3.h, included next — confirm
+#include "b3Vector3.h"
+#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
+#ifdef __APPLE__
+#include <stdint.h>
+typedef  float float4 __attribute__ ((vector_size(16)));   // compiler vector extension: 16 bytes, i.e. four packed floats
+#define float4 __m128   // name float4 as the SSE intrinsic vector type
+//typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
+#if defined B3_USE_SSE || defined _WIN32
+#define LOG2_ARRAY_SIZE     6   // NOTE(review): presumably sizes the per-tile scratch array via STACK_ARRAY_COUNT (definition not visible here) — confirm
+#include <emmintrin.h>   // SSE2 intrinsics (e.g. _mm_shuffle_epi32 used below)
+long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );   // forward declaration of the function defined on the next line
+long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )   // Finds, among count vertices at vv, the one whose x,y,z dot product with vec is largest.
+    const float4 *vertices = (const float4*) vv;   // each vertex is read as one float4; NOTE(review): presumably vv must be 16-byte aligned (movaps is used below) — confirm
+    static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };   // maps a 4-bit compare mask to the index of its lowest set bit
+    float4 dotMax = b3Assign128( -B3_INFINITY,  -B3_INFINITY,  -B3_INFINITY,  -B3_INFINITY );
+    float4 vvec = _mm_loadu_ps( vec );   // unaligned load of four floats from vec; only x, y and z are used below
+    float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa ));          /// zzzz
+    float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
+    long maxIndex = -1L;
+    size_t segment = 0;
+    float4 stack_array[ STACK_ARRAY_COUNT ];   // four dot products per entry, kept so the position of the max can be searched afterwards
+#if DEBUG
+    // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+    size_t index;
+    float4 max;
+    // Faster loop without cleanup code for full tiles
+    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
+    {
+        max = dotMax;
+        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
+        }
+        // If we found a new max
+        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
+        {
+            // copy the new max across all lanes of our max accumulator
+            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
+            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
+            dotMax = max;
+            // find first occurrence of that max
+            size_t test;
+            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
+            {}
+            // record where it is.
+            maxIndex = 4*index + segment + indexTable[test];
+        }
+    }
+    // account for work we've already done
+    count -= segment;
+    // Deal with the last < STACK_ARRAY_COUNT vectors
+    max = dotMax;
+    index = 0;
+    if( b3Unlikely( count > 16) )
+    {
+        for( ; index + 4 <= count / 4; index+=4 )
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
+        }
+    }
+    size_t localCount = (count & -4L) - 4*index;   // vertices (a multiple of 4) not yet covered by the unrolled loop above
+    if( localCount )
+    {
+#ifdef __APPLE__
+        float4 t0, t1, t2, t3, t4;
+        float4 * sap = &stack_array[index + localCount / 4];
+          vertices += localCount;      // counter the offset
+         size_t byteIndex = -(localCount) * sizeof(float);   // negative offset stepped up to zero by the asm loop; scaled by 4 when addressing vertices, unscaled for sap
+        //AT&T Code style assembly
+        asm volatile
+        (   ".align 4                                                                   \n\
+             0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
+          movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
+          movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
+          movaps  %[t0], %[max]                               // vertices[0]      \n\
+          movlhps %[t1], %[max]                               // x0y0x1y1         \n\
+         movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
+         movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
+          mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
+         movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
+         movaps  %[t3], %[t0]                                // vertices[2]      \n\
+         movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
+         mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
+          movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
+          shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
+          mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
+         movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
+         shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
+         shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
+         addps   %[t3], %[max]                               // x + y            \n\
+         addps   %[t1], %[max]                               // x + y + z        \n\
+         movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
+         maxps   %[t2], %[max]                               // record max, restore max   \n\
+         add     $16, %[byteIndex]                           // advance loop counter\n\
+         jnz     0b                                          \n\
+     "
+         : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
+         : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
+         : "memory", "cc"
+         );
+        index += localCount/4;
+        {
+            for( unsigned int i=0; i<localCount/4; i++,index++)
+            { // do four dot products at a time. Carefully avoid touching the w element.
+                float4 v0 = vertices[0];
+                float4 v1 = vertices[1];
+                float4 v2 = vertices[2];
+                float4 v3 = vertices[3];
+                vertices += 4;
+                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+                lo0 = lo0*vLo;
+                lo1 = lo1*vLo;
+                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+                z = z*vHi;
+                x = x+y;
+                x = x+z;
+                stack_array[index] = x;
+                max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            }
+        }
+#endif //__APPLE__
+    }
+    // process the last few points
+    if( count & 3 )
+    {
+        float4 v0, v1, v2, x, y, z;
+        switch( count & 3 )
+        {
+            case 3:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                v2 = vertices[2];
+                // Calculate 3 dot products, transpose, duplicate v2
+                float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
+                float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
+                lo0 = lo0*vLo;
+                z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
+                z = z*vHi;
+                float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
+                lo1 = lo1*vLo;
+                x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            }
+                break;
+            case 2:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                float4 xy = _mm_movelh_ps(v0, v1);
+                z = _mm_movehl_ps(v1, v0);
+                xy = xy*vLo;
+                z = _mm_shuffle_ps( z, z,  0xa8);
+                x = _mm_shuffle_ps( xy, xy, 0xa8);
+                y = _mm_shuffle_ps( xy, xy, 0xfd);
+                z = z*vHi;
+            }
+                break;
+            case 1:
+            {
+                float4 xy = vertices[0];
+                z =  _mm_shuffle_ps( xy, xy, 0xaa);
+                xy = xy*vLo;
+                z = z*vHi;
+                x = _mm_shuffle_ps(xy, xy, 0);
+                y = _mm_shuffle_ps(xy, xy, 0x55);
+            }
+                break;
+        }
+        x = x+y;
+        x = x+z;
+        stack_array[index] = x;
+        max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+        index++;
+    }
+    // if we found a new max. NOTE(review): this branch is also taken whenever segment == 0, so count == 0 would scan a stack_array that was never written — confirm callers never pass 0.
+    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
+    { // we found a new max. Search for it
+      // find max across the max vector, place in all elements of max -- big latency hit here
+        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
+        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
+        // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+        // this where it actually makes a difference is handled in the early out at the top of the function,
+        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
+        // complexity, and removed it.
+        dotMax = max;
+        // scan for the first occurrence of max in the array
+        size_t test;
+        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
+        {}
+        maxIndex = 4*index + segment + indexTable[test];
+    }
+    _mm_store_ss( dotResult, dotMax);   // writes lane 0 of dotMax to *dotResult
+    return maxIndex;   // index of the recorded maximum vertex (initialised to -1 above)
+long b3_mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
+long b3_mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
+    const float4 *vertices = (const float4*) vv;
+    static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
+    float4 dotmin = b3Assign128( B3_INFINITY,  B3_INFINITY,  B3_INFINITY,  B3_INFINITY );
+    float4 vvec = _mm_loadu_ps( vec );
+    float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa ));          /// zzzz
+    float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
+    long minIndex = -1L;
+    size_t segment = 0;
+    float4 stack_array[ STACK_ARRAY_COUNT ];
+#if DEBUG
+    // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+    size_t index;
+    float4 min;
+    // Faster loop without cleanup code for full tiles
+    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
+    {
+        min = dotmin;
+        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
+        }
+        // If we found a new min
+        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
+        {
+            // copy the new min across all lanes of our min accumulator
+            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
+            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
+            dotmin = min;
+            // find first occurrence of that min
+            size_t test;
+            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
+            {}
+            // record where it is.
+            minIndex = 4*index + segment + indexTable[test];
+        }
+    }
+    // account for work we've already done
+    count -= segment;
+    // Deal with the last < STACK_ARRAY_COUNT vectors
+    min = dotmin;
+    index = 0;
+    if(b3Unlikely( count > 16) )
+    {
+        for( ; index + 4 <= count / 4; index+=4 )
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
+        }
+    }
+    size_t localCount = (count & -4L) - 4*index;
+    if( localCount )
+    {
+#ifdef __APPLE__
+        vertices += localCount;      // counter the offset
+        float4 t0, t1, t2, t3, t4;
+        size_t byteIndex = -(localCount) * sizeof(float);
+        float4 * sap = &stack_array[index + localCount / 4];
+        asm volatile
+        (   ".align 4                                                                   \n\
+             0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
+             movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
+             movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
+             movaps  %[t0], %[min]                               // vertices[0]      \n\
+             movlhps %[t1], %[min]                               // x0y0x1y1         \n\
+             movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
+             movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
+             mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
+             movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
+             movaps  %[t3], %[t0]                                // vertices[2]      \n\
+             movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
+             movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
+             mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
+             shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
+             mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
+             movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
+             shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
+             shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
+             addps   %[t3], %[min]                               // x + y            \n\
+             addps   %[t1], %[min]                               // x + y + z        \n\
+             movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
+             minps   %[t2], %[min]                               // record min, restore min   \n\
+             add     $16, %[byteIndex]                           // advance loop counter\n\
+             jnz     0b                                          \n\
+             "
+         : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
+         : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
+         : "memory", "cc"
+         );
+        index += localCount/4;
+        {
+            for( unsigned int i=0; i<localCount/4; i++,index++)
+            { // do four dot products at a time. Carefully avoid touching the w element.
+                float4 v0 = vertices[0];
+                float4 v1 = vertices[1];
+                float4 v2 = vertices[2];
+                float4 v3 = vertices[3];
+                vertices += 4;
+                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+                lo0 = lo0*vLo;
+                lo1 = lo1*vLo;
+                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+                z = z*vHi;
+                x = x+y;
+                x = x+z;
+                stack_array[index] = x;
+                min = _mm_min_ps( x, min );         // control the order here so that max is never NaN even if x is nan
+            }
+        }
+    }
+    // process the last few points
+    if( count & 3 )
+    {
+        float4 v0, v1, v2, x, y, z;
+        switch( count & 3 )
+        {
+            case 3:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                v2 = vertices[2];
+                // Calculate 3 dot products, transpose, duplicate v2
+                float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
+                float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
+                lo0 = lo0*vLo;
+                z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
+                z = z*vHi;
+                float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
+                lo1 = lo1*vLo;
+                x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            }
+                break;
+            case 2:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                float4 xy = _mm_movelh_ps(v0, v1);
+                z = _mm_movehl_ps(v1, v0);
+                xy = xy*vLo;
+                z = _mm_shuffle_ps( z, z,  0xa8);
+                x = _mm_shuffle_ps( xy, xy, 0xa8);
+                y = _mm_shuffle_ps( xy, xy, 0xfd);
+                z = z*vHi;
+            }
+                break;
+            case 1:
+            {
+                float4 xy = vertices[0];
+                z =  _mm_shuffle_ps( xy, xy, 0xaa);
+                xy = xy*vLo;
+                z = z*vHi;
+                x = _mm_shuffle_ps(xy, xy, 0);
+                y = _mm_shuffle_ps(xy, xy, 0x55);
+            }
+                break;
+        }
+        x = x+y;
+        x = x+z;
+        stack_array[index] = x;
+        min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+        index++;
+    }
+    // if we found a new min.
+    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
+    { // we found a new min. Search for it
+      // find min across the min vector, place in all elements of min -- big latency hit here
+        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
+        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
+        // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+        // this where it actually makes a difference is handled in the early out at the top of the function,
+        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
+        // complexity, and removed it.
+        dotmin = min;
+        // scan for the first occurrence of min in the array
+        size_t test;
+        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
+        {}
+        minIndex = 4*index + segment + indexTable[test];
+    }
+    _mm_store_ss( dotResult, dotmin);
+    return minIndex;
+#elif defined B3_USE_NEON
+#include <arm_neon.h>
+static long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );   // max-dot kernel built on 64-bit (float32x2_t) NEON operations
+static long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );   // max-dot kernel built on 128-bit (float32x4_t) NEON operations
+static long b3_maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );  // first-call selector that rebinds b3_maxdot_large to v0 or v1
+static long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );   // min-dot kernel built on 64-bit (float32x2_t) NEON operations
+static long b3_mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );   // min-dot kernel built on 128-bit (float32x4_t) NEON operations
+static long b3_mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );  // first-call selector that rebinds b3_mindot_large to v0 or v1
+long (*b3_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = b3_maxdot_large_sel;   // dispatch pointer; initially the selector, which replaces it with the chosen kernel
+long (*b3_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = b3_mindot_large_sel;   // dispatch pointer; initially the selector, which replaces it with the chosen kernel
+extern "C" {int  _get_cpu_capabilities( void );}   // defined outside this file; NOTE(review): presumably a platform routine returning a CPU feature bit mask -- confirm
+// First-call selector for the b3_maxdot_large dispatch pointer.
+// Tests bit 0x2000 of _get_cpu_capabilities(): when set, binds b3_maxdot_large to the
+// 128-bit kernel (b3_maxdot_large_v1), otherwise to the 64-bit kernel (b3_maxdot_large_v0).
+// The pointer is overwritten so later calls bypass this selector, and the current call is
+// forwarded to the chosen kernel. Parameters and return value are those of the kernels.
+// NOTE(review): the meaning of capability bit 0x2000 is not defined in this file -- confirm.
+static long b3_maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    // The kernels are declared with the b3_ prefix; the unprefixed names are not declared anywhere.
+    if( _get_cpu_capabilities() & 0x2000 )
+        b3_maxdot_large = b3_maxdot_large_v1;
+    else
+        b3_maxdot_large = b3_maxdot_large_v0;
+    return b3_maxdot_large(vv, vec, count, dotResult);
+}
+// First-call selector for the b3_mindot_large dispatch pointer.
+// Tests bit 0x2000 of _get_cpu_capabilities(): when set, binds b3_mindot_large to the
+// 128-bit kernel (b3_mindot_large_v1), otherwise to the 64-bit kernel (b3_mindot_large_v0).
+// The pointer is overwritten so later calls bypass this selector, and the current call is
+// forwarded to the chosen kernel. Parameters and return value are those of the kernels.
+// NOTE(review): the meaning of capability bit 0x2000 is not defined in this file -- confirm.
+static long b3_mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    // The kernels are declared with the b3_ prefix; the unprefixed names are not declared anywhere.
+    if( _get_cpu_capabilities() & 0x2000 )
+        b3_mindot_large = b3_mindot_large_v1;
+    else
+        b3_mindot_large = b3_mindot_large_v0;
+    return b3_mindot_large(vv, vec, count, dotResult);
+}
+#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32  {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; }) // statement-expression yielding one float32x4_t loaded from _ptr with a 128-bit alignment hint; the "!" writeback advances _ptr (an in/out operand, so it must be a modifiable lvalue) past the loaded data
+// Finds the vertex with the largest dot product against vec, using 64-bit NEON operations.
+//   vv        - vertex array, four floats per vertex; only the first three (x, y, z) enter the
+//               dot product. Read with 128-bit aligned loads.
+//   vec       - direction vector, read as four floats with a 128-bit aligned load; only x, y, z are used.
+//   count     - number of vertices in vv.
+//   dotResult - receives the largest dot product found.
+// Returns the index of the winning vertex. Candidates are compared with a strict "greater than",
+// per lane, so on exact ties the returned index depends on lane order. With count == 0 nothing
+// is compared: *dotResult is -B3_INFINITY and the index keeps its initial all-ones value.
+static long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    unsigned long i = 0;
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
+    float32x2_t vLo = vget_low_f32(vvec);                       // (vec.x, vec.y)
+    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);    // (vec.z, vec.z)
+    // Running maxima: the Lo pair tracks vertices 0 and 1 of each group of four, the Hi pair vertices 2 and 3.
+    float32x2_t dotMaxLo = (float32x2_t) { -B3_INFINITY, -B3_INFINITY };
+    float32x2_t dotMaxHi = (float32x2_t) { -B3_INFINITY, -B3_INFINITY };
+    // Indices of the vertices currently held in each lane.
+    uint32x2_t indexLo = (uint32x2_t) {0, 1};
+    uint32x2_t indexHi = (uint32x2_t) {2, 3};
+    // Indices of the running maxima. All-ones marks "nothing selected yet"; written as an unsigned
+    // literal because -1 in a braced initializer is a narrowing conversion in C++11.
+    uint32x2_t iLo = (uint32x2_t) {0xFFFFFFFFu, 0xFFFFFFFFu};
+    uint32x2_t iHi = (uint32x2_t) {0xFFFFFFFFu, 0xFFFFFFFFu};
+    const uint32x2_t four = (uint32x2_t) {4,4};
+    // Main loop: eight vertices per iteration, as two identical groups of four.
+    for( ; i+8 <= count; i+= 8 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        // x*vec.x and y*vec.y for each vertex
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        // transpose the high halves; val[0] gathers the z components, the w components are dropped
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        // pairwise add gives x*vec.x + y*vec.y per vertex; then add z*vec.z
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        // keep the larger value and its index in every lane
+        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
+        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+        // second group of four
+        v0 = vld1q_f32_aligned_postincrement( vv );
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        zLo = vmul_f32( z0.val[0], vHi);
+        zHi = vmul_f32( z1.val[0], vHi);
+        rLo = vpadd_f32( xy0, xy1);
+        rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        maskLo = vcgt_f32( rLo, dotMaxLo );
+        maskHi = vcgt_f32( rHi, dotMaxHi );
+        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+    // At most one further complete group of four.
+    for( ; i+4 <= count; i+= 4 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
+        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+    // Remaining 1..3 vertices. Unused lanes are filled by duplicating the last vertex; a duplicate
+    // can never beat its original under the strict comparison in the final reduction.
+    switch( count & 3 )
+    {
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            float32x2_t rHi = vpadd_f32( xy2, xy2);
+            rLo = vadd_f32(rLo, zLo);
+            rHi = vadd_f32(rHi, zHi);
+            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+            uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
+            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+            dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+            iHi = vbsl_u32(maskHi, indexHi, iHi);
+        }
+            break;
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            rLo = vadd_f32(rLo, zLo);
+            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+            float32x2_t zLo = vmul_f32( z0, vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy0);
+            rLo = vadd_f32(rLo, zLo);
+            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+        default:
+            break;
+    }
+    // select best answer between hi and lo results
+    uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
+    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    // select best answer between even and odd results
+    dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
+    iHi = vdup_lane_u32(iLo, 1);
+    mask = vcgt_f32( dotMaxHi, dotMaxLo );
+    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    // lane 0 now holds the overall maximum and its index
+    *dotResult = vget_lane_f32( dotMaxLo, 0);
+    return vget_lane_u32(iLo, 0);
+}
+// Finds the vertex with the largest dot product against vec, using 128-bit NEON operations.
+//   vv        - vertex array, four floats per vertex; only the first three (x, y, z) enter the
+//               dot product. Read with 128-bit aligned loads.
+//   vec       - direction vector, read as four floats with a 128-bit aligned load; only x, y, z are used.
+//   count     - number of vertices in vv.
+//   dotResult - receives the largest dot product found.
+// Returns the index of the winning vertex. Candidates are compared with a strict "greater than",
+// per lane, so on exact ties the returned index depends on lane order. With count == 0 nothing
+// is compared: *dotResult is -B3_INFINITY and the index keeps its initial all-ones value.
+static long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
+    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));   // (x, y, x, y) of vec
+    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);                 // vec.z in all four lanes
+    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
+    // indices of the four vertices currently being evaluated, one per lane
+    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
+    // Indices of the per-lane maxima. All-ones marks "nothing selected yet"; written as an unsigned
+    // literal because -1 in a braced initializer is a narrowing conversion in C++11.
+    uint32x4_t index = (uint32x4_t) { 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu };
+    float32x4_t maxDot = (float32x4_t) { -B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY };
+    unsigned long i = 0;
+    // Main loop: eight vertices per iteration, as two identical groups of four.
+    for( ; i + 8 <= count; i += 8 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        // de-interleave: val[0] of zb gathers the four z components (the w components land in val[1])
+        float32x4x2_t zb = vuzpq_f32( z0, z1);
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        // de-interleave the products into x*vec.x (val[0]) and y*vec.y (val[1]), then sum
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        // keep the larger value and its index in every lane
+        uint32x4_t mask = vcgtq_f32(x, maxDot);
+        maxDot = vbslq_f32( mask, x, maxDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+        // second group of four
+        v0 = vld1q_f32_aligned_postincrement( vv );
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        zb = vuzpq_f32( z0, z1);
+        z = vmulq_f32( zb.val[0], vHi);
+        xy = vuzpq_f32( xy0, xy1);
+        x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        mask = vcgtq_f32(x, maxDot);
+        maxDot = vbslq_f32( mask, x, maxDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    // At most one further complete group of four.
+    for( ; i + 4 <= count; i += 4 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        float32x4x2_t zb = vuzpq_f32( z0, z1);
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        uint32x4_t mask = vcgtq_f32(x, maxDot);
+        maxDot = vbslq_f32( mask, x, maxDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    // Remaining 1..3 vertices. Unused lanes are filled with duplicates of real vertices; a duplicate
+    // can never beat its original under the strict comparison in the final reduction.
+    switch (count & 3) {
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
+            xy0 = vmulq_f32(xy0, vLo);
+            xy1 = vmulq_f32(xy1, vLo);
+            float32x4x2_t zb = vuzpq_f32( z0, z1);
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcgtq_f32(x, maxDot);
+            maxDot = vbslq_f32( mask, x, maxDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            // gather the xy halves of both vertices into one register
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            // gather the zw halves of both vertices into one register
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            xy0 = vmulq_f32(xy0, vLo);
+            float32x4x2_t zb = vuzpq_f32( z0, z0);
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcgtq_f32(x, maxDot);
+            maxDot = vbslq_f32( mask, x, maxDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            // duplicate the xy half of the single vertex into both halves
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
+            // broadcast the z component to all lanes
+            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
+            xy0 = vmulq_f32(xy0, vLo);
+            z = vmulq_f32( z, vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcgtq_f32(x, maxDot);
+            maxDot = vbslq_f32( mask, x, maxDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        default:
+            break;
+    }
+    // select best answer between hi and lo results
+    uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
+    float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
+    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+    // select best answer between even and odd results
+    float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
+    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+    mask = vcgt_f32( maxDotO, maxDot2 );
+    maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
+    index2 = vbsl_u32(mask, indexHi, index2);
+    // lane 0 now holds the overall maximum and its index
+    *dotResult = vget_lane_f32( maxDot2, 0);
+    return vget_lane_u32(index2, 0);
+}
+// Finds the vertex with the smallest dot product against vec, using 64-bit NEON operations.
+//   vv        - vertex array, four floats per vertex; only the first three (x, y, z) enter the
+//               dot product. Read with 128-bit aligned loads.
+//   vec       - direction vector, read as four floats with a 128-bit aligned load; only x, y, z are used.
+//   count     - number of vertices in vv.
+//   dotResult - receives the smallest dot product found.
+// Returns the index of the winning vertex. Candidates are compared with a strict "less than",
+// per lane, so on exact ties the returned index depends on lane order. With count == 0 nothing
+// is compared: *dotResult is B3_INFINITY and the index keeps its initial all-ones value.
+static long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    unsigned long i = 0;
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
+    float32x2_t vLo = vget_low_f32(vvec);                       // (vec.x, vec.y)
+    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);    // (vec.z, vec.z)
+    // Running minima: the Lo pair tracks vertices 0 and 1 of each group of four, the Hi pair vertices 2 and 3.
+    float32x2_t dotMinLo = (float32x2_t) { B3_INFINITY, B3_INFINITY };
+    float32x2_t dotMinHi = (float32x2_t) { B3_INFINITY, B3_INFINITY };
+    // Indices of the vertices currently held in each lane.
+    uint32x2_t indexLo = (uint32x2_t) {0, 1};
+    uint32x2_t indexHi = (uint32x2_t) {2, 3};
+    // Indices of the running minima. All-ones marks "nothing selected yet"; written as an unsigned
+    // literal because -1 in a braced initializer is a narrowing conversion in C++11.
+    uint32x2_t iLo = (uint32x2_t) {0xFFFFFFFFu, 0xFFFFFFFFu};
+    uint32x2_t iHi = (uint32x2_t) {0xFFFFFFFFu, 0xFFFFFFFFu};
+    const uint32x2_t four = (uint32x2_t) {4,4};
+    // Main loop: eight vertices per iteration, as two identical groups of four.
+    for( ; i+8 <= count; i+= 8 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        // x*vec.x and y*vec.y for each vertex
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        // transpose the high halves; val[0] gathers the z components, the w components are dropped
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        // pairwise add gives x*vec.x + y*vec.y per vertex; then add z*vec.z
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        // keep the smaller value and its index in every lane
+        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
+        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+        // second group of four
+        v0 = vld1q_f32_aligned_postincrement( vv );
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        zLo = vmul_f32( z0.val[0], vHi);
+        zHi = vmul_f32( z1.val[0], vHi);
+        rLo = vpadd_f32( xy0, xy1);
+        rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        maskLo = vclt_f32( rLo, dotMinLo );
+        maskHi = vclt_f32( rHi, dotMinHi );
+        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+    // At most one further complete group of four.
+    for( ; i+4 <= count; i+= 4 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
+        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+    // Remaining 1..3 vertices. Unused lanes are filled by duplicating the last vertex; a duplicate
+    // can never beat its original under the strict comparison in the final reduction.
+    switch( count & 3 )
+    {
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            float32x2_t rHi = vpadd_f32( xy2, xy2);
+            rLo = vadd_f32(rLo, zLo);
+            rHi = vadd_f32(rHi, zHi);
+            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+            uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
+            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+            dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+            iHi = vbsl_u32(maskHi, indexHi, iHi);
+        }
+            break;
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            rLo = vadd_f32(rLo, zLo);
+            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+            float32x2_t zLo = vmul_f32( z0, vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy0);
+            rLo = vadd_f32(rLo, zLo);
+            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+        default:
+            break;
+    }
+    // select best answer between hi and lo results
+    uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
+    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    // select best answer between even and odd results
+    dotMinHi = vdup_lane_f32(dotMinLo, 1);
+    iHi = vdup_lane_u32(iLo, 1);
+    mask = vclt_f32( dotMinHi, dotMinLo );
+    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    // lane 0 now holds the overall minimum and its index
+    *dotResult = vget_lane_f32( dotMinLo, 0);
+    return vget_lane_u32(iLo, 0);
+}
+// Finds the vertex with the smallest dot product against vec, using 128-bit NEON operations.
+//   vv        - vertex array, four floats per vertex; only the first three (x, y, z) enter the
+//               dot product. Read with 128-bit aligned loads.
+//   vec       - direction vector, read as four floats with a 128-bit aligned load; only x, y, z are used.
+//   count     - number of vertices in vv.
+//   dotResult - receives the smallest dot product found.
+// Returns the index of the winning vertex. Candidates are compared with a strict "less than",
+// per lane, so on exact ties the returned index depends on lane order. With count == 0 nothing
+// is compared: *dotResult is B3_INFINITY and the index keeps its initial all-ones value.
+static long b3_mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
+    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));   // (x, y, x, y) of vec
+    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);                 // vec.z in all four lanes
+    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
+    // indices of the four vertices currently being evaluated, one per lane
+    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
+    // Indices of the per-lane minima. All-ones marks "nothing selected yet"; written as an unsigned
+    // literal because -1 in a braced initializer is a narrowing conversion in C++11.
+    uint32x4_t index = (uint32x4_t) { 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu };
+    float32x4_t minDot = (float32x4_t) { B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY };
+    unsigned long i = 0;
+    // Main loop: eight vertices per iteration, as two identical groups of four.
+    for( ; i + 8 <= count; i += 8 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        // de-interleave: val[0] of zb gathers the four z components (the w components land in val[1])
+        float32x4x2_t zb = vuzpq_f32( z0, z1);
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        // de-interleave the products into x*vec.x (val[0]) and y*vec.y (val[1]), then sum
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        // keep the smaller value and its index in every lane
+        uint32x4_t mask = vcltq_f32(x, minDot);
+        minDot = vbslq_f32( mask, x, minDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+        // second group of four
+        v0 = vld1q_f32_aligned_postincrement( vv );
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        zb = vuzpq_f32( z0, z1);
+        z = vmulq_f32( zb.val[0], vHi);
+        xy = vuzpq_f32( xy0, xy1);
+        x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        mask = vcltq_f32(x, minDot);
+        minDot = vbslq_f32( mask, x, minDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    // At most one further complete group of four.
+    for( ; i + 4 <= count; i += 4 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        float32x4x2_t zb = vuzpq_f32( z0, z1);
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        uint32x4_t mask = vcltq_f32(x, minDot);
+        minDot = vbslq_f32( mask, x, minDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    // Remaining 1..3 vertices. Unused lanes are filled with duplicates of real vertices; a duplicate
+    // can never beat its original under the strict comparison in the final reduction.
+    switch (count & 3) {
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
+            xy0 = vmulq_f32(xy0, vLo);
+            xy1 = vmulq_f32(xy1, vLo);
+            float32x4x2_t zb = vuzpq_f32( z0, z1);
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcltq_f32(x, minDot);
+            minDot = vbslq_f32( mask, x, minDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            // gather the xy halves of both vertices into one register
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            // gather the zw halves of both vertices into one register
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            xy0 = vmulq_f32(xy0, vLo);
+            float32x4x2_t zb = vuzpq_f32( z0, z0);
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcltq_f32(x, minDot);
+            minDot = vbslq_f32( mask, x, minDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            // duplicate the xy half of the single vertex into both halves
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
+            // broadcast the z component to all lanes
+            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
+            xy0 = vmulq_f32(xy0, vLo);
+            z = vmulq_f32( z, vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcltq_f32(x, minDot);
+            minDot = vbslq_f32( mask, x, minDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        default:
+            break;
+    }
+    // select best answer between hi and lo results
+    uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
+    float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
+    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+    // select best answer between even and odd results
+    float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
+    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+    mask = vclt_f32( minDotO, minDot2 );
+    minDot2 = vbsl_f32(mask, minDotO, minDot2);
+    index2 = vbsl_u32(mask, indexHi, index2);
+    // lane 0 now holds the overall minimum and its index
+    *dotResult = vget_lane_f32( minDot2, 0);
+    return vget_lane_u32(index2, 0);
+}
+    #error Unhandled __APPLE__ arch
+#endif  /* __APPLE__ */
diff --git a/src/bullet/Bullet3Common/b3Vector3.h b/src/bullet/Bullet3Common/b3Vector3.h
new file mode 100644
index 00000000..f6919934
--- /dev/null
+++ b/src/bullet/Bullet3Common/b3Vector3.h
@@ -0,0 +1,1343 @@
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_VECTOR3_H
+#define B3_VECTOR3_H
+//#include <stdint.h>
+#include "b3Scalar.h"
+#include "b3MinMax.h"
+#include "b3AlignedAllocator.h"
+#define b3Vector3Data b3Vector3DoubleData // NOTE(review): the Double and Float mappings both appear here with no visible #if/#else; presumably a precision switch selects one - confirm
+#define b3Vector3DataName "b3Vector3DoubleData"
+#define b3Vector3Data b3Vector3FloatData
+#define b3Vector3DataName "b3Vector3FloatData"
+#if defined B3_USE_SSE
+//typedef  uint32_t __m128i __attribute__ ((vector_size(16)));
+#ifdef _MSC_VER
+#pragma warning(disable: 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
+#define B3_SHUFFLE(x,y,z,w) ((w)<<6 | (z)<<4 | (y)<<2 | (x)) // shuffle immediate: x in bits 0-1, y in bits 2-3, z in bits 4-5, w in bits 6-7
+//#define b3_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) )
+#define b3_pshufd_ps( _a, _mask ) _mm_shuffle_ps((_a), (_a), (_mask) ) // permutes the lanes of one register by shuffling it with itself
+#define b3_splat3_ps( _a, _i ) b3_pshufd_ps((_a), B3_SHUFFLE(_i,_i,_i, 3) ) // lane _i copied into x, y and z; w keeps lane 3
+#define b3_splat_ps( _a, _i )  b3_pshufd_ps((_a), B3_SHUFFLE(_i,_i,_i,_i) ) // lane _i copied into all four lanes
+#define b3v3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)) // _mm_set_epi32 lists the highest lane first: sign bit cleared for x, y, z; w lane all zero
+#define b3vAbsMask (_mm_set_epi32( 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)) // sign bit cleared in all four lanes
+#define b3vFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)) // keeps x, y, z and zeroes w when ANDed
+#define b3v3AbsfMask b3CastiTo128f(b3v3AbsiMask) // float-typed view of b3v3AbsiMask
+#define b3vFFF0fMask b3CastiTo128f(b3vFFF0Mask) // float-typed view of b3vFFF0Mask
+#define b3vxyzMaskf b3vFFF0fMask // alias of b3vFFF0fMask
+#define b3vAbsfMask b3CastiTo128f(b3vAbsMask) // float-typed view of b3vAbsMask
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3vMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f}; // -0.0f in every lane; XORed with a vector elsewhere in this file to negate it
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1110) = {1.0f, 1.0f, 1.0f, 0.0f}; // (1, 1, 1, 0)
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3vHalf) = {0.5f, 0.5f, 0.5f, 0.5f}; // 0.5 in every lane; used by the Newton-Raphson step in normalize()
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1_5)  = {1.5f, 1.5f, 1.5f, 1.5f}; // 1.5 in every lane; used by the Newton-Raphson step in normalize()
+#ifdef B3_USE_NEON
+const float32x4_t B3_ATTRIBUTE_ALIGNED16(b3vMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f}; // NEON counterpart of the SSE b3vMzeroMask
+const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3vFFF0Mask) = (int32x4_t){0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0}; // all bits set for x, y, z; w lane zero
+const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3vAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; // sign bit cleared in all four lanes
+const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3v3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0}; // sign bit cleared for x, y, z; w lane zero
+class b3Vector3; // forward declaration so the factory prototypes below can name the type
+class b3Vector4; // forward declaration so the factory prototypes below can name the type
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+//#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
+inline b3Vector3 b3MakeVector3( b3SimdFloat4 v); // builds a b3Vector3 from a SIMD register; NOTE(review): the #endif closing this guard is not visible in this view - confirm where it ends
+inline b3Vector4 b3MakeVector4( b3SimdFloat4 vec); // builds a b3Vector4 from a SIMD register
+inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z); // builds a b3Vector3 from three scalars
+inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z, b3Scalar w); // four-scalar overload that still returns a b3Vector3
+inline b3Vector4 b3MakeVector4(b3Scalar x,b3Scalar y,b3Scalar z,b3Scalar w); // builds a b3Vector4 from four scalars
+/**@brief b3Vector3 can be used to represent 3D points and vectors.
+ * It has an unused w component to suit 16-byte alignment when b3Vector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by the user.
+ * Ideally, this class should be replaced by a platform optimized SIMD version that keeps the data in registers.
+ * The storage is an anonymous union, so the same four floats can be reached as m_floats[i], as x/y/z/w and (in SIMD builds) as mVec128.
+ */
+B3_ATTRIBUTE_ALIGNED16(class) b3Vector3
+#if defined (B3_USE_SSE) || defined(B3_USE_NEON) // _WIN32 || ARM
+        union {
+            b3SimdFloat4      mVec128; // whole vector as one SIMD register
+            float	m_floats[4]; // indexed view: [0]=x, [1]=y, [2]=z, [3]=w
+			struct {float x,y,z,w;}; // named view of the same four floats
+        };
+	union // NOTE(review): presumably the non-SIMD alternative of the union above; the separating #else/#endif are not visible in this view - confirm
+	{
+        	float	m_floats[4]; // indexed view: [0]=x, [1]=y, [2]=z, [3]=w
+			struct {float	x,y,z,w;}; // named view of the same four floats
+	};
+#if defined (B3_USE_SSE) || defined(B3_USE_NEON) // _WIN32 || ARM
+	/*B3_FORCE_INLINE		b3Vector3()
+	{
+	}
+	*/
+    B3_FORCE_INLINE	b3SimdFloat4	get128() const // returns the vector's SIMD register by value
+    {
+        return mVec128;
+    }
+    B3_FORCE_INLINE	void	set128(b3SimdFloat4 v128) // overwrites all four lanes (w included) with v128
+    {
+        mVec128 = v128;
+    }
+	public:
+/**@brief Add a vector to this one, in place.
+ * The SSE and NEON statements add all four lanes (w included); the scalar
+ * statements add only x, y and z.
+ * @param v The vector to add to this one
+ * @return Reference to this vector */
+	B3_FORCE_INLINE b3Vector3& operator+=(const b3Vector3& v)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		mVec128 = _mm_add_ps(mVec128, v.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vaddq_f32(mVec128, v.mVec128);
+		m_floats[0] += v.m_floats[0]; // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+		m_floats[1] += v.m_floats[1];
+		m_floats[2] += v.m_floats[2];
+		return *this;
+	}
+  /**@brief Subtract a vector from this one, in place.
+   * The SSE and NEON statements subtract all four lanes (w included); the
+   * scalar statements subtract only x, y and z.
+   * @param v The vector to subtract
+   * @return Reference to this vector */
+	B3_FORCE_INLINE b3Vector3& operator-=(const b3Vector3& v)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		mVec128 = _mm_sub_ps(mVec128, v.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vsubq_f32(mVec128, v.mVec128);
+		m_floats[0] -= v.m_floats[0]; // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+		m_floats[1] -= v.m_floats[1];
+		m_floats[2] -= v.m_floats[2];
+		return *this;
+	}
+  /**@brief Scale the vector in place.
+   * The SSE statements multiply by (s, s, s, 0), the NEON statement multiplies
+   * all four lanes by s, and the scalar statements scale x, y and z only.
+   * @param s Scale factor
+   * @return Reference to this vector */
+	B3_FORCE_INLINE b3Vector3& operator*=(const b3Scalar& s)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+		mVec128 = _mm_mul_ps(mVec128, vs);
+#elif defined(B3_USE_NEON)
+		mVec128 = vmulq_n_f32(mVec128, s);
+		m_floats[0] *= s; // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+		m_floats[1] *= s;
+		m_floats[2] *= s;
+		return *this;
+	}
+  /**@brief Inversely scale the vector in place.
+   * b3FullAssert checks that s is non-zero; the division is then carried out
+   * as a multiplication by 1/s through operator*=.
+   * @param s Scale factor to divide by
+   * @return Reference to this vector */
+	B3_FORCE_INLINE b3Vector3& operator/=(const b3Scalar& s)
+	{
+		b3FullAssert(s != b3Scalar(0.0));
+#if 0 //defined(B3_USE_SSE_IN_API)
+// this code is not faster !
+		__m128 vs = _mm_load_ss(&s);
+		vs = _mm_div_ss(b3v1110, vs);
+		vs = b3_pshufd_ps(vs, 0x00);	//	(S S S S)
+		mVec128 = _mm_mul_ps(mVec128, vs);
+		return *this;
+		return *this *= b3Scalar(1.0) / s; // NOTE(review): presumably the live path, with the SSE lines above compiled out by '#if 0'; the #else/#endif are not visible in this view - confirm
+	}
+  /**@brief Return the dot product of this vector and v.
+   * Every variant sums only the x, y and z products; the w lane does not
+   * contribute to the result.
+   * @param v The other vector in the dot product
+   * @return x*v.x + y*v.y + z*v.z */
+	B3_FORCE_INLINE b3Scalar dot(const b3Vector3& v) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128 vd = _mm_mul_ps(mVec128, v.mVec128);
+		__m128 z = _mm_movehl_ps(vd, vd); // lane 0 now holds the z product
+		__m128 y = _mm_shuffle_ps(vd, vd, 0x55); // lane 1 (the y product) copied to every lane
+		vd = _mm_add_ss(vd, y);
+		vd = _mm_add_ss(vd, z);
+		return _mm_cvtss_f32(vd);
+#elif defined(B3_USE_NEON)
+		float32x4_t vd = vmulq_f32(mVec128, v.mVec128);
+		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd)); // pairwise add: x + y
+		x = vadd_f32(x, vget_high_f32(vd)); // lane 0 becomes x + y + z
+		return vget_lane_f32(x, 0);
+		return	m_floats[0] * v.m_floats[0] + // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+				m_floats[1] * v.m_floats[1] +
+				m_floats[2] * v.m_floats[2];
+	}
+  /**@brief Squared length of the vector, i.e. the vector dotted with itself */
+	B3_FORCE_INLINE b3Scalar length2() const
+	{
+		const b3Vector3& self = *this;
+		return dot(self);
+	}
+  /**@brief Length of the vector: the square root of length2() */
+	B3_FORCE_INLINE b3Scalar length() const
+	{
+		const b3Scalar squaredLength = length2();
+		return b3Sqrt(squaredLength);
+	}
+  /**@brief Return the distance squared between the ends of this and another vector
+   * This is semantically treating the vector like a point
+   * @param v The other vector */
+	B3_FORCE_INLINE b3Scalar distance2(const b3Vector3& v) const;
+  /**@brief Return the distance between the ends of this and another vector
+   * This is semantically treating the vector like a point
+   * @param v The other vector */
+	B3_FORCE_INLINE b3Scalar distance(const b3Vector3& v) const;
+	B3_FORCE_INLINE b3Vector3& safeNormalize() // normalizes in place; falls back to (1,0,0) when no component has a positive magnitude
+	{
+		b3Vector3 magnitudes = this->absolute();
+		int dominantAxis = magnitudes.maxAxis();
+		if (!(magnitudes[dominantAxis]>0))
+		{
+			// Nothing to scale by: substitute the unit x axis.
+			setValue(1,0,0);
+			return *this;
+		}
+		// Divide by the largest magnitude first, then by the resulting length.
+		*this /= magnitudes[dominantAxis];
+		return *this /= length();
+	}
+  /**@brief Normalize this vector
+   * x^2 + y^2 + z^2 = 1
+   * The active SSE sequence takes the _mm_rsqrt_ss estimate of 1/sqrt(x^2+y^2+z^2)
+   * and refines it with one Newton-Raphson step instead of using an exact
+   * square root and divide; the last statement divides by length().
+   * No check for a zero-length vector is made in this block.
+   * @return Reference to this vector */
+	B3_FORCE_INLINE b3Vector3& normalize()
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+        // dot product first
+		__m128 vd = _mm_mul_ps(mVec128, mVec128);
+		__m128 z = _mm_movehl_ps(vd, vd);
+		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, y);
+		vd = _mm_add_ss(vd, z);
+        #if 0
+        vd = _mm_sqrt_ss(vd);
+		vd = _mm_div_ss(b3v1110, vd);
+		vd = b3_splat_ps(vd, 0x80);
+		mVec128 = _mm_mul_ps(mVec128, vd);
+        #else
+        // NR step 1/sqrt(x) - vd is x, y is output
+        y = _mm_rsqrt_ss(vd); // estimate
+        //  one step NR
+        z = b3v1_5;
+        vd = _mm_mul_ss(vd, b3vHalf); // vd * 0.5
+        //x2 = vd;
+        vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0
+        vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 * y0
+        z = _mm_sub_ss(z, vd);  // 1.5 - vd * 0.5 * y0 * y0
+        y = _mm_mul_ss(y, z);   // y0 * (1.5 - vd * 0.5 * y0 * y0)
+		y = b3_splat_ps(y, 0x80);
+		mVec128 = _mm_mul_ps(mVec128, y);
+        #endif
+		return *this;
+		return *this /= length(); // NOTE(review): presumably the non-SSE fallback; the #else/#endif are not visible in this view - confirm
+	}
+  /**@brief Return a normalized version of this vector; this vector itself is left unchanged */
+	B3_FORCE_INLINE b3Vector3 normalized() const;
+  /**@brief Return a rotated version of this vector
+   * @param wAxis The axis to rotate about (the definition notes it must be unit length)
+   * @param angle The angle to rotate by */
+	B3_FORCE_INLINE b3Vector3 rotate( const b3Vector3& wAxis, const b3Scalar angle ) const;
+  /**@brief Angle between this vector and another, obtained from the arc
+   * cosine of the dot product divided by the product of the two lengths
+   * @param v The other vector */
+	B3_FORCE_INLINE b3Scalar angle(const b3Vector3& v) const
+	{
+		// sqrt(|a|^2 * |b|^2) equals |a| * |b| with a single square root.
+		const b3Scalar lengthProduct = b3Sqrt(length2() * v.length2());
+		b3FullAssert(lengthProduct != b3Scalar(0.0));
+		const b3Scalar cosine = dot(v) / lengthProduct;
+		return b3Acos(cosine);
+	}
+  /**@brief Return a vector with the absolute values of each element.
+   * The SSE variant ANDs with b3v3AbsfMask; the scalar variant passes only
+   * |x|, |y| and |z| to b3MakeVector3. */
+	B3_FORCE_INLINE b3Vector3 absolute() const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		return b3MakeVector3(_mm_and_ps(mVec128, b3v3AbsfMask));
+#elif defined(B3_USE_NEON)
+		return b3Vector3(vabsq_f32(mVec128)); // NOTE(review): builds via b3Vector3(...) while the other variants call b3MakeVector3 - confirm a matching constructor exists
+		return b3MakeVector3( // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+			b3Fabs(m_floats[0]),
+			b3Fabs(m_floats[1]),
+			b3Fabs(m_floats[2]));
+	}
+  /**@brief Return the cross product between this and another vector (this x v).
+   * Both SIMD variants rotate the operands to (Y, Z, X), form the two products,
+   * subtract them, and rotate the difference once more to restore x, y, z order.
+   * @param v The other vector */
+	B3_FORCE_INLINE b3Vector3 cross(const b3Vector3& v) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128	T, V;
+		T = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		V = b3_pshufd_ps(v.mVec128, B3_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		V = _mm_mul_ps(V, mVec128);
+		T = _mm_mul_ps(T, v.mVec128);
+		V = _mm_sub_ps(V, T);
+		V = b3_pshufd_ps(V, B3_SHUFFLE(1, 2, 0, 3));
+		return b3MakeVector3(V);
+#elif defined(B3_USE_NEON)
+		float32x4_t T, V;
+		// form (Y, Z, X, _) of mVec128 and v.mVec128
+		float32x2_t Tlow = vget_low_f32(mVec128);
+		float32x2_t Vlow = vget_low_f32(v.mVec128);
+		T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);
+		V = vmulq_f32(V, mVec128);
+		T = vmulq_f32(T, v.mVec128);
+		V = vsubq_f32(V, T);
+		Vlow = vget_low_f32(V);
+		// form (Y, Z, X, _);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
+		V = (float32x4_t)vandq_s32((int32x4_t)V, b3vFFF0Mask); // zero the w lane
+		return b3Vector3(V); // NOTE(review): builds via b3Vector3(...) while the other variants call b3MakeVector3 - confirm a matching constructor exists
+		return b3MakeVector3( // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+			m_floats[1] * v.m_floats[2] - m_floats[2] * v.m_floats[1],
+			m_floats[2] * v.m_floats[0] - m_floats[0] * v.m_floats[2],
+			m_floats[0] * v.m_floats[1] - m_floats[1] * v.m_floats[0]);
+	}
+	B3_FORCE_INLINE b3Scalar triple(const b3Vector3& v1, const b3Vector3& v2) const // scalar triple product: this . (v1 x v2)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		// cross:
+		__m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, B3_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		__m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, B3_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		V = _mm_mul_ps(V, v1.mVec128);
+		T = _mm_mul_ps(T, v2.mVec128);
+		V = _mm_sub_ps(V, T);
+		V = _mm_shuffle_ps(V, V, B3_SHUFFLE(1, 2, 0, 3));
+		// dot:
+		V = _mm_mul_ps(V, mVec128);
+		__m128 z = _mm_movehl_ps(V, V);
+		__m128 y = _mm_shuffle_ps(V, V, 0x55);
+		V = _mm_add_ss(V, y);
+		V = _mm_add_ss(V, z);
+		return _mm_cvtss_f32(V);
+#elif defined(B3_USE_NEON)
+		// cross:
+		float32x4_t T, V;
+		// form (Y, Z, X, _) of v1.mVec128 and v2.mVec128
+		float32x2_t Tlow = vget_low_f32(v1.mVec128);
+		float32x2_t Vlow = vget_low_f32(v2.mVec128);
+		T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);
+		V = vmulq_f32(V, v1.mVec128);
+		T = vmulq_f32(T, v2.mVec128);
+		V = vsubq_f32(V, T);
+		Vlow = vget_low_f32(V);
+		// form (Y, Z, X, _);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
+		// dot:
+		V = vmulq_f32(mVec128, V);
+		float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));
+		x = vadd_f32(x, vget_high_f32(V));
+		return vget_lane_f32(x, 0);
+		return // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+			m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) +
+			m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) +
+			m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
+	}
+  /**@brief Index of the axis holding the smallest of x, y and z
+   * Returns 0, 1 or 2 for x, y or z; w is not considered */
+	B3_FORCE_INLINE int minAxis() const
+	{
+		// x is the candidate only when it is strictly below y.
+		if (m_floats[0] < m_floats[1])
+		{
+			if (m_floats[0] < m_floats[2])
+			{
+				return 0;
+			}
+			return 2;
+		}
+		// Otherwise y is the candidate; z wins unless y is strictly below it.
+		if (m_floats[1] < m_floats[2])
+		{
+			return 1;
+		}
+		return 2;
+	}
+  /**@brief Index of the axis holding the largest of x, y and z
+   * Returns 0, 1 or 2 for x, y or z; w is not considered */
+	B3_FORCE_INLINE int maxAxis() const
+	{
+		// y is the candidate only when x is strictly below it.
+		if (m_floats[0] < m_floats[1])
+		{
+			if (m_floats[1] < m_floats[2])
+			{
+				return 2;
+			}
+			return 1;
+		}
+		// Otherwise x is the candidate; z wins only when x is strictly below it.
+		if (m_floats[0] < m_floats[2])
+		{
+			return 2;
+		}
+		return 0;
+	}
+	B3_FORCE_INLINE int furthestAxis() const // index (0..2) of the component with the smallest absolute value
+	{
+		const b3Vector3 magnitudes = absolute();
+		return magnitudes.minAxis();
+	}
+	B3_FORCE_INLINE int closestAxis() const // index (0..2) of the component with the largest absolute value
+	{
+		const b3Vector3 magnitudes = absolute();
+		return magnitudes.maxAxis();
+	}
+	B3_FORCE_INLINE void setInterpolate3(const b3Vector3& v0, const b3Vector3& v1, b3Scalar rt) // sets this vector to the blend of v0 and v1 at ratio rt (rt = 0 gives v0, rt = 1 gives v1)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128	vrt = _mm_load_ss(&rt);	//	(rt 0 0 0)
+		b3Scalar s = b3Scalar(1.0) - rt;
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+		__m128 r0 = _mm_mul_ps(v0.mVec128, vs);
+		vrt = b3_pshufd_ps(vrt, 0x80);	//	(rt rt rt 0.0)
+		__m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
+		__m128 tmp3 = _mm_add_ps(r0,r1);
+		mVec128 = tmp3;
+#elif defined(B3_USE_NEON)
+		float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128); // NEON form: v0 + (v1 - v0) * rt, applied to all four lanes
+		vl = vmulq_n_f32(vl, rt);
+		mVec128 = vaddq_f32(vl, v0.mVec128);
+		b3Scalar s = b3Scalar(1.0) - rt; // NOTE(review): presumably the non-SIMD fallback starts here; the #else/#endif are not visible in this view - confirm
+		m_floats[0] = s * v0.m_floats[0] + rt * v1.m_floats[0];
+		m_floats[1] = s * v0.m_floats[1] + rt * v1.m_floats[1];
+		m_floats[2] = s * v0.m_floats[2] + rt * v1.m_floats[2];
+		//don't do the unused w component
+		//		m_co[3] = s * v0[3] + rt * v1[3];
+	}
+  /**@brief Return the linear interpolation between this and another vector,
+   * computed as this + (v - this) * t
+   * @param v The other vector
+   * @param t The ratio of this to v (t = 0 => return this, t=1 => return other) */
+	B3_FORCE_INLINE b3Vector3 lerp(const b3Vector3& v, const b3Scalar& t) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128	vt = _mm_load_ss(&t);	//	(t 0 0 0)
+		vt = b3_pshufd_ps(vt, 0x80);	//	(rt rt rt 0.0)
+		__m128 vl = _mm_sub_ps(v.mVec128, mVec128);
+		vl = _mm_mul_ps(vl, vt);
+		vl = _mm_add_ps(vl, mVec128);
+		return b3MakeVector3(vl);
+#elif defined(B3_USE_NEON)
+		float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
+		vl = vmulq_n_f32(vl, t);
+		vl = vaddq_f32(vl, mVec128);
+		return b3Vector3(vl); // NOTE(review): builds via b3Vector3(...) while the other variants call b3MakeVector3 - confirm a matching constructor exists
+		return // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+			b3MakeVector3(	m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
+						m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
+						m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
+	}
+  /**@brief Elementwise multiply this vector by the other, in place.
+   * The SSE and NEON statements multiply all four lanes (w included); the
+   * scalar statements multiply only x, y and z.
+   * @param v The other vector
+   * @return Reference to this vector */
+	B3_FORCE_INLINE b3Vector3& operator*=(const b3Vector3& v)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		mVec128 = _mm_mul_ps(mVec128, v.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vmulq_f32(mVec128, v.mVec128);
+		m_floats[0] *= v.m_floats[0]; // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+		m_floats[1] *= v.m_floats[1];
+		m_floats[2] *= v.m_floats[2];
+		return *this;
+	}
+	 /**@brief Read-only reference to the x component (m_floats[0]) */
+		B3_FORCE_INLINE const b3Scalar& getX() const
+		{
+			return m_floats[0];
+		}
+  /**@brief Read-only reference to the y component (m_floats[1]) */
+		B3_FORCE_INLINE const b3Scalar& getY() const
+		{
+			return m_floats[1];
+		}
+  /**@brief Read-only reference to the z component (m_floats[2]) */
+		B3_FORCE_INLINE const b3Scalar& getZ() const
+		{
+			return m_floats[2];
+		}
+/**@brief Read-only reference to the w component (m_floats[3]) */
+		B3_FORCE_INLINE const b3Scalar& getW() const
+		{
+			return m_floats[3];
+		}
+  /**@brief Overwrite the x component */
+		B3_FORCE_INLINE void	setX(b3Scalar _x)
+		{
+			m_floats[0] = _x;
+		}
+  /**@brief Overwrite the y component */
+		B3_FORCE_INLINE void	setY(b3Scalar _y)
+		{
+			m_floats[1] = _y;
+		}
+  /**@brief Overwrite the z component */
+		B3_FORCE_INLINE void	setZ(b3Scalar _z)
+		{
+			m_floats[2] = _z;
+		}
+  /**@brief Overwrite the w component */
+		B3_FORCE_INLINE void	setW(b3Scalar _w)
+		{
+			m_floats[3] = _w;
+		}
+	//B3_FORCE_INLINE b3Scalar&       operator[](int i)       { return (&m_floats[0])[i];	}
+	//B3_FORCE_INLINE const b3Scalar& operator[](int i) const { return (&m_floats[0])[i]; }
+	///operator b3Scalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons.
+	B3_FORCE_INLINE	operator       b3Scalar *()       { return &m_floats[0]; } // mutable pointer to the first component, so vec[i] indexes m_floats
+	B3_FORCE_INLINE	operator const b3Scalar *() const { return &m_floats[0]; } // read-only counterpart for const vectors
+	B3_FORCE_INLINE	bool	operator==(const b3Vector3& other) const // exact equality of all four components, w included
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+        return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128))); // 0xf means all four lane comparisons succeeded
+		return ((m_floats[3]==other.m_floats[3]) && // NOTE(review): presumably the non-SSE fallback; the #else/#endif are not visible in this view - confirm
+                (m_floats[2]==other.m_floats[2]) &&
+                (m_floats[1]==other.m_floats[1]) &&
+                (m_floats[0]==other.m_floats[0]));
+	}
+	B3_FORCE_INLINE	bool	operator!=(const b3Vector3& other) const // logical negation of operator==
+	{
+		const bool same = (*this == other);
+		return !same;
+	}
+  /**@brief Set each element to the max of the current values and the values of another b3Vector3
+   * All four components (w included) are updated in every variant.
+   * @param other The other b3Vector3 to compare with
+   */
+	B3_FORCE_INLINE void	setMax(const b3Vector3& other)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		mVec128 = _mm_max_ps(mVec128, other.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vmaxq_f32(mVec128, other.mVec128);
+		b3SetMax(m_floats[0], other.m_floats[0]); // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+		b3SetMax(m_floats[1], other.m_floats[1]);
+		b3SetMax(m_floats[2], other.m_floats[2]);
+		b3SetMax(m_floats[3], other.m_floats[3]);
+	}
+  /**@brief Set each element to the min of the current values and the values of another b3Vector3
+   * All four components (w included) are updated in every variant.
+   * @param other The other b3Vector3 to compare with
+   */
+	B3_FORCE_INLINE void	setMin(const b3Vector3& other)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		mVec128 = _mm_min_ps(mVec128, other.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vminq_f32(mVec128, other.mVec128);
+		b3SetMin(m_floats[0], other.m_floats[0]); // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+		b3SetMin(m_floats[1], other.m_floats[1]);
+		b3SetMin(m_floats[2], other.m_floats[2]);
+		b3SetMin(m_floats[3], other.m_floats[3]);
+	}
+	B3_FORCE_INLINE void 	setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z) // assigns x, y, z and resets w to zero
+	{
+		m_floats[0]=_x; // the arguments are references, so if one aliases a component of this vector the assignment order decides which value it sees
+		m_floats[1]=_y;
+		m_floats[2]=_z;
+		m_floats[3] = b3Scalar(0.f);
+	}
+	void	getSkewSymmetricMatrix(b3Vector3* v0,b3Vector3* v1,b3Vector3* v2) const // writes the rows (0,-z,y), (z,0,-x), (-y,x,0) into *v0, *v1, *v2
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		__m128 V  = _mm_and_ps(mVec128, b3vFFF0fMask); // (x, y, z, 0)
+		__m128 V0 = _mm_xor_ps(b3vMzeroMask, V); // sign-flipped copy of V
+		__m128 V2 = _mm_movelh_ps(V0, V);
+		__m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
+        V0 = _mm_shuffle_ps(V0, V, 0xDB);
+		V2 = _mm_shuffle_ps(V2, V, 0xF9);
+		v0->mVec128 = V0;
+		v1->mVec128 = V1;
+		v2->mVec128 = V2;
+		v0->setValue(0.		,-getZ()		,getY()); // NOTE(review): presumably the non-SSE fallback; the #else/#endif are not visible in this view - confirm
+		v1->setValue(getZ()	,0.			,-getX());
+		v2->setValue(-getY()	,getX()	,0.);
+	}
+	void setZero() // sets all four components to zero
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128); // XOR of a register with itself
+#elif defined(B3_USE_NEON)
+		int32x4_t vi = vdupq_n_s32(0);
+		mVec128 = vreinterpretq_f32_s32(vi);
+		setValue(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+	}
+	B3_FORCE_INLINE bool isZero() const // true only when x, y and z all compare equal to zero; w is ignored
+	{
+		const b3Scalar zero = b3Scalar(0);
+		if (!(m_floats[0] == zero))
+		{
+			return false;
+		}
+		if (!(m_floats[1] == zero))
+		{
+			return false;
+		}
+		return m_floats[2] == zero;
+	}
+	B3_FORCE_INLINE bool fuzzyZero() const // true when the squared length is below B3_EPSILON
+	{
+		const b3Scalar squaredLength = length2();
+		return squaredLength < B3_EPSILON;
+	}
+	B3_FORCE_INLINE	void	serialize(struct	b3Vector3Data& dataOut) const; // b3Vector3Data is a macro alias for the float or double record; body defined outside this view
+	B3_FORCE_INLINE	void	deSerialize(const struct	b3Vector3Data& dataIn); // counterpart of serialize(); body defined outside this view
+	B3_FORCE_INLINE	void	serializeFloat(struct	b3Vector3FloatData& dataOut) const; // float-record variant; body defined outside this view
+	B3_FORCE_INLINE	void	deSerializeFloat(const struct	b3Vector3FloatData& dataIn); // float-record variant; body defined outside this view
+	B3_FORCE_INLINE	void	serializeDouble(struct	b3Vector3DoubleData& dataOut) const; // double-record variant; body defined outside this view
+	B3_FORCE_INLINE	void	deSerializeDouble(const struct	b3Vector3DoubleData& dataIn); // double-record variant; body defined outside this view
+        /**@brief returns index of maximum dot product between this and vectors in array[]
+         * @param array The other vectors
+         * @param array_count The number of other vectors
+         * @param dotOut The maximum dot product
+         * @return Index into array[] of the vector that produced dotOut */
+        B3_FORCE_INLINE   long    maxDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const;
+        /**@brief returns index of minimum dot product between this and vectors in array[]
+         * @param array The other vectors
+         * @param array_count The number of other vectors
+         * @param dotOut The minimum dot product
+         * @return Index into array[] of the vector that produced dotOut */
+        B3_FORCE_INLINE   long    minDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const;
+    /* create a vector as  b3Vector3( this->dot( b3Vector3 v0 ), this->dot( b3Vector3 v1), this->dot( b3Vector3 v2 ))
+     * i.e. three dot products against this vector packed into x, y and z of the result. */
+    B3_FORCE_INLINE b3Vector3  dot3( const b3Vector3 &v0, const b3Vector3 &v1, const b3Vector3 &v2 ) const
+    {
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+        __m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 );
+        __m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 );
+        __m128 a2 = _mm_mul_ps( v2.mVec128, this->mVec128 );
+        __m128 b0 = _mm_unpacklo_ps( a0, a1 );
+        __m128 b1 = _mm_unpackhi_ps( a0, a1 );
+        __m128 b2 = _mm_unpacklo_ps( a2, _mm_setzero_ps() );
+        __m128 r = _mm_movelh_ps( b0, b2 );
+        r = _mm_add_ps( r, _mm_movehl_ps( b2, b0 ));
+        a2 = _mm_and_ps( a2, b3vxyzMaskf); // drop the w product of a2 before it is folded in
+        r = _mm_add_ps( r, b3CastdTo128f (_mm_move_sd( b3CastfTo128d(a2), b3CastfTo128d(b1) )));
+        return b3MakeVector3(r);
+#elif defined(B3_USE_NEON)
+        static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
+        float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128);
+        float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128);
+        float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128);
+        float32x2x2_t zLo = vtrn_f32( vget_high_f32(a0), vget_high_f32(a1));
+        a2 = (float32x4_t) vandq_u32((uint32x4_t) a2, xyzMask );
+        float32x2_t b0 = vadd_f32( vpadd_f32( vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0] );
+        float32x2_t b1 = vpadd_f32( vpadd_f32( vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
+        return b3Vector3( vcombine_f32(b0, b1) ); // NOTE(review): builds via b3Vector3(...) while the other variants call b3MakeVector3 - confirm a matching constructor exists
+		return b3MakeVector3( dot(v0), dot(v1), dot(v2)); // NOTE(review): presumably the non-SIMD fallback; the #else/#endif are not visible in this view - confirm
+    }
+/**@brief Return the sum of two vectors (Point semantics).
+ * The SIMD variants add all four lanes; the scalar variant passes only the
+ * x, y and z sums to b3MakeVector3.
+ * NOTE(review): the braces and #else/#endif of this function are not visible in this view - confirm the branch structure. */
+B3_FORCE_INLINE b3Vector3
+operator+(const b3Vector3& v1, const b3Vector3& v2)
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+	return b3MakeVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
+#elif defined(B3_USE_NEON)
+	return b3MakeVector3(vaddq_f32(v1.mVec128, v2.mVec128));
+	return b3MakeVector3(
+			v1.m_floats[0] + v2.m_floats[0],
+			v1.m_floats[1] + v2.m_floats[1],
+			v1.m_floats[2] + v2.m_floats[2]);
+/**@brief Return the elementwise product of two vectors.
+ * The SIMD variants multiply all four lanes; the scalar variant passes only
+ * the x, y and z products to b3MakeVector3.
+ * NOTE(review): the braces and #else/#endif of this function are not visible in this view - confirm the branch structure. */
+B3_FORCE_INLINE b3Vector3
+operator*(const b3Vector3& v1, const b3Vector3& v2)
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+	return b3MakeVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
+#elif defined(B3_USE_NEON)
+	return b3MakeVector3(vmulq_f32(v1.mVec128, v2.mVec128));
+	return b3MakeVector3(
+			v1.m_floats[0] * v2.m_floats[0],
+			v1.m_floats[1] * v2.m_floats[1],
+			v1.m_floats[2] * v2.m_floats[2]);
+/**@brief Return the difference between two vectors (v1 - v2).
+ * Both SIMD variants mask the result so that its w lane is zero.
+ * NOTE(review): the braces and #else/#endif of this function are not visible in this view - confirm the branch structure. */
+B3_FORCE_INLINE b3Vector3
+operator-(const b3Vector3& v1, const b3Vector3& v2)
+#if (defined(B3_USE_SSE_IN_API)  && defined(B3_USE_SSE))
+	//	without _mm_and_ps this code causes slowdown in Concave moving
+	__m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
+	return b3MakeVector3(_mm_and_ps(r, b3vFFF0fMask));
+#elif defined(B3_USE_NEON)
+	float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
+	return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)r, b3vFFF0Mask));
+	return b3MakeVector3(
+			v1.m_floats[0] - v2.m_floats[0],
+			v1.m_floats[1] - v2.m_floats[1],
+			v1.m_floats[2] - v2.m_floats[2]);
+/**@brief Return the negative of the vector.
+ * The SIMD variants flip the sign bit by XORing with b3vMzeroMask; the SSE
+ * variant additionally zeroes the w lane, the NEON variant does not.
+ * NOTE(review): the braces and #else/#endif of this function are not visible in this view - confirm the branch structure. */
+B3_FORCE_INLINE b3Vector3
+operator-(const b3Vector3& v)
+#if (defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+	__m128 r = _mm_xor_ps(v.mVec128, b3vMzeroMask);
+	return b3MakeVector3(_mm_and_ps(r, b3vFFF0fMask));
+#elif defined(B3_USE_NEON)
+	return b3MakeVector3((b3SimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)b3vMzeroMask));
+	return b3MakeVector3(-v.m_floats[0], -v.m_floats[1], -v.m_floats[2]);
+/**@brief Return the vector scaled by s.
+ * The SSE variant multiplies by (s, s, s, 0) and the NEON variant masks the
+ * product, so both yield a zero w lane.
+ * NOTE(review): the braces and #else/#endif of this function are not visible in this view - confirm the branch structure. */
+B3_FORCE_INLINE b3Vector3
+operator*(const b3Vector3& v, const b3Scalar& s)
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+	__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+	vs = b3_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+	return b3MakeVector3(_mm_mul_ps(v.mVec128, vs));
+#elif defined(B3_USE_NEON)
+	float32x4_t r = vmulq_n_f32(v.mVec128, s);
+	return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)r, b3vFFF0Mask));
+	return b3MakeVector3(v.m_floats[0] * s, v.m_floats[1] * s, v.m_floats[2] * s);
+/**@brief Return the vector scaled by s (scalar on the left); forwards to v * s */
+B3_FORCE_INLINE b3Vector3
+operator*(const b3Scalar& s, const b3Vector3& v)
+	return v * s;
+/**@brief Return the vector inversely scaled by s.
+ * b3FullAssert checks that s is non-zero; the last statement multiplies v by 1/s.
+ * NOTE(review): the SSE lines sit under '#if 0'; the braces and #else/#endif are not visible in this view - confirm the branch structure. */
+B3_FORCE_INLINE b3Vector3
+operator/(const b3Vector3& v, const b3Scalar& s)
+	b3FullAssert(s != b3Scalar(0.0));
+#if 0 //defined(B3_USE_SSE_IN_API)
+// this code is not faster !
+	__m128 vs = _mm_load_ss(&s);
+    vs = _mm_div_ss(b3v1110, vs);
+	vs = b3_pshufd_ps(vs, 0x00);	//	(S S S S)
+	return b3Vector3(_mm_mul_ps(v.mVec128, vs));
+	return v * (b3Scalar(1.0) / s);
+/**@brief Return the elementwise quotient of two vectors (v1 / v2).
+ * The SSE variant divides and then zeroes the w lane. The NEON variant does
+ * not divide: it refines a reciprocal estimate of v2 and multiplies by v1, so
+ * its result is an approximation (see the inline comments).
+ * No check for zero components in v2 is made in this block.
+ * NOTE(review): the braces and #else/#endif of this function are not visible in this view - confirm the branch structure. */
+B3_FORCE_INLINE b3Vector3
+operator/(const b3Vector3& v1, const b3Vector3& v2)
+#if (defined(B3_USE_SSE_IN_API)&& defined (B3_USE_SSE))
+	__m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
+	vec = _mm_and_ps(vec, b3vFFF0fMask);
+	return b3MakeVector3(vec);
+#elif defined(B3_USE_NEON)
+	float32x4_t x, y, v, m;
+	x = v1.mVec128;
+	y = v2.mVec128;
+	v = vrecpeq_f32(y);			// v ~ 1/y
+	m = vrecpsq_f32(y, v);		// m = (2-v*y)
+	v = vmulq_f32(v, m);		// vv = v*m ~~ 1/y
+	m = vrecpsq_f32(y, v);		// mm = (2-vv*y)
+	v = vmulq_f32(v, x);		// x*vv
+	v = vmulq_f32(v, m);		// (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y
+	return b3Vector3(v); // NOTE(review): builds via b3Vector3(...) while the SSE variant calls b3MakeVector3 - confirm a matching constructor exists
+	return b3MakeVector3(
+			v1.m_floats[0] / v2.m_floats[0],
+			v1.m_floats[1] / v2.m_floats[1],
+			v1.m_floats[2] / v2.m_floats[2]);
+/**@brief Return the dot product between two vectors; forwards to v1.dot(v2).
+ * NOTE(review): the return-type line and braces of several helpers in this group are not visible in this view - confirm against the full file. */
+b3Dot(const b3Vector3& v1, const b3Vector3& v2)
+	return v1.dot(v2);
+/**@brief Return the distance squared between two vectors; forwards to v1.distance2(v2) */
+b3Distance2(const b3Vector3& v1, const b3Vector3& v2)
+	return v1.distance2(v2);
+/**@brief Return the distance between two vectors; forwards to v1.distance(v2) */
+b3Distance(const b3Vector3& v1, const b3Vector3& v2)
+	return v1.distance(v2);
+/**@brief Return the angle between two vectors; forwards to v1.angle(v2) */
+b3Angle(const b3Vector3& v1, const b3Vector3& v2)
+	return v1.angle(v2);
+/**@brief Return the cross product of two vectors; forwards to v1.cross(v2) */
+B3_FORCE_INLINE b3Vector3
+b3Cross(const b3Vector3& v1, const b3Vector3& v2)
+	return v1.cross(v2);
+b3Triple(const b3Vector3& v1, const b3Vector3& v2, const b3Vector3& v3) // scalar triple product; forwards to v1.triple(v2, v3)
+	return v1.triple(v2, v3);
+/**@brief Return the linear interpolation between two vectors; forwards to v1.lerp(v2, t)
+ * @param v1 One vector
+ * @param v2 The other vector
+ * @param t The ratio of v1 to v2 (t = 0 => return v1, t=1 => return v2) */
+B3_FORCE_INLINE b3Vector3
+b3Lerp(const b3Vector3& v1, const b3Vector3& v2, const b3Scalar& t)
+	return v1.lerp(v2, t);
+B3_FORCE_INLINE b3Scalar b3Vector3::distance2(const b3Vector3& v) const // squared length of (v - *this)
+	return (v - *this).length2();
+B3_FORCE_INLINE b3Scalar b3Vector3::distance(const b3Vector3& v) const // length of (v - *this)
+	return (v - *this).length();
+B3_FORCE_INLINE b3Vector3 b3Vector3::normalized() const // returns a unit-length copy; *this is not modified
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+	b3Vector3 norm = *this;
+	return norm.normalize();
+	return *this / length(); // NOTE(review): presumably the non-SSE fallback; the #else/#endif are not visible in this view - confirm
+B3_FORCE_INLINE b3Vector3 b3Vector3::rotate( const b3Vector3& wAxis, const b3Scalar _angle ) const // returns o + (this - o)*cos(_angle) + (wAxis x this)*sin(_angle), where o = wAxis*(wAxis . this)
+	// wAxis must be a unit length vector
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+    __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
+	b3Scalar ssin = b3Sin( _angle );
+    __m128 C = wAxis.cross( b3MakeVector3(mVec128) ).mVec128;
+	O = _mm_and_ps(O, b3vFFF0fMask);
+    b3Scalar scos = b3Cos( _angle );
+	__m128 vsin = _mm_load_ss(&ssin);	//	(S 0 0 0)
+    __m128 vcos = _mm_load_ss(&scos);	//	(S 0 0 0)
+	__m128 Y = b3_pshufd_ps(O, 0xC9);	//	(Y Z X 0)
+	__m128 Z = b3_pshufd_ps(O, 0xD2);	//	(Z X Y 0)
+	O = _mm_add_ps(O, Y);
+	vsin = b3_pshufd_ps(vsin, 0x80);	//	(S S S 0)
+	O = _mm_add_ps(O, Z); // each of x, y, z now holds the full dot product wAxis . this
+    vcos = b3_pshufd_ps(vcos, 0x80);	//	(S S S 0)
+    vsin = vsin * C;
+	O = O * wAxis.mVec128;
+	__m128 X = mVec128 - O;
+    O = O + vsin;
+	vcos = vcos * X;
+	O = O + vcos;
+	return b3MakeVector3(O);
+	b3Vector3 o = wAxis * wAxis.dot( *this ); // NOTE(review): presumably the non-SSE fallback starts here; the #else/#endif are not visible in this view - confirm
+	b3Vector3 _x = *this - o;
+	b3Vector3 _y;
+	_y = wAxis.cross( *this );
+	return ( o + _x * b3Cos( _angle ) + _y * b3Sin( _angle ) );
+/**@brief Return the index of the array element with the largest dot product against this vector.
+ * In SIMD builds, arrays shorter than scalar_cutoff are scanned by the scalar
+ * loop and longer ones are handed to b3_maxdot_large; without SIMD the scalar
+ * loop handles every array.
+ * @param array The vectors to test
+ * @param array_count The number of vectors in array
+ * @param dotOut Receives the largest dot product found
+ * @return Index of the winning element; the scalar loop returns 0 (after asserting) when no element qualified */
+B3_FORCE_INLINE   long    b3Vector3::maxDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const
+#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
+    #if defined _WIN32 || defined (B3_USE_SSE)
+        const long scalar_cutoff = 10;
+        long b3_maxdot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #elif defined B3_USE_NEON
+        const long scalar_cutoff = 4;
+        // Declared under the same name the final return statement calls, mirroring b3_mindot_large in minDot.
+        extern long (*b3_maxdot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #endif
+    if( array_count < scalar_cutoff )
+#endif//B3_USE_SSE || B3_USE_NEON
+    {
+        b3Scalar maxDot = -B3_INFINITY;
+        int i = 0;
+        int ptIndex = -1;
+        for( i = 0; i < array_count; i++ )
+        {
+            b3Scalar dot = array[i].dot(*this);
+            if( dot > maxDot )
+            {
+                maxDot = dot;
+                ptIndex = i;
+            }
+        }
+		b3Assert(ptIndex>=0);
+        // Release builds (assert compiled out) fall back to index 0 rather than returning -1.
+        if (ptIndex<0)
+		{
+			ptIndex = 0;
+		}
+        dotOut = maxDot;
+        return ptIndex;
+    }
+#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
+    return b3_maxdot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
+B3_FORCE_INLINE   long    b3Vector3::minDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const // index of the array element with the smallest dot product against this vector; dotOut receives that product
+#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
+    #if defined B3_USE_SSE
+        const long scalar_cutoff = 10;
+        long b3_mindot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #elif defined B3_USE_NEON
+        const long scalar_cutoff = 4;
+        extern long (*b3_mindot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #else
+        #error unhandled arch!
+    #endif
+    if( array_count < scalar_cutoff ) // short arrays take the scalar loop; longer ones go to b3_mindot_large
+#endif//B3_USE_SSE || B3_USE_NEON
+    {
+        b3Scalar  minDot = B3_INFINITY;
+        int i = 0;
+        int ptIndex = -1;
+        for( i = 0; i < array_count; i++ )
+        {
+            b3Scalar dot = array[i].dot(*this);
+            if( dot < minDot )
+            {
+                minDot = dot;
+                ptIndex = i;
+            }
+        }
+        dotOut = minDot;
+        return ptIndex; // stays -1 when no element compared below B3_INFINITY (e.g. array_count == 0); unlike maxDot there is no clamp to 0 here
+    }
+#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
+    return b3_mindot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
+class b3Vector4 : public b3Vector3
+	B3_FORCE_INLINE b3Vector4 absolute4() const // returns a vector holding the absolute value of all four components
+	{
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+		return b3MakeVector4(_mm_and_ps(mVec128, b3vAbsfMask)); // SSE path: bitwise AND with b3vAbsfMask (presumably a sign-bit-clearing mask; defined elsewhere)
+#elif defined(B3_USE_NEON)
+		return b3Vector4(vabsq_f32(mVec128)); // NEON path
+		return b3MakeVector4( // scalar path: b3Fabs applied to each stored float
+			b3Fabs(m_floats[0]),
+			b3Fabs(m_floats[1]),
+			b3Fabs(m_floats[2]),
+			b3Fabs(m_floats[3]));
+	}
+	b3Scalar	getW() const { return m_floats[3];}
+		B3_FORCE_INLINE int maxAxis4() const // returns the index (0..3) of the largest component, or -1 if none exceeds -B3_LARGE_FLOAT
+	{
+		int maxIndex = -1; // stays -1 when no comparison below succeeds
+		b3Scalar maxVal = b3Scalar(-B3_LARGE_FLOAT); // running maximum
+		if (m_floats[0] > maxVal)
+		{
+			maxIndex = 0;
+			maxVal = m_floats[0];
+		}
+		if (m_floats[1] > maxVal) // strict '>' keeps the lowest index on ties
+		{
+			maxIndex = 1;
+			maxVal = m_floats[1];
+		}
+		if (m_floats[2] > maxVal)
+		{
+			maxIndex = 2;
+			maxVal =m_floats[2];
+		}
+		if (m_floats[3] > maxVal)
+		{
+			maxIndex = 3;
+			maxVal = m_floats[3]; // value is not read again; kept for symmetry with the cases above
+		}
+		return maxIndex;
+	}
+	B3_FORCE_INLINE int minAxis4() const // returns the index (0..3) of the smallest component, or -1 if none is below B3_LARGE_FLOAT
+	{
+		int minIndex = -1; // stays -1 when no comparison below succeeds
+		b3Scalar minVal = b3Scalar(B3_LARGE_FLOAT); // running minimum
+		if (m_floats[0] < minVal)
+		{
+			minIndex = 0;
+			minVal = m_floats[0];
+		}
+		if (m_floats[1] < minVal) // strict '<' keeps the lowest index on ties
+		{
+			minIndex = 1;
+			minVal = m_floats[1];
+		}
+		if (m_floats[2] < minVal)
+		{
+			minIndex = 2;
+			minVal =m_floats[2];
+		}
+		if (m_floats[3] < minVal)
+		{
+			minIndex = 3;
+			minVal = m_floats[3]; // value is not read again; kept for symmetry with the cases above
+		}
+		return minIndex;
+	}
+	B3_FORCE_INLINE int closestAxis4() const // index of the component with the largest magnitude
+	{
+		return absolute4().maxAxis4(); // take absolute values first, then pick the largest
+	}
+  /**@brief Set x,y,z and zero w
+   * @param x Value of x
+   * @param y Value of y
+   * @param z Value of z
+   */
+/*		void getValue(b3Scalar *m) const
+		{
+			m[0] = m_floats[0];
+			m[1] = m_floats[1];
+			m[2] =m_floats[2];
+		}
+/**@brief Set the values
+   * @param x Value of x
+   * @param y Value of y
+   * @param z Value of z
+   * @param w Value of w
+   */
+		B3_FORCE_INLINE void	setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z,const b3Scalar& _w) // overwrites all four stored components
+		{
+			m_floats[0]=_x;
+			m_floats[1]=_y;
+			m_floats[2]=_z;
+			m_floats[3]=_w; // fourth slot is set explicitly here
+		}
+///b3SwapScalarEndian reverses the byte order of a single scalar, useful for network and cross-platform serialization
+B3_FORCE_INLINE void	b3SwapScalarEndian(const b3Scalar& sourceVal, b3Scalar& destVal)
+	unsigned char* dest = (unsigned char*) &destVal; // byte view of the output
+	unsigned char* src  = (unsigned char*) &sourceVal; // byte view of the input
+	dest[0] = src[7]; // 8-byte reversal; presumably the double-precision build (the selecting #if is not visible here)
+    dest[1] = src[6];
+    dest[2] = src[5];
+    dest[3] = src[4];
+    dest[4] = src[3];
+    dest[5] = src[2];
+    dest[6] = src[1];
+    dest[7] = src[0];
+	unsigned char* dest = (unsigned char*) &destVal; // second variant of the same body follows
+	unsigned char* src  = (unsigned char*) &sourceVal;
+	dest[0] = src[3]; // 4-byte reversal; presumably the single-precision build
+    dest[1] = src[2];
+    dest[2] = src[1];
+    dest[3] = src[0];
+///b3SwapVector3Endian writes a byte-swapped copy of sourceVec into destVec, useful for network and cross-platform serialization
+B3_FORCE_INLINE void	b3SwapVector3Endian(const b3Vector3& sourceVec, b3Vector3& destVec)
+	for (int i=0;i<4;i++) // four elements are swapped, i.e. index 3 is included
+	{
+		b3SwapScalarEndian(sourceVec[i],destVec[i]);
+	}
+///b3UnSwapVector3Endian byte-swaps a vector in place, useful for network and cross-platform serialization
+B3_FORCE_INLINE void	b3UnSwapVector3Endian(b3Vector3& vector)
+	b3Vector3	swappedVec; // temporary so the source is not overwritten while it is still being read
+	for (int i=0;i<4;i++) // four elements are swapped, i.e. index 3 is included
+	{
+		b3SwapScalarEndian(vector[i],swappedVec[i]);
+	}
+	vector = swappedVec; // copy the swapped result back
+template <class T>
+B3_FORCE_INLINE void b3PlaneSpace1 (const T& n, T& p, T& q) // from n, fills p (perpendicular to n) and q = n x p; T must support operator[] for indices 0..2
+  if (b3Fabs(n[2]) > B3_SQRT12) { // branch on the size of n's z component (B3_SQRT12 is defined elsewhere; presumably sqrt(1/2))
+    // choose p in y-z plane
+    b3Scalar a = n[1]*n[1] + n[2]*n[2]; // squared length of n's (y,z) part
+    b3Scalar k = b3RecipSqrt (a); // scale that normalizes p
+    p[0] = 0;
+	p[1] = -n[2]*k;
+	p[2] = n[1]*k;
+    // set q = n x p
+    q[0] = a*k;
+	q[1] = -n[0]*p[2];
+	q[2] = n[0]*p[1];
+  }
+  else {
+    // choose p in x-y plane
+    b3Scalar a = n[0]*n[0] + n[1]*n[1]; // squared length of n's (x,y) part
+    b3Scalar k = b3RecipSqrt (a); // scale that normalizes p
+    p[0] = -n[1]*k;
+	p[1] = n[0]*k;
+	p[2] = 0;
+    // set q = n x p
+    q[0] = -n[2]*p[1];
+	q[1] = n[2]*p[0];
+	q[2] = a*k;
+  }
+struct	b3Vector3FloatData
+	float	m_floats[4];
+struct	b3Vector3DoubleData
+	double	m_floats[4];
+B3_FORCE_INLINE	void	b3Vector3::serializeFloat(struct	b3Vector3FloatData& dataOut) const // copies the four stored scalars into dataOut, converting each to float
+	///could also do a memcpy, check if it is worth it
+	for (int i=0;i<4;i++)
+		dataOut.m_floats[i] = float(m_floats[i]); // narrows when b3Scalar is wider than float
+B3_FORCE_INLINE void	b3Vector3::deSerializeFloat(const struct	b3Vector3FloatData& dataIn) // loads all four scalars from float storage
+	for (int i=0;i<4;i++)
+		m_floats[i] = b3Scalar(dataIn.m_floats[i]); // convert float to the build's scalar type
+B3_FORCE_INLINE	void	b3Vector3::serializeDouble(struct	b3Vector3DoubleData& dataOut) const // copies the four stored scalars into dataOut, converting each to double
+	///could also do a memcpy, check if it is worth it
+	for (int i=0;i<4;i++)
+		dataOut.m_floats[i] = double(m_floats[i]);
+B3_FORCE_INLINE void	b3Vector3::deSerializeDouble(const struct	b3Vector3DoubleData& dataIn) // loads all four scalars from double storage
+	for (int i=0;i<4;i++)
+		m_floats[i] = b3Scalar(dataIn.m_floats[i]); // narrows when b3Scalar is float
+B3_FORCE_INLINE	void	b3Vector3::serialize(struct	b3Vector3Data& dataOut) const // copies the four stored scalars into dataOut without an explicit conversion
+	///could also do a memcpy, check if it is worth it
+	for (int i=0;i<4;i++)
+		dataOut.m_floats[i] = m_floats[i];
+B3_FORCE_INLINE void	b3Vector3::deSerialize(const struct	b3Vector3Data& dataIn) // loads all four scalars from dataIn without an explicit conversion
+	for (int i=0;i<4;i++)
+		m_floats[i] = dataIn.m_floats[i];
+inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z) // builds a b3Vector3 from three components
+	b3Vector3	tmp;
+	tmp.setValue(x,y,z); // the fourth component is whatever the 3-argument setValue leaves there
+	return tmp;
+inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z, b3Scalar w) // builds a b3Vector3 and also stores an explicit w
+	b3Vector3	tmp;
+	tmp.setValue(x,y,z);
+	tmp.w = w; // assigned after setValue so the caller's w is the final value
+	return tmp;
+inline b3Vector4 b3MakeVector4(b3Scalar x,b3Scalar y,b3Scalar z,b3Scalar w) // builds a b3Vector4 from four components
+	b3Vector4	tmp;
+	tmp.setValue(x,y,z,w);
+	return tmp;
+#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+inline b3Vector3 b3MakeVector3( b3SimdFloat4 v) // builds a b3Vector3 from a SIMD register value (SSE-in-API builds only, per the enclosing #if)
+        b3Vector3 tmp;
+        tmp.set128(v);
+        return tmp;
+inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec) // builds a b3Vector4 from a SIMD register value
+	b3Vector4	tmp;
+	tmp.set128(vec);
+	return tmp;
+#endif //B3_VECTOR3_H
diff --git a/src/bullet/Bullet3Common/shared/b3Float4.h b/src/bullet/Bullet3Common/shared/b3Float4.h
new file mode 100644
index 00000000..5e4b95bc
--- /dev/null
+++ b/src/bullet/Bullet3Common/shared/b3Float4.h
@@ -0,0 +1,97 @@
+#ifndef B3_FLOAT4_H
+#define B3_FLOAT4_H
+#include "Bullet3Common/shared/b3PlatformDefinitions.h"
+#ifdef __cplusplus
+	#include "Bullet3Common/b3Vector3.h"
+	#define b3Float4 b3Vector3
+	#define b3Float4ConstArg const b3Vector3&
+	#define b3Dot3F4 b3Dot
+	#define b3Cross3 b3Cross
+	#define	b3MakeFloat4  b3MakeVector3
+	inline b3Vector3 b3Normalized(const b3Vector3& vec) // C++ path: free-function wrapper around b3Vector3::normalized()
+	{
+		return vec.normalized();
+	}
+	inline b3Float4 b3FastNormalized3(b3Float4ConstArg v) // C++ path: same call as b3Normalized; no separate fast variant is used here
+	{
+		return v.normalized();
+	}
+	inline b3Float4 b3MaxFloat4 (const b3Float4& a, const b3Float4& b) // returns a copy of a after applying setMax(b)
+	{
+		b3Float4 tmp = a; // work on a copy so neither input is modified
+		tmp.setMax(b);
+		return tmp;
+	}
+	inline b3Float4 b3MinFloat4 (const b3Float4& a, const b3Float4& b) // returns a copy of a after applying setMin(b)
+	{
+		b3Float4 tmp = a; // work on a copy so neither input is modified
+		tmp.setMin(b);
+		return tmp;
+	}
+	typedef float4	b3Float4;
+	#define b3Float4ConstArg const b3Float4
+	#define b3MakeFloat4 (float4)
+	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1) // non-C++ path: dot product of the xyz parts only
+	{
+		float4 a1 = b3MakeFloat4(v0.xyz,0.f); // w forced to 0 so it cannot contribute
+		float4 b1 = b3MakeFloat4(v1.xyz,0.f);
+		return dot(a1, b1);
+	}
+	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1) // non-C++ path: cross product of the xyz parts
+	{
+		float4 a1 = b3MakeFloat4(v0.xyz,0.f); // w cleared on both operands before calling cross()
+		float4 b1 = b3MakeFloat4(v1.xyz,0.f);
+		return cross(a1, b1);
+	}
+	#define b3MinFloat4 min
+	#define b3MaxFloat4 max
+	#define b3Normalized(a) normalize(a)
+inline bool b3IsAlmostZero(b3Float4ConstArg v) // true when |x|, |y| and |z| are all <= 1e-6; w is not examined
+	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	 // any component over the tolerance means "not zero"
+		return false;
+	return true;
+inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut ) // returns the index of the vecArray entry with the largest b3Dot3F4 against vec; the value goes to *dotOut
+    float maxDot = -B3_INFINITY; // running maximum
+    int i = 0;
+    int ptIndex = -1; // -1 means "nothing selected yet"
+    for( i = 0; i < vecLen; i++ )
+    {
+        float dot = b3Dot3F4(vecArray[i],vec);
+        if( dot > maxDot ) // strict '>' keeps the first index on ties
+        {
+            maxDot = dot;
+            ptIndex = i;
+        }
+    }
+	b3Assert(ptIndex>=0);
+    if (ptIndex<0) // fall back to index 0 when nothing was selected (e.g. vecLen <= 0)
+	{
+		ptIndex = 0;
+	}
+    *dotOut = maxDot; // still -B3_INFINITY in the fallback case
+    return ptIndex;
+#endif //B3_FLOAT4_H
diff --git a/src/bullet/Bullet3Common/shared/b3Int2.h b/src/bullet/Bullet3Common/shared/b3Int2.h
new file mode 100644
index 00000000..f1d01f81
--- /dev/null
+++ b/src/bullet/Bullet3Common/shared/b3Int2.h
@@ -0,0 +1,64 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_INT2_H
+#define B3_INT2_H
+#ifdef __cplusplus
+struct b3UnsignedInt2
+	union
+	{
+		struct
+		{
+			unsigned int x,y;
+		};
+		struct
+		{
+			unsigned int s[2];
+		};
+	};
+struct b3Int2
+	union
+	{
+		struct
+		{
+			int x,y;
+		};
+		struct
+		{
+			int s[2];
+		};
+	};
+inline b3Int2 b3MakeInt2(int x, int y) // C++ path: builds a b3Int2 from two ints
+	b3Int2 v;
+	v.s[0] = x; v.s[1] = y; // written through the array view of the union
+	return v;
+#define b3UnsignedInt2 uint2
+#define b3Int2 int2
+#define b3MakeInt2 (int2)
+#endif //__cplusplus
\ No newline at end of file
diff --git a/src/bullet/Bullet3Common/shared/b3Int4.h b/src/bullet/Bullet3Common/shared/b3Int4.h
new file mode 100644
index 00000000..aa02d6be
--- /dev/null
+++ b/src/bullet/Bullet3Common/shared/b3Int4.h
@@ -0,0 +1,68 @@
+#ifndef B3_INT4_H
+#define B3_INT4_H
+#ifdef __cplusplus
+#include "Bullet3Common/b3Scalar.h"
+B3_ATTRIBUTE_ALIGNED16(struct) b3UnsignedInt4
+	union
+	{
+		struct
+		{
+			unsigned int x,y,z,w;
+		};
+		struct
+		{
+			unsigned int s[4];
+		};
+	};
+B3_ATTRIBUTE_ALIGNED16(struct) b3Int4
+	union
+	{
+		struct
+		{
+			int x,y,z,w;
+		};
+		struct
+		{
+			int s[4];
+		};
+	};
+B3_FORCE_INLINE b3Int4 b3MakeInt4(int x, int y, int z, int w = 0) // C++ path: builds a b3Int4; w defaults to 0
+	b3Int4 v;
+	v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w; // written through the array view of the union
+	return v;
+B3_FORCE_INLINE b3UnsignedInt4 b3MakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0) // C++ path: builds a b3UnsignedInt4; w defaults to 0
+	b3UnsignedInt4 v;
+	v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w; // written through the array view of the union
+	return v;
+#define b3UnsignedInt4 uint4
+#define b3Int4 int4
+#define b3MakeInt4 (int4)
+#define b3MakeUnsignedInt4 (uint4)
+#endif //__cplusplus
+#endif //B3_INT4_H
diff --git a/src/bullet/Bullet3Common/shared/b3Mat3x3.h b/src/bullet/Bullet3Common/shared/b3Mat3x3.h
new file mode 100644
index 00000000..7b1fef32
--- /dev/null
+++ b/src/bullet/Bullet3Common/shared/b3Mat3x3.h
@@ -0,0 +1,179 @@
+#ifndef B3_MAT3x3_H
+#define B3_MAT3x3_H
+#include "Bullet3Common/shared/b3Quat.h"
+#ifdef __cplusplus
+#include "Bullet3Common/b3Matrix3x3.h"
+#define b3Mat3x3 b3Matrix3x3
+#define b3Mat3x3ConstArg const b3Matrix3x3&
+inline b3Mat3x3 b3QuatGetRotationMatrix(b3QuatConstArg quat) // C++ path: constructs a b3Matrix3x3 from the quaternion
+	return b3Mat3x3(quat);
+inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg mat) // C++ path: forwards to b3Matrix3x3::absolute()
+	return mat.absolute();
+#define b3GetRow(m,row) m.getRow(row)
+b3Float4 mtMul3(b3Float4ConstArg a, b3Mat3x3ConstArg b) // C++ path: product of vector a and matrix b
+	return b*a; // NOTE(review): written as matrix*vector, whereas the non-C++ mtMul3 dots a with b's columns — confirm both paths agree for non-symmetric b
+typedef struct
+	b3Float4 m_row[3];
+#define b3Mat3x3ConstArg const b3Mat3x3
+#define b3GetRow(m,row) (m.m_row[row])
+inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat) // non-C++ path: expands a quaternion into a 3-row matrix (assumes quat is unit length — TODO confirm)
+	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f); // squared x, y, z reused by the diagonal terms
+	b3Mat3x3 out;
+	out.m_row[0].x=1-2*quat2.y-2*quat2.z;
+	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;
+	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;
+	out.m_row[0].w = 0.f; // the w lane of every row is zeroed
+	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;
+	out.m_row[1].y=1-2*quat2.x-2*quat2.z;
+	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;
+	out.m_row[1].w = 0.f;
+	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;
+	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;
+	out.m_row[2].z=1-2*quat2.x-2*quat2.y;
+	out.m_row[2].w = 0.f;
+	return out;
+inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn) // non-C++ path: applies fabs to each of the three rows
+	b3Mat3x3 out;
+	out.m_row[0] = fabs(matIn.m_row[0]); // fabs on a whole row, including its w lane
+	out.m_row[1] = fabs(matIn.m_row[1]);
+	out.m_row[2] = fabs(matIn.m_row[2]);
+	return out;
+b3Mat3x3 mtZero();
+b3Mat3x3 mtIdentity();
+b3Mat3x3 mtTranspose(b3Mat3x3 m);
+b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);
+b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);
+b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);
+b3Mat3x3 mtZero() // returns a matrix whose three rows are all zero
+	b3Mat3x3 m;
+	m.m_row[0] = (b3Float4)(0.f); // single-value vector literal for the whole row
+	m.m_row[1] = (b3Float4)(0.f);
+	m.m_row[2] = (b3Float4)(0.f);
+	return m;
+b3Mat3x3 mtIdentity() // returns the 3x3 identity; the w lane of each row is 0
+	b3Mat3x3 m;
+	m.m_row[0] = (b3Float4)(1,0,0,0);
+	m.m_row[1] = (b3Float4)(0,1,0,0);
+	m.m_row[2] = (b3Float4)(0,0,1,0);
+	return m;
+b3Mat3x3 mtTranspose(b3Mat3x3 m) // returns the transpose of the 3x3 part; w lanes of the result are 0
+	b3Mat3x3 out;
+	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f); // column x becomes row 0
+	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f); // column y becomes row 1
+	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f); // column z becomes row 2
+	return out;
+b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b) // matrix product a*b, computed as row-of-a dot column-of-b
+	b3Mat3x3 transB;
+	transB = mtTranspose( b ); // rows of transB are the columns of b
+	b3Mat3x3 ans;
+	//	why this doesn't run when 0ing in the for{}
+	a.m_row[0].w = 0.f; // a is a by-value copy, so the caller's matrix is untouched
+	a.m_row[1].w = 0.f;
+	a.m_row[2].w = 0.f;
+	for(int i=0; i<3; i++)
+	{
+//	a.m_row[i].w = 0.f;
+		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);
+		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);
+		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);
+		ans.m_row[i].w = 0.f; // keep the w lane of the result defined
+	}
+	return ans;
+b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b) // matrix times column vector: each result component is a row of a dotted with b
+	b3Float4 ans;
+	ans.x = b3Dot3F4( a.m_row[0], b );
+	ans.y = b3Dot3F4( a.m_row[1], b );
+	ans.z = b3Dot3F4( a.m_row[2], b );
+	ans.w = 0.f; // w lane of the result is zeroed
+	return ans;
+// Row vector times matrix: each result component is `a` dotted with one column of `b`.
+// The w lane of the result is zeroed, matching mtMul1, so callers never read an
+// uninitialised component.
+b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)
+	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
+	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
+	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
+	b3Float4 ans;
+	ans.x = b3Dot3F4( a, colx );
+	ans.y = b3Dot3F4( a, coly );
+	ans.z = b3Dot3F4( a, colz );
+	ans.w = 0.f;
+	return ans;
+#endif //B3_MAT3x3_H
diff --git a/src/bullet/Bullet3Common/shared/b3PlatformDefinitions.h b/src/bullet/Bullet3Common/shared/b3PlatformDefinitions.h
new file mode 100644
index 00000000..1c133fb0
--- /dev/null
+++ b/src/bullet/Bullet3Common/shared/b3PlatformDefinitions.h
@@ -0,0 +1,41 @@
+struct MyTest
+	int bla;
+#ifdef __cplusplus
+//#define b3ConstArray(a) const b3AlignedObjectArray<a>&
+#define b3ConstArray(a) const a*
+#define b3AtomicInc(a) ((*a)++)
+inline int b3AtomicAdd (volatile int *p, int val) // C++ path: adds val to *p and returns the value *p held before the add
+	int oldValue = *p; // NOTE(review): plain read / add / write with no hardware atomic or lock visible here — confirm callers on this path are single-threaded
+	int newValue = oldValue+val;
+	*p = newValue;
+	return oldValue; // returns the pre-add value
+#define __global 
+#define B3_STATIC static
+#define B3_LARGE_FLOAT 1e18f
+#define B3_INFINITY 1e18f
+#define b3Assert(a)
+#define b3ConstArray(a) __global const a*
+#define b3AtomicInc atomic_inc
+#define b3AtomicAdd atomic_add
+#define b3Fabs fabs
+#define b3Sqrt native_sqrt
+#define b3Sin native_sin
+#define b3Cos native_cos
+#define B3_STATIC
diff --git a/src/bullet/Bullet3Common/shared/b3Quat.h b/src/bullet/Bullet3Common/shared/b3Quat.h
new file mode 100644
index 00000000..f262d5e0
--- /dev/null
+++ b/src/bullet/Bullet3Common/shared/b3Quat.h
@@ -0,0 +1,103 @@
+#ifndef B3_QUAT_H
+#define B3_QUAT_H
+#include "Bullet3Common/shared/b3PlatformDefinitions.h"
+#include "Bullet3Common/shared/b3Float4.h"
+#ifdef __cplusplus
+	#include "Bullet3Common/b3Quaternion.h"
+	#include "Bullet3Common/b3Transform.h"
+	#define b3Quat b3Quaternion
+	#define b3QuatConstArg const b3Quaternion&
+	inline b3Quat b3QuatInverse(b3QuatConstArg orn) // C++ path: forwards to b3Quaternion::inverse()
+	{
+		return orn.inverse();
+	}
+	inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation) // C++ path: applies the transform built from orientation and translation to point
+	{
+		b3Transform tr; // both origin and rotation are assigned below before use
+		tr.setOrigin(translation);
+		tr.setRotation(orientation);
+		return tr(point); // b3Transform::operator() applies the transform to the point
+	}
+	typedef float4	b3Quat;
+	#define b3QuatConstArg const b3Quat
+inline float4 b3FastNormalize4(float4 v) // non-C++ path: normalizes the xyz part of v with fast_normalize
+	v = (float4)(v.xyz,0.f); // w is cleared first, so despite the name only xyz contribute to the length
+	return fast_normalize(v);
+inline b3Quat b3QuatMul(b3Quat a, b3Quat b);
+inline b3Quat b3QuatNormalized(b3QuatConstArg in);
+inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);
+inline b3Quat b3QuatInvert(b3QuatConstArg q);
+inline b3Quat b3QuatInverse(b3QuatConstArg q);
+inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b) // non-C++ path: quaternion product of a and b, with the scalar part stored in w
+	b3Quat ans;
+	ans = b3Cross3( a, b ); // cross product of the vector parts
+	ans += a.w*b+b.w*a; // w lane is also touched here, but it is overwritten on the line that sets ans.w
+//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w*b.w - b3Dot3F4(a, b); // scalar part: product of the w's minus the dot of the vector parts
+	return ans;
+inline b3Quat b3QuatNormalized(b3QuatConstArg in) // non-C++ path: returns in scaled to unit length, or (0,0,0,1) when its length is not positive
+	b3Quat q;
+	q=in;
+	//return b3FastNormalize4(in);
+	float len = native_sqrt(dot(q, q)); // 4-component length
+	if(len > 0.f)
+	{
+		q *= 1.f / len;
+	}
+	else
+	{
+		q.x = q.y = q.z = 0.f; // fallback quaternion with zero vector part and w = 1
+		q.w = 1.f;
+	}
+	return q;
+inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec) // non-C++ path: rotates vec by q, computed as q * vec * inverse(q)
+	b3Quat qInv = b3QuatInvert( q );
+	float4 vcpy = vec;
+	vcpy.w = 0.f; // the vector is multiplied as a quaternion with zero scalar part
+	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);
+	return out;
+inline b3Quat b3QuatInverse(b3QuatConstArg q) // non-C++ path: same body as b3QuatInvert
+	return (b3Quat)(-q.xyz, q.w); // negates xyz and keeps w; no division by the squared length, so presumably q is expected to be unit length
+inline b3Quat b3QuatInvert(b3QuatConstArg q) // non-C++ path: returns q with its xyz part negated
+	return (b3Quat)(-q.xyz, q.w); // w is kept unchanged
+inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec) // non-C++ path: rotates vec by the inverted q
+	return b3QuatRotate( b3QuatInvert( q ), vec );
+inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation) // non-C++ path: rotates point by orientation, then adds translation
+	return b3QuatRotate( orientation, point ) + (translation); // the sum covers all four lanes, so translation.w is added to the result's w
+#endif //B3_QUAT_H
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3ContactSolverInfo.h b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3ContactSolverInfo.h
new file mode 100644
index 00000000..7a12257b
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3ContactSolverInfo.h
@@ -0,0 +1,159 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Scalar.h"
+enum	b3SolverMode
+	B3_SOLVER_SIMD = 256,
+struct b3ContactSolverInfoData
+	b3Scalar	m_tau;
+	b3Scalar	m_damping;//global non-contact constraint damping, can be locally overridden by constraints during 'getInfo2'.
+	b3Scalar	m_friction;
+	b3Scalar	m_timeStep;
+	b3Scalar	m_restitution;
+	int		m_numIterations;
+	b3Scalar	m_maxErrorReduction;
+	b3Scalar	m_sor;
+	b3Scalar	m_erp;//used as Baumgarte factor
+	b3Scalar	m_erp2;//used in Split Impulse
+	b3Scalar	m_globalCfm;//constraint force mixing
+	int			m_splitImpulse;
+	b3Scalar	m_splitImpulsePenetrationThreshold;
+	b3Scalar	m_splitImpulseTurnErp;
+	b3Scalar	m_linearSlop;
+	b3Scalar	m_warmstartingFactor;
+	int			m_solverMode;
+	int	m_restingContactRestitutionThreshold;
+	int			m_minimumSolverBatchSize;
+	b3Scalar	m_maxGyroscopicForce;
+	b3Scalar	m_singleAxisRollingFrictionThreshold;
+struct b3ContactSolverInfo : public b3ContactSolverInfoData
+	inline b3ContactSolverInfo() // fills the inherited b3ContactSolverInfoData fields with default solver settings
+	{
+		m_tau = b3Scalar(0.6);
+		m_damping = b3Scalar(1.0);
+		m_friction = b3Scalar(0.3);
+		m_timeStep = b3Scalar(1.f/60.f); // default step of 1/60
+		m_restitution = b3Scalar(0.);
+		m_maxErrorReduction = b3Scalar(20.);
+		m_numIterations = 10;
+		m_erp = b3Scalar(0.2);
+		m_erp2 = b3Scalar(0.8);
+		m_globalCfm = b3Scalar(0.);
+		m_sor = b3Scalar(1.);
+		m_splitImpulse = true; // the field is declared int; true is stored as 1
+		m_splitImpulsePenetrationThreshold = -.04f;
+		m_splitImpulseTurnErp = 0.1f;
+		m_linearSlop = b3Scalar(0.0);
+		m_warmstartingFactor=b3Scalar(0.85);
+		m_restingContactRestitutionThreshold = 2;//unused as of 2.81
+		m_minimumSolverBatchSize = 128; //try to combine islands until the amount of constraints reaches this limit
+		m_maxGyroscopicForce = 100.f; ///only used to clamp forces for bodies that have their B3_ENABLE_GYROPSCOPIC_FORCE flag set (using b3RigidBody::setFlag)
+		m_singleAxisRollingFrictionThreshold = 1e30f;///if the velocity is above this threshold, it will use a single constraint row (axis), otherwise 3 rows.
+	} // NOTE(review): m_solverMode is not assigned in the lines visible here — confirm it is initialised
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct b3ContactSolverInfoDoubleData
+	double		m_tau;
+	double		m_damping;//global non-contact constraint damping, can be locally overridden by constraints during 'getInfo2'.
+	double		m_friction;
+	double		m_timeStep;
+	double		m_restitution;
+	double		m_maxErrorReduction;
+	double		m_sor;
+	double		m_erp;//used as Baumgarte factor
+	double		m_erp2;//used in Split Impulse
+	double		m_globalCfm;//constraint force mixing
+	double		m_splitImpulsePenetrationThreshold;
+	double		m_splitImpulseTurnErp;
+	double		m_linearSlop;
+	double		m_warmstartingFactor;
+	double		m_maxGyroscopicForce;
+	double		m_singleAxisRollingFrictionThreshold;
+	int			m_numIterations;
+	int			m_solverMode;
+	int			m_restingContactRestitutionThreshold;
+	int			m_minimumSolverBatchSize;
+	int			m_splitImpulse;
+	char		m_padding[4];
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct b3ContactSolverInfoFloatData
+	float		m_tau;
+	float		m_damping;//global non-contact constraint damping, can be locally overridden by constraints during 'getInfo2'.
+	float		m_friction;
+	float		m_timeStep;
+	float		m_restitution;
+	float		m_maxErrorReduction;
+	float		m_sor;
+	float		m_erp;//used as Baumgarte factor
+	float		m_erp2;//used in Split Impulse
+	float		m_globalCfm;//constraint force mixing
+	float		m_splitImpulsePenetrationThreshold;
+	float		m_splitImpulseTurnErp;
+	float		m_linearSlop;
+	float		m_warmstartingFactor;
+	float		m_maxGyroscopicForce;
+	float		m_singleAxisRollingFrictionThreshold;
+	int			m_numIterations;
+	int			m_solverMode;
+	int			m_restingContactRestitutionThreshold;
+	int			m_minimumSolverBatchSize;
+	int			m_splitImpulse;
+	char		m_padding[4];
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.cpp b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.cpp
new file mode 100644
index 00000000..5e11e749
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.cpp
@@ -0,0 +1,108 @@
+#include "b3FixedConstraint.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Common/b3TransformUtil.h"
+#include <new>
+b3FixedConstraint::b3FixedConstraint(int rbA,int rbB, const b3Transform& frameInA,const b3Transform& frameInB) // stores the two pivots and the target relative rotation taken from the supplied frames
+	m_pivotInA = frameInA.getOrigin(); // origin of the frame supplied for body A
+	m_pivotInB = frameInB.getOrigin(); // origin of the frame supplied for body B
+	m_relTargetAB = frameInA.getRotation()*frameInB.getRotation().inverse(); // target relative rotation; getInfo2 compares the current one against it
+b3FixedConstraint::~b3FixedConstraint ()
+void b3FixedConstraint::getInfo1 (b3ConstraintInfo1* info,const b3RigidBodyData* bodies) // reports the constraint size; bodies is not read
+	info->m_numConstraintRows = 6; // getInfo2 fills 3 linear + 3 angular rows
+	info->nub = 6;
+void b3FixedConstraint::getInfo2 (b3ConstraintInfo2* info, const b3RigidBodyData* bodies) // fills the Jacobian entries and error terms for 3 linear and 3 angular rows
+	//fix the 3 linear degrees of freedom
+	const b3Vector3& worldPosA = bodies[m_rbA].m_pos;
+	const b3Quaternion& worldOrnA = bodies[m_rbA].m_quat;
+	const b3Vector3& worldPosB= bodies[m_rbB].m_pos;
+	const b3Quaternion& worldOrnB = bodies[m_rbB].m_quat;
+	info->m_J1linearAxis[0] = 1; // 1 on the diagonal of the first three rows for body A
+	info->m_J1linearAxis[info->rowskip+1] = 1;
+	info->m_J1linearAxis[2*info->rowskip+2] = 1;
+	b3Vector3 a1 = b3QuatRotate(worldOrnA,m_pivotInA); // pivot A rotated by body A's orientation
+	{
+		b3Vector3* angular0 = (b3Vector3*)(info->m_J1angularAxis);
+		b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis+info->rowskip);
+		b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis+2*info->rowskip);
+		b3Vector3 a1neg = -a1;
+		a1neg.getSkewSymmetricMatrix(angular0,angular1,angular2); // skew matrix of -a1 written into the three angular rows
+	}
+	if (info->m_J2linearAxis)
+	{
+		info->m_J2linearAxis[0] = -1; // opposite sign for body B
+		info->m_J2linearAxis[info->rowskip+1] = -1;
+		info->m_J2linearAxis[2*info->rowskip+2] = -1;
+	}
+	b3Vector3 a2 = b3QuatRotate(worldOrnB,m_pivotInB); // pivot B rotated by body B's orientation
+	{
+	//	b3Vector3 a2n = -a2;
+		b3Vector3* angular0 = (b3Vector3*)(info->m_J2angularAxis); // NOTE(review): used without the null check applied to m_J2angularAxis further down — confirm it cannot be null
+		b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis+info->rowskip);
+		b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis+2*info->rowskip);
+		a2.getSkewSymmetricMatrix(angular0,angular1,angular2);
+	}
+    // set right hand side for the linear dofs
+	b3Scalar k = info->fps * info->erp; // scale applied to both the linear and the angular error
+	b3Vector3 linearError = k*(a2+worldPosB-a1-worldPosA); // scaled difference between the two pivot positions
+    int j;
+	for (j=0; j<3; j++)
+    {
+        info->m_constraintError[j*info->rowskip] = linearError[j];
+		//printf("info->m_constraintError[%d]=%f\n",j,info->m_constraintError[j]);
+    }
+		//fix the 3 angular degrees of freedom
+	int start_row = 3; // the angular rows follow the three linear rows
+	int s = info->rowskip;
+    int start_index = start_row * s;
+    // 3 rows to make body rotations equal
+	info->m_J1angularAxis[start_index] = 1;
+    info->m_J1angularAxis[start_index + s + 1] = 1;
+    info->m_J1angularAxis[start_index + s*2+2] = 1;
+    if ( info->m_J2angularAxis)
+    {
+        info->m_J2angularAxis[start_index] = -1;
+        info->m_J2angularAxis[start_index + s+1] = -1;
+        info->m_J2angularAxis[start_index + s*2+2] = -1;
+    }
+    // set right hand side for the angular dofs
+	b3Vector3 diff; // axis output of calculateDiffAxisAngleQuaternion
+	b3Scalar angle; // angle output of calculateDiffAxisAngleQuaternion
+	b3Quaternion qrelCur = worldOrnA *worldOrnB.inverse(); // current relative rotation, built the same way as m_relTargetAB
+	b3TransformUtil::calculateDiffAxisAngleQuaternion(m_relTargetAB,qrelCur,diff,angle);
+	diff*=-angle; // axis scaled by the negated angle
+	for (j=0; j<3; j++)
+    {
+        info->m_constraintError[(3+j)*info->rowskip] = k * diff[j]; // rows 3..5 hold the angular error
+    }
\ No newline at end of file
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.h b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.h
new file mode 100644
index 00000000..e884a829
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.h
@@ -0,0 +1,35 @@
+#include "b3TypedConstraint.h"
+B3_ATTRIBUTE_ALIGNED16(class) b3FixedConstraint : public b3TypedConstraint // constraint that reports 6 rows (3 linear + 3 angular) between two bodies given by index
+	b3Vector3 m_pivotInA; // origin of the frame given for body A
+	b3Vector3 m_pivotInB; // origin of the frame given for body B
+	b3Quaternion m_relTargetAB; // target relative rotation computed in the constructor
+	b3FixedConstraint(int  rbA,int rbB, const b3Transform& frameInA,const b3Transform& frameInB);
+	virtual ~b3FixedConstraint();
+	virtual void getInfo1 (b3ConstraintInfo1* info,const b3RigidBodyData* bodies);
+	virtual void getInfo2 (b3ConstraintInfo2* info, const b3RigidBodyData* bodies);
+	virtual	void	setParam(int num, b3Scalar value, int axis = -1) // not supported here: the body only asserts
+	{
+		b3Assert(0);
+	}
+	virtual	b3Scalar getParam(int num, int axis = -1) const // not supported here: asserts, then returns 0
+	{
+		b3Assert(0);
+		return 0.f;
+	}
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.cpp b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.cpp
new file mode 100644
index 00000000..b2398f45
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.cpp
@@ -0,0 +1,807 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+Refactored by Francisco León
+email: projectileman@yahoo.com
+#include "b3Generic6DofConstraint.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Common/b3TransformUtil.h"
+#include "Bullet3Common/b3TransformUtil.h"
+#include <new>
+#define D6_USE_OBSOLETE_METHOD false
+#define D6_USE_FRAME_OFFSET true
+// Creates a 6-DOF constraint of type B3_D6_CONSTRAINT_TYPE between the bodies with indices rbA and rbB.
+// frameInA / frameInB are stored as the constraint frames, then calculateTransforms(bodies)
+// is called so the cached m_calculatedTransformA/B are filled in right away.
+// NOTE(review): the initializer list ends with a dangling comma and useLinearReferenceFrameA is
+// not referenced in the lines shown here - confirm no initializers were lost.
+b3Generic6DofConstraint::b3Generic6DofConstraint(int rbA,int  rbB, const b3Transform& frameInA, const b3Transform& frameInB, bool useLinearReferenceFrameA, const b3RigidBodyData* bodies)
+: b3TypedConstraint(B3_D6_CONSTRAINT_TYPE, rbA, rbB)
+, m_frameInA(frameInA)
+, m_frameInB(frameInB),
+	calculateTransforms(bodies);
+b3Scalar btGetMatrixElem(const b3Matrix3x3& mat, int index);
+// Flat element access: index % 3 is used as the first subscript of mat and
+// index / 3 as the second (callers in this file pass values in 0..8).
+b3Scalar btGetMatrixElem(const b3Matrix3x3& mat, int index)
+	return mat[index % 3][index / 3];
+///MatrixToEulerXYZ from http://www.geometrictools.com/LibFoundation/Mathematics/Wm4Matrix3.inl.html
+// Extracts XYZ Euler angles from `mat` into `xyz`.
+// Returns true when element 2 (as read by btGetMatrixElem) lies strictly inside (-1, 1), i.e. the
+// decomposition is unique. Returns false in the two degenerate cases, where xyz[1] is
+// pinned to +/-B3_HALF_PI and xyz[2] is forced to 0.
+bool	matrixToEulerXYZ(const b3Matrix3x3& mat,b3Vector3& xyz);
+bool	matrixToEulerXYZ(const b3Matrix3x3& mat,b3Vector3& xyz)
+	//	// rot =  cy*cz          -cy*sz           sy
+	//	//        cz*sx*sy+cx*sz  cx*cz-sx*sy*sz -cy*sx
+	//	//       -cx*cz*sy+sx*sz  cz*sx+cx*sy*sz  cx*cy
+	//
+	// fi is the "sy" entry of the layout sketched above
+	b3Scalar fi = btGetMatrixElem(mat,2);
+	if (fi < b3Scalar(1.0f))
+	{
+		if (fi > b3Scalar(-1.0f))
+		{
+			// regular case: all three angles are recoverable
+			xyz[0] = b3Atan2(-btGetMatrixElem(mat,5),btGetMatrixElem(mat,8));
+			xyz[1] = b3Asin(btGetMatrixElem(mat,2));
+			xyz[2] = b3Atan2(-btGetMatrixElem(mat,1),btGetMatrixElem(mat,0));
+			return true;
+		}
+		else
+		{
+			// WARNING.  Not unique.  XA - ZA = -atan2(r10,r11)
+			xyz[0] = -b3Atan2(btGetMatrixElem(mat,3),btGetMatrixElem(mat,4));
+			xyz[1] = -B3_HALF_PI;
+			xyz[2] = b3Scalar(0.0);
+			return false;
+		}
+	}
+	else
+	{
+		// WARNING.  Not unique.  XAngle + ZAngle = atan2(r10,r11)
+		xyz[0] = b3Atan2(btGetMatrixElem(mat,3),btGetMatrixElem(mat,4));
+		xyz[1] = B3_HALF_PI;
+		xyz[2] = 0.0;
+	}
+	return false;
+//////////////////////////// b3RotationalLimitMotor ////////////////////////////////////
+// Classifies test_value against [m_loLimit, m_hiLimit] and stores the result in m_currentLimit.
+// Returns 0 when the axis is free (lo > hi) or the value is inside the range,
+// 1 when the value is below m_loLimit, 2 when it is above m_hiLimit.
+// On a violation m_currentLimitError receives the signed overshoot, shifted by B3_2_PI
+// when its magnitude exceeds B3_PI.
+// NOTE(review): in the "free" paths m_currentLimitError keeps its previous value
+// (the translational variant resets it to 0) - confirm this is intended.
+int b3RotationalLimitMotor::testLimitValue(b3Scalar test_value)
+	if(m_loLimit>m_hiLimit)
+	{
+		m_currentLimit = 0;//Free from violation
+		return 0;
+	}
+	if (test_value < m_loLimit)
+	{
+		m_currentLimit = 1;//low limit violation
+		m_currentLimitError =  test_value - m_loLimit;
+		// keep the error within one turn
+		if(m_currentLimitError>B3_PI) 
+			m_currentLimitError-=B3_2_PI;
+		else if(m_currentLimitError<-B3_PI) 
+			m_currentLimitError+=B3_2_PI;
+		return 1;
+	}
+	else if (test_value> m_hiLimit)
+	{
+		m_currentLimit = 2;//High limit violation
+		m_currentLimitError = test_value - m_hiLimit;
+		// keep the error within one turn
+		if(m_currentLimitError>B3_PI) 
+			m_currentLimitError-=B3_2_PI;
+		else if(m_currentLimitError<-B3_PI) 
+			m_currentLimitError+=B3_2_PI;
+		return 2;
+	};
+	m_currentLimit = 0;//Free from violation
+	return 0;
+//////////////////////////// End b3RotationalLimitMotor ////////////////////////////////////
+//////////////////////////// b3TranslationalLimitMotor ////////////////////////////////////
+// Classifies test_value against the lower/upper limit of component limitIndex and records
+// the outcome in m_currentLimit[limitIndex] and m_currentLimitError[limitIndex].
+// Returns 0 when the axis is free (lower > upper) or the value is within range,
+// 2 when the value is below the lower limit, 1 when it is above the upper limit.
+// Note the codes are swapped relative to b3RotationalLimitMotor::testLimitValue (1 = low, 2 = high).
+int b3TranslationalLimitMotor::testLimitValue(int limitIndex, b3Scalar test_value)
+	b3Scalar loLimit = m_lowerLimit[limitIndex];
+	b3Scalar hiLimit = m_upperLimit[limitIndex];
+	if(loLimit > hiLimit)
+	{
+		m_currentLimit[limitIndex] = 0;//Free from violation
+		m_currentLimitError[limitIndex] = b3Scalar(0.f);
+		return 0;
+	}
+	if (test_value < loLimit)
+	{
+		m_currentLimit[limitIndex] = 2;//low limit violation
+		m_currentLimitError[limitIndex] =  test_value - loLimit;
+		return 2;
+	}
+	else if (test_value> hiLimit)
+	{
+		m_currentLimit[limitIndex] = 1;//High limit violation
+		m_currentLimitError[limitIndex] = test_value - hiLimit;
+		return 1;
+	};
+	m_currentLimit[limitIndex] = 0;//Free from violation
+	m_currentLimitError[limitIndex] = b3Scalar(0.f);
+	return 0;
+//////////////////////////// End b3TranslationalLimitMotor ////////////////////////////////////
+// Refreshes the angular state from the cached constraint transforms:
+// m_calculatedAxisAngleDiff receives the XYZ Euler angles of frame B relative to frame A,
+// and m_calculatedAxis[0..2] receive the (normalized) axes used for the angular rows.
+// The boolean result of matrixToEulerXYZ (uniqueness of the decomposition) is ignored.
+void b3Generic6DofConstraint::calculateAngleInfo()
+	b3Matrix3x3 relative_frame = m_calculatedTransformA.getBasis().inverse()*m_calculatedTransformB.getBasis();
+	matrixToEulerXYZ(relative_frame,m_calculatedAxisAngleDiff);
+	// in euler angle mode we do not actually constrain the angular velocity
+	// along the axes axis[0] and axis[2] (although we do use axis[1]) :
+	//
+	//    to get			constrain w2-w1 along		...not
+	//    ------			---------------------		------
+	//    d(angle[0])/dt = 0	ax[1] x ax[2]			ax[0]
+	//    d(angle[1])/dt = 0	ax[1]
+	//    d(angle[2])/dt = 0	ax[0] x ax[1]			ax[2]
+	//
+	// constraining w2-w1 along an axis 'a' means that a'*(w2-w1)=0.
+	// to prove the result for angle[0], write the expression for angle[0] from
+	// GetInfo1 then take the derivative. to prove this for angle[2] it is
+	// easier to take the euler rate expression for d(angle[2])/dt with respect
+	// to the components of w and set that to 0.
+	b3Vector3 axis0 = m_calculatedTransformB.getBasis().getColumn(0);
+	b3Vector3 axis2 = m_calculatedTransformA.getBasis().getColumn(2);
+	// NOTE(review): if axis0 and axis2 are parallel the cross product below is zero before
+	// normalize() is applied - confirm how normalize() behaves for a zero-length vector.
+	m_calculatedAxis[1] = axis2.cross(axis0);
+	m_calculatedAxis[0] = m_calculatedAxis[1].cross(axis2);
+	m_calculatedAxis[2] = axis0.cross(m_calculatedAxis[1]);
+	m_calculatedAxis[0].normalize();
+	m_calculatedAxis[1].normalize();
+	m_calculatedAxis[2].normalize();
+// Packs the body's m_quat and m_pos into a b3Transform.
+static b3Transform getCenterOfMassTransform(const b3RigidBodyData& body)
+	return b3Transform(body.m_quat,body.m_pos);
+// Convenience overload: looks up the transforms of bodies m_rbA and m_rbB in `bodies`
+// and forwards them to the three-argument calculateTransforms.
+void b3Generic6DofConstraint::calculateTransforms(const b3RigidBodyData* bodies)
+	const b3Transform bodyTransA = getCenterOfMassTransform(bodies[m_rbA]);
+	const b3Transform bodyTransB = getCenterOfMassTransform(bodies[m_rbB]);
+	calculateTransforms(bodyTransA,bodyTransB,bodies);
+// Recomputes the cached constraint frames (body transform * local frame) for both bodies,
+// then refreshes the linear and angular state derived from them.
+// When m_useOffsetForConstraintFrame is set, it also derives the weights m_factA / m_factB
+// from the bodies' inverse masses and records in m_hasStaticBody whether either inverse
+// mass is below B3_EPSILON.
+void b3Generic6DofConstraint::calculateTransforms(const b3Transform& transA,const b3Transform& transB,const b3RigidBodyData* bodies)
+	m_calculatedTransformA = transA * m_frameInA;
+	m_calculatedTransformB = transB * m_frameInB;
+	calculateLinearInfo();
+	calculateAngleInfo();
+	if(!m_useOffsetForConstraintFrame)
+	{
+		return;
+	}
+	const b3Scalar invMassA = bodies[m_rbA].m_invMass;
+	const b3Scalar invMassB = bodies[m_rbB].m_invMass;
+	const b3Scalar invMassSum = invMassA + invMassB;
+	m_hasStaticBody = (invMassA < B3_EPSILON) || (invMassB < B3_EPSILON);
+	// split evenly when the combined inverse mass is not positive
+	m_factA = (invMassSum > b3Scalar(0.f)) ? (invMassB / invMassSum) : b3Scalar(0.5f);
+	m_factB = b3Scalar(1.0f) - m_factA;
+// Updates the angular motor of axis_index from the cached Euler angle: the angle is passed
+// through b3AdjustAngleToLimits, stored as the motor's current position and tested against
+// the limits. Returns the motor's needApplyTorques().
+bool b3Generic6DofConstraint::testAngularLimitMotor(int axis_index)
+	const b3Scalar adjusted = b3AdjustAngleToLimits(
+		m_calculatedAxisAngleDiff[axis_index],
+		m_angularLimits[axis_index].m_loLimit,
+		m_angularLimits[axis_index].m_hiLimit);
+	m_angularLimits[axis_index].m_currentPosition = adjusted;
+	m_angularLimits[axis_index].testLimitValue(adjusted);
+	return m_angularLimits[axis_index].needApplyTorques();
+void b3Generic6DofConstraint::getInfo1 (b3ConstraintInfo1* info,const b3RigidBodyData* bodies)
+	//prepare constraint
+	calculateTransforms(getCenterOfMassTransform(bodies[m_rbA]),getCenterOfMassTransform(bodies[m_rbB]),bodies);
+	info->m_numConstraintRows = 0;
+	info->nub = 6;
+	int i;
+	//test linear limits
+	for(i = 0; i < 3; i++)
+	{
+		if(m_linearLimits.needApplyForce(i))
+		{
+			info->m_numConstraintRows++;
+			info->nub--;
+		}
+	}
+	//test angular limits
+	for (i=0;i<3 ;i++ )
+	{
+		if(testAngularLimitMotor(i))
+		{
+			info->m_numConstraintRows++;
+			info->nub--;
+		}
+	}
+//	printf("info->m_numConstraintRows=%d\n",info->m_numConstraintRows);
+// Non-virtual variant of getInfo1: always reserves all six rows and reports nub = 0,
+// without inspecting `bodies` or the current limit state.
+void b3Generic6DofConstraint::getInfo1NonVirtual (b3ConstraintInfo1* info,const b3RigidBodyData* bodies)
+	//pre-allocate all 6
+	info->m_numConstraintRows = 6;
+	info->nub = 0;
+// Fills the solver rows for this constraint from the current state of bodies m_rbA and m_rbB.
+// With m_useOffsetForConstraintFrame the angular rows are written first, otherwise the
+// linear rows come first (older ordering kept for compatibility).
+void b3Generic6DofConstraint::getInfo2 (b3ConstraintInfo2* info,const b3RigidBodyData* bodies)
+	const b3Transform transA = getCenterOfMassTransform(bodies[m_rbA]);
+	const b3Transform transB = getCenterOfMassTransform(bodies[m_rbB]);
+	const b3Vector3& linVelA = bodies[m_rbA].m_linVel;
+	const b3Vector3& linVelB = bodies[m_rbB].m_linVel;
+	const b3Vector3& angVelA = bodies[m_rbA].m_angVel;
+	const b3Vector3& angVelB = bodies[m_rbB].m_angVel;
+	if(!m_useOffsetForConstraintFrame)
+	{
+		// compatibility path: linear rows, then angular rows
+		const int nextRow = setLinearLimits(info, 0, transA,transB,linVelA,linVelB,angVelA,angVelB);
+		setAngularLimits(info, nextRow,transA,transB,linVelA,linVelB,angVelA,angVelB);
+	}
+	else
+	{
+		// for stability the angular rows are solved first
+		const int nextRow = setAngularLimits(info, 0,transA,transB,linVelA,linVelB,angVelA,angVelB);
+		setLinearLimits(info, nextRow, transA,transB,linVelA,linVelB,angVelA,angVelB);
+	}
+void b3Generic6DofConstraint::getInfo2NonVirtual (b3ConstraintInfo2* info, const b3Transform& transA,const b3Transform& transB,const b3Vector3& linVelA,const b3Vector3& linVelB,const b3Vector3& angVelA,const b3Vector3& angVelB,const b3RigidBodyData* bodies)
+	//prepare constraint
+	calculateTransforms(transA,transB,bodies);
+	int i;
+	for (i=0;i<3 ;i++ )
+	{
+		testAngularLimitMotor(i);
+	}
+	if(m_useOffsetForConstraintFrame)
+	{ // for stability better to solve angular limits first
+		int row = setAngularLimits(info, 0,transA,transB,linVelA,linVelB,angVelA,angVelB);
+		setLinearLimits(info, row, transA,transB,linVelA,linVelB,angVelA,angVelB);
+	}
+	else
+	{ // leave old version for compatibility
+		int row = setLinearLimits(info, 0, transA,transB,linVelA,linVelB,angVelA,angVelB);
+		setAngularLimits(info, row,transA,transB,linVelA,linVelB,angVelA,angVelB);
+	}
+int b3Generic6DofConstraint::setLinearLimits(b3ConstraintInfo2* info, int row, const b3Transform& transA,const b3Transform& transB,const b3Vector3& linVelA,const b3Vector3& linVelB,const b3Vector3& angVelA,const b3Vector3& angVelB)
+//	int row = 0;
+	//solve linear limits
+	b3RotationalLimitMotor limot;
+	for (int i=0;i<3 ;i++ )
+	{
+		if(m_linearLimits.needApplyForce(i))
+		{ // re-use rotational motor code
+			limot.m_bounce = b3Scalar(0.f);
+			limot.m_currentLimit = m_linearLimits.m_currentLimit[i];
+			limot.m_currentPosition = m_linearLimits.m_currentLinearDiff[i];
+			limot.m_currentLimitError  = m_linearLimits.m_currentLimitError[i];
+			limot.m_damping  = m_linearLimits.m_damping;
+			limot.m_enableMotor  = m_linearLimits.m_enableMotor[i];
+			limot.m_hiLimit  = m_linearLimits.m_upperLimit[i];
+			limot.m_limitSoftness  = m_linearLimits.m_limitSoftness;
+			limot.m_loLimit  = m_linearLimits.m_lowerLimit[i];
+			limot.m_maxLimitForce  = b3Scalar(0.f);
+			limot.m_maxMotorForce  = m_linearLimits.m_maxMotorForce[i];
+			limot.m_targetVelocity  = m_linearLimits.m_targetVelocity[i];
+			b3Vector3 axis = m_calculatedTransformA.getBasis().getColumn(i);
+			int flags = m_flags >> (i * B3_6DOF_FLAGS_AXIS_SHIFT);
+			limot.m_normalCFM	= (flags & B3_6DOF_FLAGS_CFM_NORM) ? m_linearLimits.m_normalCFM[i] : info->cfm[0];
+			limot.m_stopCFM		= (flags & B3_6DOF_FLAGS_CFM_STOP) ? m_linearLimits.m_stopCFM[i] : info->cfm[0];
+			limot.m_stopERP		= (flags & B3_6DOF_FLAGS_ERP_STOP) ? m_linearLimits.m_stopERP[i] : info->erp;
+			if(m_useOffsetForConstraintFrame)
+			{
+				int indx1 = (i + 1) % 3;
+				int indx2 = (i + 2) % 3;
+				int rotAllowed = 1; // rotations around orthos to current axis
+				if(m_angularLimits[indx1].m_currentLimit && m_angularLimits[indx2].m_currentLimit)
+				{
+					rotAllowed = 0;
+				}
+				row += get_limit_motor_info2(&limot, transA,transB,linVelA,linVelB,angVelA,angVelB, info, row, axis, 0, rotAllowed);
+			}
+			else
+			{
+				row += get_limit_motor_info2(&limot, transA,transB,linVelA,linVelB,angVelA,angVelB, info, row, axis, 0);
+			}
+		}
+	}
+	return row;
+int b3Generic6DofConstraint::setAngularLimits(b3ConstraintInfo2 *info, int row_offset, const b3Transform& transA,const b3Transform& transB,const b3Vector3& linVelA,const b3Vector3& linVelB,const b3Vector3& angVelA,const b3Vector3& angVelB)
+	b3Generic6DofConstraint * d6constraint = this;
+	int row = row_offset;
+	//solve angular limits
+	for (int i=0;i<3 ;i++ )
+	{
+		if(d6constraint->getRotationalLimitMotor(i)->needApplyTorques())
+		{
+			b3Vector3 axis = d6constraint->getAxis(i);
+			int flags = m_flags >> ((i + 3) * B3_6DOF_FLAGS_AXIS_SHIFT);
+			if(!(flags & B3_6DOF_FLAGS_CFM_NORM))
+			{
+				m_angularLimits[i].m_normalCFM = info->cfm[0];
+			}
+			if(!(flags & B3_6DOF_FLAGS_CFM_STOP))
+			{
+				m_angularLimits[i].m_stopCFM = info->cfm[0];
+			}
+			if(!(flags & B3_6DOF_FLAGS_ERP_STOP))
+			{
+				m_angularLimits[i].m_stopERP = info->erp;
+			}
+			row += get_limit_motor_info2(d6constraint->getRotationalLimitMotor(i),
+												transA,transB,linVelA,linVelB,angVelA,angVelB, info,row,axis,1);
+		}
+	}
+	return row;
+// Intentionally a no-op: the time step is only referenced to silence unused-parameter warnings.
+void	b3Generic6DofConstraint::updateRHS(b3Scalar	timeStep)
+	(void)timeStep;
+// Replaces both local constraint frames and immediately recomputes the cached
+// transforms and limit state via calculateTransforms(bodies).
+void b3Generic6DofConstraint::setFrames(const b3Transform& frameA, const b3Transform& frameB,const b3RigidBodyData* bodies)
+	m_frameInA = frameA;
+	m_frameInB = frameB;
+	calculateTransforms(bodies);
+// Returns the cached angular axis axis_index as last computed by calculateAngleInfo().
+// No bounds check is performed on axis_index.
+b3Vector3 b3Generic6DofConstraint::getAxis(int axis_index) const
+	return m_calculatedAxis[axis_index];
+// Returns component axisIndex of the cached linear offset of frame B relative to frame A,
+// expressed in frame A, as last computed by calculateLinearInfo().
+b3Scalar	b3Generic6DofConstraint::getRelativePivotPosition(int axisIndex) const
+	return m_calculatedLinearDiff[axisIndex];
+// Returns the cached Euler angle for axisIndex as last computed by calculateAngleInfo().
+b3Scalar b3Generic6DofConstraint::getAngle(int axisIndex) const
+	return m_calculatedAxisAngleDiff[axisIndex];
+// Sets m_AnchorPos to a blend of the two cached constraint-frame origins.
+// The weight on frame A is imA / (imA + imB), or exactly 1 when body B's inverse mass is zero.
+void b3Generic6DofConstraint::calcAnchorPos(const b3RigidBodyData* bodies)
+	const b3Scalar invMassA = bodies[m_rbA].m_invMass;
+	const b3Scalar invMassB = bodies[m_rbB].m_invMass;
+	const b3Scalar weightA = (invMassB == b3Scalar(0.0)) ? b3Scalar(1.0) : invMassA / (invMassA + invMassB);
+	const b3Vector3& originA = m_calculatedTransformA.getOrigin();
+	const b3Vector3& originB = m_calculatedTransformB.getOrigin();
+	m_AnchorPos = originA * weightA + originB * (b3Scalar(1.0) - weightA);
+void b3Generic6DofConstraint::calculateLinearInfo()
+	m_calculatedLinearDiff = m_calculatedTransformB.getOrigin() - m_calculatedTransformA.getOrigin();
+	m_calculatedLinearDiff = m_calculatedTransformA.getBasis().inverse() * m_calculatedLinearDiff;
+	for(int i = 0; i < 3; i++)
+	{
+		m_linearLimits.m_currentLinearDiff[i] = m_calculatedLinearDiff[i];
+		m_linearLimits.testLimitValue(i, m_calculatedLinearDiff[i]);
+	}
+// Fills solver row `row` of `info` for a single axis described by `limot`.
+// - rotational != 0: the axis ax1 is written to the angular Jacobian blocks;
+//   otherwise it goes to the linear blocks and the angular blocks receive the lever-arm terms.
+// - rotAllowed is only read on the linear path when m_useOffsetForConstraintFrame is set.
+// Returns 1 when a row was written (motor enabled or limit active), 0 otherwise.
+// NOTE(review): J1/J2 are null-checked, but the linear path writes info->m_J1angularAxis /
+// info->m_J2angularAxis without such a check - confirm those pointers are always valid.
+int b3Generic6DofConstraint::get_limit_motor_info2(
+	b3RotationalLimitMotor * limot,
+	const b3Transform& transA,const b3Transform& transB,const b3Vector3& linVelA,const b3Vector3& linVelB,const b3Vector3& angVelA,const b3Vector3& angVelB,
+	b3ConstraintInfo2 *info, int row, b3Vector3& ax1, int rotational,int rotAllowed)
+    // offset of this row inside the flat Jacobian / error arrays
+    int srow = row * info->rowskip;
+    int powered = limot->m_enableMotor;
+    int limit = limot->m_currentLimit;
+    if (powered || limit)
+    {   // if the joint is powered, or has joint limits, add in the extra row
+        b3Scalar *J1 = rotational ? info->m_J1angularAxis : info->m_J1linearAxis;
+        b3Scalar *J2 = rotational ? info->m_J2angularAxis : info->m_J2linearAxis;
+		if (J1)
+		{
+			J1[srow+0] = ax1[0];
+			J1[srow+1] = ax1[1];
+			J1[srow+2] = ax1[2];
+		}
+		if (J2)
+		{
+			J2[srow+0] = -ax1[0];
+			J2[srow+1] = -ax1[1];
+			J2[srow+2] = -ax1[2];
+		}
+		if((!rotational))
+        {
+			if (m_useOffsetForConstraintFrame)
+			{
+				b3Vector3 tmpA, tmpB, relA, relB;
+				// get vector from bodyB to frameB in WCS
+				relB = m_calculatedTransformB.getOrigin() - transB.getOrigin();
+				// get its projection to constraint axis
+				b3Vector3 projB = ax1 * relB.dot(ax1);
+				// get vector directed from bodyB to constraint axis (and orthogonal to it)
+				b3Vector3 orthoB = relB - projB;
+				// same for bodyA
+				relA = m_calculatedTransformA.getOrigin() - transA.getOrigin();
+				b3Vector3 projA = ax1 * relA.dot(ax1);
+				b3Vector3 orthoA = relA - projA;
+				// get desired offset between frames A and B along constraint axis
+				b3Scalar desiredOffs = limot->m_currentPosition - limot->m_currentLimitError;
+				// desired vector from projection of center of bodyA to projection of center of bodyB to constraint axis
+				b3Vector3 totalDist = projA + ax1 * desiredOffs - projB;
+				// get offset vectors relA and relB
+				relA = orthoA + totalDist * m_factA;
+				relB = orthoB - totalDist * m_factB;
+				tmpA = relA.cross(ax1);
+				tmpB = relB.cross(ax1);
+				if(m_hasStaticBody && (!rotAllowed))
+				{
+					tmpA *= m_factA;
+					tmpB *= m_factB;
+				}
+				int i;
+				for (i=0; i<3; i++) info->m_J1angularAxis[srow+i] = tmpA[i];
+				for (i=0; i<3; i++) info->m_J2angularAxis[srow+i] = -tmpB[i];
+			} else
+			{
+				b3Vector3 ltd;	// Linear Torque Decoupling vector
+				b3Vector3 c = m_calculatedTransformB.getOrigin() - transA.getOrigin();
+				ltd = c.cross(ax1);
+				info->m_J1angularAxis[srow+0] = ltd[0];
+				info->m_J1angularAxis[srow+1] = ltd[1];
+				info->m_J1angularAxis[srow+2] = ltd[2];
+				c = m_calculatedTransformB.getOrigin() - transB.getOrigin();
+				ltd = -c.cross(ax1);
+				info->m_J2angularAxis[srow+0] = ltd[0];
+				info->m_J2angularAxis[srow+1] = ltd[1];
+				info->m_J2angularAxis[srow+2] = ltd[2];
+			}
+        }
+        // if we're limited low and high simultaneously, the joint motor is
+        // ineffective
+        if (limit && (limot->m_loLimit == limot->m_hiLimit)) powered = 0;
+        info->m_constraintError[srow] = b3Scalar(0.f);
+        if (powered)
+        {
+			info->cfm[srow] = limot->m_normalCFM;
+            if(!limit)
+            {
+				// NOTE(review): tag_vel is negated for linear axes, yet the error term below uses
+				// the un-negated m_targetVelocity - confirm this asymmetry is intended.
+				b3Scalar tag_vel = rotational ? limot->m_targetVelocity : -limot->m_targetVelocity;
+				b3Scalar mot_fact = getMotorFactor(	limot->m_currentPosition, 
+													limot->m_loLimit,
+													limot->m_hiLimit, 
+													tag_vel, 
+													info->fps * limot->m_stopERP);
+				info->m_constraintError[srow] += mot_fact * limot->m_targetVelocity;
+                info->m_lowerLimit[srow] = -limot->m_maxMotorForce;
+                info->m_upperLimit[srow] = limot->m_maxMotorForce;
+            }
+        }
+        if(limit)
+        {
+            // error-correction gain for the limit violation
+            b3Scalar k = info->fps * limot->m_stopERP;
+			if(!rotational)
+			{
+				info->m_constraintError[srow] += k * limot->m_currentLimitError;
+			}
+			else
+			{
+				info->m_constraintError[srow] += -k * limot->m_currentLimitError;
+			}
+			info->cfm[srow] = limot->m_stopCFM;
+            if (limot->m_loLimit == limot->m_hiLimit)
+            {   // limited low and high simultaneously
+                info->m_lowerLimit[srow] = -B3_INFINITY;
+                info->m_upperLimit[srow] = B3_INFINITY;
+            }
+            else
+            {
+                // one-sided row: the sign of the allowed impulse depends on which limit code is active
+                if (limit == 1)
+                {
+                    info->m_lowerLimit[srow] = 0;
+                    info->m_upperLimit[srow] = B3_INFINITY;
+                }
+                else
+                {
+                    info->m_lowerLimit[srow] = -B3_INFINITY;
+                    info->m_upperLimit[srow] = 0;
+                }
+                // deal with bounce
+                if (limot->m_bounce > 0)
+                {
+                    // calculate joint velocity
+                    b3Scalar vel;
+                    if (rotational)
+                    {
+                        vel = angVelA.dot(ax1);
+//make sure that if no body -> angVelB == zero vec
+//                        if (body1)
+                            vel -= angVelB.dot(ax1);
+                    }
+                    else
+                    {
+                        vel = linVelA.dot(ax1);
+//make sure that if no body -> angVelB == zero vec
+//                        if (body1)
+                            vel -= linVelB.dot(ax1);
+                    }
+                    // only apply bounce if the velocity is incoming, and if the
+                    // resulting c[] exceeds what we already have.
+                    if (limit == 1)
+                    {
+                        if (vel < 0)
+                        {
+                            b3Scalar newc = -limot->m_bounce* vel;
+                            if (newc > info->m_constraintError[srow]) 
+								info->m_constraintError[srow] = newc;
+                        }
+                    }
+                    else
+                    {
+                        if (vel > 0)
+                        {
+                            b3Scalar newc = -limot->m_bounce * vel;
+                            if (newc < info->m_constraintError[srow]) 
+								info->m_constraintError[srow] = newc;
+                        }
+                    }
+                }
+            }
+        }
+        return 1;
+    }
+    else return 0;
+	///override the default global value of a parameter (such as ERP or CFM), optionally provide the axis (0..5). 
+	///If no axis is provided, it uses the default axis for this constraint.
+	// Axes 0..2 address the linear limits, axes 3..5 the angular limits (index axis - 3).
+	// Besides storing the value, the matching override bit for the axis is set in m_flags.
+	// Any other axis, or an unknown `num`, trips b3AssertConstrParams(0).
+	// NOTE(review): the first two statement groups of each switch have no visible case label
+	// here (from the flags they set they look like the stop-ERP / stop-CFM cases) - confirm
+	// the labels against the upstream file.
+void b3Generic6DofConstraint::setParam(int num, b3Scalar value, int axis)
+	if((axis >= 0) && (axis < 3))
+	{
+		switch(num)
+		{
+				m_linearLimits.m_stopERP[axis] = value;
+				m_flags |= B3_6DOF_FLAGS_ERP_STOP << (axis * B3_6DOF_FLAGS_AXIS_SHIFT);
+				break;
+				m_linearLimits.m_stopCFM[axis] = value;
+				m_flags |= B3_6DOF_FLAGS_CFM_STOP << (axis * B3_6DOF_FLAGS_AXIS_SHIFT);
+				break;
+			case B3_CONSTRAINT_CFM : 
+				m_linearLimits.m_normalCFM[axis] = value;
+				m_flags |= B3_6DOF_FLAGS_CFM_NORM << (axis * B3_6DOF_FLAGS_AXIS_SHIFT);
+				break;
+			default : 
+				b3AssertConstrParams(0);
+		}
+	}
+	else if((axis >=3) && (axis < 6))
+	{
+		switch(num)
+		{
+				m_angularLimits[axis - 3].m_stopERP = value;
+				m_flags |= B3_6DOF_FLAGS_ERP_STOP << (axis * B3_6DOF_FLAGS_AXIS_SHIFT);
+				break;
+				m_angularLimits[axis - 3].m_stopCFM = value;
+				m_flags |= B3_6DOF_FLAGS_CFM_STOP << (axis * B3_6DOF_FLAGS_AXIS_SHIFT);
+				break;
+			case B3_CONSTRAINT_CFM : 
+				m_angularLimits[axis - 3].m_normalCFM = value;
+				m_flags |= B3_6DOF_FLAGS_CFM_NORM << (axis * B3_6DOF_FLAGS_AXIS_SHIFT);
+				break;
+			default : 
+				b3AssertConstrParams(0);
+		}
+	}
+	else
+	{
+		b3AssertConstrParams(0);
+	}
+	///return the local value of parameter
+	// Axes 0..2 read from the linear limits, axes 3..5 from the angular limits (index axis - 3).
+	// Each read asserts that the matching override bit is set in m_flags, i.e. that the value
+	// was set through setParam before. Returns 0 for an unknown axis or parameter
+	// (after tripping b3AssertConstrParams(0)).
+	// NOTE(review): as in setParam, the first two statement groups of each switch have no
+	// visible case label here - confirm against the upstream file.
+b3Scalar b3Generic6DofConstraint::getParam(int num, int axis) const 
+	b3Scalar retVal = 0;
+	if((axis >= 0) && (axis < 3))
+	{
+		switch(num)
+		{
+				b3AssertConstrParams(m_flags & (B3_6DOF_FLAGS_ERP_STOP << (axis * B3_6DOF_FLAGS_AXIS_SHIFT)));
+				retVal = m_linearLimits.m_stopERP[axis];
+				break;
+				b3AssertConstrParams(m_flags & (B3_6DOF_FLAGS_CFM_STOP << (axis * B3_6DOF_FLAGS_AXIS_SHIFT)));
+				retVal = m_linearLimits.m_stopCFM[axis];
+				break;
+			case B3_CONSTRAINT_CFM : 
+				b3AssertConstrParams(m_flags & (B3_6DOF_FLAGS_CFM_NORM << (axis * B3_6DOF_FLAGS_AXIS_SHIFT)));
+				retVal = m_linearLimits.m_normalCFM[axis];
+				break;
+			default : 
+				b3AssertConstrParams(0);
+		}
+	}
+	else if((axis >=3) && (axis < 6))
+	{
+		switch(num)
+		{
+				b3AssertConstrParams(m_flags & (B3_6DOF_FLAGS_ERP_STOP << (axis * B3_6DOF_FLAGS_AXIS_SHIFT)));
+				retVal = m_angularLimits[axis - 3].m_stopERP;
+				break;
+				b3AssertConstrParams(m_flags & (B3_6DOF_FLAGS_CFM_STOP << (axis * B3_6DOF_FLAGS_AXIS_SHIFT)));
+				retVal = m_angularLimits[axis - 3].m_stopCFM;
+				break;
+			case B3_CONSTRAINT_CFM : 
+				b3AssertConstrParams(m_flags & (B3_6DOF_FLAGS_CFM_NORM << (axis * B3_6DOF_FLAGS_AXIS_SHIFT)));
+				retVal = m_angularLimits[axis - 3].m_normalCFM;
+				break;
+			default : 
+				b3AssertConstrParams(0);
+		}
+	}
+	else
+	{
+		b3AssertConstrParams(0);
+	}
+	return retVal;
+// Rebuilds both local constraint frames from two direction vectors:
+// axis1 (normalized) becomes the frame's Z column, axis2 (normalized) its Y column and
+// Y x Z its X column. The resulting frame has identity translation; it is converted
+// into each body's local space and the cached state is recomputed.
+void b3Generic6DofConstraint::setAxis(const b3Vector3& axis1,const b3Vector3& axis2, const b3RigidBodyData* bodies)
+	const b3Vector3 zDir = axis1.normalized();
+	const b3Vector3 yDir = axis2.normalized();
+	const b3Vector3 xDir = yDir.cross(zDir); // right-handed frame
+	b3Transform worldFrame;
+	worldFrame.setIdentity();
+	worldFrame.getBasis().setValue(
+		xDir[0], yDir[0], zDir[0],
+		xDir[1], yDir[1], zDir[1],
+		xDir[2], yDir[2], zDir[2]);
+	// express the frame in each body's local coordinates
+	m_frameInA = getCenterOfMassTransform(bodies[m_rbA]).inverse() * worldFrame;
+	m_frameInB = getCenterOfMassTransform(bodies[m_rbB]).inverse() * worldFrame;
+	calculateTransforms(bodies);
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.h b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.h
new file mode 100644
index 00000000..084d3605
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.h
@@ -0,0 +1,550 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+/// 2009 March: b3Generic6DofConstraint refactored by Roman Ponomarev
+/// Added support for generic constraint solver through getInfo1/getInfo2 methods
+b3Generic6DofConstraint Refactored by Francisco León
+email: projectileman@yahoo.com
+#include "Bullet3Common/b3Vector3.h"
+#include "b3JacobianEntry.h"
+#include "b3TypedConstraint.h"
+struct b3RigidBodyData;
+//! Rotation Limit structure for generic joints
+class b3RotationalLimitMotor
+    //! limit_parameters
+    //!@{
+    b3Scalar m_loLimit;//!< lower joint limit; lo > hi marks the axis as free (see isLimited)
+    b3Scalar m_hiLimit;//!< upper joint limit
+    b3Scalar m_targetVelocity;//!< target motor velocity
+    b3Scalar m_maxMotorForce;//!< max force on motor
+    b3Scalar m_maxLimitForce;//!< max force on limit
+    b3Scalar m_damping;//!< Damping.
+    b3Scalar m_limitSoftness;//!< Relaxation factor
+    b3Scalar m_normalCFM;//!< Constraint force mixing factor
+    b3Scalar m_stopERP;//!< Error tolerance factor when joint is at limit
+    b3Scalar m_stopCFM;//!< Constraint force mixing factor when joint is at limit
+    b3Scalar m_bounce;//!< restitution factor
+    bool m_enableMotor;//!< true when the motor is enabled (see needApplyTorques)
+    //!@}
+    //! temp_variables
+    //!@{
+    b3Scalar m_currentLimitError;//!< signed amount by which the current limit is violated
+    b3Scalar m_currentPosition;     //!< current value of angle
+    int m_currentLimit;//!< 0=free, 1=at lo limit, 2=at hi limit
+    b3Scalar m_accumulatedImpulse;//!< starts at 0; NOTE(review): presumably accumulated by the solver - confirm
+    //!@}
+    //! Default state: axis free (lo > hi), motor disabled, no limit violation.
+    //! Every member is given a defined value, including m_currentPosition, which used
+    //! to be left uninitialized until the first limit test.
+    b3RotationalLimitMotor()
+    {
+    	m_accumulatedImpulse = 0.f;
+        m_targetVelocity = 0;
+        m_maxMotorForce = 0.1f;
+        m_maxLimitForce = 300.0f;
+        m_loLimit = 1.0f;
+        m_hiLimit = -1.0f;
+		m_normalCFM = 0.f;
+		m_stopERP = 0.2f;
+		m_stopCFM = 0.f;
+        m_bounce = 0.0f;
+        m_damping = 1.0f;
+        m_limitSoftness = 0.5f;
+        m_currentLimit = 0;
+        m_currentLimitError = 0;
+        m_currentPosition = 0;
+        m_enableMotor = false;
+    }
+    //! Copy constructor: duplicates every member of `limot`.
+    //! m_maxLimitForce, m_damping, m_currentPosition and m_accumulatedImpulse were previously
+    //! skipped, which left them with indeterminate values in the copy.
+    b3RotationalLimitMotor(const b3RotationalLimitMotor & limot)
+    {
+        m_loLimit = limot.m_loLimit;
+        m_hiLimit = limot.m_hiLimit;
+        m_targetVelocity = limot.m_targetVelocity;
+        m_maxMotorForce = limot.m_maxMotorForce;
+        m_maxLimitForce = limot.m_maxLimitForce;
+        m_damping = limot.m_damping;
+        m_limitSoftness = limot.m_limitSoftness;
+		m_normalCFM = limot.m_normalCFM;
+		m_stopERP = limot.m_stopERP;
+		m_stopCFM =	limot.m_stopCFM;
+        m_bounce = limot.m_bounce;
+        m_enableMotor = limot.m_enableMotor;
+        m_currentLimitError = limot.m_currentLimitError;
+        m_currentPosition = limot.m_currentPosition;
+        m_currentLimit = limot.m_currentLimit;
+        m_accumulatedImpulse = limot.m_accumulatedImpulse;
+    }
+	//! True unless the limits are inverted (lo > hi), which marks the axis as free.
+    bool isLimited()
+    {
+    	return !(m_loLimit > m_hiLimit);
+    }
+	//! True when a limit is currently violated or the motor is enabled.
+    bool needApplyTorques()
+    {
+    	return (m_currentLimit != 0) || m_enableMotor;
+    }
+	//! Tests an angle against the limits
+	/*!
+	Updates m_currentLimit and, on a violation, m_currentLimitError.
+	Returns 0 when free or within range, 1 when below m_loLimit, 2 when above m_hiLimit.
+	*/
+	int testLimitValue(b3Scalar test_value);
+	//! apply the correction impulses for two bodies
+	//! NOTE(review): no definition is visible in this part of the file - confirm one exists.
+    b3Scalar solveAngularLimits(b3Scalar timeStep,b3Vector3& axis, b3Scalar jacDiagABInv,b3RigidBodyData * body0, b3RigidBodyData * body1);
+class b3TranslationalLimitMotor
+	b3Vector3 m_lowerLimit;//!< the constraint lower limits
+    b3Vector3 m_upperLimit;//!< the constraint upper limits
+    b3Vector3 m_accumulatedImpulse;//!< starts at zero; NOTE(review): presumably accumulated by the solver - confirm
+    //! Linear_Limit_parameters
+    //!@{
+	b3Vector3	m_normalCFM;//!< Constraint force mixing factor
+    b3Vector3	m_stopERP;//!< Error tolerance factor when joint is at limit
+	b3Vector3	m_stopCFM;//!< Constraint force mixing factor when joint is at limit
+    b3Vector3	m_targetVelocity;//!< target motor velocity
+    b3Vector3	m_maxMotorForce;//!< max force on motor
+    b3Vector3	m_currentLimitError;//!< signed amount by which each limit is violated
+    b3Vector3	m_currentLinearDiff;//!< Current relative offset of constraint frames
+	b3Scalar	m_limitSoftness;//!< Softness for linear limit
+    b3Scalar	m_damping;//!< Damping for linear limit
+    b3Scalar	m_restitution;//!< Bounce parameter for linear limit
+	//!@}
+	bool		m_enableMotor[3];//!< per-axis motor enable flag (see needApplyForce)
+	int			m_currentLimit[3];//!< as written by testLimitValue: 0=free, 1=above upper limit, 2=below lower limit
+    //! Default state: lower == upper == 0 on every axis, motors disabled, no limit violation.
+    //! m_currentLimit, m_currentLimitError and m_currentLinearDiff are now given defined
+    //! values too; they used to stay uninitialized until the first testLimitValue call,
+    //! although needApplyForce reads m_currentLimit.
+    b3TranslationalLimitMotor()
+    {
+    	m_lowerLimit.setValue(0.f,0.f,0.f);
+    	m_upperLimit.setValue(0.f,0.f,0.f);
+    	m_accumulatedImpulse.setValue(0.f,0.f,0.f);
+		m_normalCFM.setValue(0.f, 0.f, 0.f);
+		m_stopERP.setValue(0.2f, 0.2f, 0.2f);
+		m_stopCFM.setValue(0.f, 0.f, 0.f);
+		m_currentLimitError.setValue(0.f, 0.f, 0.f);
+		m_currentLinearDiff.setValue(0.f, 0.f, 0.f);
+    	m_limitSoftness = 0.7f;
+    	m_damping = b3Scalar(1.0f);
+    	m_restitution = b3Scalar(0.5f);
+		for(int i=0; i < 3; i++) 
+		{
+			m_enableMotor[i] = false;
+			m_currentLimit[i] = 0;
+			m_targetVelocity[i] = b3Scalar(0.f);
+			m_maxMotorForce[i] = b3Scalar(0.f);
+		}
+    }
+    //! Copy constructor: duplicates the configuration and the current limit state of `other`.
+    //! m_currentLimitError, m_currentLinearDiff and m_currentLimit were previously skipped,
+    //! which left them with indeterminate values in the copy.
+    b3TranslationalLimitMotor(const b3TranslationalLimitMotor & other )
+    {
+    	m_lowerLimit = other.m_lowerLimit;
+    	m_upperLimit = other.m_upperLimit;
+    	m_accumulatedImpulse = other.m_accumulatedImpulse;
+    	m_limitSoftness = other.m_limitSoftness ;
+    	m_damping = other.m_damping;
+    	m_restitution = other.m_restitution;
+		m_normalCFM = other.m_normalCFM;
+		m_stopERP = other.m_stopERP;
+		m_stopCFM = other.m_stopCFM;
+		m_currentLimitError = other.m_currentLimitError;
+		m_currentLinearDiff = other.m_currentLinearDiff;
+		for(int i=0; i < 3; i++) 
+		{
+			m_enableMotor[i] = other.m_enableMotor[i];
+			m_currentLimit[i] = other.m_currentLimit[i];
+			m_targetVelocity[i] = other.m_targetVelocity[i];
+			m_maxMotorForce[i] = other.m_maxMotorForce[i];
+		}
+    }
+    //! Reports whether component limitIndex has an active limit
+	/*!
+    - upper < lower: the axis is free, returns false
+    - upper == lower: the axis is locked, returns true
+    - upper > lower: the axis is limited, returns true
+    - limitIndex selects the component of m_lowerLimit / m_upperLimit
+    */
+    inline bool	isLimited(int limitIndex)
+    {
+       const b3Scalar lower = m_lowerLimit[limitIndex];
+       const b3Scalar upper = m_upperLimit[limitIndex];
+       return upper >= lower;
+    }
+    //! True when the limit of component limitIndex is violated or its motor is enabled.
+    inline bool needApplyForce(int limitIndex)
+    {
+    	return (m_currentLimit[limitIndex] != 0) || m_enableMotor[limitIndex];
+    }
+	//! Tests \a test_value against the limits of axis \a limitIndex (defined out of line).
+	/*!
+	NOTE(review): presumably returns and records the m_currentLimit code
+	(0=free, 1=at lower limit, 2=at upper limit) - confirm against the definition.
+	*/
+	int testLimitValue(int limitIndex, b3Scalar test_value);
+    //! Solves one linear axis of the constraint (defined out of line).
+    /*!
+    Takes the time step, the inverse Jacobian diagonal, both bodies with their pivot
+    points, the axis index, the axis normal on body A and the anchor position.
+    NOTE(review): the meaning of the returned scalar is not visible here - presumably
+    the applied impulse; confirm against the definition.
+    */
+    b3Scalar solveLinearAxis(
+    	b3Scalar timeStep,
+        b3Scalar jacDiagABInv,
+        b3RigidBodyData& body1,const b3Vector3 &pointInA,
+        b3RigidBodyData& body2,const b3Vector3 &pointInB,
+        int limit_index,
+        const b3Vector3 & axis_normal_on_a,
+		const b3Vector3 & anchorPos);
+enum b36DofFlags
+#define B3_6DOF_FLAGS_AXIS_SHIFT 3 // bits per axis
+/// b3Generic6DofConstraint between two rigidbodies each with a pivotpoint that describes the axis location in local space
+b3Generic6DofConstraint can leave any of the 6 degree of freedom 'free' or 'locked'.
+currently this limit supports rotational motors<br>
+<li> For Linear limits, use b3Generic6DofConstraint.setLinearUpperLimit, b3Generic6DofConstraint.setLinearLowerLimit. You can set the parameters with the b3TranslationalLimitMotor structure accessible through the b3Generic6DofConstraint.getTranslationalLimitMotor method.
+At this moment translational motors are not supported. May be in the future. </li>
+<li> For Angular limits, use the b3RotationalLimitMotor structure for configuring the limit.
+This is accessible through b3Generic6DofConstraint.getLimitMotor method,
+This brings support for limit parameters and motors. </li>
+<li> Angular limits have these possible ranges:
+<table border=1 >
+	<td><b>AXIS</b></td>
+	<td><b>MIN ANGLE</b></td>
+	<td><b>MAX ANGLE</b></td>
+	<td>X</td>
+	<td>-PI</td>
+	<td>PI</td>
+	<td>Y</td>
+	<td>-PI/2</td>
+	<td>PI/2</td>
+	<td>Z</td>
+	<td>-PI</td>
+	<td>PI</td>
+B3_ATTRIBUTE_ALIGNED16(class) b3Generic6DofConstraint : public b3TypedConstraint
+	//! relative_frames
+    //!@{
+	b3Transform	m_frameInA;//!< the constraint space w.r.t body A
+    b3Transform	m_frameInB;//!< the constraint space w.r.t body B
+    //!@}
+    //! Jacobians
+    //!@{
+//    b3JacobianEntry	m_jacLinear[3];//!< 3 orthogonal linear constraints
+//    b3JacobianEntry	m_jacAng[3];//!< 3 orthogonal angular constraints
+    //!@}
+	//! Linear_Limit_parameters
+    //!@{
+    b3TranslationalLimitMotor m_linearLimits;
+    //!@}
+    //! hinge_parameters
+    //!@{
+    b3RotationalLimitMotor m_angularLimits[3];
+	//!@}
+    //! temporal variables
+    //!@{
+    b3Transform m_calculatedTransformA;
+    b3Transform m_calculatedTransformB;
+    b3Vector3 m_calculatedAxisAngleDiff;
+    b3Vector3 m_calculatedAxis[3];
+    b3Vector3 m_calculatedLinearDiff;
+    b3Scalar m_timeStep;
+	b3Scalar	m_factA;
+	b3Scalar	m_factB;
+	bool		m_hasStaticBody;
+	b3Vector3 m_AnchorPos; // point between the pivots of bodies A and B, used to solve the linear axes
+    bool	m_useLinearReferenceFrameA;
+	bool	m_useOffsetForConstraintFrame;
+	int		m_flags;
+    //!@}
+    //! Assignment is not supported.
+    /*!
+    Trips b3Assert(0) and returns this object; nothing is copied from \a other.
+    */
+    b3Generic6DofConstraint&	operator=(b3Generic6DofConstraint&	other)
+    {
+        (void) other; // deliberately unused
+        b3Assert(0);
+        return *this;
+    }
+	// Fills constraint rows for the angular limits, starting at row_offset (defined out of line).
+	// NOTE(review): the returned int is presumably the next free row index - confirm.
+	int setAngularLimits(b3ConstraintInfo2 *info, int row_offset,const b3Transform& transA,const b3Transform& transB,const b3Vector3& linVelA,const b3Vector3& linVelB,const b3Vector3& angVelA,const b3Vector3& angVelB);
+	// Fills constraint rows for the linear limits, starting at row (defined out of line).
+	// NOTE(review): the returned int is presumably the next free row index - confirm.
+	int setLinearLimits(b3ConstraintInfo2 *info, int row, const b3Transform& transA,const b3Transform& transB,const b3Vector3& linVelA,const b3Vector3& linVelB,const b3Vector3& angVelA,const b3Vector3& angVelB);
+	// tests linear limits
+	void calculateLinearInfo();
+	//! calcs the euler angles between the two bodies.
+    void calculateAngleInfo();
+    //! Constructor (defined out of line).
+    /*!
+    rbA / rbB are passed as ints - presumably indices into \a bodies (TODO confirm).
+    frameInA / frameInB are the constraint frames relative to body A and body B.
+    */
+    b3Generic6DofConstraint(int rbA, int rbB, const b3Transform& frameInA, const b3Transform& frameInB ,bool useLinearReferenceFrameA,const b3RigidBodyData* bodies);
+	//! Calculates the global transforms of the offsets
+	/*!
+	Calculates the global transform of the joint offset for bodies A and B, and also calculates the angle differences between the bodies.
+	\sa b3Generic6DofConstraint.getCalculatedTransformA , b3Generic6DofConstraint.getCalculatedTransformB, b3Generic6DofConstraint.calculateAngleInfo
+	*/
+    void calculateTransforms(const b3Transform& transA,const b3Transform& transB,const b3RigidBodyData* bodies);
+	//! Overload taking only the body array; NOTE(review): presumably looks up the transforms of both bodies itself - confirm.
+	void calculateTransforms(const b3RigidBodyData* bodies);
+	//! Global transform of the offset for body A
+    /*!
+    Read-only access to m_calculatedTransformA.
+    \sa b3Generic6DofConstraint.getFrameOffsetA, b3Generic6DofConstraint.getFrameOffsetB, b3Generic6DofConstraint.calculateAngleInfo.
+    */
+    const b3Transform & getCalculatedTransformA() const { return m_calculatedTransformA; }
+    //! Global transform of the offset for body B
+    /*!
+    Read-only access to m_calculatedTransformB.
+    \sa b3Generic6DofConstraint.getFrameOffsetA, b3Generic6DofConstraint.getFrameOffsetB, b3Generic6DofConstraint.calculateAngleInfo.
+    */
+    const b3Transform & getCalculatedTransformB() const { return m_calculatedTransformB; }
+    //! Read-only access to m_frameInA, the constraint space w.r.t. body A.
+    const b3Transform & getFrameOffsetA() const { return m_frameInA; }
+    //! Read-only access to m_frameInB, the constraint space w.r.t. body B.
+    const b3Transform & getFrameOffsetB() const { return m_frameInB; }
+    //! Mutable access to m_frameInA, the constraint space w.r.t. body A.
+    b3Transform & getFrameOffsetA() { return m_frameInA; }
+    //! Mutable access to m_frameInB, the constraint space w.r.t. body B.
+    b3Transform & getFrameOffsetB() { return m_frameInB; }
+	// Solver query entry points (all defined out of line).
+	// NOTE(review): by the usual typed-constraint convention getInfo1 reports row counts and
+	// getInfo2 fills the rows - confirm against the definitions.
+	virtual void getInfo1 (b3ConstraintInfo1* info,const b3RigidBodyData* bodies);
+	// Non-virtual counterpart of getInfo1 with the same parameters.
+	void getInfo1NonVirtual (b3ConstraintInfo1* info,const b3RigidBodyData* bodies);
+	virtual void getInfo2 (b3ConstraintInfo2* info,const b3RigidBodyData* bodies);
+	// Non-virtual counterpart of getInfo2 taking explicit transforms and linear/angular velocities of both bodies.
+	void getInfo2NonVirtual (b3ConstraintInfo2* info,const b3Transform& transA,const b3Transform& transB,const b3Vector3& linVelA,const b3Vector3& linVelB,const b3Vector3& angVelA,const b3Vector3& angVelB,const b3RigidBodyData* bodies);
+    // Defined out of line; effect not visible from this header.
+    void	updateRHS(b3Scalar	timeStep);
+	//! Get the rotation axis in global coordinates
+    b3Vector3 getAxis(int axis_index) const;
+    //! Get the relative Euler angle
+    /*!
+	\pre b3Generic6DofConstraint::calculateTransforms() must be called previously.
+	*/
+    b3Scalar getAngle(int axis_index) const;
+	//! Get the relative position of the constraint pivot
+    /*!
+	\pre b3Generic6DofConstraint::calculateTransforms() must be called previously.
+	*/
+	b3Scalar getRelativePivotPosition(int axis_index) const;
+	//! Replaces the constraint frames of body A and body B (defined out of line).
+	/*!
+	NOTE(review): presumably also recomputes the cached transforms from \a bodies - confirm.
+	*/
+	void setFrames(const b3Transform & frameA, const b3Transform & frameB, const b3RigidBodyData* bodies);
+	//! Test angular limit.
+	/*!
+	Calculates angular correction and returns true if limit needs to be corrected.
+	\pre b3Generic6DofConstraint::calculateTransforms() must be called previously.
+	*/
+    bool testAngularLimitMotor(int axis_index);
+    //! Stores \a linearLower as the lower limit of the linear axes (m_linearLimits.m_lowerLimit).
+    void	setLinearLowerLimit(const b3Vector3& linearLower) { m_linearLimits.m_lowerLimit = linearLower; }
+	//! Writes the lower limit of the linear axes (m_linearLimits.m_lowerLimit) into \a linearLower.
+	void	getLinearLowerLimit(b3Vector3& linearLower) { linearLower = m_linearLimits.m_lowerLimit; }
+	//! Stores \a linearUpper as the upper limit of the linear axes (m_linearLimits.m_upperLimit).
+	void	setLinearUpperLimit(const b3Vector3& linearUpper) { m_linearLimits.m_upperLimit = linearUpper; }
+	//! Writes the upper limit of the linear axes (m_linearLimits.m_upperLimit) into \a linearUpper.
+	void	getLinearUpperLimit(b3Vector3& linearUpper) { linearUpper = m_linearLimits.m_upperLimit; }
+    //! Sets the lower angular limit of all three axes.
+    /*! Each component of \a angularLower is passed through b3NormalizeAngle before it is stored in m_loLimit. */
+    void	setAngularLowerLimit(const b3Vector3& angularLower)
+    {
+		for(int axis = 0; axis < 3; ++axis)
+		{
+			b3RotationalLimitMotor& motor = m_angularLimits[axis];
+			motor.m_loLimit = b3NormalizeAngle(angularLower[axis]);
+		}
+    }
+	//! Copies the stored lower angular limit (m_loLimit) of each axis into \a angularLower.
+	void	getAngularLowerLimit(b3Vector3& angularLower)
+	{
+		for(int axis = 0; axis < 3; ++axis)
+		{
+			angularLower[axis] = m_angularLimits[axis].m_loLimit;
+		}
+	}
+    //! Sets the upper angular limit of all three axes.
+    /*! Each component of \a angularUpper is passed through b3NormalizeAngle before it is stored in m_hiLimit. */
+    void	setAngularUpperLimit(const b3Vector3& angularUpper)
+    {
+		for(int axis = 0; axis < 3; ++axis)
+		{
+			b3RotationalLimitMotor& motor = m_angularLimits[axis];
+			motor.m_hiLimit = b3NormalizeAngle(angularUpper[axis]);
+		}
+    }
+	//! Copies the stored upper angular limit (m_hiLimit) of each axis into \a angularUpper.
+	void	getAngularUpperLimit(b3Vector3& angularUpper)
+	{
+		for(int axis = 0; axis < 3; ++axis)
+		{
+			angularUpper[axis] = m_angularLimits[axis].m_hiLimit;
+		}
+	}
+	//! Retrieves the angular limit information
+	/*! Returns the address of m_angularLimits[index]; \a index is not range-checked here. */
+    b3RotationalLimitMotor * getRotationalLimitMotor(int index)
+    {
+    	b3RotationalLimitMotor& motor = m_angularLimits[index];
+    	return &motor;
+    }
+    //! Retrieves the linear limit information
+    /*! Returns the address of m_linearLimits. */
+    b3TranslationalLimitMotor * getTranslationalLimitMotor() { return &m_linearLimits; }
+    //! Sets the lower and upper limit of a single degree of freedom.
+    /*!
+    - axis: 0..2 select a linear axis, 3..5 select an angular axis
+      (first 3 are linear, next 3 are angular).
+    - lo / hi: stored as given for linear axes; passed through b3NormalizeAngle
+      first for angular axes.
+    An axis outside 0..5 would index past the limit arrays, so it is rejected with
+    b3Assert.
+    */
+    void setLimit(int axis, b3Scalar lo, b3Scalar hi)
+    {
+    	b3Assert(axis >= 0 && axis < 6);
+    	if(axis<3)
+    	{
+    		m_linearLimits.m_lowerLimit[axis] = lo;
+    		m_linearLimits.m_upperLimit[axis] = hi;
+    	}
+    	else
+    	{
+			lo = b3NormalizeAngle(lo);
+			hi = b3NormalizeAngle(hi);
+    		m_angularLimits[axis-3].m_loLimit = lo;
+    		m_angularLimits[axis-3].m_hiLimit = hi;
+    	}
+    }
+	//! Test limit
+	/*!
+    Forwards the question to the limit object that owns the axis: indices 0..2 go to
+    m_linearLimits.isLimited(limitIndex), indices 3 and above go to
+    m_angularLimits[limitIndex-3].isLimited().
+    Convention for a limit pair:
+    - free means upper < lower,
+    - locked means upper == lower
+    - limited means upper > lower
+    - limitIndex: first 3 are linear, next 3 are angular
+    */
+    bool	isLimited(int limitIndex)
+    {
+    	const bool isLinearAxis = (limitIndex < 3);
+    	return isLinearAxis
+    		? m_linearLimits.isLimited(limitIndex)
+    		: m_angularLimits[limitIndex-3].isLimited();
+    }
+	// Computes the anchor position (defined out of line); virtual so subclasses can override it.
+	virtual void calcAnchorPos(const b3RigidBodyData* bodies); // overridable
+	// Fills the constraint row(s) for one limit motor along axis ax1 (defined out of line).
+	// Note that rotAllowed is an int whose default is written as the bool literal false (i.e. 0).
+	// NOTE(review): the returned int is presumably the number of rows written - confirm.
+	int get_limit_motor_info2(	b3RotationalLimitMotor * limot,
+								const b3Transform& transA,const b3Transform& transB,const b3Vector3& linVelA,const b3Vector3& linVelB,const b3Vector3& angVelA,const b3Vector3& angVelB,
+								b3ConstraintInfo2 *info, int row, b3Vector3& ax1, int rotational, int rotAllowed = false);
+	// Read accessor for the UseFrameOffset flag (m_useOffsetForConstraintFrame).
+	bool getUseFrameOffset()
+	{
+		return m_useOffsetForConstraintFrame;
+	}
+	// Write accessor for the UseFrameOffset flag (m_useOffsetForConstraintFrame).
+	void setUseFrameOffset(bool frameOffsetOnOff)
+	{
+		m_useOffsetForConstraintFrame = frameOffsetOnOff;
+	}
+	///override the default global value of a parameter (such as ERP or CFM), optionally provide the axis (0..5). 
+	///If no axis is provided, it uses the default axis for this constraint.
+	virtual	void setParam(int num, b3Scalar value, int axis = -1);
+	///return the local value of parameter
+	virtual	b3Scalar getParam(int num, int axis = -1) const;
+	///Defined out of line. NOTE(review): presumably rebuilds the constraint frames from the two given axes - confirm.
+	void setAxis( const b3Vector3& axis1, const b3Vector3& axis2,const b3RigidBodyData* bodies);
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3JacobianEntry.h b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3JacobianEntry.h
new file mode 100644
index 00000000..a55168eb
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3JacobianEntry.h
@@ -0,0 +1,155 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Matrix3x3.h"
+// Another memory optimization would be to store m_1MinvJt in the remaining 3 w components
+// which makes the b3JacobianEntry memory layout 16 bytes
+// if you only are interested in angular part, just feed massInvA and massInvB zero
+/// Jacobian entry is an abstraction that allows to describe constraints
+/// it can be used in combination with a constraint solver
+/// Can be used to relate the effect of an impulse to the constraint error
+B3_ATTRIBUTE_ALIGNED16(class) b3JacobianEntry
+	// Default constructor: performs no initialisation of the members.
+	b3JacobianEntry()
+	{
+	}
+	// Constraint between two different rigid bodies.
+	// The angular Jacobian terms are the lever arms crossed with the joint axis (negated for B),
+	// rotated by world2A / world2B. m_Adiag sums both inverse masses and both angular
+	// contributions and is asserted to be positive.
+	b3JacobianEntry(
+		const b3Matrix3x3& world2A,
+		const b3Matrix3x3& world2B,
+		const b3Vector3& rel_pos1,const b3Vector3& rel_pos2,
+		const b3Vector3& jointAxis,
+		const b3Vector3& inertiaInvA,
+		const b3Scalar massInvA,
+		const b3Vector3& inertiaInvB,
+		const b3Scalar massInvB)
+		:m_linearJointAxis(jointAxis)
+	{
+		const b3Vector3 armCrossAxisA = rel_pos1.cross(m_linearJointAxis);
+		const b3Vector3 armCrossAxisB = rel_pos2.cross(-m_linearJointAxis);
+		m_aJ = world2A*armCrossAxisA;
+		m_bJ = world2B*armCrossAxisB;
+		m_0MinvJt = inertiaInvA * m_aJ;
+		m_1MinvJt = inertiaInvB * m_bJ;
+		// Same left-to-right summation order as before, so the floating-point result is unchanged.
+		m_Adiag = massInvA + m_0MinvJt.dot(m_aJ) + massInvB + m_1MinvJt.dot(m_bJ);
+		b3Assert(m_Adiag > b3Scalar(0.0));
+	}
+	// Angular constraint between two different rigid bodies, with the joint axis rotated by
+	// world2A (for A) and world2B (negated, for B). The linear joint axis is set to zero and
+	// m_Adiag contains only the two angular contributions.
+	b3JacobianEntry(const b3Vector3& jointAxis,
+		const b3Matrix3x3& world2A,
+		const b3Matrix3x3& world2B,
+		const b3Vector3& inertiaInvA,
+		const b3Vector3& inertiaInvB)
+		:m_linearJointAxis(b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)))
+	{
+		const b3Vector3 negatedAxis = -jointAxis;
+		m_aJ = world2A*jointAxis;
+		m_bJ = world2B*negatedAxis;
+		m_0MinvJt = inertiaInvA * m_aJ;
+		m_1MinvJt = inertiaInvB * m_bJ;
+		m_Adiag = m_0MinvJt.dot(m_aJ) + m_1MinvJt.dot(m_bJ);
+		b3Assert(m_Adiag > b3Scalar(0.0));
+	}
+	// Angular constraint between two different rigid bodies where the axis is supplied
+	// separately for each body: m_aJ takes axisInA, m_bJ takes the negated axisInB.
+	// The linear joint axis is set to zero.
+	b3JacobianEntry(const b3Vector3& axisInA,
+		const b3Vector3& axisInB,
+		const b3Vector3& inertiaInvA,
+		const b3Vector3& inertiaInvB)
+		: m_linearJointAxis(b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)))
+	{
+		m_aJ = axisInA;
+		m_bJ = -axisInB;
+		m_0MinvJt = inertiaInvA * m_aJ;
+		m_1MinvJt = inertiaInvB * m_bJ;
+		m_Adiag = m_0MinvJt.dot(m_aJ) + m_1MinvJt.dot(m_bJ);
+		b3Assert(m_Adiag > b3Scalar(0.0));
+	}
+	// Constraint on a single rigid body.
+	// Both angular terms are rotated by world2A; the B-side inverse-inertia term is zero and
+	// m_Adiag contains only body A's inverse mass and angular contribution.
+	b3JacobianEntry(
+		const b3Matrix3x3& world2A,
+		const b3Vector3& rel_pos1,const b3Vector3& rel_pos2,
+		const b3Vector3& jointAxis,
+		const b3Vector3& inertiaInvA,
+		const b3Scalar massInvA)
+		:m_linearJointAxis(jointAxis)
+	{
+		const b3Vector3 armCrossAxis1 = rel_pos1.cross(jointAxis);
+		const b3Vector3 armCrossAxis2 = rel_pos2.cross(-jointAxis);
+		m_aJ = world2A*armCrossAxis1;
+		m_bJ = world2A*armCrossAxis2;
+		m_0MinvJt = inertiaInvA * m_aJ;
+		m_1MinvJt = b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+		m_Adiag = massInvA + m_0MinvJt.dot(m_aJ);
+		b3Assert(m_Adiag > b3Scalar(0.0));
+	}
+	// Returns m_Adiag, the diagonal term computed by the constructors.
+	b3Scalar	getDiagonal() const
+	{
+		return m_Adiag;
+	}
+	// Off-diagonal term for two constraints acting on the same rigid body (for example vehicle friction):
+	// massInvA * (linear axes dotted) + (this entry's m_0MinvJt dotted with jacB.m_aJ).
+	b3Scalar	getNonDiagonal(const b3JacobianEntry& jacB, const b3Scalar massInvA) const
+	{
+		const b3Scalar linearPart = massInvA * m_linearJointAxis.dot(jacB.m_linearJointAxis);
+		const b3Scalar angularPart = m_0MinvJt.dot(jacB.m_aJ);
+		return linearPart + angularPart;
+	}
+	// Off-diagonal term for two constraints that share the same two rigid bodies
+	// (for example two contact points between two rigid bodies).
+	// All products are component-wise; the three components of the total are added at the end.
+	b3Scalar	getNonDiagonal(const b3JacobianEntry& jacB,const b3Scalar massInvA,const b3Scalar massInvB) const
+	{
+		const b3Vector3 axisProduct = m_linearJointAxis * jacB.m_linearJointAxis;
+		const b3Vector3 angularA = m_0MinvJt * jacB.m_aJ;
+		const b3Vector3 angularB = m_1MinvJt * jacB.m_bJ;
+		const b3Vector3 linearA = massInvA * axisProduct;
+		const b3Vector3 linearB = massInvB * axisProduct;
+		// Keep the original summation order: angular A, angular B, linear A, linear B.
+		const b3Vector3 total = angularA+angularB+linearA+linearB;
+		return total[0]+total[1]+total[2];
+	}
+	// Relative velocity of the two bodies along this Jacobian row.
+	// (linvelA - linvelB) is multiplied component-wise by the linear joint axis, the angular
+	// velocities are multiplied component-wise by m_aJ / m_bJ, everything is summed over the
+	// three components, and B3_EPSILON is added to the result.
+	// The function only reads members, so it is const like getDiagonal() and getNonDiagonal().
+	b3Scalar getRelativeVelocity(const b3Vector3& linvelA,const b3Vector3& angvelA,const b3Vector3& linvelB,const b3Vector3& angvelB) const
+	{
+		b3Vector3 linrel = linvelA - linvelB;
+		b3Vector3 angvela  = angvelA * m_aJ;
+		b3Vector3 angvelb  = angvelB * m_bJ;
+		linrel *= m_linearJointAxis;
+		angvela += angvelb;
+		angvela += linrel;
+		b3Scalar rel_vel2 = angvela[0]+angvela[1]+angvela[2];
+		return rel_vel2 + B3_EPSILON;
+	}
+	// Joint axis for the linear part; zero for the purely angular constructors.
+	b3Vector3	m_linearJointAxis;
+	// Angular Jacobian term for body A.
+	b3Vector3	m_aJ;
+	// Angular Jacobian term for body B.
+	b3Vector3	m_bJ;
+	// inertiaInvA * m_aJ (component-wise).
+	b3Vector3	m_0MinvJt;
+	// inertiaInvB * m_bJ (component-wise); zero for the single-body constructor.
+	b3Vector3	m_1MinvJt;
+	//Optimization: can be stored in the w/last component of one of the vectors
+	b3Scalar	m_Adiag;
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.cpp b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.cpp
new file mode 100644
index 00000000..b5fddef6
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.cpp
@@ -0,0 +1,1814 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2012 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//enable B3_SOLVER_DEBUG if you experience solver crashes
+//#define B3_SOLVER_DEBUG
+//It is not necessary (redundant) to refresh contact manifolds, this refresh has been moved to the collision algorithms.
+#include "b3PgsJacobiSolver.h"
+#include "Bullet3Common/b3MinMax.h"
+#include "b3TypedConstraint.h"
+#include <new>
+#include "Bullet3Common/b3StackAlloc.h"
+//#include "b3SolverBody.h"
+//#include "b3SolverConstraint.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include <string.h> //for memset
+//#include "../../dynamics/basic_demo/Stubs/AdlContact4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+// Builds a b3Transform whose origin is rb->m_pos and whose rotation is rb->m_quat, and returns it by value.
+static b3Transform	getWorldTransform(b3RigidBodyData* rb)
+	b3Transform newTrans;
+	newTrans.setOrigin(rb->m_pos);
+	newTrans.setRotation(rb->m_quat);
+	return newTrans;
+// Returns a reference to inertia->m_invInertiaWorld.
+static const b3Matrix3x3&	getInvInertiaTensorWorld(b3InertiaData* inertia)
+	return inertia->m_invInertiaWorld;
+// Returns a reference to the body's linear velocity (rb->m_linVel).
+static const b3Vector3&	getLinearVelocity(b3RigidBodyData* rb)
+	return rb->m_linVel;
+// Returns a reference to the body's angular velocity (rb->m_angVel).
+static const b3Vector3&	getAngularVelocity(b3RigidBodyData* rb)
+	return rb->m_angVel;
+// Velocity of the point at offset rel_pos on the body: linear velocity + angular velocity x rel_pos.
+static b3Vector3 getVelocityInLocalPoint(b3RigidBodyData* rb, const b3Vector3& rel_pos)
+	//we also calculate lin/ang velocity for kinematic objects
+	return getLinearVelocity(rb) + getAngularVelocity(rb).cross(rel_pos);
+// Data of a single contact point as used by this solver file; filled in by getContactPoint().
+struct	b3ContactPoint
+	// Contact positions on both bodies and the contact normal, as set by getContactPoint().
+	b3Vector3	m_positionWorldOnA;
+	b3Vector3	m_positionWorldOnB;
+	b3Vector3	m_normalWorldOnB;
+	b3Scalar	m_appliedImpulse;
+	// Taken from the contact's penetration value by getContactPoint().
+	b3Scalar	m_distance;
+	b3Scalar	m_combinedRestitution;
+	///information related to friction
+	b3Scalar	m_combinedFriction;
+	b3Vector3	m_lateralFrictionDir1;
+	b3Vector3	m_lateralFrictionDir2;
+	b3Scalar	m_appliedImpulseLateral1;
+	b3Scalar	m_appliedImpulseLateral2;	
+	b3Scalar	m_combinedRollingFriction;
+	b3Scalar	m_contactMotion1;
+	b3Scalar	m_contactMotion2;
+	b3Scalar	m_contactCFM1;
+	b3Scalar	m_contactCFM2;
+	// Set to true by getContactPoint() once both lateral friction directions are filled in.
+	bool		m_lateralFrictionInitialized;
+	// Returns a copy of m_positionWorldOnA.
+	b3Vector3	getPositionWorldOnA()
+	{
+		return m_positionWorldOnA;
+	}
+	// Returns a copy of m_positionWorldOnB.
+	b3Vector3	getPositionWorldOnB()
+	{
+		return m_positionWorldOnB;
+	}
+	// Returns m_distance.
+	b3Scalar	getDistance()
+	{
+		return m_distance;
+	}
+// Fills pointOut from point number contactIndex of the given b3Contact4.
+// - All applied impulses, the rolling friction, CFM and motion terms are reset to zero.
+// - Friction and restitution come from the contact's coefficients, the distance from its penetration value.
+// - The normal is the (re-normalised) m_worldNormalOnB; the two lateral friction directions are
+//   obtained from b3PlaneSpace1 on that normal.
+// - The position on B is m_worldPosB[contactIndex]; the position on A is that point plus normal * distance.
+void	getContactPoint(b3Contact4* contact, int contactIndex, b3ContactPoint& pointOut)
+	pointOut.m_appliedImpulse = 0.f;
+	pointOut.m_appliedImpulseLateral1 = 0.f;
+	pointOut.m_appliedImpulseLateral2 = 0.f;
+	pointOut.m_combinedFriction = contact->getFrictionCoeff();
+	pointOut.m_combinedRestitution = contact->getRestituitionCoeff();
+	pointOut.m_combinedRollingFriction = 0.f;
+	pointOut.m_contactCFM1 = 0.f;
+	pointOut.m_contactCFM2 = 0.f;
+	pointOut.m_contactMotion1 = 0.f;
+	pointOut.m_contactMotion2 = 0.f;
+	pointOut.m_distance = contact->getPenetration(contactIndex);//??0.01f
+	b3Vector3 normalOnB = contact->m_worldNormalOnB;
+	normalOnB.normalize();//is this needed?
+	b3Vector3 l1,l2;
+	b3PlaneSpace1(normalOnB,l1,l2);
+	pointOut.m_normalWorldOnB = normalOnB;
+	//printf("normalOnB = %f,%f,%f\n",normalOnB.getX(),normalOnB.getY(),normalOnB.getZ());
+	pointOut.m_lateralFrictionDir1 = l1;
+	pointOut.m_lateralFrictionDir2 = l2;
+	pointOut.m_lateralFrictionInitialized = true;
+	b3Vector3 worldPosB = contact->m_worldPosB[contactIndex];
+	pointOut.m_positionWorldOnB = worldPosB;
+	pointOut.m_positionWorldOnA = worldPosB+normalOnB*pointOut.m_distance;
+// Returns the number of points stored in the contact (contact->getNPoints()).
+int	getNumContacts(b3Contact4* contact)
+	return contact->getNPoints();
+b3PgsJacobiSolver::b3PgsJacobiSolver(bool usePgs)
+// Convenience entry point: builds a local b3ContactSolverInfo (split impulse off, fixed time step of
+// 1/60, 4 iterations, two friction directions enabled) and runs solveGroup() with it.
+// The time step and iteration count are hard-coded here and cannot be changed by the caller.
+// NOTE(review): the numContacts check below sits after the solveGroup() call and is the last
+// statement visible in this view, so it has no effect as shown - confirm against the full source.
+void	b3PgsJacobiSolver::solveContacts(int numBodies, b3RigidBodyData* bodies, b3InertiaData* inertias, int numContacts, b3Contact4* contacts, int numConstraints, b3TypedConstraint** constraints)
+	b3ContactSolverInfo infoGlobal;
+	infoGlobal.m_splitImpulse = false;
+	infoGlobal.m_timeStep = 1.f/60.f;
+	infoGlobal.m_numIterations = 4;//4;
+	infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS;
+	solveGroup(bodies,inertias,numBodies,contacts,numContacts,constraints,numConstraints,infoGlobal);
+	if (!numContacts)
+		return;
+/// b3PgsJacobiSolver Sequentially applies impulses
+/// Runs the three solver phases in order: solveGroupCacheFriendlySetup, solveGroupCacheFriendlyIterations
+/// and solveGroupCacheFriendlyFinish. The return value is always 0.
+b3Scalar b3PgsJacobiSolver::solveGroup(b3RigidBodyData* bodies,
+										b3InertiaData* inertias, 
+										int numBodies,
+										b3Contact4* manifoldPtr, 
+										int numManifolds,
+										b3TypedConstraint** constraints,
+										int numConstraints,
+										const b3ContactSolverInfo& infoGlobal)
+	B3_PROFILE("solveGroup");
+	//you need to provide at least some bodies
+	solveGroupCacheFriendlySetup( bodies, inertias,numBodies, manifoldPtr,  numManifolds,constraints, numConstraints,infoGlobal);
+	solveGroupCacheFriendlyIterations(constraints, numConstraints,infoGlobal);
+	solveGroupCacheFriendlyFinish(bodies, inertias,numBodies, infoGlobal);
+	return 0.f;
+#ifdef USE_SIMD
+#include <emmintrin.h>
+#define b3VecSplat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))
+// 3-component dot product of two SSE vectors: multiplies lane-wise, then adds lanes 0, 1 and 2.
+// Each of the three lanes is splatted before the adds, so every lane of the result holds the dot product.
+static inline __m128 b3SimdDot3( __m128 vec0, __m128 vec1 )
+	__m128 result = _mm_mul_ps( vec0, vec1);
+	return _mm_add_ps( b3VecSplat( result, 0 ), _mm_add_ps( b3VecSplat( result, 1 ), b3VecSplat( result, 2 ) ) );
+// Project Gauss Seidel or the equivalent Sequential Impulse
+// SSE version of resolveSingleConstraintRowGeneric: every scalar is broadcast to all lanes with
+// _mm_set1_ps, the impulse delta is computed from rhs, cfm and both bodies' delta velocities, and
+// the accumulated impulse is clamped to [m_lowerLimit, m_upperLimit] with compare masks and
+// and/andnot/or selects instead of branches. The resulting delta is then applied to both bodies.
+// NOTE(review): the trailing call to resolveSingleConstraintRowGeneric is presumably the non-SIMD
+// fallback branch; the #else/#endif lines are not present in this view - confirm.
+void b3PgsJacobiSolver::resolveSingleConstraintRowGenericSIMD(b3SolverBody& body1,b3SolverBody& body2,const b3SolverConstraint& c)
+#ifdef USE_SIMD
+	__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
+	__m128	lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
+	__m128	upperLimit1 = _mm_set1_ps(c.m_upperLimit);
+	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm)));
+	__m128 deltaVel1Dotn	=	_mm_add_ps(b3SimdDot3(c.m_contactNormal.mVec128,body1.internalGetDeltaLinearVelocity().mVec128), b3SimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetDeltaAngularVelocity().mVec128));
+	__m128 deltaVel2Dotn	=	_mm_sub_ps(b3SimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetDeltaAngularVelocity().mVec128),b3SimdDot3((c.m_contactNormal).mVec128,body2.internalGetDeltaLinearVelocity().mVec128));
+	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
+	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
+	b3SimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
+	b3SimdScalar resultLowerLess,resultUpperLess;
+	resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1);
+	resultUpperLess = _mm_cmplt_ps(sum,upperLimit1);
+	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
+	// Lower clamp: where sum < lower, use (lower - applied) as delta and lower as the new applied impulse.
+	deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
+	c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
+	__m128 upperMinApplied = _mm_sub_ps(upperLimit1,cpAppliedImp);
+	// Upper clamp: where sum is not < upper, use (upper - applied) as delta and upper as the new applied impulse.
+	deltaImpulse = _mm_or_ps( _mm_and_ps(resultUpperLess, deltaImpulse), _mm_andnot_ps(resultUpperLess, upperMinApplied) );
+	c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultUpperLess, c.m_appliedImpulse), _mm_andnot_ps(resultUpperLess, upperLimit1) );
+	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,body1.internalGetInvMass().mVec128);
+	__m128	linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128,body2.internalGetInvMass().mVec128);
+	__m128 impulseMagnitude = deltaImpulse;
+	body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
+	body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
+	body2.internalGetDeltaLinearVelocity().mVec128 = _mm_sub_ps(body2.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
+	body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
+	resolveSingleConstraintRowGeneric(body1,body2,c);
+// Project Gauss Seidel or the equivalent Sequential Impulse
+// Scalar solve of one constraint row with both a lower and an upper impulse limit:
+//   deltaImpulse = rhs - appliedImpulse*cfm - (deltaVel1Dotn + deltaVel2Dotn)*jacDiagABInv
+// The accumulated impulse (appliedImpulse + deltaImpulse) is clamped to [m_lowerLimit, m_upperLimit],
+// deltaImpulse is shortened accordingly, and the delta is applied to both bodies.
+// c.m_appliedImpulse is written although c is passed as a const reference.
+ void b3PgsJacobiSolver::resolveSingleConstraintRowGeneric(b3SolverBody& body1,b3SolverBody& body2,const b3SolverConstraint& c)
+	b3Scalar deltaImpulse = c.m_rhs-b3Scalar(c.m_appliedImpulse)*c.m_cfm;
+	const b3Scalar deltaVel1Dotn	=	c.m_contactNormal.dot(body1.internalGetDeltaLinearVelocity()) 	+ c.m_relpos1CrossNormal.dot(body1.internalGetDeltaAngularVelocity());
+	const b3Scalar deltaVel2Dotn	=	-c.m_contactNormal.dot(body2.internalGetDeltaLinearVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetDeltaAngularVelocity());
+//	const b3Scalar delta_rel_vel	=	deltaVel1Dotn-deltaVel2Dotn;
+	deltaImpulse	-=	deltaVel1Dotn*c.m_jacDiagABInv;
+	deltaImpulse	-=	deltaVel2Dotn*c.m_jacDiagABInv;
+	const b3Scalar sum = b3Scalar(c.m_appliedImpulse) + deltaImpulse;
+	if (sum < c.m_lowerLimit)
+	{
+		deltaImpulse = c.m_lowerLimit-c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_lowerLimit;
+	}
+	else if (sum > c.m_upperLimit) 
+	{
+		deltaImpulse = c.m_upperLimit-c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_upperLimit;
+	}
+	else
+	{
+		c.m_appliedImpulse = sum;
+	}
+	body1.internalApplyImpulse(c.m_contactNormal*body1.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
+	body2.internalApplyImpulse(-c.m_contactNormal*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
+// SSE version of resolveSingleConstraintRowLowerLimit: same impulse computation as the generic SIMD
+// row solver, but the accumulated impulse is clamped against the lower limit only.
+// upperLimit1 and resultUpperLess are computed below but not used afterwards in this function.
+// NOTE(review): the trailing call to resolveSingleConstraintRowLowerLimit is presumably the non-SIMD
+// fallback branch; the #else/#endif lines are not present in this view - confirm.
+ void b3PgsJacobiSolver::resolveSingleConstraintRowLowerLimitSIMD(b3SolverBody& body1,b3SolverBody& body2,const b3SolverConstraint& c)
+#ifdef USE_SIMD
+	__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
+	__m128	lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
+	__m128	upperLimit1 = _mm_set1_ps(c.m_upperLimit);
+	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm)));
+	__m128 deltaVel1Dotn	=	_mm_add_ps(b3SimdDot3(c.m_contactNormal.mVec128,body1.internalGetDeltaLinearVelocity().mVec128), b3SimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetDeltaAngularVelocity().mVec128));
+	__m128 deltaVel2Dotn	=	_mm_sub_ps(b3SimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetDeltaAngularVelocity().mVec128),b3SimdDot3((c.m_contactNormal).mVec128,body2.internalGetDeltaLinearVelocity().mVec128));
+	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
+	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
+	b3SimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
+	b3SimdScalar resultLowerLess,resultUpperLess;
+	resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1);
+	resultUpperLess = _mm_cmplt_ps(sum,upperLimit1);
+	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
+	// Lower clamp: where sum < lower, use (lower - applied) as delta and lower as the new applied impulse.
+	deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
+	c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
+	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,body1.internalGetInvMass().mVec128);
+	__m128	linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128,body2.internalGetInvMass().mVec128);
+	__m128 impulseMagnitude = deltaImpulse;
+	body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
+	body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
+	body2.internalGetDeltaLinearVelocity().mVec128 = _mm_sub_ps(body2.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
+	body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
+	resolveSingleConstraintRowLowerLimit(body1,body2,c);
+// Project Gauss Seidel or the equivalent Sequential Impulse
+// Scalar solve of one constraint row that only has a lower impulse limit:
+//   deltaImpulse = rhs - appliedImpulse*cfm - (deltaVel1Dotn + deltaVel2Dotn)*jacDiagABInv
+// If the accumulated impulse would drop below m_lowerLimit it is clamped to that limit and
+// deltaImpulse is shortened accordingly; the delta is then applied to both bodies.
+ void b3PgsJacobiSolver::resolveSingleConstraintRowLowerLimit(b3SolverBody& body1,b3SolverBody& body2,const b3SolverConstraint& c)
+	b3Scalar deltaImpulse = c.m_rhs-b3Scalar(c.m_appliedImpulse)*c.m_cfm;
+	const b3Scalar deltaVel1Dotn	=	c.m_contactNormal.dot(body1.internalGetDeltaLinearVelocity()) 	+ c.m_relpos1CrossNormal.dot(body1.internalGetDeltaAngularVelocity());
+	const b3Scalar deltaVel2Dotn	=	-c.m_contactNormal.dot(body2.internalGetDeltaLinearVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetDeltaAngularVelocity());
+	deltaImpulse	-=	deltaVel1Dotn*c.m_jacDiagABInv;
+	deltaImpulse	-=	deltaVel2Dotn*c.m_jacDiagABInv;
+	const b3Scalar sum = b3Scalar(c.m_appliedImpulse) + deltaImpulse;
+	if (sum < c.m_lowerLimit)
+	{
+		deltaImpulse = c.m_lowerLimit-c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_lowerLimit;
+	}
+	else
+	{
+		c.m_appliedImpulse = sum;
+	}
+	body1.internalApplyImpulse(c.m_contactNormal*body1.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
+	body2.internalApplyImpulse(-c.m_contactNormal*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
+// Scalar split-impulse penetration recovery for one constraint row.
+// Does nothing when c.m_rhsPenetration is zero. Otherwise it counts the recovery in
+// m_numSplitImpulseRecoveries, computes the push impulse delta from the bodies' push/turn
+// velocities, clamps the accumulated push impulse against m_lowerLimit and applies the delta
+// to both bodies as a push impulse.
+void	b3PgsJacobiSolver::resolveSplitPenetrationImpulseCacheFriendly(
+        b3SolverBody& body1,
+        b3SolverBody& body2,
+        const b3SolverConstraint& c)
+		if (c.m_rhsPenetration)
+        {
+			m_numSplitImpulseRecoveries++;
+			b3Scalar deltaImpulse = c.m_rhsPenetration-b3Scalar(c.m_appliedPushImpulse)*c.m_cfm;
+			const b3Scalar deltaVel1Dotn	=	c.m_contactNormal.dot(body1.internalGetPushVelocity()) 	+ c.m_relpos1CrossNormal.dot(body1.internalGetTurnVelocity());
+			const b3Scalar deltaVel2Dotn	=	-c.m_contactNormal.dot(body2.internalGetPushVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetTurnVelocity());
+			deltaImpulse	-=	deltaVel1Dotn*c.m_jacDiagABInv;
+			deltaImpulse	-=	deltaVel2Dotn*c.m_jacDiagABInv;
+			const b3Scalar sum = b3Scalar(c.m_appliedPushImpulse) + deltaImpulse;
+			if (sum < c.m_lowerLimit)
+			{
+				deltaImpulse = c.m_lowerLimit-c.m_appliedPushImpulse;
+				c.m_appliedPushImpulse = c.m_lowerLimit;
+			}
+			else
+			{
+				c.m_appliedPushImpulse = sum;
+			}
+			body1.internalApplyPushImpulse(c.m_contactNormal*body1.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
+			body2.internalApplyPushImpulse(-c.m_contactNormal*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
+        }
+// SSE version of resolveSplitPenetrationImpulseCacheFriendly: returns early when c.m_rhsPenetration
+// is zero, otherwise counts the recovery, computes the push impulse delta from the push/turn
+// velocities and clamps the accumulated push impulse against the lower limit only.
+// upperLimit1 and resultUpperLess are computed below but not used afterwards in this function.
+// NOTE(review): the trailing call to resolveSplitPenetrationImpulseCacheFriendly is presumably the
+// non-SIMD fallback branch; the #else/#endif lines are not present in this view - confirm.
+ void b3PgsJacobiSolver::resolveSplitPenetrationSIMD(b3SolverBody& body1,b3SolverBody& body2,const b3SolverConstraint& c)
+#ifdef USE_SIMD
+	if (!c.m_rhsPenetration)
+		return;
+	m_numSplitImpulseRecoveries++;
+	__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedPushImpulse);
+	__m128	lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
+	__m128	upperLimit1 = _mm_set1_ps(c.m_upperLimit);
+	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhsPenetration), _mm_mul_ps(_mm_set1_ps(c.m_appliedPushImpulse),_mm_set1_ps(c.m_cfm)));
+	__m128 deltaVel1Dotn	=	_mm_add_ps(b3SimdDot3(c.m_contactNormal.mVec128,body1.internalGetPushVelocity().mVec128), b3SimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetTurnVelocity().mVec128));
+	__m128 deltaVel2Dotn	=	_mm_sub_ps(b3SimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetTurnVelocity().mVec128),b3SimdDot3((c.m_contactNormal).mVec128,body2.internalGetPushVelocity().mVec128));
+	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
+	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
+	b3SimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
+	b3SimdScalar resultLowerLess,resultUpperLess;
+	resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1);
+	resultUpperLess = _mm_cmplt_ps(sum,upperLimit1);
+	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
+	// Lower clamp: where sum < lower, use (lower - applied) as delta and lower as the new push impulse.
+	deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
+	c.m_appliedPushImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
+	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,body1.internalGetInvMass().mVec128);
+	__m128	linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128,body2.internalGetInvMass().mVec128);
+	__m128 impulseMagnitude = deltaImpulse;
+	body1.internalGetPushVelocity().mVec128 = _mm_add_ps(body1.internalGetPushVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
+	body1.internalGetTurnVelocity().mVec128 = _mm_add_ps(body1.internalGetTurnVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
+	body2.internalGetPushVelocity().mVec128 = _mm_sub_ps(body2.internalGetPushVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
+	body2.internalGetTurnVelocity().mVec128 = _mm_add_ps(body2.internalGetTurnVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
+	resolveSplitPenetrationImpulseCacheFriendly(body1,body2,c);
+unsigned long b3PgsJacobiSolver::b3Rand2()
+	// Step the linear congruential generator stored in m_btSeed2 (state masked
+	// to 32 bits) and hand back the freshly stored state.
+	return m_btSeed2 = (1664525L*m_btSeed2 + 1013904223L) & 0xffffffff;
+//See ODE: adam's all-int straightforward(?) dRandInt (0..n-1)
+//Returns a pseudo-random integer in the range 0..n-1, drawn from b3Rand2().
+//Precondition: n > 0.
+int b3PgsJacobiSolver::b3RandInt2 (int n)
+	// n == 0 would make the final modulus divide by zero, and a negative n
+	// would wrap to a huge unsigned bound; reject both up front.
+	b3Assert(n > 0);
+	// seems good; xor-fold and modulus
+	const unsigned long un = static_cast<unsigned long>(n);
+	unsigned long r = b3Rand2();
+	// For small ranges, fold the high bits of r down into the low bits before
+	// the modulus, so the result does not depend on the low bits alone.
+	// note: probably more aggressive than it needs to be -- might be
+	//       able to get away without one or two of the innermost branches.
+	if (un <= 0x00010000UL) {
+		r ^= (r >> 16);
+		if (un <= 0x00000100UL) {
+			r ^= (r >> 8);
+			if (un <= 0x00000010UL) {
+				r ^= (r >> 4);
+				if (un <= 0x00000004UL) {
+					r ^= (r >> 2);
+					if (un <= 0x00000002UL) {
+						r ^= (r >> 1);
+					}
+				}
+			}
+		}
+	}
+	return (int) (r % un);
+void	b3PgsJacobiSolver::initSolverBody(int bodyIndex, b3SolverBody* solverBody, b3RigidBodyData* rb)
+	// Prepare solverBody for a new solve. All accumulated delta / push / turn
+	// velocities are cleared first; the remaining state is then either copied
+	// from rb or, when rb is null, set to fixed defaults.
+	solverBody->m_deltaLinearVelocity.setValue(0.f,0.f,0.f);
+	solverBody->m_deltaAngularVelocity.setValue(0.f,0.f,0.f);
+	solverBody->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+	solverBody->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+	if (!rb)
+	{
+		// No rigid body supplied: identity transform, zero inverse mass, zero velocities.
+		solverBody->m_worldTransform.setIdentity();
+		solverBody->internalSetInvMass(b3MakeVector3(0,0,0));
+		solverBody->m_originalBodyIndex = bodyIndex;
+		solverBody->m_angularFactor.setValue(1,1,1);
+		solverBody->m_linearFactor.setValue(1,1,1);
+		solverBody->m_linearVelocity.setValue(0,0,0);
+		solverBody->m_angularVelocity.setValue(0,0,0);
+	}
+	else
+	{
+		// Mirror the rigid body: transform, uniform inverse mass and current velocities.
+		solverBody->m_worldTransform = getWorldTransform(rb);
+		solverBody->internalSetInvMass(b3MakeVector3(rb->m_invMass,rb->m_invMass,rb->m_invMass));
+		solverBody->m_originalBodyIndex = bodyIndex;
+		solverBody->m_angularFactor = b3MakeVector3(1,1,1);
+		solverBody->m_linearFactor = b3MakeVector3(1,1,1);
+		solverBody->m_linearVelocity = getLinearVelocity(rb);
+		solverBody->m_angularVelocity = getAngularVelocity(rb);
+	}
+b3Scalar b3PgsJacobiSolver::restitutionCurve(b3Scalar rel_vel, b3Scalar restitution)
+	// Restitution term: the restitution coefficient times the negated relative velocity.
+	return restitution * -rel_vel;
+void b3PgsJacobiSolver::setupFrictionConstraint(b3RigidBodyData* bodies,b3InertiaData* inertias, b3SolverConstraint& solverConstraint, const b3Vector3& normalAxis,int  solverBodyIdA,int solverBodyIdB,b3ContactPoint& cp,const b3Vector3& rel_pos1,const b3Vector3& rel_pos2,b3RigidBodyData* colObj0,b3RigidBodyData* colObj1, b3Scalar relaxation, b3Scalar desiredVelocity, b3Scalar cfmSlip)	// Fills solverConstraint as a friction row along normalAxis between the two solver bodies; colObj0/colObj1 are not referenced in the body
+	solverConstraint.m_contactNormal = normalAxis;
+	b3SolverBody& solverBodyA = m_tmpSolverBodyPool[solverBodyIdA];
+	b3SolverBody& solverBodyB = m_tmpSolverBodyPool[solverBodyIdB];
+	b3RigidBodyData* body0 = &bodies[solverBodyA.m_originalBodyIndex];	// NOTE(review): address of an array element, so the `body0 ?` / `body1 ?` tests below look always-true - confirm
+	b3RigidBodyData* body1 = &bodies[solverBodyB.m_originalBodyIndex];
+	solverConstraint.m_solverBodyIdA = solverBodyIdA;
+	solverConstraint.m_solverBodyIdB = solverBodyIdB;
+	solverConstraint.m_friction = cp.m_combinedFriction;
+	solverConstraint.m_originalContactPoint = 0;
+	solverConstraint.m_appliedImpulse = 0.f;
+	solverConstraint.m_appliedPushImpulse = 0.f;
+	{
+		b3Vector3 ftorqueAxis1 = rel_pos1.cross(solverConstraint.m_contactNormal);	// angular axis for body A: rel_pos1 x axis
+		solverConstraint.m_relpos1CrossNormal = ftorqueAxis1;
+		solverConstraint.m_angularComponentA = body0 ? getInvInertiaTensorWorld(&inertias[solverBodyA.m_originalBodyIndex])*ftorqueAxis1 : b3MakeVector3(0,0,0);
+	}
+	{
+		b3Vector3 ftorqueAxis1 = rel_pos2.cross(-solverConstraint.m_contactNormal);	// body B uses the negated axis
+		solverConstraint.m_relpos2CrossNormal = ftorqueAxis1;
+		solverConstraint.m_angularComponentB = body1 ? getInvInertiaTensorWorld(&inertias[solverBodyB.m_originalBodyIndex])*ftorqueAxis1 : b3MakeVector3(0,0,0);
+	}
+	b3Scalar scaledDenom;	// factor applied to the velocity error when computing m_rhs below
+	{
+		b3Vector3 vec;
+		b3Scalar denom0 = 0.f;
+		b3Scalar denom1 = 0.f;
+		if (body0)
+		{
+			vec = ( solverConstraint.m_angularComponentA).cross(rel_pos1);
+			denom0 = body0->m_invMass + normalAxis.dot(vec);	// inverse mass plus the angular contribution along normalAxis
+		}
+		if (body1)
+		{
+			vec = ( -solverConstraint.m_angularComponentB).cross(rel_pos2);
+			denom1 = body1->m_invMass + normalAxis.dot(vec);
+		}
+		b3Scalar denom;
+		if (m_usePgs)
+		{
+			scaledDenom = denom = relaxation/(denom0+denom1);	// PGS: the same factor is used for m_rhs and for m_jacDiagABInv
+		} else
+		{
+			denom = relaxation/(denom0+denom1);
+			b3Scalar countA = body0->m_invMass ? b3Scalar(m_bodyCount[solverBodyA.m_originalBodyIndex]): 1.f;	// non-PGS: a body with non-zero inverse mass is weighted by its m_bodyCount entry
+			b3Scalar countB = body1->m_invMass ? b3Scalar(m_bodyCount[solverBodyB.m_originalBodyIndex]): 1.f;
+			scaledDenom = relaxation/(denom0*countA+denom1*countB);
+		}
+		solverConstraint.m_jacDiagABInv = denom;	// always the unweighted factor; only m_rhs uses scaledDenom
+	}
+	{
+		b3Scalar rel_vel;
+		b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(body0?solverBodyA.m_linearVelocity:b3MakeVector3(0,0,0)) 
+			+ solverConstraint.m_relpos1CrossNormal.dot(body0?solverBodyA.m_angularVelocity:b3MakeVector3(0,0,0));
+		b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(body1?solverBodyB.m_linearVelocity:b3MakeVector3(0,0,0)) 
+			+ solverConstraint.m_relpos2CrossNormal.dot(body1?solverBodyB.m_angularVelocity:b3MakeVector3(0,0,0));
+		rel_vel = vel1Dotn+vel2Dotn;	// relative velocity of the two solver bodies along this row
+//		b3Scalar positionalError = 0.f;
+		b3SimdScalar velocityError =  desiredVelocity - rel_vel;
+		b3SimdScalar	velocityImpulse = velocityError * b3SimdScalar(scaledDenom);//solverConstraint.m_jacDiagABInv);
+		solverConstraint.m_rhs = velocityImpulse;
+		solverConstraint.m_cfm = cfmSlip;
+		solverConstraint.m_lowerLimit = 0;	// limits are initialised to [0, 1e10] here
+		solverConstraint.m_upperLimit = 1e10f;
+	}
+b3SolverConstraint&	b3PgsJacobiSolver::addFrictionConstraint(b3RigidBodyData* bodies,b3InertiaData* inertias, const b3Vector3& normalAxis,int solverBodyIdA,int solverBodyIdB,int frictionIndex,b3ContactPoint& cp,const b3Vector3& rel_pos1,const b3Vector3& rel_pos2,b3RigidBodyData* colObj0,b3RigidBodyData* colObj1, b3Scalar relaxation, b3Scalar desiredVelocity, b3Scalar cfmSlip)
+	// Grow the friction-row pool by one uninitialised entry, record frictionIndex
+	// on it, and delegate the rest of the setup to setupFrictionConstraint.
+	b3SolverConstraint& frictionRow = m_tmpSolverContactFrictionConstraintPool.expandNonInitializing();
+	frictionRow.m_frictionIndex = frictionIndex;
+	setupFrictionConstraint(bodies, inertias, frictionRow, normalAxis,
+							solverBodyIdA, solverBodyIdB,
+							cp, rel_pos1, rel_pos2,
+							colObj0, colObj1,
+							relaxation, desiredVelocity, cfmSlip);
+	return frictionRow;
+void b3PgsJacobiSolver::setupRollingFrictionConstraint(b3RigidBodyData* bodies,b3InertiaData* inertias,	b3SolverConstraint& solverConstraint, const b3Vector3& normalAxis1,int solverBodyIdA,int  solverBodyIdB,	// Fills solverConstraint as a rolling-friction row about normalAxis1
+									b3ContactPoint& cp,const b3Vector3& rel_pos1,const b3Vector3& rel_pos2,	// rel_pos1/rel_pos2 are not referenced in the body
+									b3RigidBodyData* colObj0,b3RigidBodyData* colObj1, b3Scalar relaxation, 	// colObj0/colObj1/relaxation are not referenced in the body
+									b3Scalar desiredVelocity, b3Scalar cfmSlip)
+	b3Vector3 normalAxis=b3MakeVector3(0,0,0);	// the linear axis of this row is zero, so only the angular terms below contribute
+	solverConstraint.m_contactNormal = normalAxis;
+	b3SolverBody& solverBodyA = m_tmpSolverBodyPool[solverBodyIdA];
+	b3SolverBody& solverBodyB = m_tmpSolverBodyPool[solverBodyIdB];
+	b3RigidBodyData* body0 = &bodies[m_tmpSolverBodyPool[solverBodyIdA].m_originalBodyIndex];	// NOTE(review): address of an array element, so the `body0 ?` / `body1 ?` tests below look always-true - confirm
+	b3RigidBodyData* body1 = &bodies[m_tmpSolverBodyPool[solverBodyIdB].m_originalBodyIndex];
+	solverConstraint.m_solverBodyIdA = solverBodyIdA;
+	solverConstraint.m_solverBodyIdB = solverBodyIdB;
+	solverConstraint.m_friction = cp.m_combinedRollingFriction;
+	solverConstraint.m_originalContactPoint = 0;
+	solverConstraint.m_appliedImpulse = 0.f;
+	solverConstraint.m_appliedPushImpulse = 0.f;
+	{
+		b3Vector3 ftorqueAxis1 = -normalAxis1;	// body A uses the negated axis
+		solverConstraint.m_relpos1CrossNormal = ftorqueAxis1;
+		solverConstraint.m_angularComponentA = body0 ? getInvInertiaTensorWorld(&inertias[solverBodyA.m_originalBodyIndex])*ftorqueAxis1 : b3MakeVector3(0,0,0);
+	}
+	{
+		b3Vector3 ftorqueAxis1 = normalAxis1;
+		solverConstraint.m_relpos2CrossNormal = ftorqueAxis1;
+		solverConstraint.m_angularComponentB = body1 ? getInvInertiaTensorWorld(&inertias[solverBodyB.m_originalBodyIndex])*ftorqueAxis1 : b3MakeVector3(0,0,0);
+	}
+	{
+		b3Vector3 iMJaA = body0?getInvInertiaTensorWorld(&inertias[solverBodyA.m_originalBodyIndex])*solverConstraint.m_relpos1CrossNormal:b3MakeVector3(0,0,0);	// same product as m_angularComponentA computed above
+		b3Vector3 iMJaB = body1?getInvInertiaTensorWorld(&inertias[solverBodyB.m_originalBodyIndex])*solverConstraint.m_relpos2CrossNormal:b3MakeVector3(0,0,0);	// same product as m_angularComponentB computed above
+		b3Scalar sum = 0;
+		sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
+		sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
+		solverConstraint.m_jacDiagABInv = b3Scalar(1.)/sum;	// NOTE(review): no guard here against sum == 0
+	}
+	{
+		b3Scalar rel_vel;
+		b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(body0?solverBodyA.m_linearVelocity:b3MakeVector3(0,0,0)) 
+			+ solverConstraint.m_relpos1CrossNormal.dot(body0?solverBodyA.m_angularVelocity:b3MakeVector3(0,0,0));
+		b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(body1?solverBodyB.m_linearVelocity:b3MakeVector3(0,0,0)) 
+			+ solverConstraint.m_relpos2CrossNormal.dot(body1?solverBodyB.m_angularVelocity:b3MakeVector3(0,0,0));
+		rel_vel = vel1Dotn+vel2Dotn;
+//		b3Scalar positionalError = 0.f;
+		b3SimdScalar velocityError =  desiredVelocity - rel_vel;
+		b3SimdScalar	velocityImpulse = velocityError * b3SimdScalar(solverConstraint.m_jacDiagABInv);
+		solverConstraint.m_rhs = velocityImpulse;
+		solverConstraint.m_cfm = cfmSlip;
+		solverConstraint.m_lowerLimit = 0;	// limits are initialised to [0, 1e10] here
+		solverConstraint.m_upperLimit = 1e10f;
+	}
+b3SolverConstraint&	b3PgsJacobiSolver::addRollingFrictionConstraint(b3RigidBodyData* bodies,b3InertiaData* inertias,const b3Vector3& normalAxis,int solverBodyIdA,int solverBodyIdB,int frictionIndex,b3ContactPoint& cp,const b3Vector3& rel_pos1,const b3Vector3& rel_pos2,b3RigidBodyData* colObj0,b3RigidBodyData* colObj1, b3Scalar relaxation, b3Scalar desiredVelocity, b3Scalar cfmSlip)
+	// Grow the rolling-friction pool by one uninitialised entry, record
+	// frictionIndex on it, and delegate the rest to setupRollingFrictionConstraint.
+	b3SolverConstraint& rollingRow = m_tmpSolverContactRollingFrictionConstraintPool.expandNonInitializing();
+	rollingRow.m_frictionIndex = frictionIndex;
+	setupRollingFrictionConstraint(bodies, inertias, rollingRow, normalAxis,
+							solverBodyIdA, solverBodyIdB,
+							cp, rel_pos1, rel_pos2,
+							colObj0, colObj1,
+							relaxation, desiredVelocity, cfmSlip);
+	return rollingRow;
+int	b3PgsJacobiSolver::getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias)
+	// Returns the index in m_tmpSolverBodyPool of the solver body to use for bodies[bodyIndex].
+	// With PGS, or for a body with zero inverse mass, one solver body is shared per rigid
+	// body and its pool index is cached in m_bodyCount. Otherwise every call appends a new
+	// solver body and bumps m_bodyCountCheck.
+	//b3Assert(bodyIndex< m_tmpSolverBodyPool.size());
+	b3RigidBodyData& rigidBody = bodies[bodyIndex];
+	const bool shareSolverBody = m_usePgs || rigidBody.m_invMass==0.f;
+	if (shareSolverBody && !(m_bodyCount[bodyIndex]<0))
+	{
+		// a solver body already exists for this rigid body
+		const int cachedIndex = m_bodyCount[bodyIndex];
+		b3Assert(cachedIndex>=0);
+		return cachedIndex;
+	}
+	if (!shareSolverBody)
+	{
+		b3Assert(m_bodyCount[bodyIndex]>0);
+		m_bodyCountCheck[bodyIndex]++;
+	}
+	const int newIndex = m_tmpSolverBodyPool.size();
+	b3SolverBody& freshBody = m_tmpSolverBodyPool.expand();
+	initSolverBody(bodyIndex,&freshBody,&rigidBody);
+	freshBody.m_originalBodyIndex = bodyIndex;
+	if (shareSolverBody)
+	{
+		// remember the pool slot so later calls reuse it
+		m_bodyCount[bodyIndex] = newIndex;
+	}
+	b3Assert(newIndex>=0);
+	return newIndex;
+#include <stdio.h>
+void b3PgsJacobiSolver::setupContactConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias,b3SolverConstraint& solverConstraint, 	// Fills solverConstraint as the normal (non-penetration) row for contact point cp
+																 int solverBodyIdA, int solverBodyIdB,
+																 b3ContactPoint& cp, const b3ContactSolverInfo& infoGlobal,
+																 b3Vector3& vel, b3Scalar& rel_vel, b3Scalar& relaxation,	// outputs: written below
+																 b3Vector3& rel_pos1, b3Vector3& rel_pos2)	// outputs: contact positions relative to each solver body's origin
+			const b3Vector3& pos1 = cp.getPositionWorldOnA();
+			const b3Vector3& pos2 = cp.getPositionWorldOnB();
+			b3SolverBody* bodyA = &m_tmpSolverBodyPool[solverBodyIdA];
+			b3SolverBody* bodyB = &m_tmpSolverBodyPool[solverBodyIdB];
+			b3RigidBodyData* rb0 = &bodies[bodyA->m_originalBodyIndex];	// NOTE(review): address of an array element, so the `rb0 ?` / `rb1 ?` tests below look always-true - confirm
+			b3RigidBodyData* rb1 = &bodies[bodyB->m_originalBodyIndex];
+//			b3Vector3 rel_pos1 = pos1 - colObj0->getWorldTransform().getOrigin(); 
+//			b3Vector3 rel_pos2 = pos2 - colObj1->getWorldTransform().getOrigin();
+			rel_pos1 = pos1 - bodyA->getWorldTransform().getOrigin(); 
+			rel_pos2 = pos2 - bodyB->getWorldTransform().getOrigin();
+			relaxation = 1.f;
+			b3Vector3 torqueAxis0 = rel_pos1.cross(cp.m_normalWorldOnB);
+			solverConstraint.m_angularComponentA = rb0 ? getInvInertiaTensorWorld(&inertias[bodyA->m_originalBodyIndex])*torqueAxis0 : b3MakeVector3(0,0,0);
+			b3Vector3 torqueAxis1 = rel_pos2.cross(cp.m_normalWorldOnB);		
+			solverConstraint.m_angularComponentB = rb1 ? getInvInertiaTensorWorld(&inertias[bodyB->m_originalBodyIndex])*-torqueAxis1 : b3MakeVector3(0,0,0);	// body B uses the negated torque axis
+			b3Scalar scaledDenom;	// factor applied to the position/velocity errors when computing the rhs below
+				{
+					b3Scalar denom0 = rb0->computeImpulseDenominator(pos1,cp.m_normalWorldOnB);	// NOTE(review): denom0/denom1 are declared again two lines below; this pair is presumably guarded by preprocessor lines that are not visible in this excerpt - confirm
+					b3Scalar denom1 = rb1->computeImpulseDenominator(pos2,cp.m_normalWorldOnB);
+					b3Vector3 vec;
+					b3Scalar denom0 = 0.f;
+					b3Scalar denom1 = 0.f;
+					if (rb0)
+					{
+						vec = ( solverConstraint.m_angularComponentA).cross(rel_pos1);
+						denom0 = rb0->m_invMass + cp.m_normalWorldOnB.dot(vec);	// inverse mass plus the angular contribution along the contact normal
+					}
+					if (rb1)
+					{
+						vec = ( -solverConstraint.m_angularComponentB).cross(rel_pos2);
+						denom1 = rb1->m_invMass + cp.m_normalWorldOnB.dot(vec);
+					}
+					b3Scalar denom;
+					if (m_usePgs)
+					{
+						scaledDenom = denom = relaxation/(denom0+denom1);	// PGS: the same factor is used for the rhs and for m_jacDiagABInv
+					} else
+					{
+						denom = relaxation/(denom0+denom1);
+						b3Scalar countA = rb0->m_invMass? b3Scalar(m_bodyCount[bodyA->m_originalBodyIndex]) : 1.f;	// non-PGS: a body with non-zero inverse mass is weighted by its m_bodyCount entry
+						b3Scalar countB = rb1->m_invMass? b3Scalar(m_bodyCount[bodyB->m_originalBodyIndex]) : 1.f;
+						scaledDenom = relaxation/(denom0*countA+denom1*countB);
+					}
+					solverConstraint.m_jacDiagABInv = denom;	// always the unweighted factor
+				}
+				solverConstraint.m_contactNormal = cp.m_normalWorldOnB;
+				solverConstraint.m_relpos1CrossNormal = torqueAxis0;
+				solverConstraint.m_relpos2CrossNormal = -torqueAxis1;
+				b3Scalar restitution = 0.f;
+				b3Scalar penetration = cp.getDistance()+infoGlobal.m_linearSlop;	// contact distance plus the allowed slop
+				{
+					b3Vector3 vel1,vel2;
+					vel1 = rb0? getVelocityInLocalPoint(rb0,rel_pos1) : b3MakeVector3(0,0,0);
+					vel2 = rb1? getVelocityInLocalPoint(rb1, rel_pos2) : b3MakeVector3(0,0,0);
+	//			b3Vector3 vel2 = rb1 ? rb1->getVelocityInLocalPoint(rel_pos2) : b3Vector3(0,0,0);
+					vel  = vel1 - vel2;	// output: velocity of A relative to B at the contact
+					rel_vel = cp.m_normalWorldOnB.dot(vel);	// output: its component along the contact normal
+					solverConstraint.m_friction = cp.m_combinedFriction;
+					restitution =  restitutionCurve(rel_vel, cp.m_combinedRestitution);
+					if (restitution <= b3Scalar(0.))	// never use a negative restitution term
+					{
+						restitution = 0.f;
+					};
+				}
+				///warm starting (or zero if disabled)
+				if (infoGlobal.m_solverMode & B3_SOLVER_USE_WARMSTARTING)
+				{
+					solverConstraint.m_appliedImpulse = cp.m_appliedImpulse * infoGlobal.m_warmstartingFactor;	// start from the impulse stored on the contact point, scaled by the warm-starting factor
+					if (rb0)
+						bodyA->internalApplyImpulse(solverConstraint.m_contactNormal*bodyA->internalGetInvMass(),solverConstraint.m_angularComponentA,solverConstraint.m_appliedImpulse);
+					if (rb1)
+						bodyB->internalApplyImpulse(solverConstraint.m_contactNormal*bodyB->internalGetInvMass(),-solverConstraint.m_angularComponentB,-(b3Scalar)solverConstraint.m_appliedImpulse);	// body B is given the negated impulse magnitude
+				} else
+				{
+					solverConstraint.m_appliedImpulse = 0.f;
+				}
+				solverConstraint.m_appliedPushImpulse = 0.f;
+				{
+					b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(rb0?bodyA->m_linearVelocity:b3MakeVector3(0,0,0)) 
+						+ solverConstraint.m_relpos1CrossNormal.dot(rb0?bodyA->m_angularVelocity:b3MakeVector3(0,0,0));
+					b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rb1?bodyB->m_linearVelocity:b3MakeVector3(0,0,0)) 
+						+ solverConstraint.m_relpos2CrossNormal.dot(rb1?bodyB->m_angularVelocity:b3MakeVector3(0,0,0));
+					b3Scalar rel_vel = vel1Dotn+vel2Dotn;	// local variable that shadows the rel_vel output parameter; computed from the solver-body velocities
+					b3Scalar positionalError = 0.f;
+					b3Scalar	velocityError = restitution - rel_vel;// * damping;
+					b3Scalar erp = infoGlobal.m_erp2;	// m_erp2 by default; replaced by m_erp below when the penetration is not handled by split impulse
+					if (!infoGlobal.m_splitImpulse || (penetration > infoGlobal.m_splitImpulsePenetrationThreshold))
+					{
+						erp = infoGlobal.m_erp;
+					}
+					if (penetration>0)
+					{
+						positionalError = 0;	// positive value: no positional correction; the velocity error is reduced by penetration/timeStep instead
+						velocityError -= penetration / infoGlobal.m_timeStep;
+					} else
+					{
+						positionalError = -penetration * erp/infoGlobal.m_timeStep;
+					}
+					b3Scalar  penetrationImpulse = positionalError*scaledDenom;//solverConstraint.m_jacDiagABInv;
+					b3Scalar velocityImpulse = velocityError *scaledDenom;//solverConstraint.m_jacDiagABInv;
+					if (!infoGlobal.m_splitImpulse || (penetration > infoGlobal.m_splitImpulsePenetrationThreshold))
+					{
+						//combine position and velocity into rhs
+						solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
+						solverConstraint.m_rhsPenetration = 0.f;
+					} else
+					{
+						//split position and velocity into rhs and m_rhsPenetration
+						solverConstraint.m_rhs = velocityImpulse;
+						solverConstraint.m_rhsPenetration = penetrationImpulse;
+					}
+					solverConstraint.m_cfm = 0.f;
+					solverConstraint.m_lowerLimit = 0;	// limits are initialised to [0, 1e10] here
+					solverConstraint.m_upperLimit = 1e10f;
+				}
+void b3PgsJacobiSolver::setFrictionConstraintImpulse( b3RigidBodyData* bodies, b3InertiaData* inertias,b3SolverConstraint& solverConstraint, 	// Initialises m_appliedImpulse of the friction row(s) at solverConstraint.m_frictionIndex; inertias is not referenced in the body
+																		int solverBodyIdA, int solverBodyIdB,
+																 b3ContactPoint& cp, const b3ContactSolverInfo& infoGlobal)
+	b3SolverBody* bodyA = &m_tmpSolverBodyPool[solverBodyIdA];
+	b3SolverBody* bodyB = &m_tmpSolverBodyPool[solverBodyIdB];
+	{
+		b3SolverConstraint& frictionConstraint1 = m_tmpSolverContactFrictionConstraintPool[solverConstraint.m_frictionIndex];	// first friction row of this contact
+		if (infoGlobal.m_solverMode & B3_SOLVER_USE_WARMSTARTING)
+		{
+			frictionConstraint1.m_appliedImpulse = cp.m_appliedImpulseLateral1 * infoGlobal.m_warmstartingFactor;	// start from the lateral impulse stored on the contact point
+			if (bodies[bodyA->m_originalBodyIndex].m_invMass)	// only bodies with non-zero inverse mass receive the impulse
+				bodyA->internalApplyImpulse(frictionConstraint1.m_contactNormal*bodies[bodyA->m_originalBodyIndex].m_invMass,frictionConstraint1.m_angularComponentA,frictionConstraint1.m_appliedImpulse);
+			if (bodies[bodyB->m_originalBodyIndex].m_invMass)
+				bodyB->internalApplyImpulse(frictionConstraint1.m_contactNormal*bodies[bodyB->m_originalBodyIndex].m_invMass,-frictionConstraint1.m_angularComponentB,-(b3Scalar)frictionConstraint1.m_appliedImpulse);	// body B is given the negated impulse magnitude
+		} else
+		{
+			frictionConstraint1.m_appliedImpulse = 0.f;
+		}
+	}
+	if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+	{
+		b3SolverConstraint& frictionConstraint2 = m_tmpSolverContactFrictionConstraintPool[solverConstraint.m_frictionIndex+1];	// second friction row is read from the slot right after the first
+		if (infoGlobal.m_solverMode & B3_SOLVER_USE_WARMSTARTING)
+		{
+			frictionConstraint2.m_appliedImpulse = cp.m_appliedImpulseLateral2  * infoGlobal.m_warmstartingFactor;
+			if (bodies[bodyA->m_originalBodyIndex].m_invMass)
+				bodyA->internalApplyImpulse(frictionConstraint2.m_contactNormal*bodies[bodyA->m_originalBodyIndex].m_invMass,frictionConstraint2.m_angularComponentA,frictionConstraint2.m_appliedImpulse);
+			if (bodies[bodyB->m_originalBodyIndex].m_invMass)
+				bodyB->internalApplyImpulse(frictionConstraint2.m_contactNormal*bodies[bodyB->m_originalBodyIndex].m_invMass,-frictionConstraint2.m_angularComponentB,-(b3Scalar)frictionConstraint2.m_appliedImpulse);
+		} else
+		{
+			frictionConstraint2.m_appliedImpulse = 0.f;
+		}
+	}
+void	b3PgsJacobiSolver::convertContact(b3RigidBodyData* bodies, b3InertiaData* inertias,b3Contact4* manifold,const b3ContactSolverInfo& infoGlobal)	// Converts each contact point of manifold within the processing threshold into one contact row plus its rolling-friction and friction rows
+	b3RigidBodyData* colObj0=0,*colObj1=0;	// stay null; only forwarded to the add*Constraint helpers
+	int solverBodyIdA = getOrInitSolverBody(manifold->getBodyA(),bodies,inertias);
+	int solverBodyIdB = getOrInitSolverBody(manifold->getBodyB(),bodies,inertias);
+//	b3RigidBody* bodyA = b3RigidBody::upcast(colObj0);
+//	b3RigidBody* bodyB = b3RigidBody::upcast(colObj1);
+	b3SolverBody* solverBodyA = &m_tmpSolverBodyPool[solverBodyIdA];
+	b3SolverBody* solverBodyB = &m_tmpSolverBodyPool[solverBodyIdB];
+	///avoid collision response between two static objects
+	if (solverBodyA->m_invMass.isZero() && solverBodyB->m_invMass.isZero())
+		return;
+	int rollingFriction=1;	// budget: rolling friction is set up for at most one contact point per manifold
+	int numContacts = getNumContacts(manifold);
+	for (int j=0;j<numContacts;j++)
+	{
+		b3ContactPoint cp;	// loop-local contact point, filled in by getContactPoint
+		getContactPoint(manifold,j,cp);
+		if (cp.getDistance() <= getContactProcessingThreshold(manifold))
+		{
+			b3Vector3 rel_pos1;
+			b3Vector3 rel_pos2;
+			b3Scalar relaxation;
+			b3Scalar rel_vel;
+			b3Vector3 vel;
+			int frictionIndex = m_tmpSolverContactConstraintPool.size();	// index the contact row appended on the next line will have
+			b3SolverConstraint& solverConstraint = m_tmpSolverContactConstraintPool.expandNonInitializing();
+//			b3RigidBody* rb0 = b3RigidBody::upcast(colObj0);
+//			b3RigidBody* rb1 = b3RigidBody::upcast(colObj1);
+			solverConstraint.m_solverBodyIdA = solverBodyIdA;
+			solverConstraint.m_solverBodyIdB = solverBodyIdB;
+			solverConstraint.m_originalContactPoint = &cp;	// NOTE(review): cp is local to this loop iteration, so this pointer dangles once the iteration ends - confirm nothing dereferences it later
+			setupContactConstraint(bodies,inertias,solverConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal, vel, rel_vel, relaxation, rel_pos1, rel_pos2);	// also fills vel, rel_vel, relaxation, rel_pos1 and rel_pos2
+//			const b3Vector3& pos1 = cp.getPositionWorldOnA();
+//			const b3Vector3& pos2 = cp.getPositionWorldOnB();
+			/////setup the friction constraints
+			solverConstraint.m_frictionIndex = m_tmpSolverContactFrictionConstraintPool.size();	// index of the first friction row added for this contact
+			b3Vector3 angVelA,angVelB;
+			solverBodyA->getAngularVelocity(angVelA);
+			solverBodyB->getAngularVelocity(angVelB);			
+			b3Vector3 relAngVel = angVelB-angVelA;
+			if ((cp.m_combinedRollingFriction>0.f) && (rollingFriction>0))
+			{
+				//only a single rollingFriction per manifold
+				rollingFriction--;
+				if (relAngVel.length()>infoGlobal.m_singleAxisRollingFrictionThreshold)
+				{
+					relAngVel.normalize();	// above the threshold: a single rolling-friction row about the relative angular velocity direction
+					if (relAngVel.length()>0.001)
+						addRollingFrictionConstraint(bodies,inertias,relAngVel,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+				} else
+				{
+					addRollingFrictionConstraint(bodies,inertias,cp.m_normalWorldOnB,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);	// otherwise: one row about the contact normal, plus rows about the two axes from b3PlaneSpace1
+					b3Vector3 axis0,axis1;
+					b3PlaneSpace1(cp.m_normalWorldOnB,axis0,axis1);
+					if (axis0.length()>0.001)
+						addRollingFrictionConstraint(bodies,inertias,axis0,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+					if (axis1.length()>0.001)
+						addRollingFrictionConstraint(bodies,inertias,axis1,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+				}
+			}
+			///Bullet has several options to set the friction directions
+			///By default, each contact has only a single friction direction that is recomputed automatically every frame 
+			///based on the relative linear velocity.
+			///If the relative velocity is zero, it will automatically compute a friction direction.
+			///You can also enable two friction directions, using the B3_SOLVER_USE_2_FRICTION_DIRECTIONS.
+			///In that case, the second friction direction will be orthogonal to both contact normal and first friction direction.
+			///
+			///If you choose B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION, then the friction will be independent from the relative projected velocity.
+			///
+			///The user can manually override the friction directions for certain contacts using a contact callback, 
+			///and set the cp.m_lateralFrictionInitialized to true
+			///In that case, you can set the target relative motion in each friction direction (cp.m_contactMotion1 and cp.m_contactMotion2)
+			///this will give a conveyor belt effect
+			///
+			if (!(infoGlobal.m_solverMode & B3_SOLVER_ENABLE_FRICTION_DIRECTION_CACHING) || !cp.m_lateralFrictionInitialized)
+			{
+				cp.m_lateralFrictionDir1 = vel - cp.m_normalWorldOnB * rel_vel;	// relative velocity with its normal component removed
+				b3Scalar lat_rel_vel = cp.m_lateralFrictionDir1.length2();	// squared length of that tangential velocity
+				if (!(infoGlobal.m_solverMode & B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION) && lat_rel_vel > B3_EPSILON)
+				{
+					cp.m_lateralFrictionDir1 *= 1.f/b3Sqrt(lat_rel_vel);	// normalise the tangential direction
+					if((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+					{
+						cp.m_lateralFrictionDir2 = cp.m_lateralFrictionDir1.cross(cp.m_normalWorldOnB);
+						cp.m_lateralFrictionDir2.normalize();//??
+						addFrictionConstraint(bodies,inertias,cp.m_lateralFrictionDir2,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+					}
+					addFrictionConstraint(bodies,inertias,cp.m_lateralFrictionDir1,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+				} else
+				{
+					b3PlaneSpace1(cp.m_normalWorldOnB,cp.m_lateralFrictionDir1,cp.m_lateralFrictionDir2);	// velocity-independent fallback: both directions are derived from the contact normal
+					if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+					{
+						addFrictionConstraint(bodies,inertias,cp.m_lateralFrictionDir2,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+					}
+					addFrictionConstraint(bodies,inertias,cp.m_lateralFrictionDir1,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+					if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS) && (infoGlobal.m_solverMode & B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION))
+					{
+						cp.m_lateralFrictionInitialized = true;	// NOTE(review): cp is a loop-local object; whether this flag is written back to the manifold is not visible here
+					}
+				}
+			} else
+			{
+				addFrictionConstraint(bodies,inertias,cp.m_lateralFrictionDir1,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation,cp.m_contactMotion1, cp.m_contactCFM1);	// cached directions: the contact motion and CFM stored on the contact point are passed along
+				if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+					addFrictionConstraint(bodies,inertias,cp.m_lateralFrictionDir2,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation, cp.m_contactMotion2, cp.m_contactCFM2);
+				setFrictionConstraintImpulse( bodies,inertias,solverConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal);	// only this cached-direction path initialises the friction rows' applied impulses here
+			}
+		}
+	}
+b3Scalar b3PgsJacobiSolver::solveGroupCacheFriendlySetup(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, b3Contact4* manifoldPtr, int numManifolds,b3TypedConstraint** constraints,int numConstraints,const b3ContactSolverInfo& infoGlobal)
+	B3_PROFILE("solveGroupCacheFriendlySetup");
+	m_maxOverrideNumSolverIterations = 0;
+	m_tmpSolverBodyPool.resize(0);
+	m_bodyCount.resize(0);
+	m_bodyCount.resize(numBodies,0);
+	m_bodyCountCheck.resize(0);
+	m_bodyCountCheck.resize(numBodies,0);
+	m_deltaLinearVelocities.resize(0);
+	m_deltaLinearVelocities.resize(numBodies,b3MakeVector3(0,0,0));
+	m_deltaAngularVelocities.resize(0);
+	m_deltaAngularVelocities.resize(numBodies,b3MakeVector3(0,0,0));
+	int totalBodies = 0;
+	for (int i=0;i<numConstraints;i++)
+	{
+		int bodyIndexA = constraints[i]->getRigidBodyA();
+		int bodyIndexB = constraints[i]->getRigidBodyB();
+		if (m_usePgs)
+		{
+			m_bodyCount[bodyIndexA]=-1;
+			m_bodyCount[bodyIndexB]=-1;
+		} else
+		{
+			//didn't implement joints with Jacobi version yet
+			b3Assert(0);
+		}
+	}
+	for (int i=0;i<numManifolds;i++)
+	{
+		int bodyIndexA = manifoldPtr[i].getBodyA();
+		int bodyIndexB = manifoldPtr[i].getBodyB();
+		if (m_usePgs)
+		{
+			m_bodyCount[bodyIndexA]=-1;
+			m_bodyCount[bodyIndexB]=-1;
+		} else
+		{
+			if (bodies[bodyIndexA].m_invMass)
+			{
+				//m_bodyCount[bodyIndexA]+=manifoldPtr[i].getNPoints();
+				m_bodyCount[bodyIndexA]++;
+			}
+			else
+				m_bodyCount[bodyIndexA]=-1;
+			if (bodies[bodyIndexB].m_invMass)
+			//	m_bodyCount[bodyIndexB]+=manifoldPtr[i].getNPoints();
+				m_bodyCount[bodyIndexB]++;
+			else
+				m_bodyCount[bodyIndexB]=-1;
+		}
+	}
+	if (1)
+	{
+		int j;
+		for (j=0;j<numConstraints;j++)
+		{
+			b3TypedConstraint* constraint = constraints[j];
+			constraint->internalSetAppliedImpulse(0.0f);
+		}
+	}
+	//b3RigidBody* rb0=0,*rb1=0;
+	//if (1)
+	{
+		{
+			int totalNumRows = 0;
+			int i;
+			m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
+			//calculate the total number of contraint rows
+			for (i=0;i<numConstraints;i++)
+			{
+				b3TypedConstraint::b3ConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+				b3JointFeedback* fb = constraints[i]->getJointFeedback();
+				if (fb)
+				{
+					fb->m_appliedForceBodyA.setZero();
+					fb->m_appliedTorqueBodyA.setZero();
+					fb->m_appliedForceBodyB.setZero();
+					fb->m_appliedTorqueBodyB.setZero();
+				}
+				if (constraints[i]->isEnabled())
+				{
+				}
+				if (constraints[i]->isEnabled())
+				{
+					constraints[i]->getInfo1(&info1,bodies);
+				} else
+				{
+					info1.m_numConstraintRows = 0;
+					info1.nub = 0;
+				}
+				totalNumRows += info1.m_numConstraintRows;
+			}
+			m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
+			///setup the b3SolverConstraints
+			int currentRow = 0;
+			for (i=0;i<numConstraints;i++)
+			{
+				const b3TypedConstraint::b3ConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+				if (info1.m_numConstraintRows)
+				{
+					b3Assert(currentRow<totalNumRows);
+					b3SolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[currentRow];
+					b3TypedConstraint* constraint = constraints[i];
+					b3RigidBodyData& rbA = bodies[ constraint->getRigidBodyA()];
+					//b3RigidBody& rbA = constraint->getRigidBodyA();
+	//				b3RigidBody& rbB = constraint->getRigidBodyB();
+					b3RigidBodyData& rbB = bodies[ constraint->getRigidBodyB()];
+                    int solverBodyIdA = getOrInitSolverBody(constraint->getRigidBodyA(),bodies,inertias);
+                    int solverBodyIdB = getOrInitSolverBody(constraint->getRigidBodyB(),bodies,inertias);
+                    b3SolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];
+                    b3SolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
+					int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
+					if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)
+						m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
+					int j;
+					for ( j=0;j<info1.m_numConstraintRows;j++)
+					{
+						memset(&currentConstraintRow[j],0,sizeof(b3SolverConstraint));
+						currentConstraintRow[j].m_lowerLimit = -B3_INFINITY;
+						currentConstraintRow[j].m_upperLimit = B3_INFINITY;
+						currentConstraintRow[j].m_appliedImpulse = 0.f;
+						currentConstraintRow[j].m_appliedPushImpulse = 0.f;
+						currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
+						currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
+						currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
+					}
+					bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+					bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+					bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+					bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+					bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+					bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+					bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+					bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+					b3TypedConstraint::b3ConstraintInfo2 info2;
+					info2.fps = 1.f/infoGlobal.m_timeStep;
+					info2.erp = infoGlobal.m_erp;
+					info2.m_J1linearAxis = currentConstraintRow->m_contactNormal;
+					info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
+					info2.m_J2linearAxis = 0;
+					info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
+					info2.rowskip = sizeof(b3SolverConstraint)/sizeof(b3Scalar);//check this
+					///the size of b3SolverConstraint needs be a multiple of b3Scalar
+		            b3Assert(info2.rowskip*sizeof(b3Scalar)== sizeof(b3SolverConstraint));
+					info2.m_constraintError = &currentConstraintRow->m_rhs;
+					currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
+					info2.m_damping = infoGlobal.m_damping;
+					info2.cfm = &currentConstraintRow->m_cfm;
+					info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
+					info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
+					info2.m_numIterations = infoGlobal.m_numIterations;
+					constraints[i]->getInfo2(&info2,bodies);
+					///finalize the constraint setup
+					for ( j=0;j<info1.m_numConstraintRows;j++)
+					{
+						b3SolverConstraint& solverConstraint = currentConstraintRow[j];
+						if (solverConstraint.m_upperLimit>=constraints[i]->getBreakingImpulseThreshold())
+						{
+							solverConstraint.m_upperLimit = constraints[i]->getBreakingImpulseThreshold();
+						}
+						if (solverConstraint.m_lowerLimit<=-constraints[i]->getBreakingImpulseThreshold())
+						{
+							solverConstraint.m_lowerLimit = -constraints[i]->getBreakingImpulseThreshold();
+						}
+						solverConstraint.m_originalContactPoint = constraint;
+						b3Matrix3x3& invInertiaWorldA= inertias[constraint->getRigidBodyA()].m_invInertiaWorld;
+						{
+							//b3Vector3 angularFactorA(1,1,1);
+							const b3Vector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
+							solverConstraint.m_angularComponentA = invInertiaWorldA*ftorqueAxis1;//*angularFactorA;
+						}
+						b3Matrix3x3& invInertiaWorldB= inertias[constraint->getRigidBodyB()].m_invInertiaWorld;
+						{
+							const b3Vector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
+							solverConstraint.m_angularComponentB = invInertiaWorldB*ftorqueAxis2;//*constraint->getRigidBodyB().getAngularFactor();
+						}
+						{
+							//it is ok to use solverConstraint.m_contactNormal instead of -solverConstraint.m_contactNormal
+							//because it gets multiplied iMJlB
+							b3Vector3 iMJlA = solverConstraint.m_contactNormal*rbA.m_invMass;
+							b3Vector3 iMJaA = invInertiaWorldA*solverConstraint.m_relpos1CrossNormal;
+							b3Vector3 iMJlB = solverConstraint.m_contactNormal*rbB.m_invMass;//sign of normal?
+							b3Vector3 iMJaB = invInertiaWorldB*solverConstraint.m_relpos2CrossNormal;
+							b3Scalar sum = iMJlA.dot(solverConstraint.m_contactNormal);
+							sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
+							sum += iMJlB.dot(solverConstraint.m_contactNormal);
+							sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
+							b3Scalar fsum = b3Fabs(sum);
+							b3Assert(fsum > B3_EPSILON);
+							solverConstraint.m_jacDiagABInv = fsum>B3_EPSILON?b3Scalar(1.)/sum : 0.f;
+						}
+						///fix rhs
+						///todo: add force/torque accelerators
+						{
+							b3Scalar rel_vel;
+							b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.m_linVel) + solverConstraint.m_relpos1CrossNormal.dot(rbA.m_angVel);
+							b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.m_linVel) + solverConstraint.m_relpos2CrossNormal.dot(rbB.m_angVel);
+							rel_vel = vel1Dotn+vel2Dotn;
+							b3Scalar restitution = 0.f;
+							b3Scalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
+							b3Scalar	velocityError = restitution - rel_vel * info2.m_damping;
+							b3Scalar	penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
+							b3Scalar	velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
+							solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
+							solverConstraint.m_appliedImpulse = 0.f;
+						}
+					}
+				}
+				currentRow+=m_tmpConstraintSizesPool[i].m_numConstraintRows;
+			}
+		}
+		{
+			int i;
+			for (i=0;i<numManifolds;i++)
+			{
+				b3Contact4& manifold = manifoldPtr[i];
+				convertContact(bodies,inertias,&manifold,infoGlobal);
+			}
+		}
+	}
+//	b3ContactSolverInfo info = infoGlobal;
+	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
+	int numConstraintPool = m_tmpSolverContactConstraintPool.size();
+	int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
+	///@todo: use stack allocator for such temporarily memory, same for solver bodies/constraints
+	m_orderNonContactConstraintPool.resizeNoInitialize(numNonContactPool);
+	if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+		m_orderTmpConstraintPool.resizeNoInitialize(numConstraintPool*2);
+	else
+		m_orderTmpConstraintPool.resizeNoInitialize(numConstraintPool);
+	m_orderFrictionConstraintPool.resizeNoInitialize(numFrictionPool);
+	{
+		int i;
+		for (i=0;i<numNonContactPool;i++)
+		{
+			m_orderNonContactConstraintPool[i] = i;
+		}
+		for (i=0;i<numConstraintPool;i++)
+		{
+			m_orderTmpConstraintPool[i] = i;
+		}
+		for (i=0;i<numFrictionPool;i++)
+		{
+			m_orderFrictionConstraintPool[i] = i;
+		}
+	}
+	return 0.f;
+b3Scalar b3PgsJacobiSolver::solveSingleIteration(int iteration,b3TypedConstraint** constraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) // runs one solver pass: joint rows first, then contact, friction and rolling-friction rows; constraints/numConstraints are not read here; always returns 0
+	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
+	int numConstraintPool = m_tmpSolverContactConstraintPool.size();
+	int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
+	if (infoGlobal.m_solverMode & B3_SOLVER_RANDMIZE_ORDER) // optionally shuffle the m_order* index arrays before solving
+	{
+		if (1)			// uncomment this for a bit less random ((iteration & 7) == 0)
+		{
+			for (int j=0; j<numNonContactPool; ++j) {
+				int tmp = m_orderNonContactConstraintPool[j];
+				int swapi = b3RandInt2(j+1); // presumably an index in [0,j] - TODO confirm against b3RandInt2
+				m_orderNonContactConstraintPool[j] = m_orderNonContactConstraintPool[swapi];
+				m_orderNonContactConstraintPool[swapi] = tmp;
+			}
+			//contact/friction rows are only solved while iteration < infoGlobal.m_numIterations (see the guards below), so their order is only shuffled in that case
+			if (iteration< infoGlobal.m_numIterations)
+			{
+				for (int j=0; j<numConstraintPool; ++j) {
+					int tmp = m_orderTmpConstraintPool[j];
+					int swapi = b3RandInt2(j+1);
+					m_orderTmpConstraintPool[j] = m_orderTmpConstraintPool[swapi];
+					m_orderTmpConstraintPool[swapi] = tmp;
+				}
+				for (int j=0; j<numFrictionPool; ++j) {
+					int tmp = m_orderFrictionConstraintPool[j];
+					int swapi = b3RandInt2(j+1);
+					m_orderFrictionConstraintPool[j] = m_orderFrictionConstraintPool[swapi];
+					m_orderFrictionConstraintPool[swapi] = tmp;
+				}
+			}
+		}
+	}
+	if (infoGlobal.m_solverMode & B3_SOLVER_SIMD)
+	{
+		///solve all joint constraints, using SIMD, if available
+		for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++)
+		{
+			b3SolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[m_orderNonContactConstraintPool[j]];
+			if (iteration < constraint.m_overrideNumSolverIterations) // each joint row carries its own iteration budget
+				resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
+		}
+		if (iteration< infoGlobal.m_numIterations)
+		{
+			///solve all contact constraints using SIMD, if available
+			{
+				int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+				int multiplier = (infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS)? 2 : 1; // friction rows addressed per contact row in this scope
+				for (int c=0;c<numPoolConstraints;c++)
+				{
+					b3Scalar totalImpulse =0;
+					{
+						const b3SolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[c]];
+						resolveSingleConstraintRowLowerLimitSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+						totalImpulse = solveManifold.m_appliedImpulse; // normal impulse that bounds the friction rows solved next
+					}
+					bool applyFriction = true; // constant: the friction part below always runs
+					if (applyFriction)
+					{
+						{
+							b3SolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[c*multiplier]];
+							if (totalImpulse>b3Scalar(0)) // friction is skipped when the normal impulse is not positive
+							{
+								solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+								solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+								resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+							}
+						}
+						if (infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS)
+						{
+							b3SolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[c*multiplier+1]];
+							if (totalImpulse>b3Scalar(0))
+							{
+								solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+								solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+								resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+							}
+						}
+					}
+				}
+			}
+			{ // NOTE(review): as written this scope runs right after the interleaved scope above, so every contact row is solved twice per pass - confirm whether the two scopes were meant to be alternative branches
+				//solve the friction constraints after all contact constraints, don't interleave them
+				int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+				int j;
+				for (j=0;j<numPoolConstraints;j++)
+				{
+					const b3SolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
+					resolveSingleConstraintRowLowerLimitSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+				}
+				if (!m_usePgs) // only in this SIMD path: average between the contact and friction passes when m_usePgs is false
+					averageVelocities();
+				///solve all friction constraints, using SIMD, if available
+				int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
+				for (j=0;j<numFrictionPoolConstraints;j++)
+				{
+					b3SolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
+					b3Scalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse; // m_frictionIndex selects the contact row whose impulse bounds this friction row
+					if (totalImpulse>b3Scalar(0))
+					{
+						solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+						solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+						resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+					}
+				}
+				int numRollingFrictionPoolConstraints = m_tmpSolverContactRollingFrictionConstraintPool.size();
+				for (j=0;j<numRollingFrictionPoolConstraints;j++) // rolling-friction rows are visited in pool order (no order array is used)
+				{
+					b3SolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[j];
+					b3Scalar totalImpulse = m_tmpSolverContactConstraintPool[rollingFrictionConstraint.m_frictionIndex].m_appliedImpulse;
+					if (totalImpulse>b3Scalar(0))
+					{
+						b3Scalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse;
+						if (rollingFrictionMagnitude>rollingFrictionConstraint.m_friction) // limit is capped at m_friction itself
+							rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+						rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+						rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+						resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdA],m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdB],rollingFrictionConstraint);
+					}
+				}
+			}			
+		}
+	} else
+	{
+		//non-SIMD version
+		///solve all joint constraints
+		for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++)
+		{
+			b3SolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[m_orderNonContactConstraintPool[j]];
+			if (iteration < constraint.m_overrideNumSolverIterations) // each joint row carries its own iteration budget
+				resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
+		}
+		if (iteration< infoGlobal.m_numIterations)
+		{
+			///solve all contact constraints
+			int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+			for (int j=0;j<numPoolConstraints;j++)
+			{
+				const b3SolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
+				resolveSingleConstraintRowLowerLimit(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+			}
+			///solve all friction constraints
+			int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
+			for (int j=0;j<numFrictionPoolConstraints;j++)
+			{
+				b3SolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
+				b3Scalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse; // m_frictionIndex selects the contact row whose impulse bounds this friction row
+				if (totalImpulse>b3Scalar(0))
+				{
+					solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+					solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+					resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+				}
+			}
+			int numRollingFrictionPoolConstraints = m_tmpSolverContactRollingFrictionConstraintPool.size();
+			for (int j=0;j<numRollingFrictionPoolConstraints;j++) // rolling-friction rows are visited in pool order (no order array is used)
+			{
+				b3SolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[j];
+				b3Scalar totalImpulse = m_tmpSolverContactConstraintPool[rollingFrictionConstraint.m_frictionIndex].m_appliedImpulse;
+				if (totalImpulse>b3Scalar(0))
+				{
+					b3Scalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse;
+					if (rollingFrictionMagnitude>rollingFrictionConstraint.m_friction) // limit is capped at m_friction itself
+						rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+					rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+					rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+					resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdA],m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdB],rollingFrictionConstraint);
+				}
+			}
+		}
+	}
+	return 0.f;
+/// Split-impulse penetration pass over the contact rows.
+/// Does nothing unless infoGlobal.m_splitImpulse is set. Otherwise it makes
+/// infoGlobal.m_numIterations passes, and in each pass resolves every contact
+/// row in m_orderTmpConstraintPool order, using the SIMD resolver when
+/// B3_SOLVER_SIMD is set in m_solverMode and the cache-friendly one otherwise.
+/// `constraints` and `numConstraints` are not read here.
+void b3PgsJacobiSolver::solveGroupCacheFriendlySplitImpulseIterations(b3TypedConstraint** constraints,int numConstraints,const b3ContactSolverInfo& infoGlobal)
+	if (infoGlobal.m_splitImpulse)
+	{
+		// the resolver choice is fixed for the whole call
+		const bool useSimd = (infoGlobal.m_solverMode & B3_SOLVER_SIMD) != 0;
+		for (int pass = 0; pass < infoGlobal.m_numIterations; ++pass)
+		{
+			const int contactRowCount = m_tmpSolverContactConstraintPool.size();
+			for (int k = 0; k < contactRowCount; ++k)
+			{
+				const b3SolverConstraint& row = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[k]];
+				b3SolverBody& solverBodyA = m_tmpSolverBodyPool[row.m_solverBodyIdA];
+				b3SolverBody& solverBodyB = m_tmpSolverBodyPool[row.m_solverBodyIdB];
+				if (useSimd)
+				{
+					resolveSplitPenetrationSIMD(solverBodyA,solverBodyB,row);
+				}
+				else
+				{
+					resolveSplitPenetrationImpulseCacheFriendly(solverBodyA,solverBodyB,row);
+				}
+			}
+		}
+	}
+/// Runs the iterative part of the solve.
+/// First performs the split-impulse penetration pass, then calls
+/// solveSingleIteration() once per iteration. The iteration count is the larger
+/// of infoGlobal.m_numIterations and m_maxOverrideNumSolverIterations. When
+/// m_usePgs is false, averageVelocities() runs after every iteration.
+/// Always returns 0.
+b3Scalar b3PgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstraint** constraints,int numConstraints,const b3ContactSolverInfo& infoGlobal)
+	B3_PROFILE("solveGroupCacheFriendlyIterations");
+	///this is a special step to resolve penetrations (just for contacts)
+	solveGroupCacheFriendlySplitImpulseIterations(constraints,numConstraints,infoGlobal);
+	// a per-constraint override may ask for more passes than the global setting
+	int passCount = infoGlobal.m_numIterations;
+	if (m_maxOverrideNumSolverIterations > passCount)
+	{
+		passCount = m_maxOverrideNumSolverIterations;
+	}
+	int pass = 0;
+	while (pass < passCount)
+	{
+		solveSingleIteration(pass, constraints,numConstraints,infoGlobal);
+		if (!m_usePgs)
+			averageVelocities();
+		++pass;
+	}
+	return 0.f;
+void	b3PgsJacobiSolver::averageVelocities()
+	B3_PROFILE("averaging");
+	//average the velocities
+	int numBodies = m_bodyCount.size();
+	m_deltaLinearVelocities.resize(0);
+	m_deltaLinearVelocities.resize(numBodies,b3MakeVector3(0,0,0));
+	m_deltaAngularVelocities.resize(0);
+	m_deltaAngularVelocities.resize(numBodies,b3MakeVector3(0,0,0));
+	for (int i=0;i<m_tmpSolverBodyPool.size();i++)
+	{
+		if (!m_tmpSolverBodyPool[i].m_invMass.isZero())
+		{
+			int orgBodyIndex = m_tmpSolverBodyPool[i].m_originalBodyIndex;
+			m_deltaLinearVelocities[orgBodyIndex]+=m_tmpSolverBodyPool[i].getDeltaLinearVelocity();
+			m_deltaAngularVelocities[orgBodyIndex]+=m_tmpSolverBodyPool[i].getDeltaAngularVelocity();
+		}
+	}
+	for (int i=0;i<m_tmpSolverBodyPool.size();i++)
+	{
+		int orgBodyIndex = m_tmpSolverBodyPool[i].m_originalBodyIndex;
+		if (!m_tmpSolverBodyPool[i].m_invMass.isZero())
+		{
+			b3Assert(m_bodyCount[orgBodyIndex] == m_bodyCountCheck[orgBodyIndex]);
+			b3Scalar factor = 1.f/b3Scalar(m_bodyCount[orgBodyIndex]);
+			m_tmpSolverBodyPool[i].m_deltaLinearVelocity = m_deltaLinearVelocities[orgBodyIndex]*factor;
+			m_tmpSolverBodyPool[i].m_deltaAngularVelocity = m_deltaAngularVelocities[orgBodyIndex]*factor;
+		}
+	}
+/// Copies the solver results back to the caller's data once iterating is done.
+///  - With B3_SOLVER_USE_WARMSTARTING set, stores each contact row's applied
+///    normal and lateral impulses on its original contact point.
+///  - For every joint row: accumulates joint feedback (when the constraint has
+///    one), records the applied impulse on the constraint, and disables the
+///    constraint when |impulse| reached its breaking threshold.
+///  - Writes velocities (and, with split impulse, position and orientation)
+///    back into `bodies` for every body whose m_invMass is non-zero.
+///  - Empties the temporary solver pools.
+/// `inertias` and `numBodies` are not read here. Always returns 0.
+b3Scalar b3PgsJacobiSolver::solveGroupCacheFriendlyFinish(b3RigidBodyData* bodies,b3InertiaData* inertias,int numBodies,const b3ContactSolverInfo& infoGlobal)
+	B3_PROFILE("solveGroupCacheFriendlyFinish");
+	int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+	int i,j;
+	if (infoGlobal.m_solverMode & B3_SOLVER_USE_WARMSTARTING)
+	{
+		for (j=0;j<numPoolConstraints;j++)
+		{
+			const b3SolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
+			b3ContactPoint* pt = (b3ContactPoint*) solveManifold.m_originalContactPoint;
+			b3Assert(pt);
+			pt->m_appliedImpulse = solveManifold.m_appliedImpulse;
+			// m_frictionIndex is the contact's first friction row; the second direction follows it
+			pt->m_appliedImpulseLateral1 = m_tmpSolverContactFrictionConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
+			if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+			{
+				pt->m_appliedImpulseLateral2 = m_tmpSolverContactFrictionConstraintPool[solveManifold.m_frictionIndex+1].m_appliedImpulse;
+			}
+			//do a callback here?
+		}
+	}
+	numPoolConstraints = m_tmpSolverNonContactConstraintPool.size();
+	for (j=0;j<numPoolConstraints;j++)
+	{
+		const b3SolverConstraint& solverConstr = m_tmpSolverNonContactConstraintPool[j];
+		b3TypedConstraint* constr = (b3TypedConstraint*)solverConstr.m_originalContactPoint;
+		b3JointFeedback* fb = constr->getJointFeedback();
+		if (fb)
+		{
+			b3SolverBody* bodyA = &m_tmpSolverBodyPool[solverConstr.m_solverBodyIdA];
+			b3SolverBody* bodyB = &m_tmpSolverBodyPool[solverConstr.m_solverBodyIdB];
+			// impulse / timestep turns the accumulated impulse into a force/torque
+			fb->m_appliedForceBodyA += solverConstr.m_contactNormal*solverConstr.m_appliedImpulse*bodyA->m_linearFactor/infoGlobal.m_timeStep;
+			fb->m_appliedForceBodyB += -solverConstr.m_contactNormal*solverConstr.m_appliedImpulse*bodyB->m_linearFactor/infoGlobal.m_timeStep;
+			fb->m_appliedTorqueBodyA += solverConstr.m_relpos1CrossNormal* bodyA->m_angularFactor*solverConstr.m_appliedImpulse/infoGlobal.m_timeStep;
+			// Body B's angular Jacobian is m_relpos2CrossNormal, un-negated (the setup
+			// code builds m_angularComponentB and the relative velocity from it), so the
+			// torque on B must come from that axis, not from body A's negated axis.
+			fb->m_appliedTorqueBodyB += solverConstr.m_relpos2CrossNormal* bodyB->m_angularFactor*solverConstr.m_appliedImpulse/infoGlobal.m_timeStep;
+		}
+		constr->internalSetAppliedImpulse(solverConstr.m_appliedImpulse);
+		if (b3Fabs(solverConstr.m_appliedImpulse)>=constr->getBreakingImpulseThreshold())
+		{
+			constr->setEnabled(false);
+		}
+	}
+	{
+		B3_PROFILE("write back velocities and transforms");
+		for ( i=0;i<m_tmpSolverBodyPool.size();i++)
+		{
+			int bodyIndex = m_tmpSolverBodyPool[i].m_originalBodyIndex;
+			//b3Assert(i==bodyIndex);
+			b3RigidBodyData* body = &bodies[bodyIndex];
+			if (body->m_invMass)
+			{
+				if (infoGlobal.m_splitImpulse)
+					m_tmpSolverBodyPool[i].writebackVelocityAndTransform(infoGlobal.m_timeStep, infoGlobal.m_splitImpulseTurnErp);
+				else
+					m_tmpSolverBodyPool[i].writebackVelocity();
+				if (m_usePgs)
+				{
+					// PGS: the solver body already holds the final velocity
+					body->m_linVel = m_tmpSolverBodyPool[i].m_linearVelocity;
+					body->m_angVel = m_tmpSolverBodyPool[i].m_angularVelocity;
+				} else
+				{
+					// otherwise: add the summed deltas divided by m_bodyCount for this body
+					b3Scalar factor = 1.f/b3Scalar(m_bodyCount[bodyIndex]);
+					b3Vector3 deltaLinVel = m_deltaLinearVelocities[bodyIndex]*factor;
+					b3Vector3 deltaAngVel = m_deltaAngularVelocities[bodyIndex]*factor;
+					body->m_linVel += deltaLinVel;
+					body->m_angVel += deltaAngVel;
+				}
+				if (infoGlobal.m_splitImpulse)
+				{
+					body->m_pos = m_tmpSolverBodyPool[i].m_worldTransform.getOrigin();
+					b3Quaternion orn;
+					orn = m_tmpSolverBodyPool[i].m_worldTransform.getRotation();
+					body->m_quat = orn;
+				}
+			}
+		}
+	}
+	// drop this solve's temporary rows and bodies
+	m_tmpSolverContactConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverBodyPool.resizeNoInitialize(0);
+	return 0.f;
+void	b3PgsJacobiSolver::reset() // sets the row-shuffling random seed back to 0; no other member is touched here
+	m_btSeed2 = 0;
\ No newline at end of file
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h
new file mode 100644
index 00000000..d2ca307f
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h
@@ -0,0 +1,149 @@
+struct b3Contact4;
+struct b3ContactPoint;
+class b3Dispatcher;
+#include "b3TypedConstraint.h"
+#include "b3ContactSolverInfo.h"
+#include "b3SolverBody.h"
+#include "b3SolverConstraint.h"
+struct b3RigidBodyData;
+struct b3InertiaData;
+class b3PgsJacobiSolver // iterative constraint solver; when m_usePgs is false it averages velocities after each iteration
+	b3AlignedObjectArray<b3SolverBody>      m_tmpSolverBodyPool; // per-solve working bodies; rows refer to them through m_solverBodyIdA/B
+	b3ConstraintArray			m_tmpSolverContactConstraintPool; // contact (normal) rows
+	b3ConstraintArray			m_tmpSolverNonContactConstraintPool; // joint rows built from the b3TypedConstraint list
+	b3ConstraintArray			m_tmpSolverContactFrictionConstraintPool; // friction rows; m_frictionIndex links them to contact rows
+	b3ConstraintArray			m_tmpSolverContactRollingFrictionConstraintPool; // rolling-friction rows
+	b3AlignedObjectArray<int>	m_orderTmpConstraintPool; // solve order for contact rows
+	b3AlignedObjectArray<int>	m_orderNonContactConstraintPool; // solve order for joint rows
+	b3AlignedObjectArray<int>	m_orderFrictionConstraintPool; // solve order for friction rows
+	b3AlignedObjectArray<b3TypedConstraint::b3ConstraintInfo1> m_tmpConstraintSizesPool; // per-constraint info, including m_numConstraintRows
+	b3AlignedObjectArray<int>		m_bodyCount; // per original body: divisor used when averaging velocity deltas
+	b3AlignedObjectArray<int>		m_bodyCountCheck; // compared against m_bodyCount in an assert inside averageVelocities()
+	b3AlignedObjectArray<b3Vector3>	m_deltaLinearVelocities; // per original body: summed linear velocity deltas
+	b3AlignedObjectArray<b3Vector3>	m_deltaAngularVelocities; // per original body: summed angular velocity deltas
+	bool						m_usePgs; // false selects the velocity-averaging code paths
+	void						averageVelocities();
+	int							m_maxOverrideNumSolverIterations; // largest per-constraint iteration count seen during setup
+	int							m_numSplitImpulseRecoveries;
+	b3Scalar	getContactProcessingThreshold(b3Contact4* contact) // constant threshold; the contact argument is ignored
+	{
+		return 0.02f;
+	}
+	void setupFrictionConstraint(	b3RigidBodyData* bodies,b3InertiaData* inertias, b3SolverConstraint& solverConstraint, const b3Vector3& normalAxis,int solverBodyIdA,int  solverBodyIdB,
+									b3ContactPoint& cp,const b3Vector3& rel_pos1,const b3Vector3& rel_pos2,
+									b3RigidBodyData* colObj0,b3RigidBodyData* colObj1, b3Scalar relaxation, 
+									b3Scalar desiredVelocity=0., b3Scalar cfmSlip=0.);
+	void setupRollingFrictionConstraint(b3RigidBodyData* bodies,b3InertiaData* inertias,	b3SolverConstraint& solverConstraint, const b3Vector3& normalAxis,int solverBodyIdA,int  solverBodyIdB,
+									b3ContactPoint& cp,const b3Vector3& rel_pos1,const b3Vector3& rel_pos2,
+									b3RigidBodyData* colObj0,b3RigidBodyData* colObj1, b3Scalar relaxation, 
+									b3Scalar desiredVelocity=0., b3Scalar cfmSlip=0.);
+	b3SolverConstraint&	addFrictionConstraint(b3RigidBodyData* bodies,b3InertiaData* inertias,const b3Vector3& normalAxis,int solverBodyIdA,int solverBodyIdB,int frictionIndex,b3ContactPoint& cp,const b3Vector3& rel_pos1,const b3Vector3& rel_pos2,b3RigidBodyData* colObj0,b3RigidBodyData* colObj1, b3Scalar relaxation, b3Scalar desiredVelocity=0., b3Scalar cfmSlip=0.);
+	b3SolverConstraint&	addRollingFrictionConstraint(b3RigidBodyData* bodies,b3InertiaData* inertias,const b3Vector3& normalAxis,int solverBodyIdA,int solverBodyIdB,int frictionIndex,b3ContactPoint& cp,const b3Vector3& rel_pos1,const b3Vector3& rel_pos2,b3RigidBodyData* colObj0,b3RigidBodyData* colObj1, b3Scalar relaxation, b3Scalar desiredVelocity=0, b3Scalar cfmSlip=0.f);
+	void setupContactConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias,
+								b3SolverConstraint& solverConstraint, int solverBodyIdA, int solverBodyIdB, b3ContactPoint& cp, 
+								const b3ContactSolverInfo& infoGlobal, b3Vector3& vel, b3Scalar& rel_vel, b3Scalar& relaxation, 
+								b3Vector3& rel_pos1, b3Vector3& rel_pos2);
+	void setFrictionConstraintImpulse( b3RigidBodyData* bodies, b3InertiaData* inertias,b3SolverConstraint& solverConstraint, int solverBodyIdA,int solverBodyIdB, 
+										 b3ContactPoint& cp, const b3ContactSolverInfo& infoGlobal);
+	///m_btSeed2 is used for re-arranging the constraint rows. improves convergence/quality of friction
+	unsigned long	m_btSeed2;
+	b3Scalar restitutionCurve(b3Scalar rel_vel, b3Scalar restitution);
+	void	convertContact(b3RigidBodyData* bodies, b3InertiaData* inertias,b3Contact4* manifold,const b3ContactSolverInfo& infoGlobal);
+	void	resolveSplitPenetrationSIMD(
+     b3SolverBody& bodyA,b3SolverBody& bodyB,
+        const b3SolverConstraint& contactConstraint);
+	void	resolveSplitPenetrationImpulseCacheFriendly(
+       b3SolverBody& bodyA,b3SolverBody& bodyB,
+        const b3SolverConstraint& contactConstraint);
+	//internal method
+	int		getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias);
+	void	initSolverBody(int bodyIndex, b3SolverBody* solverBody, b3RigidBodyData* collisionObject);
+	void	resolveSingleConstraintRowGeneric(b3SolverBody& bodyA,b3SolverBody& bodyB,const b3SolverConstraint& contactConstraint);
+	void	resolveSingleConstraintRowGenericSIMD(b3SolverBody& bodyA,b3SolverBody& bodyB,const b3SolverConstraint& contactConstraint);
+	void	resolveSingleConstraintRowLowerLimit(b3SolverBody& bodyA,b3SolverBody& bodyB,const b3SolverConstraint& contactConstraint);
+	void	resolveSingleConstraintRowLowerLimitSIMD(b3SolverBody& bodyA,b3SolverBody& bodyB,const b3SolverConstraint& contactConstraint);
+	virtual b3Scalar solveGroupCacheFriendlySetup(b3RigidBodyData* bodies, b3InertiaData* inertias,int numBodies,b3Contact4* manifoldPtr, int numManifolds,b3TypedConstraint** constraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
+	virtual b3Scalar solveGroupCacheFriendlyIterations(b3TypedConstraint** constraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
+	virtual void solveGroupCacheFriendlySplitImpulseIterations(b3TypedConstraint** constraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
+	b3Scalar solveSingleIteration(int iteration, b3TypedConstraint** constraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
+	virtual b3Scalar solveGroupCacheFriendlyFinish(b3RigidBodyData* bodies, b3InertiaData* inertias,int numBodies,const b3ContactSolverInfo& infoGlobal);
+	b3PgsJacobiSolver(bool usePgs); // presumably stores usePgs in m_usePgs - definition not shown here, TODO confirm
+	virtual ~b3PgsJacobiSolver();
+//	void	solveContacts(int numBodies, b3RigidBodyData* bodies, b3InertiaData* inertias, int numContacts, b3Contact4* contacts);
+	void	solveContacts(int numBodies, b3RigidBodyData* bodies, b3InertiaData* inertias, int numContacts, b3Contact4* contacts, int numConstraints, b3TypedConstraint** constraints);
+	b3Scalar solveGroup(b3RigidBodyData* bodies,b3InertiaData* inertias,int numBodies,b3Contact4* manifoldPtr, int numManifolds,b3TypedConstraint** constraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
+	///clear internal cached data and reset random seed
+	virtual	void	reset();
+	unsigned long b3Rand2();
+	int b3RandInt2 (int n);
+	void	setRandSeed(unsigned long seed) // overwrites m_btSeed2
+	{
+		m_btSeed2 = seed;
+	}
+	unsigned long	getRandSeed() const // returns the current m_btSeed2
+	{
+		return m_btSeed2;
+	}
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.cpp b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.cpp
new file mode 100644
index 00000000..637c5b5a
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.cpp
@@ -0,0 +1,209 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3Point2PointConstraint.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include <new>
+b3Point2PointConstraint::b3Point2PointConstraint(int rbA,int rbB, const b3Vector3& pivotInA,const b3Vector3& pivotInB)
+b3Point2PointConstraint::b3Point2PointConstraint(int  rbA,const b3Vector3& pivotInA)
+void b3Point2PointConstraint::getInfo1 (b3ConstraintInfo1* info,const b3RigidBodyData* bodies) // Virtual entry point: reports the row counts of this constraint by forwarding to getInfo1NonVirtual().
+	getInfo1NonVirtual(info,bodies);
+void b3Point2PointConstraint::getInfo1NonVirtual (b3ConstraintInfo1* info,const b3RigidBodyData* bodies) // Writes fixed counts into `info`; `bodies` is not read here.
+		info->m_numConstraintRows = 3; // three rows, matching the j=0..2 rows filled by getInfo2NonVirtual()
+		info->nub = 3; // NOTE(review): presumably the number of unbounded rows -- confirm against the solver
+void b3Point2PointConstraint::getInfo2 (b3ConstraintInfo2* info, const b3RigidBodyData* bodies) // Virtual entry point: builds the transforms of both bodies from `bodies` and forwards to getInfo2NonVirtual().
+	b3Transform trA;
+	trA.setIdentity();
+	trA.setOrigin(bodies[m_rbA].m_pos); // m_rbA is used as an index into `bodies`; no bounds check is done here
+	trA.setRotation(bodies[m_rbA].m_quat);
+	b3Transform trB;
+	trB.setIdentity();
+	trB.setOrigin(bodies[m_rbB].m_pos); // same lookup for the second body via m_rbB
+	trB.setRotation(bodies[m_rbB].m_quat);
+	getInfo2NonVirtual(info, trA,trB);
+///Fills the three rows of this point-to-point constraint into `info`.
+///body0_trans and body1_trans are the transforms of body A and body B; the pivots returned by
+///getPivotInA()/getPivotInB() are rotated by the basis of the matching transform.
+///Per the b3ConstraintInfo2 comments the Jacobian blocks are initialized to 0 on entry, so only
+///the non-zero entries are written here.
+void b3Point2PointConstraint::getInfo2NonVirtual (b3ConstraintInfo2* info, const b3Transform& body0_trans, const b3Transform& body1_trans)
+	// linear Jacobian of body A: a 1 on the diagonal of each of the three rows
+	info->m_J1linearAxis[0] = 1;
+	info->m_J1linearAxis[info->rowskip+1] = 1;
+	info->m_J1linearAxis[2*info->rowskip+2] = 1;
+	// pivot A rotated by body A's basis (the unused quaternion-rotated duplicate of this value was removed)
+	b3Vector3 a1 = body0_trans.getBasis()*getPivotInA();
+	{
+		// angular Jacobian of body A: getSkewSymmetricMatrix() of -a1, one b3Vector3 per row
+		b3Vector3* angular0 = (b3Vector3*)(info->m_J1angularAxis);
+		b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis+info->rowskip);
+		b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis+2*info->rowskip);
+		b3Vector3 a1neg = -a1;
+		a1neg.getSkewSymmetricMatrix(angular0,angular1,angular2);
+	}
+	// linear Jacobian of body B is only written when the solver supplies a buffer for it
+	if (info->m_J2linearAxis)
+	{
+		info->m_J2linearAxis[0] = -1;
+		info->m_J2linearAxis[info->rowskip+1] = -1;
+		info->m_J2linearAxis[2*info->rowskip+2] = -1;
+	}
+	// pivot B rotated by body B's basis
+	b3Vector3 a2 = body1_trans.getBasis()*getPivotInB();
+	{
+		// angular Jacobian of body B: getSkewSymmetricMatrix() of +a2 (written without a null check, as before)
+		b3Vector3* angular0 = (b3Vector3*)(info->m_J2angularAxis);
+		b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis+info->rowskip);
+		b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis+2*info->rowskip);
+		a2.getSkewSymmetricMatrix(angular0,angular1,angular2);
+	}
+	// right hand side: per-axis gap between the two pivots, scaled by fps*erp.
+	// A locally set ERP (B3_P2P_FLAGS_ERP) takes precedence over the value passed in info->erp.
+	b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;
+	b3Scalar k = info->fps * currERP;
+	int j;
+	for (j=0; j<3; j++)
+	{
+		info->m_constraintError[j*info->rowskip] = k * (a2[j] + body1_trans.getOrigin()[j] - a1[j] - body0_trans.getOrigin()[j]);
+	}
+	// a locally set CFM (B3_P2P_FLAGS_CFM) overrides the value already present in info->cfm
+	if(m_flags & B3_P2P_FLAGS_CFM)
+	{
+		for (j=0; j<3; j++)
+		{
+			info->cfm[j*info->rowskip] = m_cfm;
+		}
+	}
+	// a positive impulse clamp bounds every row symmetrically; otherwise the limits are left untouched.
+	// The test does not depend on j, so it is evaluated once instead of once per row.
+	const b3Scalar impulseClamp = m_setting.m_impulseClamp;
+	if (impulseClamp > 0)
+	{
+		for (j=0; j<3; j++)
+		{
+			info->m_lowerLimit[j*info->rowskip] = -impulseClamp;
+			info->m_upperLimit[j*info->rowskip] = impulseClamp;
+		}
+	}
+	info->m_damping = m_setting.m_damping;
+void	b3Point2PointConstraint::updateRHS(b3Scalar	timeStep) // Intentionally a no-op for this constraint.
+	(void)timeStep; // the cast only marks the parameter as deliberately unused
+///Override the global value of a solver parameter for this constraint. `num` selects the parameter:
+///B3_CONSTRAINT_ERP stores `value` in m_erp, B3_CONSTRAINT_CFM stores it in m_cfm, and the matching B3_P2P_FLAGS_* bit is set so getInfo2NonVirtual() uses it.
+///This implementation only accepts axis == -1; any other axis, or any other `num`, trips b3AssertConstrParams and changes nothing.
+void b3Point2PointConstraint::setParam(int num, b3Scalar value, int axis)
+	if(axis != -1)
+	{
+		b3AssertConstrParams(0);
+	}
+	else
+	{
+		switch(num)
+		{
+			case B3_CONSTRAINT_ERP :
+				m_erp = value; 
+				m_flags |= B3_P2P_FLAGS_ERP;
+				break;
+			case B3_CONSTRAINT_CFM :
+				m_cfm = value; 
+				m_flags |= B3_P2P_FLAGS_CFM;
+				break;
+			default: 
+				b3AssertConstrParams(0);
+		}
+	}
+///Return the locally overridden value of parameter `num` (B3_CONSTRAINT_ERP or B3_CONSTRAINT_CFM).
+///Asserts via b3AssertConstrParams if axis != -1, if `num` is unknown, or if the requested parameter was never set through setParam().
+///In the first two cases the result is B3_INFINITY; when only the "was set" assertion is involved, the stored member is still returned.
+b3Scalar b3Point2PointConstraint::getParam(int num, int axis) const 
+	b3Scalar retVal(B3_INFINITY);
+	if(axis != -1)
+	{
+		b3AssertConstrParams(0);
+	}
+	else
+	{
+		switch(num)
+		{
+			case B3_CONSTRAINT_ERP :
+				b3AssertConstrParams(m_flags & B3_P2P_FLAGS_ERP);
+				retVal = m_erp; 
+				break;
+			case B3_CONSTRAINT_CFM :
+				b3AssertConstrParams(m_flags & B3_P2P_FLAGS_CFM);
+				retVal = m_cfm; 
+				break;
+			default: 
+				b3AssertConstrParams(0);
+		}
+	}
+	return retVal;
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.h b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.h
new file mode 100644
index 00000000..681b4873
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.h
@@ -0,0 +1,159 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Vector3.h"
+//#include "b3JacobianEntry.h"
+#include "b3TypedConstraint.h"
+class b3RigidBody;
+#define b3Point2PointConstraintData	b3Point2PointConstraintDoubleData
+#define b3Point2PointConstraintDataName	"b3Point2PointConstraintDoubleData"
+#define b3Point2PointConstraintData	b3Point2PointConstraintFloatData
+#define b3Point2PointConstraintDataName	"b3Point2PointConstraintFloatData"
+struct	b3ConstraintSetting // Tunable settings held by b3Point2PointConstraint in its m_setting member.
+	b3ConstraintSetting()	: // defaults: tau 0.3, damping 1, impulse clamp 0
+		m_tau(b3Scalar(0.3)),
+		m_damping(b3Scalar(1.)),
+		m_impulseClamp(b3Scalar(0.))
+	{
+	}
+	b3Scalar		m_tau; // not read by the b3Point2PointConstraint code in this patch
+	b3Scalar		m_damping; // copied to b3ConstraintInfo2::m_damping by getInfo2NonVirtual()
+	b3Scalar		m_impulseClamp; // when > 0, getInfo2NonVirtual() limits every row to [-clamp, +clamp]
+enum b3Point2PointFlags // Bit flags stored in b3Point2PointConstraint::m_flags.
+	B3_P2P_FLAGS_ERP = 1, // set by setParam(B3_CONSTRAINT_ERP): m_erp overrides the global erp
+	B3_P2P_FLAGS_CFM = 2 // set by setParam(B3_CONSTRAINT_CFM): m_cfm overrides the per-row cfm
+/// point to point constraint between two rigidbodies each with a pivotpoint that describes the 'ballsocket' location in local space
+B3_ATTRIBUTE_ALIGNED16(class) b3Point2PointConstraint : public b3TypedConstraint
+	b3Vector3	m_pivotInA; // pivot relative to body A; rotated by body A's basis in getInfo2NonVirtual()
+	b3Vector3	m_pivotInB; // pivot relative to body B; rotated by body B's basis in getInfo2NonVirtual()
+	int			m_flags; // combination of b3Point2PointFlags bits
+	b3Scalar	m_erp; // local ERP, only used while B3_P2P_FLAGS_ERP is set
+	b3Scalar	m_cfm; // local CFM, only used while B3_P2P_FLAGS_CFM is set
+	b3ConstraintSetting	m_setting;
+	b3Point2PointConstraint(int  rbA,int rbB, const b3Vector3& pivotInA,const b3Vector3& pivotInB); // rbA/rbB are integer body identifiers; getInfo2() uses them as indices into its `bodies` array
+	//b3Point2PointConstraint(int  rbA,const b3Vector3& pivotInA);
+	virtual void getInfo1 (b3ConstraintInfo1* info,const b3RigidBodyData* bodies); // forwards to getInfo1NonVirtual()
+	void getInfo1NonVirtual (b3ConstraintInfo1* info,const b3RigidBodyData* bodies);
+	virtual void getInfo2 (b3ConstraintInfo2* info, const b3RigidBodyData* bodies); // builds both body transforms, then forwards to getInfo2NonVirtual()
+	void getInfo2NonVirtual (b3ConstraintInfo2* info, const b3Transform& body0_trans, const b3Transform& body1_trans);
+	void	updateRHS(b3Scalar	timeStep); // no-op in the accompanying .cpp
+	void	setPivotA(const b3Vector3& pivotA) // replaces the stored pivot for body A
+	{
+		m_pivotInA = pivotA;
+	}
+	void	setPivotB(const b3Vector3& pivotB) // replaces the stored pivot for body B
+	{
+		m_pivotInB = pivotB;
+	}
+	const b3Vector3& getPivotInA() const // read-only access to the stored pivot for body A
+	{
+		return m_pivotInA;
+	}
+	const b3Vector3& getPivotInB() const // read-only access to the stored pivot for body B
+	{
+		return m_pivotInB;
+	}
+	///override the default global value of a parameter (such as ERP or CFM), optionally provide the axis (0..5). 
+	///If no axis is provided, it uses the default axis for this constraint.
+	///Note: the implementation in the .cpp only accepts axis == -1 and asserts otherwise.
+	virtual	void	setParam(int num, b3Scalar value, int axis = -1);
+	///return the local value of parameter
+	virtual	b3Scalar getParam(int num, int axis = -1) const;
+//	virtual	int	calculateSerializeBufferSize() const;
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+//	virtual	const char*	serialize(void* dataBuffer, b3Serializer* serializer) const;
+///Single-precision serialization layout. Do not change these serialization structures; doing so requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	b3Point2PointConstraintFloatData
+	b3TypedConstraintData	m_typeConstraintData; // base-class portion of the record
+	b3Vector3FloatData	m_pivotInA;
+	b3Vector3FloatData	m_pivotInB;
+///Double-precision serialization layout. Do not change these serialization structures; doing so requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	b3Point2PointConstraintDoubleData
+	b3TypedConstraintData	m_typeConstraintData; // base-class portion of the record
+	b3Vector3DoubleData	m_pivotInA;
+	b3Vector3DoubleData	m_pivotInB;
+B3_FORCE_INLINE	int	b3Point2PointConstraint::calculateSerializeBufferSize() const
+	return sizeof(b3Point2PointConstraintData);
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+B3_FORCE_INLINE	const char*	b3Point2PointConstraint::serialize(void* dataBuffer, b3Serializer* serializer) const
+	b3Point2PointConstraintData* p2pData = (b3Point2PointConstraintData*)dataBuffer;
+	b3TypedConstraint::serialize(&p2pData->m_typeConstraintData,serializer);
+	m_pivotInA.serialize(p2pData->m_pivotInA);
+	m_pivotInB.serialize(p2pData->m_pivotInB);
+	return b3Point2PointConstraintDataName;
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h
new file mode 100644
index 00000000..0049317d
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h
@@ -0,0 +1,302 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_SOLVER_BODY_H
+#define B3_SOLVER_BODY_H
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Matrix3x3.h"
+#include "Bullet3Common/b3AlignedAllocator.h"
+#include "Bullet3Common/b3TransformUtil.h"
+///Until we get other contributions, only use SIMD on Windows, when using Visual Studio 2008 or later, and not double precision
+#ifdef B3_USE_SSE
+#define USE_SIMD 1
+#endif //
+#ifdef USE_SIMD
+struct	b3SimdScalar // Thin wrapper around one __m128 register; only compiled when USE_SIMD is defined.
+	B3_FORCE_INLINE	b3SimdScalar() // leaves the register uninitialized
+	{
+	}
+	B3_FORCE_INLINE	b3SimdScalar(float	fl) // broadcasts `fl` into all four lanes
+	:m_vec128 (_mm_set1_ps(fl))
+	{
+	}
+	B3_FORCE_INLINE	b3SimdScalar(__m128 v128) // wraps an existing register value
+		:m_vec128(v128)
+	{
+	}
+	union
+	{
+		__m128		m_vec128;
+		float		m_floats[4];
+		float		x,y,z,w; // NOTE(review): these are four separate union members, so all of them alias lane 0 (m_floats[0]) rather than lanes 0..3
+		int			m_ints[4];
+		b3Scalar	m_unusedPadding;
+	};
+	B3_FORCE_INLINE	__m128	get128() // returns the register by value
+	{
+		return m_vec128;
+	}
+	B3_FORCE_INLINE	const __m128	get128() const
+	{
+		return m_vec128;
+	}
+	B3_FORCE_INLINE	void	set128(__m128 v128) // replaces the whole register
+	{
+		m_vec128 = v128;
+	}
+	B3_FORCE_INLINE	operator       __m128()       // implicit conversion to the raw register
+	{ 
+		return m_vec128; 
+	}
+	B3_FORCE_INLINE	operator const __m128() const 
+	{ 
+		return m_vec128; 
+	}
+	B3_FORCE_INLINE	operator float() const // implicit conversion to a scalar: yields lane 0 only
+	{ 
+		return m_floats[0]; 
+	}
+///@brief Return the elementwise (lane-by-lane) product of two b3SimdScalar, computed with _mm_mul_ps
+B3_FORCE_INLINE b3SimdScalar 
+operator*(const b3SimdScalar& v1, const b3SimdScalar& v2) 
+	return b3SimdScalar(_mm_mul_ps(v1.get128(),v2.get128()));
+///@brief Return the elementwise (lane-by-lane) sum of two b3SimdScalar, computed with _mm_add_ps
+B3_FORCE_INLINE b3SimdScalar 
+operator+(const b3SimdScalar& v1, const b3SimdScalar& v2) 
+	return b3SimdScalar(_mm_add_ps(v1.get128(),v2.get128()));
+#define b3SimdScalar b3Scalar
+///The b3SolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
+///Base velocities (m_linearVelocity/m_angularVelocity) and the accumulated solver deltas (m_delta*Velocity) are kept separately and merged by the writeback methods.
+B3_ATTRIBUTE_ALIGNED16 (struct)	b3SolverBody
+	b3Transform		m_worldTransform;
+	b3Vector3		m_deltaLinearVelocity; // linear velocity change accumulated by the apply*Impulse methods
+	b3Vector3		m_deltaAngularVelocity; // angular velocity change accumulated by the apply*Impulse methods
+	b3Vector3		m_angularFactor; // multiplied into every angular impulse
+	b3Vector3		m_linearFactor; // multiplied into every linear impulse
+	b3Vector3		m_invMass;
+	b3Vector3		m_pushVelocity; // accumulated by internalApplyPushImpulse(); integrated into the transform on writeback
+	b3Vector3		m_turnVelocity; // angular counterpart of m_pushVelocity
+	b3Vector3		m_linearVelocity;
+	b3Vector3		m_angularVelocity;
+	union 
+	{
+		void*	m_originalBody; // tested as "has a body" by the guarded methods below
+		int		m_originalBodyIndex; // NOTE(review): shares storage with m_originalBody; confirm which member the solver writes, since the guards read the pointer
+	};
+	int padding[3];
+	void	setWorldTransform(const b3Transform& worldTransform)
+	{
+		m_worldTransform = worldTransform;
+	}
+	const b3Transform& getWorldTransform() const
+	{
+		return m_worldTransform;
+	}
+	B3_FORCE_INLINE void	getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const // velocity at offset rel_pos including pending deltas; zero when m_originalBody is null
+	{
+		if (m_originalBody)
+			velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos);
+		else
+			velocity.setValue(0,0,0);
+	}
+	B3_FORCE_INLINE void	getAngularVelocity(b3Vector3& angVel) const // angular velocity including pending delta; zero when m_originalBody is null
+	{
+		if (m_originalBody)
+			angVel =m_angularVelocity+m_deltaAngularVelocity;
+		else
+			angVel.setValue(0,0,0);
+	}
+	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
+	B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude) // guarded variant: does nothing when m_originalBody is null
+	{
+		if (m_originalBody)
+		{
+			m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor;
+			m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
+		}
+	}
+	B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,b3Scalar impulseMagnitude) // same update as applyImpulse(), but accumulates into the push/turn velocities
+	{
+		if (m_originalBody)
+		{
+			m_pushVelocity += linearComponent*impulseMagnitude*m_linearFactor;
+			m_turnVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
+		}
+	}
+	const b3Vector3& getDeltaLinearVelocity() const
+	{
+		return m_deltaLinearVelocity;
+	}
+	const b3Vector3& getDeltaAngularVelocity() const
+	{
+		return m_deltaAngularVelocity;
+	}
+	const b3Vector3& getPushVelocity() const 
+	{
+		return m_pushVelocity;
+	}
+	const b3Vector3& getTurnVelocity() const 
+	{
+		return m_turnVelocity;
+	}
+	////////////////////////////////////////////////
+	///some internal methods, don't use them
+	b3Vector3& internalGetDeltaLinearVelocity() // mutable access for the solver
+	{
+		return m_deltaLinearVelocity;
+	}
+	b3Vector3& internalGetDeltaAngularVelocity() // mutable access for the solver
+	{
+		return m_deltaAngularVelocity;
+	}
+	const b3Vector3& internalGetAngularFactor() const
+	{
+		return m_angularFactor;
+	}
+	const b3Vector3& internalGetInvMass() const
+	{
+		return m_invMass;
+	}
+	void internalSetInvMass(const b3Vector3& invMass)
+	{
+		m_invMass = invMass;
+	}
+	b3Vector3& internalGetPushVelocity() // mutable access for the solver
+	{
+		return m_pushVelocity;
+	}
+	b3Vector3& internalGetTurnVelocity() // mutable access for the solver
+	{
+		return m_turnVelocity;
+	}
+	B3_FORCE_INLINE void	internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const // unguarded variant of getVelocityInLocalPointObsolete()
+	{
+		velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos);
+	}
+	B3_FORCE_INLINE void	internalGetAngularVelocity(b3Vector3& angVel) const // unguarded variant of getAngularVelocity()
+	{
+		angVel = m_angularVelocity+m_deltaAngularVelocity;
+	}
+	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
+	B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude) // same update as applyImpulse(), but the m_originalBody guard is commented out
+	{
+		//if (m_originalBody)
+		{
+			m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor;
+			m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
+		}
+	}
+	void	writebackVelocity() // folds the accumulated deltas into the base velocities; runs unconditionally (guard commented out) and does not reset the deltas
+	{
+		//if (m_originalBody>=0)
+		{
+			m_linearVelocity +=m_deltaLinearVelocity;
+			m_angularVelocity += m_deltaAngularVelocity;
+			//m_originalBody->setCompanionId(-1);
+		}
+	}
+	void	writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp) // like writebackVelocity(), but guarded by m_originalBody and also integrating push/turn velocity into m_worldTransform
+	{
+        (void) timeStep; // redundant: timeStep is used by integrateTransform() below
+		if (m_originalBody)
+		{
+			m_linearVelocity += m_deltaLinearVelocity;
+			m_angularVelocity += m_deltaAngularVelocity;
+			//correct the position/orientation based on push/turn recovery
+			b3Transform newTransform;
+			if (m_pushVelocity[0]!=0.f || m_pushVelocity[1]!=0 || m_pushVelocity[2]!=0 || m_turnVelocity[0]!=0.f || m_turnVelocity[1]!=0 || m_turnVelocity[2]!=0) // skip the integration when both recovery velocities are exactly zero
+			{
+			//	b3Quaternion orn = m_worldTransform.getRotation();
+				b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform);
+				m_worldTransform = newTransform;
+			}
+			//m_worldTransform.setRotation(orn);
+			//m_originalBody->setCompanionId(-1);
+		}
+	}
+#endif //B3_SOLVER_BODY_H
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3SolverConstraint.h b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3SolverConstraint.h
new file mode 100644
index 00000000..bce83d46
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3SolverConstraint.h
@@ -0,0 +1,80 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Matrix3x3.h"
+//#include "b3JacobianEntry.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "b3SolverBody.h"
+///1D constraint along a normal axis between bodyA and bodyB. It can be combined to solve contact and friction constraints.
+///Plain data record; instances are stored in b3ConstraintArray (b3AlignedObjectArray<b3SolverConstraint>).
+B3_ATTRIBUTE_ALIGNED16 (struct)	b3SolverConstraint
+	b3Vector3		m_relpos1CrossNormal;
+	b3Vector3		m_contactNormal;
+	b3Vector3		m_relpos2CrossNormal;
+	//b3Vector3		m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal
+	b3Vector3		m_angularComponentA;
+	b3Vector3		m_angularComponentB;
+	mutable b3SimdScalar	m_appliedPushImpulse; // mutable: writable even through a const b3SolverConstraint&
+	mutable b3SimdScalar	m_appliedImpulse; // mutable: writable even through a const b3SolverConstraint&
+	int m_padding1;
+	int m_padding2;
+	b3Scalar	m_friction;
+	b3Scalar	m_jacDiagABInv;
+	b3Scalar		m_rhs;
+	b3Scalar		m_cfm;
+    b3Scalar		m_lowerLimit;
+	b3Scalar		m_upperLimit;
+	b3Scalar		m_rhsPenetration;
+    union
+	{
+		void*		m_originalContactPoint;
+		b3Scalar	m_unusedPadding4; // shares storage with m_originalContactPoint
+	};
+	int	m_overrideNumSolverIterations;
+    int			m_frictionIndex;
+	int m_solverBodyIdA; // NOTE(review): presumably an index into the solver-body array -- confirm in b3PgsJacobiSolver
+	int m_solverBodyIdB;
+	enum		b3SolverConstraintType
+	{
+	};
+typedef b3AlignedObjectArray<b3SolverConstraint>	b3ConstraintArray;
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.cpp b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.cpp
new file mode 100644
index 00000000..699c481d
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.cpp
@@ -0,0 +1,161 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3TypedConstraint.h"
+//#include "Bullet3Common/b3Serializer.h"
+#define B3_DEFAULT_DEBUGDRAW_SIZE b3Scalar(0.3f)
+b3TypedConstraint::b3TypedConstraint(b3TypedConstraintType type, int rbA,int rbB)
+b3Scalar b3TypedConstraint::getMotorFactor(b3Scalar pos, b3Scalar lowLim, b3Scalar uppLim, b3Scalar vel, b3Scalar timeFact) // Returns a factor between 0 and 1 from the position `pos`, the limits, and the step vel/timeFact; timeFact is not checked for zero.
+	if(lowLim > uppLim) // inverted limits: always full factor
+	{
+		return b3Scalar(1.0f);
+	}
+	else if(lowLim == uppLim) // identical limits: always zero factor
+	{
+		return b3Scalar(0.0f);
+	}
+	b3Scalar lim_fact = b3Scalar(1.0f);
+	b3Scalar delta_max = vel / timeFact; // signed step size used to decide how close to a limit `pos` is
+	if(delta_max < b3Scalar(0.0f)) // negative step: only the lower limit matters
+	{
+		if((pos >= lowLim) && (pos < (lowLim - delta_max))) // within one step of the lower limit: factor shrinks linearly to 0 at the limit
+		{
+			lim_fact = (lowLim - pos) / delta_max;
+		}
+		else if(pos  < lowLim) // already past the lower limit
+		{
+			lim_fact = b3Scalar(0.0f);
+		}
+		else
+		{
+			lim_fact = b3Scalar(1.0f);
+		}
+	}
+	else if(delta_max > b3Scalar(0.0f)) // positive step: only the upper limit matters
+	{
+		if((pos <= uppLim) && (pos > (uppLim - delta_max))) // within one step of the upper limit: factor shrinks linearly to 0 at the limit
+		{
+			lim_fact = (uppLim - pos) / delta_max;
+		}
+		else if(pos  > uppLim) // already past the upper limit
+		{
+			lim_fact = b3Scalar(0.0f);
+		}
+		else
+		{
+			lim_fact = b3Scalar(1.0f);
+		}
+	}
+	else // delta_max is zero (or not comparable): zero factor
+	{
+			lim_fact = b3Scalar(0.0f);
+	}
+	return lim_fact;
+void b3AngularLimit::set(b3Scalar low, b3Scalar high, b3Scalar _softness, b3Scalar _biasFactor, b3Scalar _relaxationFactor) // Stores the limit [low, high] as a center plus half-range, together with the three tuning factors.
+	m_halfRange = (high - low) / 2.0f; // negative when low > high; test() and fit() then do nothing
+	m_center = b3NormalizeAngle(low + m_halfRange); // midpoint of the range, normalized
+	m_softness =  _softness;
+	m_biasFactor = _biasFactor;
+	m_relaxationFactor = _relaxationFactor;
+void b3AngularLimit::test(const b3Scalar angle) // Checks `angle` against the stored limit and records the outcome in m_solveLimit, m_correction and m_sign.
+	m_correction = 0.0f; // reset the outcome so stale values never survive a call
+	m_sign = 0.0f;
+	m_solveLimit = false;
+	if (m_halfRange >= 0.0f) // a negative half-range means no limit is checked
+	{
+		b3Scalar deviation = b3NormalizeAngle(angle - m_center); // normalized offset from the center of the range
+		if (deviation < -m_halfRange) // below the lower bound
+		{
+			m_solveLimit = true;
+			m_correction = - (deviation + m_halfRange);
+			m_sign = +1.0f;
+		}
+		else if (deviation > m_halfRange) // above the upper bound
+		{
+			m_solveLimit = true;
+			m_correction = m_halfRange - deviation;
+			m_sign = -1.0f;
+		}
+	}
+b3Scalar b3AngularLimit::getError() const // Product of the values stored by the last test(); zero when the limit was not violated, positive in both violated cases given the signs assigned there.
+	return m_correction * m_sign;
+void b3AngularLimit::fit(b3Scalar& angle) const // May overwrite `angle` in place with getHigh() or getLow(); does nothing unless m_halfRange > 0.
+	if (m_halfRange > 0.0f)
+	{
+		b3Scalar relativeAngle = b3NormalizeAngle(angle - m_center); // normalized offset from the center of the range
+		if (!b3Equal(relativeAngle, m_halfRange)) // NOTE(review): presumably b3Equal(a, eps) means |a| <= eps, i.e. "inside the range" -- confirm in b3Scalar.h
+		{
+			if (relativeAngle > 0.0f) // positive offset: use the upper bound
+			{
+				angle = getHigh();
+			}
+			else // zero or negative offset: use the lower bound
+			{
+				angle = getLow();
+			}
+		}
+	}
+b3Scalar b3AngularLimit::getLow() const // Lower bound of the range, rebuilt from center and half-range and normalized.
+	return b3NormalizeAngle(m_center - m_halfRange);
+b3Scalar b3AngularLimit::getHigh() const // Upper bound of the range, rebuilt from center and half-range and normalized.
+	return b3NormalizeAngle(m_center + m_halfRange);
diff --git a/src/bullet/Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h
new file mode 100644
index 00000000..cf9cec0d
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h
@@ -0,0 +1,483 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2010 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Scalar.h"
+#include "b3SolverConstraint.h"
+class b3Serializer;
+//Don't change any of the existing enum values, so add enum types at the end for serialization compatibility
+enum b3TypedConstraintType
+enum b3ConstraintParams
+#if 1
+	#define b3AssertConstrParams(_par) b3Assert(_par) 
+	#define b3AssertConstrParams(_par)
+B3_ATTRIBUTE_ALIGNED16(struct)	b3JointFeedback // Plain record of four vectors; a b3TypedConstraint can point at one through setJointFeedback().
+	b3Vector3	m_appliedForceBodyA;
+	b3Vector3	m_appliedTorqueBodyA;
+	b3Vector3	m_appliedForceBodyB;
+	b3Vector3	m_appliedTorqueBodyB;
+struct b3RigidBodyData;
+///TypedConstraint is the baseclass for Bullet constraints and vehicles
+B3_ATTRIBUTE_ALIGNED16(class) b3TypedConstraint : public b3TypedObject
+	int	m_userConstraintType;
+	union
+	{
+		int	m_userConstraintId;
+		void* m_userConstraintPtr;
+	};
+	b3Scalar	m_breakingImpulseThreshold;
+	bool		m_isEnabled;
+	bool		m_needsFeedback;
+	int			m_overrideNumSolverIterations;
+	b3TypedConstraint&	operator=(b3TypedConstraint&	other)
+	{
+		b3Assert(0);
+		(void) other;
+		return *this;
+	}
+	int				m_rbA;
+	int				m_rbB;
+	b3Scalar	m_appliedImpulse;
+	b3Scalar	m_dbgDrawSize;
+	b3JointFeedback*	m_jointFeedback;
+	///internal method used by the constraint solver, don't use them directly
+	b3Scalar getMotorFactor(b3Scalar pos, b3Scalar lowLim, b3Scalar uppLim, b3Scalar vel, b3Scalar timeFact);
+	virtual ~b3TypedConstraint() {};
+	b3TypedConstraint(b3TypedConstraintType type, int bodyA,int bodyB);
+	struct b3ConstraintInfo1 {
+		int m_numConstraintRows,nub;
+	};
+	struct b3ConstraintInfo2 {
+		// integrator parameters: frames per second (1/stepsize), default error
+		// reduction parameter (0..1).
+		b3Scalar fps,erp;
+		// for the first and second body, pointers to two (linear and angular)
+		// n*3 jacobian sub matrices, stored by rows. these matrices will have
+		// been initialized to 0 on entry. if the second body is zero then the
+		// J2xx pointers may be 0.
+		b3Scalar *m_J1linearAxis,*m_J1angularAxis,*m_J2linearAxis,*m_J2angularAxis;
+		// elements to jump from one row to the next in J's
+		int rowskip;
+		// right hand sides of the equation J*v = c + cfm * lambda. cfm is the
+		// "constraint force mixing" vector. c is set to zero on entry, cfm is
+		// set to a constant value (typically very small or zero) value on entry.
+		b3Scalar *m_constraintError,*cfm;
+		// lo and hi limits for variables (set to -/+ infinity on entry).
+		b3Scalar *m_lowerLimit,*m_upperLimit;
+		// findex vector for variables. see the LCP solver interface for a
+		// description of what this does. this is set to -1 on entry.
+		// note that the returned indexes are relative to the first index of
+		// the constraint.
+		int *findex;
+		// number of solver iterations
+		int m_numIterations;
+		//damping of the velocity
+		b3Scalar	m_damping;
+	};
+	int	getOverrideNumSolverIterations() const
+	{
+		return m_overrideNumSolverIterations;
+	}
+	///override the number of constraint solver iterations used to solve this constraint
+	///-1 will use the default number of iterations, as specified in SolverInfo.m_numIterations
+	void setOverrideNumSolverIterations(int overideNumIterations)
+	{
+		m_overrideNumSolverIterations = overideNumIterations;
+	}
+	///internal method used by the constraint solver, don't use them directly
+	virtual	void	setupSolverConstraint(b3ConstraintArray& ca, int solverBodyA,int solverBodyB, b3Scalar timeStep)
+	{
+        (void)ca;
+        (void)solverBodyA;
+        (void)solverBodyB;
+        (void)timeStep;
+	}
+	///internal method used by the constraint solver, don't use them directly
+	virtual void getInfo1 (b3ConstraintInfo1* info,const b3RigidBodyData* bodies)=0;
+	///internal method used by the constraint solver, don't use them directly
+	virtual void getInfo2 (b3ConstraintInfo2* info,  const b3RigidBodyData* bodies)=0;
+	///internal method used by the constraint solver, don't use them directly
+	void	internalSetAppliedImpulse(b3Scalar appliedImpulse)
+	{
+		m_appliedImpulse = appliedImpulse;
+	}
+	///internal method used by the constraint solver, don't use them directly
+	b3Scalar	internalGetAppliedImpulse()
+	{
+		return m_appliedImpulse;
+	}
+	b3Scalar	getBreakingImpulseThreshold() const
+	{
+		return 	m_breakingImpulseThreshold;
+	}
+	void	setBreakingImpulseThreshold(b3Scalar threshold)
+	{
+		m_breakingImpulseThreshold = threshold;
+	}
+	bool	isEnabled() const
+	{
+		return m_isEnabled;
+	}
+	void	setEnabled(bool enabled)
+	{
+		m_isEnabled=enabled;
+	}
+	///internal method used by the constraint solver, don't use them directly
+	virtual	void	solveConstraintObsolete(b3SolverBody& /*bodyA*/,b3SolverBody& /*bodyB*/,b3Scalar	/*timeStep*/) {};
+	int getRigidBodyA() const
+	{
+		return m_rbA;
+	}
+	int getRigidBodyB() const
+	{
+		return m_rbB;
+	}
+	int getRigidBodyA() 
+	{
+		return m_rbA;
+	}
+	int getRigidBodyB()
+	{
+		return m_rbB;
+	}
+	int getUserConstraintType() const
+	{
+		return m_userConstraintType ;
+	}
+	void	setUserConstraintType(int userConstraintType)
+	{
+		m_userConstraintType = userConstraintType;
+	};
+	void	setUserConstraintId(int uid)
+	{
+		m_userConstraintId = uid;
+	}
+	int getUserConstraintId() const
+	{
+		return m_userConstraintId;
+	}
+	void	setUserConstraintPtr(void* ptr)
+	{
+		m_userConstraintPtr = ptr;
+	}
+	void*	getUserConstraintPtr()
+	{
+		return m_userConstraintPtr;
+	}
+	void	setJointFeedback(b3JointFeedback* jointFeedback)
+	{
+		m_jointFeedback = jointFeedback;
+	}
+	const b3JointFeedback* getJointFeedback() const
+	{
+		return m_jointFeedback;
+	}
+	b3JointFeedback* getJointFeedback()
+	{
+		return m_jointFeedback;
+	}
+	int getUid() const
+	{
+		return m_userConstraintId;   
+	} 
+	bool	needsFeedback() const
+	{
+		return m_needsFeedback;
+	}
+	///enableFeedback will allow to read the applied linear and angular impulse
+	///use getAppliedImpulse, getAppliedLinearImpulse and getAppliedAngularImpulse to read feedback information
+	void	enableFeedback(bool needsFeedback)
+	{
+		m_needsFeedback = needsFeedback;
+	}
+	///getAppliedImpulse is an estimated total applied impulse. 
+	///This feedback could be used to determine breaking constraints or playing sounds.
+	b3Scalar	getAppliedImpulse() const
+	{
+		b3Assert(m_needsFeedback);
+		return m_appliedImpulse;
+	}
+	b3TypedConstraintType getConstraintType () const
+	{
+		return b3TypedConstraintType(m_objectType);
+	}
+	void setDbgDrawSize(b3Scalar dbgDrawSize)
+	{
+		m_dbgDrawSize = dbgDrawSize;
+	}
+	b3Scalar getDbgDrawSize()
+	{
+		return m_dbgDrawSize;
+	}
+	///override the default global value of a parameter (such as ERP or CFM), optionally provide the axis (0..5). 
+	///If no axis is provided (axis == -1, the default argument), it uses the default axis for this constraint.
+	virtual	void	setParam(int num, b3Scalar value, int axis = -1) = 0; // pure virtual: every concrete constraint must implement it
+	///return the local value of parameter `num`; axis defaults to -1 (see setParam for the axis convention)
+	virtual	b3Scalar getParam(int num, int axis = -1) const = 0; // pure virtual counterpart of setParam
+//	virtual	int	calculateSerializeBufferSize() const;
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+	//virtual	const char*	serialize(void* dataBuffer, b3Serializer* serializer) const;
+// returns angle in range [-B3_2_PI, B3_2_PI], closest to one of the limits 
+// all arguments should be normalized angles (i.e. in range [-B3_PI, B3_PI])
+B3_FORCE_INLINE b3Scalar b3AdjustAngleToLimits(b3Scalar angleInRadians, b3Scalar angleLowerLimitInRadians, b3Scalar angleUpperLimitInRadians)
+	if(angleLowerLimitInRadians >= angleUpperLimitInRadians) // empty or inverted range: the angle is passed through unchanged
+	{
+		return angleInRadians;
+	}
+	else if(angleInRadians < angleLowerLimitInRadians) // below the range
+	{
+		b3Scalar diffLo = b3Fabs(b3NormalizeAngle(angleLowerLimitInRadians - angleInRadians)); // normalized distance to the lower limit
+		b3Scalar diffHi = b3Fabs(b3NormalizeAngle(angleUpperLimitInRadians - angleInRadians)); // normalized distance to the upper limit
+		return (diffLo < diffHi) ? angleInRadians : (angleInRadians + B3_2_PI); // closer to upper limit: shift up by one full turn
+	}
+	else if(angleInRadians > angleUpperLimitInRadians) // above the range
+	{
+		b3Scalar diffHi = b3Fabs(b3NormalizeAngle(angleInRadians - angleUpperLimitInRadians)); // normalized distance to the upper limit
+		b3Scalar diffLo = b3Fabs(b3NormalizeAngle(angleInRadians - angleLowerLimitInRadians)); // normalized distance to the lower limit
+		return (diffLo < diffHi) ? (angleInRadians - B3_2_PI) : angleInRadians; // closer to lower limit: shift down by one full turn
+	}
+	else // already inside [lower, upper]
+	{
+		return angleInRadians;
+	}
+///do not change these serialization structures: any change requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	b3TypedConstraintData
+	int		m_bodyA;
+	int		m_bodyB;
+	char	*m_name;
+	int	m_objectType;
+	int	m_userConstraintType;
+	int	m_userConstraintId;
+	int	m_needsFeedback; // stored as int here although the accessor on the constraint class returns bool
+	float	m_appliedImpulse; // stored as float regardless of how b3Scalar is defined
+	float	m_dbgDrawSize;
+	int	m_disableCollisionsBetweenLinkedBodies;
+	int	m_overrideNumSolverIterations;
+	float	m_breakingImpulseThreshold;
+	int		m_isEnabled;
+/*B3_FORCE_INLINE	int	b3TypedConstraint::calculateSerializeBufferSize() const
+	return sizeof(b3TypedConstraintData);
+class b3AngularLimit // angular limit state: parameters configured via set(), results refreshed by test()
+	b3Scalar 
+		m_center,
+		m_halfRange,
+		m_softness,
+		m_biasFactor,
+		m_relaxationFactor,
+		m_correction,
+		m_sign;
+	bool
+		m_solveLimit;
+	/// Default constructor initializes limit as inactive, allowing free constraint movement
+	b3AngularLimit()
+		:m_center(0.0f),
+		m_halfRange(-1.0f), // negative half-range; presumably what marks the limit inactive — confirm against set()/test()
+		m_softness(0.9f), // same defaults as the default arguments of set()
+		m_biasFactor(0.3f),
+		m_relaxationFactor(1.0f),
+		m_correction(0.0f),
+		m_sign(0.0f),
+		m_solveLimit(false)
+	{}
+	/// Sets all limit's parameters.
+	/// When low > high limit becomes inactive.
+	/// When high - low > 2PI limit is ineffective too because no angle can exceed the limit
+	void set(b3Scalar low, b3Scalar high, b3Scalar _softness = 0.9f, b3Scalar _biasFactor = 0.3f, b3Scalar _relaxationFactor = 1.0f);
+	/// Checks constraint angle against limit. If limit is active and the angle violates the limit
+	/// correction is calculated.
+	void test(const b3Scalar angle);
+	/// Returns limit's softness
+	inline b3Scalar getSoftness() const
+	{
+		return m_softness;
+	}
+	/// Returns limit's bias factor
+	inline b3Scalar getBiasFactor() const
+	{
+		return m_biasFactor;
+	}
+	/// Returns limit's relaxation factor
+	inline b3Scalar getRelaxationFactor() const
+	{
+		return m_relaxationFactor;
+	}
+	/// Returns correction value evaluated when test() was invoked 
+	inline b3Scalar getCorrection() const
+	{
+		return m_correction;
+	}
+	/// Returns sign value evaluated when test() was invoked 
+	inline b3Scalar getSign() const
+	{
+		return m_sign;
+	}
+	/// Gives half of the distance between min and max limit angle
+	inline b3Scalar getHalfRange() const
+	{
+		return m_halfRange;
+	}
+	/// Returns true when the last test() invocation recognized limit violation
+	inline bool isLimit() const
+	{
+		return m_solveLimit;
+	}
+	/// Checks given angle against limit. If limit is active and angle doesn't fit it, the angle
+	/// returned is modified so it equals to the limit closest to given angle.
+	void fit(b3Scalar& angle) const;
+	/// Returns correction value multiplied by sign value
+	b3Scalar getError() const;
+	b3Scalar getLow() const; // defined outside this header excerpt
+	b3Scalar getHigh() const; // defined outside this header excerpt
diff --git a/src/bullet/Bullet3Dynamics/b3CpuRigidBodyPipeline.cpp b/src/bullet/Bullet3Dynamics/b3CpuRigidBodyPipeline.cpp
new file mode 100644
index 00000000..53846a6a
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/b3CpuRigidBodyPipeline.cpp
@@ -0,0 +1,484 @@
+#include "b3CpuRigidBodyPipeline.h"
+#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
+#include "Bullet3Dynamics/shared/b3Inertia.h"
+struct b3CpuRigidBodyPipelineInternalData // private state of b3CpuRigidBodyPipeline, held through its m_data pointer
+	b3AlignedObjectArray<b3RigidBodyData> m_rigidBodies; // one entry appended per registerPhysicsInstance() call
+	b3AlignedObjectArray<b3Inertia> m_inertias; // passed to the contact solver; nothing in this file appends to it
+	b3AlignedObjectArray<b3Aabb> m_aabbWorldSpace; // world-space boxes, indexed by body index in updateAabbWorldSpace()
+	b3DynamicBvhBroadphase* m_bp; // supplied by the constructor's caller; not deleted in this file
+	b3CpuNarrowPhase* m_np; // supplied by the constructor's caller; not deleted in this file
+	b3Config m_config; // copy of the config passed to the constructor
+b3CpuRigidBodyPipeline::b3CpuRigidBodyPipeline(class b3CpuNarrowPhase* narrowphase, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config) // allocates the internal data and records the collaborators
+	m_data = new b3CpuRigidBodyPipelineInternalData; // released with `delete m_data` further down in this file
+	m_data->m_np = narrowphase; // pointer is stored, not copied
+	m_data->m_bp = broadphaseDbvt; // pointer is stored, not copied
+	m_data->m_config = config; // config is copied by value
+	delete m_data;
+void b3CpuRigidBodyPipeline::updateAabbWorldSpace() // recomputes each body's world-space AABB from its pose and pushes it to the broadphase
+	for (int i=0;i<this->getNumBodies();i++)
+	{
+		b3RigidBodyData* body = &m_data->m_rigidBodies[i];
+		b3Float4 position = body->m_pos;
+		b3Quat	orientation = body->m_quat;
+		int collidableIndex = body->m_collidableIdx;
+		b3Collidable& collidable = m_data->m_np->getCollidableCpu(collidableIndex);
+		int shapeIndex = collidable.m_shapeIndex;
+		if (shapeIndex>=0) // bodies whose collidable has a negative shape index are left untouched
+		{
+			b3Aabb localAabb = m_data->m_np->getLocalSpaceAabb(shapeIndex);
+			b3Aabb& worldAabb = m_data->m_aabbWorldSpace[i]; // indexed by body index i
+			float margin=0.f; // no extra margin here (registerPhysicsInstance uses 0.01f for the initial box)
+			b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&worldAabb.m_minVec,&worldAabb.m_maxVec);
+			m_data->m_bp->setAabb(i,worldAabb.m_minVec,worldAabb.m_maxVec,0); // body index i doubles as the broadphase object id
+		}
+	}
+void	b3CpuRigidBodyPipeline::computeOverlappingPairs() // asks the broadphase to recompute its overlapping pairs and prints the resulting count
+	int numPairs = m_data->m_bp->getOverlappingPairCache()->getNumOverlappingPairs(); // NOTE(review): this value is overwritten below before it is read
+	m_data->m_bp->calculateOverlappingPairs();
+	numPairs = m_data->m_bp->getOverlappingPairCache()->getNumOverlappingPairs();
+	printf("numPairs=%d\n",numPairs); // NOTE(review): unconditional stdout output on every call
+void b3CpuRigidBodyPipeline::computeContactPoints() // hands the broadphase pair array to the narrowphase to generate contacts
+	b3AlignedObjectArray<b3Int4>& pairs = m_data->m_bp->getOverlappingPairCache()->getOverlappingPairArray(); // reference, no copy of the pair array
+	m_data->m_np->computeContacts(pairs,m_data->m_aabbWorldSpace, m_data->m_rigidBodies);
+void	b3CpuRigidBodyPipeline::stepSimulation(float deltaTime) // one simulation step: AABB update, pair search, contact generation, integration
+	//update world space aabb's
+	updateAabbWorldSpace();
+	//compute overlapping pairs
+	computeOverlappingPairs();
+	//compute contacts
+	computeContactPoints();
+	//solve contacts (NOTE(review): no call follows here; solveContactConstraints() is not invoked from this function)
+	//update transforms
+	integrate(deltaTime);
+static	inline	float b3CalcRelVel(const b3Vector3& l0, const b3Vector3& l1, const b3Vector3& a0, const b3Vector3& a1, 
+					 const b3Vector3& linVel0, const b3Vector3& angVel0, const b3Vector3& linVel1, const b3Vector3& angVel1) // projects both bodies' velocities onto a constraint row
+	return b3Dot(l0, linVel0) + b3Dot(a0, angVel0) + b3Dot(l1, linVel1) + b3Dot(a1, angVel1); // sum of four dot products: (l0,a0) against body 0, (l1,a1) against body 1
+static	inline	void b3SetLinearAndAngular(const b3Vector3& n, const b3Vector3& r0, const b3Vector3& r1,
+							 b3Vector3& linear, b3Vector3& angular0, b3Vector3& angular1) // fills the three outputs from direction n and lever arms r0, r1
+	linear = -n; // linear part is the negated direction
+	angular0 = -b3Cross(r0, n); // angular part for the first body
+	angular1 = b3Cross(r1, n); // angular part for the second body (opposite sign convention)
+static inline void b3SolveContact(b3ContactConstraint4& cs, 
+	const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
+	const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, 
+	float maxRambdaDt[4], float minRambdaDt[4]) // applies one clamped impulse update per contact point (up to 4); velocities are modified in place
+	b3Vector3 dLinVelA; dLinVelA.setZero(); // NOTE(review): the four d*Vel locals are zeroed but not read again in this function
+	b3Vector3 dAngVelA; dAngVelA.setZero();
+	b3Vector3 dLinVelB; dLinVelB.setZero();
+	b3Vector3 dAngVelB; dAngVelB.setZero();
+	for(int ic=0; ic<4; ic++)
+	{
+		//	early skip for unused points: a zero coefficient would scale the impulse delta to zero anyway
+		if( cs.m_jacCoeffInv[ic] == 0.f ) continue;
+		{
+			b3Vector3 angular0, angular1, linear;
+			b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA; // lever arm from body A's position to the contact point
+			b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB; // lever arm from body B's position to the contact point
+			b3SetLinearAndAngular( (const b3Vector3 &)-cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, linear, angular0, angular1 );
+			float rambdaDt = b3CalcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic]; // relative velocity along the row plus the precomputed bias term
+			rambdaDt *= cs.m_jacCoeffInv[ic];
+			{
+				float prevSum = cs.m_appliedRambdaDt[ic]; // impulse accumulated so far for this point
+				float updated = prevSum;
+				updated += rambdaDt;
+				updated = b3Max( updated, minRambdaDt[ic] ); // clamp the accumulated value, not the delta
+				updated = b3Min( updated, maxRambdaDt[ic] );
+				rambdaDt = updated - prevSum; // delta actually applied after clamping
+				cs.m_appliedRambdaDt[ic] = updated;
+			}
+			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
+			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
+			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
+			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+#ifdef _WIN32
+            b3Assert(_finite(linImp0.getX()));
+			b3Assert(_finite(linImp1.getX()));
+			{
+				linVelA += linImp0; // velocity changes are written straight into the caller's vectors
+				angVelA += angImp0;
+				linVelB += linImp1;
+				angVelB += angImp1;
+			}
+		}
+	}
+static inline void b3SolveFriction(b3ContactConstraint4& cs, 
+		const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
+		const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, 
+		float maxRambdaDt[4], float minRambdaDt[4]) // applies clamped impulses along two tangents at cs.m_center, then damps spin about the normal; velocities are modified in place
+	if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[1] == 0 ) return; // skip only when BOTH tangent coefficients are zero (the second test previously re-checked index 0)
+	const b3Vector3& center = (const b3Vector3&)cs.m_center;
+	b3Vector3 n = -(const b3Vector3&)cs.m_linear; // negated stored normal
+	b3Vector3 tangent[2];
+	b3PlaneSpace1 (n, tangent[0],tangent[1]); // two directions spanning the plane perpendicular to n
+	b3Vector3 angular0, angular1, linear;
+	b3Vector3 r0 = center - posA; // lever arm from body A's position to the friction point
+	b3Vector3 r1 = center - posB; // lever arm from body B's position to the friction point
+	for(int i=0; i<2; i++) // one pass per tangent direction
+	{
+		b3SetLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 );
+		float rambdaDt = b3CalcRelVel(linear, -linear, angular0, angular1,
+			linVelA, angVelA, linVelB, angVelB ); // relative velocity along this tangent row (no bias term for friction)
+		rambdaDt *= cs.m_fJacCoeffInv[i]; // a zero coefficient for this tangent yields a zero impulse delta
+			{
+				float prevSum = cs.m_fAppliedRambdaDt[i]; // friction impulse accumulated so far for this tangent
+				float updated = prevSum;
+				updated += rambdaDt;
+				updated = b3Max( updated, minRambdaDt[i] ); // clamp the accumulated value, not the delta
+				updated = b3Min( updated, maxRambdaDt[i] );
+				rambdaDt = updated - prevSum; // delta actually applied after clamping
+				cs.m_fAppliedRambdaDt[i] = updated;
+			}
+		b3Vector3 linImp0 = invMassA*linear*rambdaDt;
+		b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
+		b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
+		b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+#ifdef _WIN32
+		b3Assert(_finite(linImp0.getX()));
+		b3Assert(_finite(linImp1.getX()));
+		linVelA += linImp0;
+		angVelA += angImp0;
+		linVelB += linImp1;
+		angVelB += angImp1;
+	}
+	{	//	angular damping for point constraint
+		b3Vector3 ab = ( posB - posA ).normalized();
+		b3Vector3 ac = ( center - posA ).normalized();
+		if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) // friction point nearly on the A->B line, or either body has zero inverse mass
+		{
+			float angNA = b3Dot( n, angVelA ); // spin of A about the normal
+			float angNB = b3Dot( n, angVelB ); // spin of B about the normal
+			angVelA -= (angNA*0.1f)*n; // remove 10% of that spin per call
+			angVelB -= (angNB*0.1f)*n;
+		}
+	}
+struct b3SolveTask// : public ThreadPool::Task  -- solves a range of contact constraints batch by batch, in either contact or friction mode
+	b3SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies,  
+				b3AlignedObjectArray<b3Inertia>& shapes, 
+				b3AlignedObjectArray<b3ContactConstraint4>& constraints,
+				int start, int nConstraints,
+				int maxNumBatches,
+				b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx
+				)
+		: m_bodies( bodies ), m_shapes( shapes ), m_constraints( constraints ), m_start( start ), m_nConstraints( nConstraints ),
+		m_solveFriction( true ),m_maxNumBatches(maxNumBatches), // friction mode is the default; callers may flip m_solveFriction afterwards
+		m_wgUsedBodies(wgUsedBodies),m_curWgidx(curWgidx)
+	{}
+	unsigned short int getType(){ return 0; }
+	void run(int tIdx) // tIdx is not used in the body
+	{
+		b3AlignedObjectArray<int> usedBodies; // per-batch usage counters, indexed by body index
+		//printf("run..............\n");
+		for (int bb=0;bb<m_maxNumBatches;bb++) // one full scan of the constraint range per batch index
+		{
+			usedBodies.resize(0);
+			for(int ic=m_nConstraints-1; ic>=0; ic--) // constraints are visited in reverse order
+			//for(int ic=0; ic<m_nConstraints; ic++)
+			{
+				int i = m_start + ic;
+				if (m_constraints[i].m_batchIdx != bb) // only constraints belonging to the current batch are solved
+					continue;
+				float frictionCoeff = b3GetFrictionCoeff(&m_constraints[i]); // NOTE(review): overwritten with 0.7f before use in the friction branch below
+				int aIdx = (int)m_constraints[i].m_bodyA;
+				int bIdx = (int)m_constraints[i].m_bodyB;
+				int localBatch = m_constraints[i].m_batchIdx; // only referenced by the commented-out printfs below
+				b3RigidBodyData& bodyA = m_bodies[aIdx];
+				b3RigidBodyData& bodyB = m_bodies[bIdx];
+				if ((bodyA.m_invMass) && (bodyB.m_invMass))
+				{
+				//	printf("aIdx=%d, bIdx=%d\n", aIdx,bIdx);
+				}
+				if (bIdx==10)
+				{
+					//printf("ic(b)=%d, localBatch=%d\n",ic,localBatch);
+				}
+				if (aIdx==10)
+				{
+					//printf("ic(a)=%d, localBatch=%d\n",ic,localBatch);
+				}
+				if (usedBodies.size()<(aIdx+1)) // grow the counter array so aIdx is a valid index
+				{
+					usedBodies.resize(aIdx+1,0);
+				}
+				if (usedBodies.size()<(bIdx+1)) // grow the counter array so bIdx is a valid index
+				{
+					usedBodies.resize(bIdx+1,0);
+				}
+				if (bodyA.m_invMass) // only bodies with non-zero inverse mass are tracked
+				{
+					b3Assert(usedBodies[aIdx]==0); // such a body must appear at most once within a batch
+					usedBodies[aIdx]++;
+				}
+				if (bodyB.m_invMass)
+				{
+					b3Assert(usedBodies[bIdx]==0); // such a body must appear at most once within a batch
+					usedBodies[bIdx]++;
+				}
+				if( !m_solveFriction ) // contact mode
+				{
+					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; // accumulated contact impulse is clamped to [0, FLT_MAX]
+					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+					b3SolveContact( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, 
+							(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
+						maxRambdaDt, minRambdaDt );
+				}
+				else // friction mode
+				{
+					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+					float sum = 0;
+					for(int j=0; j<4; j++)
+					{
+						sum +=m_constraints[i].m_appliedRambdaDt[j]; // total contact impulse applied so far over the 4 points
+					}
+					frictionCoeff = 0.7f; // hard-coded; replaces the value read from the constraint above
+					for(int j=0; j<4; j++)
+					{
+						maxRambdaDt[j] = frictionCoeff*sum; // friction bound is +/- coefficient times the applied contact impulse
+						minRambdaDt[j] = -maxRambdaDt[j];
+					}
+				b3SolveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, 
+						(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
+						maxRambdaDt, minRambdaDt );
+				}
+			}
+			if (m_wgUsedBodies) // optional bookkeeping; skipped when the pointer is null
+			{
+				if (m_wgUsedBodies[m_curWgidx].size()<usedBodies.size()) // m_wgUsedBodies is indexed as an array of arrays
+				{
+					m_wgUsedBodies[m_curWgidx].resize(usedBodies.size());
+				}
+				for (int i=0;i<usedBodies.size();i++)
+				{
+					if (usedBodies[i])
+					{
+						//printf("cell %d uses body %d\n", m_curWgidx,i);
+						m_wgUsedBodies[m_curWgidx][i]=1; // flags are only ever set here, never cleared
+					}
+				}
+			}
+		}
+	}
+	b3AlignedObjectArray<b3RigidBodyData>& m_bodies; // reference: the task does not own the body array
+	b3AlignedObjectArray<b3Inertia>& m_shapes; // reference: indexed by body index for m_invInertiaWorld
+	b3AlignedObjectArray<b3ContactConstraint4>& m_constraints; // reference: constraints [m_start, m_start+m_nConstraints) are processed
+	b3AlignedObjectArray<int>* m_wgUsedBodies; // may be null
+	int m_curWgidx;
+	int m_start;
+	int m_nConstraints;
+	bool m_solveFriction; // false: run() calls b3SolveContact; true: run() calls b3SolveFriction
+	int m_maxNumBatches;
+void b3CpuRigidBodyPipeline::solveContactConstraints() // runs 4 contact-solver passes followed by 4 friction-solver passes over a local constraint array
+	int m_nIterations = 4; // local variable despite the m_ prefix
+	b3AlignedObjectArray<b3ContactConstraint4> contactConstraints;
+	const b3AlignedObjectArray<b3Contact4Data>& contacts = m_data->m_np->getContacts(); // NOTE(review): not read again in this function
+	int n = contactConstraints.size(); // NOTE(review): nothing visible here appends to contactConstraints before this, so n appears to be 0
+	//convert contacts...
+	int maxNumBatches = 250;
+	for(int iter=0; iter<m_nIterations; iter++) // contact passes
+	{
+		b3SolveTask task( m_data->m_rigidBodies, m_data->m_inertias, contactConstraints, 0, n ,maxNumBatches,0,0);
+		task.m_solveFriction = false;
+		task.run(0);
+	}
+	for(int iter=0; iter<m_nIterations; iter++) // friction passes
+	{
+		b3SolveTask task( m_data->m_rigidBodies, m_data->m_inertias, contactConstraints, 0, n ,maxNumBatches,0,0);
+		task.m_solveFriction = true;
+		task.run(0);
+	}
+void b3CpuRigidBodyPipeline::integrate(float deltaTime) // advances every registered body by deltaTime using b3IntegrateTransform
+	float angDamping=1.f; // b3IntegrateTransform MULTIPLIES m_angVel by this factor, so 1 means "no damping"; the previous 0 zeroed all angular velocity each step
+	b3Vector3 gravityAcceleration=b3MakeVector3(0,-9,0); // hard-coded gravity along -Y
+	//integrate transforms (external forces/gravity should be moved into constraint solver)
+	for (int i=0;i<m_data->m_rigidBodies.size();i++)
+	{
+		b3IntegrateTransform(&m_data->m_rigidBodies[i],deltaTime,angDamping,gravityAcceleration); // bodies with zero inverse mass are skipped inside the callee
+	}
+int		b3CpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userData) // appends a body and, for a valid collidable, its AABB and broadphase proxy; returns the new body index. userData is not used in the body.
+	b3RigidBodyData body;
+	int bodyIndex = m_data->m_rigidBodies.size(); // index the new body will occupy
+	body.m_invMass = mass ? 1.f/mass : 0.f; // mass 0 yields inverse mass 0
+	body.m_angVel.setValue(0,0,0);
+	body.m_collidableIdx = collidableIndex;
+	body.m_frictionCoeff = 0.3f;
+	body.m_linVel.setValue(0,0,0);
+	body.m_pos.setValue(position[0],position[1],position[2]); // position is read as 3 floats
+	body.m_quat.setValue(orientation[0],orientation[1],orientation[2],orientation[3]); // orientation is read as 4 floats
+	body.m_restituitionCoeff = 0.f;
+	m_data->m_rigidBodies.push_back(body); // NOTE(review): pushed before the collidableIndex check, so an invalid index still adds a body but no AABB entry
+	if (collidableIndex>=0)
+	{
+		b3Aabb& worldAabb = m_data->m_aabbWorldSpace.expand();
+		b3Aabb localAabb = m_data->m_np->getLocalSpaceAabb(collidableIndex); // NOTE(review): updateAabbWorldSpace() passes collidable.m_shapeIndex to this call instead — confirm which index is expected
+		b3Vector3 localAabbMin=b3MakeVector3(localAabb.m_min[0],localAabb.m_min[1],localAabb.m_min[2]);
+		b3Vector3 localAabbMax=b3MakeVector3(localAabb.m_max[0],localAabb.m_max[1],localAabb.m_max[2]);
+		b3Scalar margin = 0.01f;
+		b3Transform t;
+		t.setIdentity();
+		t.setOrigin(b3MakeVector3(position[0],position[1],position[2]));
+		t.setRotation(b3Quaternion(orientation[0],orientation[1],orientation[2],orientation[3]));
+		b3TransformAabb(localAabbMin,localAabbMax, margin,t,worldAabb.m_minVec,worldAabb.m_maxVec);
+		m_data->m_bp->createProxy(worldAabb.m_minVec,worldAabb.m_maxVec,bodyIndex,0,1,1); // the body index is used as the proxy's identifier
+//		b3Vector3 aabbMin,aabbMax;
+	//	m_data->m_bp->getAabb(bodyIndex,aabbMin,aabbMax);
+	} else
+	{
+		b3Error("registerPhysicsInstance using invalid collidableIndex\n");
+	}
+	return bodyIndex; // returned on both paths, including after the error above
+const struct b3RigidBodyData* b3CpuRigidBodyPipeline::getBodyBuffer() const // pointer to the first body, or 0 when no bodies are registered
+	return m_data->m_rigidBodies.size() ? &m_data->m_rigidBodies[0] : 0; // avoids indexing element 0 of an empty array
+int	b3CpuRigidBodyPipeline::getNumBodies() const // number of bodies registered so far
+	return m_data->m_rigidBodies.size();
diff --git a/src/bullet/Bullet3Dynamics/b3CpuRigidBodyPipeline.h b/src/bullet/Bullet3Dynamics/b3CpuRigidBodyPipeline.h
new file mode 100644
index 00000000..2f3c2ae7
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/b3CpuRigidBodyPipeline.h
@@ -0,0 +1,67 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
+class b3CpuRigidBodyPipeline // CPU rigid body pipeline: broadphase -> narrowphase -> integration, with state hidden behind m_data
+	struct b3CpuRigidBodyPipelineInternalData*	m_data; // opaque pointer to the implementation state
+	int allocateCollidable();
+	b3CpuRigidBodyPipeline(class b3CpuNarrowPhase* narrowphase, struct b3DynamicBvhBroadphase* broadphaseDbvt, const struct b3Config& config);
+	virtual ~b3CpuRigidBodyPipeline();
+	virtual void	stepSimulation(float deltaTime); // runs AABB update, pair search, contact generation and integration
+	virtual void	integrate(float timeStep);
+	virtual void	updateAabbWorldSpace();
+	virtual void	computeOverlappingPairs();
+	virtual void	computeContactPoints();
+	virtual void	solveContactConstraints();
+	int		registerConvexPolyhedron(class b3ConvexUtility* convex);
+	int		registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData); // position: 3 floats, orientation: 4 floats; returns the new body index
+	void	writeAllInstancesToGpu();
+	void	copyConstraintsToHost();
+	void	setGravity(const float* grav);
+	void	reset();
+	int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold);
+	int createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold);
+	void removeConstraintByUid(int uid);
+	void	addConstraint(class b3TypedConstraint* constraint);
+	void	removeConstraint(b3TypedConstraint* constraint);
+	void	castRays(const b3AlignedObjectArray<b3RayInfo>& rays,	b3AlignedObjectArray<b3RayHit>& hitResults);
+	const struct b3RigidBodyData* getBodyBuffer() const; // 0 when there are no bodies
+	int	getNumBodies() const;
\ No newline at end of file
diff --git a/src/bullet/Bullet3Dynamics/shared/b3ContactConstraint4.h b/src/bullet/Bullet3Dynamics/shared/b3ContactConstraint4.h
new file mode 100644
index 00000000..68cf65e3
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/shared/b3ContactConstraint4.h
@@ -0,0 +1,34 @@
+#include "Bullet3Common/shared/b3Float4.h"
+typedef struct b3ContactConstraint4 b3ContactConstraint4_t;
+struct b3ContactConstraint4 // solver-side data for one contact manifold of up to 4 points plus 2 friction rows
+	b3Float4 m_linear;//normal? (xyz); the w component carries the friction coefficient, see b3GetFrictionCoeff
+	b3Float4 m_worldPos[4]; // contact point positions
+	b3Float4 m_center;	//	friction
+	float m_jacCoeffInv[4]; // per-point coefficient; 0 marks a point the contact solver skips
+	float m_b[4]; // per-point bias term added to the relative velocity
+	float m_appliedRambdaDt[4]; // per-point accumulated impulse
+	float m_fJacCoeffInv[2];	//	friction
+	float m_fAppliedRambdaDt[2];	//	friction
+	unsigned int m_bodyA;
+	unsigned int m_bodyB;
+	int			m_batchIdx; // batch the solver task processes this constraint in
+	unsigned int m_paddings;
+//inline	void setFrictionCoeff(float value) { m_linear[3] = value; }
+inline	float b3GetFrictionCoeff(b3ContactConstraint4_t* constraint) // the friction coefficient is stored in the w component of m_linear
+	return constraint->m_linear.w; 
diff --git a/src/bullet/Bullet3Dynamics/shared/b3ConvertConstraint4.h b/src/bullet/Bullet3Dynamics/shared/b3ConvertConstraint4.h
new file mode 100644
index 00000000..805a2bd3
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/shared/b3ConvertConstraint4.h
@@ -0,0 +1,153 @@
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q); // forward declaration of the definition that follows
+ void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q) // writes two vectors p[0], q[0] perpendicular to n (q = n x p); presumably n is unit length — TODO confirm
+  if (b3Fabs(n.z) > 0.70710678f) { // 0.7071... is 1/sqrt(2): n leans mostly along z
+    // choose p in y-z plane
+    float a = n.y*n.y + n.z*n.z;
+    float k = 1.f/sqrt(a); // normalizes p
+    p[0].x = 0;
+	p[0].y = -n.z*k;
+	p[0].z = n.y*k;
+    // set q = n x p
+    q[0].x = a*k;
+	q[0].y = -n.x*p[0].z;
+	q[0].z = n.x*p[0].y;
+  }
+  else {
+    // choose p in x-y plane
+    float a = n.x*n.x + n.y*n.y;
+    float k = 1.f/sqrt(a); // normalizes p
+    p[0].x = -n.y*k;
+	p[0].y = n.x*k;
+	p[0].z = 0;
+    // set q = n x p
+    q[0].x = -n.z*p[0].y;
+	q[0].y = n.z*p[0].x;
+	q[0].z = a*k;
+  }
+void setLinearAndAngular( b3Float4ConstArg n, b3Float4ConstArg r0, b3Float4ConstArg r1, b3Float4* linear, b3Float4* angular0, b3Float4* angular1) // fills the three outputs from direction n and lever arms r0, r1
+	*linear = b3MakeFloat4(n.x,n.y,n.z,0.f); // copy of n with w forced to 0
+	*angular0 = b3Cross3(r0, n);
+	*angular1 = -b3Cross3(r1, n); // opposite sign for the second body
+float calcRelVel( b3Float4ConstArg l0, b3Float4ConstArg l1, b3Float4ConstArg a0, b3Float4ConstArg a1, b3Float4ConstArg linVel0,
+	b3Float4ConstArg angVel0, b3Float4ConstArg linVel1, b3Float4ConstArg angVel1 ) // projects both bodies' velocities onto a constraint row
+	return b3Dot3F4(l0, linVel0) + b3Dot3F4(a0, angVel0) + b3Dot3F4(l1, linVel1) + b3Dot3F4(a1, angVel1); // sum of four dot products
+float calcJacCoeff(b3Float4ConstArg linear0, b3Float4ConstArg linear1, b3Float4ConstArg angular0, b3Float4ConstArg angular1,
+					float invMass0, const b3Mat3x3* invInertia0, float invMass1, const b3Mat3x3* invInertia1) // returns -1 over the sum of the four inverse-mass terms of a row
+	//	linear0,1 are normalized, so the linear terms reduce to the inverse masses
+	float jmj0 = invMass0;//b3Dot3F4(linear0, linear0)*invMass0;
+	float jmj1 = b3Dot3F4(mtMul3(angular0,*invInertia0), angular0);
+	float jmj2 = invMass1;//b3Dot3F4(linear1, linear1)*invMass1;
+	float jmj3 = b3Dot3F4(mtMul3(angular1,*invInertia1), angular1);
+	return -1.f/(jmj0+jmj1+jmj2+jmj3); // NOTE(review): no guard against a zero denominator
+void setConstraint4( b3Float4ConstArg posA, b3Float4ConstArg linVelA, b3Float4ConstArg angVelA, float invMassA, b3Mat3x3ConstArg invInertiaA,
+	b3Float4ConstArg posB, b3Float4ConstArg linVelB, b3Float4ConstArg angVelB, float invMassB, b3Mat3x3ConstArg invInertiaB, 
+	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,
+	b3ContactConstraint4_t* dstC ) // converts one contact manifold (src) into solver constraint data (dstC)
+	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit); // abs() drops the sign carried alongside the index
+	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);
+	float dtInv = 1.f/dt;
+	for(int ic=0; ic<4; ic++)
+	{
+		dstC->m_appliedRambdaDt[ic] = 0.f; // start with no accumulated contact impulse
+	}
+	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f; // stays 0 unless the friction section below runs
+	dstC->m_linear = src->m_worldNormalOnB;
+	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );
+	for(int ic=0; ic<4; ic++)
+	{
+		b3Float4 r0 = src->m_worldPosB[ic] - posA;
+		b3Float4 r1 = src->m_worldPosB[ic] - posB;
+		if( ic >= src->m_worldNormalOnB.w )//npoints
+		{
+			dstC->m_jacCoeffInv[ic] = 0.f; // unused point slots get a zero coefficient
+			continue;
+		}
+		float relVelN;
+		{
+			b3Float4 linear, angular0, angular1;
+			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);
+			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
+				invMassA, &invInertiaA, invMassB, &invInertiaB );
+			relVelN = calcRelVel(linear, -linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB);
+			float e = 0.f;//src->getRestituitionCoeff();
+			if( relVelN*relVelN < 0.004f ) e = 0.f; // no effect while e is hard-coded to 0 above
+			dstC->m_b[ic] = e*relVelN;
+			//float penetration = src->m_worldPosB[ic].w;
+			dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv; // bias from the w component of the contact point
+			dstC->m_appliedRambdaDt[ic] = 0.f;
+		}
+	}
+	if( src->m_worldNormalOnB.w > 0 )//npoints
+	{	//	prepare friction
+		b3Float4 center = b3MakeFloat4(0.f,0.f,0.f,0.f);
+		for(int i=0; i<src->m_worldNormalOnB.w; i++) 
+			center += src->m_worldPosB[i];
+		center /= (float)src->m_worldNormalOnB.w; // average of the contact points
+		b3Float4 tangent[2];
+		b3PlaneSpace1(src->m_worldNormalOnB,&tangent[0],&tangent[1]);
+		b3Float4 r[2];
+		r[0] = center - posA;
+		r[1] = center - posB;
+		for(int i=0; i<2; i++) // one friction row per tangent direction
+		{
+			b3Float4 linear, angular0, angular1;
+			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);
+			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
+				invMassA, &invInertiaA, invMassB, &invInertiaB );
+			dstC->m_fAppliedRambdaDt[i] = 0.f;
+		}
+		dstC->m_center = center;
+	}
+	for(int i=0; i<4; i++)
+	{
+		if( i<src->m_worldNormalOnB.w )
+		{
+			dstC->m_worldPos[i] = src->m_worldPosB[i];
+		}
+		else
+		{
+			dstC->m_worldPos[i] = b3MakeFloat4(0.f,0.f,0.f,0.f); // unused slots are zero-filled
+		}
+	}
diff --git a/src/bullet/Bullet3Dynamics/shared/b3Inertia.h b/src/bullet/Bullet3Dynamics/shared/b3Inertia.h
new file mode 100644
index 00000000..96fe9f8b
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/shared/b3Inertia.h
@@ -0,0 +1,15 @@
+#ifndef B3_INERTIA_H
+#define B3_INERTIA_H
+#include "Bullet3Common/shared/b3Mat3x3.h"
+struct b3Inertia // pair of 3x3 inverse inertia matrices for one body
+	b3Mat3x3 m_invInertiaWorld; // read by the contact/friction solver
+	b3Mat3x3 m_initInvInertia; // presumably the initial (local-space) inverse inertia — TODO confirm
+#endif //B3_INERTIA_H
\ No newline at end of file
diff --git a/src/bullet/Bullet3Dynamics/shared/b3IntegrateTransforms.h b/src/bullet/Bullet3Dynamics/shared/b3IntegrateTransforms.h
new file mode 100644
index 00000000..e96f90d3
--- /dev/null
+++ b/src/bullet/Bullet3Dynamics/shared/b3IntegrateTransforms.h
@@ -0,0 +1,113 @@
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+inline void integrateSingleTransform( __global b3RigidBodyData_t* bodies,int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration) // advances bodies[nodeID] by timeStep; bodies with zero inverse mass are left untouched
+	if (bodies[nodeID].m_invMass != 0.f)
+	{
+		float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f); // NOTE(review): not referenced again in this function
+		//angular velocity
+		{
+			b3Float4 axis;
+			//add some hardcoded angular damping (angularDamping is a multiplicative factor: 1 keeps the velocity, 0 zeroes it)
+			bodies[nodeID].m_angVel.x *= angularDamping;
+			bodies[nodeID].m_angVel.y *= angularDamping;
+			bodies[nodeID].m_angVel.z *= angularDamping;
+			b3Float4 angvel = bodies[nodeID].m_angVel;
+			float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel)); // magnitude of the angular velocity
+			//limit the angular motion
+			{
+			}
+			if(fAngle < 0.001f)
+			{
+				// use the Taylor expansion of sin(0.5*fAngle*timeStep)/fAngle to avoid dividing by a tiny fAngle
+				axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle); // 0.0208333... = 1/48
+			}
+			else
+			{
+				// exact factor: sin(0.5*fAngle*timeStep)/fAngle
+				axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);
+			}
+			b3Quat dorn; // incremental rotation over this step
+			dorn.x = axis.x;
+			dorn.y = axis.y;
+			dorn.z = axis.z;
+			dorn.w = b3Cos(fAngle * timeStep * 0.5f);
+			b3Quat orn0 = bodies[nodeID].m_quat;
+			b3Quat predictedOrn = b3QuatMul(dorn, orn0);
+			predictedOrn = b3QuatNormalized(predictedOrn); // renormalize the result
+			bodies[nodeID].m_quat=predictedOrn;
+		}
+		//linear velocity		
+		bodies[nodeID].m_pos +=  bodies[nodeID].m_linVel * timeStep; // position uses the velocity from BEFORE gravity is applied
+		//apply gravity
+		bodies[nodeID].m_linVel += gravityAcceleration * timeStep;
+	}
+inline void b3IntegrateTransform( __global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration) // advances *body by timeStep; bodies with zero inverse mass are left untouched
+	float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f); // NOTE(review): not referenced again in this function
+	if( (body->m_invMass != 0.f))
+	{
+		//angular velocity
+		{
+			b3Float4 axis;
+			//add some hardcoded angular damping (angularDamping is a multiplicative factor: 1 keeps the velocity, 0 zeroes it)
+			body->m_angVel.x *= angularDamping;
+			body->m_angVel.y *= angularDamping;
+			body->m_angVel.z *= angularDamping;
+			b3Float4 angvel = body->m_angVel;
+			float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel)); // magnitude of the angular velocity
+			//limit the angular motion
+			{
+			}
+			if(fAngle < 0.001f)
+			{
+				// use the Taylor expansion of sin(0.5*fAngle*timeStep)/fAngle to avoid dividing by a tiny fAngle
+				axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle); // 0.0208333... = 1/48
+			}
+			else
+			{
+				// exact factor: sin(0.5*fAngle*timeStep)/fAngle
+				axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);
+			}
+			b3Quat dorn; // incremental rotation over this step
+			dorn.x = axis.x;
+			dorn.y = axis.y;
+			dorn.z = axis.z;
+			dorn.w = b3Cos(fAngle * timeStep * 0.5f);
+			b3Quat orn0 = body->m_quat;
+			b3Quat predictedOrn = b3QuatMul(dorn, orn0);
+			predictedOrn = b3QuatNormalized(predictedOrn); // renormalize the result
+			body->m_quat=predictedOrn;
+		}
+		//apply gravity
+		body->m_linVel += gravityAcceleration * timeStep;
+		//linear velocity		
+		body->m_pos +=  body->m_linVel * timeStep; // position uses the velocity AFTER gravity is applied (opposite order to integrateSingleTransform)
+	}
diff --git a/src/bullet/Bullet3Geometry/b3AabbUtil.h b/src/bullet/Bullet3Geometry/b3AabbUtil.h
new file mode 100644
index 00000000..4c72d5bb
--- /dev/null
+++ b/src/bullet/Bullet3Geometry/b3AabbUtil.h
@@ -0,0 +1,232 @@
+Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_AABB_UTIL2
+#define B3_AABB_UTIL2
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3MinMax.h"
+/// Grows an aabb in place: expansionMin is added to aabbMin and
+/// expansionMax is added to aabbMax.
+B3_FORCE_INLINE void b3AabbExpand (b3Vector3& aabbMin,
+								   b3Vector3& aabbMax,
+								   const b3Vector3& expansionMin,
+								   const b3Vector3& expansionMax)
+{
+	aabbMin = aabbMin + expansionMin;
+	aabbMax = aabbMax + expansionMax;
+}
+/// Tests whether a point lies inside, or on the boundary of, an aabb.
+/// Returns false when the point is outside [aabbMin1, aabbMax1] on any axis.
+B3_FORCE_INLINE bool b3TestPointAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1,
+								const b3Vector3 &point)
+{
+	bool overlap = true;
+	overlap = (aabbMin1.getX() > point.getX() || aabbMax1.getX() < point.getX()) ? false : overlap;
+	overlap = (aabbMin1.getZ() > point.getZ() || aabbMax1.getZ() < point.getZ()) ? false : overlap;
+	overlap = (aabbMin1.getY() > point.getY() || aabbMax1.getY() < point.getY()) ? false : overlap;
+	return overlap;
+}
+/// Conservative test for overlap between two aabbs.
+/// Returns false when the boxes are separated on any axis; boxes that
+/// merely touch are reported as overlapping.
+B3_FORCE_INLINE bool b3TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1,
+								const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2)
+{
+	bool overlap = true;
+	overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap;
+	overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap;
+	overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap;
+	return overlap;
+}
+/// Conservative test for overlap between a triangle and an aabb.
+/// Only the triangle's per-axis extents are compared with the box, so a
+/// true result means the bounding boxes overlap, not necessarily the triangle.
+/// vertices must point to at least three b3Vector3 values.
+B3_FORCE_INLINE bool b3TestTriangleAgainstAabb2(const b3Vector3 *vertices,
+									const b3Vector3 &aabbMin, const b3Vector3 &aabbMax)
+{
+	const b3Vector3 &p1 = vertices[0];
+	const b3Vector3 &p2 = vertices[1];
+	const b3Vector3 &p3 = vertices[2];
+	if (b3Min(b3Min(p1[0], p2[0]), p3[0]) > aabbMax[0]) return false;
+	if (b3Max(b3Max(p1[0], p2[0]), p3[0]) < aabbMin[0]) return false;
+	if (b3Min(b3Min(p1[2], p2[2]), p3[2]) > aabbMax[2]) return false;
+	if (b3Max(b3Max(p1[2], p2[2]), p3[2]) < aabbMin[2]) return false;
+	if (b3Min(b3Min(p1[1], p2[1]), p3[1]) > aabbMax[1]) return false;
+	if (b3Max(b3Max(p1[1], p2[1]), p3[1]) < aabbMin[1]) return false;
+	return true;
+}
+/// Classifies point p against a box centred at the origin with the given
+/// half extents. The result is a bit mask:
+///   0x01: x < -hx   0x02: y < -hy   0x04: z < -hz
+///   0x08: x >  hx   0x10: y >  hy   0x20: z >  hz
+/// A result of 0 means the point is inside or on the box.
+B3_FORCE_INLINE int	b3Outcode(const b3Vector3& p,const b3Vector3& halfExtent) 
+{
+	return (p.getX()  < -halfExtent.getX() ? 0x01 : 0x0) |    
+		   (p.getX() >  halfExtent.getX() ? 0x08 : 0x0) |
+		   (p.getY() < -halfExtent.getY() ? 0x02 : 0x0) |    
+		   (p.getY() >  halfExtent.getY() ? 0x10 : 0x0) |
+		   (p.getZ() < -halfExtent.getZ() ? 0x4 : 0x0) |    
+		   (p.getZ() >  halfExtent.getZ() ? 0x20 : 0x0);
+}
+/// Slab test of a ray against a box using a precomputed inverse direction.
+/// raySign[i] selects which entry of bounds is used for the near plane on
+/// axis i (the other entry is used for the far plane).
+/// NOTE(review): bounds[0]/bounds[1] are presumably the min/max corners - confirm with callers.
+/// tmin receives the largest near-plane parameter computed so far, even
+/// when the function returns false.
+/// Returns true when the slab interval intersects (lambda_min, lambda_max).
+B3_FORCE_INLINE bool b3RayAabb2(const b3Vector3& rayFrom,
+								  const b3Vector3& rayInvDirection,
+								  const unsigned int raySign[3],
+								  const b3Vector3 bounds[2],
+								  b3Scalar& tmin,
+								  b3Scalar lambda_min,
+								  b3Scalar lambda_max)
+{
+	b3Scalar tmax, tymin, tymax, tzmin, tzmax;
+	tmin = (bounds[raySign[0]].getX() - rayFrom.getX()) * rayInvDirection.getX();
+	tmax = (bounds[1-raySign[0]].getX() - rayFrom.getX()) * rayInvDirection.getX();
+	tymin = (bounds[raySign[1]].getY() - rayFrom.getY()) * rayInvDirection.getY();
+	tymax = (bounds[1-raySign[1]].getY() - rayFrom.getY()) * rayInvDirection.getY();
+	// the x and y slab intervals do not intersect
+	if ( (tmin > tymax) || (tymin > tmax) )
+		return false;
+	if (tymin > tmin)
+		tmin = tymin;
+	if (tymax < tmax)
+		tmax = tymax;
+	tzmin = (bounds[raySign[2]].getZ() - rayFrom.getZ()) * rayInvDirection.getZ();
+	tzmax = (bounds[1-raySign[2]].getZ() - rayFrom.getZ()) * rayInvDirection.getZ();
+	// the combined x/y interval does not intersect the z slab interval
+	if ( (tmin > tzmax) || (tzmin > tmax) )
+		return false;
+	if (tzmin > tmin)
+		tmin = tzmin;
+	if (tzmax < tmax)
+		tmax = tzmax;
+	return ( (tmin < lambda_max) && (tmax > lambda_min) );
+}
+/// Clips the segment rayFrom -> rayTo against an aabb using outcodes.
+/// param is in/out: on entry it is the largest accepted fraction along the
+/// segment, on a hit it receives the entry fraction.
+/// normal is written only on a hit, with the axis-aligned vector recorded
+/// for the plane that produced the entry fraction.
+/// Returns true when an entry fraction not greater than the exit fraction exists.
+B3_FORCE_INLINE bool b3RayAabb(const b3Vector3& rayFrom, 
+								 const b3Vector3& rayTo, 
+								 const b3Vector3& aabbMin, 
+								 const b3Vector3& aabbMax,
+					  b3Scalar& param, b3Vector3& normal) 
+{
+	// work in the box's local frame (box centred at the origin)
+	b3Vector3 aabbHalfExtent = (aabbMax-aabbMin)* b3Scalar(0.5);
+	b3Vector3 aabbCenter = (aabbMax+aabbMin)* b3Scalar(0.5);
+	b3Vector3	source = rayFrom - aabbCenter;
+	b3Vector3	target = rayTo - aabbCenter;
+	int	sourceOutcode = b3Outcode(source,aabbHalfExtent);
+	int targetOutcode = b3Outcode(target,aabbHalfExtent);
+	// both end points outside the same plane: the segment cannot hit the box
+	if ((sourceOutcode & targetOutcode) == 0x0)
+	{
+		b3Scalar lambda_enter = b3Scalar(0.0);
+		b3Scalar lambda_exit  = param;
+		b3Vector3 r = target - source;
+		int i;
+		b3Scalar	normSign = 1;
+		b3Vector3	hitNormal = b3MakeVector3(0,0,0);
+		int bit=1;
+		// first pass walks outcode bits 0x01,0x02,0x04 (normSign = 1),
+		// second pass walks bits 0x08,0x10,0x20 (normSign = -1)
+		for (int j=0;j<2;j++)
+		{
+			for (i = 0; i != 3; ++i)
+			{
+				if (sourceOutcode & bit)
+				{
+					// source is outside this plane: candidate entry fraction
+					b3Scalar lambda = (-source[i] - aabbHalfExtent[i]*normSign) / r[i];
+					if (lambda_enter <= lambda)
+					{
+						lambda_enter = lambda;
+						hitNormal.setValue(0,0,0);
+						hitNormal[i] = normSign;
+					}
+				}
+				else if (targetOutcode & bit) 
+				{
+					// target is outside this plane: candidate exit fraction
+					b3Scalar lambda = (-source[i] - aabbHalfExtent[i]*normSign) / r[i];
+					b3SetMin(lambda_exit, lambda);
+				}
+				bit<<=1;
+			}
+			normSign = b3Scalar(-1.);
+		}
+		if (lambda_enter <= lambda_exit)
+		{
+			param = lambda_enter;
+			normal = hitNormal;
+			return true;
+		}
+	}
+	return false;
+}
+/// Computes the world-space aabb of a box given by its half extents.
+/// margin is added to every half extent, the extents are projected through
+/// the absolute value of the transform's basis, and the result is centred
+/// on the transform's origin.
+B3_FORCE_INLINE	void b3TransformAabb(const b3Vector3& halfExtents, b3Scalar margin,const b3Transform& t,b3Vector3& aabbMinOut,b3Vector3& aabbMaxOut)
+{
+	b3Vector3 halfExtentsWithMargin = halfExtents+b3MakeVector3(margin,margin,margin);
+	b3Matrix3x3 abs_b = t.getBasis().absolute();  
+	b3Vector3 center = t.getOrigin();
+    b3Vector3 extent = halfExtentsWithMargin.dot3( abs_b[0], abs_b[1], abs_b[2] );
+	aabbMinOut = center - extent;
+	aabbMaxOut = center + extent;
+}
+/// Computes the world-space aabb of a local aabb given by its min/max corners.
+/// The local half extents are enlarged by margin, the local centre is
+/// transformed by trans, and the extents are projected through the absolute
+/// value of the transform's basis.
+B3_FORCE_INLINE	void b3TransformAabb(const b3Vector3& localAabbMin,const b3Vector3& localAabbMax, b3Scalar margin,const b3Transform& trans,b3Vector3& aabbMinOut,b3Vector3& aabbMaxOut)
+{
+		//b3Assert(localAabbMin.getX() <= localAabbMax.getX());
+		//b3Assert(localAabbMin.getY() <= localAabbMax.getY());
+		//b3Assert(localAabbMin.getZ() <= localAabbMax.getZ());
+		b3Vector3 localHalfExtents = b3Scalar(0.5)*(localAabbMax-localAabbMin);
+		localHalfExtents+=b3MakeVector3(margin,margin,margin);
+		b3Vector3 localCenter = b3Scalar(0.5)*(localAabbMax+localAabbMin);
+		b3Matrix3x3 abs_b = trans.getBasis().absolute();  
+		b3Vector3 center = trans(localCenter);
+        b3Vector3 extent = localHalfExtents.dot3( abs_b[0], abs_b[1], abs_b[2] );
+		aabbMinOut = center-extent;
+		aabbMaxOut = center+extent;
+}
+// Selects the branch-free implementation below. The macro name (including
+// its spelling) is kept as-is because other code may test it.
+#define B3_USE_BANCHLESS 1
+#ifdef B3_USE_BANCHLESS
+	//This block replaces the block below and uses no branches, and replaces the 8 bit return with a 32 bit return for improved performance (~3x on XBox 360)
+	// Overlap test for two aabbs with quantized (unsigned short) coordinates.
+	// Each pointer must reference at least three values (x, y, z).
+	// Returns 1 when the boxes overlap or touch on all three axes, 0 otherwise.
+	B3_FORCE_INLINE unsigned b3TestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2)
+	{		
+		return static_cast<unsigned int>(b3Select((unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= aabbMin2[0])
+			& (aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2])
+			& (aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1])),
+			1, 0));
+	}
+#else
+	// Branching variant of the same test; returns true when the boxes
+	// overlap or touch on all three axes.
+	B3_FORCE_INLINE bool b3TestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2)
+	{
+		bool overlap = true;
+		overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? false : overlap;
+		overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? false : overlap;
+		overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? false : overlap;
+		return overlap;
+	}
+#endif //B3_USE_BANCHLESS
+#endif //B3_AABB_UTIL2
diff --git a/src/bullet/Bullet3Geometry/b3ConvexHullComputer.cpp b/src/bullet/Bullet3Geometry/b3ConvexHullComputer.cpp
new file mode 100644
index 00000000..18835c38
--- /dev/null
+++ b/src/bullet/Bullet3Geometry/b3ConvexHullComputer.cpp
@@ -0,0 +1,2755 @@
+Copyright (c) 2011 Ole Kniemeyer, MAXON, www.maxon.net
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include <string.h>
+#include "b3ConvexHullComputer.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3MinMax.h"
+#include "Bullet3Common/b3Vector3.h"
+// Fixed-width integer aliases used by the exact-arithmetic hull code.
+// Exactly one of the three branches below must be active.
+#ifdef __GNUC__
+	#include <stdint.h>
+	typedef int32_t btInt32_t;
+	typedef int64_t btInt64_t;
+	typedef uint32_t btUint32_t;
+	typedef uint64_t btUint64_t;
+#elif defined(_MSC_VER)
+	typedef __int32 btInt32_t;
+	typedef __int64 btInt64_t;
+	typedef unsigned __int32 btUint32_t;
+	typedef unsigned __int64 btUint64_t;
+#else
+	// fallback: assumes int is 32 bits and long long is 64 bits on this compiler
+	typedef int btInt32_t;
+	typedef long long int btInt64_t;
+	typedef unsigned int btUint32_t;
+	typedef unsigned long long int btUint64_t;
+#endif
+//The definition of USE_X86_64_ASM is moved into the build system. You can enable it manually by uncommenting the following lines
+//#if (defined(__GNUC__) && defined(__x86_64__) && !defined(__ICL))  // || (defined(__ICL) && defined(_M_X64))   bug in Intel compiler, disable inline assembly
+//	#define USE_X86_64_ASM
+//#endif
+#if defined(DEBUG_CONVEX_HULL) || defined(SHOW_ITERATIONS)
+	#include <stdio.h>
+#endif
+// Convex hull implementation based on Preparata and Hong
+// Ole Kniemeyer, MAXON Computer GmbH
+class b3ConvexHullInternal
+	public:
+		// Three-component vector with 64-bit signed integer coordinates.
+		class Point64
+		{
+			public:
+				btInt64_t x;
+				btInt64_t y;
+				btInt64_t z;
+				Point64(btInt64_t x, btInt64_t y, btInt64_t z): x(x), y(y), z(z)
+				{
+				}
+				// True when all three coordinates are zero. Const-qualified so it
+				// can be called on const points and on temporaries bound to const references.
+				bool isZero() const
+				{
+					return (x == 0) && (y == 0) && (z == 0);
+				}
+				// Dot product in 64-bit arithmetic; the caller must keep the
+				// operands small enough that the sum does not overflow.
+				btInt64_t dot(const Point64& b) const
+				{
+					return x * b.x + y * b.y + z * b.z;
+				}
+		};
+		// Three-component vector with 32-bit signed integer coordinates plus an
+		// index tag. Products are formed in 64-bit arithmetic so that cross()
+		// and dot() cannot overflow in the 32-bit intermediate.
+		class Point32
+		{
+			public:
+				btInt32_t x;
+				btInt32_t y;
+				btInt32_t z;
+				int index;
+				// Leaves all members uninitialized.
+				Point32()
+				{
+				}
+				// Sets the coordinates and tags the point with index -1.
+				Point32(btInt32_t x, btInt32_t y, btInt32_t z): x(x), y(y), z(z), index(-1)
+				{
+				}
+				// Equality compares coordinates only; index is ignored.
+				bool operator==(const Point32& b) const
+				{
+					return (x == b.x) && (y == b.y) && (z == b.z);
+				}
+				bool operator!=(const Point32& b) const
+				{
+					return (x != b.x) || (y != b.y) || (z != b.z);
+				}
+				bool isZero() const
+				{
+					return (x == 0) && (y == 0) && (z == 0);
+				}
+				// Cross product; each operand is widened to 64 bits before
+				// multiplying so the product is not truncated to 32 bits.
+				Point64 cross(const Point32& b) const
+				{
+					return Point64((btInt64_t) y * b.z - (btInt64_t) z * b.y,
+								   (btInt64_t) z * b.x - (btInt64_t) x * b.z,
+								   (btInt64_t) x * b.y - (btInt64_t) y * b.x);
+				}
+				// Cross product with a 64-bit vector (already evaluated in 64 bits).
+				Point64 cross(const Point64& b) const
+				{
+					return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x);
+				}
+				// Dot product; operands are widened to 64 bits before multiplying.
+				btInt64_t dot(const Point32& b) const
+				{
+					return (btInt64_t) x * b.x + (btInt64_t) y * b.y + (btInt64_t) z * b.z;
+				}
+				btInt64_t dot(const Point64& b) const
+				{
+					return x * b.x + y * b.y + z * b.z;
+				}
+				// Component-wise sum; the result carries index -1.
+				Point32 operator+(const Point32& b) const
+				{
+					return Point32(x + b.x, y + b.y, z + b.z);
+				}
+				// Component-wise difference; the result carries index -1.
+				Point32 operator-(const Point32& b) const
+				{
+					return Point32(x - b.x, y - b.y, z - b.z);
+				}
+		};
+		// 128-bit integer stored as two 64-bit words. The value is treated as
+		// two's complement by operator-(), getSign() and toScalar(); operator<
+		// and ucmp() compare the words as unsigned quantities.
+		class Int128
+		{
+			public:
+				btUint64_t low;
+				btUint64_t high;
+				// Leaves both words uninitialized.
+				Int128()
+				{
+				}
+				Int128(btUint64_t low, btUint64_t high): low(low), high(high)
+				{
+				}
+				// Zero-extends an unsigned 64-bit value.
+				Int128(btUint64_t low): low(low), high(0)
+				{
+				}
+				// Sign-extends a signed 64-bit value.
+				Int128(btInt64_t value): low(value), high((value >= 0) ? 0 : (btUint64_t) -1LL)
+				{
+				}
+				static Int128 mul(btInt64_t a, btInt64_t b);
+				static Int128 mul(btUint64_t a, btUint64_t b);
+				// Two's complement negation: the high word receives the carry
+				// only when the low word is zero.
+				Int128 operator-() const
+				{
+					return Int128((btUint64_t) -(btInt64_t)low, ~high + (low == 0));
+				}
+				Int128 operator+(const Int128& b) const
+				{
+#ifdef USE_X86_64_ASM
+					Int128 result;
+					__asm__ ("addq %[bl], %[rl]\n\t"
+									 "adcq %[bh], %[rh]\n\t"
+									 : [rl] "=r" (result.low), [rh] "=r" (result.high)
+									 : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
+									 : "cc" );
+					return result;
+#else
+					// portable path: carry out of the low word is (lo < low)
+					btUint64_t lo = low + b.low;
+					return Int128(lo, high + b.high + (lo < low));
+#endif
+				}
+				Int128 operator-(const Int128& b) const
+				{
+#ifdef USE_X86_64_ASM
+					Int128 result;
+					__asm__ ("subq %[bl], %[rl]\n\t"
+									 "sbbq %[bh], %[rh]\n\t"
+									 : [rl] "=r" (result.low), [rh] "=r" (result.high)
+									 : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
+									 : "cc" );
+					return result;
+#else
+					return *this + -b;
+#endif
+				}
+				Int128& operator+=(const Int128& b)
+				{
+#ifdef USE_X86_64_ASM
+					__asm__ ("addq %[bl], %[rl]\n\t"
+									 "adcq %[bh], %[rh]\n\t"
+									 : [rl] "=r" (low), [rh] "=r" (high)
+									 : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
+									 : "cc" );
+#else
+					btUint64_t lo = low + b.low;
+					if (lo < low)
+					{
+						++high;
+					}
+					low = lo;
+					high += b.high;
+#endif
+					return *this;
+				}
+				Int128& operator++()
+				{
+					if (++low == 0)
+					{
+						++high;
+					}
+					return *this;
+				}
+				Int128 operator*(btInt64_t b) const;
+				// Converts to floating point; negative values are converted via
+				// their negation.
+				b3Scalar toScalar() const
+				{
+					return ((btInt64_t) high >= 0) ? b3Scalar(high) * (b3Scalar(0x100000000LL) * b3Scalar(0x100000000LL)) + b3Scalar(low)
+						: -(-*this).toScalar();
+				}
+				// Returns -1, 0 or 1 according to the two's complement sign.
+				int getSign() const
+				{
+					return ((btInt64_t) high < 0) ? -1 : (high || low) ? 1 : 0;
+				}
+				// Unsigned ordering: both words are compared as unsigned values.
+				bool operator<(const Int128& b) const
+				{
+					return (high < b.high) || ((high == b.high) && (low < b.low));
+				}
+				// Unsigned three-way comparison: -1, 0 or 1.
+				int ucmp(const Int128&b) const
+				{
+					if (high < b.high)
+					{
+						return -1;
+					}
+					if (high > b.high)
+					{
+						return 1;
+					}
+					if (low < b.low)
+					{
+						return -1;
+					}
+					if (low > b.low)
+					{
+						return 1;
+					}
+					return 0;
+				}
+		};
+		// Rational number stored as sign, unsigned numerator and unsigned
+		// denominator. A zero denominator encodes +/- infinity (sign != 0) or
+		// NaN (sign == 0).
+		class Rational64
+		{
+			private:
+				btUint64_t m_numerator;
+				btUint64_t m_denominator;
+				int sign;
+			public:
+				// Builds numerator/denominator. Magnitudes are computed in
+				// unsigned arithmetic so that the most negative 64-bit value is
+				// handled without signed overflow.
+				Rational64(btInt64_t numerator, btInt64_t denominator)
+				{
+					if (numerator > 0)
+					{
+						sign = 1;
+						m_numerator = (btUint64_t) numerator;
+					}
+					else if (numerator < 0)
+					{
+						sign = -1;
+						m_numerator = (btUint64_t) 0 - (btUint64_t) numerator;
+					}
+					else
+					{
+						sign = 0;
+						m_numerator = 0;
+					}
+					if (denominator > 0)
+					{
+						m_denominator = (btUint64_t) denominator;
+					}
+					else if (denominator < 0)
+					{
+						// a negative denominator flips the overall sign
+						sign = -sign;
+						m_denominator = (btUint64_t) 0 - (btUint64_t) denominator;
+					}
+					else
+					{
+						m_denominator = 0;
+					}
+				}
+				bool isNegativeInfinity() const
+				{
+					return (sign < 0) && (m_denominator == 0);
+				}
+				bool isNaN() const
+				{
+					return (sign == 0) && (m_denominator == 0);
+				}
+				// Three-way comparison: negative, zero or positive result.
+				int compare(const Rational64& b) const;
+				// Floating-point approximation; a zero denominator yields
+				// sign * B3_INFINITY.
+				b3Scalar toScalar() const
+				{
+					return sign * ((m_denominator == 0) ? B3_INFINITY : (b3Scalar) m_numerator / m_denominator);
+				}
+		};
+		// Rational number with 128-bit magnitude parts: sign is kept separately
+		// and numerator/denominator hold absolute values. isInt64 records that
+		// the value was built from a plain 64-bit integer (denominator 1).
+		class Rational128
+		{
+			private:
+				Int128 numerator;
+				Int128 denominator;
+				int sign;
+				bool isInt64;
+			public:
+				// Wraps a 64-bit integer as value/1.
+				Rational128(btInt64_t value): isInt64(true)
+				{
+					sign = (value > 0) ? 1 : ((value < 0) ? -1 : 0);
+					if (sign > 0)
+					{
+						numerator = value;
+					}
+					else if (sign < 0)
+					{
+						numerator = -value;
+					}
+					else
+					{
+						numerator = (btUint64_t) 0;
+					}
+					denominator = (btUint64_t) 1;
+				}
+				// Builds num/den from signed 128-bit parts; the stored parts are
+				// made non-negative and a negative den flips the sign.
+				Rational128(const Int128& num, const Int128& den): isInt64(false)
+				{
+					const int numSign = num.getSign();
+					const int denSign = den.getSign();
+					numerator = (numSign >= 0) ? num : -num;
+					denominator = (denSign >= 0) ? den : -den;
+					sign = (denSign >= 0) ? numSign : -numSign;
+				}
+				int compare(const Rational128& b) const;
+				int compare(btInt64_t b) const;
+				// Floating-point approximation; a zero denominator yields
+				// sign * B3_INFINITY.
+				b3Scalar toScalar() const
+				{
+					return sign * ((denominator.getSign() == 0) ? B3_INFINITY : numerator.toScalar() / denominator.toScalar());
+				}
+		};
+		// Point with rational coordinates: x, y and z share one denominator.
+		class PointR128
+		{
+			public:
+				Int128 x;
+				Int128 y;
+				Int128 z;
+				Int128 denominator;
+				// Leaves all members in their default-constructed state.
+				PointR128()
+				{
+				}
+				PointR128(Int128 px, Int128 py, Int128 pz, Int128 den): x(px), y(py), z(pz), denominator(den)
+				{
+				}
+				// Floating-point value of x / denominator.
+				b3Scalar xvalue() const
+				{
+					const b3Scalar num = x.toScalar();
+					return num / denominator.toScalar();
+				}
+				// Floating-point value of y / denominator.
+				b3Scalar yvalue() const
+				{
+					const b3Scalar num = y.toScalar();
+					return num / denominator.toScalar();
+				}
+				// Floating-point value of z / denominator.
+				b3Scalar zvalue() const
+				{
+					const b3Scalar num = z.toScalar();
+					return num / denominator.toScalar();
+				}
+		};
+		class Edge;
+		class Face;
+		// Hull vertex. Vertices are chained through next/prev, own a list of
+		// outgoing edges, and keep a singly linked list of faces
+		// (firstNearbyFace .. lastNearbyFace) that reference this vertex as
+		// their nearbyVertex.
+		class Vertex
+		{
+			public:
+				Vertex* next;
+				Vertex* prev;
+				Edge* edges;
+				Face* firstNearbyFace;
+				Face* lastNearbyFace;
+				// rational position, used when point.index < 0
+				PointR128 point128;
+				// integer position, used when point.index >= 0
+				// NOTE(review): index >= 0 presumably marks an original input point - confirm
+				Point32 point;
+				int copy;
+				// point and point128 are not initialized here
+				Vertex(): next(NULL), prev(NULL), edges(NULL), firstNearbyFace(NULL), lastNearbyFace(NULL), copy(-1)
+				{
+				}
+				void print()
+				{
+					b3Printf("V%d (%d, %d, %d)", point.index, point.x, point.y, point.z);
+				}
+				void printGraph();
+				// Difference of the integer positions only (point128 is ignored).
+				Point32 operator-(const Vertex& b) const
+				{
+					return point - b.point;
+				}
+				// Dot product of this vertex's position with b, as an exact rational.
+				Rational128 dot(const Point64& b) const
+				{
+					return (point.index >= 0) ? Rational128(point.dot(b))
+						: Rational128(point128.x * b.x + point128.y * b.y + point128.z * b.z, point128.denominator);
+				}
+				b3Scalar xvalue() const
+				{
+					return (point.index >= 0) ? b3Scalar(point.x) : point128.xvalue();
+				}
+				b3Scalar yvalue() const
+				{
+					return (point.index >= 0) ? b3Scalar(point.y) : point128.yvalue();
+				}
+				b3Scalar zvalue() const
+				{
+					return (point.index >= 0) ? b3Scalar(point.z) : point128.zvalue();
+				}
+				// Moves every face in src's nearby-face list to the end of this
+				// vertex's list, retargets their nearbyVertex to this vertex and
+				// leaves src with an empty list.
+				void receiveNearbyFaces(Vertex* src)
+				{
+					if (lastNearbyFace)
+					{
+						lastNearbyFace->nextWithSameNearbyVertex = src->firstNearbyFace;
+					}
+					else
+					{
+						firstNearbyFace = src->firstNearbyFace;
+					}
+					// keep the current tail when src has no faces
+					if (src->lastNearbyFace)
+					{
+						lastNearbyFace = src->lastNearbyFace;
+					}
+					for (Face* f = src->firstNearbyFace; f; f = f->nextWithSameNearbyVertex)
+					{
+						b3Assert(f->nearbyVertex == src);
+						f->nearbyVertex = this;
+					}
+					src->firstNearbyFace = NULL;
+					src->lastNearbyFace = NULL;
+				}
+		};
+		// Directed half-edge. Edges come in pairs linked through reverse; the
+		// origin of an edge is reverse->target. next/prev link edges that share
+		// the same origin (see the assertion in link()).
+		class Edge
+		{
+			public:
+				Edge* next;
+				Edge* prev;
+				Edge* reverse;
+				Vertex* target;
+				Face* face;
+				int copy;
+				// Clears all links so stale pointers are not left in a freed edge.
+				~Edge()
+				{
+					next = NULL;
+					prev = NULL;
+					reverse = NULL;
+					target = NULL;
+					face = NULL;
+				}
+				// Makes n the successor of this edge; both must share an origin.
+				void link(Edge* n)
+				{
+					b3Assert(reverse->target == n->reverse->target);
+					next = n;
+					n->prev = this;
+				}
+				// Debug output: edge address, end point indices, neighbours and coordinates.
+				void print()
+				{
+					b3Printf("E%p : %d -> %d,  n=%p p=%p   (0 %d\t%d\t%d) -> (%d %d %d)", this, reverse->target->point.index, target->point.index, next, prev,
+								 reverse->target->point.x, reverse->target->point.y, reverse->target->point.z, target->point.x, target->point.y, target->point.z);
+				}
+		};
+		// Planar face described by an origin point and two direction vectors.
+		// Each face is also a node in the nearby-face list of one vertex.
+		class Face
+		{
+			public:
+				Face* next;
+				Vertex* nearbyVertex;
+				Face* nextWithSameNearbyVertex;
+				Point32 origin;
+				Point32 dir0;
+				Point32 dir1;
+				// origin, dir0 and dir1 are set later by init()
+				Face(): next(NULL), nearbyVertex(NULL), nextWithSameNearbyVertex(NULL)
+				{
+				}
+				// Defines the face from vertices a, b, c (origin a, directions
+				// b-a and c-a) and appends it to a's nearby-face list.
+				void init(Vertex* a, Vertex* b, Vertex* c)
+				{
+					nearbyVertex = a;
+					origin = a->point;
+					dir0 = *b - *a;
+					dir1 = *c - *a;
+					Face* tail = a->lastNearbyFace;
+					if (!tail)
+					{
+						// list was empty: this face becomes its head
+						a->firstNearbyFace = this;
+					}
+					else
+					{
+						tail->nextWithSameNearbyVertex = this;
+					}
+					a->lastNearbyFace = this;
+				}
+				// Normal vector dir0 x dir1 (not normalized).
+				Point64 getNormal()
+				{
+					return dir0.cross(dir1);
+				}
+		};
+		// Double-width multiplication helper: multiplies two UWord values and
+		// returns the full product as a low and a high UWord. The private
+		// overloads provide the half-word operations for UWord = btUint64_t
+		// (32-bit halves) and UWord = Int128 (64-bit halves).
+		template<typename UWord, typename UHWord> class DMul
+		{
+			private:
+				static btUint32_t high(btUint64_t value)
+				{
+					return (btUint32_t) (value >> 32);
+				}
+				static btUint32_t low(btUint64_t value)
+				{
+					return (btUint32_t) value;
+				}
+				static btUint64_t mul(btUint32_t a, btUint32_t b)
+				{
+					return (btUint64_t) a * (btUint64_t) b;
+				}
+				// Moves the low half of value into its high half.
+				static void shlHalf(btUint64_t& value)
+				{
+					value <<= 32;
+				}
+				static btUint64_t high(Int128 value)
+				{
+					return value.high;
+				}
+				static btUint64_t low(Int128 value)
+				{
+					return value.low;
+				}
+				static Int128 mul(btUint64_t a, btUint64_t b)
+				{
+					return Int128::mul(a, b);
+				}
+				static void shlHalf(Int128& value)
+				{
+					value.high = value.low;
+					value.low = 0;
+				}
+			public:
+				// Computes a * b from four half-word partial products;
+				// resLow/resHigh receive the low and high words of the result.
+				static void mul(UWord a, UWord b, UWord& resLow, UWord& resHigh)
+				{
+					UWord p00 = mul(low(a), low(b));
+					UWord p01 = mul(low(a), high(b));
+					UWord p10 = mul(high(a), low(b));
+					UWord p11 = mul(high(a), high(b));
+					// sum of the low halves of the two cross products
+					UWord p0110 = UWord(low(p01)) + UWord(low(p10));
+					p11 += high(p01);
+					p11 += high(p10);
+					p11 += high(p0110);
+					shlHalf(p0110);
+					p00 += p0110;
+					// unsigned wrap-around of the low word carries into the high word
+					if (p00 < p0110)
+					{
+						++p11;
+					}
+					resLow = p00;
+					resHigh = p11;
+				}
+		};
+	private:
+		// Partial hull produced during the recursive computation, identified by
+		// four extreme vertices.
+		// NOTE(review): the names suggest extremes ordered by (x, then y) and by
+		// (y, then x) - confirm against computeInternal().
+		class IntermediateHull
+		{
+			public:
+				Vertex* minXy;
+				Vertex* maxXy;
+				Vertex* minYx;
+				Vertex* maxYx;
+				// Starts as an empty hull (all extremes NULL).
+				IntermediateHull(): minXy(NULL), maxXy(NULL), minYx(NULL), maxYx(NULL)
+				{
+				}
+				void print();
+		};
+		// One fixed-size slab of raw, 16-byte aligned storage for objects of
+		// type T. Slabs are chained through next. T must expose a next pointer,
+		// which init() uses to thread the slots into a free list.
+		template <typename T> class PoolArray
+		{
+			private:
+				T* array;
+				int size;
+			public:
+				PoolArray<T>* next;
+				// Allocates storage for size objects; no T is constructed here.
+				PoolArray(int size): array((T*) b3AlignedAlloc(sizeof(T) * size, 16)), size(size), next(NULL)
+				{
+				}
+				~PoolArray()
+				{
+					b3AlignedFree(array);
+				}
+				// Links every slot to its successor (the last slot to NULL) and
+				// returns the first slot.
+				T* init()
+				{
+					for (int slot = 0; slot < size; ++slot)
+					{
+						const int following = slot + 1;
+						array[slot].next = (following < size) ? array + following : NULL;
+					}
+					return array;
+				}
+		};
+		// Free-list allocator for objects of type T, backed by a chain of
+		// PoolArray slabs. Freed objects are reused before new slabs are taken.
+		template <typename T> class Pool
+		{
+			private:
+				// all slabs owned by the pool
+				PoolArray<T>* arrays;
+				// next slab that can be handed out again after reset()
+				PoolArray<T>* nextArray;
+				// head of the free list of unused slots
+				T* freeObjects;
+				// number of objects per newly allocated slab
+				int arraySize;
+			public:
+				Pool(): arrays(NULL), nextArray(NULL), freeObjects(NULL), arraySize(256)
+				{
+				}
+				// Destroys and frees every slab.
+				~Pool()
+				{
+					while (arrays)
+					{
+						PoolArray<T>* p = arrays;
+						arrays = p->next;
+						p->~PoolArray<T>();
+						b3AlignedFree(p);
+					}
+				}
+				// Makes all existing slabs available for reuse and drops the
+				// free list; no memory is released and no destructors are run.
+				void reset()
+				{
+					nextArray = arrays;
+					freeObjects = NULL;
+				}
+				// Affects only slabs allocated after this call.
+				void setArraySize(int arraySize)
+				{
+					this->arraySize = arraySize;
+				}
+				// Returns a default-constructed T taken from the free list,
+				// refilling the list from the next reusable slab or from a newly
+				// allocated slab when it is empty.
+				T* newObject()
+				{
+					T* o = freeObjects;
+					if (!o)
+					{
+						PoolArray<T>* p = nextArray;
+						if (p)
+						{
+							nextArray = p->next;
+						}
+						else
+						{
+							// placement-new a slab header into aligned storage and push it on the slab list
+							p = new(b3AlignedAlloc(sizeof(PoolArray<T>), 16)) PoolArray<T>(arraySize);
+							p->next = arrays;
+							arrays = p;
+						}
+						o = p->init();
+					}
+					freeObjects = o->next;
+					return new(o) T();
+				};
+				// Runs the destructor and pushes the slot back on the free list.
+				void freeObject(T* object)
+				{
+					object->~T();
+					object->next = freeObjects;
+					freeObjects = object;
+				}
+		};
+		b3Vector3 scaling;
+		b3Vector3 center;
+		// object pools for the hull's vertices, edges and faces
+		Pool<Vertex> vertexPool;
+		Pool<Edge> edgePool;
+		Pool<Face> facePool;
+		b3AlignedObjectArray<Vertex*> originalVertices;
+		// stamp copied into Edge::copy by newEdgePair()
+		int mergeStamp;
+		int minAxis;
+		int medAxis;
+		int maxAxis;
+		// number of edge pairs currently allocated (newEdgePair increments, removeEdgePair decrements)
+		int usedEdgePairs;
+		// high-water mark of usedEdgePairs
+		int maxUsedEdgePairs;
+		static Orientation getOrientation(const Edge* prev, const Edge* next, const Point32& s, const Point32& t);
+		Edge* findMaxAngle(bool ccw, const Vertex* start, const Point32& s, const Point64& rxs, const Point64& sxrxs, Rational64& minCot);
+		void findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge*& e0, Edge*& e1, Vertex* stop0, Vertex* stop1);
+		// Allocates a pair of mutually reversed edges between from and to; returns the edge targeting to.
+		Edge* newEdgePair(Vertex* from, Vertex* to);
+		// Unlinks an edge and its reverse from the edge rings of their origin
+		// vertices, returns both to the edge pool and decrements usedEdgePairs.
+		// A vertex whose ring becomes empty gets edges == NULL; otherwise its
+		// edges pointer is moved to the removed edge's successor.
+		void removeEdgePair(Edge* edge)
+		{
+			Edge* twin = edge->reverse;
+			b3Assert(edge->target && twin->target);
+			// detach edge from the ring around its origin (twin->target)
+			Edge* successor = edge->next;
+			if (successor == edge)
+			{
+				twin->target->edges = NULL;
+			}
+			else
+			{
+				successor->prev = edge->prev;
+				edge->prev->next = successor;
+				twin->target->edges = successor;
+			}
+			// detach twin from the ring around its origin (edge->target)
+			successor = twin->next;
+			if (successor == twin)
+			{
+				edge->target->edges = NULL;
+			}
+			else
+			{
+				successor->prev = twin->prev;
+				twin->prev->next = successor;
+				edge->target->edges = successor;
+			}
+			edgePool.freeObject(edge);
+			edgePool.freeObject(twin);
+			usedEdgePairs--;
+		}
+		void computeInternal(int start, int end, IntermediateHull& result);
+		bool mergeProjection(IntermediateHull& h0, IntermediateHull& h1, Vertex*& c0, Vertex*& c1);
+		void merge(IntermediateHull& h0, IntermediateHull& h1);
+		b3Vector3 toBtVector(const Point32& v);
+		b3Vector3 getBtNormal(Face* face);
+		// NOTE(review): stack is passed by value, so the array is copied on every call - confirm this is intended
+		bool shiftFace(Face* face, b3Scalar amount, b3AlignedObjectArray<Vertex*> stack);
+	public:
+		Vertex* vertexList;
+		void compute(const void* coords, bool doubleCoords, int stride, int count);
+		b3Vector3 getCoordinates(const Vertex* v);
+		b3Scalar shrink(b3Scalar amount, b3Scalar clampAmount);
+// Multiplies this signed 128-bit value by a signed 64-bit value and returns
+// the low 128 bits of the product. Magnitudes are multiplied as unsigned
+// values and the sign is applied afterwards; the magnitude of b is formed in
+// unsigned arithmetic so the most negative 64-bit value does not overflow.
+b3ConvexHullInternal::Int128 b3ConvexHullInternal::Int128::operator*(btInt64_t b) const
+{
+	bool negative = (btInt64_t) high < 0;
+	Int128 a = negative ? -*this : *this;
+	btUint64_t magnitude = (btUint64_t) b;
+	if (b < 0)
+	{
+		negative = !negative;
+		magnitude = (btUint64_t) 0 - magnitude;
+	}
+	Int128 result = mul(a.low, magnitude);
+	// the high word of a only contributes to the high word of the result
+	result.high += a.high * magnitude;
+	return negative ? -result : result;
+}
+// Full 128-bit product of two signed 64-bit values.
+// The portable path multiplies the magnitudes and negates the result when
+// the operand signs differ; magnitudes are formed in unsigned arithmetic so
+// the most negative 64-bit value does not overflow.
+b3ConvexHullInternal::Int128 b3ConvexHullInternal::Int128::mul(btInt64_t a, btInt64_t b)
+{
+	Int128 result;
+#ifdef USE_X86_64_ASM
+	__asm__ ("imulq %[b]"
+					 : "=a" (result.low), "=d" (result.high)
+					 : "0"(a), [b] "r"(b)
+					 : "cc" );
+	return result;
+#else
+	bool negative = a < 0;
+	btUint64_t ua = (btUint64_t) a;
+	if (negative)
+	{
+		ua = (btUint64_t) 0 - ua;
+	}
+	btUint64_t ub = (btUint64_t) b;
+	if (b < 0)
+	{
+		negative = !negative;
+		ub = (btUint64_t) 0 - ub;
+	}
+	DMul<btUint64_t, btUint32_t>::mul(ua, ub, result.low, result.high);
+	return negative ? -result : result;
+#endif
+}
+// Full 128-bit product of two unsigned 64-bit values.
+b3ConvexHullInternal::Int128 b3ConvexHullInternal::Int128::mul(btUint64_t a, btUint64_t b)
+{
+	Int128 result;
+#ifdef USE_X86_64_ASM
+	__asm__ ("mulq %[b]"
+					 : "=a" (result.low), "=d" (result.high)
+					 : "0"(a), [b] "r"(b)
+					 : "cc" );
+#else
+	DMul<btUint64_t, btUint32_t>::mul(a, b, result.low, result.high);
+#endif
+	return result;
+}
+// Three-way comparison of two rationals: the result is negative, zero or
+// positive when *this is less than, equal to or greater than b.
+// Values with different signs are ordered by sign alone; otherwise the
+// cross products numerator*b.denominator and denominator*b.numerator are
+// compared exactly in 128 bits.
+int b3ConvexHullInternal::Rational64::compare(const Rational64& b) const
+{
+	if (sign != b.sign)
+	{
+		return sign - b.sign;
+	}
+	else if (sign == 0)
+	{
+		return 0;
+	}
+	//	return (numerator * b.denominator > b.numerator * denominator) ? sign : (numerator * b.denominator < b.numerator * denominator) ? -sign : 0;
+#ifdef USE_X86_64_ASM
+	int result;
+	btInt64_t tmp;
+	btInt64_t dummy;
+	// operands refer to the m_numerator/m_denominator members of both rationals
+	__asm__ ("mulq %[bn]\n\t"
+					 "movq %%rax, %[tmp]\n\t"
+					 "movq %%rdx, %%rbx\n\t"
+					 "movq %[tn], %%rax\n\t"
+					 "mulq %[bd]\n\t"
+					 "subq %[tmp], %%rax\n\t"
+					 "sbbq %%rbx, %%rdx\n\t" // rdx:rax contains 128-bit-difference "numerator*b.denominator - b.numerator*denominator"
+					 "setnsb %%bh\n\t" // bh=1 if difference is non-negative, bh=0 otherwise
+					 "orq %%rdx, %%rax\n\t"
+					 "setnzb %%bl\n\t" // bl=1 if difference is non-zero, bl=0 if it is zero
+					 "decb %%bh\n\t" // now bx=0x0000 if difference is zero, 0xff01 if it is negative, 0x0001 if it is positive (i.e., same sign as difference)
+					 "shll $16, %%ebx\n\t" // ebx has same sign as difference
+					 : "=&b"(result), [tmp] "=&r"(tmp), "=a"(dummy)
+					 : "a"(m_denominator), [bn] "g"(b.m_numerator), [tn] "g"(m_numerator), [bd] "g"(b.m_denominator)
+					 : "%rdx", "cc" );
+	return result ? result ^ sign // if sign is +1, only bit 0 of result is inverted, which does not change the sign of result (and cannot result in zero)
+																// if sign is -1, all bits of result are inverted, which changes the sign of result (and again cannot result in zero)
+								: 0;
+#else
+	return sign * Int128::mul(m_numerator, b.m_denominator).ucmp(Int128::mul(m_denominator, b.m_numerator));
+#endif
+}
+// Three-way comparison of two rationals: the result is negative, zero or
+// positive when *this is less than, equal to or greater than b.
+// Values with different signs are ordered by sign alone. Otherwise the two
+// cross products are formed as 256-bit values and compared word by word.
+int b3ConvexHullInternal::Rational128::compare(const Rational128& b) const
+{
+	if (sign != b.sign)
+	{
+		return sign - b.sign;
+	}
+	else if (sign == 0)
+	{
+		return 0;
+	}
+	if (isInt64)
+	{
+		// this is a plain 64-bit integer: delegate to the integer comparison of b and flip the result
+		return -b.compare(sign * (btInt64_t) numerator.low);
+	}
+	Int128 nbdLow, nbdHigh, dbnLow, dbnHigh;
+	DMul<Int128, btUint64_t>::mul(numerator, b.denominator, nbdLow, nbdHigh);
+	DMul<Int128, btUint64_t>::mul(denominator, b.numerator, dbnLow, dbnHigh);
+	int cmp = nbdHigh.ucmp(dbnHigh);
+	if (cmp)
+	{
+		return cmp * sign;
+	}
+	return nbdLow.ucmp(dbnLow) * sign;
+}
+// Three-way comparison against a 64-bit integer: the result is negative,
+// zero or positive when *this is less than, equal to or greater than b.
+int b3ConvexHullInternal::Rational128::compare(btInt64_t b) const
+{
+	if (isInt64)
+	{
+		// both sides are plain integers
+		btInt64_t a = sign * (btInt64_t) numerator.low;
+		return (a > b) ? 1 : (a < b) ? -1 : 0;
+	}
+	if (b > 0)
+	{
+		if (sign <= 0)
+		{
+			return -1;
+		}
+	}
+	else if (b < 0)
+	{
+		if (sign >= 0)
+		{
+			return 1;
+		}
+		// both negative: compare magnitudes below, sign flips the result
+		b = -b;
+	}
+	else
+	{
+		return sign;
+	}
+	// same sign on both sides: compare |numerator| with |denominator| * |b|
+	return numerator.ucmp(denominator * b) * sign;
+}
+// Allocates two mutually reversed edges between from and to, stamps both
+// with the current mergeStamp and updates the edge-pair counters.
+// Returns the edge whose target is to; its reverse targets from.
+// The next/prev links of the new edges are not set here.
+b3ConvexHullInternal::Edge* b3ConvexHullInternal::newEdgePair(Vertex* from, Vertex* to)
+{
+	b3Assert(from && to);
+	Edge* e = edgePool.newObject();
+	Edge* r = edgePool.newObject();
+	e->reverse = r;
+	r->reverse = e;
+	e->copy = mergeStamp;
+	r->copy = mergeStamp;
+	e->target = to;
+	r->target = from;
+	e->face = NULL;
+	r->face = NULL;
+	usedEdgePairs++;
+	// track the high-water mark of simultaneously used edge pairs
+	if (usedEdgePairs > maxUsedEdgePairs)
+	{
+		maxUsedEdgePairs = usedEdgePairs;
+	}
+	return e;
+}
+bool b3ConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHull& h1, Vertex*& c0, Vertex*& c1) // Joins the next/prev vertex rings of h0 and h1 into one ring stored in h0 and outputs a connecting vertex pair in c0/c1; returns false if h1's ring is a single vertex with the same (x, y) as h0.maxYx.
+	Vertex* v0 = h0.maxYx;
+	Vertex* v1 = h1.minYx;
+	if ((v0->point.x == v1->point.x) && (v0->point.y == v1->point.y)) // the two hulls touch at a point with identical x and y
+	{
+		b3Assert(v0->point.z < v1->point.z);
+		Vertex* v1p = v1->prev;
+		if (v1p == v1) // h1's ring consists of v1 only
+		{
+			c0 = v0;
+			if (v1->edges)
+			{
+				b3Assert(v1->edges->next == v1->edges);
+				v1 = v1->edges->target; // use the other end of v1's only edge
+				b3Assert(v1->edges->next == v1->edges);
+			}
+			c1 = v1;
+			return false; // rings are left unchanged in this case
+		}
+		Vertex* v1n = v1->next;
+		v1p->next = v1n; // unlink v1 from h1's ring
+		v1n->prev = v1p;
+		if (v1 == h1.minXy) // v1 was an extreme vertex: replace it by the better of its two neighbours
+		{
+			if ((v1n->point.x < v1p->point.x) || ((v1n->point.x == v1p->point.x) && (v1n->point.y < v1p->point.y)))
+			{
+				h1.minXy = v1n;
+			}
+			else
+			{
+				h1.minXy = v1p;
+			}
+		}
+		if (v1 == h1.maxXy)
+		{
+			if ((v1n->point.x > v1p->point.x) || ((v1n->point.x == v1p->point.x) && (v1n->point.y > v1p->point.y)))
+			{
+				h1.maxXy = v1n;
+			}
+			else
+			{
+				h1.maxXy = v1p;
+			}
+		}
+	}
+	v0 = h0.maxXy;
+	v1 = h1.maxXy;
+	Vertex* v00 = NULL; // connecting pair found in the first pass (side 0)
+	Vertex* v10 = NULL;
+	btInt32_t sign = 1; // +1 for side 0, -1 for side 1; mirrors the x differences
+	for (int side = 0; side <= 1; side++) // side 0 starts at the maxXy vertices, side 1 at the minXy vertices
+	{		
+		btInt32_t dx = (v1->point.x - v0->point.x) * sign;
+		if (dx > 0)
+		{
+			while (true) // alternately advance v0 and v1 until neither can move
+			{
+				btInt32_t dy = v1->point.y - v0->point.y;
+				Vertex* w0 = side ? v0->next : v0->prev; // candidate successor of v0 (walk direction depends on side)
+				if (w0 != v0)
+				{
+					btInt32_t dx0 = (w0->point.x - v0->point.x) * sign;
+					btInt32_t dy0 = w0->point.y - v0->point.y;
+					if ((dy0 <= 0) && ((dx0 == 0) || ((dx0 < 0) && (dy0 * dx <= dy * dx0)))) // slope test done by cross-multiplication, no division
+					{
+						v0 = w0;
+						dx = (v1->point.x - v0->point.x) * sign;
+						continue;
+					}
+				}
+				Vertex* w1 = side ? v1->next : v1->prev;
+				if (w1 != v1)
+				{
+					btInt32_t dx1 = (w1->point.x - v1->point.x) * sign;
+					btInt32_t dy1 = w1->point.y - v1->point.y;
+					btInt32_t dxn = (w1->point.x - v0->point.x) * sign; // dx that would result from moving v1 to w1
+					if ((dxn > 0) && (dy1 < 0) && ((dx1 == 0) || ((dx1 < 0) && (dy1 * dx < dy * dx1))))
+					{
+						v1 = w1;
+						dx = dxn;
+						continue;
+					}
+				}
+				break;
+			}
+		}
+		else if (dx < 0) // same search with the roles of v0 and v1 and the walk directions swapped
+		{
+			while (true)
+			{
+				btInt32_t dy = v1->point.y - v0->point.y;
+				Vertex* w1 = side ? v1->prev : v1->next;
+				if (w1 != v1)
+				{
+					btInt32_t dx1 = (w1->point.x - v1->point.x) * sign;
+					btInt32_t dy1 = w1->point.y - v1->point.y;
+					if ((dy1 >= 0) && ((dx1 == 0) || ((dx1 < 0) && (dy1 * dx <= dy * dx1))))
+					{
+						v1 = w1;
+						dx = (v1->point.x - v0->point.x) * sign;
+						continue;
+					}
+				}
+				Vertex* w0 = side ? v0->prev : v0->next;
+				if (w0 != v0)
+				{
+					btInt32_t dx0 = (w0->point.x - v0->point.x) * sign;
+					btInt32_t dy0 = w0->point.y - v0->point.y;
+					btInt32_t dxn = (v1->point.x - w0->point.x) * sign;
+					if ((dxn < 0) && (dy0 > 0) && ((dx0 == 0) || ((dx0 < 0) && (dy0 * dx < dy * dx0))))
+					{
+						v0 = w0;
+						dx = dxn;
+						continue;
+					}
+				}
+				break;
+			}
+		}
+		else // dx == 0: v0 and v1 have the same x
+		{
+			btInt32_t x = v0->point.x;
+			btInt32_t y0 = v0->point.y;
+			Vertex* w0 = v0;
+			Vertex* t;
+			while (((t = side ? w0->next : w0->prev) != v0) && (t->point.x == x) && (t->point.y <= y0)) // walk v0 along vertices with the same x and non-increasing y
+			{
+				w0 = t;
+				y0 = t->point.y;
+			}
+			v0 = w0;
+			btInt32_t y1 = v1->point.y;
+			Vertex* w1 = v1;
+			while (((t = side ? w1->prev : w1->next) != v1) && (t->point.x == x) && (t->point.y >= y1)) // walk v1 along vertices with the same x and non-decreasing y
+			{
+				w1 = t;
+				y1 = t->point.y;
+			}
+			v1 = w1;
+		}
+		if (side == 0) // remember the first pair and set up the second pass
+		{
+			v00 = v0;
+			v10 = v1;
+			v0 = h0.minXy;
+			v1 = h1.minXy;
+			sign = -1;
+		}
+	}
+	v0->prev = v1; // splice the two rings together at the pair found in the second pass ...
+	v1->next = v0;
+	v00->next = v10; // ... and at the pair found in the first pass
+	v10->prev = v00;
+	if (h1.minXy->point.x < h0.minXy->point.x) // h0 receives the extremes of the combined ring
+	{
+		h0.minXy = h1.minXy;
+	}
+	if (h1.maxXy->point.x >= h0.maxXy->point.x)
+	{
+		h0.maxXy = h1.maxXy;
+	}
+	h0.maxYx = h1.maxYx; // taken from h1 unconditionally; h0.minYx is left as it was
+	c0 = v00;
+	c1 = v10;
+	return true;
+void b3ConvexHullInternal::computeInternal(int start, int end, IntermediateHull& result) // Builds the intermediate hull of originalVertices[start, end) into result: 0, 1 or 2 vertices are handled directly, larger ranges are split, solved recursively and merged.
+	int n = end - start;
+	switch (n)
+	{
+		case 0: // empty range: all extreme pointers are NULL
+			result.minXy = NULL;
+			result.maxXy = NULL;
+			result.minYx = NULL;
+			result.maxYx = NULL;
+			return;
+		case 2:
+		{
+			Vertex* v = originalVertices[start];
+			Vertex* w = v + 1; // NOTE(review): relies on the second vertex being adjacent in memory rather than reading originalVertices[start + 1] - confirm
+			if (v->point != w->point) // two identical points fall through to the single-vertex case below
+			{
+				btInt32_t dx = v->point.x - w->point.x;
+				btInt32_t dy = v->point.y - w->point.y;
+				if ((dx == 0) && (dy == 0)) // points differ only in z: the ring contains just the vertex with the smaller z
+				{
+					if (v->point.z > w->point.z)
+					{
+						Vertex* t = w;
+						w = v;
+						v = t;
+					}
+					b3Assert(v->point.z < w->point.z);
+					v->next = v;
+					v->prev = v;
+					result.minXy = v;
+					result.maxXy = v;
+					result.minYx = v;
+					result.maxYx = v;
+				}
+				else // two-vertex ring
+				{
+					v->next = w;
+					v->prev = w;
+					w->next = v;
+					w->prev = v;
+					if ((dx < 0) || ((dx == 0) && (dy < 0))) // order by x, ties broken by y
+					{
+						result.minXy = v;
+						result.maxXy = w;
+					}
+					else
+					{
+						result.minXy = w;
+						result.maxXy = v;
+					}
+					if ((dy < 0) || ((dy == 0) && (dx < 0))) // order by y, ties broken by x
+					{
+						result.minYx = v;
+						result.maxYx = w;
+					}
+					else
+					{
+						result.minYx = w;
+						result.maxYx = v;
+					}
+				}
+				Edge* e = newEdgePair(v, w); // in both sub-cases the two vertices are connected by one edge pair
+				e->link(e);
+				v->edges = e;
+				e = e->reverse;
+				e->link(e);
+				w->edges = e;
+				return;
+			}
+		}
+		// lint -fallthrough
+		case 1: // single vertex (or two coincident points): ring of one, no edges
+		{
+			Vertex* v = originalVertices[start];
+			v->edges = NULL;
+			v->next = v;
+			v->prev = v;
+			result.minXy = v;
+			result.maxXy = v;
+			result.minYx = v;
+			result.maxYx = v;
+			return;
+		}
+	}
+	int split0 = start + n / 2; // first half is [start, split0)
+	Point32 p = originalVertices[split0-1]->point;
+	int split1 = split0;
+	while ((split1 < end) && (originalVertices[split1]->point == p)) // skip vertices equal to the last point of the first half
+	{
+		split1++;
+	}
+	computeInternal(start, split0, result);
+	IntermediateHull hull1;
+	computeInternal(split1, end, hull1); // second half is [split1, end)
+	b3Printf("\n\nMerge\n");
+	result.print();
+	hull1.print();
+	merge(result, hull1); // merged hull ends up in result
+	b3Printf("\n  Result\n");
+	result.print();
+void b3ConvexHullInternal::IntermediateHull::print() // Diagnostic dump: prints each vertex of the ring starting at minXy, tagging the extreme vertices, then prints the edge graph reachable from minXy.
+	b3Printf("    Hull\n");
+	for (Vertex* v = minXy; v; ) // empty hull (minXy == NULL) prints only the header
+	{
+		b3Printf("      ");
+		v->print();
+		if (v == maxXy)
+		{
+			b3Printf(" maxXy");
+		}
+		if (v == minYx)
+		{
+			b3Printf(" minYx");
+		}
+		if (v == maxYx)
+		{
+			b3Printf(" maxYx");
+		}
+		if (v->next->prev != v) // ring links must be mutually consistent
+		{
+			b3Printf(" Inconsistency");
+		}
+		b3Printf("\n");
+		v = v->next;
+		if (v == minXy) // back at the start of the ring
+		{
+			break;
+		}
+	}
+	if (minXy)
+	{		
+		minXy->copy = (minXy->copy == -1) ? -2 : -1; // flip the marker between -1 and -2; printGraph compares copy values to decide which vertices to descend into
+		minXy->printGraph();
+	}
+void b3ConvexHullInternal::Vertex::printGraph() // Diagnostic dump: prints this vertex and its edges, then recurses into every edge target whose copy differs from this vertex's copy.
+	print();
+	b3Printf("\nEdges\n");
+	Edge* e = edges;
+	if (e)
+	{
+		do // first pass: print every edge in this vertex's circular edge list
+		{
+			e->print();
+			b3Printf("\n");
+			e = e->next;
+		} while (e != edges);
+		do // second pass: visit neighbouring vertices
+		{
+			Vertex* v = e->target;
+			if (v->copy != copy) // copy doubles as the visited marker; stamping before recursing prevents revisiting
+			{
+				v->copy = copy;
+				v->printGraph();
+			}
+			e = e->next;
+		} while (e != edges);
+	}
+// Classifies how 'next' relates to 'prev' in the circular edge list of their common
+// source vertex (both edges must start at the same vertex).
+//   - next directly follows prev (prev->next == next)   -> COUNTER_CLOCKWISE
+//   - next directly precedes prev (prev->prev == next)  -> CLOCKWISE
+//   - both hold (the list holds only these two edges)   -> decided by the sign of
+//     (t x s) . ((prev target - source) x (next target - source))
+//   - neither holds                                      -> NONE
+b3ConvexHullInternal::Orientation b3ConvexHullInternal::getOrientation(const Edge* prev, const Edge* next, const Point32& s, const Point32& t)
+	b3Assert(prev->reverse->target == next->reverse->target);
+	if (prev->next == next)
+	{
+		if (prev->prev == next)
+		{
+			// next is both successor and predecessor of prev, so the list order
+			// alone is ambiguous; fall back to a geometric test.
+			Point64 n = t.cross(s);
+			Point64 m = (*prev->target - *next->reverse->target).cross(*next->target - *next->reverse->target);
+			b3Assert(!m.isZero());
+			btInt64_t dot = n.dot(m);
+			b3Assert(dot != 0);
+			return (dot > 0) ? COUNTER_CLOCKWISE : CLOCKWISE;
+		}
+		// next is only the successor of prev. Without this return the function
+		// would reach its end without returning a value (undefined behaviour).
+		return COUNTER_CLOCKWISE;
+	}
+	else if (prev->prev == next)
+	{
+		return CLOCKWISE;
+	}
+	else
+	{
+		return NONE;
+	}
+b3ConvexHullInternal::Edge* b3ConvexHullInternal::findMaxAngle(bool ccw, const Vertex* start, const Point32& s, const Point64& rxs, const Point64& sxrxs, Rational64& minCot) // Among the edges of 'start' whose copy stamp is greater than mergeStamp, returns the one with the smallest cot value and stores that value in minCot; returns NULL (minCot untouched) if no edge qualifies.
+	Edge* minEdge = NULL;
+	b3Printf("find max edge for %d\n", start->point.index);
+	Edge* e = start->edges;
+	if (e)
+	{
+		do
+		{
+			if (e->copy > mergeStamp) // edges stamped with the current (or a lower) mergeStamp are skipped
+			{
+				Point32 t = *e->target - *start; // edge direction
+				Rational64 cot(t.dot(sxrxs), t.dot(rxs)); // exact ratio of the two projections; no floating point in the comparison
+				b3Printf("      Angle is %f (%d) for ", (float) b3Atan(cot.toScalar()), (int) cot.isNaN());
+				e->print();
+				if (cot.isNaN()) // NaN candidates are never selected
+				{
+					b3Assert(ccw ? (t.dot(s) < 0) : (t.dot(s) > 0));
+				}
+				else
+				{
+					int cmp;
+					if (minEdge == NULL) // first valid candidate
+					{
+						minCot = cot;
+						minEdge = e;
+					}
+					else if ((cmp = cot.compare(minCot)) < 0) // strictly smaller cot wins
+					{
+						minCot = cot;
+						minEdge = e;
+					}
+					else if ((cmp == 0) && (ccw == (getOrientation(minEdge, e, s, t) == COUNTER_CLOCKWISE))) // tie: choose by the relative orientation of the two edges, direction given by ccw
+					{
+						minEdge = e;
+					}
+				}
+				b3Printf("\n");
+			}
+			e = e->next;
+		} while (e != start->edges);
+	}
+	return minEdge;
+void b3ConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge*& e0, Edge*& e1, Vertex* stop0, Vertex* stop1) // Advances e0 (starting at c0) and e1 (starting at c1) along edges lying in the plane spanned by c0, c1 and the first start edge's target; e0/e1 are in-out and may come back as NULL. At least one of e0/e1 must be non-NULL on entry.
+	Edge* start0 = e0;
+	Edge* start1 = e1;
+	Point32 et0 = start0 ? start0->target->point : c0->point; // current end points on either side
+	Point32 et1 = start1 ? start1->target->point : c1->point;
+	Point32 s = c1->point - c0->point;
+	Point64 normal = ((start0 ? start0 : start1)->target->point - c0->point).cross(s); // normal of the common plane
+	btInt64_t dist = c0->point.dot(normal); // plane offset: points p on the plane satisfy p.dot(normal) == dist
+	b3Assert(!start1 || (start1->target->point.dot(normal) == dist));
+	Point64 perp = s.cross(normal); // in-plane direction perpendicular to s
+	b3Assert(!perp.isZero());
+	b3Printf("   Advancing %d %d  (%p %p, %d %d)\n", c0->point.index, c1->point.index, start0, start1, start0 ? start0->target->point.index : -1, start1 ? start1->target->point.index : -1);
+	btInt64_t maxDot0 = et0.dot(perp);
+	if (e0)
+	{
+		while (e0->target != stop0) // move e0 forward while the next edge stays in the plane and increases the projection on perp
+		{
+			Edge* e = e0->reverse->prev;
+			if (e->target->point.dot(normal) < dist)
+			{
+				break;
+			}
+			b3Assert(e->target->point.dot(normal) == dist);
+			if (e->copy == mergeStamp) // edge carries the current merge stamp: stop
+			{
+				break;
+			}
+			btInt64_t dot = e->target->point.dot(perp);
+			if (dot <= maxDot0)
+			{
+				break;
+			}
+			maxDot0 = dot;
+			e0 = e;
+			et0 = e->target->point;
+		}
+	}
+	btInt64_t maxDot1 = et1.dot(perp);
+	if (e1)
+	{
+		while (e1->target != stop1) // same for e1, walking in the opposite list direction (reverse->next)
+		{
+			Edge* e = e1->reverse->next;
+			if (e->target->point.dot(normal) < dist)
+			{
+				break;
+			}
+			b3Assert(e->target->point.dot(normal) == dist);
+			if (e->copy == mergeStamp)
+			{
+				break;
+			}
+			btInt64_t dot = e->target->point.dot(perp);
+			if (dot <= maxDot1)
+			{
+				break;
+			}
+			maxDot1 = dot;
+			e1 = e;
+			et1 = e->target->point;
+		}
+	}
+	b3Printf("   Starting at %d %d\n", et0.index, et1.index);
+	btInt64_t dx = maxDot1 - maxDot0; // difference of the two end points along perp
+	if (dx > 0)
+	{
+		while (true) // alternately adjust side 0 and side 1 until neither can move
+		{
+			btInt64_t dy = (et1 - et0).dot(s);
+			if (e0 && (e0->target != stop0))
+			{
+				Edge* f0 = e0->next->reverse;
+				if (f0->copy > mergeStamp)
+				{
+					btInt64_t dx0 = (f0->target->point - et0).dot(perp);
+					btInt64_t dy0 = (f0->target->point - et0).dot(s);
+					if ((dx0 == 0) ? (dy0 < 0) : ((dx0 < 0) && (Rational64(dy0, dx0).compare(Rational64(dy, dx)) >= 0))) // exact slope comparison via Rational64
+					{
+						et0 = f0->target->point;
+						dx = (et1 - et0).dot(perp);
+						e0 = (e0 == start0) ? NULL : f0; // stepping off the original start edge clears e0
+						continue;
+					}
+				}
+			}
+			if (e1 && (e1->target != stop1))
+			{
+				Edge* f1 = e1->reverse->next;
+				if (f1->copy > mergeStamp)
+				{
+					Point32 d1 = f1->target->point - et1;
+					if (d1.dot(normal) == 0) // candidate stays in the plane
+					{
+						btInt64_t dx1 = d1.dot(perp);
+						btInt64_t dy1 = d1.dot(s);
+						btInt64_t dxn = (f1->target->point - et0).dot(perp); // dx that would result from taking f1
+						if ((dxn > 0) && ((dx1 == 0) ? (dy1 < 0) : ((dx1 < 0) && (Rational64(dy1, dx1).compare(Rational64(dy, dx)) > 0))))
+						{
+							e1 = f1;
+							et1 = e1->target->point;
+							dx = dxn;
+							continue;
+						}
+					}
+					else
+					{
+						b3Assert((e1 == start1) && (d1.dot(normal) < 0));
+					}
+				}
+			}
+			break;
+		}
+	}
+	else if (dx < 0) // mirrored search with the roles of side 0 and side 1 exchanged
+	{
+		while (true)
+		{
+			btInt64_t dy = (et1 - et0).dot(s);
+			if (e1 && (e1->target != stop1))
+			{
+				Edge* f1 = e1->prev->reverse;
+				if (f1->copy > mergeStamp)
+				{
+					btInt64_t dx1 = (f1->target->point - et1).dot(perp);
+					btInt64_t dy1 = (f1->target->point - et1).dot(s);
+					if ((dx1 == 0) ? (dy1 > 0) : ((dx1 < 0) && (Rational64(dy1, dx1).compare(Rational64(dy, dx)) <= 0)))
+					{
+						et1 = f1->target->point;
+						dx = (et1 - et0).dot(perp);
+						e1 = (e1 == start1) ? NULL : f1; // stepping off the original start edge clears e1
+						continue;
+					}
+				}
+			}
+			if (e0 && (e0->target != stop0))
+			{
+				Edge* f0 = e0->reverse->prev;
+				if (f0->copy > mergeStamp)
+				{
+					Point32 d0 = f0->target->point - et0;
+					if (d0.dot(normal) == 0)
+					{
+						btInt64_t dx0 = d0.dot(perp);
+						btInt64_t dy0 = d0.dot(s);
+						btInt64_t dxn = (et1 - f0->target->point).dot(perp);
+						if ((dxn < 0) && ((dx0 == 0) ? (dy0 > 0) : ((dx0 < 0) && (Rational64(dy0, dx0).compare(Rational64(dy, dx)) < 0))))
+						{
+							e0 = f0;
+							et0 = e0->target->point;
+							dx = dxn;
+							continue;
+						}
+					}
+					else
+					{
+						b3Assert((e0 == start0) && (d0.dot(normal) < 0));
+					}
+				}
+			}
+			break;
+		}
+	}
+	b3Printf("   Advanced edges to %d %d\n", et0.index, et1.index); // when dx == 0 neither loop runs and e0/e1 keep the values from the first phase
+void b3ConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) // Merges hull h1 into h0 (result in h0): joins the vertex rings via mergeProjection, then walks around both hulls adding connecting edge pairs and removing the edges skipped between consecutively chosen edges.
+	if (!h1.maxXy) // h1 is empty: nothing to merge
+	{
+		return;
+	}
+	if (!h0.maxXy) // h0 is empty: result is h1
+	{
+		h0 = h1;
+		return;
+	}
+	mergeStamp--; // new stamp; edges created from here on carry it (see newEdgePair), older edges compare greater
+	Vertex* c0 = NULL; // current vertex on the h0 side
+	Edge* toPrev0 = NULL; // edge from c0 back to the previously visited h0 vertex
+	Edge* firstNew0 = NULL; // first edge recorded at the starting h0 vertex, used to close the loop at the end
+	Edge* pendingHead0 = NULL; // chain of new connecting edges at c0 not yet linked into its edge list
+	Edge* pendingTail0 = NULL;
+	Vertex* c1 = NULL; // same bookkeeping for the h1 side
+	Edge* toPrev1 = NULL;
+	Edge* firstNew1 = NULL;
+	Edge* pendingHead1 = NULL;
+	Edge* pendingTail1 = NULL;
+	Point32 prevPoint; // reference point used together with c0/c1 to define the rotation plane for findMaxAngle
+	if (mergeProjection(h0, h1, c0, c1))
+	{
+		Point32 s = *c1 - *c0;
+		Point64 normal = Point32(0, 0, -1).cross(s);
+		Point64 t = s.cross(normal);
+		b3Assert(!t.isZero());
+		Edge* e = c0->edges;
+		Edge* start0 = NULL;
+		if (e)
+		{
+			do // look for an edge at c0 with zero component along normal and positive component along t
+			{
+				btInt64_t dot = (*e->target - *c0).dot(normal);
+				b3Assert(dot <= 0);
+				if ((dot == 0) && ((*e->target - *c0).dot(t) > 0))
+				{
+					if (!start0 || (getOrientation(start0, e, s, Point32(0, 0, -1)) == CLOCKWISE))
+					{
+						start0 = e;
+					}
+				}
+				e = e->next;
+			} while (e != c0->edges);
+		}
+		e = c1->edges;
+		Edge* start1 = NULL;
+		if (e)
+		{
+			do // same search at c1, with the opposite orientation preference
+			{
+				btInt64_t dot = (*e->target - *c1).dot(normal);
+				b3Assert(dot <= 0);
+				if ((dot == 0) && ((*e->target - *c1).dot(t) > 0))
+				{
+					if (!start1 || (getOrientation(start1, e, s, Point32(0, 0, -1)) == COUNTER_CLOCKWISE))
+					{
+						start1 = e;
+					}
+				}
+				e = e->next;
+			} while (e != c1->edges);
+		}
+		if (start0 || start1) // such edges exist: let findEdgeForCoplanarFaces pick the starting vertices
+		{
+			findEdgeForCoplanarFaces(c0, c1, start0, start1, NULL, NULL);
+			if (start0)
+			{
+				c0 = start0->target;
+			}
+			if (start1)
+			{
+				c1 = start1->target;
+			}
+		}
+		prevPoint = c1->point;
+		prevPoint.z++; // reference point offset from c1 along +z
+	}
+	else
+	{
+		prevPoint = c1->point;
+		prevPoint.x++; // mergeProjection returned false: reference point offset along +x instead
+	}
+	Vertex* first0 = c0; // starting pair; the walk ends when (c0, c1) returns to it
+	Vertex* first1 = c1;
+	bool firstRun = true;
+	while (true)
+	{
+		Point32 s = *c1 - *c0;
+		Point32 r = prevPoint - c0->point;
+		Point64 rxs = r.cross(s);
+		Point64 sxrxs = s.cross(rxs);
+		b3Printf("\n  Checking %d %d\n", c0->point.index, c1->point.index);
+		Rational64 minCot0(0, 0);
+		Edge* min0 = findMaxAngle(false, c0, s, rxs, sxrxs, minCot0); // best candidate edge at c0
+		Rational64 minCot1(0, 0);
+		Edge* min1 = findMaxAngle(true, c1, s, rxs, sxrxs, minCot1); // best candidate edge at c1
+		if (!min0 && !min1) // neither side has a candidate: connect c0 and c1 by a single edge pair and finish
+		{
+			Edge* e = newEdgePair(c0, c1);
+			e->link(e);
+			c0->edges = e;
+			e = e->reverse;
+			e->link(e);
+			c1->edges = e;
+			return;
+		}
+		else
+		{
+			int cmp = !min0 ? 1 : !min1 ? -1 : minCot0.compare(minCot1); // > 0: advance on side 1, < 0: advance on side 0, 0: both
+			b3Printf("    -> Result %d\n", cmp);
+			if (firstRun || ((cmp >= 0) ? !minCot1.isNegativeInfinity() : !minCot0.isNegativeInfinity()))
+			{
+				Edge* e = newEdgePair(c0, c1); // new connecting edge c0 -> c1, appended to both pending chains
+				if (pendingTail0)
+				{
+					pendingTail0->prev = e;
+				}
+				else
+				{
+					pendingHead0 = e;
+				}
+				e->next = pendingTail0;
+				pendingTail0 = e;
+				e = e->reverse;
+				if (pendingTail1)
+				{
+					pendingTail1->next = e;
+				}
+				else
+				{
+					pendingHead1 = e;
+				}
+				e->prev = pendingTail1;
+				pendingTail1 = e;
+			}
+			Edge* e0 = min0;
+			Edge* e1 = min1;
+			b3Printf("   Found min edges to %d %d\n", e0 ? e0->target->point.index : -1, e1 ? e1->target->point.index : -1);
+			if (cmp == 0) // tie between the two sides: findEdgeForCoplanarFaces may adjust or clear e0/e1
+			{
+				findEdgeForCoplanarFaces(c0, c1, e0, e1, NULL, NULL);
+			}
+			if ((cmp >= 0) && e1) // advance on the h1 side
+			{
+				if (toPrev1)
+				{
+					for (Edge* e = toPrev1->next, *n = NULL; e != min1; e = n) // remove the edges between toPrev1 and min1
+					{
+						n = e->next; // fetch the successor before the edge is removed
+						removeEdgePair(e);
+					}
+				}
+				if (pendingTail1) // splice the pending connecting edges into c1's edge list
+				{
+					if (toPrev1)
+					{
+						toPrev1->link(pendingHead1);
+					}
+					else
+					{
+						min1->prev->link(pendingHead1);
+						firstNew1 = pendingHead1;
+					}
+					pendingTail1->link(min1);
+					pendingHead1 = NULL;
+					pendingTail1 = NULL;
+				}
+				else if (!toPrev1)
+				{
+					firstNew1 = min1;
+				}
+				prevPoint = c1->point;
+				c1 = e1->target;
+				toPrev1 = e1->reverse;
+			}
+			if ((cmp <= 0) && e0) // advance on the h0 side (mirror of the block above, walking via prev)
+			{
+				if (toPrev0)
+				{
+					for (Edge* e = toPrev0->prev, *n = NULL; e != min0; e = n)
+					{
+						n = e->prev;
+						removeEdgePair(e);
+					}
+				}
+				if (pendingTail0)
+				{
+					if (toPrev0)
+					{
+						pendingHead0->link(toPrev0);
+					}
+					else
+					{
+						pendingHead0->link(min0->next);
+						firstNew0 = pendingHead0;
+					}
+					min0->link(pendingTail0);
+					pendingHead0 = NULL;
+					pendingTail0 = NULL;
+				}
+				else if (!toPrev0)
+				{
+					firstNew0 = min0;
+				}
+				prevPoint = c0->point;
+				c0 = e0->target;
+				toPrev0 = e0->reverse;
+			}
+		}
+		if ((c0 == first0) && (c1 == first1)) // back at the starting pair: close both edge lists and finish
+		{
+			if (toPrev0 == NULL) // side 0 never advanced: its edge list consists of the pending edges only
+			{
+				pendingHead0->link(pendingTail0);
+				c0->edges = pendingTail0;
+			}
+			else
+			{
+				for (Edge* e = toPrev0->prev, *n = NULL; e != firstNew0; e = n)
+				{
+					n = e->prev;
+					removeEdgePair(e);
+				}
+				if (pendingTail0)
+				{
+					pendingHead0->link(toPrev0);
+					firstNew0->link(pendingTail0);
+				}
+			}
+			if (toPrev1 == NULL) // same for side 1
+			{
+				pendingTail1->link(pendingHead1);
+				c1->edges = pendingTail1;
+			}
+			else
+			{
+				for (Edge* e = toPrev1->next, *n = NULL; e != firstNew1; e = n)
+				{
+					n = e->next;
+					removeEdgePair(e);
+				}
+				if (pendingTail1)
+				{
+					toPrev1->link(pendingHead1);
+					pendingTail1->link(firstNew1);
+				}
+			}
+			return;
+		}
+		firstRun = false;
+	}
+static bool b3PointCmp(const b3ConvexHullInternal::Point32& p, const b3ConvexHullInternal::Point32& q) // Sort predicate: true if p orders strictly before q.
+	return (p.y < q.y) || ((p.y == q.y) && ((p.x < q.x) || ((p.x == q.x) && (p.z < q.z)))); // lexicographic order by y, then x, then z
+void b3ConvexHullInternal::compute(const void* coords, bool doubleCoords, int stride, int count) // Converts 'count' input points (float or double triples, 'stride' bytes apart) to integer Point32 coordinates, sorts them and builds the hull; the result is reachable from vertexList.
+	b3Vector3 min = b3MakeVector3(b3Scalar(1e30), b3Scalar(1e30), b3Scalar(1e30)),  max = b3MakeVector3(b3Scalar(-1e30), b3Scalar(-1e30), b3Scalar(-1e30));
+	const char* ptr = (const char*) coords;
+	if (doubleCoords) // first pass over the input: bounding box
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const double* v = (const double*) ptr;
+			b3Vector3 p = b3MakeVector3((b3Scalar) v[0], (b3Scalar) v[1], (b3Scalar) v[2]);
+			ptr += stride; // stride is in bytes (ptr is a char pointer)
+			min.setMin(p);
+			max.setMax(p);
+		}
+	}
+	else
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const float* v = (const float*) ptr;
+			b3Vector3 p = b3MakeVector3(v[0], v[1], v[2]);
+			ptr += stride;
+			min.setMin(p);
+			max.setMax(p);
+		}
+	}
+	b3Vector3 s = max - min; // extent of the bounding box
+	maxAxis = s.maxAxis();
+	minAxis = s.minAxis();
+	if (minAxis == maxAxis) // make sure the two axes are distinct
+	{
+		minAxis = (maxAxis + 1) % 3;
+	}
+	medAxis = 3 - maxAxis - minAxis; // the remaining axis
+	s /= b3Scalar(10216); // one integer unit = extent / 10216 per axis, so quantized coordinates lie within about +-5108
+	if (((medAxis + 1) % 3) != maxAxis) // NOTE(review): presumably negates the scale to keep orientation consistent under the axis permutation - confirm
+	{
+		s *= -1;
+	}
+	scaling = s; // integer -> world scale, used by toBtVector/getCoordinates
+	if (s[0] != 0) // from here on s holds the world -> integer scale; zero-extent axes stay 0
+	{
+		s[0] = b3Scalar(1) / s[0];
+	}
+	if (s[1] != 0)
+	{
+		s[1] = b3Scalar(1) / s[1];
+	}
+	if (s[2] != 0)
+	{
+		s[2] = b3Scalar(1) / s[2];
+	}
+	center = (min + max) * b3Scalar(0.5);
+	b3AlignedObjectArray<Point32> points;
+	points.resize(count);
+	ptr = (const char*) coords;
+	if (doubleCoords) // second pass over the input: quantize
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const double* v = (const double*) ptr;
+			b3Vector3 p = b3MakeVector3((b3Scalar) v[0], (b3Scalar) v[1], (b3Scalar) v[2]);
+			ptr += stride;
+			p = (p - center) * s;
+			points[i].x = (btInt32_t) p[medAxis]; // internal x/y/z = medium/largest/smallest extent axis; the cast truncates
+			points[i].y = (btInt32_t) p[maxAxis];
+			points[i].z = (btInt32_t) p[minAxis];
+			points[i].index = i; // remember the position in the input
+		}
+	}
+	else
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const float* v = (const float*) ptr;
+			b3Vector3 p = b3MakeVector3(v[0], v[1], v[2]);
+			ptr += stride;
+			p = (p - center) * s;
+			points[i].x = (btInt32_t) p[medAxis];
+			points[i].y = (btInt32_t) p[maxAxis];
+			points[i].z = (btInt32_t) p[minAxis];
+			points[i].index = i;
+		}
+	}
+	points.quickSort(b3PointCmp); // sort by y, then x, then z
+	vertexPool.reset();
+	vertexPool.setArraySize(count);
+	originalVertices.resize(count);
+	for (int i = 0; i < count; i++) // one Vertex per sorted point
+	{
+		Vertex* v = vertexPool.newObject();
+		v->edges = NULL;
+		v->point = points[i];
+		v->copy = -1;
+		originalVertices[i] = v;
+	}
+	points.clear();
+	edgePool.reset();
+	edgePool.setArraySize(6 * count);
+	usedEdgePairs = 0;
+	maxUsedEdgePairs = 0;
+	mergeStamp = -3; // merge() decrements this before each merge
+	IntermediateHull hull;
+	computeInternal(0, count, hull);
+	vertexList = hull.minXy; // entry vertex of the finished hull (NULL when count is 0)
+	b3Printf("max. edges %d (3v = %d)", maxUsedEdgePairs, 3 * count);
+b3Vector3 b3ConvexHullInternal::toBtVector(const Point32& v) // Converts an integer point to a b3Vector3 by undoing the axis permutation and multiplying by scaling; center is not added.
+	b3Vector3 p;
+	p[medAxis] = b3Scalar(v.x); // internal x/y/z map to the medAxis/maxAxis/minAxis components
+	p[maxAxis] = b3Scalar(v.y);
+	p[minAxis] = b3Scalar(v.z);
+	return p * scaling;
+b3Vector3 b3ConvexHullInternal::getBtNormal(Face* face) // Returns the normalized cross product of the face's dir0 and dir1, each converted with toBtVector.
+	return toBtVector(face->dir0).cross(toBtVector(face->dir1)).normalized();
+b3Vector3 b3ConvexHullInternal::getCoordinates(const Vertex* v) // Converts a hull vertex to output coordinates: undoes the axis permutation, multiplies by scaling and adds center.
+	b3Vector3 p;
+	p[medAxis] = v->xvalue(); // read through the vertex's xvalue()/yvalue()/zvalue() accessors rather than v->point
+	p[maxAxis] = v->yvalue();
+	p[minAxis] = v->zvalue();
+	return p * scaling + center;
+b3Scalar b3ConvexHullInternal::shrink(b3Scalar amount, b3Scalar clampAmount) // Shifts every face of the hull via shiftFace. Returns the amount actually used, 0 if nothing was done (no hull, non-positive volume, or non-positive minimum face distance), or -amount if a shiftFace call failed.
+	if (!vertexList)
+	{
+		return 0;
+	}
+	int stamp = --mergeStamp; // fresh stamp marking vertices and edges visited by this traversal
+	b3AlignedObjectArray<Vertex*> stack;
+	vertexList->copy = stamp;
+	stack.push_back(vertexList);
+	b3AlignedObjectArray<Face*> faces;
+	Point32 ref = vertexList->point; // common apex of the tetrahedra used for the volume sum
+	Int128 hullCenterX(0, 0);
+	Int128 hullCenterY(0, 0);
+	Int128 hullCenterZ(0, 0);
+	Int128 volume(0, 0);
+	while (stack.size() > 0) // traverse all vertices reachable from vertexList
+	{
+		Vertex* v = stack[stack.size() - 1];
+		stack.pop_back();
+		Edge* e = v->edges;
+		if (e)
+		{
+			do
+			{
+				if (e->target->copy != stamp) // unvisited neighbour
+				{
+					e->target->copy = stamp;
+					stack.push_back(e->target);
+				}
+				if (e->copy != stamp) // edge not yet assigned to a face in this traversal: create its face
+				{
+					Face* face = facePool.newObject();
+					face->init(e->target, e->reverse->prev->target, v);
+					faces.push_back(face);
+					Edge* f = e;
+					Vertex* a = NULL;
+					Vertex* b = NULL;
+					do // walk the boundary of the face, fanning it into triangles (v, a, b)
+					{
+						if (a && b)
+						{
+							btInt64_t vol = (v->point - ref).dot((a->point - ref).cross(b->point - ref)); // scalar triple product for the tetrahedron (ref, v, a, b)
+							b3Assert(vol >= 0);
+							Point32 c = v->point + a->point + b->point + ref; // sum of the four corners; the division by 4 happens below
+							hullCenterX += vol * c.x;
+							hullCenterY += vol * c.y;
+							hullCenterZ += vol * c.z;
+							volume += vol;
+						}
+						b3Assert(f->copy != stamp);
+						f->copy = stamp;
+						f->face = face;
+						a = b;
+						b = f->target;
+						f = f->reverse->prev; // next edge of the same face
+					} while (f != e);
+				}
+				e = e->next;
+			} while (e != v->edges);
+		}
+	}
+	if (volume.getSign() <= 0)
+	{
+		return 0;
+	}
+	b3Vector3 hullCenter;
+	hullCenter[medAxis] = hullCenterX.toScalar();
+	hullCenter[maxAxis] = hullCenterY.toScalar();
+	hullCenter[minAxis] = hullCenterZ.toScalar();
+	hullCenter /= 4 * volume.toScalar(); // volume-weighted average of the corner sums
+	hullCenter *= scaling; // same space as toBtVector results (center not added)
+	int faceCount = faces.size();
+	if (clampAmount > 0) // limit amount to clampAmount times the smallest face distance from hullCenter
+	{
+		b3Scalar minDist = B3_INFINITY;
+		for (int i = 0; i < faceCount; i++)
+		{
+			b3Vector3 normal = getBtNormal(faces[i]);
+			b3Scalar dist = normal.dot(toBtVector(faces[i]->origin) - hullCenter);
+			if (dist < minDist)
+			{
+				minDist = dist;
+			}
+		}
+		if (minDist <= 0)
+		{
+			return 0;
+		}
+		amount = b3Min(amount, minDist * clampAmount);
+	}
+	unsigned int seed = 243703; // fixed seed: the shuffle below is deterministic
+	for (int i = 0; i < faceCount; i++, seed = 1664525 * seed + 1013904223) // pseudo-random reordering of the faces (linear congruential update)
+	{
+		b3Swap(faces[i], faces[seed % faceCount]);
+	}
+	for (int i = 0; i < faceCount; i++)
+	{
+		if (!shiftFace(faces[i], amount, stack)) // stack is empty here and is passed by value
+		{
+			return -amount;
+		}
+	}
+	return amount;
+bool b3ConvexHullInternal::shiftFace(Face* face, b3Scalar amount, b3AlignedObjectArray<Vertex*> stack) // Moves the plane of 'face' by 'amount' against its normal and cuts the hull with the shifted plane. Returns false if the shift cannot be applied; true otherwise (including when nothing needs to change). 'stack' is taken by value, so the caller's array is not modified.
+	b3Vector3 origShift = getBtNormal(face) * -amount; // shift vector in scaled (world-unit) space
+	if (scaling[0] != 0) // convert to integer units, skipping zero-scale components
+	{
+		origShift[0] /= scaling[0];
+	}
+	if (scaling[1] != 0)
+	{
+		origShift[1] /= scaling[1];
+	}
+	if (scaling[2] != 0)
+	{
+		origShift[2] /= scaling[2];
+	}
+	Point32 shift((btInt32_t) origShift[medAxis], (btInt32_t) origShift[maxAxis], (btInt32_t) origShift[minAxis]);
+	if (shift.isZero()) // shift truncates to zero in integer coordinates: nothing to do
+	{
+		return true;
+	}
+	Point64 normal = face->getNormal();
+	b3Printf("\nShrinking face (%d %d %d) (%d %d %d) (%d %d %d) by (%d %d %d)\n",
+				 face->origin.x, face->origin.y, face->origin.z, face->dir0.x, face->dir0.y, face->dir0.z, face->dir1.x, face->dir1.y, face->dir1.z, shift.x, shift.y, shift.z);
+	btInt64_t origDot = face->origin.dot(normal); // plane offset before the shift
+	Point32 shiftedOrigin = face->origin + shift;
+	btInt64_t shiftedDot = shiftedOrigin.dot(normal); // plane offset after the shift
+	b3Assert(shiftedDot <= origDot);
+	if (shiftedDot >= origDot) // the integer shift did not move the plane inwards
+	{
+		return false;
+	}
+	Edge* intersection = NULL; // an edge crossing (or touching) the shifted plane
+	Edge* startEdge = face->nearbyVertex->edges;
+	b3Printf("Start edge is ");
+	startEdge->print();
+	b3Printf(", normal is (%lld %lld %lld), shifted dot is %lld\n", normal.x, normal.y, normal.z, shiftedDot);
+	Rational128 optDot = face->nearbyVertex->dot(normal);
+	int cmp = optDot.compare(shiftedDot); // side of the shifted plane the current vertex lies on
+	int n = 0; // iteration counter, used for the diagnostic output only
+	if (cmp >= 0) // start vertex is on or beyond the shifted plane: walk to vertices with smaller dot until one lies below it
+	{
+		Edge* e = startEdge;
+		do
+		{
+			n++;
+			Rational128 dot = e->target->dot(normal);
+			b3Assert(dot.compare(origDot) <= 0);
+			b3Printf("Moving downwards, edge is ");
+			e->print();
+			b3Printf(", dot is %f (%f %lld)\n", (float) dot.toScalar(), (float) optDot.toScalar(), shiftedDot);
+			if (dot.compare(optDot) < 0)
+			{
+				int c = dot.compare(shiftedDot);
+				optDot = dot;
+				e = e->reverse; // continue the search from the better vertex
+				startEdge = e;
+				if (c < 0)
+				{
+					intersection = e;
+					break;
+				}
+				cmp = c;
+			}
+			e = e->prev;
+		} while (e != startEdge);
+		if (!intersection) // no vertex lies below the shifted plane
+		{
+			return false;
+		}
+	}
+	else // start vertex is below the shifted plane: walk to vertices with larger dot until one reaches it
+	{
+		Edge* e = startEdge;
+		do
+		{
+			n++;
+			Rational128 dot = e->target->dot(normal);
+			b3Assert(dot.compare(origDot) <= 0);
+			b3Printf("Moving upwards, edge is ");
+			e->print();
+			b3Printf(", dot is %f (%f %lld)\n", (float) dot.toScalar(), (float) optDot.toScalar(), shiftedDot);
+			if (dot.compare(optDot) > 0)
+			{
+				cmp = dot.compare(shiftedDot);
+				if (cmp >= 0)
+				{
+					intersection = e;
+					break;
+				}
+				optDot = dot;
+				e = e->reverse;
+				startEdge = e;
+			}
+			e = e->prev;
+		} while (e != startEdge);
+		if (!intersection) // no vertex reaches the shifted plane: nothing to cut
+		{
+			return true;
+		}
+	}
+	b3Printf("Needed %d iterations to find initial intersection\n", n);
+	if (cmp == 0) // a vertex lies exactly on the shifted plane: check whether any of its edges leads beyond the plane
+	{
+		Edge* e = intersection->reverse->next;
+		n = 0;
+		while (e->target->dot(normal).compare(shiftedDot) <= 0)
+		{
+			n++;
+			e = e->next;
+			if (e == intersection->reverse) // went all the way round: nothing lies beyond the plane
+			{
+				return true;
+			}
+			b3Printf("Checking for outwards edge, current edge is ");
+			e->print();
+			b3Printf("\n");
+		}
+		b3Printf("Needed %d iterations to check for complete containment\n", n);
+	}
+	Edge* firstIntersection = NULL; // first intersecting edge, used to detect the end of the loop
+	Edge* faceEdge = NULL; // edge of the shifted face created (or reused) in the current iteration
+	Edge* firstFaceEdge = NULL;
+	int m = 0; // iteration counter, used for the diagnostic output only
+	while (true) // go around the cut, one intersected face per iteration
+	{
+		m++;
+		b3Printf("Intersecting edge is ");
+		intersection->print();
+		b3Printf("\n");
+		if (cmp == 0) // on-plane vertex: rotate to the last edge whose target is still below the plane
+		{
+			Edge* e = intersection->reverse->next;
+			startEdge = e;
+			n = 0;
+			while (true)
+			{
+				n++;
+				if (e->target->dot(normal).compare(shiftedDot) >= 0)
+				{
+					break;
+				}
+				intersection = e->reverse;
+				e = e->next;
+				if (e == startEdge)
+				{
+					return true;
+				}
+			}
+			b3Printf("Needed %d iterations to advance intersection\n", n);
+		}
+		b3Printf("Advanced intersecting edge to ");
+		intersection->print();
+		b3Printf(", cmp = %d\n", cmp);
+		if (!firstIntersection)
+		{
+			firstIntersection = intersection;
+		}
+		else if (intersection == firstIntersection) // back at the first intersection: the cut is complete
+		{
+			break;
+		}
+		int prevCmp = cmp;
+		Edge* prevIntersection = intersection;
+		Edge* prevFaceEdge = faceEdge;
+		Edge* e = intersection->reverse;
+		n = 0;
+		while (true) // walk around the adjacent face until an edge reaches or crosses the shifted plane again
+		{
+			n++;
+			e = e->reverse->prev;
+			b3Assert(e != intersection->reverse);
+			cmp = e->target->dot(normal).compare(shiftedDot);
+			b3Printf("Testing edge ");
+			e->print();
+			b3Printf(" -> cmp = %d\n", cmp);
+			if (cmp >= 0)
+			{
+				intersection = e;
+				break;
+			}
+		}
+		b3Printf("Needed %d iterations to find other intersection of face\n", n);
+		if (cmp > 0) // edge crosses the plane: replace its target by a new vertex on the plane
+		{
+			Vertex* removed = intersection->target;
+			e = intersection->reverse;
+			if (e->prev == e) // e was the only edge of the old target
+			{
+				removed->edges = NULL;
+			}
+			else
+			{
+				removed->edges = e->prev;
+				e->prev->link(e->next); // unlink e from the old target's edge list
+				e->link(e);
+			}
+			b3Printf("1: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
+			Point64 n0 = intersection->face->getNormal(); // normals of the two faces adjacent to the intersecting edge
+			Point64 n1 = intersection->reverse->face->getNormal();
+			btInt64_t m00 = face->dir0.dot(n0);
+			btInt64_t m01 = face->dir1.dot(n0);
+			btInt64_t m10 = face->dir0.dot(n1);
+			btInt64_t m11 = face->dir1.dot(n1);
+			btInt64_t r0 = (intersection->face->origin - shiftedOrigin).dot(n0);
+			btInt64_t r1 = (intersection->reverse->face->origin - shiftedOrigin).dot(n1);
+			Int128 det = Int128::mul(m00, m11) - Int128::mul(m01, m10); // determinant of the 2x2 system for the point shiftedOrigin + a*dir0 + b*dir1 on both adjacent planes
+			b3Assert(det.getSign() != 0);
+			Vertex* v = vertexPool.newObject();
+			v->point.index = -1; // new vertex does not correspond to an input point
+			v->copy = -1;
+			v->point128 = PointR128(Int128::mul(face->dir0.x * r0, m11) - Int128::mul(face->dir0.x * r1, m01)
+															+ Int128::mul(face->dir1.x * r1, m00) - Int128::mul(face->dir1.x * r0, m10) + det * shiftedOrigin.x,
+															Int128::mul(face->dir0.y * r0, m11) - Int128::mul(face->dir0.y * r1, m01)
+															+ Int128::mul(face->dir1.y * r1, m00) - Int128::mul(face->dir1.y * r0, m10) + det * shiftedOrigin.y,
+															Int128::mul(face->dir0.z * r0, m11) - Int128::mul(face->dir0.z * r1, m01)
+															+ Int128::mul(face->dir1.z * r1, m00) - Int128::mul(face->dir1.z * r0, m10) + det * shiftedOrigin.z,
+															det); // exact rational position: numerators for x, y, z over the common denominator det
+			v->point.x = (btInt32_t) v->point128.xvalue(); // integer approximation kept alongside the exact value
+			v->point.y = (btInt32_t) v->point128.yvalue();
+			v->point.z = (btInt32_t) v->point128.zvalue();
+			intersection->target = v;
+			v->edges = e;
+			stack.push_back(v); // record group: kept vertex, removed vertices..., NULL terminator
+			stack.push_back(removed);
+			stack.push_back(NULL);
+		}
+		if (cmp || prevCmp || (prevIntersection->reverse->next->target != intersection->target)) // a new edge of the shifted face is needed between the two intersection points
+		{
+			faceEdge = newEdgePair(prevIntersection->target, intersection->target);
+			if (prevCmp == 0)
+			{
+				faceEdge->link(prevIntersection->reverse->next);
+			}
+			if ((prevCmp == 0) || prevFaceEdge)
+			{
+				prevIntersection->reverse->link(faceEdge);
+			}
+			if (cmp == 0)
+			{
+				intersection->reverse->prev->link(faceEdge->reverse);
+			}
+			faceEdge->reverse->link(intersection->reverse);
+		}
+		else // both end points lie on the plane and are already connected: reuse the existing edge
+		{
+			faceEdge = prevIntersection->reverse->next;
+		}
+		if (prevFaceEdge)
+		{
+			if (prevCmp > 0)
+			{
+				faceEdge->link(prevFaceEdge->reverse);
+			}
+			else if (faceEdge != prevFaceEdge->reverse)
+			{
+				stack.push_back(prevFaceEdge->target); // kept vertex of a new record group
+				while (faceEdge->next != prevFaceEdge->reverse) // remove the edges between the two face edges and record their targets
+				{
+					Vertex* removed = faceEdge->next->target;
+					removeEdgePair(faceEdge->next);
+					stack.push_back(removed);
+					b3Printf("2: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
+				}
+				stack.push_back(NULL);
+			}
+		}
+		faceEdge->face = face;
+		faceEdge->reverse->face = intersection->face;
+		if (!firstFaceEdge)
+		{
+			firstFaceEdge = faceEdge;
+		}
+	}
+	b3Printf("Needed %d iterations to process all intersections\n", m);
+	if (cmp > 0) // close the loop of face edges: connect the first face edge with the last one
+	{
+		firstFaceEdge->reverse->target = faceEdge->target;
+		firstIntersection->reverse->link(firstFaceEdge);
+		firstFaceEdge->link(faceEdge->reverse);
+	}
+	else if (firstFaceEdge != faceEdge->reverse)
+	{
+		stack.push_back(faceEdge->target);
+		while (firstFaceEdge->next != faceEdge->reverse)
+		{
+			Vertex* removed = firstFaceEdge->next->target;
+			removeEdgePair(firstFaceEdge->next);
+			stack.push_back(removed);
+			b3Printf("3: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
+		}
+		stack.push_back(NULL);
+	}
+	b3Assert(stack.size() > 0);
+	vertexList = stack[0]; // first recorded kept vertex becomes the hull's entry vertex
+	b3Printf("Removing part\n");
+	n = 0;
+	int pos = 0;
+	while (pos < stack.size()) // process the record groups; groups appended during processing are handled in later rounds
+	{
+		int end = stack.size();
+		while (pos < end)
+		{
+			Vertex* kept = stack[pos++];
+			kept->print();
+			bool deeper = false; // set once a new group for 'kept' has been opened
+			Vertex* removed;
+			while ((removed = stack[pos++]) != NULL) // all removed vertices of this group, up to the NULL terminator
+			{
+				n++;
+				kept->receiveNearbyFaces(removed);
+				while (removed->edges) // remove every remaining edge of the removed vertex and queue its neighbours for removal
+				{
+					if (!deeper)
+					{
+						deeper = true;
+						stack.push_back(kept);
+					}
+					stack.push_back(removed->edges->target);
+					removeEdgePair(removed->edges);
+				}
+			}
+			if (deeper)
+			{
+				stack.push_back(NULL);
+			}
+		}
+	}
+	b3Printf("Needed %d iterations to remove part\n", n);
+	stack.resize(0);
+	face->origin = shiftedOrigin; // the face now lies in the shifted plane
+	return true;
+static int getVertexCopy(b3ConvexHullInternal::Vertex* vertex, b3AlignedObjectArray<b3ConvexHullInternal::Vertex*>& vertices) // Returns the index stored in vertex->copy; if it is negative, appends the vertex to 'vertices' and assigns it the next free index first.
+	int index = vertex->copy;
+	if (index < 0) // negative copy means the vertex has no index yet
+	{
+		index = vertices.size();
+		vertex->copy = index;
+		vertices.push_back(vertex);
+		b3Printf("Vertex %d gets index *%d\n", vertex->point.index, index);
+	}
+	return index;
+b3Scalar b3ConvexHullComputer::compute(const void* coords, bool doubleCoords, int stride, int count, b3Scalar shrink, b3Scalar shrinkClamp) // Computes the hull of the input points, optionally shrinks it, and fills the vertices/edges/faces arrays. Returns the value reported by the shrink step (0 if no shrink was requested); on a negative value the output arrays are left empty.
+	if (count <= 0) // no input: empty output
+	{
+		vertices.clear();
+		edges.clear();
+		faces.clear();
+		return 0;
+	}
+	b3ConvexHullInternal hull;
+	hull.compute(coords, doubleCoords, stride, count);
+	b3Scalar shift = 0;
+	if ((shrink > 0) && ((shift = hull.shrink(shrink, shrinkClamp)) < 0)) // negative result means shrinking failed
+	{
+		vertices.clear();
+		edges.clear();
+		faces.clear();
+		return shift;
+	}
+	vertices.resize(0);
+	edges.resize(0);
+	faces.resize(0);
+	b3AlignedObjectArray<b3ConvexHullInternal::Vertex*> oldVertices; // internal vertices in output-index order; grows while it is being processed
+	getVertexCopy(hull.vertexList, oldVertices); // seed with the hull's entry vertex (index 0)
+	int copied = 0;
+	while (copied < oldVertices.size()) // first pass: copy vertices and create the output edges
+	{
+		b3ConvexHullInternal::Vertex* v = oldVertices[copied];
+		vertices.push_back(hull.getCoordinates(v));
+		b3ConvexHullInternal::Edge* firstEdge = v->edges;
+		if (firstEdge)
+		{
+			int firstCopy = -1; // output index of the first edge of this vertex
+			int prevCopy = -1; // output index of the previously handled edge of this vertex
+			b3ConvexHullInternal::Edge* e = firstEdge;
+			do
+			{
+				if (e->copy < 0) // edge pair not copied yet: append both halves to the output
+				{
+					int s = edges.size();
+					edges.push_back(Edge());
+					edges.push_back(Edge());
+					Edge* c = &edges[s];
+					Edge* r = &edges[s + 1];
+					e->copy = s;
+					e->reverse->copy = s + 1;
+					c->reverse = 1; // output links are offsets relative to the edge's own array position
+					r->reverse = -1;
+					c->targetVertex = getVertexCopy(e->target, oldVertices); // may append the target to oldVertices
+					r->targetVertex = copied;
+					b3Printf("      CREATE: Vertex *%d has edge to *%d\n", copied, c->getTargetVertex());
+				}
+				if (prevCopy >= 0)
+				{
+					edges[e->copy].next = prevCopy - e->copy; // output 'next' points to the previously handled edge, i.e. the reverse of the internal list order
+				}
+				else
+				{
+					firstCopy = e->copy;
+				}
+				prevCopy = e->copy;
+				e = e->next;
+			} while (e != firstEdge);
+			edges[firstCopy].next = prevCopy - firstCopy; // close the circular list
+		}
+		copied++;
+	}
+	for (int i = 0; i < copied; i++) // second pass: record one edge per face
+	{
+		b3ConvexHullInternal::Vertex* v = oldVertices[i];
+		b3ConvexHullInternal::Edge* firstEdge = v->edges;
+		if (firstEdge)
+		{
+			b3ConvexHullInternal::Edge* e = firstEdge;
+			do
+			{
+				if (e->copy >= 0) // edge not yet assigned to a recorded face
+				{
+					b3Printf("Vertex *%d has edge to *%d\n", i, edges[e->copy].getTargetVertex());
+					faces.push_back(e->copy);
+					b3ConvexHullInternal::Edge* f = e;
+					do // walk around the face and mark all of its edges as handled
+					{
+						b3Printf("   Face *%d\n", edges[f->copy].getTargetVertex());
+						f->copy = -1;
+						f = f->reverse->prev;
+					} while (f != e);
+				}
+				e = e->next;
+			} while (e != firstEdge);
+		}
+	}
+	return shift;
diff --git a/src/bullet/Bullet3Geometry/b3ConvexHullComputer.h b/src/bullet/Bullet3Geometry/b3ConvexHullComputer.h
new file mode 100644
index 00000000..6dcc931a
--- /dev/null
+++ b/src/bullet/Bullet3Geometry/b3ConvexHullComputer.h
@@ -0,0 +1,103 @@
+Copyright (c) 2011 Ole Kniemeyer, MAXON, www.maxon.net
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+/// Convex hull implementation based on Preparata and Hong
+/// See http://code.google.com/p/bullet/issues/detail?id=275
+/// Ole Kniemeyer, MAXON Computer GmbH
+class b3ConvexHullComputer
+	private:
+		// Shared implementation behind the two public compute() overloads below; they pass
+		// doubleCoords = false for float input and doubleCoords = true for double input.
+		b3Scalar compute(const void* coords, bool doubleCoords, int stride, int count, b3Scalar shrink, b3Scalar shrinkClamp);
+	public:
+		// One directed edge of the output hull. The accessors below add "next" and "reverse"
+		// to this edge's own address, i.e. both fields are offsets (in Edge elements) relative
+		// to the edge itself rather than absolute indices.
+		class Edge
+		{
+			private:
+				int next;			// relative offset to the next edge of the same vertex
+				int reverse;		// relative offset to the oppositely directed edge
+				int targetVertex;	// value returned by getTargetVertex()
+				friend class b3ConvexHullComputer;
+			public:
+				// The source of this edge is the target of its reverse edge.
+				int getSourceVertex() const
+				{
+					return (this + reverse)->targetVertex;
+				}
+				int getTargetVertex() const
+				{
+					return targetVertex;
+				}
+				const Edge* getNextEdgeOfVertex() const // clockwise list of all edges of a vertex
+				{
+					return this + next;
+				}
+				// Implemented as "next edge of vertex" taken from the reverse edge.
+				const Edge* getNextEdgeOfFace() const // counter-clockwise list of all edges of a face
+				{
+					return (this + reverse)->getNextEdgeOfVertex();
+				}
+				const Edge* getReverseEdge() const
+				{
+					return this + reverse;
+				}
+		};
+		// Vertices of the output hull
+		b3AlignedObjectArray<b3Vector3> vertices;
+		// Edges of the output hull
+		b3AlignedObjectArray<Edge> edges;
+		// Faces of the convex hull. Each entry is an index into the "edges" array pointing to an edge of the face. Faces are planar n-gons
+		b3AlignedObjectArray<int> faces;
+		/*
+		Compute convex hull of "count" vertices stored in "coords". "stride" is the difference in bytes
+		between the addresses of consecutive vertices. If "shrink" is positive, the convex hull is shrunken
+		by that amount (each face is moved by "shrink" length units towards the center along its normal).
+		If "shrinkClamp" is positive, "shrink" is clamped to not exceed "shrinkClamp * innerRadius", where "innerRadius"
+		is the minimum distance of a face to the center of the convex hull.
+		The returned value is the amount by which the hull has been shrunken. If it is negative, the amount was so large
+		that the resulting convex hull is empty.
+		The output convex hull can be found in the member variables "vertices", "edges", "faces".
+		*/
+		b3Scalar compute(const float* coords, int stride, int count, b3Scalar shrink, b3Scalar shrinkClamp)
+		{
+			return compute(coords, false, stride, count, shrink, shrinkClamp);
+		}
+		// same as above, but double precision
+		b3Scalar compute(const double* coords, int stride, int count, b3Scalar shrink, b3Scalar shrinkClamp)
+		{
+			return compute(coords, true, stride, count, shrink, shrinkClamp);
+		}
diff --git a/src/bullet/Bullet3Geometry/b3GeometryUtil.cpp b/src/bullet/Bullet3Geometry/b3GeometryUtil.cpp
new file mode 100644
index 00000000..dd80fed6
--- /dev/null
+++ b/src/bullet/Bullet3Geometry/b3GeometryUtil.cpp
@@ -0,0 +1,185 @@
+Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3GeometryUtil.h"
+  Make sure this dummy function never changes so that it
+  can be used by probes that are checking whether the
+  library is actually installed.
+extern "C"
+	// Declared and then defined with an empty body; the function takes no arguments and does nothing.
+	void b3BulletMathProbe ();
+	void b3BulletMathProbe () {}
+// Returns false as soon as one plane equation yields dot(plane, point) + plane[3] - margin > 0,
+// and true when no plane does (including when planeEquations is empty).
+bool	b3GeometryUtil::isPointInsidePlanes(const b3AlignedObjectArray<b3Vector3>& planeEquations, const b3Vector3& point, b3Scalar	margin)
+	const int planeCount = planeEquations.size();
+	int planeIdx = 0;
+	while (planeIdx < planeCount)
+	{
+		const b3Vector3& plane = planeEquations[planeIdx];
+		const b3Scalar signedDist = b3Scalar(plane.dot(point))+b3Scalar(plane[3])-margin;
+		if (signedDist>b3Scalar(0.))
+			return false;	// point is in front of this plane by more than the margin
+		++planeIdx;
+	}
+	return true;
+// Returns false as soon as one vertex yields dot(planeNormal, vertex) + planeNormal[3] - margin > 0,
+// and true when no vertex does (including when vertices is empty).
+bool	b3GeometryUtil::areVerticesBehindPlane(const b3Vector3& planeNormal, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar	margin)
+	const int vertexCount = vertices.size();
+	int vertexIdx = 0;
+	while (vertexIdx < vertexCount)
+	{
+		const b3Vector3& vertex = vertices[vertexIdx];
+		const b3Scalar signedDist = b3Scalar(planeNormal.dot(vertex))+b3Scalar(planeNormal[3])-margin;
+		if (signedDist>b3Scalar(0.))
+			return false;	// this vertex is in front of the plane by more than the margin
+		++vertexIdx;
+	}
+	return true;
+// Returns true when no entry of planeEquations has a dot product with planeEquation above 0.999,
+// i.e. when the candidate is not (nearly) parallel to a plane that is already stored.
+bool notExist(const b3Vector3& planeEquation,const b3AlignedObjectArray<b3Vector3>& planeEquations);
+bool notExist(const b3Vector3& planeEquation,const b3AlignedObjectArray<b3Vector3>& planeEquations)
+	const int storedCount = planeEquations.size();
+	int storedIdx = 0;
+	while (storedIdx < storedCount)
+	{
+		const b3Vector3& stored = planeEquations[storedIdx];
+		if (planeEquation.dot(stored) > b3Scalar(0.999))
+			return false;	// a matching plane is already present
+		++storedIdx;
+	}
+	return true;
+void	b3GeometryUtil::getPlaneEquationsFromVertices(b3AlignedObjectArray<b3Vector3>& vertices, b3AlignedObjectArray<b3Vector3>& planeEquationsOut )
+		const int numvertices = vertices.size();
+	// brute force:
+	for (int i=0;i<numvertices;i++)
+	{
+		const b3Vector3& N1 = vertices[i];
+		for (int j=i+1;j<numvertices;j++)
+		{
+			const b3Vector3& N2 = vertices[j];
+			for (int k=j+1;k<numvertices;k++)
+			{
+				const b3Vector3& N3 = vertices[k];
+				b3Vector3 planeEquation,edge0,edge1;
+				edge0 = N2-N1;
+				edge1 = N3-N1;
+				b3Scalar normalSign = b3Scalar(1.);
+				for (int ww=0;ww<2;ww++)
+				{
+					planeEquation = normalSign * edge0.cross(edge1);
+					if (planeEquation.length2() > b3Scalar(0.0001))
+					{
+						planeEquation.normalize();
+						if (notExist(planeEquation,planeEquationsOut))
+						{
+							planeEquation[3] = -planeEquation.dot(N1);
+								//check if inside, and replace supportingVertexOut if needed
+								if (areVerticesBehindPlane(planeEquation,vertices,b3Scalar(0.01)))
+								{
+									planeEquationsOut.push_back(planeEquation);
+								}
+						}
+					}
+					normalSign = b3Scalar(-1.);
+				}
+			}
+		}
+	}
+void	b3GeometryUtil::getVerticesFromPlaneEquations(const b3AlignedObjectArray<b3Vector3>& planeEquations , b3AlignedObjectArray<b3Vector3>& verticesOut )
+	const int numbrushes = planeEquations.size();
+	// brute force:
+	for (int i=0;i<numbrushes;i++)
+	{
+		const b3Vector3& N1 = planeEquations[i];
+		for (int j=i+1;j<numbrushes;j++)
+		{
+			const b3Vector3& N2 = planeEquations[j];
+			for (int k=j+1;k<numbrushes;k++)
+			{
+				const b3Vector3& N3 = planeEquations[k];
+				b3Vector3 n2n3; n2n3 = N2.cross(N3);
+				b3Vector3 n3n1; n3n1 = N3.cross(N1);
+				b3Vector3 n1n2; n1n2 = N1.cross(N2);
+				if ( ( n2n3.length2() > b3Scalar(0.0001) ) &&
+					 ( n3n1.length2() > b3Scalar(0.0001) ) &&
+					 ( n1n2.length2() > b3Scalar(0.0001) ) )
+				{
+					//point P out of 3 plane equations:
+					//	d1 ( N2 * N3 ) + d2 ( N3 * N1 ) + d3 ( N1 * N2 )  
+					//P =  -------------------------------------------------------------------------  
+					//   N1 . ( N2 * N3 )  
+					b3Scalar quotient = (N1.dot(n2n3));
+					if (b3Fabs(quotient) > b3Scalar(0.000001))
+					{
+						quotient = b3Scalar(-1.) / quotient;
+						n2n3 *= N1[3];
+						n3n1 *= N2[3];
+						n1n2 *= N3[3];
+						b3Vector3 potentialVertex = n2n3;
+						potentialVertex += n3n1;
+						potentialVertex += n1n2;
+						potentialVertex *= quotient;
+						//check if inside, and replace supportingVertexOut if needed
+						if (isPointInsidePlanes(planeEquations,potentialVertex,b3Scalar(0.01)))
+						{
+							verticesOut.push_back(potentialVertex);
+						}
+					}
+				}
+			}
+		}
+	}
diff --git a/src/bullet/Bullet3Geometry/b3GeometryUtil.h b/src/bullet/Bullet3Geometry/b3GeometryUtil.h
new file mode 100644
index 00000000..8b5fd7ad
--- /dev/null
+++ b/src/bullet/Bullet3Geometry/b3GeometryUtil.h
@@ -0,0 +1,42 @@
+Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+///The b3GeometryUtil helper class provides a few methods to convert between plane equations and vertices.
+class b3GeometryUtil
+	public:
+		// Appends to planeEquationsOut the planes found from the given vertices.
+		static void	getPlaneEquationsFromVertices(b3AlignedObjectArray<b3Vector3>& vertices, b3AlignedObjectArray<b3Vector3>& planeEquationsOut );
+		// Appends to verticesOut the vertices found from the given plane equations.
+		static void	getVerticesFromPlaneEquations(const b3AlignedObjectArray<b3Vector3>& planeEquations , b3AlignedObjectArray<b3Vector3>& verticesOut );
+		// NOTE(review): only this declaration is visible here; confirm that a definition exists.
+		static bool	isInside(const b3AlignedObjectArray<b3Vector3>& vertices, const b3Vector3& planeNormal, b3Scalar	margin);
+		// Tests one point against a list of plane equations, with a distance margin.
+		static bool	isPointInsidePlanes(const b3AlignedObjectArray<b3Vector3>& planeEquations, const b3Vector3& point, b3Scalar	margin);
+		// Tests a list of vertices against one plane, with a distance margin.
+		static bool	areVerticesBehindPlane(const b3Vector3& planeNormal, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar	margin);
+#endif //B3_GEOMETRY_UTIL_H
diff --git a/src/bullet/Bullet3Geometry/b3GrahamScan2dConvexHull.h b/src/bullet/Bullet3Geometry/b3GrahamScan2dConvexHull.h
new file mode 100644
index 00000000..1b933c52
--- /dev/null
+++ b/src/bullet/Bullet3Geometry/b3GrahamScan2dConvexHull.h
@@ -0,0 +1,117 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+// A b3Vector3 that additionally carries the sort key (m_angle) and the caller-supplied index
+// (m_orgIndex) used by the Graham scan in this header.
+struct b3GrahamVector3 : public b3Vector3
+	// m_angle is now zero-initialized: instances are copied into output arrays before any
+	// angle has been assigned, and copying an indeterminate value must be avoided.
+	b3GrahamVector3(const b3Vector3& org, int orgIndex)
+		:b3Vector3(org),
+			m_angle(b3Scalar(0.)),
+			m_orgIndex(orgIndex)
+	{
+	}
+	b3Scalar	m_angle;	// sort key, overwritten by the scan before sorting
+	int m_orgIndex;		// index passed to the constructor; used as final tie-breaker when sorting
+// Ordering predicate for b3GrahamVector3: by m_angle first, then by squared distance to the
+// anchor, then by m_orgIndex.
+struct b3AngleCompareFunc {
+	b3Vector3 m_anchor;
+	b3AngleCompareFunc(const b3Vector3& anchor)
+	: m_anchor(anchor) 
+	{
+	}
+	bool operator()(const b3GrahamVector3& a, const b3GrahamVector3& b) const {
+		// Primary key: the precomputed angle.
+		if (a.m_angle != b.m_angle)
+			return a.m_angle < b.m_angle;
+		// Secondary key: squared distance from the anchor (closer points first).
+		const b3Scalar distA = (a-m_anchor).length2();
+		const b3Scalar distB = (b-m_anchor).length2();
+		if (distA != distB)
+			return  distA < distB;
+		// Final tie-breaker: the original index.
+		return a.m_orgIndex < b.m_orgIndex;
+	}
+// Graham scan of "originalPoints" in the plane perpendicular to "normalAxis".
+//  - originalPoints: input points; with two or more points the array is reordered in place
+//    (anchor swapped to slot 0, the rest sorted) and every m_angle is overwritten.
+//  - hull: hull points are appended here; the array is not cleared first.
+//  - normalAxis: axis handed to b3PlaneSpace1 to obtain the in-plane axes and used for the
+//    orientation tests.
+// With zero or one input point, that point (if any) is appended to hull and the function returns.
+inline void b3GrahamScanConvexHull2D(b3AlignedObjectArray<b3GrahamVector3>& originalPoints, b3AlignedObjectArray<b3GrahamVector3>& hull, const b3Vector3& normalAxis)
+	b3Vector3 axis0,axis1;
+	b3PlaneSpace1(normalAxis,axis0,axis1);
+	if (originalPoints.size()<=1)
+	{
+		for (int i=0;i<originalPoints.size();i++)
+			hull.push_back(originalPoints[0]);
+		return;
+	}
+	//step1 : find anchor point with smallest projection on axis0 and move it to first location
+	for (int i=0;i<originalPoints.size();i++)
+	{
+		b3Scalar projL = originalPoints[i].dot(axis0);
+		b3Scalar projR = originalPoints[0].dot(axis0);
+		if (projL < projR)
+		{
+			originalPoints.swap(0,i);
+		}
+	}
+	//also precompute angles
+	// The anchor gets a hugely negative key so it would sort first; it is excluded from the sort anyway.
+	originalPoints[0].m_angle = -1e30f;
+	for (int i=1;i<originalPoints.size();i++)
+	{
+		b3Vector3 xvec = axis0;
+		b3Vector3 ar = originalPoints[i]-originalPoints[0];
+		// NOTE(review): ar.length() is zero for a point that coincides with the anchor, so this
+		// division is not guarded against duplicates of the anchor point.
+		originalPoints[i].m_angle = b3Cross(xvec, ar).dot(normalAxis) / ar.length();
+	}
+	//step 2: sort all points, based on 'angle' with this anchor
+	b3AngleCompareFunc comp(originalPoints[0]);
+	originalPoints.quickSortInternal(comp,1,originalPoints.size()-1);
+	int i;
+	for (i = 0; i<2; i++)
+		hull.push_back(originalPoints[i]);
+	//step 3: keep all 'convex' points and discard concave points (using back tracking)
+	for (; i != originalPoints.size(); i++)
+	{
+		bool isConvex = false;
+		while (!isConvex&& hull.size()>1) {
+			b3Vector3& a = hull[hull.size()-2];
+			b3Vector3& b = hull[hull.size()-1];
+			isConvex = b3Cross(a-b,a-originalPoints[i]).dot(normalAxis)> 0;
+			if (!isConvex)
+				hull.pop_back();
+			else
+				hull.push_back(originalPoints[i]);
+		}
+		// Back tracking can pop the hull down to a single point. The loop above then exits
+		// without adding the current point, and no later point could ever be added because
+		// the loop needs two hull points to run. Re-seed the hull with the current point.
+		if (hull.size() == 1)
+		{
+			hull.push_back(originalPoints[i]);
+		}
+	}
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h
new file mode 100644
index 00000000..09e271e6
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h
@@ -0,0 +1,44 @@
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "b3SapAabb.h"
+#include "Bullet3Common/shared/b3Int2.h"
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+// Abstract interface for a GPU broadphase: every operation below is pure virtual.
+class b3GpuBroadphaseInterface
+	// Function type of a factory that takes an OpenCL context, device and command queue and
+	// returns a pointer to a broadphase.
+	typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue  q);
+	virtual ~b3GpuBroadphaseInterface()
+	{
+	}
+	// Two proxy-creation entry points with identical signatures: one for regular proxies and
+	// one for "large" proxies.
+	virtual void createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)=0;
+	virtual void createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)=0;
+	// Pair computation; the "Host" variant is a separate entry point with the same signature.
+	virtual void  calculateOverlappingPairs(int maxPairs)=0;
+	virtual void  calculateOverlappingPairsHost(int maxPairs)=0;
+	//call writeAabbsToGpu after done making all changes (createProxy etc)
+	virtual void writeAabbsToGpu()=0;
+	// Raw OpenCL handles and counts for the AABB and pair buffers.
+	virtual cl_mem	getAabbBufferWS()=0;
+	virtual int	getNumOverlap()=0;
+	virtual cl_mem	getOverlappingPairBuffer()=0;
+	// Array-level access to the AABBs (GPU and CPU side), the pairs and the index lists.
+	virtual b3OpenCLArray<b3SapAabb>&	getAllAabbsGPU()=0;
+	virtual b3AlignedObjectArray<b3SapAabb>&	getAllAabbsCPU()=0;
+	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
+	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
+	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp
new file mode 100644
index 00000000..77248d11
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp
@@ -0,0 +1,384 @@
+#include "b3GpuGridBroadphase.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+#include "kernels/gridBroadphaseKernels.h"
+#include "kernels/sapKernels.h"
+//#include "kernels/gridBroadphase.cl"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
+#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl"
+// File-scope OpenCL kernel handles. They are assigned in the b3GpuGridBroadphase constructor
+// and passed to clReleaseKernel further down in this file.
+// NOTE(review): these are namespace-scope variables rather than members, so every
+// b3GpuGridBroadphase instance writes and releases the same handles - confirm that at most one
+// instance is expected to exist at a time.
+cl_kernel kCalcHashAABB;
+cl_kernel kClearCellStart;
+cl_kernel kFindCellStart;
+cl_kernel kFindOverlappingPairs;
+cl_kernel m_copyAabbsKernel;
+cl_kernel m_sap2Kernel;
+//int maxPairsPerBody = 64;
+// Copied into m_paramsCPU.m_gridSize[3] by the constructor.
+int maxBodiesPerCell = 256;//??
+// Fills the grid parameters, uploads them to m_paramsGPU, builds the SAP and grid kernels and
+// allocates the radix sorter.
+// NOTE(review): m_context, m_device and m_queue are read below, but no initialisation of them
+// from ctx/device/q is visible in this text - confirm they are set before the kernels are built.
+b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q )
+	// "gridSize" here is only used to derive the inverse cell size (1/3 per axis).
+	b3Vector3 gridSize = b3MakeVector3(3,3,3);
+	b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]);
+	// The grid itself has 128 cells along each axis.
+	m_paramsCPU.m_gridSize[0] = 128;
+	m_paramsCPU.m_gridSize[1] = 128;
+	m_paramsCPU.m_gridSize[2] = 128;
+	// Slot [3] is written twice with the same value: directly, then via the setter.
+	m_paramsCPU.m_gridSize[3] = maxBodiesPerCell;
+	m_paramsCPU.setMaxBodiesPerCell(maxBodiesPerCell);
+	m_paramsCPU.m_invCellSize[0] = invGridSize[0];
+	m_paramsCPU.m_invCellSize[1] = invGridSize[1];
+	m_paramsCPU.m_invCellSize[2] = invGridSize[2];
+	m_paramsCPU.m_invCellSize[3] = 0.f;
+	m_paramsGPU.push_back(m_paramsCPU);
+	cl_int errNum=0;
+	{
+		// Kernels from the SAP program source (sapCL).
+		const char* sapSrc = sapCL;
+		cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+		m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
+		m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
+		b3Assert(errNum==CL_SUCCESS);
+	}
+	{
+		// Kernels from the grid broadphase program source (gridBroadphaseCL).
+		cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+		kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg);
+		b3Assert(errNum==CL_SUCCESS);
+		kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg);
+		b3Assert(errNum==CL_SUCCESS);
+		kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg);
+		b3Assert(errNum==CL_SUCCESS);
+		kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg);
+		b3Assert(errNum==CL_SUCCESS);
+	}
+	m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
+	// NOTE(review): the statements below release every kernel created above and delete the
+	// sorter, i.e. they read as teardown code; the signature they belong to is not visible in
+	// this text - confirm they sit in the destructor.
+	clReleaseKernel( kCalcHashAABB);
+	clReleaseKernel( kClearCellStart);
+	clReleaseKernel( kFindCellStart);
+	clReleaseKernel( kFindOverlappingPairs);
+	clReleaseKernel( m_sap2Kernel);
+	clReleaseKernel( m_copyAabbsKernel);
+	delete m_sorter;
+// Appends a new AABB to m_allAabbsCPU1 and records its index in the small-AABB mapping.
+// userPtr is stored in m_minIndices[3]; the AABB's own index goes into m_signedMaxIndices[3].
+// collisionFilterGroup and collisionFilterMask are not used by this implementation.
+void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
+	b3SapAabb entry;
+	entry.m_minVec = aabbMin;
+	entry.m_maxVec = aabbMax;
+	// The index fields are written after the vector assignments, in the same order as before.
+	entry.m_minIndices[3] = userPtr;
+	const int newSlot = m_allAabbsCPU1.size();	// index the AABB will occupy once appended
+	entry.m_signedMaxIndices[3] = newSlot;//NOT userPtr;
+	m_smallAabbsMappingCPU.push_back(newSlot);
+	m_allAabbsCPU1.push_back(entry);
+// Appends a new AABB to m_allAabbsCPU1 and records its index in the large-AABB mapping.
+// userPtr is stored in m_minIndices[3]; the AABB's own index goes into m_signedMaxIndices[3].
+// collisionFilterGroup and collisionFilterMask are not used by this implementation.
+void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
+	b3SapAabb entry;
+	entry.m_minVec = aabbMin;
+	entry.m_maxVec = aabbMax;
+	// The index fields are written after the vector assignments, in the same order as before.
+	entry.m_minIndices[3] = userPtr;
+	const int newSlot = m_allAabbsCPU1.size();	// index the AABB will occupy once appended
+	entry.m_signedMaxIndices[3] = newSlot;//NOT userPtr;
+	m_largeAabbsMappingCPU.push_back(newSlot);
+	m_allAabbsCPU1.push_back(entry);
+// Computes overlapping pairs on the GPU and leaves them in m_gpuPairs.
+//  1. If both large and small AABBs exist, m_sap2Kernel is run over the large and small mappings.
+//  2. If small AABBs exist, the grid kernels run: hash (kCalcHashAABB), radix sort, clear and
+//     find cell starts, then kFindOverlappingPairs.
+// Both stages write into m_gpuPairs and share one GPU-side pair counter.
+// maxPairs is the capacity of m_gpuPairs; a larger reported count is logged and clamped.
+// On return m_gpuPairs.size() is the clamped pair count, or 0 when there are no small AABBs.
+void  b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
+	B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs");
+	if (0)
+	{
+		calculateOverlappingPairsHost(maxPairs);
+	/*
+		b3AlignedObjectArray<b3Int4> cpuPairs;
+		m_gpuPairs.copyToHost(cpuPairs);
+		printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size());
+		for (int i=0;i<m_gpuPairs.size();i++)
+		{
+			printf("host pair %d = %d,%d\n",i,cpuPairs[i].x,cpuPairs[i].y);
+		}
+		*/
+		return;
+	}
+	int numSmallAabbs = m_smallAabbsMappingGPU.size();
+	// Single-element GPU counter shared by both pair-producing kernels.
+	b3OpenCLArray<int> pairCount(m_context,m_queue);
+	pairCount.push_back(0);
+	// Reserve the full capacity up front; it is trimmed to the real count at the end.
+	m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody);
+	{
+		int numLargeAabbs = m_largeAabbsMappingGPU.size();
+		if (numLargeAabbs && numSmallAabbs)
+		{
+			B3_PROFILE("sap2Kernel");
+			b3BufferInfoCL bInfo[] = { 
+				b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ),
+				b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ),
+				b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), 
+				b3BufferInfoCL( m_gpuPairs.getBufferCL() ), 
+				b3BufferInfoCL(pairCount.getBufferCL())};
+			b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(   numLargeAabbs  );
+			launcher.setConst( numSmallAabbs);
+			launcher.setConst( 0  );//axis is not used
+			launcher.setConst( maxPairs  );
+	//@todo: use actual maximum work item sizes of the device instead of hardcoded values
+			launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
+			// Read back only to report overflow; the final resize happens after the grid stage.
+			int numPairs = pairCount.at(0);
+			if (numPairs >maxPairs)
+			{
+				b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
+				numPairs =maxPairs;
+			}
+		}
+	}
+	if (numSmallAabbs)
+	{
+		B3_PROFILE("gridKernel");
+		m_hashGpu.resize(numSmallAabbs);
+		{
+			B3_PROFILE("kCalcHashAABB");
+			b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB");
+			launch.setConst(numSmallAabbs);
+			launch.setBuffer(m_allAabbsGPU1.getBufferCL());
+			launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
+			launch.setBuffer(m_hashGpu.getBufferCL());
+			launch.setBuffer(this->m_paramsGPU.getBufferCL());
+			launch.launch1D(numSmallAabbs);
+		}
+		m_sorter->execute(m_hashGpu);
+		// One cell-start slot per grid cell (product of the three grid dimensions).
+		int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2];
+		m_cellStartGpu.resize(numCells);
+		//b3AlignedObjectArray<int >			cellStartCpu;
+		{
+			B3_PROFILE("kClearCellStart");
+			b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart");
+			launch.setConst(numCells);
+			launch.setBuffer(m_cellStartGpu.getBufferCL());
+			launch.launch1D(numCells);
+			//m_cellStartGpu.copyToHost(cellStartCpu);
+			//printf("??\n");
+		}
+		{
+			B3_PROFILE("kFindCellStart");
+			b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart");
+			launch.setConst(numSmallAabbs);
+			launch.setBuffer(m_hashGpu.getBufferCL());
+			launch.setBuffer(m_cellStartGpu.getBufferCL());
+			launch.launch1D(numSmallAabbs);
+			//m_cellStartGpu.copyToHost(cellStartCpu);
+			//printf("??\n");
+		}
+		{
+			B3_PROFILE("kFindOverlappingPairs");
+			b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs");
+			launch.setConst(numSmallAabbs);
+			launch.setBuffer(m_allAabbsGPU1.getBufferCL());
+			launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
+			launch.setBuffer(m_hashGpu.getBufferCL());
+			launch.setBuffer(m_cellStartGpu.getBufferCL());
+			launch.setBuffer(m_paramsGPU.getBufferCL());
+			//launch.setBuffer(0);
+			launch.setBuffer(pairCount.getBufferCL());
+			launch.setBuffer(m_gpuPairs.getBufferCL());
+			launch.setConst(maxPairs);
+			launch.launch1D(numSmallAabbs);
+			int numPairs = pairCount.at(0);
+			if (numPairs >maxPairs)
+			{
+				b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
+				numPairs =maxPairs;
+			}
+			// Trim the pair array from its reserved capacity down to the real count.
+			m_gpuPairs.resize(numPairs);
+			if (0)
+			{
+				b3AlignedObjectArray<b3Int4> pairsCpu;
+				m_gpuPairs.copyToHost(pairsCpu);
+				int sz = m_gpuPairs.size();
+				printf("m_gpuPairs.size()=%d\n",sz);
+				for (int i=0;i<m_gpuPairs.size();i++)
+				{
+					printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y);
+				}
+				printf("?!?\n");
+			}
+		}
+	}
+	else
+	{
+		// Neither kernel stage ran (both require small AABBs), so no pair was written.
+		// Without this the array would keep the maxPairs size reserved above and
+		// getNumOverlap() would report maxPairs unwritten entries.
+		m_gpuPairs.resize(0);
+	}
+	//calculateOverlappingPairsHost(maxPairs);
+void  b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
+	m_hostPairs.resize(0);
+	m_allAabbsGPU1.copyToHost(m_allAabbsCPU1);
+	for (int i=0;i<m_allAabbsCPU1.size();i++)
+	{
+		for (int j=i+1;j<m_allAabbsCPU1.size();j++)
+		{
+			if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec,
+				m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec))
+			{
+				b3Int4 pair;
+				int a = m_allAabbsCPU1[j].m_minIndices[3];
+				int b = m_allAabbsCPU1[i].m_minIndices[3];
+				if (a<=b)
+				{
+					pair.x = a; 
+					pair.y = b;//store the original index in the unsorted aabb array
+				} else
+				{
+					pair.x = b;
+					pair.y = a;//store the original index in the unsorted aabb array
+				}
+				if (m_hostPairs.size()<maxPairs)
+				{
+					m_hostPairs.push_back(pair);
+				}
+			}
+		}
+	}
+	m_gpuPairs.copyFromHost(m_hostPairs);
+	//call writeAabbsToGpu after done making all changes (createProxy etc)
+// Uploads the three CPU-side arrays (all AABBs, small-AABB mapping, large-AABB mapping) to their
+// GPU counterparts.
+void b3GpuGridBroadphase::writeAabbsToGpu()
+	m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1);
+	m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
+	m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
+// Accessors: each returns a member array by reference, its OpenCL buffer handle, or its size.
+// Returns the OpenCL buffer behind the GPU AABB array.
+cl_mem	b3GpuGridBroadphase::getAabbBufferWS()
+	return this->m_allAabbsGPU1.getBufferCL();
+// Returns the current size of the GPU pair array.
+int	b3GpuGridBroadphase::getNumOverlap()
+	return m_gpuPairs.size();
+// Returns the OpenCL buffer behind the GPU pair array.
+cl_mem	b3GpuGridBroadphase::getOverlappingPairBuffer()
+	return m_gpuPairs.getBufferCL();
+// Returns the GPU AABB array.
+b3OpenCLArray<b3SapAabb>&	b3GpuGridBroadphase::getAllAabbsGPU()
+	return m_allAabbsGPU1;
+// Returns the CPU AABB array.
+b3AlignedObjectArray<b3SapAabb>&	b3GpuGridBroadphase::getAllAabbsCPU()
+	return m_allAabbsCPU1;
+// Returns the GPU pair array.
+b3OpenCLArray<b3Int4>& b3GpuGridBroadphase::getOverlappingPairsGPU()
+	return m_gpuPairs;
+// Returns the GPU array of small-AABB indices.
+b3OpenCLArray<int>& b3GpuGridBroadphase::getSmallAabbIndicesGPU()
+	return m_smallAabbsMappingGPU;
+// Returns the GPU array of large-AABB indices.
+b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
+	return m_largeAabbsMappingGPU;
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h
new file mode 100644
index 00000000..9694a362
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h
@@ -0,0 +1,88 @@
+#include "b3GpuBroadphaseInterface.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+// Parameter block for the grid broadphase: two 4-element arrays.
+struct b3ParamsGridBroadphaseCL
+	float m_invCellSize[4];
+	// Element [3] doubles as the "max bodies per cell" value (see the accessors below).
+	int   m_gridSize[4];
+	// Reads the value stored in m_gridSize[3].
+	int	getMaxBodiesPerCell() const
+	{
+		return m_gridSize[3];
+	}
+	// Stores maxOverlap in m_gridSize[3].
+	void setMaxBodiesPerCell(int maxOverlap) 
+	{
+		m_gridSize[3] = maxOverlap;
+	}
+// Grid-based implementation of b3GpuBroadphaseInterface.
+class b3GpuGridBroadphase : public b3GpuBroadphaseInterface
+	// OpenCL handles this object works with.
+	cl_context				m_context;
+	cl_device_id			m_device;
+	cl_command_queue		m_queue;
+	// All AABBs (small and large), kept as a GPU array and a CPU array.
+	b3OpenCLArray<b3SapAabb>	m_allAabbsGPU1;
+	b3AlignedObjectArray<b3SapAabb>	m_allAabbsCPU1;
+	// Index lists for the "small" and "large" AABBs, each with a GPU and a CPU copy.
+	b3OpenCLArray<int>	m_smallAabbsMappingGPU;
+	b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
+	b3OpenCLArray<int>	m_largeAabbsMappingGPU;
+	b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
+	// Overlapping pairs: host-side list and GPU-side list.
+	b3AlignedObjectArray<b3Int4> m_hostPairs;
+	b3OpenCLArray<b3Int4>			m_gpuPairs;
+	// Working buffers of the grid kernels: sortable hash entries and per-cell start values.
+	b3OpenCLArray<b3SortData>			m_hashGpu;
+	b3OpenCLArray<int>			m_cellStartGpu;
+	// Grid parameters, as a CPU struct and as a GPU array.
+	b3ParamsGridBroadphaseCL		m_paramsCPU;
+	b3OpenCLArray<b3ParamsGridBroadphaseCL>		m_paramsGPU;
+	// Radix sorter, held by raw pointer.
+	class b3RadixSort32CL*			m_sorter;
+	b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q );
+	virtual ~b3GpuGridBroadphase();
+	// Factory: returns a new b3GpuGridBroadphase allocated with "new".
+	static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue  q)
+	{
+		return new b3GpuGridBroadphase(ctx,device,q);
+	}
+	virtual void createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
+	virtual void createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
+	virtual void  calculateOverlappingPairs(int maxPairs);
+	virtual void  calculateOverlappingPairsHost(int maxPairs);
+	//call writeAabbsToGpu after done making all changes (createProxy etc)
+	virtual void writeAabbsToGpu();
+	virtual cl_mem	getAabbBufferWS();
+	virtual int	getNumOverlap();
+	virtual cl_mem	getOverlappingPairBuffer();
+	virtual b3OpenCLArray<b3SapAabb>&	getAllAabbsGPU();
+	virtual b3AlignedObjectArray<b3SapAabb>&	getAllAabbsCPU();
+	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
+	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
+	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp
new file mode 100644
index 00000000..641df9eb
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp
@@ -0,0 +1,577 @@
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Initial Author Jackson Lee, 2014
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "b3GpuParallelLinearBvh.h"
+b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :	//Stores the queue, constructs every GPU array on (context, queue), then compiles the program and one kernel per stage
+	m_queue(queue),
+	m_radixSorter(context, device, queue),
+	m_rootNodeIndex(context, queue),
+	m_maxDistanceFromRoot(context, queue),
+	m_temp(context, queue),
+	m_internalNodeAabbs(context, queue),
+	m_internalNodeLeafIndexRanges(context, queue),
+	m_internalNodeChildNodes(context, queue),
+	m_internalNodeParentNodes(context, queue),
+	m_commonPrefixes(context, queue),
+	m_commonPrefixLengths(context, queue),
+	m_distanceFromRoot(context, queue),
+	m_leafNodeParentNodes(context, queue),
+	m_mortonCodesAndAabbIndicies(context, queue),
+	m_mergedAabb(context, queue),
+	m_leafNodeAabbs(context, queue),
+	m_largeAabbs(context, queue)
+	m_rootNodeIndex.resize(1);	//the next three arrays each hold a single int
+	m_maxDistanceFromRoot.resize(1);
+	m_temp.resize(1);
+	//Compile the program from the source string parallelLinearBvhCL, then create each kernel from that program by name
+	const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
+	const char* kernelSource = parallelLinearBvhCL;	//parallelLinearBvhCL.h
+	cl_int error;	//passed to every compile call below but never read here; only the returned handles are asserted on
+	char* additionalMacros = 0;	//null: nothing extra is passed to the compile calls
+	m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
+	b3Assert(m_parallelLinearBvhProgram);
+	m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_separateAabbsKernel);
+	m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_findAllNodesMergedAabbKernel);
+	m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
+	m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_computeAdjacentPairCommonPrefixKernel);
+	m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
+	m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
+	m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_findDistanceFromRootKernel);
+	m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
+	m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_findLeafIndexRangesKernel);
+	m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_plbvhCalculateOverlappingPairsKernel);
+	m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_plbvhRayTraverseKernel);
+	m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_plbvhLargeAabbAabbTestKernel);
+	m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_plbvhLargeAabbRayTestKernel);
+	clReleaseKernel(m_separateAabbsKernel);	//NOTE(review): from here on every kernel and then the program is released; presumably this is the destructor body whose signature line is missing from this text - confirm
+	clReleaseKernel(m_findAllNodesMergedAabbKernel);
+	clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
+	clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
+	clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
+	clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
+	clReleaseKernel(m_findDistanceFromRootKernel);
+	clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
+	clReleaseKernel(m_findLeafIndexRangesKernel);
+	clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
+	clReleaseKernel(m_plbvhRayTraverseKernel);
+	clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
+	clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
+	clReleaseProgram(m_parallelLinearBvhProgram);	//released last, after all kernels created from it
+void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, 
+									const b3OpenCLArray<int>& largeAabbIndices)	//Rebuilds the BVH: separates large/small AABBs, assigns and sorts morton codes, builds the radix tree, then finds the leaf index range of each internal node
+	B3_PROFILE("b3ParallelLinearBvh::build()");
+	int numLargeAabbs = largeAabbIndices.size();
+	int numSmallAabbs = smallAabbIndices.size();
+	//Since all AABBs(both large and small) are input as a contiguous array, 
+	//with 2 additional arrays used to indicate the indices of large and small AABBs,
+	//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
+	{
+		B3_PROFILE("Separate large and small AABBs");
+		m_largeAabbs.resize(numLargeAabbs);
+		m_leafNodeAabbs.resize(numSmallAabbs);
+		//Write large AABBs into m_largeAabbs
+		{
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
+				b3BufferInfoCL( largeAabbIndices.getBufferCL() ),
+				b3BufferInfoCL( m_largeAabbs.getBufferCL() )
+			};
+			b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(numLargeAabbs);
+			launcher.launch1D(numLargeAabbs);
+		}
+		//Write small AABBs into m_leafNodeAabbs
+		{
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
+				b3BufferInfoCL( smallAabbIndices.getBufferCL() ),
+				b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() )
+			};
+			b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");	//same kernel as above, only the index list and destination differ
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(numSmallAabbs);
+			launcher.launch1D(numSmallAabbs);
+		}
+		clFinish(m_queue);
+	}
+	//From here on only the small AABBs are used; they become the leaves of the BVH
+	int numLeaves = numSmallAabbs;	//Number of leaves in the BVH == Number of rigid bodies with small AABBs
+	int numInternalNodes = numLeaves - 1;
+	if(numLeaves < 2)	//0 or 1 leaves: no internal nodes exist, so set the root index directly and return early
+	{
+		//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
+		//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
+		int rootNodeIndex = numLeaves - 1;
+		m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
+		//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
+		//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
+		//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
+		//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
+		if(numLeaves == 1)
+		{
+			b3SortData leaf;
+			leaf.m_value = 0;		//1 leaf so index is always 0; leaf.m_key does not need to be set
+			m_mortonCodesAndAabbIndicies.resize(1);
+			m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
+		}
+		return;
+	}
+	//Size the per-internal-node and per-leaf arrays for this build
+	{
+		m_internalNodeAabbs.resize(numInternalNodes);
+		m_internalNodeLeafIndexRanges.resize(numInternalNodes);
+		m_internalNodeChildNodes.resize(numInternalNodes);
+		m_internalNodeParentNodes.resize(numInternalNodes);
+		m_commonPrefixes.resize(numInternalNodes);
+		m_commonPrefixLengths.resize(numInternalNodes);
+		m_distanceFromRoot.resize(numInternalNodes);
+		m_leafNodeParentNodes.resize(numLeaves);
+		m_mortonCodesAndAabbIndicies.resize(numLeaves);
+		m_mergedAabb.resize(numLeaves);	//one slot per leaf because it starts out as a full copy of m_leafNodeAabbs (see below)
+	}
+	//Find the merged AABB of all small AABBs; this is used to define the size of 
+	//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
+	{
+		B3_PROFILE("Find AABB of merged nodes");
+		m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs);	//Need to make a copy since the kernel modifies the array
+		for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; 
+				numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)	//next count is ceil(n/2); the loop ends once it reaches 1
+		{
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( m_mergedAabb.getBufferCL() )		//Resulting AABB is stored in m_mergedAabb[0]
+			};
+			b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(numAabbsNeedingMerge);
+			launcher.launch1D(numAabbsNeedingMerge);
+		}
+		clFinish(m_queue);
+	}
+	//Insert the center of the AABBs into a virtual grid,
+	//then convert the discrete grid coordinates into a morton code
+	//For each element in m_mortonCodesAndAabbIndicies, set
+	//	m_key == morton code (value to sort by)
+	//	m_value == small AABB index
+	{
+		B3_PROFILE("Assign morton codes");
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
+			b3BufferInfoCL( m_mergedAabb.getBufferCL() ),
+			b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
+		};
+		b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(numLeaves);
+		launcher.launch1D(numLeaves);
+		clFinish(m_queue);
+	}
+	//Run the radix sorter over m_mortonCodesAndAabbIndicies (m_key is the value to sort by, see above)
+	{
+		B3_PROFILE("Sort leaves by morton codes");
+		m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
+		clFinish(m_queue);
+	}
+	//Link the sorted leaves into a binary radix tree and set the internal node AABBs
+	constructBinaryRadixTree();
+	//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
+	//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
+	//The child nodes of each node split their parent's index range into 2 contiguous halves.
+	//
+	//For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31].
+	//The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31].
+	//
+	//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
+	{
+		B3_PROFILE("m_findLeafIndexRangesKernel");
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() )
+		};
+		b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(numInternalNodes);
+		launcher.launch1D(numInternalNodes);
+		clFinish(m_queue);
+	}
+void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs)	//Runs the small-small and large-small pair kernels into out_overlappingPairs, then resizes it to the (clamped) pair count
+	int maxPairs = out_overlappingPairs.size();	//capacity is taken from the incoming size of the output array
+	b3OpenCLArray<int>& numPairsGpu = m_temp;	//m_temp serves as the GPU-side pair counter
+	int reset = 0;
+	numPairsGpu.copyFromHostPointer(&reset, 1);
+	//Small vs small: skipped unless there are at least 2 leaf AABBs; every leaf AABB is a query
+	if( m_leafNodeAabbs.size() > 1 )
+	{
+		B3_PROFILE("PLBVH small-small AABB test");
+		int numQueryAabbs = m_leafNodeAabbs.size();
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
+			b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
+			b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
+			b3BufferInfoCL( numPairsGpu.getBufferCL() ),
+			b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
+		};
+		b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(maxPairs);
+		launcher.setConst(numQueryAabbs);
+		launcher.launch1D(numQueryAabbs);
+		clFinish(m_queue);
+	}
+	int numLargeAabbRigids = m_largeAabbs.size();
+	if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 )	//large vs small: needs at least one of each; shares the counter and output buffer with the pass above
+	{
+		B3_PROFILE("PLBVH large-small AABB test");
+		int numQueryAabbs = m_leafNodeAabbs.size();
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
+			b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
+			b3BufferInfoCL( numPairsGpu.getBufferCL() ),
+			b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
+		};
+		b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(maxPairs);
+		launcher.setConst(numLargeAabbRigids);
+		launcher.setConst(numQueryAabbs);
+		launcher.launch1D(numQueryAabbs);
+		clFinish(m_queue);
+	}
+	//Read the pair counter back to the host and clamp it to the capacity
+	int numPairs = -1;
+	numPairsGpu.copyToHostPointer(&numPairs, 1);
+	if(numPairs > maxPairs)
+	{
+		b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
+		numPairs = maxPairs;
+		numPairsGpu.copyFromHostPointer(&maxPairs, 1);	//write the clamped value back so the GPU-side counter agrees with the host
+	}
+	out_overlappingPairs.resize(numPairs);
+void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, 
+							b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)	//Launches one work item per ray against the BVH (small AABBs) and against the large AABB list; both passes share the counter and the pair buffer
+	B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
+	int numRays = rays.size();
+	int maxRayRigidPairs = out_rayRigidPairs.size();	//capacity is the incoming size of the output array
+	int reset = 0;
+	out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
+	//Rays vs small AABBs: runs with a single leaf too (unlike the pair test, which needs 2)
+	if( m_leafNodeAabbs.size() > 0 )
+	{
+		B3_PROFILE("PLBVH ray test small AABB");
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
+			b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
+			b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
+			b3BufferInfoCL( rays.getBufferCL() ),
+			b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
+			b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
+		};
+		b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(maxRayRigidPairs);
+		launcher.setConst(numRays);
+		launcher.launch1D(numRays);
+		clFinish(m_queue);
+	}
+	int numLargeAabbRigids = m_largeAabbs.size();
+	if(numLargeAabbRigids > 0)
+	{
+		B3_PROFILE("PLBVH ray test large AABB");
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
+			b3BufferInfoCL( rays.getBufferCL() ),
+			b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
+			b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
+		};
+		b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(numLargeAabbRigids);	//note: the large AABB count is passed before the max pair count here, the reverse of the m_plbvhLargeAabbAabbTestKernel launch
+		launcher.setConst(maxRayRigidPairs);
+		launcher.setConst(numRays);
+		launcher.launch1D(numRays);
+		clFinish(m_queue);
+	}
+	//Read the total back; unlike calculateOverlappingPairs(), overflow is only reported - nothing is clamped or resized here
+	int numRayRigidPairs = -1;
+	out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
+	if(numRayRigidPairs > maxRayRigidPairs)
+		b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
+void b3GpuParallelLinearBvh::constructBinaryRadixTree()	//Runs the tree construction kernels in order: common prefixes, leaf parents, internal node parents, distance from root, then internal node AABBs
+	B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
+	int numLeaves = m_leafNodeAabbs.size();
+	int numInternalNodes = numLeaves - 1;
+	//Each internal node is placed in between 2 leaf nodes.
+	//By using this arrangement and computing the common prefix between
+	//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
+	{
+		B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
+			b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
+			b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() )
+		};
+		b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(numInternalNodes);
+		launcher.launch1D(numInternalNodes);
+		clFinish(m_queue);
+	}
+	//For each leaf node, select its parent node by 
+	//comparing the 2 nearest internal nodes and assign child node indices
+	{
+		B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
+			b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() )
+		};
+		b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(numLeaves);
+		launcher.launch1D(numLeaves);
+		clFinish(m_queue);
+	}
+	//For each internal node, perform 2 binary searches among the other internal nodes
+	//to its left and right to find its potential parent nodes and assign child node indices
+	{
+		B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
+			b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
+			b3BufferInfoCL( m_rootNodeIndex.getBufferCL() )
+		};
+		b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(numInternalNodes);
+		launcher.launch1D(numInternalNodes);
+		clFinish(m_queue);
+	}
+	//Find the number of nodes separating each internal node and the root node
+	//so that the AABBs can be set using the next kernel.
+	//Also determine the maximum number of nodes separating an internal node and the root node.
+	{
+		B3_PROFILE("m_findDistanceFromRootKernel");
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
+			b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ),
+			b3BufferInfoCL( m_distanceFromRoot.getBufferCL() )
+		};
+		b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(numInternalNodes);
+		launcher.launch1D(numInternalNodes);
+		clFinish(m_queue);
+	}
+	//Starting from the internal nodes nearest to the leaf nodes, recursively move up
+	//the tree towards the root to set the AABBs of each internal node; each internal node
+	//checks its children and merges their AABBs
+	{
+		B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
+		int maxDistanceFromRoot = -1;	//if the read-back below left this at -1, the loop would not execute at all
+		{
+			B3_PROFILE("copy maxDistanceFromRoot to CPU");
+			m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
+			clFinish(m_queue);
+		}
+		for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)	//one launch per level, deepest level first, ending with distance 0
+		{
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ),
+				b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
+				b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
+				b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
+				b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
+			};
+			b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(maxDistanceFromRoot);
+			launcher.setConst(distanceFromRoot);
+			launcher.setConst(numInternalNodes);
+			//It may seem inefficient to launch a thread for each internal node when a
+			//much smaller number of nodes is actually processed, but this is actually
+			//faster than determining the exact nodes that are ready to merge their child AABBs. 
+			launcher.launch1D(numInternalNodes);
+		}
+		clFinish(m_queue);	//single wait after all levels have been enqueued
+	}
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
new file mode 100644
index 00000000..effe617b
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
@@ -0,0 +1,125 @@
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Initial Author Jackson Lee, 2014
+//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
+#include "Bullet3Common/shared/b3Int2.h"
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
+#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h"
+#define b3Int64 cl_long
+///@brief GPU Parallel Linearized Bounding Volume Hierarchy(LBVH) that is reconstructed every frame
+///See presentation in docs/b3GpuParallelLinearBvh.pdf for algorithm details.
+///Related papers: \n
+///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n
+///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
+///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
+/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) 
+/// - [fully parallel] Sort morton codes
+/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH) 
+/// - [somewhat parallel] Set internal node AABBs 
+///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
+///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree.
+///Instead of searching for the child nodes of each internal node, we search for the parent node of each node.
+///Additionally, a non-atomic traversal that starts from the leaf nodes and moves towards the root node is used to set the AABBs.
+class b3GpuParallelLinearBvh
+	cl_command_queue m_queue;	//Queue passed to the constructor; every kernel launch and clFinish() in this class uses it
+	cl_program m_parallelLinearBvhProgram;	//Program that all kernels below are created from
+	cl_kernel m_separateAabbsKernel;
+	cl_kernel m_findAllNodesMergedAabbKernel;
+	cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
+	//Binary radix tree construction kernels
+	cl_kernel m_computeAdjacentPairCommonPrefixKernel;
+	cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;
+	cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;
+	cl_kernel m_findDistanceFromRootKernel;
+	cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;
+	cl_kernel m_findLeafIndexRangesKernel;
+	//Traversal kernels
+	cl_kernel m_plbvhCalculateOverlappingPairsKernel;
+	cl_kernel m_plbvhRayTraverseKernel;
+	cl_kernel m_plbvhLargeAabbAabbTestKernel;
+	cl_kernel m_plbvhLargeAabbRayTestKernel;
+	b3RadixSort32CL m_radixSorter;	//Used by build() to sort m_mortonCodesAndAabbIndicies
+	//1 element
+	b3OpenCLArray<int> m_rootNodeIndex;							//Most significant bit(0x80000000) is set to indicate internal node
+	b3OpenCLArray<int> m_maxDistanceFromRoot;					//Max number of internal nodes between an internal node and the root node
+	b3OpenCLArray<int> m_temp;									//Used to hold the number of pairs in calculateOverlappingPairs()
+	//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)
+	b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
+	b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges;		//x == min leaf index, y == max leaf index
+	b3OpenCLArray<b3Int2> m_internalNodeChildNodes;				//x == left child, y == right child; msb(0x80000000) is set to indicate internal node
+	b3OpenCLArray<int> m_internalNodeParentNodes;				//For parent node index, msb(0x80000000) is not set since it is always internal
+	//1 element per internal node; for binary radix tree construction
+	b3OpenCLArray<b3Int64> m_commonPrefixes;
+	b3OpenCLArray<int> m_commonPrefixLengths;
+	b3OpenCLArray<int> m_distanceFromRoot;						//Number of internal nodes between this node and the root
+	//1 element per leaf node (leaf nodes only include small AABBs)
+	b3OpenCLArray<int> m_leafNodeParentNodes;					//For parent node index, msb(0x80000000) is not set since it is always internal
+	b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies;		//m_key == morton code, m_value == aabb index in m_leafNodeAabbs
+	b3OpenCLArray<b3SapAabb> m_mergedAabb;						//m_mergedAabb[0] contains the merged AABB of all leaf nodes
+	b3OpenCLArray<b3SapAabb> m_leafNodeAabbs;					//Contains only small AABBs
+	//1 element per large AABB, which is not stored in the BVH
+	b3OpenCLArray<b3SapAabb> m_largeAabbs;
+	b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);
+	virtual ~b3GpuParallelLinearBvh();
+	///Must be called before any other function
+	void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, 
+				const b3OpenCLArray<int>& largeAabbIndices);
+	///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.
+	///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.
+	///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.
+	void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs);
+	///@param out_numRayRigidPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;
+	///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.
+	///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.
+	///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded.
+	void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, 
+								b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
+	void constructBinaryRadixTree();	//Helper called from build() once the morton codes are sorted; runs the tree construction kernels
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp
new file mode 100644
index 00000000..78b62563
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp
@@ -0,0 +1,80 @@
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Initial Author Jackson Lee, 2014
+#include "b3GpuParallelLinearBvhBroadphase.h"
+b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : 
+	m_plbvh(context, device, queue),	//the BVH takes the device as well; the plain GPU arrays below only take context and queue
+	m_overlappingPairsGpu(context, queue),
+	m_aabbsGpu(context, queue),
+	m_smallAabbsMappingGpu(context, queue),
+	m_largeAabbsMappingGpu(context, queue)
+void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask)	//Appends a small AABB on the CPU side only; collisionFilterGroup and collisionFilterMask are not used here
+	int newAabbIndex = m_aabbsCpu.size();	//index the new AABB will have once it is pushed below
+	b3SapAabb aabb;
+	aabb.m_minVec = aabbMin;
+	aabb.m_maxVec = aabbMax;
+	aabb.m_minIndices[3] = userPtr;	//user value is stored in the 4th min index
+	aabb.m_signedMaxIndices[3] = newAabbIndex;	//4th max index records this AABB's position in m_aabbsCpu
+	m_smallAabbsMappingCpu.push_back(newAabbIndex);	//registered in the small list
+	m_aabbsCpu.push_back(aabb);
+void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask)	//Same as createProxy() except that the index goes into the large list; the filter arguments are not used here
+	int newAabbIndex = m_aabbsCpu.size();	//index the new AABB will have once it is pushed below
+	b3SapAabb aabb;
+	aabb.m_minVec = aabbMin;
+	aabb.m_maxVec = aabbMax;
+	aabb.m_minIndices[3] = userPtr;	//user value is stored in the 4th min index
+	aabb.m_signedMaxIndices[3] = newAabbIndex;	//4th max index records this AABB's position in m_aabbsCpu
+	m_largeAabbsMappingCpu.push_back(newAabbIndex);	//registered in the large list; small and large AABBs share m_aabbsCpu
+	m_aabbsCpu.push_back(aabb);
+void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)	//Rebuilds the BVH from the GPU-side arrays, then collects at most maxPairs overlapping pairs into m_overlappingPairsGpu
+	//Reconstruct BVH
+	m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu);
+	//The size set here is what b3GpuParallelLinearBvh::calculateOverlappingPairs() reads as the pair capacity
+	m_overlappingPairsGpu.resize(maxPairs);
+	m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);
+void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)	//Placeholder: does nothing except assert; maxPairs is unused
+	b3Assert(0);	//CPU version not implemented
+void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu() 	//Copies the three CPU-side arrays filled by createProxy()/createLargeProxy() into their GPU counterparts
+	m_aabbsGpu.copyFromHost(m_aabbsCpu); 	//all AABBs, small and large together
+	m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);
+	m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
new file mode 100644
index 00000000..284f6c78
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
@@ -0,0 +1,66 @@
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Initial Author Jackson Lee, 2014
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
+#include "b3GpuParallelLinearBvh.h"
+class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface  // Broadphase that keeps AABBs on host and GPU and delegates pair finding to a b3GpuParallelLinearBvh.
+	b3GpuParallelLinearBvh m_plbvh;  // rebuilt on every calculateOverlappingPairs() call
+	b3OpenCLArray<b3Int4> m_overlappingPairsGpu;  // pair output; resized to maxPairs before each query
+	b3OpenCLArray<b3SapAabb> m_aabbsGpu;  // GPU copy of m_aabbsCpu
+	b3OpenCLArray<int> m_smallAabbsMappingGpu;  // GPU copy of m_smallAabbsMappingCpu
+	b3OpenCLArray<int> m_largeAabbsMappingGpu;  // GPU copy of m_largeAabbsMappingCpu
+	b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;  // every AABB created so far, small and large
+	b3AlignedObjectArray<int> m_smallAabbsMappingCpu;  // indices into m_aabbsCpu added by createProxy
+	b3AlignedObjectArray<int> m_largeAabbsMappingCpu;  // indices into m_aabbsCpu added by createLargeProxy
+	b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);
+	virtual ~b3GpuParallelLinearBvhBroadphase() {}  // nothing to release explicitly; members clean up themselves
+	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask);  // registers a small AABB (host side only)
+	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask);  // registers a large AABB (host side only)
+	virtual void calculateOverlappingPairs(int maxPairs);  // GPU path: rebuild BVH, then query pairs
+	virtual void calculateOverlappingPairsHost(int maxPairs);  // not implemented; asserts
+	//Call writeAabbsToGpu once all changes (createProxy, createLargeProxy, ...) are done so the GPU arrays match the host arrays
+	virtual void writeAabbsToGpu();
+	virtual int	getNumOverlap() { return m_overlappingPairsGpu.size(); }  // current size of the GPU pair array
+	virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }  // raw OpenCL buffer behind the pair array
+	virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }  // raw OpenCL buffer behind the AABB array
+	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
+	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }
+	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }
+	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; }
+	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }  // mutable access to the host AABBs
+	static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)  // factory returning a new instance through the interface type
+	{
+		return new b3GpuParallelLinearBvhBroadphase(context, device, queue);  // NOTE(review): allocated with new; presumably the caller deletes it - confirm
+	}
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
new file mode 100644
index 00000000..0d3fe078
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
@@ -0,0 +1,1322 @@
+bool searchIncremental3dSapOnGpu = true;  // non-static global flag; in the part of this file reviewed here it is only referenced from a commented-out GPU branch of calculateOverlappingPairsHostIncremental3Sap
+#include <limits.h>
+#include "b3GpuSapBroadphase.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "kernels/sapKernels.h"
+#include "Bullet3Common/b3MinMax.h"
+#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"  // passed as the last argument of compileCLProgramFromString in the constructor, alongside the embedded sapCL source
+b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q , b3GpuSapKernelType kernelType)  // Compiles the SAP OpenCL program, selects the pair kernel according to kernelType, and creates the helper kernels and the radix sorter.
+	const char* sapSrc = sapCL;  // kernel source string; presumably provided by kernels/sapKernels.h
+	cl_int errNum=0;
+	b3Assert(m_context);  // NOTE(review): the member initializer list that sets m_context/m_device is not visible in this excerpt
+	b3Assert(m_device);
+	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
+	b3Assert(errNum==CL_SUCCESS);
+	b3Assert(errNum==CL_SUCCESS);  // duplicate of the assert on the previous line
+#ifndef __APPLE__
+	m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue);  // used by calculateOverlappingPairs to pick the best-variance axis on the GPU
+	m_prefixScanFloat4 = 0;  // NOTE(review): no #else/#endif is visible in this excerpt; presumably this line is the __APPLE__ branch - confirm
+	m_sapKernel = 0;
+	switch (kernelType)  // NOTE(review): the case labels for the blocks below are not visible in this excerpt
+	{
+		{
+			m_sapKernel=0;  // no GPU pair kernel: calculateOverlappingPairs falls back to calculateOverlappingPairsHost when m_sapKernel==0
+			break;
+		}
+		{
+			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBruteForce",&errNum,sapProg );
+			break;
+		}
+		{
+			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
+			break;
+		}
+		{
+			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg );
+			break;
+		}
+		{
+			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
+			break;
+		}
+		default:
+		{
+			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );  // unknown type: same kernel as the block above
+			b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory");
+		}
+	};
+	m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
+	b3Assert(errNum==CL_SUCCESS);
+	m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg );
+	b3Assert(errNum==CL_SUCCESS);
+	m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );  // NOTE(review): errNum is not asserted after this and the next two compiles
+	m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
+	m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg );
+	m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);  // used for both GPU (execute) and host (executeHost) sorting in this file
+	delete m_sorter;  // NOTE(review): from here on everything created above is released; presumably these statements belong to the destructor, whose signature line is not visible in this excerpt - confirm
+	delete m_prefixScanFloat4;  // may be 0, which delete accepts
+	clReleaseKernel(m_scatterKernel);
+	clReleaseKernel(m_flipFloatKernel);
+	clReleaseKernel(m_copyAabbsKernel);
+	clReleaseKernel(m_sapKernel);  // NOTE(review): m_sapKernel can be 0 (host fallback); the return value is ignored - confirm that releasing a null kernel is acceptable on all target drivers
+	clReleaseKernel(m_sap2Kernel);
+	clReleaseKernel(m_prepareSumVarianceKernel);
+/// Conservative overlap test between two AABBs: returns false only when the boxes are strictly separated on at least one axis, so boxes that merely touch count as overlapping.
+static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1,
+								const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2)
+	bool overlap = true;
+	overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap;  // separated along X
+	overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap;  // separated along Z (tested before Y; the order does not affect the result)
+	overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap;  // separated along Y
+	return overlap;
+static unsigned int FloatFlip(float fl)  // Maps a float's bit pattern to an unsigned key: negative inputs get all bits inverted, non-negative inputs get only the sign bit set, so unsigned comparison of keys follows the numeric order of the (non-NaN) floats.
+	unsigned int f = *(unsigned int*)&fl;  // reinterpret the float's bits; NOTE(review): pointer-cast type punning - a memcpy would avoid strict-aliasing concerns
+	unsigned int mask = -(int)(f >> 31) | 0x80000000;  // sign bit set -> 0xFFFFFFFF, otherwise 0x80000000
+	return f ^ mask;
+void  b3GpuSapBroadphase::init3dSap()  // First-time setup for the incremental 3-axis SAP: builds sorted endpoint lists and per-object endpoint ranks for buffer 0 from the current GPU AABBs.
+	if (m_currentBuffer<0)  // does nothing once a buffer has been selected
+	{
+		m_allAabbsGPU.copyToHost(m_allAabbsCPU);
+		m_currentBuffer = 0;
+		for (int axis=0;axis<3;axis++)
+		{
+			for (int buf=0;buf<2;buf++)
+			{
+				int totalNumAabbs = m_allAabbsCPU.size();
+				int numEndPoints = 2*totalNumAabbs;  // one min and one max endpoint per AABB
+				m_sortedAxisCPU[axis][buf].resize(numEndPoints);  // both buffers are sized, only the current one is filled
+				if (buf==m_currentBuffer)
+				{
+					for (int i=0;i<totalNumAabbs;i++)
+					{
+						m_sortedAxisCPU[axis][buf][i*2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis])-1;  // min key lowered by one; presumably so a min sorts before an equal max - confirm
+						m_sortedAxisCPU[axis][buf][i*2].m_value = i*2;  // even value = min endpoint of object i
+						m_sortedAxisCPU[axis][buf][i*2+1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis])+1;  // max key raised by one
+						m_sortedAxisCPU[axis][buf][i*2+1].m_value = i*2+1;  // odd value = max endpoint of object i
+					}
+				}
+			}
+		}
+		for (int axis=0;axis<3;axis++)
+		{
+			m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);  // host-side sort of the endpoint list
+		}
+		for (int axis=0;axis<3;axis++)
+		{
+			int totalNumAabbs = m_allAabbsCPU.size();  // unused in this loop
+			int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();
+			m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints);  // NOTE(review): sized to numEndPoints here, while calculateOverlappingPairsHostIncremental3Sap sizes the same array to totalNumAabbs; only indices below totalNumAabbs are written below
+			for (int i=0;i<numEndPoints;i++)
+			{
+				int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
+				int newDest = destIndex/2;  // object index encoded in the endpoint value
+				if (destIndex&1)
+				{
+					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i;  // sorted position of the object's max endpoint
+				} else
+				{
+					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i;  // sorted position of the object's min endpoint
+				}
+			}
+		}
+	}
+static bool b3PairCmp(const b3Int4& p, const b3Int4& q)  // Strict ordering by x, then y; the other b3Int4 fields are ignored. Used as the quickSort predicate for pair arrays in this file.
+	return ((p.x<q.x) || ((p.x==q.x) && (p.y<q.y)));
+static bool operator==(const b3Int4& a,const b3Int4& b)  // Pair equality: compares only x and y.
+	return a.x == b.x && a.y == b.y;
+static bool operator<(const b3Int4& a,const b3Int4& b)  // Same ordering as b3PairCmp: by x, then y.
+	return a.x < b.x || (a.x == b.x && a.y < b.y);
+static bool operator>(const b3Int4& a,const b3Int4& b)  // Mirror of operator<: by x, then y.
+	return a.x > b.x || (a.x == b.x && a.y > b.y);
+b3AlignedObjectArray<b3Int4> addedHostPairs;  // scratch list of candidate new pairs; cleared at the start of calculateOverlappingPairsHostIncremental3Sap
+b3AlignedObjectArray<b3Int4> removedHostPairs;  // scratch list of candidate removed pairs; cleared at the start of the same function
+b3AlignedObjectArray<b3SapAabb>	preAabbs;  // copy of m_allAabbsCPU taken before it is refreshed from the GPU; NOTE(review): these three are non-static globals shared by all instances
+void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  // Host-side incremental 3-axis SAP: re-sorts endpoints into the other buffer, compares each object's endpoint ranks against the previous buffer to collect added/removed pairs, then patches m_overlappingPairs.
+	static int framepje = 0;  // only referenced by commented-out debug prints below
+	//printf("framepje=%d\n",framepje++);
+	B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap");
+	addedHostPairs.resize(0);  // clear the file-scope scratch lists
+	removedHostPairs.resize(0);
+	b3Assert(m_currentBuffer>=0);  // init3dSap is what moves m_currentBuffer from negative to 0
+	{
+		preAabbs.resize(m_allAabbsCPU.size());  // keep the previous AABBs before m_allAabbsCPU is refreshed from the GPU
+		for (int i=0;i<preAabbs.size();i++)
+		{
+			preAabbs[i]=m_allAabbsCPU[i];
+		}
+	}
+	if (m_currentBuffer<0)  // runtime guard for the same condition asserted above
+		return;
+	{
+		B3_PROFILE("m_allAabbsGPU.copyToHost");
+		m_allAabbsGPU.copyToHost(m_allAabbsCPU);
+	}
+	b3AlignedObjectArray<b3Int4> allPairs;  // working copy of the current pair list
+	{
+		B3_PROFILE("m_overlappingPairs.copyToHost");
+		m_overlappingPairs.copyToHost(allPairs);
+	}
+	if (0)  // disabled debug dump for the hard-coded objects 40 and 53
+	{
+	{
+		printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n",
+		m_allAabbsCPU[40].m_min[0],	m_allAabbsCPU[40].m_min[1],m_allAabbsCPU[40].m_min[2],
+		m_allAabbsCPU[40].m_max[0],	m_allAabbsCPU[40].m_max[1],m_allAabbsCPU[40].m_max[2]);
+	}
+	{
+		printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n",
+		m_allAabbsCPU[53].m_min[0],	m_allAabbsCPU[53].m_min[1],m_allAabbsCPU[53].m_min[2],
+		m_allAabbsCPU[53].m_max[0],	m_allAabbsCPU[53].m_max[1],m_allAabbsCPU[53].m_max[2]);
+	}
+	{
+	b3Int4 newPair;
+	newPair.x = 40;
+	newPair.y = 53;
+		int index = allPairs.findBinarySearch(newPair);
+		printf("hasPair(40,53)=%d out of %d\n",index, allPairs.size());
+		{
+			int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max,(const b3Vector3&)m_allAabbsCPU[53].m_min,(const b3Vector3&)m_allAabbsCPU[53].m_max);
+			printf("overlap=%d\n",overlap);
+		}
+		if (preAabbs.size())
+		{
+			int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max,(const b3Vector3&)preAabbs[53].m_min,(const b3Vector3&)preAabbs[53].m_max);
+			printf("prevoverlap=%d\n",prevOverlap);
+		} else
+		{
+			printf("unknown prevoverlap\n");
+		}
+	}
+	}
+	if (0)  // disabled sanity check: every endpoint list should hold two entries per AABB
+	{
+		for (int i=0;i<m_allAabbsCPU.size();i++)
+		{
+			//printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2],			m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]);
+		}
+		for (int axis=0;axis<3;axis++)
+		{
+			for (int buf=0;buf<2;buf++)
+			{
+				b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size()*2);
+			}
+		}
+	}
+	m_currentBuffer = 1-m_currentBuffer;  // flip: the buffer filled last time becomes the "previous" one (index 1-m_currentBuffer)
+	int totalNumAabbs = m_allAabbsCPU.size();
+	{
+		B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)");
+		for (int i=0;i<totalNumAabbs;i++)
+		{
+			unsigned int keyMin[3];
+			unsigned int keyMax[3];
+			for (int axis=0;axis<3;axis++)
+			{
+				float vmin=m_allAabbsCPU[i].m_min[axis];
+				float vmax = m_allAabbsCPU[i].m_max[axis];
+				keyMin[axis] = FloatFlip(vmin);
+				keyMax[axis] = FloatFlip(vmax);
+				m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_key = keyMin[axis]-1;  // min key lowered by one; presumably so a min sorts before an equal max - confirm
+				m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_value = i*2;  // even value = min endpoint of object i
+				m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_key = keyMax[axis]+1;  // max key raised by one
+				m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_value = i*2+1;  // odd value = max endpoint of object i
+			}
+			//printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]);
+		}
+	}
+	{
+		B3_PROFILE("sort m_sortedAxisCPU");
+		for (int axis=0;axis<3;axis++)
+			m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);
+	}
+	if (0)  // disabled debug loop; its locals are only used by the commented-out printf
+	{
+		for (int axis=0;axis<3;axis++)
+		{
+			//printf("axis %d\n",axis);
+			for (int i=0;i<m_sortedAxisCPU[axis][m_currentBuffer].size();i++)
+			{
+				int key = m_sortedAxisCPU[axis][m_currentBuffer][i].m_key;
+				int value = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
+				//printf("[%d]=%d\n",i,value);
+			}
+		}
+	}
+	{
+		B3_PROFILE("assign m_objectMinMaxIndexCPU");
+		for (int axis=0;axis<3;axis++)
+		{
+			int totalNumAabbs = m_allAabbsCPU.size();  // shadows the outer totalNumAabbs with the same value
+			int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();
+			m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs);  // one (min rank, max rank) entry per object
+			for (int i=0;i<numEndPoints;i++)
+			{
+				int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
+				int newDest = destIndex/2;  // object index encoded in the endpoint value
+				if (destIndex&1)
+				{
+					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i;  // sorted position of the object's max endpoint
+				} else
+				{
+					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i;  // sorted position of the object's min endpoint
+				}
+			}
+		}
+	}
+	if (0)  // disabled debug print of rank changes for objects 40 and 53
+	{	
+		printf("==========================\n");
+		for (int axis=0;axis<3;axis++)
+		{
+			unsigned int curMinIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].x;
+			unsigned int curMaxIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].y;
+			unsigned int prevMaxIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].y;
+			unsigned int prevMinIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].x;
+			int dmin40 = curMinIndex40 - prevMinIndex40;
+			int dmax40 = curMinIndex40 - prevMinIndex40;  // NOTE(review): computed from the Min indices, same as dmin40 - looks like a copy-paste slip; the value is unused and the block is disabled
+			printf("axis %d curMinIndex40=%d prevMinIndex40=%d\n",axis,curMinIndex40, prevMinIndex40);
+			printf("axis %d curMaxIndex40=%d prevMaxIndex40=%d\n",axis,curMaxIndex40, prevMaxIndex40);
+		}
+		printf(".........................\n");
+		for (int axis=0;axis<3;axis++)
+		{
+			unsigned int curMinIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].x;
+			unsigned int curMaxIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].y;
+			unsigned int prevMaxIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].y;
+			unsigned int prevMinIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].x;
+			int dmin40 = curMinIndex53 - prevMinIndex53;
+			int dmax40 = curMinIndex53 - prevMinIndex53;  // NOTE(review): same slip as above (Min indices used for dmax); unused
+			printf("axis %d curMinIndex53=%d prevMinIndex53=%d\n",axis,curMinIndex53, prevMinIndex53);
+			printf("axis %d curMaxIndex53=%d prevMaxIndex53=%d\n",axis,curMaxIndex53, prevMaxIndex53);
+		}
+	}
+	int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();  // the three axes must track the same number of objects
+	int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size();
+	int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size();
+	b3Assert(a==b);
+	b3Assert(b==c);
+	/*
+	if (searchIncremental3dSapOnGpu)
+	{
+		B3_PROFILE("computePairsIncremental3dSapKernelGPU");
+		int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
+		int maxCapacity = 1024*1024;
+		{
+			B3_PROFILE("copy from host");
+			m_objectMinMaxIndexGPUaxis0.copyFromHost(m_objectMinMaxIndexCPU[0][m_currentBuffer]);
+			m_objectMinMaxIndexGPUaxis1.copyFromHost(m_objectMinMaxIndexCPU[1][m_currentBuffer]);
+			m_objectMinMaxIndexGPUaxis2.copyFromHost(m_objectMinMaxIndexCPU[2][m_currentBuffer]);
+			m_objectMinMaxIndexGPUaxis0prev.copyFromHost(m_objectMinMaxIndexCPU[0][1-m_currentBuffer]);
+			m_objectMinMaxIndexGPUaxis1prev.copyFromHost(m_objectMinMaxIndexCPU[1][1-m_currentBuffer]);
+			m_objectMinMaxIndexGPUaxis2prev.copyFromHost(m_objectMinMaxIndexCPU[2][1-m_currentBuffer]);
+			m_sortedAxisGPU0.copyFromHost(m_sortedAxisCPU[0][m_currentBuffer]);
+			m_sortedAxisGPU1.copyFromHost(m_sortedAxisCPU[1][m_currentBuffer]);
+			m_sortedAxisGPU2.copyFromHost(m_sortedAxisCPU[2][m_currentBuffer]);
+			m_sortedAxisGPU0prev.copyFromHost(m_sortedAxisCPU[0][1-m_currentBuffer]);
+			m_sortedAxisGPU1prev.copyFromHost(m_sortedAxisCPU[1][1-m_currentBuffer]);
+			m_sortedAxisGPU2prev.copyFromHost(m_sortedAxisCPU[2][1-m_currentBuffer]);
+			m_addedHostPairsGPU.resize(maxCapacity);
+			m_removedHostPairsGPU.resize(maxCapacity);
+			m_addedCountGPU.resize(0);
+			m_addedCountGPU.push_back(0);
+			m_removedCountGPU.resize(0);
+			m_removedCountGPU.push_back(0);
+		}
+		{
+			B3_PROFILE("launch1D");
+			b3LauncherCL launcher(m_queue,  m_computePairsIncremental3dSapKernel,"m_computePairsIncremental3dSapKernel");
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis0.getBufferCL());
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis1.getBufferCL());
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis2.getBufferCL());
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis0prev.getBufferCL());
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis1prev.getBufferCL());
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis2prev.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU0.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU1.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU2.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU0prev.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU1prev.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU2prev.getBufferCL());
+			launcher.setBuffer(m_addedHostPairsGPU.getBufferCL());
+			launcher.setBuffer(m_removedHostPairsGPU.getBufferCL());
+			launcher.setBuffer(m_addedCountGPU.getBufferCL());
+			launcher.setBuffer(m_removedCountGPU.getBufferCL());
+			launcher.setConst(maxCapacity);
+			launcher.setConst( numObjects);
+			launcher.launch1D( numObjects);
+			clFinish(m_queue);
+		}
+		{
+			B3_PROFILE("copy to host");
+			int addedCountGPU = m_addedCountGPU.at(0);
+			m_addedHostPairsGPU.resize(addedCountGPU);
+			m_addedHostPairsGPU.copyToHost(addedHostPairs);
+			//printf("addedCountGPU=%d\n",addedCountGPU);
+			int removedCountGPU = m_removedCountGPU.at(0);
+			m_removedHostPairsGPU.resize(removedCountGPU);
+			m_removedHostPairsGPU.copyToHost(removedHostPairs);
+			//printf("removedCountGPU=%d\n",removedCountGPU);
+		}
+	} 
+	else
+	*/
+	{
+		int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
+		B3_PROFILE("actual search");
+		for (int i=0;i<numObjects;i++)
+		{
+			//int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size();
+			//int checkObjects[]={40,53};
+			//int numCheckObjects = sizeof(checkObjects)/sizeof(int);
+			//for (int a=0;a<numCheckObjects ;a++)
+			for (int axis=0;axis<3;axis++)
+			{
+				//int i = checkObjects[a];
+				unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x;
+				unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y;
+				unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].x;
+				int dmin = curMinIndex - prevMinIndex;  // signed displacement of the min endpoint's rank since the previous buffer
+				unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].y;
+				int dmax = curMaxIndex - prevMaxIndex;  // signed displacement of the max endpoint's rank
+				if (dmin!=0)
+				{
+					//printf("for object %d, dmin=%d\n",i,dmin);
+				}
+				if (dmax!=0)
+				{
+					//printf("for object %d, dmax=%d\n",i,dmax);
+				}
+				for (int otherbuffer = 0;otherbuffer<2;otherbuffer++)  // the endpoint lists of both buffers are scanned over the same rank range
+				{
+					if (dmin!=0)
+					{
+						int stepMin = dmin<0 ? -1 : 1;
+						for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin)  // ranks from the previous position up to (not including) the current one
+						{
+							int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y;  // NOTE(review): reads field y, whereas the endpoint id is written through m_value above; presumably the two alias - confirm in the sort-data struct
+							int otherIndex = otherIndex2/2;
+							if (otherIndex!=i)
+							{
+								bool otherIsMax = ((otherIndex2&1)!=0);
+								if (otherIsMax)  // a moving min endpoint is only checked against other objects' max endpoints
+								{
+									//bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);
+									//bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max);
+									bool overlap = true;  // overlap in the current buffer, decided on endpoint ranks for all three axes
+									for (int ax=0;ax<3;ax++)
+									{
+										if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||
+											(m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x))
+											overlap=false;
+									}
+								//	b3Assert(overlap2==overlap);
+									bool prevOverlap = true;  // same test against the previous buffer's ranks
+									for (int ax=0;ax<3;ax++)
+									{
+										if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) ||
+											(m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x))
+											prevOverlap=false;
+									}
+									//b3Assert(overlap==overlap2);
+									if (dmin<0)  // min endpoint moved to a lower rank: only additions are recorded
+									{
+										if (overlap && !prevOverlap)
+										{
+											//add a pair
+											b3Int4 newPair;
+											if (i<=otherIndex)  // store the pair with the smaller index in x
+											{
+												newPair.x = i;
+												newPair.y = otherIndex;
+											} else
+											{
+												newPair.x = otherIndex;
+												newPair.y = i;
+											}
+											addedHostPairs.push_back(newPair);  // duplicates are possible; they are filtered after sorting
+										}
+									} 
+									else  // min endpoint moved to a higher rank: only removals are recorded
+									{
+										if (!overlap && prevOverlap)
+										{
+											//remove a pair
+											b3Int4 removedPair;
+											if (i<=otherIndex)
+											{
+												removedPair.x = i;
+												removedPair.y = otherIndex;
+											} else
+											{
+												removedPair.x = otherIndex;
+												removedPair.y = i;
+											}
+											removedHostPairs.push_back(removedPair);
+										}
+									}//else (dmin>=0)
+								}//if (otherIsMax)
+							}//if (otherIndex!=i)
+						}//for (int j=
+					}
+					if (dmax!=0)
+					{
+						int stepMax = dmax<0 ? -1 : 1;
+						for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax)  // ranks from the previous position up to (not including) the current one
+						{
+							int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y;  // NOTE(review): field y again, see the note in the dmin branch
+							int otherIndex = otherIndex2/2;
+							if (otherIndex!=i)
+							{
+								bool otherIsMin = ((otherIndex2&1)==0);  // unused: the filter below is commented out, so min and max endpoints are both processed
+								//if (otherIsMin)
+								{
+									//bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);
+									//bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max);
+									bool overlap = true;  // overlap in the current buffer, decided on endpoint ranks for all three axes
+									for (int ax=0;ax<3;ax++)
+									{
+										if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||
+											(m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x))
+											overlap=false;
+									}
+									//b3Assert(overlap2==overlap);
+									bool prevOverlap = true;  // same test against the previous buffer's ranks
+									for (int ax=0;ax<3;ax++)
+									{
+										if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) ||
+											(m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x))
+											prevOverlap=false;
+									}
+									if (dmax>0)  // max endpoint moved to a higher rank: only additions are recorded
+									{
+										if (overlap && !prevOverlap)
+										{
+											//add a pair
+											b3Int4 newPair;
+											if (i<=otherIndex)  // store the pair with the smaller index in x
+											{
+												newPair.x = i;
+												newPair.y = otherIndex;
+											} else
+											{
+												newPair.x = otherIndex;
+												newPair.y = i;
+											}
+											addedHostPairs.push_back(newPair);
+										}
+									} 
+									else  // max endpoint moved to a lower rank: only removals are recorded
+									{
+										if (!overlap && prevOverlap)
+										{
+											//if (otherIndex2&1==0) -> min?
+											//remove a pair
+											b3Int4 removedPair;
+											if (i<=otherIndex)
+											{
+												removedPair.x = i;
+												removedPair.y = otherIndex;
+											} else
+											{
+												removedPair.x = otherIndex;
+												removedPair.y = i;
+											}
+											removedHostPairs.push_back(removedPair);
+										}
+									}
+								}//block of the commented-out if (otherIsMin)
+							}//if (otherIndex!=i)
+						}//for (int j=
+					}
+				}//for (int otherbuffer
+			}//for (int axis=0;
+		}//for (int i=0;i<numObjects
+	}
+	//remove duplicates and add/remove them to existing m_overlappingPairs
+	{
+		{
+			B3_PROFILE("sort allPairs");
+			allPairs.quickSort(b3PairCmp);  // sorted so findBinarySearch can be used below
+		}
+		{
+			B3_PROFILE("sort addedHostPairs");
+			addedHostPairs.quickSort(b3PairCmp);  // sorting puts duplicates next to each other
+		}
+		{
+			B3_PROFILE("sort removedHostPairs");
+			removedHostPairs.quickSort(b3PairCmp);
+		}
+	}
+	b3Int4 prevPair;  // last pair seen, used to skip adjacent duplicates
+	prevPair.x = -1;
+	prevPair.y = -1;
+	int uniqueRemovedPairs = 0;
+	b3AlignedObjectArray<int> removedPositions;  // positions in allPairs that will be dropped
+	{
+		B3_PROFILE("actual removing");
+		for (int i=0;i<removedHostPairs.size();i++)
+		{
+			b3Int4 removedPair = removedHostPairs[i];
+			if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y))
+			{
+			int index1 = allPairs.findBinarySearch(removedPair);
+	//#ifdef _DEBUG
+				int index2 = allPairs.findLinearSearch(removedPair);  // the _DEBUG guard is commented out, so this linear cross-check runs in every build
+				b3Assert(index1==index2);
+				//b3Assert(index1!=allPairs.size());
+				if (index1<allPairs.size())  // presumably size() means "not found" - confirm against b3AlignedObjectArray
+	//#endif//_DEBUG
+				{
+					uniqueRemovedPairs++;
+					removedPositions.push_back(index1);
+					{
+						//printf("framepje(%d) remove pair(%d):%d,%d\n",framepje,i,removedPair.x,removedPair.y);
+					}
+				}
+			}
+			prevPair = removedPair;
+		}
+		if (uniqueRemovedPairs)
+		{
+			for (int i=0;i<removedPositions.size();i++)
+			{
+				allPairs[removedPositions[i]].x = INT_MAX ;  // mark as removed so the sort below moves the entry to the end
+				allPairs[removedPositions[i]].y = INT_MAX ;
+			}
+			allPairs.quickSort(b3PairCmp);
+			allPairs.resize(allPairs.size()-uniqueRemovedPairs);  // drop the marked entries from the tail
+		}
+	}
+	//if (uniqueRemovedPairs)
+	//	printf("uniqueRemovedPairs=%d\n",uniqueRemovedPairs);
+	//printf("removedHostPairs.size = %d\n",removedHostPairs.size());
+	prevPair.x = -1;
+	prevPair.y = -1;
+	int uniqueAddedPairs=0;
+	b3AlignedObjectArray<b3Int4> actualAddedPairs;  // collected first so allPairs stays sorted while it is being searched
+	{
+		B3_PROFILE("actual adding");
+		for (int i=0;i<addedHostPairs.size();i++)
+		{
+			b3Int4 newPair = addedHostPairs[i];
+			if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y))
+			{
+//#ifdef _DEBUG		
+				int index1 = allPairs.findBinarySearch(newPair);
+				int index2 = allPairs.findLinearSearch(newPair);  // the _DEBUG guard is commented out, so this linear cross-check runs in every build
+				b3Assert(index1==index2);
+				b3Assert(index1==allPairs.size());  // a pair reported as new is expected to be absent from allPairs
+				if (index1!=allPairs.size())
+				{
+					printf("??\n");
+				}
+				if (index1==allPairs.size())
+//#endif //_DEBUG
+				{
+					uniqueAddedPairs++;
+					actualAddedPairs.push_back(newPair);
+				}
+			}
+			prevPair = newPair;
+		}
+		for (int i=0;i<actualAddedPairs.size();i++)
+		{
+			//printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y);
+			allPairs.push_back(actualAddedPairs[i]);  // appended unsorted; allPairs is re-sorted at the start of the merge step on the next call
+		}
+	}
+	//if (uniqueAddedPairs)
+	//	printf("uniqueAddedPairs=%d\n", uniqueAddedPairs);
+	{
+		B3_PROFILE("m_overlappingPairs.copyFromHost");
+		m_overlappingPairs.copyFromHost(allPairs);  // publish the patched pair list back to the GPU array
+	}
+void  b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)  // CPU fallback: brute-force tests small-vs-small and small-vs-large AABBs, keeps at most maxPairs pairs, and uploads them to m_overlappingPairs.
+	//test
+//	if (m_currentBuffer>=0)
+	//	return calculateOverlappingPairsHostIncremental3Sap();
+	b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
+	m_allAabbsGPU.copyToHost(m_allAabbsCPU);  // the GPU copy is treated as authoritative
+	int axis=0;
+	{
+		B3_PROFILE("CPU compute best variance axis");
+		b3Vector3 s=b3MakeVector3(0,0,0),s2=b3MakeVector3(0,0,0);  // running sum and sum of squares of the AABB centers
+		int numRigidBodies = m_smallAabbsMappingCPU.size();
+		for(int i=0;i<numRigidBodies;i++) 
+		{
+			b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
+			b3Vector3 maxAabb=b3MakeVector3(aabb.m_max[0],aabb.m_max[1],aabb.m_max[2]);
+			b3Vector3 minAabb=b3MakeVector3(aabb.m_min[0],aabb.m_min[1],aabb.m_min[2]);
+			b3Vector3 centerAabb=(maxAabb+minAabb)*0.5f;
+			s += centerAabb;
+			s2 += centerAabb*centerAabb;
+		}
+		b3Vector3 v = s2 - (s*s) / (float)numRigidBodies;  // per-axis spread of the centers; NOTE(review): divides by numRigidBodies, which is zero when there are no small AABBs
+		if(v[1] > v[0]) 
+			axis = 1;
+		if(v[2] > v[axis]) 
+			axis = 2;
+	}
+	b3AlignedObjectArray<b3Int4> hostPairs;
+	{
+		int numSmallAabbs = m_smallAabbsMappingCPU.size();
+		for (int i=0;i<numSmallAabbs;i++)  // small vs small: every unordered pair once (j starts at i+1)
+		{
+			b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
+			float reference = smallAabbi.m_max[axis];  // NOTE(review): unused in the lines shown, so the axis chosen above does not influence the result
+			for (int j=i+1;j<numSmallAabbs;j++)
+			{
+				b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]];
+				if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max,
+					(b3Vector3&)smallAabbj.m_min,(b3Vector3&)smallAabbj.m_max))
+				{
+					b3Int4 pair;
+					int a = smallAabbi.m_minIndices[3];  // id stored with the AABB, not its position in the mapping
+					int b = smallAabbj.m_minIndices[3];
+					if (a<=b)
+					{
+						pair.x = a;//store the original index in the unsorted aabb array
+						pair.y = b;
+					} else
+					{
+						pair.x = b;//store the original index in the unsorted aabb array
+						pair.y = a;
+					}
+					hostPairs.push_back(pair);
+				}
+			}
+		}
+	}
+	{
+		int numSmallAabbs = m_smallAabbsMappingCPU.size();
+		for (int i=0;i<numSmallAabbs;i++)  // small vs large: every combination; large AABBs are not tested against each other
+		{
+			b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
+			float reference = smallAabbi.m_max[axis];  // unused here as well
+			int numLargeAabbs = m_largeAabbsMappingCPU.size();
+			for (int j=0;j<numLargeAabbs;j++)
+			{
+				b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]];
+				if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max,
+					(b3Vector3&)largeAabbj.m_min,(b3Vector3&)largeAabbj.m_max))
+				{
+					b3Int4 pair;
+					int a = largeAabbj.m_minIndices[3];
+					int b = smallAabbi.m_minIndices[3];
+					if (a<=b)
+					{
+						pair.x = a; 
+						pair.y = b;//store the original index in the unsorted aabb array
+					} else
+					{
+						pair.x = b;
+						pair.y = a;//store the original index in the unsorted aabb array
+					}
+					hostPairs.push_back(pair);
+				}
+			}
+		}
+	}
+	if (hostPairs.size() > maxPairs)  // pairs beyond maxPairs are silently dropped
+	{
+		hostPairs.resize(maxPairs);
+	}
+	if (hostPairs.size())
+	{
+		m_overlappingPairs.copyFromHost(hostPairs);
+	} else
+	{
+		m_overlappingPairs.resize(0);  // no pairs: just empty the GPU array
+	}
+	//init3dSap();
+void  b3GpuSapBroadphase::reset()  // Empties the AABB arrays, both index mappings (host and GPU) and the pair counter; m_overlappingPairs is not touched here.
+	m_allAabbsGPU.resize(0);
+	m_allAabbsCPU.resize(0);
+	m_smallAabbsMappingGPU.resize(0);
+	m_smallAabbsMappingCPU.resize(0);
+	m_pairCount.resize(0);
+	m_largeAabbsMappingGPU.resize(0);
+	m_largeAabbsMappingCPU.resize(0);
+void  b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) // Fills m_overlappingPairs with at most maxPairs overlapping AABB pairs using the OpenCL kernels.
+	if (m_sapKernel==0) // no SAP kernel available: use the host implementation instead
+	{
+		calculateOverlappingPairsHost(maxPairs);
+		return;
+	}
+	//if (m_currentBuffer>=0)
+	//	return calculateOverlappingPairsHostIncremental3Sap();
+	//calculateOverlappingPairsHost(maxPairs);
+	B3_PROFILE("GPU 1-axis SAP calculateOverlappingPairs");
+	int axis = 0; // axis handed to the kernels; stays 0 unless the variance step below picks 1 or 2
+	{
+	bool syncOnHost = false; // not read anywhere in the lines shown
+	int numSmallAabbs = m_smallAabbsMappingCPU.size();
+	if (m_prefixScanFloat4 && numSmallAabbs) // variance step needs the float4 prefix scan and at least one small AABB
+	{
+		B3_PROFILE("GPU compute best variance axis");
+		if (m_dst.size()!=(numSmallAabbs+1)) // NOTE(review): the resize below uses numSmallAabbs+128, so this test looks true on every call - confirm intent
+		{
+			m_dst.resize(numSmallAabbs+128);
+			m_sum.resize(numSmallAabbs+128);
+			m_sum2.resize(numSmallAabbs+128);
+			m_sum.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? zeroes the extra element that the numSmallAabbs+1 wide scans below include
+			m_sum2.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? same for the second accumulator
+		}
+		b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel ,"m_prepareSumVarianceKernel");
+		launcher.setBuffer(m_allAabbsGPU.getBufferCL());
+		launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
+		launcher.setBuffer(m_sum.getBufferCL()); // presumably receives per-AABB values and m_sum2 their squares - confirm against the kernel source
+		launcher.setBuffer(m_sum2.getBufferCL());
+		launcher.setConst( numSmallAabbs  );
+		int num = numSmallAabbs;
+		launcher.launch1D( num); // one work item per small AABB
+		b3Vector3 s;
+		b3Vector3 s2;
+		m_prefixScanFloat4->execute(m_sum,m_dst,numSmallAabbs+1,&s); // s is the scan's sum output; m_dst is only scratch here
+		m_prefixScanFloat4->execute(m_sum2,m_dst,numSmallAabbs+1,&s2);
+		b3Vector3 v = s2 - (s*s) / (float)numSmallAabbs; // per-axis spread measure built from the two sums
+		if(v[1] > v[0]) 
+			axis = 1;
+		if(v[2] > v[axis]) // pick the axis with the largest component of v
+			axis = 2;
+	}
+		m_gpuSmallSortData.resize(numSmallAabbs); // one sort entry per small AABB
+#if 1
+		if (m_smallAabbsMappingGPU.size())
+		{
+			B3_PROFILE("flipFloatKernel");
+			b3BufferInfoCL bInfo[] = { 
+				b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), 
+				b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true),
+				b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())}; // the buffers flagged true are presumably read-only inputs - confirm against b3BufferInfoCL
+			b3LauncherCL launcher(m_queue, m_flipFloatKernel ,"m_flipFloatKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst( numSmallAabbs  );
+			launcher.setConst( axis  );
+			int num = numSmallAabbs;
+			launcher.launch1D( num);
+			clFinish(m_queue); // wait for the kernel before the sort step
+		}
+		if (m_gpuSmallSortData.size())
+		{
+			B3_PROFILE("gpu radix sort");
+			m_sorter->execute(m_gpuSmallSortData); // sorts the entries written by the previous kernel
+			clFinish(m_queue);
+		}
+		m_gpuSmallSortedAabbs.resize(numSmallAabbs);
+		if (numSmallAabbs)
+		{
+			B3_PROFILE("scatterKernel");
+			b3BufferInfoCL bInfo[] = { 
+				b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
+				b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true),
+				b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),
+				b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())}; // last buffer is the output: the small AABBs in sorted order (presumably) - confirm against the kernel
+			b3LauncherCL launcher(m_queue, m_scatterKernel ,"m_scatterKernel ");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst( numSmallAabbs);
+			int num = numSmallAabbs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+		}
+			m_overlappingPairs.resize(maxPairs); // room for the maximum number of pairs; shrunk to numPairs at the end
+			m_pairCount.resize(0);
+			m_pairCount.push_back(0); // single int counter starting at zero, shared by both pair kernels below
+            int numPairs=0;
+			{
+				int numLargeAabbs = m_largeAabbsMappingGPU.size();
+				if (numLargeAabbs && numSmallAabbs) // large-vs-small pass only when both sets are non-empty
+				{
+					//@todo
+					B3_PROFILE("sap2Kernel");
+					b3BufferInfoCL bInfo[] = { 
+						b3BufferInfoCL( m_allAabbsGPU.getBufferCL() ),
+						b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ),
+						b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), 
+						b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), 
+						b3BufferInfoCL(m_pairCount.getBufferCL())};
+					b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel");
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					launcher.setConst(   numLargeAabbs  );
+					launcher.setConst( numSmallAabbs);
+					launcher.setConst( axis  );
+					launcher.setConst( maxPairs  );
+//@todo: use actual maximum work item sizes of the device instead of hardcoded values
+					launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); // 2D launch over large x small; 4 and 64 are the hardcoded values the todo above refers to
+					numPairs = m_pairCount.at(0); // read the counter back to the host
+					if (numPairs >maxPairs) // counter went past the capacity: report and clamp the host-side count
+					{
+						b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
+						numPairs =maxPairs;
+					}
+				}
+			}
+			if (m_gpuSmallSortedAabbs.size()) // small-vs-small pass over the sorted AABBs
+			{
+				B3_PROFILE("sapKernel");
+				b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(m_pairCount.getBufferCL())};
+				b3LauncherCL launcher(m_queue, m_sapKernel,"m_sapKernel");
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst( numSmallAabbs  );
+				launcher.setConst( axis  );
+				launcher.setConst( maxPairs  );
+				int num = numSmallAabbs;
+#if 0                
+                int buffSize = launcher.getSerializationBufferSize(); // debug aid: dump the serialized kernel arguments, followed by num, to m_sapKernelArgs.bin
+                unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
+                for (int i=0;i<buffSize+1;i++) // fill with 0xff, including one byte past buffSize that serves as the overrun sentinel
+                {
+                    unsigned char* ptr = (unsigned char*)&buf[i];
+                    *ptr = 0xff;
+                }
+                int actualWrite = launcher.serializeArguments(buf,buffSize);
+                unsigned char* cptr = (unsigned char*)&buf[buffSize];
+    //            printf("buf[buffSize] = %d\n",*cptr);
+                assert(buf[buffSize]==0xff);//check for buffer overrun
+                int* ptr = (int*)&buf[buffSize];
+                *ptr = num; // work-item count is appended after the serialized arguments
+                FILE* f = fopen("m_sapKernelArgs.bin","wb"); // NOTE(review): result of fopen is not checked and buf is never deleted in the lines shown
+                fwrite(buf,buffSize+sizeof(int),1,f);
+                fclose(f);
+                launcher.launch1D( num);
+				clFinish(m_queue);
+                numPairs = m_pairCount.at(0); // counter now covers both kernels, since it was not reset in between
+                if (numPairs>maxPairs)
+				{
+					b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
+					numPairs = maxPairs;
+					m_pairCount.resize(0);
+					m_pairCount.push_back(maxPairs); // also clamp the stored counter so it agrees with the clamped numPairs
+				}
+			}
+        int numPairs = 0; // NOTE(review): second numPairs/launcher - looks like an alternate preprocessor branch whose directives are not visible in this excerpt; confirm against the full file
+        b3LauncherCL launcher(m_queue, m_sapKernel);
+        const char* fileName = "m_sapKernelArgs.bin"; // same file name the dump code above writes
+        FILE* f = fopen(fileName,"rb");
+        if (f)
+        {
+            int sizeInBytes=0;
+            if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) 
+            {
+                printf("error, cannot get file size\n");
+                exit(0); // NOTE(review): terminates the process with status 0 on an error path
+            }
+            unsigned char* buf = (unsigned char*) malloc(sizeInBytes); // NOTE(review): malloc and fread results are not checked
+            fread(buf,sizeInBytes,1,f);
+            int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
+            int num = *(int*)&buf[serializedBytes]; // work-item count stored right after the serialized arguments
+            launcher.launch1D( num);
+            b3OpenCLArray<int> pairCount(m_context, m_queue);
+            int numElements = launcher.m_arrays[2]->size()/sizeof(int); // m_arrays[2] is presumably the pair counter, the third buffer of the sapKernel argument list - confirm
+            pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
+            numPairs = pairCount.at(0);
+            //printf("overlapping pairs = %d\n",numPairs);
+            b3AlignedObjectArray<b3Int4>		hostOoverlappingPairs;
+            b3OpenCLArray<b3Int4> tmpGpuPairs(m_context,m_queue);
+            tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );
+            tmpGpuPairs.copyToHost(hostOoverlappingPairs); // round trip through the host to move the replayed pairs into m_overlappingPairs
+            m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
+            //printf("hello %d\n", m_overlappingPairs.size());
+            free(buf);
+            fclose(f);
+        } else {
+            printf("error: cannot find file %s\n",fileName);
+        }
+        clFinish(m_queue);
+        m_overlappingPairs.resize(numPairs); // shrink from maxPairs to the number of pairs actually reported
+	//init3dSap();
+void b3GpuSapBroadphase::writeAabbsToGpu() // Copies the CPU-side small/large mappings and the AABB array into their GPU counterparts.
+	m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
+	m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
+	m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this
+void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask) // Appends an AABB to m_allAabbsCPU and registers it as a large proxy; the two filter arguments are not used in the lines shown.
+	int index = userPtr;
+	b3SapAabb aabb;
+	for (int i=0;i<4;i++) // copies all four components, including index 3
+	{
+		aabb.m_min[i] = aabbMin[i];
+		aabb.m_max[i] = aabbMax[i];
+	}
+	aabb.m_minIndices[3] = index; // stores userPtr; presumably m_minIndices aliases m_min, so this replaces the component copied above - confirm against b3Aabb
+	aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); // slot this AABB gets in m_allAabbsCPU (it is pushed two lines below)
+	m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size()); // the large mapping records that same slot
+	m_allAabbsCPU.push_back(aabb);
+void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask) // Appends an AABB to m_allAabbsCPU and registers it as a small proxy; the two filter arguments are not used in the lines shown.
+	int index = userPtr;
+	b3SapAabb aabb;
+	for (int i=0;i<4;i++) // copies all four components, including index 3
+	{
+		aabb.m_min[i] = aabbMin[i];
+		aabb.m_max[i] = aabbMax[i];
+	}
+	aabb.m_minIndices[3] = index; // stores userPtr; presumably m_minIndices aliases m_min, so this replaces the component copied above - confirm against b3Aabb
+	aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); // slot this AABB gets in m_allAabbsCPU (it is pushed two lines below)
+	m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size()); // the small mapping records that same slot
+	m_allAabbsCPU.push_back(aabb);
+cl_mem	b3GpuSapBroadphase::getAabbBufferWS() // Returns the OpenCL buffer behind m_allAabbsGPU.
+	return m_allAabbsGPU.getBufferCL();
+int	b3GpuSapBroadphase::getNumOverlap() // Returns the current size of m_overlappingPairs.
+	return m_overlappingPairs.size();
+cl_mem	b3GpuSapBroadphase::getOverlappingPairBuffer() // Returns the OpenCL buffer behind m_overlappingPairs.
+	return m_overlappingPairs.getBufferCL();
+b3OpenCLArray<b3Int4>& b3GpuSapBroadphase::getOverlappingPairsGPU() // Returns a mutable reference to the pair array itself.
+	return m_overlappingPairs;
+b3OpenCLArray<int>& b3GpuSapBroadphase::getSmallAabbIndicesGPU() // Returns a mutable reference to the GPU-side small-AABB mapping.
+	return m_smallAabbsMappingGPU;
+b3OpenCLArray<int>& b3GpuSapBroadphase::getLargeAabbIndicesGPU() // Returns a mutable reference to the GPU-side large-AABB mapping.
+	return m_largeAabbsMappingGPU;
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
new file mode 100644
index 00000000..23e4d624
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
@@ -0,0 +1,151 @@
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
+class b3Vector3;
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "b3SapAabb.h"
+#include "Bullet3Common/shared/b3Int2.h"
+#include "b3GpuBroadphaseInterface.h"
+class b3GpuSapBroadphase : public b3GpuBroadphaseInterface // Sweep-and-prune style broadphase that keeps AABBs on CPU and GPU and finds overlapping pairs with OpenCL kernels.
+	cl_context				m_context;
+	cl_device_id			m_device;
+	cl_command_queue		m_queue; // queue used for every kernel launch and clFinish in calculateOverlappingPairs
+	cl_kernel				m_flipFloatKernel; // fills m_gpuSmallSortData ahead of the radix sort
+	cl_kernel				m_scatterKernel ; // writes m_gpuSmallSortedAabbs after the sort
+	cl_kernel				m_copyAabbsKernel;
+	cl_kernel				m_sapKernel; // small-vs-small pair kernel; when it is 0 the host path is used
+	cl_kernel				m_sap2Kernel; // large-vs-small pair kernel
+	cl_kernel				m_prepareSumVarianceKernel; // fills m_sum / m_sum2 for the axis selection
+	class b3RadixSort32CL* m_sorter;
+	///test for 3d SAP
+	b3AlignedObjectArray<b3SortData>		m_sortedAxisCPU[3][2];
+	b3AlignedObjectArray<b3UnsignedInt2>	m_objectMinMaxIndexCPU[3][2];
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis0;
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis1;
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis2;
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis0prev;
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis1prev;
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis2prev;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU0;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU1;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU2;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU0prev;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU1prev;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU2prev;
+	b3OpenCLArray<b3Int4>					m_addedHostPairsGPU;
+	b3OpenCLArray<b3Int4>					m_removedHostPairsGPU;
+	b3OpenCLArray<int>						m_addedCountGPU;
+	b3OpenCLArray<int>						m_removedCountGPU;
+	int	m_currentBuffer;
+	b3OpenCLArray<int> m_pairCount; // holds a single int: the pair counter the pair kernels write and the host reads back
+	b3OpenCLArray<b3SapAabb>	m_allAabbsGPU; // GPU copy of m_allAabbsCPU, uploaded by writeAabbsToGpu()
+	b3AlignedObjectArray<b3SapAabb>	m_allAabbsCPU; // every proxy AABB, small and large, in creation order
+	virtual b3OpenCLArray<b3SapAabb>&	getAllAabbsGPU()
+	{
+		return m_allAabbsGPU;
+	}
+	virtual b3AlignedObjectArray<b3SapAabb>&	getAllAabbsCPU()
+	{
+		return m_allAabbsCPU;
+	}
+	b3OpenCLArray<b3Vector3>	m_sum; // m_sum, m_sum2 and m_dst are work buffers of the variance-axis step
+	b3OpenCLArray<b3Vector3>	m_sum2;
+	b3OpenCLArray<b3Vector3>	m_dst;
+	b3OpenCLArray<int>	m_smallAabbsMappingGPU;
+	b3AlignedObjectArray<int> m_smallAabbsMappingCPU; // indices into m_allAabbsCPU of proxies made with createProxy()
+	b3OpenCLArray<int>	m_largeAabbsMappingGPU;
+	b3AlignedObjectArray<int> m_largeAabbsMappingCPU; // indices into m_allAabbsCPU of proxies made with createLargeProxy()
+	b3OpenCLArray<b3Int4>		m_overlappingPairs; // result of calculateOverlappingPairs()
+	//temporary gpu work memory
+	b3OpenCLArray<b3SortData>	m_gpuSmallSortData;
+	b3OpenCLArray<b3SapAabb>	m_gpuSmallSortedAabbs;
+	class b3PrefixScanFloat4CL*		m_prefixScanFloat4; // when null, the variance-axis step is skipped and axis 0 is used
+	enum b3GpuSapKernelType // NOTE(review): no enumerators appear in this excerpt although B3_GPU_SAP_KERNEL_* values are used below - confirm against the full header
+	{
+	};
+	b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
+	virtual ~b3GpuSapBroadphase();
+	static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue  q) // the CreateFunc* helpers differ only in the kernel type passed to the constructor
+	{
+		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
+	}
+	static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue  q)
+	{
+		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
+	}
+	static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue  q)
+	{
+		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL);
+	}
+	static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue  q)
+	{
+		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER);
+	}
+	static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue  q) // same kernel type as the constructor default
+	{
+		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
+	}
+	virtual void  calculateOverlappingPairs(int maxPairs); // GPU path; falls back to calculateOverlappingPairsHost when m_sapKernel is 0
+	virtual void  calculateOverlappingPairsHost(int maxPairs);
+	void  reset();
+	void init3dSap();
+	virtual void calculateOverlappingPairsHostIncremental3Sap();
+	virtual void createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
+	virtual void createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
+	//call writeAabbsToGpu after done making all changes (createProxy etc)
+	virtual void writeAabbsToGpu();
+	virtual cl_mem	getAabbBufferWS();
+	virtual int	getNumOverlap();
+	virtual cl_mem	getOverlappingPairBuffer();
+	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
+	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
+	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
new file mode 100644
index 00000000..ea6550fe
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
@@ -0,0 +1,14 @@
+#ifndef B3_SAP_AABB_H
+#define B3_SAP_AABB_H
+#include "Bullet3Common/b3Scalar.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+///AABB type used by the SAP broadphase: derives from b3Aabb and is declared through B3_ATTRIBUTE_ALIGNED16 just to make sure that the b3Aabb is 16-byte aligned
+B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb
+#endif //B3_SAP_AABB_H
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl
new file mode 100644
index 00000000..ded4796d
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl
@@ -0,0 +1,216 @@
+int getPosHash(int4 gridPos, __global float4* pParams) // Maps a grid coordinate to a linear cell index.
+	int4 gridDim = *((__global int4*)(pParams + 1)); // grid dimensions are read from pParams[1], reinterpreted as int4
+	gridPos.x &= gridDim.x - 1; // wraps the coordinate; equals a modulo only for power-of-two dimensions (presumably guaranteed by the host - confirm)
+	gridPos.y &= gridDim.y - 1;
+	gridPos.z &= gridDim.z - 1;
+	int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x; // x-fastest linear index
+	return hash;
+int4 getGridPos(float4 worldPos, __global float4* pParams) // Converts a world position to wrapped integer grid coordinates; the w lane of the result is left unset.
+    int4 gridPos;
+	int4 gridDim = *((__global int4*)(pParams + 1)); // grid dimensions are read from pParams[1], reinterpreted as int4
+    gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1); // pParams[0] scales world units to cells (presumably the inverse cell size - confirm against the host code)
+    gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);
+    gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);
+    return gridPos;
+// calculate grid hash value for each body using its AABB (the cell is chosen from the AABB centre); one work item per object
+__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )
+    int index = get_global_id(0);
+    if(index >= numObjects) // surplus work items beyond numObjects do nothing
+	{
+		return;
+	}
+	float4 bbMin = allpAABB[smallAabbMapping[index]*2]; // each AABB occupies two consecutive float4s: min, then max
+	float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];
+	float4 pos;
+	pos.x = (bbMin.x + bbMax.x) * 0.5f; // centre of the box
+	pos.y = (bbMin.y + bbMax.y) * 0.5f;
+	pos.z = (bbMin.z + bbMax.z) * 0.5f;
+	pos.w = 0.f;
+    // get address in grid
+    int4 gridPos = getGridPos(pos, pParams);
+    int gridHash = getPosHash(gridPos, pParams);
+    // store grid hash and body index
+    int2 hashVal;
+    hashVal.x = gridHash;
+    hashVal.y = index; // position in smallAabbMapping, not the index into allpAABB
+    pHash[index] = hashVal;
+__kernel void kClearCellStart(	int numCells, 
+								__global int* pCellStart ) // Sets every one of the numCells entries to -1, the value findPairsInCell treats as an empty cell.
+    int index = get_global_id(0);
+    if(index >= numCells) // surplus work items beyond numCells do nothing
+	{
+		return;
+	}
+	pCellStart[index] = -1;
+__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart ) // Records, per cell hash, the first index in pHash that carries it; presumably pHash has been sorted by hash beforehand - confirm.
+	__local int sharedHash[513]; // slot local_id+1 is written below, so the work-group size must not exceed 512
+    int index = get_global_id(0);
+	int2 sortedData;
+    if(index < numObjects)
+	{
+		sortedData = pHash[index];
+		// Load hash data into shared memory so that we can look 
+		// at neighboring body's hash value without loading
+		// two hash values per thread
+		sharedHash[get_local_id(0) + 1] = sortedData.x;
+		if((index > 0) && (get_local_id(0) == 0))
+		{
+			// first thread in block must load neighbor body hash
+			sharedHash[0] = pHash[index-1].x;
+		}
+	}
+    barrier(CLK_LOCAL_MEM_FENCE); // placed outside the range checks, so every work item of the group reaches it
+    if(index < numObjects)
+	{
+		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)])) // hash differs from the previous entry: a new cell starts here (index 0 never reads sharedHash[0])
+		{
+			cellStart[sortedData.x] = index;
+		}
+	}
+int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1) // Non-zero when the boxes overlap on x, y and z; touching boxes count as overlapping and the w lanes are ignored.
+	return	(min0.x <= max1.x)&& (min1.x <= max0.x) && 
+			(min0.y <= max1.y)&& (min1.y <= max0.y) && 
+			(min0.z <= max1.z)&& (min1.z <= max0.z); 
+//search for AABB 'index' against the other AABBs in the cell at gridPos and append overlapping pairs to pPairBuff2
+void findPairsInCell(	int numObjects,
+						int4	gridPos,
+						int    index,
+						__global int2*  pHash,
+						__global int*   pCellStart,
+						__global float4* allpAABB, 
+						__global const int* smallAabbMapping,
+						__global float4* pParams,
+							volatile  __global int* pairCount,
+						__global int4*   pPairBuff2,
+						int maxPairs
+						)
+	int4 pGridDim = *((__global int4*)(pParams + 1));
+	int maxBodiesPerCell = pGridDim.w; // w lane of the grid dimensions caps how many entries of one cell are scanned
+    int gridHash = getPosHash(gridPos, pParams);
+    // get start of bucket for this cell
+    int bucketStart = pCellStart[gridHash];
+    if (bucketStart == -1)
+	{
+        return;   // cell empty
+	}
+	// iterate over bodies in this cell
+    int2 sortedData = pHash[index];
+	int unsorted_indx = sortedData.y; // position of this body in smallAabbMapping
+    float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; 
+	float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];
+	int handleIndex =  as_int(min0.w); // w lane of the min corner reinterpreted as an int: the id written into the pair
+	int bucketEnd = bucketStart + maxBodiesPerCell;
+	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd; // never read past the end of pHash
+	for(int index2 = bucketStart; index2 < bucketEnd; index2++) 
+	{
+        int2 cellData = pHash[index2];
+        if (cellData.x != gridHash)
+        {
+			break;   // no longer in same bucket
+		}
+		int unsorted_indx2 = cellData.y;
+        //if (unsorted_indx2 < unsorted_indx) // check not colliding with self
+		if (unsorted_indx2 != unsorted_indx) // check not colliding with self
+        {   
+			float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];
+			float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];
+			if(testAABBOverlap(min0, max0, min1, max1))
+			{
+				if (pairCount)
+				{
+					int handleIndex2 = as_int(min1.w);
+					if (handleIndex<handleIndex2) // only the side with the smaller id emits, so a pair is not written in both orders
+					{
+						int curPair = atomic_add(pairCount,1); // the counter keeps growing past maxPairs; only the write below is skipped
+						if (curPair<maxPairs)
+						{
+							int4 newpair;
+							newpair.x = handleIndex;
+							newpair.y = handleIndex2;
+							newpair.z = -1;
+							newpair.w = -1;
+							pPairBuff2[curPair] = newpair;
+						}
+					}
+				}
+			}
+		}
+	}
+__kernel void kFindOverlappingPairs(	int numObjects, // one work item per pHash entry; tests its AABB against the 3x3x3 block of cells around its own cell
+										__global float4* allpAABB, 
+										__global const int* smallAabbMapping,
+										__global int2* pHash, 
+										__global int* pCellStart, 
+										__global float4* pParams ,
+										volatile  __global int* pairCount,
+										__global int4*   pPairBuff2,
+										int maxPairs
+										)
+    int index = get_global_id(0);
+    if(index >= numObjects) // surplus work items beyond numObjects do nothing
+	{
+		return;
+	}
+    int2 sortedData = pHash[index];
+	int unsorted_indx = sortedData.y;
+	float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];
+	float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];
+	float4 pos;
+	pos.x = (bbMin.x + bbMax.x) * 0.5f; // centre of the box; pos.w stays unset and getGridPos does not read it
+	pos.y = (bbMin.y + bbMax.y) * 0.5f;
+	pos.z = (bbMin.z + bbMax.z) * 0.5f;
+    // get address in grid
+    int4 gridPosA = getGridPos(pos, pParams);
+    int4 gridPosB; 
+    // examine only neighbouring cells
+    for(int z=-1; z<=1; z++) 
+    {
+		gridPosB.z = gridPosA.z + z;
+        for(int y=-1; y<=1; y++) 
+        {
+			gridPosB.y = gridPosA.y + y;
+            for(int x=-1; x<=1; x++) 
+            {
+				gridPosB.x = gridPosA.x + x; // may step outside the grid range; getPosHash wraps it again
+                findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);
+            }
+        }
+    }
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h
new file mode 100644
index 00000000..dad42477
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h
@@ -0,0 +1,199 @@
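+//This file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project; it holds the text of kernels/gridBroadphase.cl as a C string, so edit the .cl file and regenerate instead of changing this header by hand.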
+static const char* gridBroadphaseCL= \
+"int getPosHash(int4 gridPos, __global float4* pParams)\n"
+"	int4 gridDim = *((__global int4*)(pParams + 1));\n"
+"	gridPos.x &= gridDim.x - 1;\n"
+"	gridPos.y &= gridDim.y - 1;\n"
+"	gridPos.z &= gridDim.z - 1;\n"
+"	int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
+"	return hash;\n"
+"} \n"
+"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
+"    int4 gridPos;\n"
+"	int4 gridDim = *((__global int4*)(pParams + 1));\n"
+"    gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
+"    gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
+"    gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
+"    return gridPos;\n"
+"// calculate grid hash value for each body using its AABB\n"
+"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
+"    int index = get_global_id(0);\n"
+"    if(index >= numObjects)\n"
+"	{\n"
+"		return;\n"
+"	}\n"
+"	float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
+"	float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
+"	float4 pos;\n"
+"	pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
+"	pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
+"	pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
+"	pos.w = 0.f;\n"
+"    // get address in grid\n"
+"    int4 gridPos = getGridPos(pos, pParams);\n"
+"    int gridHash = getPosHash(gridPos, pParams);\n"
+"    // store grid hash and body index\n"
+"    int2 hashVal;\n"
+"    hashVal.x = gridHash;\n"
+"    hashVal.y = index;\n"
+"    pHash[index] = hashVal;\n"
+"__kernel void kClearCellStart(	int numCells, \n"
+"								__global int* pCellStart )\n"
+"    int index = get_global_id(0);\n"
+"    if(index >= numCells)\n"
+"	{\n"
+"		return;\n"
+"	}\n"
+"	pCellStart[index] = -1;\n"
+"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
+"	__local int sharedHash[513];\n"
+"    int index = get_global_id(0);\n"
+"	int2 sortedData;\n"
+"    if(index < numObjects)\n"
+"	{\n"
+"		sortedData = pHash[index];\n"
+"		// Load hash data into shared memory so that we can look \n"
+"		// at neighboring body's hash value without loading\n"
+"		// two hash values per thread\n"
+"		sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
+"		if((index > 0) && (get_local_id(0) == 0))\n"
+"		{\n"
+"			// first thread in block must load neighbor body hash\n"
+"			sharedHash[0] = pHash[index-1].x;\n"
+"		}\n"
+"	}\n"
+"    barrier(CLK_LOCAL_MEM_FENCE);\n"
+"    if(index < numObjects)\n"
+"	{\n"
+"		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
+"		{\n"
+"			cellStart[sortedData.x] = index;\n"
+"		}\n"
+"	}\n"
+"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
+"	return	(min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
+"			(min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
+"			(min0.z <= max1.z)&& (min1.z <= max0.z); \n"
+"//search for AABB 'index' against other AABBs' in this cell\n"
+"void findPairsInCell(	int numObjects,\n"
+"						int4	gridPos,\n"
+"						int    index,\n"
+"						__global int2*  pHash,\n"
+"						__global int*   pCellStart,\n"
+"						__global float4* allpAABB, \n"
+"						__global const int* smallAabbMapping,\n"
+"						__global float4* pParams,\n"
+"							volatile  __global int* pairCount,\n"
+"						__global int4*   pPairBuff2,\n"
+"						int maxPairs\n"
+"						)\n"
+"	int4 pGridDim = *((__global int4*)(pParams + 1));\n"
+"	int maxBodiesPerCell = pGridDim.w;\n"
+"    int gridHash = getPosHash(gridPos, pParams);\n"
+"    // get start of bucket for this cell\n"
+"    int bucketStart = pCellStart[gridHash];\n"
+"    if (bucketStart == -1)\n"
+"	{\n"
+"        return;   // cell empty\n"
+"	}\n"
+"	// iterate over bodies in this cell\n"
+"    int2 sortedData = pHash[index];\n"
+"	int unsorted_indx = sortedData.y;\n"
+"    float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
+"	float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
+"	int handleIndex =  as_int(min0.w);\n"
+"	\n"
+"	int bucketEnd = bucketStart + maxBodiesPerCell;\n"
+"	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
+"	for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
+"	{\n"
+"        int2 cellData = pHash[index2];\n"
+"        if (cellData.x != gridHash)\n"
+"        {\n"
+"			break;   // no longer in same bucket\n"
+"		}\n"
+"		int unsorted_indx2 = cellData.y;\n"
+"        //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
+"		if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
+"        {   \n"
+"			float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
+"			float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
+"			if(testAABBOverlap(min0, max0, min1, max1))\n"
+"			{\n"
+"				if (pairCount)\n"
+"				{\n"
+"					int handleIndex2 = as_int(min1.w);\n"
+"					if (handleIndex<handleIndex2)\n"
+"					{\n"
+"						int curPair = atomic_add(pairCount,1);\n"
+"						if (curPair<maxPairs)\n"
+"						{\n"
+"							int4 newpair;\n"
+"							newpair.x = handleIndex;\n"
+"							newpair.y = handleIndex2;\n"
+"							newpair.z = -1;\n"
+"							newpair.w = -1;\n"
+"							pPairBuff2[curPair] = newpair;\n"
+"						}\n"
+"					}\n"
+"				\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"__kernel void kFindOverlappingPairs(	int numObjects,\n"
+"										__global float4* allpAABB, \n"
+"										__global const int* smallAabbMapping,\n"
+"										__global int2* pHash, \n"
+"										__global int* pCellStart, \n"
+"										__global float4* pParams ,\n"
+"										volatile  __global int* pairCount,\n"
+"										__global int4*   pPairBuff2,\n"
+"										int maxPairs\n"
+"										)\n"
+"    int index = get_global_id(0);\n"
+"    if(index >= numObjects)\n"
+"	{\n"
+"		return;\n"
+"	}\n"
+"    int2 sortedData = pHash[index];\n"
+"	int unsorted_indx = sortedData.y;\n"
+"	float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
+"	float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
+"	float4 pos;\n"
+"	pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
+"	pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
+"	pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
+"    // get address in grid\n"
+"    int4 gridPosA = getGridPos(pos, pParams);\n"
+"    int4 gridPosB; \n"
+"    // examine only neighbouring cells\n"
+"    for(int z=-1; z<=1; z++) \n"
+"    {\n"
+"		gridPosB.z = gridPosA.z + z;\n"
+"        for(int y=-1; y<=1; y++) \n"
+"        {\n"
+"			gridPosB.y = gridPosA.y + y;\n"
+"            for(int x=-1; x<=1; x++) \n"
+"            {\n"
+"				gridPosB.x = gridPosA.x + x;\n"
+"                findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
+"            }\n"
+"        }\n"
+"    }\n"
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
new file mode 100644
index 00000000..c375b9bf
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
@@ -0,0 +1,767 @@
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Initial Author Jackson Lee, 2014
+typedef float b3Scalar;
+typedef float4 b3Vector3;
+#define b3Max max
+#define b3Min min
+#define b3Sqrt sqrt
+typedef struct
+	unsigned int m_key;
+	unsigned int m_value;
+} SortDataCL;
+typedef struct 
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} b3AabbCL;
+unsigned int interleaveBits(unsigned int x)	//Spreads the low 10 bits of x so that each one is followed by two zero bits (see diagram below)
+	//........ ........ ......12 3456789A	//x
+	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x after interleaving bits
+	//......12 3456789A ......12 3456789A	//x ^ (x << 16)
+	//11111111 ........ ........ 11111111	//0x FF 00 00 FF
+	//......12 ........ ........ 3456789A	//x = (x ^ (x << 16)) & 0xFF0000FF;
+	//......12 ........ 3456789A 3456789A	//x ^ (x <<  8)
+	//......11 ........ 1111.... ....1111	//0x 03 00 F0 0F
+	//......12 ........ 3456.... ....789A	//x = (x ^ (x <<  8)) & 0x0300F00F;
+	//..12..12 ....3456 3456.... 789A789A	//x ^ (x <<  4)
+	//......11 ....11.. ..11.... 11....11	//0x 03 0C 30 C3
+	//......12 ....34.. ..56.... 78....9A	//x = (x ^ (x <<  4)) & 0x030C30C3;
+	//....1212 ..3434.. 5656..78 78..9A9A	//x ^ (x <<  2)
+	//....1..1 ..1..1.. 1..1..1. .1..1..1	//0x 09 24 92 49
+	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x = (x ^ (x <<  2)) & 0x09249249;
+	//........ ........ ......11 11111111	//0x000003FF
+	x &= 0x000003FF;		//Keep only the low 10 bits (bits 0-9); everything above is cleared
+	x = (x ^ (x << 16)) & 0xFF0000FF;
+	x = (x ^ (x <<  8)) & 0x0300F00F;
+	x = (x ^ (x <<  4)) & 0x030C30C3;
+	x = (x ^ (x <<  2)) & 0x09249249;
+	return x;
+unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)	//Interleaves the low 10 bits of x, y and z into a single 30-bit code
+	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;	//x occupies bits 0,3,6,...; y bits 1,4,7,...; z bits 2,5,8,...
+__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)
+	int separatedAabbIndex = get_global_id(0);
+	if(separatedAabbIndex >= numAabbsToSeparate) return;
+	int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];
+	out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];
+//Should replace with an optimized parallel reduction
+__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)
+	//Each time this kernel is added to the command queue, 
+	//the number of AABBs needing to be merged is halved
+	//
+	//Example with 159 AABBs:
+	//	numRemainingAabbs == 159 / 2 + 159 % 2 == 80
+	//	numMergedAabbs == 159 - 80 == 79
+	//So, indices [0, 78] are merged with [0 + 80, 78 + 80]
+	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;
+	int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;
+	int aabbIndex = get_global_id(0);
+	if(aabbIndex >= numMergedAabbs) return;
+	int otherAabbIndex = aabbIndex + numRemainingAabbs;
+	b3AabbCL aabb = out_mergedAabb[aabbIndex];
+	b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];
+	b3AabbCL mergedAabb;
+	mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
+	mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
+	out_mergedAabb[aabbIndex] = mergedAabb;
+__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, 
+												__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)
+	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index
+	if(leafNodeIndex >= numAabbs) return;
+	b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];
+	b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;
+	b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;
+	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];
+	b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;
+	b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;
+	//Quantize into integer coordinates
+	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size
+	b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;
+	int4 discretePosition;
+	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );
+	discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );
+	discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );
+	//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]
+	discretePosition = b3Max( -512, b3Min(discretePosition, 511) );
+	discretePosition += 512;
+	//Interleave bits(assign a morton code, also known as a z-curve)
+	unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);
+	//
+	SortDataCL mortonCodeIndexPair;
+	mortonCodeIndexPair.m_key = mortonCode;
+	mortonCodeIndexPair.m_value = leafNodeIndex;
+	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;
+//The most significant bit (0x80000000) of an int32 is used to distinguish between leaf and internal nodes.
+//If it is set, then the index is for an internal node; otherwise, it is a leaf node. 
+//In both cases, the bit should be cleared to access the actual node index.
+int isLeafNode(int index) { return (index >> 31 == 0); }	//1 if bit 31 is clear (leaf), 0 if it is set (internal node)
+int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }	//Clears bit 31 and keeps the remaining 31 index bits
+int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }	//Leaf: index returned unchanged; internal node: index with bit 31 set
+//From sap.cl
+#define NEW_PAIR_MARKER -1
+bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)	//True unless the boxes are strictly apart on x, y or z (touching counts as overlap); w is not compared
+	bool overlap = true;
+	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
+	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
+	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
+	return overlap;
+//From sap.cl
+__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, 
+											__global int* rootNodeIndex, 
+											__global int2* internalNodeChildIndices, 
+											__global b3AabbCL* internalNodeAabbs,
+											__global int2* internalNodeLeafIndexRanges,
+											__global SortDataCL* mortonCodesAndAabbIndices,
+											__global int* out_numPairs, __global int4* out_overlappingPairs, 
+											int maxPairs, int numQueryAabbs)
+	//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since
+	//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)
+	int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
+	if(queryBvhNodeIndex >= numQueryAabbs) return;
+	int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;
+	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];
+	int stackSize = 1;
+	stack[0] = *rootNodeIndex;
+	while(stackSize)
+	{
+		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
+		--stackSize;
+		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false
+		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
+		//Optimization - if the BVH is structured as a binary radix tree, then
+		//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).
+		//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.
+		{
+			int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;
+			if(highestLeafIndex <= queryBvhNodeIndex) continue;
+		}
+		//bvhRigidIndex is not used if internal node
+		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
+		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
+		if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )
+		{
+			if(isLeaf)
+			{
+				int4 pair;
+				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];
+				pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
+				pair.z = NEW_PAIR_MARKER;
+				pair.w = NEW_PAIR_MARKER;
+				int pairIndex = atomic_inc(out_numPairs);
+				if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
+			}
+			if(!isLeaf)	//Internal node
+			{
+				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
+				{
+					//Error
+				}
+				else
+				{
+					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
+					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
+				}
+			}
+		}
+	}
+//From rayCastKernels.cl
+typedef struct
+	float4 m_from;
+	float4 m_to;
+} b3RayInfo;
+//From rayCastKernels.cl
+b3Vector3 b3Vector3_normalize(b3Vector3 v)
+	b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};
+	return normalize(normal);	//OpenCL normalize == vector4 normalize
+b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }	//Squared length of the xyz part; v.w is not read
+b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }	//Dot product of the xyz parts; the w components are not read
+int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)
+	//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).
+	//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.
+	//
+	//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane 
+	//and min.x will be the far plane; otherwise, it is reversed.
+	//
+	//In order for there to be a collision, the t_min and t_max of each pair must overlap.
+	//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.
+	int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) );	//isless(x,y) returns (x < y)
+	//When using vector types, the select() function checks the most significant bit, 
+	//but isless() sets the least significant bit.
+	isNegative <<= 31;
+	//select(b, a, condition) == condition ? a : b
+	//When using select() with vector types, (condition[i]) is true if its most significant bit is 1
+	b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;
+	b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;
+	b3Scalar t_min_final = 0.0f;
+	b3Scalar t_max_final = rayLength;
+	//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. 
+	//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])
+	//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.
+	t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );
+	t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );
+	return (t_min_final <= t_max_final);
+__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,
+								__global int* rootNodeIndex, 
+								__global int2* internalNodeChildIndices, 
+								__global b3AabbCL* internalNodeAabbs,
+								__global int2* internalNodeLeafIndexRanges,
+								__global SortDataCL* mortonCodesAndAabbIndices,
+								__global b3RayInfo* rays,
+								__global int* out_numRayRigidPairs, 
+								__global int2* out_rayRigidPairs,
+								int maxRayRigidPairs, int numRays)
+	int rayIndex = get_global_id(0);
+	if(rayIndex >= numRays) return;
+	//
+	b3Vector3 rayFrom = rays[rayIndex].m_from;
+	b3Vector3 rayTo = rays[rayIndex].m_to;
+	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);
+	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );
+	//
+	int stackSize = 1;
+	stack[0] = *rootNodeIndex;
+	while(stackSize)
+	{
+		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
+		--stackSize;
+		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false
+		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
+		//bvhRigidIndex is not used if internal node
+		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
+		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
+		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb)  )
+		{
+			if(isLeaf)
+			{
+				int2 rayRigidPair;
+				rayRigidPair.x = rayIndex;
+				rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
+				int pairIndex = atomic_inc(out_numRayRigidPairs);
+				if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;
+			}
+			if(!isLeaf)	//Internal node
+			{
+				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
+				{
+					//Error
+				}
+				else
+				{
+					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
+					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
+				}
+			}
+		}
+	}
+__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, 
+									__global int* out_numPairs, __global int4* out_overlappingPairs, 
+									int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)
+	int smallAabbIndex = get_global_id(0);
+	if(smallAabbIndex >= numSmallAabbRigids) return;
+	b3AabbCL smallAabb = smallAabbs[smallAabbIndex];
+	for(int i = 0; i < numLargeAabbRigids; ++i)
+	{
+		b3AabbCL largeAabb = largeAabbs[i];
+		if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )
+		{
+			int4 pair;
+			pair.x = largeAabb.m_minIndices[3];
+			pair.y = smallAabb.m_minIndices[3];
+			pair.z = NEW_PAIR_MARKER;
+			pair.w = NEW_PAIR_MARKER;
+			int pairIndex = atomic_inc(out_numPairs);
+			if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
+		}
+	}
+__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,
+									__global int* out_numRayRigidPairs,  __global int2* out_rayRigidPairs,
+									int numLargeAabbRigids, int maxRayRigidPairs, int numRays)
+	int rayIndex = get_global_id(0);
+	if(rayIndex >= numRays) return;
+	b3Vector3 rayFrom = rays[rayIndex].m_from;
+	b3Vector3 rayTo = rays[rayIndex].m_to;
+	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);
+	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );
+	for(int i = 0; i < numLargeAabbRigids; ++i)
+	{
+		b3AabbCL rigidAabb = largeRigidAabbs[i];
+		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )
+		{
+			int2 rayRigidPair;
+			rayRigidPair.x = rayIndex;
+			rayRigidPair.y = rigidAabb.m_minIndices[3];
+			int pairIndex = atomic_inc(out_numRayRigidPairs);
+			if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;
+		}
+	}
+//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.
+//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.
+//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.
+//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).
+#define b3Int64 long
+int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }	//Number of leading bits on which i and j agree; OpenCL clz(0) returns the bit width, so identical inputs give 64
+b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) 
+	//This function only needs to return (i & j) in order for the algorithm to work,
+	//but it may help with debugging to mask out the lower bits.
+	b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);
+	b3Int64 sharedBits = i & j;
+	b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength);	//Set all bits after the common prefix to 0
+	return sharedBits & bitmask;
+//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths
+int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)
+	return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );
+__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,
+											__global b3Int64* out_commonPrefixes,
+											__global int* out_commonPrefixLengths,
+											int numInternalNodes)
+	int internalNodeIndex = get_global_id(0);
+	if (internalNodeIndex >= numInternalNodes) return;
+	//Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,
+	//and the number of internal nodes is always numLeafNodes - 1
+	int leftLeafIndex = internalNodeIndex;
+	int rightLeafIndex = internalNodeIndex + 1;
+	int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;
+	int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;
+	//Binary radix tree construction algorithm does not work if there are duplicate morton codes.
+	//Append the index of each leaf node to each morton code so that there are no duplicates.
+	//The algorithm also requires that the morton codes are sorted in ascending order; this requirement
+	//is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.
+	//
+	//upsample(a, b) == ( ((b3Int64)a) << 32) | b
+	b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);
+	b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);
+	out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);
+	out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);
+__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,
+											__global int2* out_childNodes, int numLeafNodes)
+	int leafNodeIndex = get_global_id(0);
+	if (leafNodeIndex >= numLeafNodes) return;
+	int numInternalNodes = numLeafNodes - 1;
+	int leftSplitIndex = leafNodeIndex - 1;
+	int rightSplitIndex = leafNodeIndex;
+	int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
+	int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
+	//Parent node is the highest adjacent common prefix that is lower than the node's common prefix
+	//Leaf nodes are considered as having the highest common prefix
+	int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);
+	//Handle cases for the edge nodes; the first and last node
+	//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX
+	if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;
+	if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;
+	int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;
+	out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;
+	int isRightChild = (isLeftHigherCommonPrefix);	//If the left node is the parent, then this node is its right child and vice versa
+	//out_childNodesAsInt[0] == int2.x == left child
+	//out_childNodesAsInt[1] == int2.y == right child
+	int isLeaf = 1;
+	__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);
+	out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);
+__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,
+												__global int2* out_childNodes,
+												__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,
+												int numInternalNodes)
+	int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
+	if(internalNodeIndex >= numInternalNodes) return;
+	b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];
+	int nodePrefixLength = commonPrefixLengths[internalNodeIndex];
+	int leftIndex = -1;
+	int rightIndex = -1;
+	//Find nearest element to left with a lower common prefix
+	for(int i = internalNodeIndex - 1; i >= 0; --i)
+	{
+		int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);
+		if(nodeLeftSharedPrefixLength < nodePrefixLength)
+		{
+			leftIndex = i;
+			break;
+		}
+	}
+	//Find nearest element to right with a lower common prefix
+	for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)
+	{
+		int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);
+		if(nodeRightSharedPrefixLength < nodePrefixLength)
+		{
+			rightIndex = i;
+			break;
+		}
+	}
+#else //Use binary search
+	//Find nearest element to left with a lower common prefix
+	int leftIndex = -1;
+	{
+		int lower = 0;
+		int upper = internalNodeIndex - 1;
+		while(lower <= upper)
+		{
+			int mid = (lower + upper) / 2;
+			b3Int64 midPrefix = commonPrefixes[mid];
+			int midPrefixLength = commonPrefixLengths[mid];
+			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);
+			if(nodeMidSharedPrefixLength < nodePrefixLength) 
+			{
+				int right = mid + 1;
+				if(right < internalNodeIndex)
+				{
+					b3Int64 rightPrefix = commonPrefixes[right];
+					int rightPrefixLength = commonPrefixLengths[right];
+					int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);
+					if(nodeRightSharedPrefixLength < nodePrefixLength) 
+					{
+						lower = right;
+						leftIndex = right;
+					}
+					else 
+					{
+						leftIndex = mid;
+						break;
+					}
+				}
+				else 
+				{
+					leftIndex = mid;
+					break;
+				}
+			}
+			else upper = mid - 1;
+		}
+	}
+	//Find nearest element to right with a lower common prefix
+	int rightIndex = -1;
+	{
+		int lower = internalNodeIndex + 1;
+		int upper = numInternalNodes - 1;
+		while(lower <= upper)
+		{
+			int mid = (lower + upper) / 2;
+			b3Int64 midPrefix = commonPrefixes[mid];
+			int midPrefixLength = commonPrefixLengths[mid];
+			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);
+			if(nodeMidSharedPrefixLength < nodePrefixLength) 
+			{
+				int left = mid - 1;
+				if(left > internalNodeIndex)
+				{
+					b3Int64 leftPrefix = commonPrefixes[left];
+					int leftPrefixLength = commonPrefixLengths[left];
+					int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);
+					if(nodeLeftSharedPrefixLength < nodePrefixLength) 
+					{
+						upper = left;
+						rightIndex = left;
+					}
+					else 
+					{
+						rightIndex = mid;
+						break;
+					}
+				}
+				else 
+				{
+					rightIndex = mid;
+					break;
+				}
+			}
+			else lower = mid + 1;
+		}
+	}
+	//Select parent
+	{
+		int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
+		int rightPrefixLength =  (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
+		int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);
+		if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;
+		else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;
+		int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;
+		int isRootNode = (leftIndex == -1 && rightIndex == -1);
+		out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;
+		int isLeaf = 0;
+		if(!isRootNode)
+		{
+			int isRightChild = (isLeftHigherPrefixLength);	//If the left node is the parent, then this node is its right child and vice versa
+			//out_childNodesAsInt[0] == int2.x == left child
+			//out_childNodesAsInt[1] == int2.y == right child
+			__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);
+			out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);
+		}
+		else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);
+	}
+__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,
+									__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)
+	if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);
+	int internalNodeIndex = get_global_id(0);
+	if(internalNodeIndex >= numInternalNodes) return;
+	//
+	int distanceFromRoot = 0;
+	{
+		int parentIndex = internalNodeParentNodes[internalNodeIndex];
+		while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)
+		{
+			parentIndex = internalNodeParentNodes[parentIndex];
+			++distanceFromRoot;
+		}
+	}
+	out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;
+	//
+	__local int localMaxDistanceFromRoot;
+	if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;
+	atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);
+	if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);
+__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,
+												__global int2* childNodes,
+												__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,
+												int maxDistanceFromRoot, int processedDistance, int numInternalNodes)
+	int internalNodeIndex = get_global_id(0);
+	if(internalNodeIndex >= numInternalNodes) return;
+	int distance = distanceFromRoot[internalNodeIndex];
+	if(distance == processedDistance)
+	{
+		int leftChildIndex = childNodes[internalNodeIndex].x;
+		int rightChildIndex = childNodes[internalNodeIndex].y;
+		int isLeftChildLeaf = isLeafNode(leftChildIndex);
+		int isRightChildLeaf = isLeafNode(rightChildIndex);
+		leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);
+		rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);
+		//leftRigidIndex/rightRigidIndex is not used if internal node
+		int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;
+		int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;
+		b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];
+		b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];
+		b3AabbCL mergedAabb;
+		mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);
+		mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);
+		internalNodeAabbs[internalNodeIndex] = mergedAabb;
+	}
+__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)
+	int internalNodeIndex = get_global_id(0);
+	if(internalNodeIndex >= numInternalNodes) return;
+	int numLeafNodes = numInternalNodes + 1;
+	int2 childNodes = internalNodeChildNodes[internalNodeIndex];
+	int2 leafIndexRange;	//x == min leaf index, y == max leaf index
+	//Find lowest leaf index covered by this internal node
+	{
+		int lowestIndex = childNodes.x;		//childNodes.x == Left child
+		while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;
+		leafIndexRange.x = lowestIndex;
+	}
+	//Find highest leaf index covered by this internal node
+	{
+		int highestIndex = childNodes.y;	//childNodes.y == Right child
+		while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;
+		leafIndexRange.y = highestIndex;
+	}
+	//
+	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
new file mode 100644
index 00000000..5eb8f45b
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
@@ -0,0 +1,729 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* parallelLinearBvhCL= \
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose,\n"
+"including commercial applications, and to alter it and redistribute it freely,\n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Initial Author Jackson Lee, 2014\n"
+"typedef float b3Scalar;\n"
+"typedef float4 b3Vector3;\n"
+"#define b3Max max\n"
+"#define b3Min min\n"
+"#define b3Sqrt sqrt\n"
+"typedef struct\n"
+"	unsigned int m_key;\n"
+"	unsigned int m_value;\n"
+"} SortDataCL;\n"
+"typedef struct \n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} b3AabbCL;\n"
+"unsigned int interleaveBits(unsigned int x)\n"
+"	//........ ........ ......12 3456789A	//x\n"
+"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x after interleaving bits\n"
+"	\n"
+"	//......12 3456789A ......12 3456789A	//x ^ (x << 16)\n"
+"	//11111111 ........ ........ 11111111	//0x FF 00 00 FF\n"
+"	//......12 ........ ........ 3456789A	//x = (x ^ (x << 16)) & 0xFF0000FF;\n"
+"	\n"
+"	//......12 ........ 3456789A 3456789A	//x ^ (x <<  8)\n"
+"	//......11 ........ 1111.... ....1111	//0x 03 00 F0 0F\n"
+"	//......12 ........ 3456.... ....789A	//x = (x ^ (x <<  8)) & 0x0300F00F;\n"
+"	\n"
+"	//..12..12 ....3456 3456.... 789A789A	//x ^ (x <<  4)\n"
+"	//......11 ....11.. ..11.... 11....11	//0x 03 0C 30 C3\n"
+"	//......12 ....34.. ..56.... 78....9A	//x = (x ^ (x <<  4)) & 0x030C30C3;\n"
+"	\n"
+"	//....1212 ..3434.. 5656..78 78..9A9A	//x ^ (x <<  2)\n"
+"	//....1..1 ..1..1.. 1..1..1. .1..1..1	//0x 09 24 92 49\n"
+"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x = (x ^ (x <<  2)) & 0x09249249;\n"
+"	\n"
+"	//........ ........ ......11 11111111	//0x000003FF\n"
+"	x &= 0x000003FF;		//Clear all bits above bit 10\n"
+"	\n"
+"	x = (x ^ (x << 16)) & 0xFF0000FF;\n"
+"	x = (x ^ (x <<  8)) & 0x0300F00F;\n"
+"	x = (x ^ (x <<  4)) & 0x030C30C3;\n"
+"	x = (x ^ (x <<  2)) & 0x09249249;\n"
+"	\n"
+"	return x;\n"
+"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
+"	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
+"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n"
+"	int separatedAabbIndex = get_global_id(0);\n"
+"	if(separatedAabbIndex >= numAabbsToSeparate) return;\n"
+"	int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n"
+"	out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n"
+"//Should replace with an optimized parallel reduction\n"
+"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
+"	//Each time this kernel is added to the command queue, \n"
+"	//the number of AABBs needing to be merged is halved\n"
+"	//\n"
+"	//Example with 159 AABBs:\n"
+"	//	numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
+"	//	numMergedAabbs == 159 - 80 == 79\n"
+"	//So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
+"	\n"
+"	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
+"	int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
+"	\n"
+"	int aabbIndex = get_global_id(0);\n"
+"	if(aabbIndex >= numMergedAabbs) return;\n"
+"	\n"
+"	int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
+"	\n"
+"	b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
+"	b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
+"		\n"
+"	b3AabbCL mergedAabb;\n"
+"	mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
+"	mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
+"	out_mergedAabb[aabbIndex] = mergedAabb;\n"
+"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
+"												__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
+"	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index\n"
+"	if(leafNodeIndex >= numAabbs) return;\n"
+"	\n"
+"	b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n"
+"	b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n"
+"	b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n"
+"	\n"
+"	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
+"	b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n"
+"	b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n"
+"	\n"
+"	//Quantize into integer coordinates\n"
+"	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
+"	b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n"
+"	\n"
+"	int4 discretePosition;\n"
+"	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
+"	discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
+"	discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
+"	\n"
+"	//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
+"	discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
+"	discretePosition += 512;\n"
+"	\n"
+"	//Interleave bits(assign a morton code, also known as a z-curve)\n"
+"	unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
+"	\n"
+"	//\n"
+"	SortDataCL mortonCodeIndexPair;\n"
+"	mortonCodeIndexPair.m_key = mortonCode;\n"
+"	mortonCodeIndexPair.m_value = leafNodeIndex;\n"
+"	\n"
+"	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
+"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n"
+"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n"
+"//In both cases, the bit should be cleared to access the actual node index.\n"
+"int isLeafNode(int index) { return (index >> 31 == 0); }\n"
+"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n"
+"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n"
+"//From sap.cl\n"
+"#define NEW_PAIR_MARKER -1\n"
+"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"//From sap.cl\n"
+"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
+"											__global int* rootNodeIndex, \n"
+"											__global int2* internalNodeChildIndices, \n"
+"											__global b3AabbCL* internalNodeAabbs,\n"
+"											__global int2* internalNodeLeafIndexRanges,\n"
+"											\n"
+"											__global SortDataCL* mortonCodesAndAabbIndices,\n"
+"											__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
+"											int maxPairs, int numQueryAabbs)\n"
+"	//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n"
+"	//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n"
+"	int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
+"	if(queryBvhNodeIndex >= numQueryAabbs) return;\n"
+"	\n"
+"	int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n"
+"	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
+"	\n"
+"	\n"
+"	int stackSize = 1;\n"
+"	stack[0] = *rootNodeIndex;\n"
+"	\n"
+"	while(stackSize)\n"
+"	{\n"
+"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
+"		--stackSize;\n"
+"		\n"
+"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
+"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
+"		\n"
+"		//Optimization - if the BVH is structured as a binary radix tree, then\n"
+"		//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n"
+"		//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n"
+"		{\n"
+"			int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
+"			if(highestLeafIndex <= queryBvhNodeIndex) continue;\n"
+"		}\n"
+"		\n"
+"		//bvhRigidIndex is not used if internal node\n"
+"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
+"	\n"
+"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
+"		if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
+"		{\n"
+"			if(isLeaf)\n"
+"			{\n"
+"				int4 pair;\n"
+"				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
+"				pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
+"				pair.z = NEW_PAIR_MARKER;\n"
+"				pair.w = NEW_PAIR_MARKER;\n"
+"				\n"
+"				int pairIndex = atomic_inc(out_numPairs);\n"
+"				if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
+"			}\n"
+"			\n"
+"			if(!isLeaf)	//Internal node\n"
+"			{\n"
+"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
+"				{\n"
+"					//Error\n"
+"				}\n"
+"				else\n"
+"				{\n"
+"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
+"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"	}\n"
+"//From rayCastKernels.cl\n"
+"typedef struct\n"
+"	float4 m_from;\n"
+"	float4 m_to;\n"
+"} b3RayInfo;\n"
+"//From rayCastKernels.cl\n"
+"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n"
+"	b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n"
+"	return normalize(normal);	//OpenCL normalize == vector4 normalize\n"
+"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n"
+"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n"
+"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n"
+"	//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n"
+"	//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n"
+"	//\n"
+"	//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n"
+"	//and min.x will be the far plane; otherwise, it is reversed.\n"
+"	//\n"
+"	//In order for there to be a collision, the t_min and t_max of each pair must overlap.\n"
+"	//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n"
+"	\n"
+"	int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) );	//isless(x,y) returns (x < y)\n"
+"	\n"
+"	//When using vector types, the select() function checks the most signficant bit, \n"
+"	//but isless() sets the least significant bit.\n"
+"	isNegative <<= 31;\n"
+"	//select(b, a, condition) == condition ? a : b\n"
+"	//When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n"
+"	b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
+"	b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
+"	\n"
+"	b3Scalar t_min_final = 0.0f;\n"
+"	b3Scalar t_max_final = rayLength;\n"
+"	\n"
+"	//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n"
+"	//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n"
+"	//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n"
+"	t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n"
+"	t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n"
+"	\n"
+"	return (t_min_final <= t_max_final);\n"
+"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n"
+"								__global int* rootNodeIndex, \n"
+"								__global int2* internalNodeChildIndices, \n"
+"								__global b3AabbCL* internalNodeAabbs,\n"
+"								__global int2* internalNodeLeafIndexRanges,\n"
+"								__global SortDataCL* mortonCodesAndAabbIndices,\n"
+"								\n"
+"								__global b3RayInfo* rays,\n"
+"								\n"
+"								__global int* out_numRayRigidPairs, \n"
+"								__global int2* out_rayRigidPairs,\n"
+"								int maxRayRigidPairs, int numRays)\n"
+"	int rayIndex = get_global_id(0);\n"
+"	if(rayIndex >= numRays) return;\n"
+"	\n"
+"	//\n"
+"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
+"	b3Vector3 rayTo = rays[rayIndex].m_to;\n"
+"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
+"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
+"	\n"
+"	//\n"
+"	\n"
+"	int stackSize = 1;\n"
+"	stack[0] = *rootNodeIndex;\n"
+"	\n"
+"	while(stackSize)\n"
+"	{\n"
+"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
+"		--stackSize;\n"
+"		\n"
+"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
+"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
+"		\n"
+"		//bvhRigidIndex is not used if internal node\n"
+"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
+"	\n"
+"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
+"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb)  )\n"
+"		{\n"
+"			if(isLeaf)\n"
+"			{\n"
+"				int2 rayRigidPair;\n"
+"				rayRigidPair.x = rayIndex;\n"
+"				rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
+"				\n"
+"				int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
+"				if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
+"			}\n"
+"			\n"
+"			if(!isLeaf)	//Internal node\n"
+"			{\n"
+"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
+"				{\n"
+"					//Error\n"
+"				}\n"
+"				else\n"
+"				{\n"
+"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
+"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n"
+"									__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
+"									int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n"
+"	int smallAabbIndex = get_global_id(0);\n"
+"	if(smallAabbIndex >= numSmallAabbRigids) return;\n"
+"	\n"
+"	b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n"
+"	for(int i = 0; i < numLargeAabbRigids; ++i)\n"
+"	{\n"
+"		b3AabbCL largeAabb = largeAabbs[i];\n"
+"		if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n"
+"		{\n"
+"			int4 pair;\n"
+"			pair.x = largeAabb.m_minIndices[3];\n"
+"			pair.y = smallAabb.m_minIndices[3];\n"
+"			pair.z = NEW_PAIR_MARKER;\n"
+"			pair.w = NEW_PAIR_MARKER;\n"
+"			\n"
+"			int pairIndex = atomic_inc(out_numPairs);\n"
+"			if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
+"		}\n"
+"	}\n"
+"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n"
+"									__global int* out_numRayRigidPairs,  __global int2* out_rayRigidPairs,\n"
+"									int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n"
+"	int rayIndex = get_global_id(0);\n"
+"	if(rayIndex >= numRays) return;\n"
+"	\n"
+"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
+"	b3Vector3 rayTo = rays[rayIndex].m_to;\n"
+"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
+"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
+"	\n"
+"	for(int i = 0; i < numLargeAabbRigids; ++i)\n"
+"	{\n"
+"		b3AabbCL rigidAabb = largeRigidAabbs[i];\n"
+"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n"
+"		{\n"
+"			int2 rayRigidPair;\n"
+"			rayRigidPair.x = rayIndex;\n"
+"			rayRigidPair.y = rigidAabb.m_minIndices[3];\n"
+"			\n"
+"			int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
+"			if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
+"		}\n"
+"	}\n"
+"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n"
+"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n"
+"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n"
+"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n"
+"#define B3_PLBVH_ROOT_NODE_MARKER -1\n"
+"#define b3Int64 long\n"
+"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n"
+"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n"
+"	//This function only needs to return (i & j) in order for the algorithm to work,\n"
+"	//but it may help with debugging to mask out the lower bits.\n"
+"	b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n"
+"	b3Int64 sharedBits = i & j;\n"
+"	b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength);	//Set all bits after the common prefix to 0\n"
+"	\n"
+"	return sharedBits & bitmask;\n"
+"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n"
+"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n"
+"	return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n"
+"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n"
+"											__global b3Int64* out_commonPrefixes,\n"
+"											__global int* out_commonPrefixLengths,\n"
+"											int numInternalNodes)\n"
+"	int internalNodeIndex = get_global_id(0);\n"
+"	if (internalNodeIndex >= numInternalNodes) return;\n"
+"	\n"
+"	//Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n"
+"	//and the number of internal nodes is always numLeafNodes - 1\n"
+"	int leftLeafIndex = internalNodeIndex;\n"
+"	int rightLeafIndex = internalNodeIndex + 1;\n"
+"	\n"
+"	int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n"
+"	int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n"
+"	\n"
+"	//Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n"
+"	//Append the index of each leaf node to each morton code so that there are no duplicates.\n"
+"	//The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n"
+"	//is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n"
+"	//\n"
+"	//upsample(a, b) == ( ((b3Int64)a) << 32) | b\n"
+"	b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n"
+"	b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n"
+"	\n"
+"	out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
+"	out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
+"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n"
+"											__global int2* out_childNodes, int numLeafNodes)\n"
+"	int leafNodeIndex = get_global_id(0);\n"
+"	if (leafNodeIndex >= numLeafNodes) return;\n"
+"	\n"
+"	int numInternalNodes = numLeafNodes - 1;\n"
+"	\n"
+"	int leftSplitIndex = leafNodeIndex - 1;\n"
+"	int rightSplitIndex = leafNodeIndex;\n"
+"	\n"
+"	int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+"	int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+"	\n"
+"	//Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n"
+"	//Leaf nodes are considered as having the highest common prefix\n"
+"	int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n"
+"	\n"
+"	//Handle cases for the edge nodes; the first and last node\n"
+"	//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n"
+"	if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n"
+"	if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n"
+"	\n"
+"	int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n"
+"	out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n"
+"	\n"
+"	int isRightChild = (isLeftHigherCommonPrefix);	//If the left node is the parent, then this node is its right child and vice versa\n"
+"	\n"
+"	//out_childNodesAsInt[0] == int2.x == left child\n"
+"	//out_childNodesAsInt[1] == int2.y == right child\n"
+"	int isLeaf = 1;\n"
+"	__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
+"	out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n"
+"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n"
+"												__global int2* out_childNodes,\n"
+"												__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n"
+"												int numInternalNodes)\n"
+"	int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
+"	if(internalNodeIndex >= numInternalNodes) return;\n"
+"	\n"
+"	b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n"
+"	int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n"
+"	\n"
+"//#define USE_LINEAR_SEARCH\n"
+"	int leftIndex = -1;\n"
+"	int rightIndex = -1;\n"
+"	\n"
+"	//Find nearest element to left with a lower common prefix\n"
+"	for(int i = internalNodeIndex - 1; i >= 0; --i)\n"
+"	{\n"
+"		int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
+"		if(nodeLeftSharedPrefixLength < nodePrefixLength)\n"
+"		{\n"
+"			leftIndex = i;\n"
+"			break;\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	//Find nearest element to right with a lower common prefix\n"
+"	for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n"
+"	{\n"
+"		int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
+"		if(nodeRightSharedPrefixLength < nodePrefixLength)\n"
+"		{\n"
+"			rightIndex = i;\n"
+"			break;\n"
+"		}\n"
+"	}\n"
+"	\n"
+"#else //Use binary search\n"
+"	//Find nearest element to left with a lower common prefix\n"
+"	int leftIndex = -1;\n"
+"	{\n"
+"		int lower = 0;\n"
+"		int upper = internalNodeIndex - 1;\n"
+"		\n"
+"		while(lower <= upper)\n"
+"		{\n"
+"			int mid = (lower + upper) / 2;\n"
+"			b3Int64 midPrefix = commonPrefixes[mid];\n"
+"			int midPrefixLength = commonPrefixLengths[mid];\n"
+"			\n"
+"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
+"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
+"			{\n"
+"				int right = mid + 1;\n"
+"				if(right < internalNodeIndex)\n"
+"				{\n"
+"					b3Int64 rightPrefix = commonPrefixes[right];\n"
+"					int rightPrefixLength = commonPrefixLengths[right];\n"
+"					\n"
+"					int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n"
+"					if(nodeRightSharedPrefixLength < nodePrefixLength) \n"
+"					{\n"
+"						lower = right;\n"
+"						leftIndex = right;\n"
+"					}\n"
+"					else \n"
+"					{\n"
+"						leftIndex = mid;\n"
+"						break;\n"
+"					}\n"
+"				}\n"
+"				else \n"
+"				{\n"
+"					leftIndex = mid;\n"
+"					break;\n"
+"				}\n"
+"			}\n"
+"			else upper = mid - 1;\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	//Find nearest element to right with a lower common prefix\n"
+"	int rightIndex = -1;\n"
+"	{\n"
+"		int lower = internalNodeIndex + 1;\n"
+"		int upper = numInternalNodes - 1;\n"
+"		\n"
+"		while(lower <= upper)\n"
+"		{\n"
+"			int mid = (lower + upper) / 2;\n"
+"			b3Int64 midPrefix = commonPrefixes[mid];\n"
+"			int midPrefixLength = commonPrefixLengths[mid];\n"
+"			\n"
+"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
+"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
+"			{\n"
+"				int left = mid - 1;\n"
+"				if(left > internalNodeIndex)\n"
+"				{\n"
+"					b3Int64 leftPrefix = commonPrefixes[left];\n"
+"					int leftPrefixLength = commonPrefixLengths[left];\n"
+"				\n"
+"					int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n"
+"					if(nodeLeftSharedPrefixLength < nodePrefixLength) \n"
+"					{\n"
+"						upper = left;\n"
+"						rightIndex = left;\n"
+"					}\n"
+"					else \n"
+"					{\n"
+"						rightIndex = mid;\n"
+"						break;\n"
+"					}\n"
+"				}\n"
+"				else \n"
+"				{\n"
+"					rightIndex = mid;\n"
+"					break;\n"
+"				}\n"
+"			}\n"
+"			else lower = mid + 1;\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	//Select parent\n"
+"	{\n"
+"		int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+"		int rightPrefixLength =  (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+"		\n"
+"		int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n"
+"		\n"
+"		if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n"
+"		else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n"
+"		\n"
+"		int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n"
+"		\n"
+"		int isRootNode = (leftIndex == -1 && rightIndex == -1);\n"
+"		out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n"
+"		\n"
+"		int isLeaf = 0;\n"
+"		if(!isRootNode)\n"
+"		{\n"
+"			int isRightChild = (isLeftHigherPrefixLength);	//If the left node is the parent, then this node is its right child and vice versa\n"
+"			\n"
+"			//out_childNodesAsInt[0] == int2.x == left child\n"
+"			//out_childNodesAsInt[1] == int2.y == right child\n"
+"			__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
+"			out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
+"		}\n"
+"		else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
+"	}\n"
+"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n"
+"									__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n"
+"	if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n"
+"	int internalNodeIndex = get_global_id(0);\n"
+"	if(internalNodeIndex >= numInternalNodes) return;\n"
+"	\n"
+"	//\n"
+"	int distanceFromRoot = 0;\n"
+"	{\n"
+"		int parentIndex = internalNodeParentNodes[internalNodeIndex];\n"
+"		while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n"
+"		{\n"
+"			parentIndex = internalNodeParentNodes[parentIndex];\n"
+"			++distanceFromRoot;\n"
+"		}\n"
+"	}\n"
+"	out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n"
+"	\n"
+"	//\n"
+"	__local int localMaxDistanceFromRoot;\n"
+"	if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"	atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"	if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n"
+"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n"
+"												__global int2* childNodes,\n"
+"												__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n"
+"												int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n"
+"	int internalNodeIndex = get_global_id(0);\n"
+"	if(internalNodeIndex >= numInternalNodes) return;\n"
+"	\n"
+"	int distance = distanceFromRoot[internalNodeIndex];\n"
+"	\n"
+"	if(distance == processedDistance)\n"
+"	{\n"
+"		int leftChildIndex = childNodes[internalNodeIndex].x;\n"
+"		int rightChildIndex = childNodes[internalNodeIndex].y;\n"
+"		\n"
+"		int isLeftChildLeaf = isLeafNode(leftChildIndex);\n"
+"		int isRightChildLeaf = isLeafNode(rightChildIndex);\n"
+"		\n"
+"		leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n"
+"		rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n"
+"		\n"
+"		//leftRigidIndex/rightRigidIndex is not used if internal node\n"
+"		int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
+"		int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
+"		\n"
+"		b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n"
+"		b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n"
+"		\n"
+"		b3AabbCL mergedAabb;\n"
+"		mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
+"		mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
+"		internalNodeAabbs[internalNodeIndex] = mergedAabb;\n"
+"	}\n"
+"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n"
+"	int internalNodeIndex = get_global_id(0);\n"
+"	if(internalNodeIndex >= numInternalNodes) return;\n"
+"	\n"
+"	int numLeafNodes = numInternalNodes + 1;\n"
+"	\n"
+"	int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n"
+"	\n"
+"	int2 leafIndexRange;	//x == min leaf index, y == max leaf index\n"
+"	\n"
+"	//Find lowest leaf index covered by this internal node\n"
+"	{\n"
+"		int lowestIndex = childNodes.x;		//childNodes.x == Left child\n"
+"		while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n"
+"		leafIndexRange.x = lowestIndex;\n"
+"	}\n"
+"	\n"
+"	//Find highest leaf index covered by this internal node\n"
+"	{\n"
+"		int highestIndex = childNodes.y;	//childNodes.y == Right child\n"
+"		while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n"
+"		leafIndexRange.y = highestIndex;\n"
+"	}\n"
+"	\n"
+"	//\n"
+"	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n"
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl
new file mode 100644
index 00000000..93f77a64
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl
@@ -0,0 +1,389 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#define NEW_PAIR_MARKER -1
+// Axis-aligned bounding box as used by the kernels in this file.
+// Each corner is a union, so the same storage can be read as a float4, as four floats
+// (indexed by axis number), or as four ints.
+typedef struct 
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];	// the pair kernels below write m_minIndices[3] into their output pairs
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];	// copyAabbsKernel reads m_maxIndices[3] as a source index into allAabbs
+	};
+} btAabbCL;
+/// Conservative overlap test between aabb1 (unqualified pointer) and aabb2 (__local pointer).
+/// Returns false when the boxes are separated along x, z or y, and true otherwise.
+/// Only the x/y/z components are compared; the fourth component is not looked at.
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
+	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
+		return false;
+	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
+		return false;
+	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
+		return false;
+	return true;
+/// Overlap test for two AABBs that both live behind __global pointers.
+/// Returns false when the boxes are separated along x, z or y, and true otherwise.
+bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
+	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
+		return false;
+	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
+		return false;
+	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
+		return false;
+	return true;
+/// Overlap test between aabb1 (unqualified pointer) and aabb2 (__global pointer).
+/// Returns false when the boxes are separated along x, z or y, and true otherwise.
+bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
+	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
+		return false;
+	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
+		return false;
+	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
+		return false;
+	return true;
+// Tests one (i, j) combination per work-item: entry i of unsortedAabbMapping against entry j of
+// unsortedAabbMapping2, both resolved through unsortedAabbs. Overlapping boxes are appended to
+// pairsOut with the smaller m_minIndices[3] value in .x. The axis argument is not used here.
+__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)
+	int i = get_global_id(0);
+	int j = get_global_id(1);
+	if (i>=numUnsortedAabbs || j>=numUnSortedAabbs2)
+		return;
+	__global const btAabbCL* boxA = &unsortedAabbs[unsortedAabbMapping[i]];
+	__global const btAabbCL* boxB = &unsortedAabbs[unsortedAabbMapping2[j]];
+	if (!TestAabbAgainstAabb2GlobalGlobal(boxA,boxB))
+		return;
+	int indexA = boxA->m_minIndices[3];
+	int indexB = boxB->m_minIndices[3];
+	int4 myPair;
+	//store the pair ordered, smaller index first
+	myPair.x = (indexA>indexB) ? indexB : indexA;
+	myPair.y = (indexA>indexB) ? indexA : indexB;
+	myPair.z = NEW_PAIR_MARKER;
+	myPair.w = NEW_PAIR_MARKER;
+	//the counter keeps growing past maxPairs; only the write is suppressed
+	int curPair = atomic_inc (pairCount);
+	if (curPair<maxPairs)
+		pairsOut[curPair] = myPair; //flush to main memory
+// Work-item i tests aabbs[i] against every aabbs[j] with j > i and appends each overlapping
+// pair to pairsOut. The axis argument is not used by this variant.
+__kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+	int i = get_global_id(0);
+	if (i<numObjects)
+	{
+		for (int j=i+1;j<numObjects;j++)
+		{
+			if (!TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
+				continue;
+			int4 myPair = (int4)(aabbs[i].m_minIndices[3], aabbs[j].m_minIndices[3], NEW_PAIR_MARKER, NEW_PAIR_MARKER);
+			//the counter keeps growing past maxPairs; only the write is suppressed
+			int curPair = atomic_inc (pairCount);
+			if (curPair<maxPairs)
+				pairsOut[curPair] = myPair; //flush to main memory
+		}
+	}
+// Work-item i walks j = i+1.. and stops at the first aabbs[j] whose minimum on the given axis
+// lies beyond aabbs[i]'s maximum on that axis; overlapping boxes met before that are appended
+// to pairsOut.
+// NOTE(review): the early break presumably relies on aabbs being sorted by minimum along axis - confirm against the caller.
+__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+	int i = get_global_id(0);
+	if (i<numObjects)
+	{
+		for (int j=i+1;j<numObjects;j++)
+		{
+			if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
+				break;
+			if (!TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
+				continue;
+			int4 myPair = (int4)(aabbs[i].m_minIndices[3], aabbs[j].m_minIndices[3], NEW_PAIR_MARKER, NEW_PAIR_MARKER);
+			//the counter keeps growing past maxPairs; only the write is suppressed
+			int curPair = atomic_inc (pairCount);
+			if (curPair<maxPairs)
+				pairsOut[curPair] = myPair; //flush to main memory
+		}
+	}
+// Same sweep as computePairsKernelOriginal, but the whole work-group advances j in lock step:
+// a work-item that has finished its sweep raises breakRequest once and keeps hitting the
+// barriers until every registered work-item has done the same. No work-item returns early,
+// because every work-item of the group has to reach each barrier.
+__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	//the counters must be zeroed before any work-item increments them, otherwise an
+	//increment can be lost or overwritten by the initialisation above
+	barrier(CLK_LOCAL_MEM_FENCE);
+	atomic_inc(numActiveWgItems);
+	//make the final work-item count visible before it is compared in the loop condition
+	barrier(CLK_LOCAL_MEM_FENCE);
+	int localBreak = 0;
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+		if (j<numObjects)
+		{
+	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) 
+			{
+				//aabbs[j] starts beyond this box on the sweep axis: this work-item is done
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		//running off the end of the array also finishes this work-item
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
+			{
+				int4 myPair;
+				myPair.x = aabbs[i].m_minIndices[3];
+				myPair.y = aabbs[j].m_minIndices[3];
+				myPair.z = NEW_PAIR_MARKER;
+				myPair.w = NEW_PAIR_MARKER;
+				//the counter keeps growing past maxPairs; only the write is suppressed
+				int curPair = atomic_inc (pairCount);
+				if (curPair<maxPairs)
+				{
+						pairsOut[curPair] = myPair; //flush to main memory
+				}
+			}
+		}
+		j++;
+	} while (breakRequest[0]<numActiveWgItems[0]);
+// Variant of computePairsKernelBarrier that stages candidate boxes in __local memory: each
+// work-item loads aabbs[i+block] and aabbs[i+block+64] into localAabbs and compares its own box
+// against localAabbs[localCount+localId+1]; after 64 steps the window is reloaded 64 entries on.
+// Out-of-range loads fall back to aabbs[0].
+// NOTE(review): the 128-entry buffer and the fixed +64 offset presumably require a work-group size of 64 - confirm against the host launch code.
+__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+	__local btAabbCL localAabbs[128];// = aabbs[i];
+	btAabbCL myAabb;
+	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
+	float testValue = 	myAabb.m_maxElems[axis];
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	int localCount=0;
+	int block=0;
+	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
+	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
+	//the counters must be zeroed before any work-item increments them, otherwise an
+	//increment can be lost or overwritten by the initialisation above
+	barrier(CLK_LOCAL_MEM_FENCE);
+	atomic_inc(numActiveWgItems);
+	//make the final work-item count visible before it is compared in the loop condition
+	barrier(CLK_LOCAL_MEM_FENCE);
+	int localBreak = 0;
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+		if (j<numObjects)
+		{
+	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) 
+			{
+				//the staged box starts beyond this box on the sweep axis: this work-item is done
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		//running off the end of the array also finishes this work-item
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
+			{
+				int4 myPair;
+				myPair.x = myAabb.m_minIndices[3];
+				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
+				myPair.z = NEW_PAIR_MARKER;
+				myPair.w = NEW_PAIR_MARKER;
+				//the counter keeps growing past maxPairs; only the write is suppressed
+				int curPair = atomic_inc (pairCount);
+				if (curPair<maxPairs)
+				{
+						pairsOut[curPair] = myPair; //flush to main memory
+				}
+			}
+		}
+		//all reads of the current window must finish before it may be overwritten below
+		barrier(CLK_LOCAL_MEM_FENCE);
+		localCount++;
+		if (localCount==64)
+		{
+			localCount = 0;
+			block+=64;			
+			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
+			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
+		}
+		j++;
+	} while (breakRequest[0]<numActiveWgItems[0]);
+// Reinterprets the bits of fl as an unsigned int and flips them: every bit when the sign bit is
+// set, only the sign bit otherwise. flipFloatKernel stores the result as a sort key.
+unsigned int FloatFlip(float fl);
+unsigned int FloatFlip(float fl)
+	unsigned int bits = *(unsigned int*)&fl;
+	unsigned int flipMask = (bits & 0x80000000) ? 0xFFFFFFFF : 0x80000000;
+	return bits ^ flipMask;
+// Inverse of FloatFlip: a key with the top bit set only has that bit flipped back, any other
+// key has all of its bits flipped; the result is reinterpreted as a float.
+float IFloatFlip(unsigned int f);
+float IFloatFlip(unsigned int f)
+	unsigned int undoMask = (f & 0x80000000) ? 0x80000000 : 0xFFFFFFFF;
+	unsigned int bits = f ^ undoMask;
+	return *(float*)&bits;
+// For each destination slot: read the source index kept in m_maxIndices[3], overwrite the slot
+// with allAabbs[source], then put the source index back into m_maxIndices[3].
+__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
+	int dstIndex = get_global_id(0);
+	if (dstIndex<numObjects)
+	{
+		__global btAabbCL* dst = &destAabbs[dstIndex];
+		int srcIndex = dst->m_maxIndices[3];
+		*dst = allAabbs[srcIndex];
+		dst->m_maxIndices[3] = srcIndex;
+	}
+// Builds one sort record per object: .x is the FloatFlip'ed minimum of the mapped box on the
+// given axis, .y is the object's position i in smallAabbMapping.
+__kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)
+	int i = get_global_id(0);
+	if (i<numObjects)
+	{
+		float minOnAxis = allAabbs[smallAabbMapping[i]].m_minElems[axis];
+		sortData[i].x = FloatFlip(minOnAxis);
+		sortData[i].y = i;
+	}
+// Writes sortedAabbs[i] from the box that sortData[i].y points at, resolved through
+// smallAabbMapping into allAabbs.
+__kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
+	int i = get_global_id(0);
+	if (i<numObjects)
+	{
+		int mappingSlot = sortData[i].y;
+		sortedAabbs[i] = allAabbs[smallAabbMapping[mappingSlot]];
+	}
+// Per mapped box: stores the midpoint (m_max+m_min)*0.5f in sum[i] and its component-wise
+// square in sum2[i].
+__kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)
+	int i = get_global_id(0);
+	if (i<numAabbs)
+	{
+		btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];
+		float4 center = (smallAabb.m_max+smallAabb.m_min)*0.5f;
+		sum[i]=center;
+		sum2[i]=center*center;
+	}
diff --git a/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
new file mode 100644
index 00000000..04d40fcf
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
@@ -0,0 +1,342 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* sapCL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Erwin Coumans\n"
+"#define NEW_PAIR_MARKER -1\n"
+"typedef struct \n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"/// conservative test for overlap between two aabbs\n"
+"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numUnsortedAabbs)\n"
+"		return;\n"
+"	int j = get_global_id(1);\n"
+"	if (j>=numUnSortedAabbs2)\n"
+"		return;\n"
+"	__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
+"	__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
+"	if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
+"	{\n"
+"		int4 myPair;\n"
+"		\n"
+"		int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
+"		int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
+"		if (xIndex>yIndex)\n"
+"		{\n"
+"			int tmp = xIndex;\n"
+"			xIndex=yIndex;\n"
+"			yIndex=tmp;\n"
+"		}\n"
+"		\n"
+"		myPair.x = xIndex;\n"
+"		myPair.y = yIndex;\n"
+"		myPair.z = NEW_PAIR_MARKER;\n"
+"		myPair.w = NEW_PAIR_MARKER;\n"
+"		int curPair = atomic_inc (pairCount);\n"
+"		if (curPair<maxPairs)\n"
+"		{\n"
+"				pairsOut[curPair] = myPair; //flush to main memory\n"
+"		}\n"
+"	}\n"
+"__kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	for (int j=i+1;j<numObjects;j++)\n"
+"	{\n"
+"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+"		{\n"
+"			int4 myPair;\n"
+"			myPair.x = aabbs[i].m_minIndices[3];\n"
+"			myPair.y = aabbs[j].m_minIndices[3];\n"
+"			myPair.z = NEW_PAIR_MARKER;\n"
+"			myPair.w = NEW_PAIR_MARKER;\n"
+"			int curPair = atomic_inc (pairCount);\n"
+"			if (curPair<maxPairs)\n"
+"			{\n"
+"					pairsOut[curPair] = myPair; //flush to main memory\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	for (int j=i+1;j<numObjects;j++)\n"
+"	{\n"
+"  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+"		{\n"
+"			break;\n"
+"		}\n"
+"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+"		{\n"
+"			int4 myPair;\n"
+"			myPair.x = aabbs[i].m_minIndices[3];\n"
+"			myPair.y = aabbs[j].m_minIndices[3];\n"
+"			myPair.z = NEW_PAIR_MARKER;\n"
+"			myPair.w = NEW_PAIR_MARKER;\n"
+"			int curPair = atomic_inc (pairCount);\n"
+"			if (curPair<maxPairs)\n"
+"			{\n"
+"					pairsOut[curPair] = myPair; //flush to main memory\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+"			{\n"
+"				int4 myPair;\n"
+"				myPair.x = aabbs[i].m_minIndices[3];\n"
+"				myPair.y = aabbs[j].m_minIndices[3];\n"
+"				myPair.z = NEW_PAIR_MARKER;\n"
+"				myPair.w = NEW_PAIR_MARKER;\n"
+"				int curPair = atomic_inc (pairCount);\n"
+"				if (curPair<maxPairs)\n"
+"				{\n"
+"						pairsOut[curPair] = myPair; //flush to main memory\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		j++;\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
+"	\n"
+"	btAabbCL myAabb;\n"
+"	\n"
+"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
+"	float testValue = 	myAabb.m_maxElems[axis];\n"
+"	\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	int localCount=0;\n"
+"	int block=0;\n"
+"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
+"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
+"	\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"	\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
+"			{\n"
+"				int4 myPair;\n"
+"				myPair.x = myAabb.m_minIndices[3];\n"
+"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
+"				myPair.z = NEW_PAIR_MARKER;\n"
+"				myPair.w = NEW_PAIR_MARKER;\n"
+"				int curPair = atomic_inc (pairCount);\n"
+"				if (curPair<maxPairs)\n"
+"				{\n"
+"						pairsOut[curPair] = myPair; //flush to main memory\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		localCount++;\n"
+"		if (localCount==64)\n"
+"		{\n"
+"			localCount = 0;\n"
+"			block+=64;			\n"
+"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
+"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
+"		}\n"
+"		j++;\n"
+"		\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"	\n"
+"unsigned int FloatFlip(float fl);\n"
+"unsigned int FloatFlip(float fl)\n"
+"	unsigned int f = *(unsigned int*)&fl;\n"
+"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
+"	return f ^ mask;\n"
+"float IFloatFlip(unsigned int f);\n"
+"float IFloatFlip(unsigned int f)\n"
+"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
+"	unsigned int fl = f ^ mask;\n"
+"	return *(float*)&fl;\n"
+"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	int src = destAabbs[i].m_maxIndices[3];\n"
+"	destAabbs[i] = allAabbs[src];\n"
+"	destAabbs[i].m_maxIndices[3] = src;\n"
+"__kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	\n"
+"	\n"
+"	sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
+"	sortData[i].y = i;\n"
+"		\n"
+"__kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	\n"
+"	sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
+"__kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numAabbs)\n"
+"		return;\n"
+"	\n"
+"	btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
+"	\n"
+"	float4 s;\n"
+"	s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
+"	sum[i]=s;\n"
+"	sum2[i]=s*s;	\n"
diff --git a/src/bullet/Bullet3OpenCL/Initialize/b3OpenCLInclude.h b/src/bullet/Bullet3OpenCL/Initialize/b3OpenCLInclude.h
new file mode 100644
index 00000000..e79182d7
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/Initialize/b3OpenCLInclude.h
@@ -0,0 +1,48 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifdef B3_USE_CLEW
+	#include "clew/clew.h"
+#ifdef __APPLE__
+#ifdef USE_MINICL
+#include <MiniCL/cl.h>
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
+#ifdef USE_MINICL
+#include <MiniCL/cl.h>
+#include <CL/cl.h>
+#ifdef _WIN32
+#include "CL/cl_gl.h"
+#endif //_WIN32
+#endif //__APPLE__
+#endif //B3_USE_CLEW
+#include <assert.h>
+#include <stdio.h>
+// Prints the value of (a) and asserts when (a) != (b). Note that (a) is evaluated more than once.
+// NOTE(review): this expands to a bare `if (...) { ... }`, so an `else` that follows a use of the
+// macro binds to the macro's `if`; keep uses inside braces.
+#define oclCHECKERROR(a, b) if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); }
diff --git a/src/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp b/src/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp
new file mode 100644
index 00000000..369f1d75
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp
@@ -0,0 +1,1009 @@
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Original author: Roman Ponomarev
+//Mostly Reimplemented by Erwin Coumans
+bool gDebugForceLoadingFromSource = false;
+bool gDebugSkipLoadingBinary = false;
+#include "Bullet3Common/b3Logging.h"
+#include <string.h>
+#ifdef _WIN32
+#pragma warning (disable:4996)
+#include "b3OpenCLUtils.h"
+//#include "b3OpenCLInclude.h"
+#include <stdio.h>
+#include <stdlib.h>
+#define B3_MAX_CL_DEVICES 16 //who needs 16 devices?
+#ifdef _WIN32
+#include <windows.h>
+#include <assert.h>
+#define b3Assert assert
+#ifndef _WIN32
+#include <sys/stat.h>
+static const char* sCachedBinaryPath="cache";
+//Set the preferred platform vendor using the OpenCL SDK
+//The string is chosen at compile time from the CL_PLATFORM_* / B3_USE_CLEW defines below and is
+//what b3OpenCLUtils_getSdkVendorName() returns.
+static const char* spPlatformVendor =
+#if defined(CL_PLATFORM_MINI_CL)
+"MiniCL, SCEA";
+#elif defined(CL_PLATFORM_AMD)
+"Advanced Micro Devices, Inc.";
+#elif defined(CL_PLATFORM_NVIDIA)
+"NVIDIA Corporation";
+#elif defined(CL_PLATFORM_INTEL)
+"Intel(R) Corporation";
+#elif defined(B3_USE_CLEW)
+"clew (OpenCL Extension Wrangler library)";
+"Unknown Vendor";
+#ifdef _WIN32
+#ifndef B3_USE_CLEW
+#include "CL/cl_gl.h"
+#endif //B3_USE_CLEW
+#endif //_WIN32
+//Notification callback passed to clCreateContext in the __APPLE__ code path of this file.
+//A message containing "Warning" is only logged; any other message is logged as an error and asserts.
+void MyFatalBreakAPPLE(   const char *  errstr ,
+                       const void *  private_info ,
+                       size_t        cb ,
+                       void *        user_data  )
+    const bool isWarning = (strstr(errstr, "Warning") != 0);
+    if (!isWarning)
+    {
+		b3Error("Error: %s\n", errstr);
+        b3Assert(0);
+    } else
+    {
+		b3Warning("Warning: %s\n", errstr);
+    }
+#ifdef B3_USE_CLEW
+//Loads the OpenCL runtime through clew and returns the clewInit result code.
+//The library name depends on the platform: OpenCL.dll on Windows, the OpenCL framework binary
+//on Apple, and libOpenCL.so.1 with a fallback to libOpenCL.so elsewhere.
+int b3OpenCLUtils_clewInit()
+	int result = -1;
+#ifdef _WIN32
+        const char* cl = "OpenCL.dll";
+#elif defined __APPLE__
+        const char* cl = "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
+#else//presumably Linux?
+        //linux (tested on Ubuntu 12.10 with Catalyst 13.4 beta drivers; note that there is no symbolic link from libOpenCL.so)
+        const char* cl = "libOpenCL.so.1";
+        //probe the versioned name first
+        result = clewInit(cl);
+        if (result != CLEW_SUCCESS)
+        {
+                cl = "libOpenCL.so";
+        } else
+        {
+                //the probe succeeded: undo it, clewInit is called again with the same name below
+                clewExit();
+        }
+        result = clewInit(cl);
+        if (result!=CLEW_SUCCESS)
+		{
+                b3Error("clewInit failed with error code %d\n",result);
+		}
+        else
+        {
+                b3Printf("clewInit succesfull using %s\n",cl);
+        }
+	return result;
+//Returns the platform count reported by clGetPlatformIDs (queried with room for 10 ids).
+//*pErrNum is written only when the query fails and pErrNum is non-NULL; it is left untouched on success.
+int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum)
+#ifdef B3_USE_CLEW
+	b3OpenCLUtils_clewInit();
+	cl_platform_id pPlatforms[10] = { 0 };
+    cl_uint numPlatforms = 0;
+    cl_int ciErrNum = clGetPlatformIDs(10, pPlatforms, &numPlatforms);
+	//cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if(ciErrNum != CL_SUCCESS)
+	{
+		if(pErrNum != NULL)
+			*pErrNum = ciErrNum;
+	}
+	return numPlatforms;
+//Returns the compile-time selected vendor string held in spPlatformVendor.
+const char* b3OpenCLUtils_getSdkVendorName()
+	return spPlatformVendor;
+//Replaces the cached-binary path. Only the pointer is stored (the string is not copied), so the
+//caller has to keep path alive for as long as it may be used.
+void b3OpenCLUtils_setCachePath(const char* path)
+	sCachedBinaryPath = path;
+//Returns the OpenCL platform at index platformIndex0, or 0 when the index is out of range or a
+//query fails. On a failed clGetPlatformIDs call the error code is written to *pErrNum (when
+//pErrNum is non-NULL); *pErrNum is left untouched otherwise.
+cl_platform_id b3OpenCLUtils_getPlatform(int platformIndex0, cl_int* pErrNum)
+#ifdef B3_USE_CLEW
+        b3OpenCLUtils_clewInit();
+	cl_platform_id platform = 0;
+	cl_uint numPlatforms = 0;
+	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if(ciErrNum != CL_SUCCESS)
+	{
+		//without a valid count there is nothing to index into
+		if(pErrNum != NULL)
+			*pErrNum = ciErrNum;
+		return platform;
+	}
+	//check the sign on the signed value: after a cast to unsigned the >=0 test is always true
+	if (platformIndex0>=0 && (cl_uint)platformIndex0<numPlatforms)
+	{
+		cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms);
+		if (platforms == NULL)
+			return platform;
+		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		if(ciErrNum == CL_SUCCESS)
+		{
+			platform = platforms[platformIndex0];
+		}
+		else if(pErrNum != NULL)
+		{
+			*pErrNum = ciErrNum;
+		}
+		//released on both the success and the failure path
+		free (platforms);
+	}
+	return platform;
+//Fills platformInfo with the vendor, name and version strings of platform, each limited to
+//B3_MAX_STRING_LENGTH bytes. Asserts that platform is non-null.
+//NOTE(review): the clGetPlatformInfo return codes are stored in ciErrNum but never checked.
+void b3OpenCLUtils::getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo)
+	b3Assert(platform);
+	cl_int ciErrNum;
+	ciErrNum = clGetPlatformInfo(	platform,CL_PLATFORM_VENDOR,B3_MAX_STRING_LENGTH,platformInfo->m_platformVendor,NULL);
+	ciErrNum = clGetPlatformInfo(	platform,CL_PLATFORM_NAME,B3_MAX_STRING_LENGTH,platformInfo->m_platformName,NULL);
+	ciErrNum = clGetPlatformInfo(	platform,CL_PLATFORM_VERSION,B3_MAX_STRING_LENGTH,platformInfo->m_platformVersion,NULL);
+//Queries platform through b3OpenCLUtils::getPlatformInfo and prints its vendor, name and
+//version strings with b3Printf.
+void b3OpenCLUtils_printPlatformInfo( cl_platform_id platform)
+	b3OpenCLPlatformInfo platformInfo;
+	b3OpenCLUtils::getPlatformInfo (platform, &platformInfo);
+	b3Printf("Platform info:\n");
+	b3Printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
+	b3Printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
+	b3Printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
+//Creates an OpenCL context on platform for devices of type deviceType.
+// - pGLContext set: tries the devices one by one and keeps the first context that can be created.
+// - otherwise, preferredDeviceIndex in range: creates a context on that single device.
+// - otherwise: creates one context spanning all enumerated devices.
+//Returns 0 when device enumeration fails or finds no device. The last OpenCL status is written to
+//*pErrNum (when non-NULL) only if a context creation was attempted.
+//preferredPlatformIndex is accepted but not used by this function.
+cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex)
+	cl_context retContext = 0;
+	cl_int ciErrNum=0;
+	cl_uint num_entries;
+	cl_device_id devices[B3_MAX_CL_DEVICES];
+	cl_uint num_devices;
+	cl_context_properties* cprops;
+	/*
+	* If we could find our platform, use it. Otherwise pass a NULL and get whatever the
+	* implementation thinks we should be using.
+	*/
+	cl_context_properties cps[7] = {0,0,0,0,0,0,0};
+	//the list is made of (name, value) entries and ends at the first 0 name: without this name
+	//the platform value in cps[1] and the GL entries after it would be ignored
+	cps[0] = CL_CONTEXT_PLATFORM;
+	cps[1] = (cl_context_properties)platform;
+#ifdef _WIN32
+#ifndef B3_USE_CLEW
+	if (pGLContext && pGLDC)
+	{
+		cps[2] = CL_GL_CONTEXT_KHR;
+		cps[3] = (cl_context_properties)pGLContext;
+		cps[4] = CL_WGL_HDC_KHR;
+		cps[5] = (cl_context_properties)pGLDC;
+	}
+#endif //B3_USE_CLEW
+#endif //_WIN32
+	num_entries = B3_MAX_CL_DEVICES;
+	num_devices=0;
+	ciErrNum = clGetDeviceIDs(
+		platform,
+		deviceType,
+		num_entries,
+		devices,
+		&num_devices);
+    if (ciErrNum<0)
+    {
+        b3Printf("clGetDeviceIDs returned %d\n",ciErrNum);
+        return 0;
+    }
+	//num_devices reports every matching device, which may be more than the array holds
+	if (num_devices>num_entries)
+		num_devices = num_entries;
+	cprops = (NULL == platform) ? NULL : cps;
+	if (!num_devices)
+		return 0;
+	if (pGLContext)
+	{
+		//search for the GPU that relates to the OpenCL context
+		unsigned int i;
+		for (i=0;i<num_devices;i++)
+		{
+			retContext = clCreateContext(cprops,1,&devices[i],NULL,NULL,&ciErrNum);
+			if (ciErrNum==CL_SUCCESS)
+				break;
+		}
+	}
+	else
+	{
+		if (preferredDeviceIndex>=0 && (unsigned int)preferredDeviceIndex<num_devices)
+		{
+			//create a context of the preferred device index
+			retContext = clCreateContext(cprops,1,&devices[preferredDeviceIndex],NULL,NULL,&ciErrNum);
+		} else
+		{
+			//create a context of all devices
+#if defined (__APPLE__)
+			retContext = clCreateContext(cprops,num_devices,devices,MyFatalBreakAPPLE,NULL,&ciErrNum);
+        b3Printf("numDevices=%d\n",num_devices);
+			retContext = clCreateContext(cprops,num_devices,devices,NULL,NULL,&ciErrNum);
+		}
+	}
+	if(pErrNum != NULL)
+	{
+		*pErrNum = ciErrNum;
+	}
+	return retContext;
+//! Creates an OpenCL context on the first platform that yields one for the requested device type.
+//! The platform list is reordered first: the platform at preferredPlatformIndex (when valid) or, failing that,
+//! a platform whose queried string equals spPlatformVendor is moved to the front and therefore tried first.
+//! @param deviceType              device type forwarded to b3OpenCLUtils_createContextFromPlatform
+//! @param pErrNum                 optional; receives the OpenCL error code of a failing platform query or of the context creation
+//! @param pGLContext              optional GL context, forwarded unchanged
+//! @param pGLDC                   optional GL device context, forwarded unchanged
+//! @param preferredDeviceIndex    forwarded unchanged
+//! @param preferredPlatformIndex  index of the platform to try first; a negative value means no preference
+//! @param retPlatformId           optional; receives the platform the returned context was created on
+//! @return the context, or 0 when no platform produced one
+cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC , int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* retPlatformId)
+#ifdef B3_USE_CLEW
+        b3OpenCLUtils_clewInit();
+	cl_uint numPlatforms;
+	cl_context retContext = 0;
+	unsigned int i;
+	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if(ciErrNum != CL_SUCCESS)
+	{
+		if(pErrNum != NULL) *pErrNum = ciErrNum;
+		return NULL;
+	}
+	if(numPlatforms > 0)
+	{
+		cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms);
+		// an allocation failure is reported instead of handing a null buffer to clGetPlatformIDs
+		if (platforms == NULL)
+		{
+			if(pErrNum != NULL)
+				*pErrNum = CL_OUT_OF_HOST_MEMORY;
+			return NULL;
+		}
+		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		if(ciErrNum != CL_SUCCESS)
+		{
+			if(pErrNum != NULL)
+				*pErrNum = ciErrNum;
+			free(platforms);
+			return NULL;
+		}
+		for ( i = 0; i < numPlatforms; ++i)
+		{
+			char pbuf[128];
+			ciErrNum = clGetPlatformInfo(	platforms[i],
+				sizeof(pbuf),
+				pbuf,
+				NULL);
+			if(ciErrNum != CL_SUCCESS)
+			{
+				if(pErrNum != NULL) *pErrNum = ciErrNum;
+				// the platform list must be released on this error path as well
+				free(platforms);
+				return NULL;
+			}
+			if (preferredPlatformIndex>=0 && i==preferredPlatformIndex)
+			{
+				// an explicit preference wins: move it to the front and stop reordering
+				cl_platform_id tmpPlatform = platforms[0];
+				platforms[0] = platforms[i];
+				platforms[i] = tmpPlatform;
+				break;
+			} else
+			{
+				if(!strcmp(pbuf, spPlatformVendor))
+				{
+					cl_platform_id tmpPlatform = platforms[0];
+					platforms[0] = platforms[i];
+					platforms[i] = tmpPlatform;
+				}
+			}
+		}
+		// try the platforms in their (reordered) sequence until one yields a context
+		for (i = 0; i < numPlatforms; ++i)
+		{
+			cl_platform_id platform = platforms[i];
+			assert(platform);
+			retContext = b3OpenCLUtils_createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex,preferredPlatformIndex);
+			if (retContext)
+			{
+//				printf("OpenCL platform details:\n");
+				b3OpenCLPlatformInfo platformInfo;
+				b3OpenCLUtils::getPlatformInfo(platform, &platformInfo);
+				if (retPlatformId)
+					*retPlatformId = platform;
+				break;
+			}
+		}
+		free (platforms);
+	}
+	return retContext;
+//! Gets the id of the nth device from the context
+//! @return the id or -1 when out of range (also when the device list cannot be queried or allocated)
+//! @param cxMainContext         OpenCL context
+//! @param deviceIndex           zero-based index of the device of interest
+cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int deviceIndex)
+	assert(cxMainContext);
+	// starts at 0 so that a failing size query is treated as "no devices"
+	size_t szParmDataBytes = 0;
+	cl_device_id* cdDevices;
+	cl_device_id device ;
+	// get the list of devices associated with context
+	clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
+	// valid indices are 0 .. count-1, so an index equal to the device count must be rejected too
+	if( deviceIndex < 0 || szParmDataBytes / sizeof(cl_device_id) <= (size_t)deviceIndex ) {
+		return (cl_device_id)-1;
+	}
+	cdDevices = (cl_device_id*) malloc(szParmDataBytes);
+	if( cdDevices == NULL ) {
+		return (cl_device_id)-1;
+	}
+	if( clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) != CL_SUCCESS ) {
+		// the buffer content is not meaningful when the query fails
+		free(cdDevices);
+		return (cl_device_id)-1;
+	}
+	device = cdDevices[deviceIndex];
+	free(cdDevices);
+	return device;
+//! Returns the number of devices attached to cxMainContext, or 0 when the device list cannot be queried.
+int b3OpenCLUtils_getNumDevices(cl_context cxMainContext)
+	// starts at 0 so that a failing query yields a device count of 0 instead of an indeterminate value
+	size_t szParamDataBytes = 0;
+	int device_count;
+	clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes);
+	// the query reports a size in bytes; convert the whole quotient, not just the byte count
+	device_count = (int) (szParamDataBytes/ sizeof(cl_device_id));
+	return device_count;
+//! Fills *info with the properties of `device`, one clGetDeviceInfo query per field.
+//! The structure is cleared first, so a field whose query fails reads as 0 / empty string.
+//! @param device   device to query
+//! @param info     receives the queried values; must not be null
+void b3OpenCLUtils::getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info)
+	// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS is reported as a cl_uint while m_workitemDims is a size_t:
+	// query it into a correctly sized local so no bytes of the wider field are left unwritten
+	cl_uint workItemDims = 0;
+	memset(info, 0, sizeof(*info));
+	clGetDeviceInfo(device, CL_DEVICE_NAME, B3_MAX_STRING_LENGTH, &info->m_deviceName, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_VENDOR, B3_MAX_STRING_LENGTH, &info->m_deviceVendor, NULL);
+	clGetDeviceInfo(device, CL_DRIVER_VERSION, B3_MAX_STRING_LENGTH, &info->m_driverVersion, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info->m_deviceType, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info->m_computeUnits), &info->m_computeUnits, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workItemDims), &workItemDims, NULL);
+	info->m_workitemDims = workItemDims;
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info->m_workItemSize), &info->m_workItemSize, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info->m_workgroupSize), &info->m_workgroupSize, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info->m_clockFrequency), &info->m_clockFrequency, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info->m_addressBits), &info->m_addressBits, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info->m_maxMemAllocSize), &info->m_maxMemAllocSize, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info->m_globalMemSize), &info->m_globalMemSize, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info->m_errorCorrectionSupport), &info->m_errorCorrectionSupport, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info->m_localMemType), &info->m_localMemType, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info->m_localMemSize), &info->m_localMemSize, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info->m_constantBufferSize), &info->m_constantBufferSize, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info->m_queueProperties), &info->m_queueProperties, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info->m_imageSupport), &info->m_imageSupport, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info->m_maxReadImageArgs), &info->m_maxReadImageArgs, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info->m_maxWriteImageArgs), &info->m_maxWriteImageArgs, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info->m_image2dMaxWidth, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info->m_image2dMaxHeight, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info->m_image3dMaxWidth, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info->m_image3dMaxHeight, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info->m_image3dMaxDepth, NULL);
+	// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
+	clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, B3_MAX_STRING_LENGTH, &info->m_deviceExtensions, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info->m_vecWidthChar, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info->m_vecWidthShort, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info->m_vecWidthInt, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info->m_vecWidthLong, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info->m_vecWidthFloat, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info->m_vecWidthDouble, NULL);
+void b3OpenCLUtils_printDeviceInfo(cl_device_id device) // Queries the device via b3OpenCLUtils::getDeviceInfo and logs every field with b3Printf.
+	b3OpenCLDeviceInfo info;
+	b3OpenCLUtils::getDeviceInfo(device,&info);
+	b3Printf("Device Info:\n");
+	b3Printf("  CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName);
+	b3Printf("  CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor);
+	b3Printf("  CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion);
+	if( info.m_deviceType & CL_DEVICE_TYPE_CPU ) // m_deviceType is tested bit by bit, so more than one type line may be printed
+		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
+	if( info.m_deviceType & CL_DEVICE_TYPE_GPU )
+		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
+	if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR )
+		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
+	if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT )
+		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
+	b3Printf("  CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits);
+	b3Printf("  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", info.m_workitemDims); // NOTE(review): m_workitemDims is declared size_t but printed with %u — confirm this is intended
+	b3Printf("  CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", info.m_workItemSize[0], info.m_workItemSize[1], info.m_workItemSize[2]);
+	b3Printf("  CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", info.m_workgroupSize);
+	b3Printf("  CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency);
+	b3Printf("  CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits);
+	b3Printf("  CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024))); // byte counts are scaled to MByte / KByte before being narrowed for printing
+	b3Printf("  CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024)));
+	b3Printf("  CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no");
+	b3Printf("  CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global"); // the literal 1 presumably corresponds to CL_LOCAL — confirm against the OpenCL headers
+	b3Printf("  CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024));
+	b3Printf("  CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024));
+	if( info.m_queueProperties  & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE ) // NOTE(review): the statement guarded by this test is not visible in this excerpt
+	if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE ) // NOTE(review): the statement guarded by this test is not visible in this excerpt
+	b3Printf("  CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport);
+	b3Printf("  CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", info.m_maxReadImageArgs);
+	b3Printf("  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", info.m_maxWriteImageArgs);
+	b3Printf("\n  CL_DEVICE_IMAGE <dim>");
+	b3Printf("\t\t\t2D_MAX_WIDTH\t %u\n", info.m_image2dMaxWidth);
+	b3Printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", info.m_image2dMaxHeight);
+	b3Printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", info.m_image3dMaxWidth);
+	b3Printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", info.m_image3dMaxHeight);
+	b3Printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", info.m_image3dMaxDepth);
+	if (info.m_deviceExtensions != 0) // NOTE(review): m_deviceExtensions is a char array member, so this compares its address with 0 and is always true; the "None" branch below cannot be reached
+	{
+		b3Printf("\n  CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions);
+	}
+	else
+	{
+		b3Printf("  CL_DEVICE_EXTENSIONS: None\n");
+	}
+	b3Printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n", // preferred vector widths, in the order named in the format string
+		info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong,info.m_vecWidthFloat, info.m_vecWidthDouble);
+static const char* strip2(const char* name, const char* pattern) // Returns a pointer into `name` just past the last occurrence of `pattern`, or `name` itself when the pattern does not occur.
+	  size_t const patlen = strlen(pattern);
+  	size_t patcnt = 0; // only incremented below; never read in the visible body
+	  const char * oriptr;
+	  const char * patloc;
+		// find how many times the pattern occurs in the original string
+	  for (oriptr = name; patloc = strstr(oriptr, pattern); oriptr = patloc + patlen) // the assignment in the condition is deliberate: loop while strstr finds another match and resume right behind it. NOTE(review): an empty pattern matches at oriptr with patlen 0, so the loop would not terminate
+	  {
+		patcnt++;
+	  }
+	  return oriptr; // position after the last match found (unchanged `name` if there was none)
+//! Builds an OpenCL program for `device`, preferring a previously cached binary when one is usable.
+//! The cache file name "<sCachedBinaryPath>/<file name>.<device name>.<driver version>.bin" is derived from
+//! clFileNameForCaching. When the cache yields no program, the program is compiled from source and, if caching
+//! is enabled and the program has exactly one associated device, its binary is written to that cache file.
+//! @param clContext             context the program is created in
+//! @param device                device the program is built for
+//! @param kernelSourceOrg       OpenCL C source; when null (or gDebugForceLoadingFromSource is set) the source is read from clFileNameForCaching
+//! @param pErrNum               optional; receives the OpenCL error code when creating or building from source fails (not written on success)
+//! @param additionalMacrosArg   optional extra build options; null is treated as ""
+//! @param clFileNameForCaching  optional path of the .cl file, used for the cache file name and as source fallback
+//! @param disableBinaryCaching  when true the binary cache is neither read nor written
+//! @return the built program, or 0 when it could not be created or built from source
+cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg , const char* clFileNameForCaching, bool disableBinaryCaching)
+	const char* additionalMacros = additionalMacrosArg?additionalMacrosArg:"";
+	if (disableBinaryCaching)
+	{
+		//kernelSourceOrg = 0;
+	}
+	cl_program m_cpProgram=0;
+	cl_int status;
+	char binaryFileName[B3_MAX_STRING_LENGTH];
+	char deviceName[256];
+	char driverVersion[256];
+	const char* strippedName;
+	int fileUpToDate = 0;
+	int binaryFileValid=0;
+	if (!disableBinaryCaching && clFileNameForCaching)
+	{
+		clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL);
+		clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
+		// drop any directory part, for both path separator styles
+		strippedName = strip2(clFileNameForCaching,"\\");
+		strippedName = strip2(strippedName,"/");
+		// _MSC_VER is the macro the Microsoft compiler defines; with the misspelt name the bounded sprintf_s was never selected
+#ifdef _MSC_VER
+		sprintf_s(binaryFileName,B3_MAX_STRING_LENGTH,"%s/%s.%s.%s.bin",sCachedBinaryPath,strippedName, deviceName,driverVersion );
+		sprintf(binaryFileName,"%s/%s.%s.%s.bin",sCachedBinaryPath,strippedName, deviceName,driverVersion );
+	}
+	if (clFileNameForCaching && !(disableBinaryCaching || gDebugSkipLoadingBinary||gDebugForceLoadingFromSource) )
+	{
+#ifdef _WIN32
+	char* bla=0;
+		//printf("searching for %s\n", binaryFileName);
+		FILETIME modtimeBinary;
+		CreateDirectoryA(sCachedBinaryPath,0);
+		{
+			HANDLE binaryFileHandle = CreateFileA(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+			if (binaryFileHandle ==INVALID_HANDLE_VALUE)
+			{
+				DWORD errorCode;
+				errorCode = GetLastError();
+				switch (errorCode)
+				{
+					{
+						b3Warning("\nCached file not found %s\n", binaryFileName);
+						break;
+					}
+					{
+						b3Warning("\nCached file path not found %s\n", binaryFileName);
+						break;
+					}
+				default:
+					{
+						b3Warning("\nFailed reading cached file with errorCode = %d\n", errorCode);
+					}
+				}
+			} else
+			{
+				if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
+				{
+					DWORD errorCode;
+					errorCode = GetLastError();
+					b3Warning("\nGetFileTime errorCode = %d\n", errorCode);
+				} else
+				{
+					binaryFileValid = 1;
+				}
+				CloseHandle(binaryFileHandle);
+			}
+			if (binaryFileValid)
+			{
+				HANDLE srcFileHandle = CreateFileA(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+				if (srcFileHandle==INVALID_HANDLE_VALUE)
+				{
+					const char* prefix[]={"./","../","../../","../../../","../../../../"};
+					for (int i=0;(srcFileHandle==INVALID_HANDLE_VALUE) && i<5;i++)
+					{
+						char relativeFileName[1024];
+						sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching);
+						srcFileHandle = CreateFileA(relativeFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+					}
+				}
+				if (srcFileHandle!=INVALID_HANDLE_VALUE)
+				{
+					FILETIME modtimeSrc;
+					if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
+					{
+						DWORD errorCode;
+						errorCode = GetLastError();
+						b3Warning("\nGetFileTime errorCode = %d\n", errorCode);
+					}
+					// the cached binary counts as current when the source is not newer than it
+					if (  ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
+						||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
+					{
+						fileUpToDate=1;
+					} else
+					{
+						b3Warning("\nCached binary file out-of-date (%s)\n",binaryFileName);
+					}
+					CloseHandle(srcFileHandle);
+				}
+				else
+				{
+#ifdef _DEBUG
+					DWORD errorCode;
+					errorCode = GetLastError();
+					switch (errorCode)
+					{
+						{
+							b3Warning("\nSrc file not found %s\n", clFileNameForCaching);
+							break;
+						}
+						{
+							b3Warning("\nSrc path not found %s\n", clFileNameForCaching);
+							break;
+						}
+					default:
+						{
+							b3Warning("\nnSrc file reading errorCode = %d\n", errorCode);
+						}
+					}
+					//we should make sure the src file exists so we can verify the timestamp with binary
+//					assert(0);
+					b3Warning("Warning: cannot find OpenCL kernel %s to verify timestamp of binary cached kernel %s\n",clFileNameForCaching, binaryFileName);
+					fileUpToDate = true;
+					//if we cannot find the source, assume it is OK in release builds
+					fileUpToDate = true;
+				}
+			}
+		}
+	fileUpToDate = true;
+	if (mkdir(sCachedBinaryPath,0777) == -1)
+	{
+	}
+	else
+	{
+		b3Printf("Succesfully created cache directory: %s\n", sCachedBinaryPath);
+	}
+#endif //_WIN32
+	}
+	if( fileUpToDate)
+	{
+#ifdef _MSC_VER
+		FILE* file;
+		if (fopen_s(&file,binaryFileName, "rb")!=0)
+			file=0;
+		FILE* file = fopen(binaryFileName, "rb");
+		if (file)
+		{
+			size_t binarySize=0;
+			char* binary =0;
+			fseek( file, 0L, SEEK_END );
+			binarySize = ftell( file );
+			rewind( file );
+			binary = (char*)malloc(sizeof(char)*binarySize);
+			int bytesRead;
+			bytesRead = fread( binary, sizeof(char), binarySize, file );
+			fclose( file );
+			m_cpProgram = clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status );
+			b3Assert( status == CL_SUCCESS );
+			status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 );
+			b3Assert( status == CL_SUCCESS );
+			if( status != CL_SUCCESS )
+			{
+				char *build_log;
+				size_t ret_val_size;
+				clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+				build_log = (char*)malloc(sizeof(char)*(ret_val_size+1));
+				clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+				build_log[ret_val_size] = '\0';
+				b3Error("%s\n", build_log);
+				free (build_log);
+				b3Assert(0);
+				// release the unusable program before falling back to compiling from source
+				if (m_cpProgram)
+				{
+					clReleaseProgram(m_cpProgram);
+				}
+				m_cpProgram = 0;
+				b3Warning("clBuildProgram reported failure on cached binary: %s\n",binaryFileName);
+			} else
+			{
+				b3Printf("clBuildProgram successfully compiled cached binary: %s\n",binaryFileName);	
+			}
+			free (binary);
+		} else
+		{
+			b3Warning("Cannot open cached binary: %s\n",binaryFileName);
+		}
+	}
+	if (!m_cpProgram)
+	{
+		cl_int localErrNum;
+		char* compileFlags;
+		int flagsize;
+		const char* kernelSource = kernelSourceOrg;
+		// owns the buffer when the source text is read from file below; stays null otherwise
+		char* loadedKernelSrc = 0;
+		if (!kernelSourceOrg || gDebugForceLoadingFromSource)
+		{
+			if (clFileNameForCaching)
+			{
+				FILE* file = fopen(clFileNameForCaching, "rb");
+				//in many cases the relative path is a few levels up the directory hierarchy, so try it
+				if (!file)
+				{
+					const char* prefix[]={"../","../../","../../../","../../../../"};
+					// iterate over every listed prefix (the bound used to stop one entry short)
+					for (int i=0;!file && i<(int)(sizeof(prefix)/sizeof(prefix[0]));i++)
+					{
+						char relativeFileName[1024];
+						sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching);
+						file = fopen(relativeFileName, "rb");
+					}
+				}
+				if (file)
+				{
+					char* kernelSrc=0;
+					fseek( file, 0L, SEEK_END );
+					int kernelSize = ftell( file );
+					rewind( file );
+					kernelSrc = (char*)malloc(kernelSize+1);
+					int readBytes = fread((void*)kernelSrc,1,kernelSize, file);
+					kernelSrc[kernelSize] = 0;
+					fclose(file);
+					kernelSource = kernelSrc;
+					loadedKernelSrc = kernelSrc;
+				}
+			}
+		}
+		size_t program_length = kernelSource ? strlen(kernelSource) : 0;
+#ifdef MAC //or __APPLE__?
+		char* flags = "-cl-mad-enable -DMAC ";
+		const char* flags = "";
+		m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum);
+		// the program object keeps its own copy of the source text, so a buffer read from file can be released here
+		free(loadedKernelSrc);
+		if (localErrNum!= CL_SUCCESS)
+		{
+			if (pErrNum)
+				*pErrNum = localErrNum;
+			return 0;
+		}
+		// Build the program with 'mad' Optimization option
+        		flagsize = sizeof(char)*(strlen(additionalMacros) + strlen(flags) + 5);
+		compileFlags = (char*) malloc(flagsize);
+#ifdef _MSC_VER
+		sprintf_s(compileFlags,flagsize, "%s %s", flags, additionalMacros);
+		sprintf(compileFlags, "%s %s", flags, additionalMacros);
+		localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL);
+		if (localErrNum!= CL_SUCCESS)
+		{
+			char *build_log;
+			size_t ret_val_size;
+			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+			build_log = (char*) malloc(sizeof(char)*(ret_val_size+1));
+			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+			// to be carefully, terminate with \0
+			// there's no information in the reference whether the string is 0 terminated or not
+			build_log[ret_val_size] = '\0';
+			b3Error("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
+			free (build_log);
+			// nothing is handed back to the caller on this path, so release what was allocated here
+			free(compileFlags);
+			clReleaseProgram(m_cpProgram);
+			if (pErrNum)
+				*pErrNum = localErrNum;
+			return 0;
+		}
+		if( !disableBinaryCaching && clFileNameForCaching )
+		{	//	write to binary
+			cl_uint numAssociatedDevices;
+			status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
+			b3Assert( status == CL_SUCCESS );
+			// the single-binary query below only fits a program with exactly one device
+			if (numAssociatedDevices==1)
+			{
+				size_t binarySize;
+				char* binary ;
+				status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
+				b3Assert( status == CL_SUCCESS );
+				binary = (char*)malloc(sizeof(char)*binarySize);
+				status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
+				b3Assert( status == CL_SUCCESS );
+				{
+					FILE* file=0;
+#ifdef _MSC_VER
+					if (fopen_s(&file,binaryFileName, "wb")!=0)
+						file=0;
+					file = fopen(binaryFileName, "wb");
+					if (file)
+					{
+						fwrite( binary, sizeof(char), binarySize, file );
+						fclose( file );
+					} else
+					{
+						b3Warning("cannot write file %s\n", binaryFileName);
+					}
+				}
+				free (binary);
+			}
+		}
+		free(compileFlags);
+	}
+	return m_cpProgram;
+//! Creates the kernel `kernelName` from `prog`, or from a program compiled here out of `kernelSource` when `prog` is null.
+//! A program compiled here is released again before returning; a caller-supplied program is left untouched.
+//! @param clContext         context used when a program has to be compiled
+//! @param device            device used when a program has to be compiled
+//! @param kernelSource      OpenCL C source, only used when `prog` is null
+//! @param kernelName        name of the kernel function to create
+//! @param pErrNum           optional; receives CL_SUCCESS, or the error code of the failing step
+//! @param prog              optional already built program that contains the kernel
+//! @param additionalMacros  optional extra build options, only used when `prog` is null
+//! @return the kernel, or 0 when it could not be created
+cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros )
+	cl_kernel kernel;
+	cl_int localErrNum;
+	cl_program m_cpProgram = prog;
+	b3Printf("compiling kernel %s ",kernelName);
+	if (!m_cpProgram)
+	{
+		m_cpProgram = b3OpenCLUtils_compileCLProgramFromString(clContext,device,kernelSource,pErrNum, additionalMacros,0, false);
+	}
+	// Create the kernel
+	kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum);
+	if (localErrNum != CL_SUCCESS)
+	{
+		b3Error("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName);
+        assert(0);
+		// the program compiled above would otherwise be lost on this error path
+		if (!prog && m_cpProgram)
+		{
+			clReleaseProgram(m_cpProgram);
+		}
+		if (pErrNum)
+			*pErrNum = localErrNum;
+		return 0;
+	}
+	if (!prog && m_cpProgram)
+	{
+		clReleaseProgram(m_cpProgram);
+	}
+	b3Printf("ready. \n");
+	if (pErrNum)
+			*pErrNum = CL_SUCCESS;
+	return kernel;
diff --git a/src/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.h b/src/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.h
new file mode 100644
index 00000000..db6466e7
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.h
@@ -0,0 +1,194 @@
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc. 
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//original author: Roman Ponomarev
+//cleanup by Erwin Coumans
+#ifndef B3_OPENCL_UTILS_H
+#define B3_OPENCL_UTILS_H
+#include "b3OpenCLInclude.h"
+#ifdef __cplusplus
+extern "C" {
+///C API for OpenCL utilities: convenience functions, see below for C++ API
+/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
+/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
+cl_context 	b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC , int preferredDeviceIndex , int preferredPlatformIndex, cl_platform_id* platformId); // tries the available platforms in turn and returns the first context created; platformId (optional) receives the platform used
+int b3OpenCLUtils_getNumDevices(cl_context cxMainContext); // number of devices attached to the context
+cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr); // nr'th device of the context; (cl_device_id)-1 when nr is out of range
+void b3OpenCLUtils_printDeviceInfo(cl_device_id device); // logs the device properties via b3Printf
+cl_kernel b3OpenCLUtils_compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog,const char* additionalMacros); // creates kernelName from prog, or from a program compiled out of kernelSource when prog is null
+cl_program b3OpenCLUtils_compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum,const char* additionalMacros  , const char* srcFileNameForCaching, bool disableBinaryCaching); // builds a program, using the binary cache keyed by srcFileNameForCaching unless disableBinaryCaching is set
+//the following optional APIs provide access using specific platform information
+int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum);
+///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
+cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
+void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform);
+const char* b3OpenCLUtils_getSdkVendorName();
+///set the path (directory/folder) where the compiled OpenCL kernel are stored
+void b3OpenCLUtils_setCachePath(const char* path);
+cl_context 	b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC ,int preferredDeviceIndex , int preferredPlatformIndex); // creates a context on one given platform; returns 0 on failure
+#ifdef __cplusplus
+#define B3_MAX_STRING_LENGTH 1024
+typedef struct // Plain record of device properties, filled field by field by b3OpenCLUtils::getDeviceInfo.
+	char m_deviceName[B3_MAX_STRING_LENGTH]; // CL_DEVICE_NAME
+	char m_deviceVendor[B3_MAX_STRING_LENGTH]; // CL_DEVICE_VENDOR
+	char m_driverVersion[B3_MAX_STRING_LENGTH]; // CL_DRIVER_VERSION
+	char m_deviceExtensions[B3_MAX_STRING_LENGTH]; // CL_DEVICE_EXTENSIONS
+	cl_device_type		m_deviceType; // CL_DEVICE_TYPE (bit field)
+	cl_uint 				m_computeUnits; // CL_DEVICE_MAX_COMPUTE_UNITS
+	size_t 					m_workitemDims; // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
+	size_t 					m_workItemSize[3]; // CL_DEVICE_MAX_WORK_ITEM_SIZES
+	size_t 					m_image2dMaxWidth; // CL_DEVICE_IMAGE2D_MAX_WIDTH
+	size_t 					m_image2dMaxHeight; // CL_DEVICE_IMAGE2D_MAX_HEIGHT
+	size_t 					m_image3dMaxWidth; // CL_DEVICE_IMAGE3D_MAX_WIDTH
+	size_t 					m_image3dMaxHeight; // CL_DEVICE_IMAGE3D_MAX_HEIGHT
+	size_t 					m_image3dMaxDepth; // CL_DEVICE_IMAGE3D_MAX_DEPTH
+	size_t 					m_workgroupSize; // CL_DEVICE_MAX_WORK_GROUP_SIZE
+	cl_uint 				m_clockFrequency; // CL_DEVICE_MAX_CLOCK_FREQUENCY (printed with the unit MHz)
+	cl_ulong				m_constantBufferSize; // CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, in bytes
+	cl_ulong				m_localMemSize; // CL_DEVICE_LOCAL_MEM_SIZE, in bytes
+	cl_ulong				m_globalMemSize; // CL_DEVICE_GLOBAL_MEM_SIZE, in bytes
+    cl_bool					m_errorCorrectionSupport; // CL_DEVICE_ERROR_CORRECTION_SUPPORT
+	cl_device_local_mem_type m_localMemType; // CL_DEVICE_LOCAL_MEM_TYPE
+	cl_uint					m_maxReadImageArgs; // CL_DEVICE_MAX_READ_IMAGE_ARGS
+	cl_uint					m_maxWriteImageArgs; // CL_DEVICE_MAX_WRITE_IMAGE_ARGS
+	cl_uint 				m_addressBits; // CL_DEVICE_ADDRESS_BITS
+	cl_ulong				m_maxMemAllocSize; // CL_DEVICE_MAX_MEM_ALLOC_SIZE, in bytes
+	cl_command_queue_properties m_queueProperties; // CL_DEVICE_QUEUE_PROPERTIES (bit field)
+	cl_bool					m_imageSupport; // CL_DEVICE_IMAGE_SUPPORT
+	cl_uint					m_vecWidthChar; // CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR
+	cl_uint					m_vecWidthShort; // CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT
+	cl_uint					m_vecWidthInt; // CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT
+	cl_uint					m_vecWidthLong; // CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG
+	cl_uint					m_vecWidthFloat; // CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT
+	cl_uint					m_vecWidthDouble; // CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE
+} b3OpenCLDeviceInfo;
+struct b3OpenCLPlatformInfo // Textual description of an OpenCL platform; passed to b3OpenCLUtils::getPlatformInfo as its output argument.
+	char m_platformVendor[B3_MAX_STRING_LENGTH];
+	char m_platformName[B3_MAX_STRING_LENGTH];
+	char m_platformVersion[B3_MAX_STRING_LENGTH]; // printed under the label CL_PLATFORM_VERSION
+	b3OpenCLPlatformInfo() // starts every string out empty
+	{
+		m_platformVendor[0]=0;
+		m_platformName[0]=0;
+		m_platformVersion[0]=0;
+	}
+///C++ API for OpenCL utilities: convenience functions
+struct b3OpenCLUtils // Static-only wrapper: the inline members forward to the b3OpenCLUtils_* C functions and add default arguments.
+	/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
+	/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
+	static inline cl_context 	createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1, cl_platform_id* platformId=0) // the -1 defaults mean "no preferred device / platform"
+	{
+		return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx , pGLDC , preferredDeviceIndex, preferredPlatformIndex, platformId);
+	}
+	static inline int getNumDevices(cl_context cxMainContext)
+	{
+		return b3OpenCLUtils_getNumDevices(cxMainContext);
+	}
+	static inline cl_device_id getDevice(cl_context cxMainContext, int nr)
+	{
+		return b3OpenCLUtils_getDevice(cxMainContext,nr);
+	}
+	static void getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info); // not a forwarder: defined out of line, fills *info through clGetDeviceInfo queries
+	static inline void printDeviceInfo(cl_device_id device)
+	{
+		b3OpenCLUtils_printDeviceInfo(device);
+	}
+	static inline cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" ) // with prog left at 0 a program is compiled from kernelSource
+	{
+		return b3OpenCLUtils_compileCLKernelFromString(clContext,device, kernelSource,  kernelName, pErrNum, prog,additionalMacros);
+	}
+	//optional
+	static inline cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0, bool disableBinaryCaching=false)
+	{
+		return b3OpenCLUtils_compileCLProgramFromString(clContext,device, kernelSource, pErrNum,additionalMacros, srcFileNameForCaching, disableBinaryCaching);
+	}
+	//the following optional APIs provide access using specific platform information
+	static inline int getNumPlatforms(cl_int* pErrNum=0)
+	{
+		return b3OpenCLUtils_getNumPlatforms(pErrNum);
+	}
+	///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
+	static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum=0)
+	{
+		return b3OpenCLUtils_getPlatform(nr,pErrNum);
+	}
+	static void getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo); // declared here only; its definition is not part of this header
+	static inline void printPlatformInfo(cl_platform_id platform)
+	{
+		b3OpenCLUtils_printPlatformInfo(platform);
+	}
+	static inline const char* getSdkVendorName()
+	{
+		return b3OpenCLUtils_getSdkVendorName();
+	}
+	static inline cl_context 	createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1)
+	{
+		return b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx,pGLDC,preferredDeviceIndex, preferredPlatformIndex);
+	}
+	static void setCachePath(const char* path) // forwards to the C function that sets the directory for cached kernel binaries
+	{
+		b3OpenCLUtils_setCachePath(path);
+	}
+#endif //__cplusplus
+#endif // B3_OPENCL_UTILS_H
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h
new file mode 100644
index 00000000..872f0395
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h
@@ -0,0 +1,18 @@
+#ifndef B3_BVH_INFO_H
+#define B3_BVH_INFO_H
+#include "Bullet3Common/b3Vector3.h"
+struct b3BvhInfo // Per-BVH record: a bounding box, a quantization vector, and counts/offsets for its nodes and subtrees.
+	b3Vector3	m_aabbMin; // presumably the minimum corner of the BVH bounding box — confirm against the code that fills it
+	b3Vector3	m_aabbMax; // presumably the maximum corner of the BVH bounding box — confirm
+	b3Vector3	m_quantization; // presumably the scale used to quantize coordinates — confirm
+	int			m_numNodes;
+	int			m_numSubTrees;
+	int			m_nodeOffset; // presumably the index of this BVH's first node in a shared node array — confirm
+	int			m_subTreeOffset; // presumably the index of this BVH's first subtree in a shared subtree array — confirm
+#endif //B3_BVH_INFO_H
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp
new file mode 100644
index 00000000..cb30ee93
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp
@@ -0,0 +1,258 @@
+#if 0
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3ContactCache.h"
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+b3Scalar					gContactBreakingThreshold = b3Scalar(0.02);
+///gContactCalcArea3Points will approximate the convex hull area using 3 points
+///when setting it to false, it will use 4 points to compute the area: it is more accurate but slower
+bool						gContactCalcArea3Points = true;
+static inline b3Scalar calcArea4Points(const b3Vector3 &p0,const b3Vector3 &p1,const b3Vector3 &p2,const b3Vector3 &p3) // Returns the largest length2() among three cross products, one per way of pairing the four points into two difference vectors.
+	// Forms the three possible pairings of the four points, crosses each pair, and returns the biggest result.
+	b3Vector3 a[3],b[3];
+	a[0] = p0 - p1; // pairing 0: (p0-p1) with (p2-p3)
+	a[1] = p0 - p2; // pairing 1: (p0-p2) with (p1-p3)
+	a[2] = p0 - p3; // pairing 2: (p0-p3) with (p1-p2)
+	b[0] = p2 - p3;
+	b[1] = p1 - p3;
+	b[2] = p1 - p2;
+	//todo: the following 3 cross products can be easily optimized by SIMD.
+	b3Vector3 tmp0 = a[0].cross(b[0]);
+	b3Vector3 tmp1 = a[1].cross(b[1]);
+	b3Vector3 tmp2 = a[2].cross(b[2]);
+	return b3Max(b3Max(tmp0.length2(),tmp1.length2()),tmp2.length2()); // maximum of the three length2() values
+#if 0
+//using localPointA for all points
+int b3ContactCache::sortCachedPoints(const b3Vector3& pt) // Returns the index reported by closestAxis4() over four area scores, one per cached slot that pt could replace. NOTE(review): follows an `#if 0` line, so presumably compiled out -- confirm.
+		//calculate 4 possible cases areas, and take biggest area
+		//also need to keep 'deepest'
+		int maxPenetrationIndex = -1; // stays -1 when no cached point has a smaller distance than pt
+		b3Scalar maxPenetration = pt.getDistance();
+		for (int i=0;i<4;i++)
+		{
+			if (m_pointCache[i].getDistance() < maxPenetration)
+			{
+				maxPenetrationIndex = i;
+				maxPenetration = m_pointCache[i].getDistance();
+			}
+		}
+		b3Scalar res0(b3Scalar(0.)),res1(b3Scalar(0.)),res2(b3Scalar(0.)),res3(b3Scalar(0.)); // resN is the score for replacing slot N; it stays 0 for the slot at maxPenetrationIndex
+	if (gContactCalcArea3Points)
+	{
+		// one cross product per candidate slot, using pt plus the three other cached points
+		if (maxPenetrationIndex != 0)
+		{
+			b3Vector3 a0 = pt.m_localPointA-m_pointCache[1].m_localPointA;
+			b3Vector3 b0 = m_pointCache[3].m_localPointA-m_pointCache[2].m_localPointA;
+			b3Vector3 cross = a0.cross(b0);
+			res0 = cross.length2();
+		}
+		if (maxPenetrationIndex != 1)
+		{
+			b3Vector3 a1 = pt.m_localPointA-m_pointCache[0].m_localPointA;
+			b3Vector3 b1 = m_pointCache[3].m_localPointA-m_pointCache[2].m_localPointA;
+			b3Vector3 cross = a1.cross(b1);
+			res1 = cross.length2();
+		}
+		if (maxPenetrationIndex != 2)
+		{
+			b3Vector3 a2 = pt.m_localPointA-m_pointCache[0].m_localPointA;
+			b3Vector3 b2 = m_pointCache[3].m_localPointA-m_pointCache[1].m_localPointA;
+			b3Vector3 cross = a2.cross(b2);
+			res2 = cross.length2();
+		}
+		if (maxPenetrationIndex != 3)
+		{
+			b3Vector3 a3 = pt.m_localPointA-m_pointCache[0].m_localPointA;
+			b3Vector3 b3 = m_pointCache[2].m_localPointA-m_pointCache[1].m_localPointA;
+			b3Vector3 cross = a3.cross(b3);
+			res3 = cross.length2();
+		}
+	} 
+	else
+	{
+		// same four candidates, but scored with calcArea4Points (three cross products each)
+		if(maxPenetrationIndex != 0) {
+			res0 = calcArea4Points(pt.m_localPointA,m_pointCache[1].m_localPointA,m_pointCache[2].m_localPointA,m_pointCache[3].m_localPointA);
+		}
+		if(maxPenetrationIndex != 1) {
+			res1 = calcArea4Points(pt.m_localPointA,m_pointCache[0].m_localPointA,m_pointCache[2].m_localPointA,m_pointCache[3].m_localPointA);
+		}
+		if(maxPenetrationIndex != 2) {
+			res2 = calcArea4Points(pt.m_localPointA,m_pointCache[0].m_localPointA,m_pointCache[1].m_localPointA,m_pointCache[3].m_localPointA);
+		}
+		if(maxPenetrationIndex != 3) {
+			res3 = calcArea4Points(pt.m_localPointA,m_pointCache[0].m_localPointA,m_pointCache[1].m_localPointA,m_pointCache[2].m_localPointA);
+		}
+	}
+	b3Vector4 maxvec(res0,res1,res2,res3);
+	int biggestarea = maxvec.closestAxis4(); // NOTE(review): presumably the index of the largest component -- confirm against b3Vector4
+	return biggestarea;
+int b3ContactCache::getCacheEntry(const b3Vector3& newPoint) const // Returns the index of the cached point whose m_localPointA is nearest to newPoint's, or -1 if none is closer than the breaking threshold.
+	b3Scalar shortestDist =  getContactBreakingThreshold() * getContactBreakingThreshold(); // squared threshold, because distances below are compared squared
+	int size = getNumContacts();
+	int nearestPoint = -1;
+	for( int i = 0; i < size; i++ )
+	{
+		const b3Vector3 &mp = m_pointCache[i];
+		b3Vector3 diffA =  mp.m_localPointA- newPoint.m_localPointA;
+		const b3Scalar distToManiPoint = diffA.dot(diffA); // squared distance between the two local points
+		if( distToManiPoint < shortestDist )
+		{
+			shortestDist = distToManiPoint;
+			nearestPoint = i;
+		}
+	}
+	return nearestPoint;
+int b3ContactCache::addManifoldPoint(const b3Vector3& newPoint) // Stores newPoint in the cache and returns the slot index it was written to.
+	b3Assert(validContactDistance(newPoint));
+	int insertIndex = getNumContacts(); // default: append after the points already cached
+	if (insertIndex == MANIFOLD_CACHE_SIZE)
+	{
+		//sort cache so best points come first, based on area
+		insertIndex = sortCachedPoints(newPoint);
+		insertIndex = 0; // NOTE(review): as shown this unconditionally overwrites the index chosen above -- confirm whether preprocessor guards originally separated these two assignments
+		clearUserCache(m_pointCache[insertIndex]);
+	} else
+	{
+		m_cachedPoints++;
+	}
+	if (insertIndex<0)
+		insertIndex=0; // guard against a negative index before writing into the cache
+	//b3Assert(m_pointCache[insertIndex].m_userPersistentData==0);
+	m_pointCache[insertIndex] = newPoint;
+	return insertIndex;
+bool b3ContactCache::validContactDistance(const b3Vector3& pt) // True when pt.w does not exceed gContactBreakingThreshold; refreshContactPoints stores the signed distance in w.
+	return pt.w <= gContactBreakingThreshold;
+void b3ContactCache::removeContactPoint(struct b3Contact4Data& newContactCache,int i) // Removes point i by swapping it with the last point and decrementing the stored point count.
+	int numContacts = b3Contact4Data_getNumPoints(&newContactCache);
+	if (i!=(numContacts-1))
+	{
+		// move the last point's three per-point entries into slot i; the order of the remaining points is not preserved
+		b3Swap(newContactCache.m_localPosA[i],newContactCache.m_localPosA[numContacts-1]);
+		b3Swap(newContactCache.m_localPosB[i],newContactCache.m_localPosB[numContacts-1]);
+		b3Swap(newContactCache.m_worldPosB[i],newContactCache.m_worldPosB[numContacts-1]);
+	}
+	b3Contact4Data_setNumPoints(&newContactCache,numContacts-1);
+void b3ContactCache::refreshContactPoints(const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& contacts) // Recomputes each point's world position and signed distance from trA/trB, then removes points that exceed gContactBreakingThreshold.
+	int numContacts = b3Contact4Data_getNumPoints(&contacts);
+	int i;
+	/// first refresh worldspace positions and distance
+	for (i=numContacts-1;i>=0;i--)
+	{
+		b3Vector3 worldPosA = trA( contacts.m_localPosA[i]);
+		b3Vector3 worldPosB = trB( contacts.m_localPosB[i]);
+		contacts.m_worldPosB[i] = worldPosB;
+		float distance = (worldPosA -  worldPosB).dot(contacts.m_worldNormalOnB); // separation projected on the contact normal
+		contacts.m_worldPosB[i].w = distance; // the distance is stored in the w component of the world position
+	}
+	/// then remove points that drifted too far, along the normal or orthogonally to it
+	b3Scalar distance2d;
+	b3Vector3 projectedDifference,projectedPoint;
+	for (i=numContacts-1;i>=0;i--) // iterate backwards: removeContactPoint swaps the last point into slot i
+	{
+		b3Vector3 worldPosA = trA( contacts.m_localPosA[i]);
+		b3Vector3 worldPosB = trB( contacts.m_localPosB[i]); // NOTE(review): not read again in this loop; contacts.m_worldPosB[i] is used instead
+		b3Vector3&pt = contacts.m_worldPosB[i];
+		//contact becomes invalid when signed distance exceeds margin (projected on contactnormal direction)
+		if (!validContactDistance(pt))
+		{
+			removeContactPoint(contacts,i);
+		} else
+		{
+			//contact also becomes invalid when relative movement orthogonal to normal exceeds margin
+			projectedPoint = worldPosA - contacts.m_worldNormalOnB * contacts.m_worldPosB[i].w;
+			projectedDifference = contacts.m_worldPosB[i] - projectedPoint;
+			distance2d = projectedDifference.dot(projectedDifference); // squared, hence the squared threshold below
+			if (distance2d  > gContactBreakingThreshold*gContactBreakingThreshold )
+			{
+				removeContactPoint(contacts,i);
+			} else
+			{
+				////contact point processed callback
+				//if (gContactProcessedCallback)
+				//	(*gContactProcessedCallback)(manifoldPoint,(void*)m_body0,(void*)m_body1);
+			}
+		}
+	}
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h
new file mode 100644
index 00000000..d6c9b0a0
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h
@@ -0,0 +1,80 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Common/b3AlignedAllocator.h"
+///maximum contact breaking and merging threshold
+extern b3Scalar gContactBreakingThreshold;
+///b3ContactCache is a contact point cache, it stays persistent as long as objects are overlapping in the broadphase.
+///Those contact points are created by the collision narrow phase.
+///The cache can be empty, or hold 1,2,3 or 4 points. Some collision algorithms (GJK) might only add one point at a time.
+///updates/refreshes old contact points, and throw them away if necessary (distance becomes too large)
+///reduces the cache to 4 points, when more than 4 points are added, using the following rules:
+///the contact point with deepest penetration is always kept, and it tries to maximize the area covered by the points
+///note that some pairs of objects might have more than one contact manifold.
+B3_ATTRIBUTE_ALIGNED16( class) b3ContactCache // Declares the contact-point cache operations: two per-instance insert helpers and three static helpers.
+	/// sort cached points so most isolated points come first
+	int	sortCachedPoints(const b3Vector3& pt); // returns an int index
+	int addManifoldPoint( const b3Vector3& newPoint); // returns the index the point was stored at
+	/*void replaceContactPoint(const b3Vector3& newPoint,int insertIndex)
+	{
+		b3Assert(validContactDistance(newPoint));
+		m_pointCache[insertIndex] = newPoint;
+	}
+	*/
+	static bool validContactDistance(const b3Vector3& pt); // static: takes only the point to test
+	/// calculated new worldspace coordinates and depth, and reject points that exceed the collision margin
+	static void	refreshContactPoints(  const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& newContactCache);
+	static void removeContactPoint(struct b3Contact4Data& newContactCache,int i); // removes point i from the given contact data
+#endif //B3_CONTACT_CACHE_H
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp
new file mode 100644
index 00000000..b7201d2d
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp
@@ -0,0 +1,4858 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+bool findSeparatingAxisOnGpu = true;
+bool splitSearchSepAxisConcave = false;
+bool splitSearchSepAxisConvex = true;
+bool useMprGpu = true;//use mpr for edge-edge  (+contact point) or sat. Needs testing on main OpenCL platforms, before enabling...
+bool bvhTraversalKernelGPU = true;
+bool findConcaveSeparatingAxisKernelGPU = true;
+bool clipConcaveFacesAndFindContactsCPU = false;//false;//true;
+bool clipConvexFacesAndFindContactsCPU = false;//false;//true;
+bool reduceConcaveContactsOnGPU = true;//false;
+bool reduceConvexContactsOnGPU = true;//false;
+bool findConvexClippingFacesGPU = true;
+bool useGjk = true;///option for CPU/host testing, when findSeparatingAxisOnGpu = false
+bool useGjkContacts = true;//////option for CPU/host testing when findSeparatingAxisOnGpu = false
+static int myframecount=0;///for testing
+///This file was written by Erwin Coumans
+///Separating axis test based on work from Pierre Terdiman, see
+///And contact clipping based on work from Simon Hobbs
+//#define B3_DEBUG_SAT_FACE
+//#define CHECK_ON_HOST
+int b3g_actualSATPairTests=0;
+#include "b3ConvexHullContact.h"
+#include <string.h>//memcpy
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h"
+#include "Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+typedef b3AlignedObjectArray<b3Vector3> b3VertexArray;
+#include <float.h> //for FLT_MAX
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+//#include "AdlQuaternion.h"
+#include "kernels/satKernels.h"
+#include "kernels/mprKernels.h"
+#include "kernels/satConcaveKernels.h"
+#include "kernels/satClipHullContacts.h"
+#include "kernels/bvhTraversal.h"
+#include "kernels/primitiveContacts.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+#define BT_NARROWPHASE_SAT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl"
+#define BT_NARROWPHASE_SAT_CONCAVE_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl"
+#define BT_NARROWPHASE_MPR_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl"
+#define BT_NARROWPHASE_CLIPHULL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl"
+#define BT_NARROWPHASE_BVH_TRAVERSAL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl"
+#define BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl"
+#ifndef __global
+#define __global
+#ifndef __kernel
+#define __kernel
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhTraversal.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ClipFaces.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3NewContactReduction.h"
+#define dot3F4 b3Dot
+GpuSatCollision::GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue  q )
+m_totalContactsOut(m_context, m_queue),
+m_sepNormals(m_context, m_queue),
+m_hasSeparatingNormals(m_context, m_queue),
+m_concaveSepNormals(m_context, m_queue),
+m_numConcavePairsOut(m_context, m_queue),
+m_gpuCompoundPairs(m_context, m_queue),
+m_gpuCompoundSepNormals(m_context, m_queue),
+m_gpuHasCompoundSepNormals(m_context, m_queue),
+m_numCompoundPairsOut(m_context, m_queue),
+	m_totalContactsOut.push_back(0);
+	cl_int errNum=0;
+	if (1)
+	{
+		const char* mprSrc = mprKernelsCL;
+		const char* srcConcave = satConcaveKernelsCL;
+		char flags[1024]={0};
+//		sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/sat.cl");
+		m_mprPenetrationKernel  = 0;
+		m_findSeparatingAxisUnitSphereKernel = 0;
+		if (useMprGpu)
+		{
+			cl_program mprProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,mprSrc,&errNum,flags,BT_NARROWPHASE_MPR_PATH);
+			b3Assert(errNum==CL_SUCCESS);
+			m_mprPenetrationKernel  = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,mprSrc, "mprPenetrationKernel",&errNum,mprProg );
+			b3Assert(m_mprPenetrationKernel);
+			b3Assert(errNum==CL_SUCCESS);
+			m_findSeparatingAxisUnitSphereKernel =  b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,mprSrc, "findSeparatingAxisUnitSphereKernel",&errNum,mprProg );
+			b3Assert(m_findSeparatingAxisUnitSphereKernel);
+            b3Assert(errNum==CL_SUCCESS);
+			int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3);
+			m_unitSphereDirections.resize(numDirections);
+			m_unitSphereDirections.copyFromHostPointer(unitSphere162,numDirections,0,true);
+		}
+		cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,satKernelsCL,&errNum,flags,BT_NARROWPHASE_SAT_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+		cl_program satConcaveProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcConcave,&errNum,flags,BT_NARROWPHASE_SAT_CONCAVE_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+		m_findSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisKernel",&errNum,satProg );
+		b3Assert(m_findSeparatingAxisKernel);
+		b3Assert(errNum==CL_SUCCESS);
+		m_findSeparatingAxisVertexFaceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisVertexFaceKernel",&errNum,satProg );
+		b3Assert(m_findSeparatingAxisVertexFaceKernel);
+		m_findSeparatingAxisEdgeEdgeKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisEdgeEdgeKernel",&errNum,satProg );
+		b3Assert(m_findSeparatingAxisVertexFaceKernel);
+		m_findConcaveSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findConcaveSeparatingAxisKernel",&errNum,satProg );
+		b3Assert(m_findConcaveSeparatingAxisKernel);
+		b3Assert(errNum==CL_SUCCESS);
+        m_findConcaveSeparatingAxisVertexFaceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcConcave, "findConcaveSeparatingAxisVertexFaceKernel",&errNum,satConcaveProg );
+		b3Assert(m_findConcaveSeparatingAxisVertexFaceKernel);
+		b3Assert(errNum==CL_SUCCESS);
+        m_findConcaveSeparatingAxisEdgeEdgeKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcConcave, "findConcaveSeparatingAxisEdgeEdgeKernel",&errNum,satConcaveProg );
+		b3Assert(m_findConcaveSeparatingAxisEdgeEdgeKernel);
+		b3Assert(errNum==CL_SUCCESS);
+		m_findCompoundPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findCompoundPairsKernel",&errNum,satProg );
+		b3Assert(m_findCompoundPairsKernel);
+		b3Assert(errNum==CL_SUCCESS);
+		m_processCompoundPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "processCompoundPairsKernel",&errNum,satProg );
+		b3Assert(m_processCompoundPairsKernel);
+		b3Assert(errNum==CL_SUCCESS);
+	}
+	if (1)
+	{
+		const char* srcClip = satClipKernelsCL;
+		char flags[1024]={0};
+//		sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/satClipHullContacts.cl");
+		cl_program satClipContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,flags,BT_NARROWPHASE_CLIPHULL_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+		m_clipHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);
+		m_clipCompoundsHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipCompoundsHullHullKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);
+        m_findClippingFacesKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "findClippingFacesKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);
+        m_clipFacesAndFindContacts = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipFacesAndFindContactsKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);        
+		m_clipHullHullConcaveConvexKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullConcaveConvexKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);
+//		m_extractManifoldAndAddContactKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "extractManifoldAndAddContactKernel",&errNum,satClipContactsProg);
+	//	b3Assert(errNum==CL_SUCCESS);
+        m_newContactReductionKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip,
+                            "newContactReductionKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);
+	}
+   else
+	{
+		m_clipHullHullKernel=0;
+		m_clipCompoundsHullHullKernel = 0;
+        m_findClippingFacesKernel = 0;
+        m_newContactReductionKernel=0;
+        m_clipFacesAndFindContacts = 0;
+		m_clipHullHullConcaveConvexKernel = 0;
+//		m_extractManifoldAndAddContactKernel = 0;
+	}
+	 if (1)
+	{
+		const char* srcBvh = bvhTraversalKernelCL;
+		cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"",BT_NARROWPHASE_BVH_TRAVERSAL_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+		m_bvhTraversalKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcBvh, "bvhTraversalKernel",&errNum,bvhTraversalProg,"");
+		b3Assert(errNum==CL_SUCCESS);
+	}
+	 {
+		 const char* primitiveContactsSrc = primitiveContactsKernelsCL;
+		cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,primitiveContactsSrc,&errNum,"",BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+		m_primitiveContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "primitiveContactsKernel",&errNum,primitiveContactsProg,"");
+		b3Assert(errNum==CL_SUCCESS);
+		m_findConcaveSphereContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "findConcaveSphereContactsKernel",&errNum,primitiveContactsProg );
+		b3Assert(errNum==CL_SUCCESS);
+		b3Assert(m_findConcaveSphereContactsKernel);
+		m_processCompoundPairsPrimitivesKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "processCompoundPairsPrimitivesKernel",&errNum,primitiveContactsProg,"");
+		b3Assert(errNum==CL_SUCCESS);
+		b3Assert(m_processCompoundPairsPrimitivesKernel);
+	 }
+	if (m_findSeparatingAxisVertexFaceKernel) // Start of the kernel release sequence: each non-null kernel handle is passed to clReleaseKernel. NOTE(review): the enclosing signature is not present in this chunk; presumably the destructor -- confirm.
+		clReleaseKernel(m_findSeparatingAxisVertexFaceKernel);
+	if (m_findSeparatingAxisEdgeEdgeKernel)
+		clReleaseKernel(m_findSeparatingAxisEdgeEdgeKernel);
+	if (m_findSeparatingAxisUnitSphereKernel)
+		clReleaseKernel(m_findSeparatingAxisUnitSphereKernel);
+	if (m_mprPenetrationKernel)
+		clReleaseKernel(m_mprPenetrationKernel);
+	if (m_findSeparatingAxisKernel)
+		clReleaseKernel(m_findSeparatingAxisKernel);
+    if (m_findConcaveSeparatingAxisVertexFaceKernel)
+        clReleaseKernel(m_findConcaveSeparatingAxisVertexFaceKernel);
+    if (m_findConcaveSeparatingAxisEdgeEdgeKernel)
+        clReleaseKernel(m_findConcaveSeparatingAxisEdgeEdgeKernel);
+	if (m_findConcaveSeparatingAxisKernel)
+		clReleaseKernel(m_findConcaveSeparatingAxisKernel);
+	if (m_findCompoundPairsKernel)
+		clReleaseKernel(m_findCompoundPairsKernel);
+	if (m_processCompoundPairsKernel)
+		clReleaseKernel(m_processCompoundPairsKernel);
+    if (m_findClippingFacesKernel)
+        clReleaseKernel(m_findClippingFacesKernel);
+    if (m_clipFacesAndFindContacts)
+        clReleaseKernel(m_clipFacesAndFindContacts);
+    if (m_newContactReductionKernel)
+        clReleaseKernel(m_newContactReductionKernel);
+	if (m_primitiveContactsKernel)
+		clReleaseKernel(m_primitiveContactsKernel);
+	if (m_findConcaveSphereContactsKernel)
+		clReleaseKernel(m_findConcaveSphereContactsKernel);
+	if (m_processCompoundPairsPrimitivesKernel)
+		clReleaseKernel(m_processCompoundPairsPrimitivesKernel);
+	if (m_clipHullHullKernel)
+		clReleaseKernel(m_clipHullHullKernel);
+	if (m_clipCompoundsHullHullKernel)
+		clReleaseKernel(m_clipCompoundsHullHullKernel);
+	if (m_clipHullHullConcaveConvexKernel)
+		clReleaseKernel(m_clipHullHullConcaveConvexKernel);
+//	if (m_extractManifoldAndAddContactKernel)
+	//	clReleaseKernel(m_extractManifoldAndAddContactKernel);
+	if (m_bvhTraversalKernel)
+		clReleaseKernel(m_bvhTraversalKernel);
+struct MyTriangleCallback : public b3NodeOverlapCallback // Callback that only prints the stored body indices and the visited triangle index.
+	int m_bodyIndexA;
+	int m_bodyIndexB;
+	virtual void processNode(int subPart, int triangleIndex) // subPart is not used; output goes to stdout via printf
+	{
+		printf("bodyIndexA %d, bodyIndexB %d\n",m_bodyIndexA,m_bodyIndexB);
+		printf("triangleIndex %d\n", triangleIndex);
+	}
+#define float4 b3Vector3
+#define make_float4(x,y,z,w) b3MakeVector3(x,y,z,w)
+float signedDistanceFromPointToPlane(const float4& point, const float4& planeEqn, float4* closestPointOnFace) // Returns dot(n, point) + planeEqn[3] and writes point - dist*n to *closestPointOnFace.
+	float4 n = planeEqn;
+	n[3] = 0.f; // keep only the first three components as the normal; the fourth is the plane offset, added separately below
+	float dist = dot3F4(n, point) + planeEqn[3];
+	*closestPointOnFace = point - dist * n; // NOTE(review): this is the projection onto the plane only if n is unit length -- confirm
+	return dist;
+#define cross3(a,b) (a.cross(b))
+b3Vector3 transform(const b3Vector3* v, const b3Vector3* pos, const b3Quaternion* orn) // Builds a b3Transform from origin *pos and rotation *orn, applies it to *v, and returns the result.
+	b3Transform tr;
+	tr.setIdentity();
+	tr.setOrigin(*pos);
+	tr.setRotation(*orn);
+	b3Vector3 res = tr(*v);
+	return res;
+inline bool IsPointInPolygon(const float4& p, // Returns true if p passes the edge test for every edge of the face; otherwise writes a point on the first failing edge to *out and returns false.
+							const b3GpuFace* face,
+							 const float4* baseVertex,
+							const  int* convexIndices,
+							float4* out)
+    float4 a;
+    float4 b;
+    float4 ab;
+    float4 ap;
+    float4 v;
+	float4 plane = b3MakeVector3(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f); // first three plane components with w zeroed
+	if (face->m_numIndices<2)
+		return false; // degenerate face: *out is left untouched on this path
+	float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]];
+	b = v0; // start from the last vertex so the first edge tested is last->first
+    for(unsigned i=0; i != face->m_numIndices; ++i)
+    {
+		a = b;
+		float4 vi = baseVertex[convexIndices[face->m_indexOffset + i]];
+		b = vi;
+        ab = b-a;
+        ap = p-a;
+        v = cross3(ab,plane); // vector perpendicular to both the edge and the plane vector
+        if (b3Dot(ap, v) > 0.f)
+        {
+            // p fails this edge's test: report the closest point on segment a-b
+            float ab_m2 = b3Dot(ab, ab);
+            float rt = ab_m2 != 0.f ? b3Dot(ab, ap) / ab_m2 : 0.f; // parameter of p projected on a-b; 0 for a zero-length edge
+            if (rt <= 0.f)
+            {
+                *out = a;
+            }
+            else if (rt >= 1.f) 
+            {
+                *out = b;
+            }
+            else
+            {
+            	float s = 1.f - rt;
+				out[0].x = s * a.x + rt * b.x; // only x, y, z are written here; out->w keeps its previous value
+				out[0].y = s * a.y + rt * b.y;
+				out[0].z = s * a.z + rt * b.z;
+            }
+            return false;
+        }
+    }
+    return true;
+#define normalize3(a) (a.normalize())
+int extractManifoldSequentialGlobal( const float4* p, int nPoints, const float4& nearNormal, b3Int4* contactIdx) // Picks up to four point indices into contactIdx[0] and returns how many points to keep (0, nPoints when <= 4, else 4).
+	if( nPoints == 0 )
+        return 0;
+    if (nPoints <=4)
+        return nPoints; // contactIdx is not written on this path
+    if (nPoints >64)
+        nPoints = 64; // only the first 64 points are considered
+	float4 center = b3MakeVector3(0,0,0,0);
+	{
+		for (int i=0;i<nPoints;i++)
+			center += p[i];
+		center /= (float)nPoints; // mean of the considered points
+	}
+	//	sample 4 directions
+    float4 aVector = p[0] - center;
+    float4 u = cross3( nearNormal, aVector );
+    float4 v = cross3( nearNormal, u );
+    u = normalize3( u );
+    v = normalize3( v );
+    //keep point with deepest penetration
+    float minW= FLT_MAX;
+    int minIndex=-1;
+    float4 maxDots; // NOTE(review): despite the name, the `<` comparisons below make these track the smallest projection per direction
+    maxDots.x = FLT_MIN;
+    maxDots.y = FLT_MIN;
+    maxDots.z = FLT_MIN;
+    maxDots.w = FLT_MIN;
+    //	idx, distance
+    for(int ie = 0; ie<nPoints; ie++ )
+    {
+        if (p[ie].w<minW) // smallest w seen so far; its index ends up in minIndex
+        {
+            minW = p[ie].w;
+            minIndex=ie;
+        }
+        float f;
+        float4 r = p[ie]-center;
+        f = dot3F4( u, r );
+        if (f<maxDots.x)
+        {
+            maxDots.x = f;
+            contactIdx[0].x = ie;
+        }
+        f = dot3F4( -u, r );
+        if (f<maxDots.y)
+        {
+            maxDots.y = f;
+            contactIdx[0].y = ie;
+        }
+        f = dot3F4( v, r );
+        if (f<maxDots.z)
+        {
+            maxDots.z = f;
+            contactIdx[0].z = ie;
+        }
+        f = dot3F4( -v, r );
+        if (f<maxDots.w)
+        {
+            maxDots.w = f;
+            contactIdx[0].w = ie;
+        }
+    }
+    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)
+    {
+        //replace the first contact with minimum (todo: replace contact with least penetration)
+        contactIdx[0].x = minIndex;
+    }
+    return 4;
+#define MAX_VERTS 1024
+inline void project(const b3ConvexPolyhedronData& hull,  const float4& pos, const b3Quaternion& orn, const float4& dir, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar& min, b3Scalar& max) // Writes to min/max the extent of the hull's vertices along dir, offset by dot(pos, dir).
+	min = FLT_MAX;
+	max = -FLT_MAX;
+	int numVerts = hull.m_numVertices;
+	const float4 localDir = b3QuatRotate(orn.inverse(),dir); // rotate dir by the inverse orientation so vertices can be used untransformed
+	b3Scalar offset = dot3F4(pos,dir); // translation contribution, added once after the loop
+	for(int i=0;i<numVerts;i++)
+	{
+		//b3Vector3 pt = trans * vertices[m_vertexOffset+i];
+		//b3Scalar dp = pt.dot(dir);
+		b3Vector3 vertex = vertices[hull.m_vertexOffset+i]; // NOTE(review): not read afterwards in this function
+		b3Scalar dp = dot3F4((float4&)vertices[hull.m_vertexOffset+i],localDir);
+		//b3Assert(dp==dpL);
+		if(dp < min)	min = dp;
+		if(dp > max)	max = dp;
+	}
+	if(min>max) // holds only if no vertex updated min/max (e.g. numVerts <= 0), leaving the initial sentinels
+	{
+		b3Scalar tmp = min;
+		min = max;
+		max = tmp;
+	}
+	min += offset;
+	max += offset;
+static bool TestSepAxis(const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, // Projects both hulls on sep_axis; returns false if the intervals are disjoint, else true with depth set to the smaller overlap.
+	const float4& posA,const b3Quaternion& ornA,
+	const float4& posB,const b3Quaternion& ornB,
+	const float4& sep_axis, const b3AlignedObjectArray<b3Vector3>& verticesA,const b3AlignedObjectArray<b3Vector3>& verticesB,b3Scalar& depth)
+	b3Scalar Min0,Max0;
+	b3Scalar Min1,Max1;
+	project(hullA,posA,ornA,sep_axis,verticesA, Min0, Max0);
+	project(hullB,posB,ornB, sep_axis,verticesB, Min1, Max1);
+	if(Max0<Min1 || Max1<Min0)
+		return false; // intervals do not overlap; depth is not written
+	b3Scalar d0 = Max0 - Min1;
+	assert(d0>=0.0f);
+	b3Scalar d1 = Max1 - Min0;
+	assert(d1>=0.0f);
+	depth = d0<d1 ? d0:d1; // smaller of the two overlap amounts
+	return true;
+inline bool IsAlmostZero(const b3Vector3& v) // True when |x|, |y| and |z| are all at most 1e-6; the w component is not examined.
+	if(fabsf(v.x)>1e-6 || fabsf(v.y)>1e-6 || fabsf(v.z)>1e-6)	return false;
+	return true;
+static bool findSeparatingAxis(	const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, // Tests face normals of A, face normals of B and edge-edge cross products; returns false on the first separating axis, else true with sep set to the least-overlap axis.
+	const float4& posA1,
+	const b3Quaternion& ornA,
+	const float4& posB1,
+	const b3Quaternion& ornB,
+	const b3AlignedObjectArray<b3Vector3>& verticesA,
+	const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, 
+	const b3AlignedObjectArray<b3GpuFace>& facesA,
+	const b3AlignedObjectArray<int>& indicesA,
+	const b3AlignedObjectArray<b3Vector3>& verticesB, 
+	const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB, 
+	const b3AlignedObjectArray<b3GpuFace>& facesB,
+	const b3AlignedObjectArray<int>& indicesB,
+	b3Vector3& sep)
+	B3_PROFILE("findSeparatingAxis");
+	b3g_actualSATPairTests++; // global counter, incremented once per call
+	float4 posA = posA1;
+	posA.w = 0.f; // work on copies with w cleared
+	float4 posB = posB1;
+	posB.w = 0.f;
+	float4 c0local = (float4&)hullA.m_localCenter;
+	float4 c0 = transform(&c0local, &posA, &ornA);
+	float4 c1local = (float4&)hullB.m_localCenter;
+	float4 c1 = transform(&c1local,&posB,&ornB);
+	const float4 deltaC2 = c0 - c1; // vector from B's transformed centre to A's, used to orient candidate axes
+	b3Scalar dmin = FLT_MAX; // smallest overlap found so far
+	int curPlaneTests=0;
+	int numFacesA = hullA.m_numFaces;
+	// Test normals from hullA
+	for(int i=0;i<numFacesA;i++)
+	{
+		const float4& normal = (float4&)facesA[hullA.m_faceOffset+i].m_plane;
+		float4 faceANormalWS = b3QuatRotate(ornA,normal);
+		if (dot3F4(deltaC2,faceANormalWS)<0)
+			faceANormalWS*=-1.f; // flip so the axis has a non-negative dot with deltaC2
+		curPlaneTests++;
+		gExpectedNbTests++; // NOTE(review): gExpectedNbTests, gUseInternalObject, transA/transB and DeltaC2 are not declared in this function as shown; presumably these lines sat under a preprocessor guard missing from this chunk -- confirm
+		if(gUseInternalObject && !TestInternalObjects(transA,transB, DeltaC2, faceANormalWS, hullA, hullB, dmin))
+			continue;
+		gActualNbTests++;
+		b3Scalar d;
+		if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,faceANormalWS, verticesA, verticesB,d))
+			return false; // a separating axis exists: the hulls do not overlap
+		if(d<dmin)
+		{
+			dmin = d;
+			sep = (b3Vector3&)faceANormalWS;
+		}
+	}
+	int numFacesB = hullB.m_numFaces;
+	// Test normals from hullB
+	for(int i=0;i<numFacesB;i++)
+	{
+		float4 normal = (float4&)facesB[hullB.m_faceOffset+i].m_plane;
+		float4 WorldNormal = b3QuatRotate(ornB, normal);
+		if (dot3F4(deltaC2,WorldNormal)<0)
+		{
+			WorldNormal*=-1.f;
+		}
+		curPlaneTests++;
+		gExpectedNbTests++;
+		if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, WorldNormal, hullA, hullB, dmin))
+			continue;
+		gActualNbTests++;
+		b3Scalar d;
+		if(!TestSepAxis(hullA, hullB,posA,ornA,posB,ornB,WorldNormal,verticesA,verticesB,d))
+			return false;
+		if(d<dmin)
+		{
+			dmin = d;
+			sep = (b3Vector3&)WorldNormal;
+		}
+	}
+	int curEdgeEdge = 0;
+	// Test edges
+	for(int e0=0;e0<hullA.m_numUniqueEdges;e0++)
+	{
+		const float4& edge0 = (float4&) uniqueEdgesA[hullA.m_uniqueEdgesOffset+e0];
+		float4 edge0World = b3QuatRotate(ornA,(float4&)edge0);
+		for(int e1=0;e1<hullB.m_numUniqueEdges;e1++)
+		{
+			const b3Vector3 edge1 = uniqueEdgesB[hullB.m_uniqueEdgesOffset+e1];
+			float4 edge1World = b3QuatRotate(ornB,(float4&)edge1);
+			float4 crossje = cross3(edge0World,edge1World);
+			curEdgeEdge++;
+			if(!IsAlmostZero((b3Vector3&)crossje)) // skip near-zero cross products, which give no usable axis
+			{
+				crossje = normalize3(crossje);
+				if (dot3F4(deltaC2,crossje)<0)
+					crossje*=-1.f;
+				gExpectedNbTests++;
+				if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, Cross, hullA, hullB, dmin))
+					continue;
+				gActualNbTests++;
+				b3Scalar dist;
+				if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,crossje, verticesA,verticesB,dist))
+					return false;
+				if(dist<dmin)
+				{
+					dmin = dist;
+					sep = (b3Vector3&)crossje;
+				}
+			}
+		}
+	}
+	if((dot3F4(-deltaC2,(float4&)sep))>0.0f)
+		sep = -sep; // final orientation: make dot(-deltaC2, sep) non-positive
+	return true;
+// Edge-edge part of the separating-axis test between two convex hulls.
+// For every pair (unique edge of A, unique edge of B) the normalized cross product of the two
+// rotated edge directions is used as a candidate axis and both hulls are projected onto it.
+//   hullA, hullB    hulls to test; both read from the same vertices/uniqueEdges arrays through their offsets
+//   posA1, posB1    hull positions; local copies with w forced to 0 are used
+//   ornA, ornB      hull orientations
+//   DeltaC2         orientation hint: a candidate axis is negated when dot(DeltaC2, axis) < 0
+//                   (presumably center of A minus center of B - confirm against the caller)
+//   faces, indices  not read by this function
+//   sep, dmin       in/out: best axis and smallest overlap so far; replaced when an edge axis yields a smaller overlap
+// Returns false as soon as the two projections do not overlap on some axis (the hulls are separated),
+// true otherwise. Before returning true, *sep is negated when it has a positive component along -DeltaC2.
+// The previous version recorded the separation in a local flag that was overwritten and never read,
+// so a separating edge axis was never reported.
+bool findSeparatingAxisEdgeEdge(	__global const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB, 
+	const b3Float4& posA1,
+	const b3Quat& ornA,
+	const b3Float4& posB1,
+	const b3Quat& ornB,
+	const b3Float4& DeltaC2,
+	__global const b3AlignedObjectArray<float4>& vertices, 
+	__global const b3AlignedObjectArray<float4>& uniqueEdges, 
+	__global const b3AlignedObjectArray<b3GpuFace>& faces,
+	__global const b3AlignedObjectArray<int>&  indices,
+	float4* sep,
+	float* dmin)
+{
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	// Test edges
+	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)
+	{
+		const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];
+		float4 edge0World = b3QuatRotate(ornA,edge0);
+		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)
+		{
+			const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];
+			float4 edge1World = b3QuatRotate(ornB,edge1);
+			float4 crossje = cross3(edge0World,edge1World);
+			// a near-zero cross product cannot be normalized into an axis
+			if(IsAlmostZero(crossje))
+				continue;
+			crossje = normalize3(crossje);
+			if (dot3F4(DeltaC2,crossje)<0)
+				crossje*=-1.f;
+			float Min0,Max0;
+			float Min1,Max1;
+			project(*hullA,posA,ornA,crossje,vertices, Min0, Max0);
+			project(*hullB,posB,ornB,crossje,vertices, Min1, Max1);
+			// disjoint intervals: crossje separates the hulls
+			if(Max0<Min1 || Max1<Min0)
+				return false;
+			// overlap on this axis is the smaller of the two interval overlaps
+			float d0 = Max0 - Min1;
+			float d1 = Max1 - Min0;
+			float dist = d0<d1 ? d0:d1;
+			if(dist<*dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+// Component-wise a + (b - a) * t on x, y and z; the w of the result is 0.
+__inline float4 lerp3(const float4& a,const float4& b, float  t)
+{
+	const float x = a.x + (b.x - a.x) * t;
+	const float y = a.y + (b.y - a.y) * t;
+	const float z = a.z + (b.z - a.z) * t;
+	return b3MakeVector3(x, y, z, 0.f);
+}
+// Clips the polygon pVtxIn (numVertsIn vertices) against the plane
+// dot3F4(planeNormalWS, p) + planeEqWS = 0 and keeps the part on the negative side.
+// The result is written to ppVtxOut and its vertex count is returned; fewer than two
+// input vertices produce 0. No capacity check is made on ppVtxOut.
+int clipFace(const float4* pVtxIn, int numVertsIn, float4& planeNormalWS,float planeEqWS, float4* ppVtxOut)
+{
+	if (numVertsIn < 2)
+		return 0;
+	int outCount = 0;
+	// walk the edges prev -> cur, beginning with the closing edge (last vertex -> first vertex)
+	float4 prev = pVtxIn[numVertsIn-1];
+	float prevDist = dot3F4(planeNormalWS,prev)+planeEqWS;
+	for (int k = 0; k < numVertsIn; k++)
+	{
+		const float4 cur = pVtxIn[k];
+		const float curDist = dot3F4(planeNormalWS,cur)+planeEqWS;
+		const bool prevBehind = prevDist<0;
+		const bool curBehind = curDist<0;
+		if (prevBehind != curBehind)
+		{
+			// the edge crosses the plane: emit the intersection point first
+			ppVtxOut[outCount++] = lerp3(prev, cur,(prevDist * 1.f/(prevDist - curDist)) );
+		}
+		if (curBehind)
+		{
+			// the end point is on the kept side
+			ppVtxOut[outCount++] = cur;
+		}
+		prev = cur;
+		prevDist = curDist;
+	}
+	return outCount;
+}
+// Clips a world-space polygon (a face of hull B) against hull A and emits contact points.
+//   separatingNormal           used to pick the face of A whose rotated normal has the smallest dot product with it
+//   hullA, posA, ornA          hull A and its position/orientation
+//   worldVertsB1               input polygon (numWorldVertsB1 vertices); also used as a clip scratch buffer
+//   worldVertsB2               second clip scratch buffer; capacityWorldVertsB2 is not checked here
+//   minDist, maxDist           depths <= minDist are clamped to minDist; points with depth > maxDist are dropped
+//   verticesA, facesA, indicesA  geometry pools of hull A
+//   contactsOut                output: xyz = clipped point, w = depth (at most contactCapacity entries)
+// Returns the number of contacts written (0 when hull A has no faces).
+int clipFaceAgainstHull(const float4& separatingNormal, const b3ConvexPolyhedronData* hullA,  
+	const float4& posA, const b3Quaternion& ornA, float4* worldVertsB1, int numWorldVertsB1,
+	float4* worldVertsB2, int capacityWorldVertsB2,
+	const float minDist, float maxDist,
+	const b3AlignedObjectArray<float4>& verticesA,	const b3AlignedObjectArray<b3GpuFace>& facesA,	const b3AlignedObjectArray<int>& indicesA,
+	//const float4* verticesB,	const b3GpuFace* facesB,	const int* indicesB,
+	float4* contactsOut,
+	int contactCapacity)
+{
+	int numContactsOut = 0;
+	float4* pVtxIn = worldVertsB1;
+	float4* pVtxOut = worldVertsB2;
+	int numVertsIn = numWorldVertsB1;
+	// witness face of A: smallest dot(rotated face normal, separatingNormal)
+	int closestFaceA=-1;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const float4 Normal = b3MakeVector3(
+				facesA[hullA->m_faceOffset+face].m_plane.x, 
+				facesA[hullA->m_faceOffset+face].m_plane.y, 
+				facesA[hullA->m_faceOffset+face].m_plane.z,0.f);
+			const float4 faceANormalWS = b3QuatRotate(ornA,Normal);
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	if (closestFaceA<0)
+		return numContactsOut;
+	const b3GpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];
+	// rotated normal of the witness face; it is the same for every edge, so compute it once
+	const float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+	const float4 worldPlaneNormal = b3QuatRotate(ornA,localPlaneNormal);
+	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	const int numVerticesA = polyA.m_numIndices;
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+		const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];
+		const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];
+		const float4 edge0 = a - b;
+		const float4 WorldEdge0 = b3QuatRotate(ornA,edge0);
+		// side plane through edge (a,b), built from the edge direction and the witness face normal
+		float4 planeNormalWS = -cross3(WorldEdge0,worldPlaneNormal);
+		float4 worldA1 = transform(&a,&posA,&ornA);
+		float planeEqWS = -dot3F4(worldA1,planeNormalWS);
+		const int numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);
+		// ping-pong the two buffers: this pass's output is the next pass's input
+		float4* tmp = pVtxOut;
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsIn = numVertsOut;
+	}
+	// only keep points that are behind the witness face
+	{
+		const float planeEqWS=polyA.m_plane.w-dot3F4(worldPlaneNormal,posA);
+		for (int i=0;i<numVertsIn;i++)
+		{
+			float depth = dot3F4(worldPlaneNormal,pVtxIn[i])+planeEqWS;
+			if (depth <=minDist)
+			{
+				depth = minDist;
+			}
+			if (numContactsOut<contactCapacity)
+			{
+				if (depth <=maxDist)
+				{
+					float4 pointInWorld = pVtxIn[i];
+					contactsOut[numContactsOut++] = b3MakeVector3(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);
+				}
+			} else
+			{
+				b3Error("exceeding contact capacity (%d,%d)\n", numContactsOut,contactCapacity);
+			}
+		}
+	}
+	return numContactsOut;
+}
+// Generates contact points between hull A and hull B for a given separating normal.
+// Picks the face of B whose rotated normal has the largest dot product with separatingNormal,
+// transforms its vertices with (posB, ornB) into worldVertsB1 and hands the polygon to clipFaceAgainstHull.
+//   worldVertsB1, worldVertsB2  scratch buffers, each with room for capacityWorldVerts vertices
+//   minDist, maxDist            forwarded to clipFaceAgainstHull
+//   contactsOut                 output contacts (at most contactCapacity)
+// Returns the number of contacts written. Returns 0 when hull B has no faces or when the chosen
+// face has more vertices than capacityWorldVerts; both cases previously indexed or wrote out of bounds.
+// NOTE(review): the one-shot geometry dump guarded by the function-static `once` looks like leftover
+// debug output; it is kept unchanged here.
+static int	clipHullAgainstHull(const float4& separatingNormal, 
+	const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, 
+	const float4& posA, const b3Quaternion& ornA,const float4& posB, const b3Quaternion& ornB, 
+	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,
+	const float minDist, float maxDist,
+	const b3AlignedObjectArray<float4>& verticesA,	const b3AlignedObjectArray<b3GpuFace>& facesA,	const b3AlignedObjectArray<int>& indicesA,
+	const b3AlignedObjectArray<float4>& verticesB,	const b3AlignedObjectArray<b3GpuFace>& facesB,	const b3AlignedObjectArray<int>& indicesB,
+	float4*	contactsOut,
+	int contactCapacity)
+{
+	int numContactsOut = 0;
+	int numWorldVertsB1= 0;
+	B3_PROFILE("clipHullAgainstHull");
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+	{
+		static bool once = true;
+		for(int face=0;face<hullB.m_numFaces;face++)
+		{
+			if (once)
+				printf("face %d\n",face);
+			const b3GpuFace* faceB = &facesB[hullB.m_faceOffset+face];
+			if (once)
+			{
+				for (int i=0;i<faceB->m_numIndices;i++)
+				{
+					float4 vert = verticesB[hullB.m_vertexOffset+indicesB[faceB->m_indexOffset+i]];
+					printf("vert[%d] = %f,%f,%f\n",i,vert.x,vert.y,vert.z);
+				}
+			}
+			const float4 Normal = b3MakeVector3(faceB->m_plane.x, faceB->m_plane.y, faceB->m_plane.z,0.f);
+			const float4 WorldNormal = b3QuatRotate(ornB, Normal);
+			if (once)
+				printf("faceNormal = %f,%f,%f\n",Normal.x,Normal.y,Normal.z);
+			float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+		once = false;
+	}
+	b3Assert(closestFaceB>=0);
+	// the assert may be compiled out; never index facesB with -1
+	if (closestFaceB<0)
+		return numContactsOut;
+	const b3GpuFace& polyB = facesB[hullB.m_faceOffset+closestFaceB];
+	const int numVertices = polyB.m_numIndices;
+	// worldVertsB1 only holds capacityWorldVerts entries
+	if (numVertices>capacityWorldVerts)
+	{
+		b3Error("clipHullAgainstHull: face has %d vertices, scratch capacity is %d\n", numVertices,capacityWorldVerts);
+		return numContactsOut;
+	}
+	for(int e0=0;e0<numVertices;e0++)
+	{
+		const float4& b = verticesB[hullB.m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+		worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);
+	}
+	numContactsOut = clipFaceAgainstHull((float4&)separatingNormal, &hullA, 
+			posA,ornA,
+			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,
+			verticesA,				facesA,				indicesA,
+			contactsOut,contactCapacity);
+	return numContactsOut;
+}
+#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j];	// plain loop that adds v[1]..v[n-1] into v[0]; the expansion already ends with ';'
+#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}	// runs `execution` n times; the loop variable `ie` is visible to it
+#define REDUCE_MAX(v, n) {int i=0;\
+for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; }	// leaves in v[0] the element of v[0..n-1] with the largest .y (i stays 0)
+#define REDUCE_MIN(v, n) {int i=0;\
+for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; }	// leaves in v[0] the element of v[0..n-1] with the smallest .y (i stays 0)
+// Reduces a set of contact points to at most four.
+//   p           contact points; xyz is the position and w is compared to find the smallest value
+//               (the "deepest penetration" per the comment below)
+//   nPoints     number of points; only the first 64 are considered
+//   nearNormal  normal used to build two directions u and v perpendicular to it
+//   contactIdx  out: for more than four points, contactIdx[0].x/.y/.z/.w receive the indices of the points
+//               with the smallest projection on u, -u, v and -v (measured from the centroid); if the point
+//               with the smallest w is not among them it replaces .x. Not written for four points or fewer.
+// Returns nPoints when nPoints <= 4, otherwise 4.
+int extractManifold(const float4* p, int nPoints, const float4& nearNormal, b3Int4* contactIdx)
+{
+	if( nPoints == 0 )
+		return 0;
+	if (nPoints <=4)
+		return nPoints;
+	if (nPoints >64)
+		nPoints = 64;
+	float4 center = make_float4(0,0,0,0);
+	{
+		for (int i=0;i<nPoints;i++)
+			center += p[i];
+		center /= (float)nPoints;
+	}
+	//	sample 4 directions
+	float4 aVector = p[0] - center;
+	float4 u = cross3( nearNormal, aVector );
+	float4 v = cross3( nearNormal, u );
+	u = normalize3( u );
+	v = normalize3( v );
+	//keep point with deepest penetration
+	float minW= FLT_MAX;
+	int minIndex=-1;
+	// Smallest projection found so far along u, -u, v, -v. The search below uses '<', so the
+	// trackers must start at FLT_MAX. FLT_MIN is the smallest positive float, not the most
+	// negative one, and would ignore every candidate whose projection is >= FLT_MIN.
+	float4 minDots;
+	minDots.x = FLT_MAX;
+	minDots.y = FLT_MAX;
+	minDots.z = FLT_MAX;
+	minDots.w = FLT_MAX;
+	//	idx, distance
+	for(int ie = 0; ie<nPoints; ie++ )
+	{
+		if (p[ie].w<minW)
+		{
+			minW = p[ie].w;
+			minIndex=ie;
+		}
+		float f;
+		float4 r = p[ie]-center;
+		f = dot3F4( u, r );
+		if (f<minDots.x)
+		{
+			minDots.x = f;
+			contactIdx[0].x = ie;
+		}
+		f = dot3F4( -u, r );
+		if (f<minDots.y)
+		{
+			minDots.y = f;
+			contactIdx[0].y = ie;
+		}
+		f = dot3F4( v, r );
+		if (f<minDots.z)
+		{
+			minDots.z = f;
+			contactIdx[0].z = ie;
+		}
+		f = dot3F4( -v, r );
+		if (f<minDots.w)
+		{
+			minDots.w = f;
+			contactIdx[0].w = ie;
+		}
+	}
+	if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)
+	{
+		//replace the first contact with minimum (todo: replace contact with least penetration)
+		contactIdx[0].x = minIndex;
+	}
+	return 4;
+}
+int clipHullHullSingle(	// Clips hull B against hull A along sepNormalWorldSpace and appends one b3Contact4 (up to 4 points) to globalContactOut. Returns the index of that contact, or -1 when nothing was appended.
+			int bodyIndexA, int bodyIndexB,
+										 const float4& posA,
+										 const b3Quaternion& ornA,
+										 const float4& posB,
+										 const b3Quaternion& ornB,
+			int collidableIndexA, int collidableIndexB,
+			const b3AlignedObjectArray<b3RigidBodyData>* bodyBuf, 	// read for m_invMass (and m_pos in the _WIN32 asserts)
+			b3AlignedObjectArray<b3Contact4>* globalContactOut, 	// grown by one with expand(), then written at index nContacts
+			int& nContacts,	// in/out: incremented when a contact is appended
+			const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataA,
+			const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataB,
+			const b3AlignedObjectArray<b3Vector3>& verticesA, 
+			const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, 	// not read in this function
+			const b3AlignedObjectArray<b3GpuFace>& facesA,
+			const b3AlignedObjectArray<int>& indicesA,
+			const b3AlignedObjectArray<b3Vector3>& verticesB,
+			const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB,	// not read in this function
+			const b3AlignedObjectArray<b3GpuFace>& facesB,
+			const b3AlignedObjectArray<int>& indicesB,
+			const b3AlignedObjectArray<b3Collidable>& hostCollidablesA,
+			const b3AlignedObjectArray<b3Collidable>& hostCollidablesB,
+			const b3Vector3& sepNormalWorldSpace,	// separating normal; also stored as the contact normal
+			int maxContactCapacity			)	// no contact is appended once nContacts reaches this value
+	int contactIndex = -1;
+	b3ConvexPolyhedronData hullA, hullB;	// local copies; not read again in the visible code
+    b3Collidable colA = hostCollidablesA[collidableIndexA];
+    hullA = hostConvexDataA[colA.m_shapeIndex];
+    //printf("numvertsA = %d\n",hullA.m_numVertices);
+    b3Collidable colB = hostCollidablesB[collidableIndexB];
+    hullB = hostConvexDataB[colB.m_shapeIndex];
+    //printf("numvertsB = %d\n",hullB.m_numVertices);
+	float4 contactsOut[MAX_VERTS];	// contacts produced by the clipper before reduction
+	int localContactCapacity = MAX_VERTS;
+#ifdef _WIN32	// NOTE(review): no matching #endif is visible in this chunk - confirm where the conditional region ends
+	b3Assert(_finite(bodyBuf->at(bodyIndexA).m_pos.x));
+	b3Assert(_finite(bodyBuf->at(bodyIndexB).m_pos.x));
+	{
+		float4 worldVertsB1[MAX_VERTS];	// clip scratch buffers handed to clipHullAgainstHull
+		float4 worldVertsB2[MAX_VERTS];
+		int capacityWorldVerts = MAX_VERTS;
+		float4 hostNormal = make_float4(sepNormalWorldSpace.x,sepNormalWorldSpace.y,sepNormalWorldSpace.z,0.f);
+		int shapeA = hostCollidablesA[collidableIndexA].m_shapeIndex;
+		int shapeB = hostCollidablesB[collidableIndexB].m_shapeIndex;
+		b3Scalar minDist = -1;	// depth clamp passed to the clipper
+		b3Scalar maxDist = 0.;	// the clipper drops points whose depth exceeds this
+		b3Transform trA,trB;
+		{
+		//B3_PROFILE("transform computation");
+		//trA.setIdentity();
+		trA.setOrigin(b3MakeVector3(posA.x,posA.y,posA.z));
+		trA.setRotation(b3Quaternion(ornA.x,ornA.y,ornA.z,ornA.w));
+		//trB.setIdentity();
+		trB.setOrigin(b3MakeVector3(posB.x,posB.y,posB.z));
+		trB.setRotation(b3Quaternion(ornB.x,ornB.y,ornB.z,ornB.w));
+		}
+		b3Quaternion trAorn = trA.getRotation();
+        b3Quaternion trBorn = trB.getRotation();
+		int numContactsOut = clipHullAgainstHull(hostNormal, 
+						hostConvexDataA.at(shapeA), 
+						hostConvexDataB.at(shapeB),
+								(float4&)trA.getOrigin(), (b3Quaternion&)trAorn,
+								(float4&)trB.getOrigin(), (b3Quaternion&)trBorn,
+								worldVertsB1,worldVertsB2,capacityWorldVerts,
+								minDist, maxDist,
+								verticesA,	facesA,indicesA,
+								verticesB,	facesB,indicesB,
+								contactsOut,localContactCapacity);
+		if (numContactsOut>0)
+		{
+			B3_PROFILE("overlap");
+			float4 normalOnSurfaceB = (float4&)hostNormal;
+			b3Int4 contactIdx;	// identity mapping by default; extractManifold only rewrites it for more than 4 points
+			contactIdx.x = 0;
+			contactIdx.y = 1;
+			contactIdx.z = 2;
+			contactIdx.w = 3;
+			int numPoints = 0;
+			{
+			//	B3_PROFILE("extractManifold");
+				numPoints = extractManifold(contactsOut, numContactsOut, normalOnSurfaceB,  &contactIdx);
+			}
+			b3Assert(numPoints);
+			if (nContacts<maxContactCapacity)
+			{
+				contactIndex = nContacts;
+				globalContactOut->expand();
+				b3Contact4& contact = globalContactOut->at(nContacts);	// NOTE(review): assumes globalContactOut held exactly nContacts entries before expand() - confirm
+				contact.m_batchIdx = 0;//i;
+				contact.m_bodyAPtrAndSignBit = (bodyBuf->at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA;	// body index is negated when m_invMass is 0; index 0 cannot carry that sign
+				contact.m_bodyBPtrAndSignBit = (bodyBuf->at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB;
+				contact.m_frictionCoeffCmp = 45874;	// NOTE(review): presumably a 16-bit fixed-point encoding of about 0.7 (45874/65535) - confirm
+				contact.m_restituitionCoeffCmp = 0;
+				float distance = 0.f;	// not read afterwards
+				for (int p=0;p<numPoints;p++)
+				{
+					contact.m_worldPosB[p] = contactsOut[contactIdx.s[p]];//check if it is actually on B
+					contact.m_worldNormalOnB = normalOnSurfaceB; 
+				}
+				//printf("bodyIndexA %d,bodyIndexB %d,normal=%f,%f,%f numPoints %d\n",bodyIndexA,bodyIndexB,normalOnSurfaceB.x,normalOnSurfaceB.y,normalOnSurfaceB.z,numPoints);
+				contact.m_worldNormalOnB.w = (b3Scalar)numPoints;	// the point count is stored in the w component of the normal
+				nContacts++;
+			} else
+			{
+				b3Error("Error: exceeding contact capacity (%d/%d)\n", nContacts,maxContactCapacity);
+			}
+		}
+	}
+	return contactIndex;
+// Builds the contact between a plane (body A) and a convex hull (body B).
+//   pairIndex                  stored in the contact's m_batchIdx
+//   bodyIndexA/B               indices into rigidBodies (position, orientation, m_invMass)
+//   collidableIndexA/B         indices into collidables; A's m_shapeIndex selects the plane equation in
+//                              `faces`, B's m_shapeIndex selects the hull in convexShapes
+//   convexVertices             vertex pool of the hull, read from hullB->m_vertexOffset
+//   convexIndices              not read by this function
+//   globalContactsOut          contact array; one entry is written at index nGlobalContactsOut
+//   nGlobalContactsOut         in/out: incremented when a contact is written
+//   maxContactCapacity         no contact is written once nGlobalContactsOut reaches this value
+// Every hull vertex with a negative plane distance becomes a candidate point (w = distance). At most
+// MAX_PLANE_CONVEX_POINTS candidates are kept; more than four are reduced with
+// extractManifoldSequentialGlobal. The previous version wrote candidates without any bound check.
+void computeContactPlaneConvex(int pairIndex,
+	int bodyIndexA, int bodyIndexB, 
+	int collidableIndexA, int collidableIndexB, 
+	const b3RigidBodyData* rigidBodies, 
+	const b3Collidable* collidables,
+	const b3ConvexPolyhedronData* convexShapes,
+	const b3Vector3* convexVertices,
+	const int* convexIndices,
+	const b3GpuFace* faces,
+	b3Contact4* globalContactsOut,
+	int& nGlobalContactsOut,
+	int maxContactCapacity)
+{
+	int shapeIndex = collidables[collidableIndexB].m_shapeIndex;
+	const b3ConvexPolyhedronData* hullB = &convexShapes[shapeIndex];
+	b3Vector3 posB = rigidBodies[bodyIndexB].m_pos;
+	b3Quaternion ornB = rigidBodies[bodyIndexB].m_quat;
+	b3Vector3 posA = rigidBodies[bodyIndexA].m_pos;
+	b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat;
+	b3Vector3 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;
+	b3Vector3 planeNormal=b3MakeVector3(planeEq.x,planeEq.y,planeEq.z);
+	b3Vector3 planeNormalWorld = b3QuatRotate(ornA,planeNormal);
+	float planeConstant = planeEq.w;
+	b3Transform convexWorldTransform;
+	convexWorldTransform.setIdentity();
+	convexWorldTransform.setOrigin(posB);
+	convexWorldTransform.setRotation(ornB);
+	b3Transform planeTransform;
+	planeTransform.setIdentity();
+	planeTransform.setOrigin(posA);
+	planeTransform.setRotation(ornA);
+	// world -> plane-local transform; the same for every vertex, so invert once
+	const b3Transform worldToPlane = planeTransform.inverse();
+	b3Transform planeInConvex;
+	planeInConvex= convexWorldTransform.inverse() * planeTransform;
+	b3Vector3 planeNormalInConvex = planeInConvex.getBasis()*-planeNormal;
+	float maxDot = -1e30;
+	b3Vector3 contactPoints[MAX_PLANE_CONVEX_POINTS];
+	int numPoints = 0;
+	b3Int4 contactIdx;
+	contactIdx.s[0] = 0;
+	contactIdx.s[1] = 1;
+	contactIdx.s[2] = 2;
+	contactIdx.s[3] = 3;
+	for (int i=0;i<hullB->m_numVertices;i++)
+	{
+		b3Vector3 vtx = convexVertices[hullB->m_vertexOffset+i];
+		float curDot = vtx.dot(planeNormalInConvex);
+		if (curDot>maxDot)
+		{
+			maxDot=curDot;
+			//make sure the deepest points is always included
+			if (numPoints==MAX_PLANE_CONVEX_POINTS)
+				numPoints--;
+		}
+		// never write past the end of contactPoints
+		if (numPoints<MAX_PLANE_CONVEX_POINTS)
+		{
+			b3Vector3 vtxWorld = convexWorldTransform*vtx;
+			b3Vector3 vtxInPlane = worldToPlane*vtxWorld;
+			float dist = planeNormal.dot(vtxInPlane)-planeConstant;
+			if (dist<0.f)
+			{
+				vtxWorld.w = dist;
+				contactPoints[numPoints] = vtxWorld;
+				numPoints++;
+			}
+		}
+	}
+	int numReducedPoints = numPoints;
+	if (numPoints>4)
+	{
+		numReducedPoints = extractManifoldSequentialGlobal( contactPoints, numPoints, planeNormalInConvex, &contactIdx);
+	}
+	if (numReducedPoints>0)
+	{
+		if (nGlobalContactsOut < maxContactCapacity)
+		{
+			const int dstIdx=nGlobalContactsOut;
+			nGlobalContactsOut++;
+			b3Contact4* c = &globalContactsOut[dstIdx];
+			c->m_worldNormalOnB = -planeNormalWorld;
+			c->setFrictionCoeff(0.7);
+			c->setRestituitionCoeff(0.f);
+			c->m_batchIdx = pairIndex;
+			// the body index is negated when m_invMass is 0
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+			for (int i=0;i<numReducedPoints;i++)
+			{
+				b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]];
+				c->m_worldPosB[i] = pOnB1;
+			}
+			// the point count is stored in the w component of the normal
+			c->m_worldNormalOnB.w = (b3Scalar)numReducedPoints;
+		}//if (dstIdx < numPairs)
+	}
+//	printf("computeContactPlaneConvex\n");
+}
+// Converts three quantized values back to floating point: each vecIn component is divided by
+// the matching component of `quantization`, then bvhAabbMin is added to the result.
+B3_FORCE_INLINE b3Vector3	MyUnQuantize(const unsigned short* vecIn, const b3Vector3& quantization, const b3Vector3& bvhAabbMin)
+{
+	const b3Scalar scaledX = (b3Scalar)(vecIn[0]) / (quantization.x);
+	const b3Scalar scaledY = (b3Scalar)(vecIn[1]) / (quantization.y);
+	const b3Scalar scaledZ = (b3Scalar)(vecIn[2]) / (quantization.z);
+	b3Vector3 result;
+	result.setValue(scaledX, scaledY, scaledZ);
+	result += bvhAabbMin;
+	return result;
+}
+void traverseTreeTree()	// NOTE(review): no body is visible for this function in this chunk - confirm it is an intentionally empty stub
+#include "Bullet3Common/shared/b3Mat3x3.h"
+int numAabbChecks = 0;	// file-scope counter: reset and incremented around the AABB overlap tests in findCompoundPairsKernel
+int maxNumAabbChecks = 0;	// largest numAabbChecks value recorded by findCompoundPairsKernel via b3Max
+int maxDepth = 0;	// largest node-stack depth seen by findCompoundPairsKernel; it prints the value whenever it grows
+// work-in-progress
+__kernel void   findCompoundPairsKernel( 
+	int pairIndex,
+	int bodyIndexA,
+	int bodyIndexB,
+	int collidableIndexA,
+	int collidableIndexB,
+	__global const b3RigidBodyData* rigidBodies, 
+	__global const b3Collidable* collidables,
+	__global const b3ConvexPolyhedronData* convexShapes, 
+	__global const b3AlignedObjectArray<b3Float4>& vertices,
+	__global const b3AlignedObjectArray<b3Aabb>& aabbsWorldSpace,
+	__global const b3AlignedObjectArray<b3Aabb>& aabbsLocalSpace,
+	__global const b3GpuChildShape* gpuChildShapes,
+	__global b3Int4* gpuCompoundPairsOut,
+	__global  int* numCompoundPairsOut,
+	int maxNumCompoundPairsCapacity,
+	b3AlignedObjectArray<b3QuantizedBvhNode>&	treeNodesCPU,
+	b3AlignedObjectArray<b3BvhSubtreeInfo>&	subTreesCPU,
+	b3AlignedObjectArray<b3BvhInfo>&	bvhInfoCPU
+	)
+	numAabbChecks=0;
+	maxNumAabbChecks=0;
+	int i = pairIndex;
+	{
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+			int bvhA = collidables[collidableIndexA].m_compoundBvhIndex;
+			int bvhB = collidables[collidableIndexB].m_compoundBvhIndex;
+			int numSubTreesA = bvhInfoCPU[bvhA].m_numSubTrees;
+			int subTreesOffsetA = bvhInfoCPU[bvhA].m_subTreeOffset;
+			int subTreesOffsetB = bvhInfoCPU[bvhB].m_subTreeOffset;
+			int numSubTreesB = bvhInfoCPU[bvhB].m_numSubTrees;
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+			b3Transform transA;
+			transA.setIdentity();
+			transA.setOrigin(posA);
+			transA.setRotation(ornA);
+			b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+			b3Transform transB;
+			transB.setIdentity();
+			transB.setOrigin(posB);
+			transB.setRotation(ornB);
+			for (int p=0;p<numSubTreesA;p++)
+			{
+				b3BvhSubtreeInfo subtreeA = subTreesCPU[subTreesOffsetA+p];
+				//bvhInfoCPU[bvhA].m_quantization
+				b3Vector3 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin);
+				b3Vector3 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin);
+				b3Vector3 aabbAMinOut,aabbAMaxOut;
+				float margin=0.f;
+				b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut);
+				for (int q=0;q<numSubTreesB;q++)
+				{
+					b3BvhSubtreeInfo subtreeB = subTreesCPU[subTreesOffsetB+q];
+					b3Vector3 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin);
+					b3Vector3 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin);
+					b3Vector3 aabbBMinOut,aabbBMaxOut;
+					float margin=0.f;
+					b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut);
+					numAabbChecks=0;
+					bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+					if (aabbOverlap)
+					{
+						int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfoCPU[bvhA].m_nodeOffset;
+						int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize;
+						int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfoCPU[bvhB].m_nodeOffset;
+						int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize;
+						b3AlignedObjectArray<b3Int2> nodeStack;
+						b3Int2 node0;
+						node0.x = startNodeIndexA;
+						node0.y = startNodeIndexB;
+						int maxStackDepth = 1024;
+						nodeStack.resize(maxStackDepth);
+						int depth=0;
+						nodeStack[depth++]=node0;
+						do
+						{
+							if (depth > maxDepth)
+							{
+								maxDepth=depth;
+								printf("maxDepth=%d\n",maxDepth);
+							}
+							b3Int2 node = nodeStack[--depth];
+							b3Vector3 aMinLocal = MyUnQuantize(treeNodesCPU[node.x].m_quantizedAabbMin,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin);
+							b3Vector3 aMaxLocal = MyUnQuantize(treeNodesCPU[node.x].m_quantizedAabbMax,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin);
+							b3Vector3 bMinLocal = MyUnQuantize(treeNodesCPU[node.y].m_quantizedAabbMin,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin);
+							b3Vector3 bMaxLocal = MyUnQuantize(treeNodesCPU[node.y].m_quantizedAabbMax,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin);
+							float margin=0.f;
+							b3Vector3 aabbAMinOut,aabbAMaxOut;
+							b3TransformAabb2(aMinLocal,aMaxLocal, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut);
+							b3Vector3 aabbBMinOut,aabbBMaxOut;
+							b3TransformAabb2(bMinLocal,bMaxLocal, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut);
+							numAabbChecks++;
+							bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+							if (nodeOverlap)
+							{
+								bool isLeafA = treeNodesCPU[node.x].isLeafNode();
+								bool isLeafB = treeNodesCPU[node.y].isLeafNode();
+								bool isInternalA = !isLeafA;
+								bool isInternalB = !isLeafB;
+								//fail, even though it might hit two leaf nodes
+								if (depth+4>maxStackDepth && !(isLeafA && isLeafB))
+								{
+									b3Error("Error: traversal exceeded maxStackDepth\n");
+									continue;
+								}
+								if(isInternalA)
+								{
+									int nodeAleftChild = node.x+1;
+									bool isNodeALeftChildLeaf = treeNodesCPU[node.x+1].isLeafNode();
+									int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + treeNodesCPU[node.x+1].getEscapeIndex();
+									if(isInternalB)
+									{					
+										int nodeBleftChild = node.y+1;
+										bool isNodeBLeftChildLeaf = treeNodesCPU[node.y+1].isLeafNode();
+										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + treeNodesCPU[node.y+1].getEscapeIndex();
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild);
+									}
+									else
+									{
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y);
+									}
+								}
+								else
+								{
+									if(isInternalB)
+									{
+										int nodeBleftChild = node.y+1;
+										bool isNodeBLeftChildLeaf = treeNodesCPU[node.y+1].isLeafNode();
+										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + treeNodesCPU[node.y+1].getEscapeIndex();
+										nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild);
+									}
+									else
+									{
+										int compoundPairIdx = b3AtomicInc(numCompoundPairsOut);
+										if (compoundPairIdx<maxNumCompoundPairsCapacity)
+										{
+											int childShapeIndexA = treeNodesCPU[node.x].getTriangleIndex();
+											int childShapeIndexB = treeNodesCPU[node.y].getTriangleIndex();
+											gpuCompoundPairsOut[compoundPairIdx]  = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+										}
+									}
+								}
+							}
+						} while (depth);
+						maxNumAabbChecks = b3Max(numAabbChecks,maxNumAabbChecks);
+					}
+				}
+			}
+			return;
+		}
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+			if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) 
+			{
+				int numChildrenA = collidables[collidableIndexA].m_numChildShapes;
+				for (int c=0;c<numChildrenA;c++)
+				{
+					int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;
+					int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+					float4 posA = rigidBodies[bodyIndexA].m_pos;
+					b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+					float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+					b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+					float4 newPosA = b3QuatRotate(ornA,childPosA)+posA;
+					b3Quat newOrnA = b3QuatMul(ornA,childOrnA);
+					b3Aabb aabbA = aabbsLocalSpace[childColIndexA];
+					b3Transform transA;
+					transA.setIdentity();
+					transA.setOrigin(newPosA);
+					transA.setRotation(newOrnA);
+					b3Scalar margin=0.0f;
+					b3Vector3 aabbAMinOut,aabbAMaxOut;
+					b3TransformAabb2((const b3Float4&)aabbA.m_min,(const b3Float4&)aabbA.m_max, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut);
+					if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+						for (int b=0;b<numChildrenB;b++)
+						{
+							int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+							int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+							b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+							b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+							float4 newPosB = transform(&childPosB,&posB,&ornB);
+							b3Quat newOrnB = b3QuatMul(ornB,childOrnB);
+							b3Aabb aabbB = aabbsLocalSpace[childColIndexB];
+							b3Transform transB;
+							transB.setIdentity();
+							transB.setOrigin(newPosB);
+							transB.setRotation(newOrnB);
+							b3Vector3 aabbBMinOut,aabbBMaxOut;
+							b3TransformAabb2((const b3Float4&)aabbB.m_min,(const b3Float4&)aabbB.m_max, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut);
+							numAabbChecks++;
+							bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+							if (aabbOverlap)
+							{
+								/*
+								int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+								float dmin = FLT_MAX;
+								float4 posA = newPosA;
+								posA.w = 0.f;
+								float4 posB = newPosB;
+								posB.w = 0.f;
+								float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+								b3Quat ornA = newOrnA;
+								float4 c0 = transform(&c0local, &posA, &ornA);
+								float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+								b3Quat ornB =newOrnB;
+								float4 c1 = transform(&c1local,&posB,&ornB);
+								const float4 DeltaC2 = c0 - c1;
+								*/
+								{//
+									int compoundPairIdx = b3AtomicInc(numCompoundPairsOut);
+									if (compoundPairIdx<maxNumCompoundPairsCapacity)
+									{
+										gpuCompoundPairsOut[compoundPairIdx]  = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+									}
+								}//
+							}//fi(1)
+						} //for (int b=0
+					}//if (collidables[collidableIndexB].
+					else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						if (1)
+						{
+							int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+							float dmin = FLT_MAX;
+							float4 posA = newPosA;
+							posA.w = 0.f;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							posB.w = 0.f;
+							float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+							b3Quat ornA = newOrnA;
+							float4 c0 = transform(&c0local, &posA, &ornA);
+							float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+							b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 c1 = transform(&c1local,&posB,&ornB);
+							const float4 DeltaC2 = c0 - c1;
+							{
+								int compoundPairIdx = b3AtomicInc(numCompoundPairsOut);
+								if (compoundPairIdx<maxNumCompoundPairsCapacity)
+								{
+									gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,-1);
+								}//if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							}//
+						}//fi (1)
+					}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+				}//for (int b=0;b<numChildrenB;b++)	
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) 
+				&& (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+			{
+				int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+				for (int b=0;b<numChildrenB;b++)
+				{
+					int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+					int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+					b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+					float4 posB = rigidBodies[bodyIndexB].m_pos;
+					float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+					b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+					float4 newPosB = b3QuatRotate(ornB,childPosB)+posB;
+					b3Quat newOrnB = b3QuatMul(ornB,childOrnB);
+					int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+					//////////////////////////////////////
+					if (1)
+					{
+						int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+						float dmin = FLT_MAX;
+						float4 posA = rigidBodies[bodyIndexA].m_pos;
+						posA.w = 0.f;
+						float4 posB = newPosB;
+						posB.w = 0.f;
+						float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+						b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+						float4 c0 = transform(&c0local, &posA, &ornA);
+						float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+						b3Quat ornB =newOrnB;
+						float4 c1 = transform(&c1local,&posB,&ornB);
+						const float4 DeltaC2 = c0 - c1;
+						{//
+							int compoundPairIdx = b3AtomicInc(numCompoundPairsOut);
+							if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							{
+								gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,-1,childShapeIndexB);
+							}//fi (compoundPairIdx<maxNumCompoundPairsCapacity)
+						}//
+					}//fi (1)	
+				}//for (int b=0;b<numChildrenB;b++)
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			return;
+		}//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+	}//i<numPairs
+__kernel void   processCompoundPairsKernel( __global const b3Int4* gpuCompoundPairs, // Runs the separating-axis tests for compound pair i and records whether a normal was stored. Each pair is (x=bodyA, y=bodyB, z=child shape of A, w=child shape of B); a negative child index selects the body's own collidable.
+										__global const b3RigidBodyData* rigidBodies, // m_pos, m_quat and m_collidableIdx are read
+										__global const b3Collidable* collidables, // m_shapeIndex and m_shapeType are read
+										__global const b3ConvexPolyhedronData* convexShapes, // indexed by the collidable's m_shapeIndex
+										__global const b3AlignedObjectArray<b3Float4>& vertices, // forwarded to the separating-axis helpers
+										__global const b3AlignedObjectArray<b3Float4>& uniqueEdges, // forwarded to the separating-axis helpers
+										__global const b3AlignedObjectArray<b3GpuFace>& faces, // forwarded to the separating-axis helpers
+										__global const b3AlignedObjectArray<int>& indices, // forwarded to the separating-axis helpers
+										__global b3Aabb* aabbs, // NOTE(review): not referenced in this function
+										__global const b3GpuChildShape* gpuChildShapes, // m_shapeIndex, m_childPosition and m_childOrientation are read
+										__global b3AlignedObjectArray<b3Float4>& gpuCompoundSepNormalsOut, // out: [i] receives sepNormal when all three tests below return true
+										__global b3AlignedObjectArray<int>& gpuHasCompoundSepNormalsOut, // out: [i] is cleared to 0, then set to 1 only when a normal is stored
+										int numCompoundPairs, // upper bound for i
+										int i // index of the pair to process
+										)
+//	int i = get_global_id(0); (disabled: i arrives as a parameter instead)
+	if (i<numCompoundPairs) // indices at or past numCompoundPairs are ignored
+	{
+		int bodyIndexA = gpuCompoundPairs[i].x;
+		int bodyIndexB = gpuCompoundPairs[i].y;
+		int childShapeIndexA = gpuCompoundPairs[i].z;
+		int childShapeIndexB = gpuCompoundPairs[i].w;
+		int collidableIndexA = -1;
+		int collidableIndexB = -1;
+		b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		if (childShapeIndexA >= 0) // A refers to a child shape: compose the child transform with the body transform
+		{
+			collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+			float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+			b3Quat	childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+			float4 newPosA = b3QuatRotate(ornA,childPosA)+posA; // child offset rotated by the body orientation, then translated by the body position
+			b3Quat newOrnA = b3QuatMul(ornA,childOrnA);
+			posA = newPosA;
+			ornA = newOrnA;
+		} else
+		{
+			collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; // no child: use the body's own collidable and transform
+		}
+		if (childShapeIndexB>=0) // same composition for B
+		{
+			collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = b3QuatRotate(ornB,childPosB)+posB;
+			b3Quat newOrnB = b3QuatMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+		} else
+		{
+			collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+		}
+		gpuHasCompoundSepNormalsOut[i] = 0; // default: no normal stored for this pair
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		int shapeTypeA = collidables[collidableIndexA].m_shapeType;
+		int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+		if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL)) // only hull-vs-hull pairs are tested; anything else leaves the flag at 0
+		{
+			return;
+		}
+		int hasSeparatingAxis = 5; // NOTE(review): assigned here and below but never read in this function
+		int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+		float dmin = FLT_MAX; // handed to findSeparatingAxisEdgeEdge by pointer
+		posA.w = 0.f; // zero the w components before the positions are used below
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1; // difference of the two transformed local centers (A minus B); only used by the edge-edge test
+		float4 sepNormal = make_float4(1,0,0,0); // starting value; passed to every helper below (presumably updated by them — TODO confirm)
+//		bool sepA = findSeparatingAxis(	convexShapes[shapeIndexA], convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+		bool sepA = findSeparatingAxis(	convexShapes[shapeIndexA], convexShapes[shapeIndexB],posA,ornA,posB,ornB,vertices,uniqueEdges,faces,indices,vertices,uniqueEdges,faces,indices,sepNormal);//,&dmin);
+		hasSeparatingAxis = 4;
+		if (!sepA) // first test returned false: stop, flag stays 0
+		{
+			hasSeparatingAxis = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxis(	convexShapes[shapeIndexB],convexShapes[shapeIndexA],posB,ornB,posA,ornA,vertices,uniqueEdges,faces,indices,vertices,uniqueEdges,faces,indices,sepNormal);//,&dmin); same test with the roles of A and B swapped
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			} else//(!sepB)
+			{
+				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin); // edge-edge test, reached only when sepA and sepB are both true
+				if (sepEE)
+				{
+						gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal); stored without normalization
+						gpuHasCompoundSepNormalsOut[i] = 1;
+				}//sepEE
+			}//(!sepB)
+		}//(!sepA)
+	}
+__kernel void   clipCompoundsHullHullKernel( __global const b3Int4* gpuCompoundPairs, // For pair i flagged in gpuHasCompoundSepNormalsOut: clips the two (child) hulls with clipHullAgainstHull, reduces the points with extractManifoldSequentialGlobal and writes one b3Contact4Data. Pair layout: (x=bodyA, y=bodyB, z=child shape of A, w=child shape of B), negative child index = body's own collidable.
+																					__global const b3RigidBodyData* rigidBodies, // m_pos, m_quat, m_collidableIdx and m_invMass are read
+																					__global const b3Collidable* collidables, // only m_shapeIndex is read here
+																					__global const b3ConvexPolyhedronData* convexShapes, // indexed by the collidable's m_shapeIndex
+																					__global const b3AlignedObjectArray<b3Float4>& vertices, // forwarded to clipHullAgainstHull (for both hulls)
+																					__global const b3AlignedObjectArray<b3Float4>& uniqueEdges, // NOTE(review): not referenced in this function
+																					__global const b3AlignedObjectArray<b3GpuFace>& faces, // forwarded to clipHullAgainstHull (for both hulls)
+																					__global const b3AlignedObjectArray<int>& indices, // forwarded to clipHullAgainstHull (for both hulls)
+																					__global const b3GpuChildShape* gpuChildShapes, // m_shapeIndex, m_childPosition and m_childOrientation are read
+																					__global const b3AlignedObjectArray<b3Float4>& gpuCompoundSepNormalsOut, // in: per-pair normal
+																					__global const b3AlignedObjectArray<int>& gpuHasCompoundSepNormalsOut, // in: nonzero when pair i has a normal
+																					__global struct b3Contact4Data* globalContactsOut, // out: contact array
+																					int* nGlobalContactsOut, // in/out: contact counter, advanced with b3AtomicInc
+																					int numCompoundPairs, int maxContactCapacity, int i) // i: index of the pair to process
+//	int i = get_global_id(0); (disabled: i arrives as a parameter instead)
+	int pairIndex = i; // copy of i; the point-copy loop below declares its own i
+	float4 worldVertsB1[64]; // scratch buffers handed to clipHullAgainstHull
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;	
+	float4 localContactsOut[64]; // receives the clipped contact points
+	int localContactCapacity=64;
+	float minDist = -1e30f; // distance bounds forwarded to clipHullAgainstHull
+	float maxDist = 0.0f;
+	if (i<numCompoundPairs)
+	{
+		if (gpuHasCompoundSepNormalsOut[i]) // pairs without a stored normal are skipped
+		{
+			int bodyIndexA = gpuCompoundPairs[i].x;
+			int bodyIndexB = gpuCompoundPairs[i].y;
+			int childShapeIndexA = gpuCompoundPairs[i].z;
+			int childShapeIndexB = gpuCompoundPairs[i].w;
+			int collidableIndexA = -1;
+			int collidableIndexB = -1;
+			b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+			if (childShapeIndexA >= 0) // A refers to a child shape: compose the child transform with the body transform
+			{
+				collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+				float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+				b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+				float4 newPosA = b3QuatRotate(ornA,childPosA)+posA;
+				b3Quat newOrnA = b3QuatMul(ornA,childOrnA);
+				posA = newPosA;
+				ornA = newOrnA;
+			} else
+			{
+				collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; // no child: use the body's own collidable and transform
+			}
+			if (childShapeIndexB>=0) // same composition for B
+			{
+				collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+				float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+				b3Quat  childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+				float4 newPosB = b3QuatRotate(ornB,childPosB)+posB;
+				b3Quat  newOrnB = b3QuatMul(ornB,childOrnB);
+				posB = newPosB;
+				ornB = newOrnB;
+			} else
+			{
+				collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+			}
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+			int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i],
+														convexShapes[shapeIndexA], convexShapes[shapeIndexB],
+														posA,ornA,
+													  posB,ornB,
+													  worldVertsB1,worldVertsB2,capacityWorldVerts,
+														minDist, maxDist,
+														vertices,faces,indices,
+														vertices,faces,indices,
+														localContactsOut,localContactCapacity);
+		if (numLocalContactsOut>0)
+		{
+				float4 normal = -gpuCompoundSepNormalsOut[i]; // negated stored normal, used for the manifold reduction
+				int nPoints = numLocalContactsOut;
+				float4* pointsIn = localContactsOut;
+				b3Int4 contactIdx;// = {-1,-1,-1,-1};
+				contactIdx.s[0] = 0; // default selection: the first four points
+				contactIdx.s[1] = 1;
+				contactIdx.s[2] = 2;
+				contactIdx.s[3] = 3;
+				int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);
+				int dstIdx;
+				dstIdx = b3AtomicInc( nGlobalContactsOut); // NOTE(review): the counter is advanced even when the capacity check below fails
+				if ((dstIdx+nReducedContacts) < maxContactCapacity) // NOTE(review): adds nReducedContacts although only the single slot at dstIdx is written
+				{
+					__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;
+					c->m_worldNormalOnB = -normal; // double negation: equals gpuCompoundSepNormalsOut[i]
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // restitution 0 and friction 0.7, each scaled by 0xffff
+					c->m_batchIdx = pairIndex;
+					int bodyA = gpuCompoundPairs[pairIndex].x;
+					int bodyB = gpuCompoundPairs[pairIndex].y;
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; // index is negated when m_invMass is 0 (index 0 is unchanged by negation)
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+					c->m_childIndexA = childShapeIndexA;
+					c->m_childIndexB = childShapeIndexB;
+					for (int i=0;i<nReducedContacts;i++) // this i shadows the parameter i
+					{
+						c->m_worldPosB[i] = pointsIn[contactIdx.s[i]];
+					}
+					b3Contact4Data_setNumPoints(c,nReducedContacts);
+				}
+			}//		if (numContactsOut>0)
+		}//		if (gpuHasCompoundSepNormalsOut[i])
+	}//	if (i<numCompoundPairs)
+void computeContactCompoundCompound(int pairIndex, // Host-side driver: collects child-shape pairs with findCompoundPairsKernel, then calls processCompoundPairsKernel and clipCompoundsHullHullKernel once per collected pair.
+																int bodyIndexA, int bodyIndexB, // forwarded to findCompoundPairsKernel
+																int collidableIndexA, int collidableIndexB, // forwarded to findCompoundPairsKernel
+																const b3RigidBodyData* rigidBodies, // forwarded to all three kernels
+																const b3Collidable* collidables, // forwarded to all three kernels
+																const b3ConvexPolyhedronData* convexShapes, // forwarded to all three kernels
+																const b3GpuChildShape* cpuChildShapes, // forwarded to all three kernels
+																const b3AlignedObjectArray<b3Aabb>& hostAabbsWorldSpace, // forwarded to findCompoundPairsKernel
+																const b3AlignedObjectArray<b3Aabb>& hostAabbsLocalSpace, // forwarded to findCompoundPairsKernel
+																const b3AlignedObjectArray<b3Vector3>& convexVertices, // forwarded to all three kernels
+																const b3AlignedObjectArray<b3Vector3>& hostUniqueEdges, // forwarded to the process and clip kernels
+																const b3AlignedObjectArray<int>& convexIndices, // forwarded to the process and clip kernels
+																const b3AlignedObjectArray<b3GpuFace>& faces, // forwarded to the process and clip kernels
+																b3Contact4* globalContactsOut, // out: handed to clipCompoundsHullHullKernel
+																int& nGlobalContactsOut, // in/out: contact counter, passed by address to clipCompoundsHullHullKernel
+																int maxContactCapacity, // forwarded to clipCompoundsHullHullKernel
+																b3AlignedObjectArray<b3QuantizedBvhNode>&	treeNodesCPU, // forwarded to findCompoundPairsKernel
+																b3AlignedObjectArray<b3BvhSubtreeInfo>&	subTreesCPU, // forwarded to findCompoundPairsKernel
+																b3AlignedObjectArray<b3BvhInfo>&	bvhInfoCPU // forwarded to findCompoundPairsKernel
+																)
+	int shapeTypeB = collidables[collidableIndexB].m_shapeType; // NOTE(review): not used by the active code below
+	b3AlignedObjectArray<b3Int4> cpuCompoundPairsOut; // temporary pair list filled by findCompoundPairsKernel
+	int numCompoundPairsOut=0;
+	int maxNumCompoundPairsCapacity = 8192;//1024; fixed size of the temporary pair list
+	cpuCompoundPairsOut.resize(maxNumCompoundPairsCapacity);
+	// work-in-progress
+	findCompoundPairsKernel( 
+							pairIndex,
+							bodyIndexA,bodyIndexB,
+							collidableIndexA,collidableIndexB,
+							rigidBodies, 
+							collidables,
+							convexShapes, 
+							convexVertices,
+							hostAabbsWorldSpace,
+							hostAabbsLocalSpace,
+							cpuChildShapes,
+							&cpuCompoundPairsOut[0],
+							&numCompoundPairsOut,
+							maxNumCompoundPairsCapacity	,
+							treeNodesCPU,
+							subTreesCPU,
+							bvhInfoCPU
+							);
+	printf("maxNumAabbChecks=%d\n",maxNumAabbChecks); // NOTE(review): unconditional debug output on every call; maxNumAabbChecks is declared outside this function
+	if (numCompoundPairsOut>maxNumCompoundPairsCapacity) // the count can exceed the capacity: clamp it to the number of entries the list can hold
+	{
+		b3Error("numCompoundPairsOut exceeded maxNumCompoundPairsCapacity (%d)\n",maxNumCompoundPairsCapacity);
+		numCompoundPairsOut=maxNumCompoundPairsCapacity;
+	}
+	b3AlignedObjectArray<b3Float4> cpuCompoundSepNormalsOut; // per-pair normal, written by processCompoundPairsKernel
+	b3AlignedObjectArray<int> cpuHasCompoundSepNormalsOut; // per-pair flag, written by processCompoundPairsKernel
+	cpuCompoundSepNormalsOut.resize(numCompoundPairsOut);
+	cpuHasCompoundSepNormalsOut.resize(numCompoundPairsOut);
+	for (int i=0;i<numCompoundPairsOut;i++) // pass 1: separating-axis tests, one call per pair
+	{
+		processCompoundPairsKernel(&cpuCompoundPairsOut[0],rigidBodies,collidables,convexShapes,convexVertices,hostUniqueEdges,faces,convexIndices,0,cpuChildShapes, // the literal 0 fills the aabbs parameter
+			cpuCompoundSepNormalsOut,cpuHasCompoundSepNormalsOut,numCompoundPairsOut,i);
+	}
+	for (int i=0;i<numCompoundPairsOut;i++) // pass 2: clipping and contact output, one call per pair
+	{
+		clipCompoundsHullHullKernel(&cpuCompoundPairsOut[0],rigidBodies,collidables,convexShapes,convexVertices,hostUniqueEdges,faces,convexIndices,cpuChildShapes,
+			cpuCompoundSepNormalsOut,cpuHasCompoundSepNormalsOut,globalContactsOut,&nGlobalContactsOut,numCompoundPairsOut,maxContactCapacity,i);
+	}
+		/*
+		int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+					float4 posA = rigidBodies[bodyIndexA].m_pos;
+					b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+					float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+					b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+					float4 newPosA = b3QuatRotate(ornA,childPosA)+posA;
+					b3Quat newOrnA = b3QuatMul(ornA,childOrnA);
+					int shapeIndexA = collidables[childColIndexA].m_shapeIndex;
+			bool foundSepAxis = findSeparatingAxis(hullA,hullB,
+							posA,
+							ornA,
+							posB,
+							ornB,
+							convexVertices,uniqueEdges,faces,convexIndices,
+							convexVertices,uniqueEdges,faces,convexIndices,
+							sepNormalWorldSpace
+							);
+							*/
+	/*
+	if (foundSepAxis)
+	{
+		contactIndex = clipHullHullSingle(
+			bodyIndexA, bodyIndexB,
+						   posA,ornA,
+						   posB,ornB,
+			collidableIndexA, collidableIndexB,
+			&rigidBodies, 
+			&globalContactsOut,
+			nGlobalContactsOut,
+			convexShapes,
+			convexShapes,
+			convexVertices, 
+			uniqueEdges, 
+			faces,
+			convexIndices,
+			convexVertices,
+			uniqueEdges,
+			faces,
+			convexIndices,
+			collidables,
+			collidables,
+			sepNormalWorldSpace,
+			maxContactCapacity);
+	}
+	*/
+//	return contactIndex;
+	/*
+	int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+	for (int c=0;c<numChildrenB;c++)
+	{
+		int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+c;
+		int childColIndexB = cpuChildShapes[childShapeIndexB].m_shapeIndex;
+		float4 rootPosB = rigidBodies[bodyIndexB].m_pos;
+		b3Quaternion rootOrnB = rigidBodies[bodyIndexB].m_quat;
+		b3Vector3 childPosB = cpuChildShapes[childShapeIndexB].m_childPosition;
+		b3Quaternion childOrnB = cpuChildShapes[childShapeIndexB].m_childOrientation;
+		float4  posB = b3QuatRotate(rootOrnB,childPosB)+rootPosB;
+		b3Quaternion ornB = b3QuatMul(rootOrnB,childOrnB);//b3QuatMul(ornB,childOrnB);
+		int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+		const b3ConvexPolyhedronData* hullB = &convexShapes[shapeIndexB];
+	}
+	*/
+void computeContactPlaneCompound(int pairIndex, // Plane (body A) vs compound (body B): for every child hull of B, gathers the hull vertices on the negative side of A's plane and appends at most one b3Contact4 per child. pairIndex is stored as m_batchIdx.
+																int bodyIndexA, int bodyIndexB, // A supplies the plane transform, B the compound's root transform
+																int collidableIndexA, int collidableIndexB, // A: its m_shapeIndex selects the plane in faces; B: supplies the child-shape range
+																const b3RigidBodyData* rigidBodies, // m_pos, m_quat and m_invMass are read
+																const b3Collidable* collidables, // m_shapeIndex, m_shapeType and m_numChildShapes are read
+																const b3ConvexPolyhedronData* convexShapes, // m_numVertices and m_vertexOffset of each child hull are read
+																const b3GpuChildShape* cpuChildShapes, // m_shapeIndex, m_childPosition and m_childOrientation are read
+																const b3Vector3* convexVertices, // hull vertices, indexed from the hull's m_vertexOffset
+																const int* convexIndices, // NOTE(review): not referenced in this function
+																const b3GpuFace* faces, // only m_plane of A's entry is read
+																b3Contact4* globalContactsOut, // out: contact array
+																int& nGlobalContactsOut, // in/out: number of contacts written so far
+																int maxContactCapacity) // no contact is written once nGlobalContactsOut reaches this value
+	int shapeTypeB = collidables[collidableIndexB].m_shapeType; // NOTE(review): not used below
+	int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+	for (int c=0;c<numChildrenB;c++) // one iteration per child shape of B
+	{
+		int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+c; // children are stored consecutively, starting at the compound's m_shapeIndex
+		int childColIndexB = cpuChildShapes[childShapeIndexB].m_shapeIndex;
+		float4 rootPosB = rigidBodies[bodyIndexB].m_pos;
+		b3Quaternion rootOrnB = rigidBodies[bodyIndexB].m_quat;
+		b3Vector3 childPosB = cpuChildShapes[childShapeIndexB].m_childPosition;
+		b3Quaternion childOrnB = cpuChildShapes[childShapeIndexB].m_childOrientation;
+		float4  posB = b3QuatRotate(rootOrnB,childPosB)+rootPosB; // child offset rotated by the root orientation, then translated by the root position
+		b3Quaternion ornB = rootOrnB*childOrnB;//b3QuatMul(ornB,childOrnB);
+		int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+		const b3ConvexPolyhedronData* hullB = &convexShapes[shapeIndexB];
+		b3Vector3 posA = rigidBodies[bodyIndexA].m_pos;
+		b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat;
+		int numContactsOut = 0; // NOTE(review): not used below
+		int numWorldVertsB1= 0; // NOTE(review): not used below
+		b3Vector3 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane; // xyz is used as the plane normal and w as the plane constant below
+		b3Vector3 planeNormal=b3MakeVector3(planeEq.x,planeEq.y,planeEq.z);
+		b3Vector3 planeNormalWorld = b3QuatRotate(ornA,planeNormal);
+		float planeConstant = planeEq.w;
+		b3Transform convexWorldTransform;
+		convexWorldTransform.setIdentity();
+		convexWorldTransform.setOrigin(posB);
+		convexWorldTransform.setRotation(ornB);
+		b3Transform planeTransform;
+		planeTransform.setIdentity();
+		planeTransform.setOrigin(posA);
+		planeTransform.setRotation(ornA);
+		b3Transform planeInConvex;
+		planeInConvex= convexWorldTransform.inverse() * planeTransform;
+		b3Transform convexInPlane;
+		convexInPlane = planeTransform.inverse() * convexWorldTransform; // NOTE(review): assigned but never read
+		b3Vector3 planeNormalInConvex = planeInConvex.getBasis()*-planeNormal; // negated plane normal rotated by planeInConvex
+		float maxDot = -1e30;
+		int hitVertex=-1; // NOTE(review): hitVertex and hitVtx are assigned in the loop but never read
+		b3Vector3 hitVtx;
+		b3Vector3 contactPoints[MAX_PLANE_CONVEX_POINTS];
+		int numPoints = 0;
+		b3Int4 contactIdx;
+		contactIdx.s[0] = 0; // default selection: the first four points
+		contactIdx.s[1] = 1;
+		contactIdx.s[2] = 2;
+		contactIdx.s[3] = 3;
+		for (int i=0;i<hullB->m_numVertices;i++)
+		{
+			b3Vector3 vtx = convexVertices[hullB->m_vertexOffset+i];
+			float curDot = vtx.dot(planeNormalInConvex);
+			if (curDot>maxDot) // new maximum of the dot product with planeNormalInConvex
+			{
+				hitVertex=i;
+				maxDot=curDot;
+				hitVtx = vtx;
+				//make sure the deepest points is always included
+				if (numPoints==MAX_PLANE_CONVEX_POINTS)
+					numPoints--; // frees the last slot so this vertex can still be stored below
+			}
+			if (numPoints<MAX_PLANE_CONVEX_POINTS)
+			{
+				b3Vector3 vtxWorld = convexWorldTransform*vtx;
+				b3Vector3 vtxInPlane = planeTransform.inverse()*vtxWorld; // NOTE(review): the inverse is recomputed for every vertex
+				float dist = planeNormal.dot(vtxInPlane)-planeConstant;
+				if (dist<0.f) // keep only vertices on the negative side of the plane
+				{
+					vtxWorld.w = dist; // w carries the (negative) distance
+					contactPoints[numPoints] = vtxWorld;
+					numPoints++;
+				}
+			}
+		}
+		int numReducedPoints  = 0;
+		numReducedPoints = numPoints;
+		if (numPoints>4) // more than four candidates: reduce; otherwise the default contactIdx 0..3 is used as is
+		{
+			numReducedPoints = extractManifoldSequentialGlobal( contactPoints, numPoints, planeNormalInConvex, &contactIdx); // NOTE(review): contactPoints hold vtxWorld values while the normal passed is planeNormalInConvex — confirm this is intended
+		}
+		int dstIdx;
+	//    dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx );
+		if (numReducedPoints>0)
+		{
+			if (nGlobalContactsOut < maxContactCapacity) // silently drops the contact when the output array is full
+			{
+				dstIdx=nGlobalContactsOut;
+				nGlobalContactsOut++;
+				b3Contact4* c = &globalContactsOut[dstIdx];
+				c->m_worldNormalOnB = -planeNormalWorld;
+				c->setFrictionCoeff(0.7);
+				c->setRestituitionCoeff(0.f);
+				c->m_batchIdx = pairIndex;
+				c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; // index is negated when m_invMass is 0
+				c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+				for (int i=0;i<numReducedPoints;i++)
+				{
+					b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]];
+					c->m_worldPosB[i] = pOnB1;
+				}
+				c->m_worldNormalOnB.w = (b3Scalar)numReducedPoints; // the point count is stored in the normal's w component
+			}//if (dstIdx < numPairs)
+		}	
+	}
+void	computeContactSphereConvex(int pairIndex, // Sphere (body A) vs convex hull (body B): moves the sphere center into B's local frame, scans B's faces while tracking the largest signed distance, and appends a single-point b3Contact4 when that distance minus the radius is negative. pairIndex is stored as m_batchIdx.
+																int bodyIndexA, int bodyIndexB, // A is the sphere, B the hull
+																int collidableIndexA, int collidableIndexB, // A: supplies m_radius; B: NOTE(review) not referenced, B's collidable is re-read from rigidBodies
+																const b3RigidBodyData* rigidBodies, // m_pos, m_quat, m_collidableIdx and m_invMass are read
+																const b3Collidable* collidables, // m_radius and m_shapeIndex are read
+																const b3ConvexPolyhedronData* convexShapes, // m_numFaces, m_faceOffset and m_vertexOffset are read
+																const b3Vector3* convexVertices, // passed to IsPointInPolygon, offset by the hull's m_vertexOffset
+																const int* convexIndices, // passed to IsPointInPolygon
+																const b3GpuFace* faces, // face records, indexed from the hull's m_faceOffset
+																b3Contact4* globalContactsOut, // out: contact array
+																int& nGlobalContactsOut, // in/out: number of contacts written so far
+																int maxContactCapacity) // no contact is written once nGlobalContactsOut reaches this value
+	float radius = collidables[collidableIndexA].m_radius;
+	float4 spherePos1 = rigidBodies[bodyIndexA].m_pos;
+	b3Quaternion sphereOrn = rigidBodies[bodyIndexA].m_quat; // NOTE(review): not used below
+	float4 pos = rigidBodies[bodyIndexB].m_pos;
+	b3Quaternion quat = rigidBodies[bodyIndexB].m_quat;
+	b3Transform tr; // transform built from B's position and orientation
+	tr.setIdentity();
+	tr.setOrigin(pos);
+	tr.setRotation(quat);
+	b3Transform trInv = tr.inverse();
+	float4 spherePos = trInv(spherePos1); // sphere center expressed in B's local frame
+	int collidableIndex = rigidBodies[bodyIndexB].m_collidableIdx;
+	int shapeIndex = collidables[collidableIndex].m_shapeIndex;
+	int numFaces = convexShapes[shapeIndex].m_numFaces;
+	float4 closestPnt = b3MakeVector3(0, 0, 0, 0);
+	float4 hitNormalWorld = b3MakeVector3(0, 0, 0, 0); // NOTE(review): only mentioned in a comment below, never read
+	float minDist = -1000000.f; // TODO: What is the largest/smallest float?
+	bool bCollide = true;
+	int region = -1; // records which branch produced the current result (1, 2 or 3); only read by the commented-out printf below
+	float4 localHitNormal;
+	for ( int f = 0; f < numFaces; f++ )
+	{
+		b3GpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];
+		float4 planeEqn;
+		float4 localPlaneNormal = b3MakeVector3(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+		float4 n1 = localPlaneNormal;//quatRotate(quat,localPlaneNormal); the rotation is disabled, so the normal is used unrotated
+		planeEqn = n1;
+		planeEqn[3] = face.m_plane.w;
+		float4 pntReturn;
+		float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);
+		if ( dist > radius) // center is farther than one radius from this face plane: report no collision
+		{
+			bCollide = false;
+			break;
+		}
+		if ( dist > 0 )
+		{
+			//might hit an edge or vertex
+			b3Vector3 out; // filled by IsPointInPolygon; used below as the contact point when the test fails (presumably the closest point on the polygon — TODO confirm)
+			bool isInPoly = IsPointInPolygon(spherePos,
+					&face,
+					&convexVertices[convexShapes[shapeIndex].m_vertexOffset],
+					convexIndices,
+                    &out);
+			if (isInPoly)
+			{
+				if (dist>minDist)
+				{
+					minDist = dist;
+					closestPnt = pntReturn;
+					localHitNormal = planeEqn;
+					region=1;
+				}
+			} else
+			{
+				b3Vector3 tmp = spherePos-out;
+				b3Scalar l2 = tmp.length2();
+				if (l2<radius*radius) // 'out' lies within one radius of the sphere center
+				{
+					dist  = b3Sqrt(l2);
+					if (dist>minDist)
+					{
+						minDist = dist;
+						closestPnt = out;
+						localHitNormal = tmp/dist; // unit vector from 'out' towards the sphere center
+						region=2;
+					}
+				} else
+				{
+					bCollide = false;
+					break;
+				}
+			}
+		}
+		else // dist <= 0
+		{
+			if ( dist > minDist )
+			{
+				minDist = dist;
+				closestPnt = pntReturn;
+				localHitNormal = planeEqn;
+				region=3;
+			}
+		}
+	}
+	static int numChecks = 0; // NOTE(review): incremented on every call but never read here
+	numChecks++;
+	if (bCollide && minDist > -10000) // minDist must have been raised from its initial -1000000 by at least one face
+	{
+		float4 normalOnSurfaceB1 = tr.getBasis()*localHitNormal;//-hitNormalWorld;
+		float4 pOnB1 = tr(closestPnt); // closest point transformed by B's transform
+		//printf("dist ,%f,",minDist);
+		float actualDepth = minDist-radius;
+		if (actualDepth<0) // a contact is emitted only for negative depth
+		{
+		//printf("actualDepth = ,%f,", actualDepth);
+		//printf("normalOnSurfaceB1 = ,%f,%f,%f,", normalOnSurfaceB1.x,normalOnSurfaceB1.y,normalOnSurfaceB1.z);
+		//printf("region=,%d,\n", region);
+		pOnB1[3] = actualDepth; // the depth is stored in the point's fourth component
+		int dstIdx;
+//    dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx );
+		if (nGlobalContactsOut < maxContactCapacity) // silently drops the contact when the output array is full
+		{
+			dstIdx=nGlobalContactsOut;
+			nGlobalContactsOut++;
+			b3Contact4* c = &globalContactsOut[dstIdx];
+			c->m_worldNormalOnB = normalOnSurfaceB1;
+			c->setFrictionCoeff(0.7);
+			c->setRestituitionCoeff(0.f);
+			c->m_batchIdx = pairIndex;
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; // index is negated when m_invMass is 0
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+			c->m_worldPosB[0] = pOnB1;
+			int numPoints = 1;
+			c->m_worldNormalOnB.w = (b3Scalar)numPoints; // the point count is stored in the normal's w component
+		}//if (dstIdx < numPairs)
+		}
+	}//if (hasCollision)
+#include "b3GjkPairDetector.h"
+#include "b3GjkEpa.h"
+#include "b3VoronoiSimplexSolver.h"
+int computeContactConvexConvex( b3AlignedObjectArray<b3Int4>& pairs, // Convex-vs-convex contact via b3GjkPairDetector: calls getClosestPoints for the two hulls and, when it returns true and capacity allows, appends one b3Contact4 and records the axis for the pair. Returns the index of the contact written, or -1. Only pairs[pairIndex].z is read from pairs.
+																int pairIndex, // indexes pairs, hostHasSepAxis and hostSepAxis
+																int bodyIndexA, int bodyIndexB, // indexes into rigidBodies
+																int collidableIndexA, int collidableIndexB, // their m_shapeIndex selects the hulls in convexShapes
+																const b3AlignedObjectArray<b3RigidBodyData>& rigidBodies, // m_pos, m_quat and m_invMass are read
+																const b3AlignedObjectArray<b3Collidable>& collidables, // only m_shapeIndex is read
+																const b3AlignedObjectArray<b3ConvexPolyhedronData>& convexShapes, // passed to getClosestPoints
+																const b3AlignedObjectArray<b3Vector3>& convexVertices, // passed to getClosestPoints for both hulls
+																const b3AlignedObjectArray<b3Vector3>& uniqueEdges, // NOTE(review): not referenced by the active code
+																const b3AlignedObjectArray<int>& convexIndices, // NOTE(review): not referenced by the active code
+																const b3AlignedObjectArray<b3GpuFace>& faces, // NOTE(review): not referenced by the active code
+																b3AlignedObjectArray<b3Contact4>& globalContactsOut, // out: grown by one element per contact written
+																int& nGlobalContactsOut, // in/out: number of contacts written so far
+																int maxContactCapacity, // b3Error is raised instead of writing once nGlobalContactsOut reaches this value
+																b3AlignedObjectArray<int>&hostHasSepAxis, // out: [pairIndex] set to 1 when a contact is written
+																b3AlignedObjectArray<b3Vector3>&hostSepAxis // out: [pairIndex] receives sepAxis2 when a contact is written
+							   //,const b3AlignedObjectArray<b3Contact4>& oldContacts
+																)
+	int contactIndex = -1; // return value; stays -1 unless a contact is written
+	b3VoronoiSimplexSolver simplexSolver;
+	b3GjkEpaSolver2 epaSolver;
+	b3GjkPairDetector gjkDetector(&simplexSolver,&epaSolver);
+	b3Transform transA; // only origin and rotation are set explicitly (no setIdentity call)
+	transA.setOrigin(rigidBodies[bodyIndexA].m_pos);
+	transA.setRotation(rigidBodies[bodyIndexA].m_quat);
+	b3Transform transB;
+	transB.setOrigin(rigidBodies[bodyIndexB].m_pos);
+	transB.setRotation(rigidBodies[bodyIndexB].m_quat);
+	float maximumDistanceSquared = 1e30f;
+	b3Vector3 resultPointOnBWorld;
+	b3Vector3 sepAxis2=b3MakeVector3(0,1,0); // starting value handed to getClosestPoints
+	b3Scalar distance2 = 1e30f;
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	int sz = sizeof(b3Contact4); // NOTE(review): not used below
+	bool result2 = getClosestPoints(&gjkDetector, transA, transB,
+		convexShapes[shapeIndexA], convexShapes[shapeIndexB],
+		convexVertices,convexVertices,
+		maximumDistanceSquared,
+		sepAxis2,
+		distance2,
+		resultPointOnBWorld);
+	if (result2)
+	{
+		if (nGlobalContactsOut<maxContactCapacity)
+		{
+			contactIndex = nGlobalContactsOut;
+			globalContactsOut.expand(); // grows the array by one; the element is then fetched at nGlobalContactsOut (assumes size()==nGlobalContactsOut beforehand — TODO confirm)
+			b3Contact4& newContact = globalContactsOut.at(nGlobalContactsOut);
+			newContact.m_batchIdx = 0;//i;
+			newContact.m_bodyAPtrAndSignBit = (rigidBodies.at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA; // index is negated when m_invMass is 0
+			newContact.m_bodyBPtrAndSignBit = (rigidBodies.at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB;
+			newContact.m_frictionCoeffCmp = 45874; // equals 0.7*0xffff truncated to an integer
+			newContact.m_restituitionCoeffCmp = 0;
+			int numPoints = 0;
+			if (pairs[pairIndex].z>=0)
+			{
+				//printf("add existing points?\n");
+				//refresh
+				int numOldPoints = 0;//oldContacts[pairs[pairIndex].z].getNPoints(); with the lookup disabled this is always 0, so the refresh below never runs
+				if (numOldPoints)
+				{
+					//newContact = oldContacts[pairs[pairIndex].z];
+					b3ContactCache::refreshContactPoints(transA,transB,newContact);
+				}
+				numPoints = b3Contact4Data_getNumPoints(&newContact); // NOTE(review): read from the freshly expanded contact, whose normal has not been assigned in this function yet
+			}
+			/*
+			int insertIndex = m_manifoldPtr->getCacheEntry(newPt);
+				if (insertIndex >= 0)
+				{
+					//const btManifoldPoint& oldPoint = m_manifoldPtr->getContactPoint(insertIndex);
+					m_manifoldPtr->replaceContactPoint(newPt,insertIndex);
+				} else
+				{
+					insertIndex = m_manifoldPtr->addManifoldPoint(newPt);
+				}
+			*/
+			int p=numPoints; // slot for the new point: the next free one while fewer than 4 exist, otherwise slot 3 is overwritten
+			if (numPoints<4)
+			{
+				numPoints++;
+			} else
+			{
+				p=3;
+			}
+			{
+				resultPointOnBWorld.w = distance2; // the distance is stored in the point's w component
+				newContact.m_worldPosB[p] = resultPointOnBWorld;
+				b3Vector3 resultPointOnAWorld = resultPointOnBWorld+distance2*sepAxis2; // point on A: point on B offset along the axis by the distance
+				newContact.m_localPosA[p] = transA.inverse()*resultPointOnAWorld;
+				newContact.m_localPosB[p] = transB.inverse()*resultPointOnBWorld;
+				newContact.m_worldNormalOnB = sepAxis2;
+				hostHasSepAxis[pairIndex] = 1;
+				hostSepAxis[pairIndex] =sepAxis2;
+				//printf("sepAxis[%d]=%f,%f,%f,%f\n",pairIndex,sepAxis2.x,sepAxis2.y,sepAxis2.z,sepAxis2.w);
+			}
+			//printf("bodyIndexA %d,bodyIndexB %d,normal=%f,%f,%f numPoints %d\n",bodyIndexA,bodyIndexB,normalOnSurfaceB.x,normalOnSurfaceB.y,normalOnSurfaceB.z,numPoints);
+			newContact.m_worldNormalOnB.w = (b3Scalar)numPoints; // the point count is stored in the normal's w component
+			nGlobalContactsOut++;
+		} else
+		{
+			b3Error("Error: exceeding contact capacity (%d/%d)\n", nGlobalContactsOut,maxContactCapacity);
+		}
+	}
+	return contactIndex;
+int computeContactConvexConvex2( // Convex-vs-convex contact for one body pair: runs findSeparatingAxis, then (if it returned true) clipHullHullSingle. Returns clipHullHullSingle's result, or -1 when it was not called.
+																int pairIndex, // not referenced anywhere in this body
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																const b3AlignedObjectArray<b3RigidBodyData>& rigidBodies, 
+																const b3AlignedObjectArray<b3Collidable>& collidables,
+																const b3AlignedObjectArray<b3ConvexPolyhedronData>& convexShapes,
+																const b3AlignedObjectArray<b3Vector3>& convexVertices,
+																const b3AlignedObjectArray<b3Vector3>& uniqueEdges,
+																const b3AlignedObjectArray<int>& convexIndices,
+																const b3AlignedObjectArray<b3GpuFace>& faces,
+																b3AlignedObjectArray<b3Contact4>& globalContactsOut, // forwarded by address to clipHullHullSingle
+																int& nGlobalContactsOut, // forwarded by reference to clipHullHullSingle
+																int maxContactCapacity,
+																const b3AlignedObjectArray<b3Contact4>& oldContacts // not referenced anywhere in this body
+																)
+	int contactIndex = -1; // stays -1 unless clipHullHullSingle is called below
+	b3Vector3 posA = rigidBodies[bodyIndexA].m_pos; // local copies of both bodies' position and orientation
+	b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat;
+	b3Vector3 posB = rigidBodies[bodyIndexB].m_pos;
+	b3Quaternion ornB = rigidBodies[bodyIndexB].m_quat;
+	b3ConvexPolyhedronData hullA, hullB;
+	b3Vector3 sepNormalWorldSpace; // written by findSeparatingAxis, then handed to clipHullHullSingle
+    b3Collidable colA = collidables[collidableIndexA]; // collidable -> m_shapeIndex -> entry of convexShapes
+    hullA = convexShapes[colA.m_shapeIndex];
+    //printf("numvertsA = %d\n",hullA.m_numVertices);
+    b3Collidable colB = collidables[collidableIndexB];
+    hullB = convexShapes[colB.m_shapeIndex];
+    //printf("numvertsB = %d\n",hullB.m_numVertices);
+	int contactCapacity = MAX_VERTS; // NOTE(review): never read in this block
+	int numContactsOut=0; // NOTE(review): never read in this block
+#ifdef _WIN32
+	b3Assert(_finite(rigidBodies[bodyIndexA].m_pos.x)); // Windows-only sanity check on the x component of each body position
+	b3Assert(_finite(rigidBodies[bodyIndexB].m_pos.x)); // NOTE(review): no matching #endif is visible in this view - confirm against the full file
+		bool foundSepAxis = findSeparatingAxis(hullA,hullB, // NOTE(review): presumably true means no separating axis exists (hulls overlap) - confirm against findSeparatingAxis
+							posA,
+							ornA,
+							posB,
+							ornB,
+							convexVertices,uniqueEdges,faces,convexIndices, // the same shared geometry arrays are passed twice,
+							convexVertices,uniqueEdges,faces,convexIndices, // presumably once per hull
+							sepNormalWorldSpace
+							);
+	if (foundSepAxis)
+	{
+		contactIndex = clipHullHullSingle( // its return value is handed back to the caller unchanged
+			bodyIndexA, bodyIndexB,
+						   posA,ornA,
+						   posB,ornB,
+			collidableIndexA, collidableIndexB,
+			&rigidBodies, 
+			&globalContactsOut,
+			nGlobalContactsOut,
+			convexShapes, // shape, geometry and collidable arrays are each passed twice with the same object,
+			convexShapes, // presumably one argument per body
+			convexVertices, 
+			uniqueEdges, 
+			faces,
+			convexIndices,
+			convexVertices,
+			uniqueEdges,
+			faces,
+			convexIndices,
+			collidables,
+			collidables,
+			sepNormalWorldSpace,
+			maxContactCapacity);
+	}
+	return contactIndex;
+void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* pairs, int nPairs,
+			const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
+			b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,
+			const b3OpenCLArray<b3Contact4>* oldContacts,
+			int maxContactCapacity,
+			int compoundPairCapacity,
+			const b3OpenCLArray<b3ConvexPolyhedronData>& convexData,
+			const b3OpenCLArray<b3Vector3>& gpuVertices,
+			const b3OpenCLArray<b3Vector3>& gpuUniqueEdges,
+			const b3OpenCLArray<b3GpuFace>& gpuFaces,
+			const b3OpenCLArray<int>& gpuIndices,
+			const b3OpenCLArray<b3Collidable>& gpuCollidables,
+			const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
+			const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace,
+			const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace,
+            b3OpenCLArray<b3Vector3>& worldVertsB1GPU,
+            b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
+            b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
+            b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
+            b3OpenCLArray<b3Vector3>& worldVertsB2GPU,    
+			b3AlignedObjectArray<class b3OptimizedBvh*>& bvhDataUnused,
+			b3OpenCLArray<b3QuantizedBvhNode>*	treeNodesGPU,
+			b3OpenCLArray<b3BvhSubtreeInfo>*	subTreesGPU,
+			b3OpenCLArray<b3BvhInfo>*	bvhInfo,
+			int numObjects,
+			int maxTriConvexPairCapacity,
+			b3OpenCLArray<b3Int4>& triangleConvexPairsOut,
+			int& numTriConvexPairsOut
+			)
+	myframecount++;
+	if (!nPairs)
+		return;
+	b3AlignedObjectArray<b3QuantizedBvhNode>	treeNodesCPU;
+	treeNodesGPU->copyToHost(treeNodesCPU);
+	b3AlignedObjectArray<b3BvhSubtreeInfo>	subTreesCPU;
+	subTreesGPU->copyToHost(subTreesCPU);
+	b3AlignedObjectArray<b3BvhInfo>	bvhInfoCPU;
+	bvhInfo->copyToHost(bvhInfoCPU);
+	b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace;
+	clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace);
+	b3AlignedObjectArray<b3Aabb> hostAabbsLocalSpace;
+	clAabbsLocalSpace.copyToHost(hostAabbsLocalSpace);
+	b3AlignedObjectArray<b3Int4> hostPairs;
+	pairs->copyToHost(hostPairs);
+	b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+	bodyBuf->copyToHost(hostBodyBuf);
+	b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData;
+	convexData.copyToHost(hostConvexData);
+	b3AlignedObjectArray<b3Vector3> hostVertices;
+	gpuVertices.copyToHost(hostVertices);
+	b3AlignedObjectArray<b3Vector3> hostUniqueEdges;
+	gpuUniqueEdges.copyToHost(hostUniqueEdges);
+	b3AlignedObjectArray<b3GpuFace> hostFaces;
+	gpuFaces.copyToHost(hostFaces);
+	b3AlignedObjectArray<int> hostIndices;
+	gpuIndices.copyToHost(hostIndices);
+	b3AlignedObjectArray<b3Collidable> hostCollidables;
+	gpuCollidables.copyToHost(hostCollidables);
+	b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes;
+	gpuChildShapes.copyToHost(cpuChildShapes);
+	b3AlignedObjectArray<b3Int4> hostTriangleConvexPairs;
+	b3AlignedObjectArray<b3Contact4> hostContacts;
+	if (nContacts)
+	{
+		contactOut->copyToHost(hostContacts);
+	}
+	b3AlignedObjectArray<b3Contact4> oldHostContacts;
+	if (oldContacts->size())
+	{
+		oldContacts->copyToHost(oldHostContacts);
+	}
+	hostContacts.resize(maxContactCapacity);
+	for (int i=0;i<nPairs;i++)
+	{
+		int bodyIndexA = hostPairs[i].x;
+		int bodyIndexB = hostPairs[i].y;
+		int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx;
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			computeContactSphereConvex(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0],
+				&hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+		}
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)
+		{
+			computeContactSphereConvex(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0],
+				&hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+			//printf("convex-sphere\n");
+		}
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_PLANE)
+		{
+			computeContactPlaneConvex(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0],
+			&hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("convex-plane\n");
+		}
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			computeContactPlaneConvex(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0],
+			&hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("plane-convex\n");
+		}
+			if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			computeContactCompoundCompound(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0],
+			&hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0], hostAabbsWorldSpace,hostAabbsLocalSpace,hostVertices,hostUniqueEdges,hostIndices,hostFaces,&hostContacts[0],
+			nContacts,maxContactCapacity,treeNodesCPU,subTreesCPU,bvhInfoCPU);	
+//			printf("convex-plane\n");
+		}
+				if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_PLANE)
+		{
+			computeContactPlaneCompound(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0],
+			&hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0], &hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("convex-plane\n");
+		}
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			computeContactPlaneCompound(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0],
+			&hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("plane-convex\n");
+		}
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			//printf("hostPairs[i].z=%d\n",hostPairs[i].z);
+			//int contactIndex = computeContactConvexConvex2(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf, hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts);
+			int contactIndex = computeContactConvexConvex(hostPairs,i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf,hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts);
+			if (contactIndex>=0)
+			{
+//				printf("convex convex contactIndex = %d\n",contactIndex);
+				hostPairs[i].z = contactIndex;
+			}
+//			printf("plane-convex\n");
+		}
+	}
+	if (hostPairs.size())
+	{
+		pairs->copyFromHost(hostPairs);
+	}
+	hostContacts.resize(nContacts);
+	if (nContacts)
+		{
+			contactOut->copyFromHost(hostContacts);
+		} else
+	{
+		contactOut->resize(0);
+		}
+		m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+		//printf("(HOST) nContacts = %d\n",nContacts);
+	{
+		if (nPairs)
+		{
+			m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+			B3_PROFILE("primitiveContactsKernel");
+			b3BufferInfoCL bInfo[] = {
+				b3BufferInfoCL( pairs->getBufferCL(), true ), 
+				b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+				b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				b3BufferInfoCL( convexData.getBufferCL(),true),
+				b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+				b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+				b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+				b3BufferInfoCL( contactOut->getBufferCL()),
+				b3BufferInfoCL( m_totalContactsOut.getBufferCL())	
+			};
+			b3LauncherCL launcher(m_queue, m_primitiveContactsKernel,"m_primitiveContactsKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst( nPairs  );
+			launcher.setConst(maxContactCapacity);
+			int num = nPairs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+			nContacts = m_totalContactsOut.at(0);
+			contactOut->resize(nContacts);
+		}
+	}
+	B3_PROFILE("computeConvexConvexContactsGPUSAT");
+   // printf("nContacts = %d\n",nContacts);
+	m_sepNormals.resize(nPairs);
+	m_hasSeparatingNormals.resize(nPairs);
+	int concaveCapacity=maxTriConvexPairCapacity;
+	m_concaveSepNormals.resize(concaveCapacity);
+	m_concaveHasSeparatingNormals.resize(concaveCapacity);
+	m_numConcavePairsOut.resize(0);
+	m_numConcavePairsOut.push_back(0);
+	m_gpuCompoundPairs.resize(compoundPairCapacity);
+	m_gpuCompoundSepNormals.resize(compoundPairCapacity);
+	m_gpuHasCompoundSepNormals.resize(compoundPairCapacity);
+	m_numCompoundPairsOut.resize(0);
+	m_numCompoundPairsOut.push_back(0);
+	int numCompoundPairs = 0;
+	int numConcavePairs =0;
+	{
+		clFinish(m_queue);
+		if (findSeparatingAxisOnGpu)
+		{
+			m_dmins.resize(nPairs);
+			if (splitSearchSepAxisConvex)
+			{
+				if (useMprGpu)
+				{
+					nContacts = m_totalContactsOut.at(0);
+					{
+						B3_PROFILE("mprPenetrationKernel");
+						b3BufferInfoCL bInfo[] = { 
+							b3BufferInfoCL( pairs->getBufferCL(), true ), 
+							b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+							b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+							b3BufferInfoCL( convexData.getBufferCL(),true),
+							b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+							b3BufferInfoCL( m_sepNormals.getBufferCL()),
+							b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+							b3BufferInfoCL( contactOut->getBufferCL()),
+							b3BufferInfoCL( m_totalContactsOut.getBufferCL())
+						};
+						b3LauncherCL launcher(m_queue, m_mprPenetrationKernel,"mprPenetrationKernel");
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+						launcher.setConst(maxContactCapacity);
+						launcher.setConst( nPairs  );
+						int num = nPairs;
+						launcher.launch1D( num);
+						clFinish(m_queue);
+						/*
+						b3AlignedObjectArray<int>hostHasSepAxis;
+						m_hasSeparatingNormals.copyToHost(hostHasSepAxis);
+						b3AlignedObjectArray<b3Vector3>hostSepAxis;
+						m_sepNormals.copyToHost(hostSepAxis);
+						*/
+						nContacts = m_totalContactsOut.at(0);
+						contactOut->resize(nContacts);
+					//	printf("nContacts (after mprPenetrationKernel) = %d\n",nContacts);
+						if (nContacts>maxContactCapacity)
+						{
+							b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity);
+							nContacts = maxContactCapacity;
+						}
+					}
+				}
+				if (1)
+				{
+					if (1)
+					{
+					{
+						B3_PROFILE("findSeparatingAxisVertexFaceKernel");
+						b3BufferInfoCL bInfo[] = { 
+							b3BufferInfoCL( pairs->getBufferCL(), true ), 
+							b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+							b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+							b3BufferInfoCL( convexData.getBufferCL(),true),
+							b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+							b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+							b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+							b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+							b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+							b3BufferInfoCL( m_sepNormals.getBufferCL()),
+							b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+							b3BufferInfoCL( m_dmins.getBufferCL())
+						};
+						b3LauncherCL launcher(m_queue, m_findSeparatingAxisVertexFaceKernel,"findSeparatingAxisVertexFaceKernel");
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+						launcher.setConst( nPairs  );
+						int num = nPairs;
+						launcher.launch1D( num);
+						clFinish(m_queue);
+					}
+					int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3);
+					{
+						B3_PROFILE("findSeparatingAxisEdgeEdgeKernel");
+						b3BufferInfoCL bInfo[] = { 
+							b3BufferInfoCL( pairs->getBufferCL(), true ), 
+							b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+							b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+							b3BufferInfoCL( convexData.getBufferCL(),true),
+							b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+							b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+							b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+							b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+							b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+							b3BufferInfoCL( m_sepNormals.getBufferCL()),
+							b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+							b3BufferInfoCL( m_dmins.getBufferCL()),
+							b3BufferInfoCL( m_unitSphereDirections.getBufferCL(),true)
+						};
+						b3LauncherCL launcher(m_queue, m_findSeparatingAxisEdgeEdgeKernel,"findSeparatingAxisEdgeEdgeKernel");
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+						launcher.setConst( numDirections);
+						launcher.setConst( nPairs  );
+						int num = nPairs;
+						launcher.launch1D( num);
+						clFinish(m_queue);
+					}
+					}
+					if (useMprGpu)
+					{
+						B3_PROFILE("findSeparatingAxisUnitSphereKernel");
+						b3BufferInfoCL bInfo[] = { 
+								b3BufferInfoCL( pairs->getBufferCL(), true ), 
+								b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+								b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+								b3BufferInfoCL( convexData.getBufferCL(),true),
+								b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+								b3BufferInfoCL( m_unitSphereDirections.getBufferCL(),true),
+								b3BufferInfoCL( m_sepNormals.getBufferCL()),
+								b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+								b3BufferInfoCL( m_dmins.getBufferCL())
+						};
+						b3LauncherCL launcher(m_queue, m_findSeparatingAxisUnitSphereKernel,"findSeparatingAxisUnitSphereKernel");
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+						int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3);
+						launcher.setConst( numDirections);
+						launcher.setConst( nPairs  );
+						int num = nPairs;
+						launcher.launch1D( num);
+						clFinish(m_queue);
+					}
+			}
+			} else
+			{
+				B3_PROFILE("findSeparatingAxisKernel");
+				b3BufferInfoCL bInfo[] = { 
+					b3BufferInfoCL( pairs->getBufferCL(), true ), 
+					b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+					b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+					b3BufferInfoCL( convexData.getBufferCL(),true),
+					b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+					b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+					b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+					b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+					b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+					b3BufferInfoCL( m_sepNormals.getBufferCL()),
+					b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL())
+				};
+				b3LauncherCL launcher(m_queue, m_findSeparatingAxisKernel,"m_findSeparatingAxisKernel");
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst( nPairs  );
+				int num = nPairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+			}
+		}
+        else
+        {
+			B3_PROFILE("findSeparatingAxisKernel CPU");
+            b3AlignedObjectArray<b3Int4> hostPairs;
+            pairs->copyToHost(hostPairs);
+            b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+            bodyBuf->copyToHost(hostBodyBuf);
+            b3AlignedObjectArray<b3Collidable> hostCollidables;
+            gpuCollidables.copyToHost(hostCollidables);
+            b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes;
+            gpuChildShapes.copyToHost(cpuChildShapes);
+            b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexShapeData;
+            convexData.copyToHost(hostConvexShapeData);
+            b3AlignedObjectArray<b3Vector3> hostVertices;
+            gpuVertices.copyToHost(hostVertices);
+            b3AlignedObjectArray<int> hostHasSepAxis;
+            hostHasSepAxis.resize(nPairs);
+            b3AlignedObjectArray<b3Vector3> hostSepAxis;
+            hostSepAxis.resize(nPairs);
+            b3AlignedObjectArray<b3Vector3> hostUniqueEdges;
+            gpuUniqueEdges.copyToHost(hostUniqueEdges);
+            b3AlignedObjectArray<b3GpuFace> hostFaces;
+            gpuFaces.copyToHost(hostFaces);
+            b3AlignedObjectArray<int> hostIndices;
+            gpuIndices.copyToHost(hostIndices);
+			b3AlignedObjectArray<b3Contact4> hostContacts;
+			if (nContacts)
+			{
+				contactOut->copyToHost(hostContacts);
+			}
+			hostContacts.resize(maxContactCapacity);
+			int nGlobalContactsOut = nContacts;
+            for (int i=0;i<nPairs;i++)
+            {
+                int bodyIndexA = hostPairs[i].x;
+                int bodyIndexB = hostPairs[i].y;
+                int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx;
+                int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx;
+                int shapeIndexA = hostCollidables[collidableIndexA].m_shapeIndex;
+                int shapeIndexB = hostCollidables[collidableIndexB].m_shapeIndex;
+                hostHasSepAxis[i] = 0;
+                //once the broadphase avoids static-static pairs, we can remove this test
+                if ((hostBodyBuf[bodyIndexA].m_invMass==0) &&(hostBodyBuf[bodyIndexB].m_invMass==0))
+                {
+                    continue;
+                }
+                if ((hostCollidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(hostCollidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))
+                {
+                    continue;
+                }
+                float dmin = FLT_MAX;
+                b3ConvexPolyhedronData* convexShapeA = &hostConvexShapeData[shapeIndexA];
+                b3ConvexPolyhedronData* convexShapeB = &hostConvexShapeData[shapeIndexB];
+                b3Vector3 posA = hostBodyBuf[bodyIndexA].m_pos;
+                b3Vector3 posB = hostBodyBuf[bodyIndexB].m_pos;
+                b3Quaternion ornA =hostBodyBuf[bodyIndexA].m_quat;
+                b3Quaternion ornB =hostBodyBuf[bodyIndexB].m_quat;
+				if (useGjk)
+				{
+					//first approximate the separating axis, to 'fail-proof' GJK+EPA or MPR
+					{
+						b3Vector3 c0local = hostConvexShapeData[shapeIndexA].m_localCenter;
+						b3Vector3 c0 = b3TransformPoint(c0local, posA, ornA);
+						b3Vector3 c1local = hostConvexShapeData[shapeIndexB].m_localCenter;
+						b3Vector3 c1 = b3TransformPoint(c1local,posB,ornB);
+						b3Vector3 DeltaC2 = c0 - c1;
+						b3Vector3 sepAxis;
+						bool hasSepAxisA = b3FindSeparatingAxis(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2,
+							&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+							&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+											 &sepAxis, &dmin);
+						if (hasSepAxisA)
+						{
+							bool hasSepAxisB = b3FindSeparatingAxis(convexShapeB, convexShapeA, posB, ornB, posA, ornA, DeltaC2,
+																	&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+																	&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+																	&sepAxis, &dmin);
+							if (hasSepAxisB)
+							{
+								bool hasEdgeEdge =b3FindSeparatingAxisEdgeEdge(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2,
+															 &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+															 &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+															 &sepAxis, &dmin,false);
+								if (hasEdgeEdge)
+								{
+									hostHasSepAxis[i] = 1;
+									hostSepAxis[i] = sepAxis;
+									hostSepAxis[i].w = dmin;
+								}
+							}
+						}
+					}
+					if (hostHasSepAxis[i])
+					{
+						int pairIndex = i;
+						bool useMpr = true;
+						if (useMpr)
+						{
+							int res=0;
+							float depth = 0.f;
+							b3Vector3 sepAxis2 = b3MakeVector3(1,0,0);
+							b3Vector3 resultPointOnBWorld = b3MakeVector3(0,0,0);
+						float depthOut;
+						b3Vector3 dirOut;
+						b3Vector3 posOut;
+						//res = b3MprPenetration(bodyIndexA,bodyIndexB,hostBodyBuf,hostConvexShapeData,hostCollidables,hostVertices,&mprConfig,&depthOut,&dirOut,&posOut);
+						res = b3MprPenetration(pairIndex,bodyIndexA,bodyIndexB,&hostBodyBuf[0],&hostConvexShapeData[0],&hostCollidables[0],&hostVertices[0],&hostSepAxis[0],&hostHasSepAxis[0],&depthOut,&dirOut,&posOut);
+						depth = depthOut;
+						sepAxis2 =  b3MakeVector3(-dirOut.x,-dirOut.y,-dirOut.z);
+						resultPointOnBWorld = posOut;
+						//hostHasSepAxis[i] = 0;
+						if (res==0)
+						{
+							//add point?
+							//printf("depth = %f\n",depth);
+							//printf("normal = %f,%f,%f\n",dir.v[0],dir.v[1],dir.v[2]);
+							//qprintf("pos = %f,%f,%f\n",pos.v[0],pos.v[1],pos.v[2]);
+							float dist=0.f;
+							const b3ConvexPolyhedronData& hullA = hostConvexShapeData[hostCollidables[hostBodyBuf[bodyIndexA].m_collidableIdx].m_shapeIndex];
+							const b3ConvexPolyhedronData& hullB = hostConvexShapeData[hostCollidables[hostBodyBuf[bodyIndexB].m_collidableIdx].m_shapeIndex];
+							if(b3TestSepAxis( &hullA, &hullB, posA,ornA,posB,ornB,&sepAxis2, &hostVertices[0], &hostVertices[0],&dist))
+							{
+								if (depth > dist)
+								{
+									float diff = depth - dist;
+									static float maxdiff = 0.f;
+									if (maxdiff < diff)
+									{
+										maxdiff = diff;
+										printf("maxdiff = %20.10f\n",maxdiff);
+									}
+								}
+							}
+							if (depth > dmin)
+							{
+								b3Vector3 oldAxis = hostSepAxis[i];
+								depth = dmin;
+								sepAxis2 = oldAxis;
+							}
+							if(b3TestSepAxis( &hullA, &hullB, posA,ornA,posB,ornB,&sepAxis2, &hostVertices[0], &hostVertices[0],&dist))
+							{
+								if (depth > dist)
+								{
+									float diff = depth - dist;
+									//printf("?diff  = %f\n",diff );
+									static float maxdiff = 0.f;
+									if (maxdiff < diff)
+									{
+										maxdiff = diff;
+										printf("maxdiff = %20.10f\n",maxdiff);
+									}
+								}
+								//this is used for SAT
+								//hostHasSepAxis[i] = 1;
+								//hostSepAxis[i] = sepAxis2;
+								//add contact point
+								int contactIndex = nGlobalContactsOut;
+								b3Contact4& newContact = hostContacts.at(nGlobalContactsOut);
+								nGlobalContactsOut++;
+								newContact.m_batchIdx = 0;//i;
+								newContact.m_bodyAPtrAndSignBit = (hostBodyBuf.at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA;
+								newContact.m_bodyBPtrAndSignBit = (hostBodyBuf.at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB;
+								newContact.m_frictionCoeffCmp = 45874;
+								newContact.m_restituitionCoeffCmp = 0;
+								static float maxDepth = 0.f;
+								if (depth > maxDepth)
+								{
+									maxDepth  = depth;
+									printf("MPR maxdepth = %f\n",maxDepth );
+								}
+								resultPointOnBWorld.w = -depth;
+								newContact.m_worldPosB[0] = resultPointOnBWorld;
+								b3Vector3 resultPointOnAWorld = resultPointOnBWorld+depth*sepAxis2;
+								newContact.m_worldNormalOnB = sepAxis2;
+								newContact.m_worldNormalOnB.w = (b3Scalar)1;
+							} else
+							{
+								printf("rejected\n");
+							}
+						}
+						} else
+						{
+						int result = computeContactConvexConvex( hostPairs,
+													   pairIndex,
+													bodyIndexA, bodyIndexB,
+													   collidableIndexA, collidableIndexB,
+													   hostBodyBuf,
+													   hostCollidables,
+													   hostConvexShapeData,
+													   hostVertices,
+													   hostUniqueEdges,
+													   hostIndices,
+													   hostFaces,
+													   hostContacts,
+													   nGlobalContactsOut,
+														maxContactCapacity,
+														hostHasSepAxis,
+														hostSepAxis
+																);
+						}//mpr
+					}//hostHasSepAxis[i] = 1;
+				} else
+				{
+					b3Vector3 c0local = hostConvexShapeData[shapeIndexA].m_localCenter;
+					b3Vector3 c0 = b3TransformPoint(c0local, posA, ornA);
+					b3Vector3 c1local = hostConvexShapeData[shapeIndexB].m_localCenter;
+					b3Vector3 c1 = b3TransformPoint(c1local,posB,ornB);
+					b3Vector3 DeltaC2 = c0 - c1;
+					b3Vector3 sepAxis;
+					bool hasSepAxisA = b3FindSeparatingAxis(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2,
+						&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+						&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+										 &sepAxis, &dmin);
+					if (hasSepAxisA)
+					{
+						bool hasSepAxisB = b3FindSeparatingAxis(convexShapeB, convexShapeA, posB, ornB, posA, ornA, DeltaC2,
+																&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+																&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+																&sepAxis, &dmin);
+						if (hasSepAxisB)
+						{
+							bool hasEdgeEdge =b3FindSeparatingAxisEdgeEdge(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2,
+														 &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+														 &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+														 &sepAxis, &dmin,true);
+							if (hasEdgeEdge)
+							{
+								hostHasSepAxis[i] = 1;
+								hostSepAxis[i] = sepAxis;
+							}
+						}
+					}
+				}
+            }
+			if (useGjkContacts)//nGlobalContactsOut>0)
+			{
+				//printf("nGlobalContactsOut=%d\n",nGlobalContactsOut);
+				nContacts = nGlobalContactsOut;
+				contactOut->copyFromHost(hostContacts);
+				m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+			}
+            m_hasSeparatingNormals.copyFromHost(hostHasSepAxis);
+            m_sepNormals.copyFromHost(hostSepAxis);
+            /*
+             //double-check results from GPU (comment-out the 'else' so both paths are executed
+            b3AlignedObjectArray<int> checkHasSepAxis;
+            m_hasSeparatingNormals.copyToHost(checkHasSepAxis);
+            static int frameCount = 0;
+            frameCount++;
+            for (int i=0;i<nPairs;i++)
+            {
+                if (hostHasSepAxis[i] != checkHasSepAxis[i])
+                {
+                    printf("at frameCount %d hostHasSepAxis[%d] = %d but checkHasSepAxis[i] = %d\n",
+                           frameCount,i,hostHasSepAxis[i],checkHasSepAxis[i]);
+                }
+            }
+            //m_hasSeparatingNormals.copyFromHost(hostHasSepAxis);
+            //    m_sepNormals.copyFromHost(hostSepAxis);
+            */
+        }
+        numCompoundPairs = m_numCompoundPairsOut.at(0);
+        bool useGpuFindCompoundPairs=true;
+        if (useGpuFindCompoundPairs)
+        {
+            B3_PROFILE("findCompoundPairsKernel");
+            b3BufferInfoCL bInfo[] = 
+            { 
+                b3BufferInfoCL( pairs->getBufferCL(), true ), 
+                b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+                b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+                b3BufferInfoCL( convexData.getBufferCL(),true),
+                b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                b3BufferInfoCL( clAabbsLocalSpace.getBufferCL(),true),
+                b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL()),
+                b3BufferInfoCL( m_numCompoundPairsOut.getBufferCL()),
+                b3BufferInfoCL(subTreesGPU->getBufferCL()),
+                b3BufferInfoCL(treeNodesGPU->getBufferCL()),
+                b3BufferInfoCL(bvhInfo->getBufferCL())
+            };
+            b3LauncherCL launcher(m_queue, m_findCompoundPairsKernel,"m_findCompoundPairsKernel");
+            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+            launcher.setConst( nPairs  );
+            launcher.setConst( compoundPairCapacity);
+            int num = nPairs;
+            launcher.launch1D( num);
+            clFinish(m_queue);
+            numCompoundPairs = m_numCompoundPairsOut.at(0);
+            //printf("numCompoundPairs =%d\n",numCompoundPairs );
+            if (numCompoundPairs)
+            {
+                //printf("numCompoundPairs=%d\n",numCompoundPairs);
+            }
+        } else
+        {
+            b3AlignedObjectArray<b3QuantizedBvhNode>	treeNodesCPU;
+            treeNodesGPU->copyToHost(treeNodesCPU);
+            b3AlignedObjectArray<b3BvhSubtreeInfo>	subTreesCPU;
+            subTreesGPU->copyToHost(subTreesCPU);
+            b3AlignedObjectArray<b3BvhInfo>	bvhInfoCPU;
+            bvhInfo->copyToHost(bvhInfoCPU);
+            b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace;
+            clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace);
+            b3AlignedObjectArray<b3Aabb> hostAabbsLocalSpace;
+            clAabbsLocalSpace.copyToHost(hostAabbsLocalSpace);
+            b3AlignedObjectArray<b3Int4> hostPairs;
+            pairs->copyToHost(hostPairs);
+            b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+            bodyBuf->copyToHost(hostBodyBuf);
+            b3AlignedObjectArray<b3Int4> cpuCompoundPairsOut;
+            cpuCompoundPairsOut.resize(compoundPairCapacity);
+            b3AlignedObjectArray<b3Collidable> hostCollidables;
+            gpuCollidables.copyToHost(hostCollidables);
+            b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes;
+            gpuChildShapes.copyToHost(cpuChildShapes);
+            b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData;
+            convexData.copyToHost(hostConvexData);
+            b3AlignedObjectArray<b3Vector3> hostVertices;
+            gpuVertices.copyToHost(hostVertices);
+            for (int pairIndex=0;pairIndex<nPairs;pairIndex++)
+            {
+                int bodyIndexA = hostPairs[pairIndex].x;
+                int bodyIndexB = hostPairs[pairIndex].y;
+                int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx;
+                int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx;
+				if (cpuChildShapes.size())
+				{
+                findCompoundPairsKernel( 
+                            pairIndex,
+                            bodyIndexA,
+                            bodyIndexB,
+                            collidableIndexA,
+                            collidableIndexB,
+                            &hostBodyBuf[0],
+                            &hostCollidables[0],
+                            &hostConvexData[0],
+                            hostVertices,
+                            hostAabbsWorldSpace,
+                            hostAabbsLocalSpace,
+                            &cpuChildShapes[0],
+                            &cpuCompoundPairsOut[0],
+                            &numCompoundPairs,
+                            compoundPairCapacity,
+                            treeNodesCPU,
+                            subTreesCPU,
+                            bvhInfoCPU
+                            );
+				}
+            }
+			m_numCompoundPairsOut.copyFromHostPointer(&numCompoundPairs,1,0,true);
+			if (numCompoundPairs)
+			{
+				b3CompoundOverlappingPair* ptr = (b3CompoundOverlappingPair*)&cpuCompoundPairsOut[0];
+				m_gpuCompoundPairs.copyFromHostPointer(ptr,numCompoundPairs,0,true);
+			}
+			//cpuCompoundPairsOut
+        }
+		if (numCompoundPairs)
+		{
+			printf("numCompoundPairs=%d\n",numCompoundPairs);
+		}
+        if (numCompoundPairs > compoundPairCapacity)
+        {
+            b3Error("Exceeded compound pair capacity (%d/%d)\n", numCompoundPairs,  compoundPairCapacity);
+            numCompoundPairs = compoundPairCapacity;
+        }
+        m_gpuCompoundPairs.resize(numCompoundPairs);
+        m_gpuHasCompoundSepNormals.resize(numCompoundPairs);
+        m_gpuCompoundSepNormals.resize(numCompoundPairs);
+        if (numCompoundPairs)
+        {
+            B3_PROFILE("processCompoundPairsPrimitivesKernel");
+            b3BufferInfoCL bInfo[] = 
+            { 
+                b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), 
+                b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+                b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+                b3BufferInfoCL( convexData.getBufferCL(),true),
+                b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+                b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                b3BufferInfoCL( contactOut->getBufferCL()),
+                b3BufferInfoCL( m_totalContactsOut.getBufferCL())	
+            };
+            b3LauncherCL launcher(m_queue, m_processCompoundPairsPrimitivesKernel,"m_processCompoundPairsPrimitivesKernel");
+            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+            launcher.setConst( numCompoundPairs  );
+            launcher.setConst(maxContactCapacity);
+            int num = numCompoundPairs;
+            launcher.launch1D( num);
+            clFinish(m_queue);
+            nContacts = m_totalContactsOut.at(0);
+            //printf("nContacts (after processCompoundPairsPrimitivesKernel) = %d\n",nContacts);
+            if (nContacts>maxContactCapacity)
+            {
+                b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity);
+                nContacts = maxContactCapacity;
+            }
+        }
+        if (numCompoundPairs)
+        {
+            B3_PROFILE("processCompoundPairsKernel");
+            b3BufferInfoCL bInfo[] = 
+            { 
+                b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), 
+                b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+                b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+                b3BufferInfoCL( convexData.getBufferCL(),true),
+                b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+                b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                b3BufferInfoCL( m_gpuCompoundSepNormals.getBufferCL()),
+                b3BufferInfoCL( m_gpuHasCompoundSepNormals.getBufferCL())
+            };
+            b3LauncherCL launcher(m_queue, m_processCompoundPairsKernel,"m_processCompoundPairsKernel");
+            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+            launcher.setConst( numCompoundPairs  );
+            int num = numCompoundPairs;
+            launcher.launch1D( num);
+            clFinish(m_queue);
+        }
+        //printf("numConcave  = %d\n",numConcave);
+//		printf("hostNormals.size()=%d\n",hostNormals.size());
+		//int numPairs = pairCount.at(0);
+	}
+	int vertexFaceCapacity = 64;
+	{
+		//now perform the tree query on GPU
+		if (treeNodesGPU->size() && treeNodesGPU->size())
+		{
+			if (bvhTraversalKernelGPU)
+			{
+				B3_PROFILE("m_bvhTraversalKernel");
+				numConcavePairs = m_numConcavePairsOut.at(0);
+				b3LauncherCL launcher(m_queue, m_bvhTraversalKernel,"m_bvhTraversalKernel");
+				launcher.setBuffer( pairs->getBufferCL());
+				launcher.setBuffer(  bodyBuf->getBufferCL());
+				launcher.setBuffer( gpuCollidables.getBufferCL());
+				launcher.setBuffer( clAabbsWorldSpace.getBufferCL());
+				launcher.setBuffer( triangleConvexPairsOut.getBufferCL());
+				launcher.setBuffer( m_numConcavePairsOut.getBufferCL());
+				launcher.setBuffer( subTreesGPU->getBufferCL());
+				launcher.setBuffer( treeNodesGPU->getBufferCL());
+				launcher.setBuffer( bvhInfo->getBufferCL());
+				launcher.setConst( nPairs  );
+				launcher.setConst( maxTriConvexPairCapacity);
+				int num = nPairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+				numConcavePairs = m_numConcavePairsOut.at(0);
+			} else
+			{
+					b3AlignedObjectArray<b3Int4> hostPairs;
+					pairs->copyToHost(hostPairs);
+					b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+					bodyBuf->copyToHost(hostBodyBuf);
+					b3AlignedObjectArray<b3Collidable> hostCollidables;
+					gpuCollidables.copyToHost(hostCollidables);
+					b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace;
+					clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace);
+					//int maxTriConvexPairCapacity,
+					b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost;
+					triangleConvexPairsOutHost.resize(maxTriConvexPairCapacity);
+					int numTriConvexPairsOutHost=0;
+					numConcavePairs = 0;
+					//m_numConcavePairsOut
+					b3AlignedObjectArray<b3QuantizedBvhNode>	treeNodesCPU;
+					treeNodesGPU->copyToHost(treeNodesCPU);
+					b3AlignedObjectArray<b3BvhSubtreeInfo>	subTreesCPU;
+					subTreesGPU->copyToHost(subTreesCPU);
+					b3AlignedObjectArray<b3BvhInfo>	bvhInfoCPU;
+					bvhInfo->copyToHost(bvhInfoCPU);
+					//compute it...
+					volatile int hostNumConcavePairsOut=0;
+					//
+					for (int i=0;i<nPairs;i++)
+					{
+						b3BvhTraversal( &hostPairs.at(0), 
+						&hostBodyBuf.at(0),
+						&hostCollidables.at(0),
+						&hostAabbsWorldSpace.at(0),
+						&triangleConvexPairsOutHost.at(0),
+						&hostNumConcavePairsOut,
+						&subTreesCPU.at(0),
+						&treeNodesCPU.at(0),
+						&bvhInfoCPU.at(0),
+						nPairs,
+						maxTriConvexPairCapacity,
+						i);
+					}
+					numConcavePairs = hostNumConcavePairsOut;
+					if (hostNumConcavePairsOut)
+					{
+						triangleConvexPairsOutHost.resize(hostNumConcavePairsOut);
+						triangleConvexPairsOut.copyFromHost(triangleConvexPairsOutHost);
+					}
+					//
+					m_numConcavePairsOut.resize(0);
+					m_numConcavePairsOut.push_back(numConcavePairs);
+			}
+				//printf("numConcavePairs=%d (max = %d\n",numConcavePairs,maxTriConvexPairCapacity);
+			if (numConcavePairs > maxTriConvexPairCapacity)
+			{
+				static int exceeded_maxTriConvexPairCapacity_count = 0;
+				b3Error("Exceeded the maxTriConvexPairCapacity (found %d but max is %d, it happened %d times)\n",
+					numConcavePairs,maxTriConvexPairCapacity,exceeded_maxTriConvexPairCapacity_count++);
+				numConcavePairs = maxTriConvexPairCapacity;
+			}
+			triangleConvexPairsOut.resize(numConcavePairs);
+			if (numConcavePairs)
+			{
+				clippingFacesOutGPU.resize(numConcavePairs);
+				worldNormalsAGPU.resize(numConcavePairs);
+				worldVertsA1GPU.resize(vertexFaceCapacity*(numConcavePairs));
+				worldVertsB1GPU.resize(vertexFaceCapacity*(numConcavePairs));
+				if (findConcaveSeparatingAxisKernelGPU)
+				{
+					/*
+					m_concaveHasSeparatingNormals.copyFromHost(concaveHasSeparatingNormalsCPU);
+						clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU);
+						worldVertsA1GPU.copyFromHost(worldVertsA1CPU);
+						worldNormalsAGPU.copyFromHost(worldNormalsACPU);
+						worldVertsB1GPU.copyFromHost(worldVertsB1CPU);
+					*/
+					//now perform a SAT test for each triangle-convex element (stored in triangleConvexPairsOut)
+                    if (splitSearchSepAxisConcave)
+                    {
+                        //printf("numConcavePairs = %d\n",numConcavePairs);
+                        m_dmins.resize(numConcavePairs);
+                        {
+                            B3_PROFILE("findConcaveSeparatingAxisVertexFaceKernel");
+                            b3BufferInfoCL bInfo[] = {
+                                b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ),
+                                b3BufferInfoCL( bodyBuf->getBufferCL(),true),
+                                b3BufferInfoCL( gpuCollidables.getBufferCL(),true),
+                                b3BufferInfoCL( convexData.getBufferCL(),true),
+                                b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                                b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                                b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                                b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                                b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                                b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+                                b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+                                b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()),
+                                b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+                                b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+                                b3BufferInfoCL(worldNormalsAGPU.getBufferCL()),
+                                b3BufferInfoCL(worldVertsB1GPU.getBufferCL()),
+                                b3BufferInfoCL(m_dmins.getBufferCL())
+                            };
+                            b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisVertexFaceKernel,"m_findConcaveSeparatingAxisVertexFaceKernel");
+                            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+                            launcher.setConst(vertexFaceCapacity);
+                            launcher.setConst( numConcavePairs  );
+                            int num = numConcavePairs;
+                            launcher.launch1D( num);
+                            clFinish(m_queue);
+                        }
+//                        numConcavePairs = 0;
+                        if (1)
+                        {
+                            B3_PROFILE("findConcaveSeparatingAxisEdgeEdgeKernel");
+                            b3BufferInfoCL bInfo[] = {
+                                b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ),
+                                b3BufferInfoCL( bodyBuf->getBufferCL(),true),
+                                b3BufferInfoCL( gpuCollidables.getBufferCL(),true),
+                                b3BufferInfoCL( convexData.getBufferCL(),true),
+                                b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                                b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                                b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                                b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                                b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                                b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+                                b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+                                b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()),
+                                b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+                                b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+                                b3BufferInfoCL(worldNormalsAGPU.getBufferCL()),
+                                b3BufferInfoCL(worldVertsB1GPU.getBufferCL()),
+                                b3BufferInfoCL(m_dmins.getBufferCL())
+                            };
+                            b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisEdgeEdgeKernel,"m_findConcaveSeparatingAxisEdgeEdgeKernel");
+                            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+                            launcher.setConst(vertexFaceCapacity);
+                            launcher.setConst( numConcavePairs  );
+                            int num = numConcavePairs;
+                            launcher.launch1D( num);
+                            clFinish(m_queue);
+                        }
+                        // numConcavePairs = 0;
+                    } else
+                    {
+                        B3_PROFILE("findConcaveSeparatingAxisKernel");
+                        b3BufferInfoCL bInfo[] = { 
+                            b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), 
+                            b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+                            b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+                            b3BufferInfoCL( convexData.getBufferCL(),true),
+                            b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                            b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                            b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                            b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                            b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                            b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+                            b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+                            b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()),
+                            b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+                            b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+                            b3BufferInfoCL(worldNormalsAGPU.getBufferCL()),
+                            b3BufferInfoCL(worldVertsB1GPU.getBufferCL())
+                        };
+                        b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisKernel,"m_findConcaveSeparatingAxisKernel");
+                        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+                        launcher.setConst(vertexFaceCapacity);
+                        launcher.setConst( numConcavePairs  );
+                        int num = numConcavePairs;
+                        launcher.launch1D( num);
+                        clFinish(m_queue);
+                    }
+				} else
+				{
+						b3AlignedObjectArray<b3Int4> clippingFacesOutCPU;
+						b3AlignedObjectArray<b3Vector3> worldVertsA1CPU;
+						b3AlignedObjectArray<b3Vector3> worldNormalsACPU;
+						b3AlignedObjectArray<b3Vector3> worldVertsB1CPU;
+						b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU;
+						b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost;
+						triangleConvexPairsOut.copyToHost(triangleConvexPairsOutHost);
+						//triangleConvexPairsOutHost.resize(maxTriConvexPairCapacity);
+						b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+						bodyBuf->copyToHost(hostBodyBuf);
+						b3AlignedObjectArray<b3Collidable> hostCollidables;
+						gpuCollidables.copyToHost(hostCollidables);
+						b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace;
+						clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace);
+						b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData;
+						convexData.copyToHost(hostConvexData);
+						b3AlignedObjectArray<b3Vector3> hostVertices;
+						gpuVertices.copyToHost(hostVertices);
+						b3AlignedObjectArray<b3Vector3> hostUniqueEdges;
+						gpuUniqueEdges.copyToHost(hostUniqueEdges);
+						b3AlignedObjectArray<b3GpuFace> hostFaces;
+						gpuFaces.copyToHost(hostFaces);
+						b3AlignedObjectArray<int> hostIndices;
+						gpuIndices.copyToHost(hostIndices);
+						b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes;
+						gpuChildShapes.copyToHost(cpuChildShapes);
+						b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost;
+						m_concaveSepNormals.copyToHost(concaveSepNormalsHost);
+						concaveHasSeparatingNormalsCPU.resize(concaveSepNormalsHost.size());
+						b3GpuChildShape* childShapePointerCPU = 0;
+						if (cpuChildShapes.size())
+							childShapePointerCPU  = &cpuChildShapes.at(0);
+						clippingFacesOutCPU.resize(clippingFacesOutGPU.size());
+						worldVertsA1CPU.resize(worldVertsA1GPU.size());
+    					worldNormalsACPU.resize(worldNormalsAGPU.size());
+						worldVertsB1CPU.resize(worldVertsB1GPU.size());
+						for (int i=0;i<numConcavePairs;i++)
+						{
+							b3FindConcaveSeparatingAxisKernel(&triangleConvexPairsOutHost.at(0),
+								&hostBodyBuf.at(0),
+								&hostCollidables.at(0),
+								&hostConvexData.at(0), &hostVertices.at(0),&hostUniqueEdges.at(0),
+								&hostFaces.at(0),&hostIndices.at(0),childShapePointerCPU,
+								&hostAabbsWorldSpace.at(0),
+								&concaveSepNormalsHost.at(0),
+								&clippingFacesOutCPU.at(0),
+								&worldVertsA1CPU.at(0),
+								&worldNormalsACPU.at(0),
+								&worldVertsB1CPU.at(0),
+								&concaveHasSeparatingNormalsCPU.at(0),
+								vertexFaceCapacity,
+								numConcavePairs,i);
+						};
+						m_concaveSepNormals.copyFromHost(concaveSepNormalsHost);
+						m_concaveHasSeparatingNormals.copyFromHost(concaveHasSeparatingNormalsCPU);
+						clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU);
+						worldVertsA1GPU.copyFromHost(worldVertsA1CPU);
+						worldNormalsAGPU.copyFromHost(worldNormalsACPU);
+						worldVertsB1GPU.copyFromHost(worldVertsB1CPU);
+				}
+//							b3AlignedObjectArray<b3Vector3> cpuCompoundSepNormals;
+//						m_concaveSepNormals.copyToHost(cpuCompoundSepNormals);
+//					b3AlignedObjectArray<b3Int4> cpuConcavePairs;
+//				triangleConvexPairsOut.copyToHost(cpuConcavePairs);
+			}
+		}
+	}
+	if (numConcavePairs)
+	{
+			if (numConcavePairs)
+		{
+			B3_PROFILE("findConcaveSphereContactsKernel");
+				nContacts = m_totalContactsOut.at(0);
+//				printf("nContacts1 = %d\n",nContacts);
+			b3BufferInfoCL bInfo[] = { 
+				b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), 
+				b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+				b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				b3BufferInfoCL( convexData.getBufferCL(),true),
+				b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+				b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+				b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+				b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+				b3BufferInfoCL( contactOut->getBufferCL()),
+				b3BufferInfoCL( m_totalContactsOut.getBufferCL())
+			};
+			b3LauncherCL launcher(m_queue, m_findConcaveSphereContactsKernel,"m_findConcaveSphereContactsKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst( numConcavePairs  );
+			launcher.setConst(maxContactCapacity);
+			int num = numConcavePairs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+			nContacts = m_totalContactsOut.at(0);
+			//printf("nContacts (after findConcaveSphereContactsKernel) = %d\n",nContacts);
+			//printf("nContacts2 = %d\n",nContacts);
+			if (nContacts >= maxContactCapacity)
+			{
+				b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity);
+				nContacts = maxContactCapacity;
+			}
+		}
+	}
+#ifdef __APPLE__
+	bool contactClippingOnGpu = true;
+	bool contactClippingOnGpu = true;
+	if (contactClippingOnGpu)
+	{
+		m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+//		printf("nContacts3 = %d\n",nContacts);
+		//B3_PROFILE("clipHullHullKernel");
+		bool breakupConcaveConvexKernel = true;
+#ifdef __APPLE__
+		//actually, some Apple OpenCL platform/device combinations work fine...
+		breakupConcaveConvexKernel = true;
+		//concave-convex contact clipping
+		if (numConcavePairs)
+		{
+			//			printf("numConcavePairs = %d\n", numConcavePairs);
+			//		nContacts = m_totalContactsOut.at(0);
+			//	printf("nContacts before = %d\n", nContacts);
+			if (breakupConcaveConvexKernel)
+			{
+				worldVertsB2GPU.resize(vertexFaceCapacity*numConcavePairs);
+				//clipFacesAndFindContacts
+				if (clipConcaveFacesAndFindContactsCPU)
+				{
+					b3AlignedObjectArray<b3Int4> clippingFacesOutCPU;
+					b3AlignedObjectArray<b3Vector3> worldVertsA1CPU;
+					b3AlignedObjectArray<b3Vector3> worldNormalsACPU;
+					b3AlignedObjectArray<b3Vector3> worldVertsB1CPU;
+					clippingFacesOutGPU.copyToHost(clippingFacesOutCPU);
+					worldVertsA1GPU.copyToHost(worldVertsA1CPU);
+					worldNormalsAGPU.copyToHost(worldNormalsACPU);
+					worldVertsB1GPU.copyToHost(worldVertsB1CPU);
+					b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU;
+					m_concaveHasSeparatingNormals.copyToHost(concaveHasSeparatingNormalsCPU);
+					b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost;
+					m_concaveSepNormals.copyToHost(concaveSepNormalsHost);
+					b3AlignedObjectArray<b3Vector3> worldVertsB2CPU;  
+					worldVertsB2CPU.resize(worldVertsB2GPU.size());
+					for (int i=0;i<numConcavePairs;i++)
+					{
+						clipFacesAndFindContactsKernel(   &concaveSepNormalsHost.at(0),
+							&concaveHasSeparatingNormalsCPU.at(0),
+							&clippingFacesOutCPU.at(0),
+							&worldVertsA1CPU.at(0),
+							&worldNormalsACPU.at(0),
+							&worldVertsB1CPU.at(0),
+							&worldVertsB2CPU.at(0),
+							vertexFaceCapacity,
+							i);
+					}
+					clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU);
+					worldVertsB2GPU.copyFromHost(worldVertsB2CPU);
+				} else
+				{
+					if (1)
+					{
+						B3_PROFILE("clipFacesAndFindContacts");
+						//nContacts = m_totalContactsOut.at(0);
+						//int h = m_hasSeparatingNormals.at(0);
+						//int4 p = clippingFacesOutGPU.at(0);
+						b3BufferInfoCL bInfo[] = {
+							b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+							b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()),
+							b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+							b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+							b3BufferInfoCL( worldNormalsAGPU.getBufferCL()),
+							b3BufferInfoCL( worldVertsB1GPU.getBufferCL()),
+							b3BufferInfoCL( worldVertsB2GPU.getBufferCL())
+						};
+						b3LauncherCL launcher(m_queue, m_clipFacesAndFindContacts,"m_clipFacesAndFindContacts");
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+						launcher.setConst(vertexFaceCapacity);
+						launcher.setConst( numConcavePairs  );
+						int debugMode = 0;
+						launcher.setConst( debugMode);
+						int num = numConcavePairs;
+						launcher.launch1D( num);
+						clFinish(m_queue);
+						//int bla = m_totalContactsOut.at(0);
+					}
+				}
+				//contactReduction
+				{
+					int newContactCapacity=nContacts+numConcavePairs; 
+					contactOut->reserve(newContactCapacity);
+					if (reduceConcaveContactsOnGPU)
+					{
+//						printf("newReservation = %d\n",newReservation);
+						{
+							B3_PROFILE("newContactReductionKernel");
+							b3BufferInfoCL bInfo[] =
+							{
+								b3BufferInfoCL( triangleConvexPairsOut.getBufferCL(), true ),
+								b3BufferInfoCL( bodyBuf->getBufferCL(),true),
+								b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+								b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()),
+								b3BufferInfoCL( contactOut->getBufferCL()),
+								b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+								b3BufferInfoCL( worldVertsB2GPU.getBufferCL()),
+								b3BufferInfoCL( m_totalContactsOut.getBufferCL())
+							};
+							b3LauncherCL launcher(m_queue, m_newContactReductionKernel,"m_newContactReductionKernel");
+							launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+							launcher.setConst(vertexFaceCapacity);
+							launcher.setConst(newContactCapacity);
+							launcher.setConst( numConcavePairs  );
+							int num = numConcavePairs;
+							launcher.launch1D( num);
+						}
+						nContacts = m_totalContactsOut.at(0);
+						contactOut->resize(nContacts);
+						//printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts);
+					}else
+					{
+						volatile int nGlobalContactsOut = nContacts;
+						b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost;
+						triangleConvexPairsOut.copyToHost(triangleConvexPairsOutHost);
+						b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+						bodyBuf->copyToHost(hostBodyBuf);
+						b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU;
+						m_concaveHasSeparatingNormals.copyToHost(concaveHasSeparatingNormalsCPU);
+						b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost;
+						m_concaveSepNormals.copyToHost(concaveSepNormalsHost);
+						b3AlignedObjectArray<b3Contact4> hostContacts;
+						if (nContacts)
+						{
+							contactOut->copyToHost(hostContacts);
+						}
+						hostContacts.resize(newContactCapacity);
+						b3AlignedObjectArray<b3Int4> clippingFacesOutCPU;
+						b3AlignedObjectArray<b3Vector3> worldVertsB2CPU;
+						clippingFacesOutGPU.copyToHost(clippingFacesOutCPU);
+						worldVertsB2GPU.copyToHost(worldVertsB2CPU);
+						for (int i=0;i<numConcavePairs;i++)
+						{
+							b3NewContactReductionKernel( &triangleConvexPairsOutHost.at(0),
+                                                   &hostBodyBuf.at(0),
+												   &concaveSepNormalsHost.at(0),
+												   &concaveHasSeparatingNormalsCPU.at(0),
+												   &hostContacts.at(0),
+                                                   &clippingFacesOutCPU.at(0),
+                                                   &worldVertsB2CPU.at(0),
+                                                   &nGlobalContactsOut,
+                                                   vertexFaceCapacity,
+												   newContactCapacity,
+                                                   numConcavePairs,
+												   i
+                                                   );
+						}
+						nContacts = nGlobalContactsOut;
+						m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+//						nContacts = m_totalContactsOut.at(0);
+						//contactOut->resize(nContacts);
+						hostContacts.resize(nContacts);
+						//printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts);
+						contactOut->copyFromHost(hostContacts);
+					}
+				}
+				//re-use?
+			} else
+			{
+				B3_PROFILE("clipHullHullConcaveConvexKernel");
+				nContacts = m_totalContactsOut.at(0);
+				int newContactCapacity = contactOut->capacity();
+				//printf("contactOut5 = %d\n",nContacts);
+				b3BufferInfoCL bInfo[] = { 
+					b3BufferInfoCL( triangleConvexPairsOut.getBufferCL(), true ), 
+					b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+					b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+					b3BufferInfoCL( convexData.getBufferCL(),true),
+					b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+					b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+					b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+					b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+					b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+					b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+					b3BufferInfoCL( contactOut->getBufferCL()),
+					b3BufferInfoCL( m_totalContactsOut.getBufferCL())	
+				};
+				b3LauncherCL launcher(m_queue, m_clipHullHullConcaveConvexKernel,"m_clipHullHullConcaveConvexKernel");
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst(newContactCapacity);
+				launcher.setConst( numConcavePairs  );
+				int num = numConcavePairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+				nContacts = m_totalContactsOut.at(0);
+				contactOut->resize(nContacts);
+				//printf("contactOut6 = %d\n",nContacts);
+				b3AlignedObjectArray<b3Contact4> cpuContacts;
+				contactOut->copyToHost(cpuContacts);
+			}
+			//			printf("nContacts after = %d\n", nContacts);
+		}//numConcavePairs
+		//convex-convex contact clipping
+		bool breakupKernel = false;
+#ifdef __APPLE__
+		breakupKernel = true;
+	bool computeConvexConvex = false;
+	bool computeConvexConvex = true;
+		if (computeConvexConvex)
+		{
+			B3_PROFILE("clipHullHullKernel");
+		if (breakupKernel)
+		{
+			worldVertsB1GPU.resize(vertexFaceCapacity*nPairs);
+			clippingFacesOutGPU.resize(nPairs);
+			worldNormalsAGPU.resize(nPairs);
+			worldVertsA1GPU.resize(vertexFaceCapacity*nPairs);
+			worldVertsB2GPU.resize(vertexFaceCapacity*nPairs);
+			if (findConvexClippingFacesGPU)
+			{
+				B3_PROFILE("findClippingFacesKernel");
+				b3BufferInfoCL bInfo[] = {
+					b3BufferInfoCL( pairs->getBufferCL(), true ),
+					b3BufferInfoCL( bodyBuf->getBufferCL(),true),
+					b3BufferInfoCL( gpuCollidables.getBufferCL(),true),
+					b3BufferInfoCL( convexData.getBufferCL(),true),
+					b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+					b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+					b3BufferInfoCL( gpuFaces.getBufferCL(),true), 
+					b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+					b3BufferInfoCL( m_sepNormals.getBufferCL()),
+					b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+					b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+					b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+					b3BufferInfoCL( worldNormalsAGPU.getBufferCL()),
+					b3BufferInfoCL( worldVertsB1GPU.getBufferCL())
+				};
+				b3LauncherCL launcher(m_queue, m_findClippingFacesKernel,"m_findClippingFacesKernel");
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst( vertexFaceCapacity);
+				launcher.setConst( nPairs  );
+				int num = nPairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+			} else
+			{
+				float minDist = -1e30f;
+				float maxDist = 0.02f;
+				b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData;
+				convexData.copyToHost(hostConvexData);
+				b3AlignedObjectArray<b3Collidable> hostCollidables;
+				gpuCollidables.copyToHost(hostCollidables);
+				b3AlignedObjectArray<int> hostHasSepNormals;
+				m_hasSeparatingNormals.copyToHost(hostHasSepNormals);
+				b3AlignedObjectArray<b3Vector3> cpuSepNormals;
+				m_sepNormals.copyToHost(cpuSepNormals);
+				b3AlignedObjectArray<b3Int4> hostPairs;
+				pairs->copyToHost(hostPairs);
+				b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+				bodyBuf->copyToHost(hostBodyBuf);
+				//worldVertsB1GPU.resize(vertexFaceCapacity*nPairs);
+				b3AlignedObjectArray<b3Vector3> worldVertsB1CPU;
+				worldVertsB1GPU.copyToHost(worldVertsB1CPU);
+				b3AlignedObjectArray<b3Int4> clippingFacesOutCPU;
+				clippingFacesOutGPU.copyToHost(clippingFacesOutCPU);
+				b3AlignedObjectArray<b3Vector3> worldNormalsACPU;
+				worldNormalsACPU.resize(nPairs);
+				b3AlignedObjectArray<b3Vector3> worldVertsA1CPU;
+				worldVertsA1CPU.resize(worldVertsA1GPU.size());
+				b3AlignedObjectArray<b3Vector3> hostVertices;
+				gpuVertices.copyToHost(hostVertices);
+				b3AlignedObjectArray<b3GpuFace> hostFaces;
+				gpuFaces.copyToHost(hostFaces);
+				b3AlignedObjectArray<int> hostIndices;
+				gpuIndices.copyToHost(hostIndices);
+				for (int i=0;i<nPairs;i++)
+				{
+					int bodyIndexA = hostPairs[i].x;
+					int bodyIndexB = hostPairs[i].y;
+					int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx;
+					int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx;
+					int shapeIndexA = hostCollidables[collidableIndexA].m_shapeIndex;
+					int shapeIndexB = hostCollidables[collidableIndexB].m_shapeIndex;
+					if (hostHasSepNormals[i])
+					{
+						b3FindClippingFaces(cpuSepNormals[i],
+							&hostConvexData[shapeIndexA],
+							&hostConvexData[shapeIndexB],
+							hostBodyBuf[bodyIndexA].m_pos,hostBodyBuf[bodyIndexA].m_quat,
+							hostBodyBuf[bodyIndexB].m_pos,hostBodyBuf[bodyIndexB].m_quat,
+							&worldVertsA1CPU.at(0),&worldNormalsACPU.at(0),
+							&worldVertsB1CPU.at(0),
+							vertexFaceCapacity,minDist,maxDist,
+							&hostVertices.at(0),&hostFaces.at(0),
+							&hostIndices.at(0),
+							&hostVertices.at(0),&hostFaces.at(0),
+							&hostIndices.at(0),&clippingFacesOutCPU.at(0),i);
+					}
+				}
+				clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU);
+				worldVertsA1GPU.copyFromHost(worldVertsA1CPU);
+				worldNormalsAGPU.copyFromHost(worldNormalsACPU);
+				worldVertsB1GPU.copyFromHost(worldVertsB1CPU);
+			}
+			///clip face B against face A, reduce contacts and append them to a global contact array
+			if (1)
+			{
+				if (clipConvexFacesAndFindContactsCPU)
+				{
+					//b3AlignedObjectArray<b3Int4> hostPairs;
+					//pairs->copyToHost(hostPairs);
+					b3AlignedObjectArray<b3Vector3> hostSepNormals;
+					m_sepNormals.copyToHost(hostSepNormals);
+					b3AlignedObjectArray<int> hostHasSepAxis;
+					m_hasSeparatingNormals.copyToHost(hostHasSepAxis);
+					b3AlignedObjectArray<b3Int4> hostClippingFaces;
+					clippingFacesOutGPU.copyToHost(hostClippingFaces);
+					b3AlignedObjectArray<b3Vector3> worldVertsB2CPU;
+					worldVertsB2CPU.resize(vertexFaceCapacity*nPairs);
+					b3AlignedObjectArray<b3Vector3>worldVertsA1CPU;
+					worldVertsA1GPU.copyToHost(worldVertsA1CPU);
+					b3AlignedObjectArray<b3Vector3> worldNormalsACPU;
+					worldNormalsAGPU.copyToHost(worldNormalsACPU);
+					b3AlignedObjectArray<b3Vector3>  worldVertsB1CPU;
+					worldVertsB1GPU.copyToHost(worldVertsB1CPU);
+					/*
+					  __global const b3Float4* separatingNormals,
+                                                   __global const int* hasSeparatingAxis,
+                                                   __global b3Int4* clippingFacesOut,
+                                                   __global b3Float4* worldVertsA1,
+                                                   __global b3Float4* worldNormalsA1,
+                                                   __global b3Float4* worldVertsB1,
+                                                   __global b3Float4* worldVertsB2,
+                                                    int vertexFaceCapacity,
+															int pairIndex
+					*/
+					for (int i=0;i<nPairs;i++)
+					{
+						clipFacesAndFindContactsKernel(
+							&hostSepNormals.at(0),
+							&hostHasSepAxis.at(0),
+							&hostClippingFaces.at(0),
+							&worldVertsA1CPU.at(0),
+							&worldNormalsACPU.at(0),
+							&worldVertsB1CPU.at(0),
+							&worldVertsB2CPU.at(0),
+						vertexFaceCapacity,
+							i);
+					}
+					clippingFacesOutGPU.copyFromHost(hostClippingFaces);
+					worldVertsB2GPU.copyFromHost(worldVertsB2CPU);
+				} else
+				{
+					B3_PROFILE("clipFacesAndFindContacts");
+					//nContacts = m_totalContactsOut.at(0);
+					//int h = m_hasSeparatingNormals.at(0);
+					//int4 p = clippingFacesOutGPU.at(0);
+					b3BufferInfoCL bInfo[] = {
+						b3BufferInfoCL( m_sepNormals.getBufferCL()),
+						b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+						b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+						b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+						b3BufferInfoCL( worldNormalsAGPU.getBufferCL()),
+						b3BufferInfoCL( worldVertsB1GPU.getBufferCL()),
+						b3BufferInfoCL( worldVertsB2GPU.getBufferCL())
+					};
+					b3LauncherCL launcher(m_queue, m_clipFacesAndFindContacts,"m_clipFacesAndFindContacts");
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					launcher.setConst(vertexFaceCapacity);
+					launcher.setConst( nPairs  );
+					int debugMode = 0;
+					launcher.setConst( debugMode);
+					int num = nPairs;
+					launcher.launch1D( num);
+					clFinish(m_queue);
+				} 
+				{
+					nContacts = m_totalContactsOut.at(0);
+					//printf("nContacts = %d\n",nContacts);
+					int newContactCapacity = nContacts+nPairs;
+					contactOut->reserve(newContactCapacity);
+					if (reduceConvexContactsOnGPU)
+					{
+						{
+							B3_PROFILE("newContactReductionKernel");
+							b3BufferInfoCL bInfo[] =
+							{
+								b3BufferInfoCL( pairs->getBufferCL(), true ),
+								b3BufferInfoCL( bodyBuf->getBufferCL(),true),
+								b3BufferInfoCL( m_sepNormals.getBufferCL()),
+								b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+								b3BufferInfoCL( contactOut->getBufferCL()),
+								b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+								b3BufferInfoCL( worldVertsB2GPU.getBufferCL()),
+								b3BufferInfoCL( m_totalContactsOut.getBufferCL())
+							};
+							b3LauncherCL launcher(m_queue, m_newContactReductionKernel,"m_newContactReductionKernel");
+							launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+							launcher.setConst(vertexFaceCapacity);
+							launcher.setConst(newContactCapacity);
+							launcher.setConst( nPairs  );
+							int num = nPairs;
+							launcher.launch1D( num);
+						}
+						nContacts = m_totalContactsOut.at(0);
+						contactOut->resize(nContacts);
+					} else
+					{
+						volatile int nGlobalContactsOut = nContacts;
+						b3AlignedObjectArray<b3Int4> hostPairs;
+						pairs->copyToHost(hostPairs);
+						b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+						bodyBuf->copyToHost(hostBodyBuf);
+						b3AlignedObjectArray<b3Vector3> hostSepNormals;
+						m_sepNormals.copyToHost(hostSepNormals);
+						b3AlignedObjectArray<int> hostHasSepAxis;
+						m_hasSeparatingNormals.copyToHost(hostHasSepAxis);
+						b3AlignedObjectArray<b3Contact4> hostContactsOut;
+						contactOut->copyToHost(hostContactsOut);
+						hostContactsOut.resize(newContactCapacity);
+						b3AlignedObjectArray<b3Int4> hostClippingFaces;
+						clippingFacesOutGPU.copyToHost(hostClippingFaces);
+						b3AlignedObjectArray<b3Vector3> worldVertsB2CPU;
+						worldVertsB2GPU.copyToHost(worldVertsB2CPU);
+						for (int i=0;i<nPairs;i++)
+						{
+							b3NewContactReductionKernel(&hostPairs.at(0),
+								&hostBodyBuf.at(0),
+								&hostSepNormals.at(0),
+								&hostHasSepAxis.at(0),
+								&hostContactsOut.at(0),
+								&hostClippingFaces.at(0),
+								&worldVertsB2CPU.at(0),
+								&nGlobalContactsOut,
+								vertexFaceCapacity,
+								newContactCapacity,
+								nPairs,
+								i);
+						}
+						nContacts = nGlobalContactsOut;
+						m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+						hostContactsOut.resize(nContacts);
+						//printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts);
+						contactOut->copyFromHost(hostContactsOut);
+					}
+					//                    b3Contact4 pt = contactOut->at(0);
+					//                  printf("nContacts = %d\n",nContacts);
+				}
+			}
+		}            
+		else//breakupKernel
+		{
+			if (nPairs)
+			{
+				b3BufferInfoCL bInfo[] = {
+					b3BufferInfoCL( pairs->getBufferCL(), true ), 
+					b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+					b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+					b3BufferInfoCL( convexData.getBufferCL(),true),
+					b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+					b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+					b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+					b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+					b3BufferInfoCL( m_sepNormals.getBufferCL()),
+					b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+					b3BufferInfoCL( contactOut->getBufferCL()),
+					b3BufferInfoCL( m_totalContactsOut.getBufferCL())	
+				};
+				b3LauncherCL launcher(m_queue, m_clipHullHullKernel,"m_clipHullHullKernel");
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst( nPairs  );
+				launcher.setConst(maxContactCapacity);
+				int num = nPairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+				nContacts = m_totalContactsOut.at(0);
+				if (nContacts >= maxContactCapacity)
+				{
+					b3Error("Exceeded contact capacity (%d/%d)\n",nContacts,maxContactCapacity);
+					nContacts = maxContactCapacity;
+				}
+				contactOut->resize(nContacts);
+			}
+		}
+		int nCompoundsPairs = m_gpuCompoundPairs.size();
+		if (nCompoundsPairs)
+		{
+			b3BufferInfoCL bInfo[] = {
+				b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), 
+				b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+				b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				b3BufferInfoCL( convexData.getBufferCL(),true),
+				b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+				b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+				b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+				b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+				b3BufferInfoCL( m_gpuCompoundSepNormals.getBufferCL(),true),
+				b3BufferInfoCL( m_gpuHasCompoundSepNormals.getBufferCL(),true),
+				b3BufferInfoCL( contactOut->getBufferCL()),
+				b3BufferInfoCL( m_totalContactsOut.getBufferCL())	
+			};
+			b3LauncherCL launcher(m_queue, m_clipCompoundsHullHullKernel,"m_clipCompoundsHullHullKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst( nCompoundsPairs  );
+			launcher.setConst(maxContactCapacity);
+			int num = nCompoundsPairs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+			nContacts = m_totalContactsOut.at(0);
+			if (nContacts>maxContactCapacity)
+			{
+				b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity);
+				nContacts = maxContactCapacity;
+			}
+			contactOut->resize(nContacts);
+		}//if nCompoundsPairs
+		}
+	}//contactClippingOnGpu
+	//printf("nContacts end = %d\n",nContacts);
+	//printf("frameCount = %d\n",frameCount++);
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h
new file mode 100644
index 00000000..e24c1579
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h
@@ -0,0 +1,118 @@
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
+#include "Bullet3Common/shared/b3Int2.h"
+#include "Bullet3Common/shared/b3Int4.h"
+#include "b3OptimizedBvh.h"
+#include "b3BvhInfo.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+//#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h"
+struct GpuSatCollision	// Bundles the OpenCL handles, kernels and device buffers declared below for computeConvexConvexContactsGPUSAT.
+	cl_context				m_context;
+	cl_device_id			m_device;
+	cl_command_queue		m_queue;	// queue the kernels in the .cpp part of this patch are launched on and clFinish()'d against
+	cl_kernel				m_findSeparatingAxisKernel;
+	cl_kernel				m_mprPenetrationKernel;
+	cl_kernel				m_findSeparatingAxisUnitSphereKernel;
+	cl_kernel m_findSeparatingAxisVertexFaceKernel;
+	cl_kernel m_findSeparatingAxisEdgeEdgeKernel;
+	cl_kernel				m_findConcaveSeparatingAxisKernel;
+    cl_kernel				m_findConcaveSeparatingAxisVertexFaceKernel;
+    cl_kernel				m_findConcaveSeparatingAxisEdgeEdgeKernel;
+	cl_kernel				m_findCompoundPairsKernel;
+	cl_kernel				m_processCompoundPairsKernel;
+	cl_kernel				m_clipHullHullKernel;	// single-kernel convex-convex clipping, used when the "breakup" path is off
+	cl_kernel				m_clipCompoundsHullHullKernel;	// launched once per compound pair
+    cl_kernel               m_clipFacesAndFindContacts;
+    cl_kernel               m_findClippingFacesKernel;
+	cl_kernel				m_clipHullHullConcaveConvexKernel;
+//	cl_kernel				m_extractManifoldAndAddContactKernel;
+    cl_kernel               m_newContactReductionKernel;
+	cl_kernel				m_bvhTraversalKernel;
+	cl_kernel				m_primitiveContactsKernel;
+	cl_kernel				m_findConcaveSphereContactsKernel;
+	cl_kernel				m_processCompoundPairsPrimitivesKernel;
+	b3OpenCLArray<b3Vector3> m_unitSphereDirections;
+	b3OpenCLArray<int>		m_totalContactsOut;	// element 0 is read back on the host as the running contact count
+	b3OpenCLArray<b3Vector3> m_sepNormals;	// bound to the clipping kernels together with m_hasSeparatingNormals
+	b3OpenCLArray<float> m_dmins;
+	b3OpenCLArray<int>		m_hasSeparatingNormals;
+	b3OpenCLArray<b3Vector3> m_concaveSepNormals;
+	b3OpenCLArray<int>		m_concaveHasSeparatingNormals;
+	b3OpenCLArray<int>		m_numConcavePairsOut;
+	b3OpenCLArray<b3CompoundOverlappingPair> m_gpuCompoundPairs;	// its size() is the number of compound pairs clipped
+	b3OpenCLArray<b3Vector3> m_gpuCompoundSepNormals;
+	b3OpenCLArray<int>		m_gpuHasCompoundSepNormals;
+	b3OpenCLArray<int>		m_numCompoundPairsOut;
+	GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue  q );
+	virtual ~GpuSatCollision();
+	void computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* pairs, int nPairs, 
+			const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
+			b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,	// nContacts is overwritten with the count read from m_totalContactsOut
+			const b3OpenCLArray<b3Contact4>* oldContacts,
+			int maxContactCapacity,	// the non-breakup and compound paths clamp nContacts to this and report via b3Error
+			int compoundPairCapacity,
+			const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData,	// NOTE(review): named "host" but typed as b3OpenCLArray; the .cpp body in this patch uses a `convexData` buffer - confirm naming
+			const b3OpenCLArray<b3Vector3>& vertices,
+			const b3OpenCLArray<b3Vector3>& uniqueEdges,
+			const b3OpenCLArray<b3GpuFace>& faces,
+			const b3OpenCLArray<int>& indices,
+			const b3OpenCLArray<b3Collidable>& gpuCollidables,
+			const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
+			const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace,
+			const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace,
+           b3OpenCLArray<b3Vector3>& worldVertsB1GPU,	// caller-owned buffers from here to worldVertsB2GPU; the breakup path resizes them per call
+           b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
+           b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
+           b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
+           b3OpenCLArray<b3Vector3>& worldVertsB2GPU,
+		   b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData,
+		   b3OpenCLArray<b3QuantizedBvhNode>*	treeNodesGPU,
+			b3OpenCLArray<b3BvhSubtreeInfo>*	subTreesGPU,
+			b3OpenCLArray<b3BvhInfo>*	bvhInfo,
+			int numObjects,
+			int maxTriConvexPairCapacity,
+			b3OpenCLArray<b3Int4>& triangleConvexPairs,
+			int& numTriConvexPairsOut
+			);
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h
new file mode 100644
index 00000000..337100fb
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h
@@ -0,0 +1,9 @@
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp
new file mode 100644
index 00000000..d636f983
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp
@@ -0,0 +1,1014 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2008 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the
+use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software in a
+product, an acknowledgment in the product documentation would be appreciated
+but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+GJK-EPA collision solver by Nathanael Presson, 2008
+#include "b3GjkEpa.h"
+#include "b3SupportMappings.h"
+namespace gjkepa2_impl2
+	// Config
+	/* GJK	*/ 
+#define GJK_ACCURACY		((b3Scalar)0.0001)
+#define GJK_MIN_DISTANCE	((b3Scalar)0.0001)
+#define GJK_DUPLICATED_EPS	((b3Scalar)0.0001)
+#define GJK_SIMPLEX2_EPS	((b3Scalar)0.0)
+#define GJK_SIMPLEX3_EPS	((b3Scalar)0.0)
+#define GJK_SIMPLEX4_EPS	((b3Scalar)0.0)
+	/* EPA	*/ 
+#define EPA_MAX_VERTICES	64
+#define EPA_ACCURACY		((b3Scalar)0.0001)
+#define EPA_PLANE_EPS		((b3Scalar)0.00001)
+#define EPA_INSIDE_EPS		((b3Scalar)0.01)
+	// Shorthands
+	// MinkowskiDiff
+	// Pair of convex shapes sampled through support mappings. The GJK/EPA code
+	// works on the difference Support0(d) - Support1(-d) of the two shapes.
+	struct	b3MinkowskiDiff
+	{
+		const b3ConvexPolyhedronData*	m_shapes[2];
+		b3Matrix3x3				m_toshape1;		// applied to the query direction before shape 1 is sampled
+		b3Transform				m_toshape0;		// applied to shape 1's support point before it is returned
+		bool					m_enableMargin;	// selects the ...WithMargin support helpers
+		void					EnableMargin(bool enable)
+		{
+			m_enableMargin = enable;
+		}
+		// Support point of shape 0 along d. verticesA is the vertex array handed
+		// to the support helper together with m_shapes[0].
+		inline b3Vector3		Support0(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesA) const
+		{
+			if (m_enableMargin)
+			{
+				// The margin passed here is 0.f, so only the helper differs between branches.
+				return localGetSupportVertexWithMargin(d,m_shapes[0],verticesA,0.f);
+			} else
+			{
+				return localGetSupportVertexWithoutMargin(d,m_shapes[0],verticesA);
+			}
+		}
+		// Support point of shape 1 along d: the direction is transformed by
+		// m_toshape1, shape 1 is sampled with verticesB, and the result is
+		// transformed by m_toshape0.
+		inline b3Vector3		Support1(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesB) const
+		{
+			if (m_enableMargin)
+			{
+				return m_toshape0*(localGetSupportVertexWithMargin(m_toshape1*d,m_shapes[1],verticesB,0.f));
+			} else
+			{
+				return m_toshape0*(localGetSupportVertexWithoutMargin(m_toshape1*d,m_shapes[1],verticesB));
+			}
+		}
+		// Support point of the Minkowski difference along d.
+		inline b3Vector3		Support(const b3Vector3& d,  const b3AlignedObjectArray<b3Vector3>& verticesA,  const b3AlignedObjectArray<b3Vector3>& verticesB) const
+		{
+			return(Support0(d,verticesA)-Support1(-d,verticesB));
+		}
+		// Support point of a single shape: index 0 selects shape 0, any other
+		// value selects shape 1. Each shape must be sampled with its own vertex
+		// array (A with shape 0, B with shape 1), exactly as the three-argument
+		// Support() does; handing a shape the other array would read the wrong
+		// vertices whenever the two arrays differ.
+		b3Vector3				Support(const b3Vector3& d,unsigned int index,const b3AlignedObjectArray<b3Vector3>& verticesA,  const b3AlignedObjectArray<b3Vector3>& verticesB) const
+		{
+			if(index)
+				return(Support1(d,verticesB));
+			else
+				return(Support0(d,verticesA));
+		}
+	};
+	typedef	b3MinkowskiDiff	tShape;
+	// GJK
+	struct	b3GJK
+	{
+		/* Types		*/ 
+		struct	sSV	// one support sample
+		{
+			b3Vector3	d,w;	// d: normalized query direction, w: support point returned for d (see getsupport)
+		};
+		struct	sSimplex
+		{
+			sSV*		c[4];	// vertices; pointers into m_store handed out through m_free
+			b3Scalar	p[4];	// per-vertex weights written by Evaluate from projectorigin's output
+			unsigned int			rank;	// number of entries of c/p currently in use
+		};
+		struct	eStatus	{ enum _ {
+			Valid,	// Evaluate keeps iterating while the status is Valid
+			Inside,	// set when the ray is shorter than GJK_MIN_DISTANCE or all four vertices are kept
+			Failed		};};	// initial state, and the result when the iteration cap is reached
+			/* Fields		*/ 
+			tShape			m_shape;
+			const b3AlignedObjectArray<b3Vector3>& m_verticesA;	// references: the arrays must outlive this object
+			const b3AlignedObjectArray<b3Vector3>& m_verticesB;
+			b3Vector3		m_ray;	// weighted sum of the current simplex vertices
+			b3Scalar		m_distance;	// |m_ray| when Evaluate ends Valid, 0 when it ends Inside
+			sSimplex		m_simplices[2];	// Evaluate alternates between these two via m_current
+			sSV				m_store[4];	// backing storage for simplex vertices
+			sSV*			m_free[4];	// stack of m_store entries not used by the active simplex
+			unsigned int				m_nfree;	// number of valid entries in m_free
+			unsigned int				m_current;	// index of the active simplex in m_simplices
+			sSimplex*		m_simplex;	// set to &m_simplices[m_current] at the end of Evaluate; not set by Initialize
+			eStatus::_		m_status;
+			/* Methods		*/ 
+			b3GJK(const b3AlignedObjectArray<b3Vector3>& verticesA,const b3AlignedObjectArray<b3Vector3>& verticesB)	// Binds the two vertex arrays by reference (no copy) and resets the solver state.
+				:m_verticesA(verticesA),m_verticesB(verticesB)
+			{
+				Initialize();
+			}
+			/* Reset the solver to its idle state: zero ray and distance, empty  */
+			/* free list, first simplex slot selected, status Failed.            */
+			void				Initialize()
+			{
+				m_status	=	eStatus::Failed;
+				m_current	=	0;
+				m_nfree		=	0;
+				m_distance	=	0;
+				m_ray		=	b3MakeVector3(0,0,0);
+			}
+			eStatus::_			Evaluate(const tShape& shapearg,const b3Vector3& guess)	// Runs the GJK iteration on shapearg, seeded with search vector guess; returns the final m_status.
+			{
+				unsigned int			iterations=0;
+				b3Scalar	sqdist=0;
+				b3Scalar	alpha=0;	// largest omega seen so far (used by the termination test)
+				b3Vector3	lastw[4];	// ring buffer of recent support points, used to detect repeats
+				unsigned int			clastw=0;	// write cursor into lastw
+				/* Initialize solver		*/ 
+				m_free[0]			=	&m_store[0];
+				m_free[1]			=	&m_store[1];
+				m_free[2]			=	&m_store[2];
+				m_free[3]			=	&m_store[3];
+				m_nfree				=	4;
+				m_current			=	0;
+				m_status			=	eStatus::Valid;
+				m_shape				=	shapearg;
+				m_distance			=	0;
+				/* Initialize simplex		*/ 
+				m_simplices[0].rank	=	0;
+				m_ray				=	guess;
+				const b3Scalar	sqrl=	m_ray.length2();
+				appendvertice(m_simplices[0],sqrl>0?-m_ray:b3MakeVector3(1,0,0));	// a zero guess falls back to the +X axis so getsupport never normalizes a zero vector
+				m_simplices[0].p[0]	=	1;
+				m_ray				=	m_simplices[0].c[0]->w;	
+				sqdist				=	sqrl;
+				lastw[0]			=
+					lastw[1]			=
+					lastw[2]			=
+					lastw[3]			=	m_ray;
+				/* Loop						*/ 
+				do	{
+					const unsigned int		next=1-m_current;	// the other of the two simplex slots
+					sSimplex&	cs=m_simplices[m_current];
+					sSimplex&	ns=m_simplices[next];
+					/* Check zero							*/ 
+					const b3Scalar	rl=m_ray.length();
+					if(rl<GJK_MIN_DISTANCE)
+					{/* Touching or inside				*/ 
+						m_status=eStatus::Inside;
+						break;
+					}
+					/* Append new vertice in -'v' direction	*/ 
+					appendvertice(cs,-m_ray);
+					const b3Vector3&	w=cs.c[cs.rank-1]->w;
+					bool				found=false;
+					for(unsigned int i=0;i<4;++i)
+					{
+						if((w-lastw[i]).length2()<GJK_DUPLICATED_EPS)	// squared distance is compared against the epsilon
+						{ found=true;break; }
+					}
+					if(found)
+					{/* Return old simplex				*/ 
+						removevertice(m_simplices[m_current]);
+						break;
+					}
+					else
+					{/* Update lastw					*/ 
+						lastw[clastw=(clastw+1)&3]=w;	// &3 wraps the cursor over the four slots
+					}
+					/* Check for termination				*/ 
+					const b3Scalar	omega=b3Dot(m_ray,w)/rl;	// component of w along the current ray direction
+					alpha=b3Max(omega,alpha);
+					if(((rl-alpha)-(GJK_ACCURACY*rl))<=0)	// stop once rl - alpha is within GJK_ACCURACY relative to rl
+					{/* Return old simplex				*/ 
+						removevertice(m_simplices[m_current]);
+						break;
+					}		
+					/* Reduce simplex						*/ 
+					b3Scalar	weights[4];
+					unsigned int			mask=0;	// bit i set: vertex i of cs is kept in the reduced simplex
+					switch(cs.rank)
+					{
+					case	2:	sqdist=projectorigin(	cs.c[0]->w,
+									cs.c[1]->w,
+									weights,mask);break;
+					case	3:	sqdist=projectorigin(	cs.c[0]->w,
+									cs.c[1]->w,
+									cs.c[2]->w,
+									weights,mask);break;
+					case	4:	sqdist=projectorigin(	cs.c[0]->w,
+									cs.c[1]->w,
+									cs.c[2]->w,
+									cs.c[3]->w,
+									weights,mask);break;
+					}
+					if(sqdist>=0)	// projectorigin returns -1 for a degenerate simplex
+					{/* Valid	*/ 
+						ns.rank		=	0;
+						m_ray		=	b3MakeVector3(0,0,0);
+						m_current	=	next;
+						for(unsigned int i=0,ni=cs.rank;i<ni;++i)
+						{
+							if(mask&(1<<i))
+							{
+								ns.c[ns.rank]		=	cs.c[i];
+								ns.p[ns.rank++]		=	weights[i];
+								m_ray				+=	cs.c[i]->w*weights[i];	// rebuild the ray as the weighted sum of kept vertices
+							}
+							else
+							{
+								m_free[m_nfree++]	=	cs.c[i];	// dropped vertices go back on the free stack
+							}
+						}
+						if(mask==15) m_status=eStatus::Inside;	// all four vertices kept
+					}
+					else
+					{/* Return old simplex				*/ 
+						removevertice(m_simplices[m_current]);
+						break;
+					}
+					m_status=((++iterations)<GJK_MAX_ITERATIONS)?m_status:eStatus::Failed;	// NOTE(review): GJK_MAX_ITERATIONS is not among the #defines visible in this patch chunk - confirm it is defined
+				} while(m_status==eStatus::Valid);
+				m_simplex=&m_simplices[m_current];
+				switch(m_status)
+				{
+				case	eStatus::Valid:		m_distance=m_ray.length();break;
+				case	eStatus::Inside:	m_distance=0;break;
+				default:
+					{
+					}
+				}	
+				return(m_status);
+			}
+			bool					EncloseOrigin()	// Recursively grows *m_simplex towards rank 4; returns true once a rank-4 simplex with non-zero determinant is found.
+			{
+				switch(m_simplex->rank)
+				{
+				case	1:	// single vertex: try a support sample along +/- each coordinate axis
+					{
+						for(unsigned int i=0;i<3;++i)
+						{
+							b3Vector3		axis=b3MakeVector3(0,0,0);
+							axis[i]=1;
+							appendvertice(*m_simplex, axis);
+							if(EncloseOrigin())	return(true);
+							removevertice(*m_simplex);	// undo the attempt before trying the opposite direction
+							appendvertice(*m_simplex,-axis);
+							if(EncloseOrigin())	return(true);
+							removevertice(*m_simplex);
+						}
+					}
+					break;
+				case	2:	// segment: try directions perpendicular to it (segment direction crossed with each axis)
+					{
+						const b3Vector3	d=m_simplex->c[1]->w-m_simplex->c[0]->w;
+						for(unsigned int i=0;i<3;++i)
+						{
+							b3Vector3		axis=b3MakeVector3(0,0,0);
+							axis[i]=1;
+							const b3Vector3	p=b3Cross(d,axis);
+							if(p.length2()>0)	// skip axes parallel to the segment (zero cross product)
+							{
+								appendvertice(*m_simplex, p);
+								if(EncloseOrigin())	return(true);
+								removevertice(*m_simplex);
+								appendvertice(*m_simplex,-p);
+								if(EncloseOrigin())	return(true);
+								removevertice(*m_simplex);
+							}
+						}
+					}
+					break;
+				case	3:	// triangle: try both directions of its normal
+					{
+						const b3Vector3	n=b3Cross(m_simplex->c[1]->w-m_simplex->c[0]->w,
+							m_simplex->c[2]->w-m_simplex->c[0]->w);
+						if(n.length2()>0)
+						{
+							appendvertice(*m_simplex,n);
+							if(EncloseOrigin())	return(true);
+							removevertice(*m_simplex);
+							appendvertice(*m_simplex,-n);
+							if(EncloseOrigin())	return(true);
+							removevertice(*m_simplex);
+						}
+					}
+					break;
+				case	4:	// tetrahedron: accepted when the determinant of its edge vectors is non-zero
+					{
+						if(b3Fabs(det(	m_simplex->c[0]->w-m_simplex->c[3]->w,
+							m_simplex->c[1]->w-m_simplex->c[3]->w,
+							m_simplex->c[2]->w-m_simplex->c[3]->w))>0)
+							return(true);
+					}
+					break;
+				}
+				return(false);
+			}
+			/* Internals	*/ 
+			/* Fill sv with the normalized direction of d and the support point  */
+			/* of m_shape along it. d is not checked for zero length here.       */
+			void				getsupport(const b3Vector3& d,sSV& sv) const
+			{
+				const b3Scalar	dirLength=d.length();
+				sv.d	=	d/dirLength;
+				sv.w	=	m_shape.Support(sv.d,m_verticesA,m_verticesB);
+			}
+			/* Drop the last vertex of simplex and push it back on the free stack. */
+			void				removevertice(sSimplex& simplex)
+			{
+				--simplex.rank;
+				m_free[m_nfree]=simplex.c[simplex.rank];
+				++m_nfree;
+			}
+			/* Take a vertex from the free stack, append it to simplex with      */
+			/* weight 0 and fill it with the support sample for direction v.     */
+			void				appendvertice(sSimplex& simplex,const b3Vector3& v)
+			{
+				const unsigned int	slot=simplex.rank;
+				simplex.p[slot]=0;
+				--m_nfree;
+				simplex.c[slot]=m_free[m_nfree];
+				++simplex.rank;
+				getsupport(v,*simplex.c[slot]);
+			}
+			static b3Scalar		det(const b3Vector3& a,const b3Vector3& b,const b3Vector3& c)	// Determinant of the 3x3 matrix with rows a, b, c (equals the scalar triple product a . (b x c)).
+			{
+				return(	a.y*b.z*c.x+a.z*b.x*c.y-	// the six terms of the cofactor expansion, summed left to right
+					a.x*b.z*c.y-a.y*b.x*c.z+
+					a.x*b.y*c.z-a.z*b.y*c.x);
+			}
+			/* Closest point to the origin on segment [a,b].                      */
+			/* Writes the two weights to w[0..1] and a vertex mask to m           */
+			/* (bit 0 = a kept, bit 1 = b kept). Returns the squared distance,    */
+			/* or -1 when the squared edge length is not above GJK_SIMPLEX2_EPS.  */
+			static b3Scalar		projectorigin(	const b3Vector3& a,
+				const b3Vector3& b,
+				b3Scalar* w,unsigned int& m)
+			{
+				const b3Vector3	edge=b-a;
+				const b3Scalar	edgeLen2=edge.length2();
+				if(!(edgeLen2>GJK_SIMPLEX2_EPS))
+				{
+					return(-1);
+				}
+				const b3Scalar	t(edgeLen2>0?-b3Dot(a,edge)/edgeLen2:0);
+				if(t>=1)
+				{
+					/* Clamped to endpoint b */
+					w[0]=0;
+					w[1]=1;
+					m=2;
+					return(b.length2());
+				}
+				if(t<=0)
+				{
+					/* Clamped to endpoint a */
+					w[0]=1;
+					w[1]=0;
+					m=1;
+					return(a.length2());
+				}
+				/* Interior point of the segment */
+				w[1]=t;
+				w[0]=1-w[1];
+				m=3;
+				return((a+edge*t).length2());
+			}
+			static b3Scalar		projectorigin(	const b3Vector3& a,	// Triangle (a,b,c) overload: writes weights to w[0..2] and a vertex mask to m; returns a squared distance, or -1 when the normal's squared length is not above GJK_SIMPLEX3_EPS.
+				const b3Vector3& b,
+				const b3Vector3& c,
+				b3Scalar* w,unsigned int& m)
+			{
+				static const unsigned int		imd3[]={1,2,0};	// successor index modulo 3
+				const b3Vector3*	vt[]={&a,&b,&c};
+				const b3Vector3		dl[]={a-b,b-c,c-a};	// edge vectors
+				const b3Vector3		n=b3Cross(dl[0],dl[1]);	// triangle normal (not normalized)
+				const b3Scalar		l=n.length2();
+				if(l>GJK_SIMPLEX3_EPS)
+				{
+					b3Scalar	mindist=-1;	// -1 means "no edge candidate found yet"
+					b3Scalar	subw[2]={0.f,0.f};
+					unsigned int			subm(0);
+					for(unsigned int i=0;i<3;++i)
+					{
+						if(b3Dot(*vt[i],b3Cross(dl[i],n))>0)	// presumably: origin lies on the outer side of edge i - TODO confirm sign convention
+						{
+							const unsigned int			j=imd3[i];
+							const b3Scalar	subd(projectorigin(*vt[i],*vt[j],subw,subm));	// project onto edge (i,j) with the segment overload
+							if((mindist<0)||(subd<mindist))
+							{
+								mindist		=	subd;
+								m			=	static_cast<unsigned int>(((subm&1)?1<<i:0)+((subm&2)?1<<j:0));	// remap the segment mask bits to triangle vertex bits
+								w[i]		=	subw[0];
+								w[j]		=	subw[1];
+								w[imd3[j]]	=	0;				
+							}
+						}
+					}
+					if(mindist<0)	// no edge selected: use the projection of the origin onto the triangle's plane
+					{
+						const b3Scalar	d=b3Dot(a,n);	
+						const b3Scalar	s=b3Sqrt(l);	// |n|
+						const b3Vector3	p=n*(d/l);	// projection of the origin onto the plane
+						mindist	=	p.length2();
+						m		=	7;	// all three vertices kept
+						w[0]	=	(b3Cross(dl[1],b-p)).length()/s;	// weights as cross-product magnitudes divided by |n|
+						w[1]	=	(b3Cross(dl[2],c-p)).length()/s;
+						w[2]	=	1-(w[0]+w[1]);
+					}
+					return(mindist);
+				}
+				return(-1);
+			}
+			static b3Scalar		projectorigin(	const b3Vector3& a,	// Tetrahedron (a,b,c,d) overload: writes weights to w[0..3] and a vertex mask to m; returns a squared distance (0 when no face is selected), or -1 when the guard below fails.
+				const b3Vector3& b,
+				const b3Vector3& c,
+				const b3Vector3& d,
+				b3Scalar* w,unsigned int& m)
+			{
+				static const unsigned int		imd3[]={1,2,0};	// successor index modulo 3
+				const b3Vector3*	vt[]={&a,&b,&c,&d};
+				const b3Vector3		dl[]={a-d,b-d,c-d};	// edges from d to the other three vertices
+				const b3Scalar		vl=det(dl[0],dl[1],dl[2]);
+				const bool			ng=(vl*b3Dot(a,b3Cross(b-c,a-b)))<=0;
+				if(ng&&(b3Fabs(vl)>GJK_SIMPLEX4_EPS))	// requires ng and a determinant magnitude above the epsilon
+				{
+					b3Scalar	mindist=-1;	// -1 means "no face candidate found yet"
+					b3Scalar	subw[3]={0.f,0.f,0.f};
+					unsigned int			subm(0);
+					for(unsigned int i=0;i<3;++i)
+					{
+						const unsigned int			j=imd3[i];
+						const b3Scalar	s=vl*b3Dot(d,b3Cross(dl[i],dl[j]));
+						if(s>0)
+						{
+							const b3Scalar	subd=projectorigin(*vt[i],*vt[j],d,subw,subm);	// project onto face (i,j,d) with the triangle overload
+							if((mindist<0)||(subd<mindist))
+							{
+								mindist		=	subd;
+								m			=	static_cast<unsigned int>((subm&1?1<<i:0)+	// remap triangle mask bits: bit 0 -> vertex i, bit 1 -> vertex j, bit 2 -> d (bit 3)
+									(subm&2?1<<j:0)+
+									(subm&4?8:0));
+								w[i]		=	subw[0];
+								w[j]		=	subw[1];
+								w[imd3[j]]	=	0;
+								w[3]		=	subw[2];
+							}
+						}
+					}
+					if(mindist<0)	// no face selected: distance 0 with all four vertices kept
+					{
+						mindist	=	0;
+						m		=	15;
+						w[0]	=	det(c,b,d)/vl;	// weights as determinant ratios
+						w[1]	=	det(a,c,d)/vl;
+						w[2]	=	det(b,a,d)/vl;
+						w[3]	=	1-(w[0]+w[1]+w[2]);
+					}
+					return(mindist);
+				}
+				return(-1);
+			}
+	};
+	// EPA
+	struct	b3EPA
+	{
+		/* Types		*/ 
+		typedef	b3GJK::sSV	sSV;
+		struct	sFace
+		{
+			b3Vector3	n;	// face normal; Evaluate computes dot(n, w) - d for a support point w
+			b3Scalar	d;
+			sSV*		c[3];	// the face's three vertices
+			sFace*		f[3];	// neighbouring face across each edge (written by bind)
+			sFace*		l[2];	// list links: l[0] previous, l[1] next (see append/remove)
+			unsigned char			e[3];	// for each edge, the matching edge index in the neighbour (written by bind)
+			unsigned char			pass;	// stamped with the current pass counter in Evaluate
+		};
+		struct	sList	// doubly linked list of faces threaded through sFace::l
+		{
+			sFace*		root;
+			unsigned int			count;
+			sList() : root(0),count(0)	{}
+		};
+		struct	sHorizon
+		{
+			sFace*		cf;
+			sFace*		ff;
+			unsigned int			nf;
+			sHorizon() : cf(0),ff(0),nf(0)	{}
+		};
+		struct	eStatus { enum _ {
+			Valid,
+			Touching,
+			Degenerated,
+			NonConvex,
+			InvalidHull,		
+			OutOfFaces,
+			OutOfVertices,
+			AccuraryReached,	// NOTE(review): spelled "Accurary"; left as-is because code outside this chunk may reference the name
+			FallBack,
+			Failed		};};
+			/* Fields		*/ 
+			eStatus::_		m_status;
+			b3GJK::sSimplex	m_result;
+			b3Vector3		m_normal;
+			b3Scalar		m_depth;
+			sSV				m_sv_store[EPA_MAX_VERTICES];	// support vertex pool; Evaluate hands out entries via m_nextsv
+			sFace			m_fc_store[EPA_MAX_FACES];	// face pool; Initialize puts every entry on m_stock
+			unsigned int				m_nextsv;	// index of the next unused entry of m_sv_store
+			sList			m_hull;	// faces currently part of the hull
+			sList			m_stock;	// faces available for reuse
+			/* Methods		*/ 
+			b3EPA()	// Default-constructs the lists (empty) and then fills m_stock via Initialize().
+			{
+				Initialize();	
+			}
+			/* Make fa and fb neighbours: edge ea of fa is paired with edge eb   */
+			/* of fb, and each face records the other's face and edge index.     */
+			static inline void		bind(sFace* fa,unsigned int ea,sFace* fb,unsigned int eb)
+			{
+				fa->f[ea]=fb;
+				fa->e[ea]=static_cast<unsigned char>(eb);
+				fb->f[eb]=fa;
+				fb->e[eb]=static_cast<unsigned char>(ea);
+			}
+			/* Push face at the front of list and bump the list's count.         */
+			static inline void		append(sList& list,sFace* face)
+			{
+				sFace* const	previousRoot=list.root;
+				face->l[0]	=	0;
+				face->l[1]	=	previousRoot;
+				if(previousRoot)
+				{
+					previousRoot->l[0]=face;
+				}
+				list.root	=	face;
+				++list.count;
+			}
+			/* Unlink face from list: reconnect its neighbours, move the root    */
+			/* forward if face was the root, and decrement the list's count.     */
+			static inline void		remove(sList& list,sFace* face)
+			{
+				sFace* const	prevFace=face->l[0];
+				sFace* const	nextFace=face->l[1];
+				if(nextFace)
+				{
+					nextFace->l[0]=prevFace;
+				}
+				if(prevFace)
+				{
+					prevFace->l[1]=nextFace;
+				}
+				if(list.root==face)
+				{
+					list.root=nextFace;
+				}
+				--list.count;
+			}
+			/* Reset the result fields and push every face of m_fc_store onto    */
+			/* m_stock, walking the pool from its last entry down to its first.  */
+			void				Initialize()
+			{
+				m_nextsv	=	0;
+				m_depth		=	0;
+				m_normal	=	b3MakeVector3(0,0,0);
+				m_status	=	eStatus::Failed;
+				for(unsigned int remaining=EPA_MAX_FACES;remaining>0;--remaining)
+				{
+					append(m_stock,&m_fc_store[remaining-1]);
+				}
+			}
+			eStatus::_			Evaluate(b3GJK& gjk,const b3Vector3& guess)	// Expands the GJK simplex into a hull and fills m_normal/m_depth/m_result; 'guess' is only read by the fallback path
+			{
+				b3GJK::sSimplex&	simplex=*gjk.m_simplex;
+				if((simplex.rank>1)&&gjk.EncloseOrigin())	// NOTE(review): c[0..3] are read below, so EncloseOrigin() presumably leaves a 4-vertex simplex - confirm in b3GJK
+				{
+					/* Clean up: hand every face of a previous hull back to the stock list */ 
+					while(m_hull.root)
+					{
+						sFace*	f = m_hull.root;
+						remove(m_hull,f);
+						append(m_stock,f);
+					}
+					m_status	=	eStatus::Valid;
+					m_nextsv	=	0;
+					/* Orient simplex: swap two vertices (and their weights) when the determinant is negative */ 
+					if(gjk.det(	simplex.c[0]->w-simplex.c[3]->w,
+						simplex.c[1]->w-simplex.c[3]->w,
+						simplex.c[2]->w-simplex.c[3]->w)<0)
+					{
+						b3Swap(simplex.c[0],simplex.c[1]);
+						b3Swap(simplex.p[0],simplex.p[1]);
+					}
+					/* Build initial hull: four faces, created with forced=true so the sign of d is not checked */ 
+					sFace*	tetra[]={newface(simplex.c[0],simplex.c[1],simplex.c[2],true),
+						newface(simplex.c[1],simplex.c[0],simplex.c[3],true),
+						newface(simplex.c[2],simplex.c[1],simplex.c[3],true),
+						newface(simplex.c[0],simplex.c[2],simplex.c[3],true)};
+					if(m_hull.count==4)	// All four faces were created; otherwise fall through to the fallback
+					{
+						sFace*		best=findbest();
+						sFace		outer=*best;	// Copy of the selected face; the result below is built from this copy
+						unsigned int			pass=0;
+						unsigned int			iterations=0;
+						bind(tetra[0],0,tetra[1],0);	// Link the shared edges of the four faces
+						bind(tetra[0],1,tetra[2],0);
+						bind(tetra[0],2,tetra[3],0);
+						bind(tetra[1],1,tetra[3],2);
+						bind(tetra[1],2,tetra[2],1);
+						bind(tetra[2],2,tetra[3],1);
+						m_status=eStatus::Valid;
+						for(;iterations<EPA_MAX_ITERATIONS;++iterations)
+						{
+							if(m_nextsv<EPA_MAX_VERTICES)
+							{	
+								sHorizon		horizon;
+								sSV*			w=&m_sv_store[m_nextsv++];	// Take the next free vertex slot
+								bool			valid=true;					
+								best->pass	=	(unsigned char)(++pass);	// Mark 'best' with the new pass number so expand() will not re-enter it
+								gjk.getsupport(best->n,*w);	// Support vertex along the normal of the best face
+								const b3Scalar	wdist=b3Dot(best->n,w->w)-best->d;	// How far w lies beyond the plane of 'best'
+								if(wdist>EPA_ACCURACY)
+								{
+									for(unsigned int j=0;(j<3)&&valid;++j)	// Expand across each of the three neighbours; stop at the first failure
+									{
+										valid&=expand(	pass,w,
+											best->f[j],best->e[j],
+											horizon);
+									}
+									if(valid&&(horizon.nf>=3))
+									{
+										bind(horizon.cf,1,horizon.ff,2);	// Close the ring: link the last created face to the first one
+										remove(m_hull,best);
+										append(m_stock,best);
+										best=findbest();
+										outer=*best;
+									} else { 
+										m_status=eStatus::Failed;
+										//m_status=eStatus::InvalidHull;
+									break; }
+								} else { m_status=eStatus::AccuraryReached;break; }
+							} else { m_status=eStatus::OutOfVertices;break; }
+						}
+						const b3Vector3	projection=outer.n*outer.d;	// Point at offset d along the face normal
+						m_normal	=	outer.n;
+						m_depth		=	outer.d;
+						m_result.rank	=	3;
+						m_result.c[0]	=	outer.c[0];
+						m_result.c[1]	=	outer.c[1];
+						m_result.c[2]	=	outer.c[2];
+						m_result.p[0]	=	b3Cross(	outer.c[1]->w-projection,	// Weights: cross-product lengths of the sub-triangles around 'projection'
+							outer.c[2]->w-projection).length();
+						m_result.p[1]	=	b3Cross(	outer.c[2]->w-projection,
+							outer.c[0]->w-projection).length();
+						m_result.p[2]	=	b3Cross(	outer.c[0]->w-projection,
+							outer.c[1]->w-projection).length();
+						const b3Scalar	sum=m_result.p[0]+m_result.p[1]+m_result.p[2];	// NOTE(review): not checked against zero before the divisions below
+						m_result.p[0]	/=	sum;
+						m_result.p[1]	/=	sum;
+						m_result.p[2]	/=	sum;
+						return(m_status);
+					}
+				}
+				/* Fallback: no hull available, report the normalized negated guess with zero depth */ 
+				m_status	=	eStatus::FallBack;
+				m_normal	=	-guess;
+				const b3Scalar	nl=m_normal.length();
+				if(nl>0)
+					m_normal	=	m_normal/nl;
+				else
+					m_normal	=	b3MakeVector3(1,0,0);	// Zero-length guess: use the x axis
+				m_depth	=	0;
+				m_result.rank=1;
+				m_result.c[0]=simplex.c[0];
+				m_result.p[0]=1;	
+				return(m_status);
+			}
+			bool getedgedist(sFace* face, sSV* a, sSV* b, b3Scalar& dist)
+			{
+				// Tests whether the origin lies on the outer side of edge a->b of 'face'.
+				// If it does, 'dist' receives the distance from the origin to the closest
+				// feature of that edge (vertex a, vertex b or the edge itself) and true is
+				// returned. Otherwise 'dist' is left untouched and false is returned.
+				const b3Vector3 edge = b->w - a->w;
+				const b3Vector3 edgeNormal = b3Cross(edge, face->n); // In-plane direction pointing away from the triangle (not normalized)
+				const b3Scalar side = b3Dot(a->w, edgeNormal); // Only the sign matters
+				if(!(side < 0))
+				{
+					return false;
+				}
+				const b3Scalar aAlongEdge = b3Dot(a->w, edge);
+				if(aAlongEdge > 0)
+				{
+					// Vertex a is the closest feature
+					dist = a->w.length();
+					return true;
+				}
+				const b3Scalar bAlongEdge = b3Dot(b->w, edge);
+				if(bAlongEdge < 0)
+				{
+					// Vertex b is the closest feature
+					dist = b->w.length();
+					return true;
+				}
+				// The interior of edge a->b is the closest feature
+				const b3Scalar edgeLen2 = edge.length2();
+				const b3Scalar aDotB = b3Dot(a->w, b->w);
+				dist = b3Sqrt(b3Max((a->w.length2() * b->w.length2() - aDotB * aDotB) / edgeLen2, (b3Scalar)0));
+				return true;
+			}
+			sFace*				newface(sSV* a,sSV* b,sSV* c,bool forced)
+			{
+				// Takes a face from m_stock, links it into m_hull and initialises it from the
+				// vertices a, b, c. Returns the face, or 0 after setting m_status when
+				//  - m_stock is empty (OutOfFaces),
+				//  - the cross product of the triangle edges is not longer than EPA_ACCURACY (Degenerated),
+				//  - the face is not 'forced' and its d is below -EPA_PLANE_EPS (NonConvex).
+				// In the last two cases the face is moved back from m_hull to m_stock.
+				if(!m_stock.root)
+				{
+					// The stock is known to be empty at this point, so the former
+					// "m_stock.root ? OutOfVertices : OutOfFaces" could only ever yield OutOfFaces.
+					m_status = eStatus::OutOfFaces;
+					return 0;
+				}
+				sFace*	face=m_stock.root;
+				remove(m_stock,face);
+				append(m_hull,face);
+				face->pass	=	0;
+				face->c[0]	=	a;
+				face->c[1]	=	b;
+				face->c[2]	=	c;
+				face->n		=	b3Cross(b->w-a->w,c->w-a->w);
+				const b3Scalar	l=face->n.length();
+				if(l>EPA_ACCURACY)
+				{
+					// getedgedist() writes face->d when the origin lies outside one of the edges;
+					// the first edge that reports true wins.
+					if(!(getedgedist(face, a, b, face->d) ||
+						 getedgedist(face, b, c, face->d) ||
+						 getedgedist(face, c, a, face->d)))
+					{
+						// Origin projects to the interior of the triangle
+						// Use distance to triangle plane
+						face->d = b3Dot(a->w, face->n) / l;
+					}
+					face->n /= l;
+					if(forced || (face->d >= -EPA_PLANE_EPS))
+					{
+						return face;
+					}
+					m_status=eStatus::NonConvex;
+				}
+				else
+				{
+					m_status=eStatus::Degenerated;
+				}
+				// Rejected: give the face back to the stock
+				remove(m_hull, face);
+				append(m_stock, face);
+				return 0;
+			}
+			sFace*				findbest()
+			{
+				// Linear scan of m_hull for the face with the smallest d*d; on ties the
+				// face met first is kept. m_hull must not be empty (root is dereferenced).
+				sFace*		closest=m_hull.root;
+				b3Scalar	closestSq=closest->d*closest->d;
+				sFace*		cursor=closest->l[1];
+				while(cursor)
+				{
+					const b3Scalar	candidateSq=cursor->d*cursor->d;
+					if(candidateSq<closestSq)
+					{
+						closestSq=candidateSq;
+						closest=cursor;
+					}
+					cursor=cursor->l[1];
+				}
+				return(closest);
+			}
+			bool				expand(unsigned int pass,sSV* w,sFace* f,unsigned int e,sHorizon& horizon)
+			{
+				// Recursive step of one expansion pass, entered through edge 'e' of face 'f'.
+				// Returns false if 'f' was already marked in this pass, if a new face could
+				// not be created, or if one of the recursive calls fails.
+				static const unsigned int	nextOf[]={1,2,0};
+				static const unsigned int	prevOf[]={2,0,1};
+				if(f->pass==pass)
+				{
+					return(false);
+				}
+				const unsigned int	eNext=nextOf[e];
+				const bool	wBelowPlane=(b3Dot(f->n,w->w)-f->d)<-EPA_PLANE_EPS;
+				if(wBelowPlane)
+				{
+					// w lies below the plane of f by more than EPA_PLANE_EPS: f is kept and a
+					// new face is built on edge e towards w.
+					sFace* const	created=newface(f->c[eNext],f->c[e],w,false);
+					if(!created)
+					{
+						return(false);
+					}
+					bind(created,0,f,e);
+					if(horizon.cf)
+					{
+						bind(horizon.cf,1,created,2);
+					}
+					else
+					{
+						horizon.ff=created;
+					}
+					horizon.cf=created;
+					++horizon.nf;
+					return(true);
+				}
+				// Otherwise mark f for this pass, continue through its two other edges and,
+				// if both succeed, move f from the hull back to the stock.
+				const unsigned int	ePrev=prevOf[e];
+				f->pass		=	(unsigned char)pass;
+				if(!expand(pass,w,f->f[eNext],f->e[eNext],horizon))
+				{
+					return(false);
+				}
+				if(!expand(pass,w,f->f[ePrev],f->e[ePrev],horizon))
+				{
+					return(false);
+				}
+				remove(m_hull,f);
+				append(m_stock,f);
+				return(true);
+			}
+	};
+	//
+	static void	Initialize(const b3Transform&	transA, const b3Transform&	transB,
+								const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, 
+								const b3AlignedObjectArray<b3Vector3>& verticesA,
+								const b3AlignedObjectArray<b3Vector3>& verticesB,
+		b3GjkEpaSolver2::sResults& results,
+		tShape& shape,
+		bool withmargins)
+	{
+		// Resets 'results' to "Separated" with zero witnesses and sets up 'shape' for the
+		// pair (hullA, hullB). verticesA/verticesB are accepted but not read here.
+		const b3Vector3	zero	=	b3MakeVector3(0,0,0);
+		results.status			=	b3GjkEpaSolver2::sResults::Separated;
+		results.witnesses[1]	=	zero;
+		results.witnesses[0]	=	zero;
+		// Shape pair and the relative transforms between the two hulls
+		shape.m_shapes[0]		=	hullA;
+		shape.m_shapes[1]		=	hullB;
+		shape.m_toshape0		=	transA.inverseTimes(transB);
+		shape.m_toshape1		=	transB.getBasis().transposeTimes(transA.getBasis());
+		shape.EnableMargin(withmargins);
+	}
+// Api
+using namespace	gjkepa2_impl2;
+int			b3GjkEpaSolver2::StackSizeRequirement()	// Combined size in bytes of one b3GJK and one b3EPA object
+	return(sizeof(b3GJK)+sizeof(b3EPA));
+bool		b3GjkEpaSolver2::Distance(	const b3Transform&	transA, const b3Transform&	transB,	// Runs GJK without margins; returns true and fills witnesses/normal/distance only when GJK reports Valid
+										const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, 
+										const b3AlignedObjectArray<b3Vector3>& verticesA,
+										const b3AlignedObjectArray<b3Vector3>& verticesB,
+									  const b3Vector3&		guess,
+									  sResults&				results)
+	tShape			shape;
+	Initialize(transA,transB,hullA,hullB,verticesA,verticesB,results,shape,false);	// withmargins = false
+	b3GJK				gjk(verticesA,verticesB);
+	b3GJK::eStatus::_	gjk_status=gjk.Evaluate(shape,guess);
+	if(gjk_status==b3GJK::eStatus::Valid)
+	{
+		b3Vector3	w0=b3MakeVector3(0,0,0);
+		b3Vector3	w1=b3MakeVector3(0,0,0);
+		for(unsigned int i=0;i<gjk.m_simplex->rank;++i)	// Weighted sum of the support points of every simplex vertex
+		{
+			const b3Scalar	p=gjk.m_simplex->p[i];
+			w0+=shape.Support( gjk.m_simplex->c[i]->d,0,verticesA,verticesB)*p;
+			w1+=shape.Support(-gjk.m_simplex->c[i]->d,1,verticesA,verticesB)*p;
+		}
+		results.witnesses[0]	=	transA*w0;
+		results.witnesses[1]	=	transA*w1;	// Both witnesses are mapped through transA
+		results.normal			=	w0-w1;
+		results.distance		=	results.normal.length();
+		results.normal			/=	results.distance>GJK_MIN_DISTANCE?results.distance:1;	// Left unnormalized when the distance is not above GJK_MIN_DISTANCE
+		return(true);
+	}
+	else
+	{
+		results.status	=	gjk_status==b3GJK::eStatus::Inside?	// Inside -> Penetrating, any other status -> GJK_Failed
+			sResults::Penetrating	:
+		sResults::GJK_Failed	;
+		return(false);
+	}
+bool	b3GjkEpaSolver2::Penetration(	const b3Transform&	transA, const b3Transform&	transB,	// Runs GJK and, when it reports Inside, EPA; returns true only when EPA did not end in Failed
+										const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, 
+										const b3AlignedObjectArray<b3Vector3>& verticesA,
+										const b3AlignedObjectArray<b3Vector3>& verticesB,
+									 const b3Vector3&		guess,
+									 sResults&				results,
+									 bool					usemargins)
+	tShape			shape;
+	Initialize(transA,transB,hullA,hullB,verticesA,verticesB,results,shape,usemargins);
+	b3GJK				gjk(verticesA,verticesB);
+	b3GJK::eStatus::_	gjk_status=gjk.Evaluate(shape,guess);
+	switch(gjk_status)
+	{
+	case	b3GJK::eStatus::Inside:
+		{
+			b3EPA				epa;
+			b3EPA::eStatus::_	epa_status=epa.Evaluate(gjk,-guess);	// EPA receives the negated guess
+			if(epa_status!=b3EPA::eStatus::Failed)
+			{
+				b3Vector3	w0=b3MakeVector3(0,0,0);
+				for(unsigned int i=0;i<epa.m_result.rank;++i)	// Weighted sum of the support points on shape 0
+				{
+					w0+=shape.Support(epa.m_result.c[i]->d,0,verticesA,verticesB)*epa.m_result.p[i];
+				}
+				results.status			=	sResults::Penetrating;
+				results.witnesses[0]	=	transA*w0;
+				results.witnesses[1]	=	transA*(w0-epa.m_normal*epa.m_depth);	// Second witness: first one moved by depth along the EPA normal
+				results.normal			=	-epa.m_normal;
+				results.distance		=	-epa.m_depth;	// Reported distance is the negated EPA depth
+				return(true);
+			} else results.status=sResults::EPA_Failed;
+		}
+		break;
+	case	b3GJK::eStatus::Failed:
+		results.status=sResults::GJK_Failed;
+		break;
+		default:	// Any other GJK status: results.status keeps the value set by Initialize()
+					{
+					}
+	}
+	return(false);
+#if 0
+b3Scalar	b3GjkEpaSolver2::SignedDistance(const b3Vector3& position,
+											b3Scalar margin,
+											const b3Transform&	transA,
+											const b3ConvexPolyhedronData& hullA, 
+											const b3AlignedObjectArray<b3Vector3>& verticesA,
+											sResults& results)
+	tShape			shape;
+	btSphereShape	shape1(margin);
+	b3Transform		wtrs1(b3Quaternion(0,0,0,1),position);
+	Initialize(shape0,wtrs0,&shape1,wtrs1,results,shape,false);
+	GJK				gjk;	
+	GJK::eStatus::_	gjk_status=gjk.Evaluate(shape,b3Vector3(1,1,1));
+	if(gjk_status==GJK::eStatus::Valid)
+	{
+		b3Vector3	w0=b3Vector3(0,0,0);
+		b3Vector3	w1=b3Vector3(0,0,0);
+		for(unsigned int i=0;i<gjk.m_simplex->rank;++i)
+		{
+			const b3Scalar	p=gjk.m_simplex->p[i];
+			w0+=shape.Support( gjk.m_simplex->c[i]->d,0)*p;
+			w1+=shape.Support(-gjk.m_simplex->c[i]->d,1)*p;
+		}
+		results.witnesses[0]	=	wtrs0*w0;
+		results.witnesses[1]	=	wtrs0*w1;
+		const b3Vector3	delta=	results.witnesses[1]-
+			results.witnesses[0];
+		const b3Scalar	margin=	shape0->getMarginNonVirtual()+
+			shape1.getMarginNonVirtual();
+		const b3Scalar	length=	delta.length();	
+		results.normal			=	delta/length;
+		results.witnesses[0]	+=	results.normal*margin;
+		return(length-margin);
+	}
+	else
+	{
+		if(gjk_status==GJK::eStatus::Inside)
+		{
+			if(Penetration(shape0,wtrs0,&shape1,wtrs1,gjk.m_ray,results))
+			{
+				const b3Vector3	delta=	results.witnesses[0]-
+					results.witnesses[1];
+				const b3Scalar	length=	delta.length();
+				if (length >= B3_EPSILON)
+					results.normal	=	delta/length;			
+				return(-length);
+			}
+		}	
+	}
+	return(B3_INFINITY);
+bool	b3GjkEpaSolver2::SignedDistance(const btConvexShape*	shape0,
+										const b3Transform&		wtrs0,
+										const btConvexShape*	shape1,
+										const b3Transform&		wtrs1,
+										const b3Vector3&		guess,
+										sResults&				results)
+	if(!Distance(shape0,wtrs0,shape1,wtrs1,guess,results))
+		return(Penetration(shape0,wtrs0,shape1,wtrs1,guess,results,false));
+	else
+		return(true);
+/* Symbols cleanup		*/ 
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h
new file mode 100644
index 00000000..976238a0
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h
@@ -0,0 +1,82 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2008 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the
+use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software in a
+product, an acknowledgment in the product documentation would be appreciated
+but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+GJK-EPA collision solver by Nathanael Presson, 2008
+#ifndef B3_GJK_EPA2_H
+#define B3_GJK_EPA2_H
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+///btGjkEpaSolver contributed under zlib by Nathanael Presson
+struct	b3GjkEpaSolver2
+struct	sResults	// Output of Distance() and Penetration()
+	{
+	enum eStatus
+		{
+		Separated,		/* Shapes do not penetrate												*/ 
+		Penetrating,	/* Shapes are penetrating												*/ 
+		GJK_Failed,		/* GJK phase failed, no big issue, shapes are probably just 'touching'	*/ 
+		EPA_Failed		/* EPA phase failed, bigger problem, need to save parameters and debug	*/ 
+		}		status;
+	b3Vector3	witnesses[2];	// One witness point per shape
+	b3Vector3	normal;
+	b3Scalar	distance;	// Distance() stores the separation length; Penetration() stores the negated EPA depth
+	};
+static int		StackSizeRequirement();	// sizeof(b3GJK)+sizeof(b3EPA)
+static bool		Distance(	 const b3Transform&	transA, const b3Transform&	transB,	// True only when GJK reports a valid separation
+							const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, 
+							const b3AlignedObjectArray<b3Vector3>& verticesA,
+							const b3AlignedObjectArray<b3Vector3>& verticesB,
+							const b3Vector3& guess,
+							sResults& results);
+static bool		Penetration( const b3Transform&	transA, const b3Transform&	transB,	// True only when GJK reports Inside and EPA does not fail
+							const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, 
+							const b3AlignedObjectArray<b3Vector3>& verticesA,
+							const b3AlignedObjectArray<b3Vector3>& verticesB,
+							const b3Vector3& guess,
+							sResults& results,
+							bool usemargins=true);
+#if 0
+static b3Scalar	SignedDistance(	const b3Vector3& position,
+								b3Scalar margin,
+								const btConvexShape* shape,
+								const btTransform& wtrs,
+								sResults& results);
+static bool		SignedDistance(	const btConvexShape* shape0,const btTransform& wtrs0,
+								const btConvexShape* shape1,const btTransform& wtrs1,
+								const b3Vector3& guess,
+								sResults& results);
+#endif //B3_GJK_EPA2_H
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkPairDetector.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkPairDetector.cpp
new file mode 100644
index 00000000..8e78a19e
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkPairDetector.cpp
@@ -0,0 +1,533 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3GjkPairDetector.h"
+#include "Bullet3Common/b3Transform.h"
+#include "b3VoronoiSimplexSolver.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "b3VectorFloat4.h"
+#include "b3GjkEpa.h"
+#include "b3SupportMappings.h"
+//must be above the machine epsilon
+#define REL_ERROR2 b3Scalar(1.0e-6)
+//temp globals, to improve GJK/EPA/penetration calculations
+int gNumDeepPenetrationChecks2 = 0;
+int gNumGjkChecks2 = 0;
+int gGjkSeparatingAxis2=0;
+int gEpaSeparatingAxis2=0;
+b3GjkPairDetector::b3GjkPairDetector(b3VoronoiSimplexSolver* simplexSolver,b3GjkEpaSolver2*	penetrationDepthSolver)
+bool calcPenDepth( b3VoronoiSimplexSolver& simplexSolver,	// Tries Penetration() first, then Distance(); returns true only when Penetration() succeeded
+											  const b3Transform&	transformA, const b3Transform&	transformB,
+	const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, 
+	const b3AlignedObjectArray<b3Vector3>& verticesA,
+	const b3AlignedObjectArray<b3Vector3>& verticesB,
+											  b3Vector3& v, b3Vector3& wWitnessOnA, b3Vector3& wWitnessOnB)
+	(void)v;	// NOTE(review): v is written below, so this unused-marker looks stale
+	(void)simplexSolver;	// simplexSolver is not used by this function
+	b3Vector3	guessVector(transformB.getOrigin()-transformA.getOrigin());	// Initial direction: from A's origin to B's origin
+	b3GjkEpaSolver2::sResults	results;
+	if(b3GjkEpaSolver2::Penetration(transformA,transformB,&hullA,&hullB,verticesA,verticesB,guessVector,results))
+	{
+		wWitnessOnA = results.witnesses[0];
+		wWitnessOnB = results.witnesses[1];
+		v = results.normal;
+		return true;		
+	} 
+	else
+	{
+		if(b3GjkEpaSolver2::Distance(transformA,transformB,&hullA,&hullB,verticesA,verticesB,guessVector,results))
+		{
+			wWitnessOnA = results.witnesses[0];	// Outputs are filled from the distance query, but the function still returns false
+			wWitnessOnB = results.witnesses[1];
+			v = results.normal;
+			return false;
+		}
+	}
+	return false;	// Neither query succeeded; the outputs were not written
+#define dot3F4 b3Dot
+inline void project(const b3ConvexPolyhedronData& hull,  const float4& pos, const b3Quaternion& orn, const float4& dir, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar& min, b3Scalar& max)	// Writes to min/max the extent of the hull's vertices along 'dir'
+	min = FLT_MAX;
+	max = -FLT_MAX;
+	int numVerts = hull.m_numVertices;
+	const float4 localDir = b3QuatRotate(orn.inverse(),dir);	// 'dir' rotated by the inverse orientation, so the stored vertices can be used directly
+	b3Scalar offset = dot3F4(pos,dir);	// Contribution of the position along 'dir', added after the loop
+	for(int i=0;i<numVerts;i++)
+	{
+		//b3Vector3 pt = trans * vertices[m_vertexOffset+i];
+		//b3Scalar dp = pt.dot(dir);
+		b3Vector3 vertex = vertices[hull.m_vertexOffset+i];	// NOTE(review): this copy is never read
+		b3Scalar dp = dot3F4((float4&)vertices[hull.m_vertexOffset+i],localDir);
+		//b3Assert(dp==dpL);
+		if(dp < min)	min = dp;
+		if(dp > max)	max = dp;
+	}
+	if(min>max)	// Only reachable when no vertex updated min/max (e.g. numVerts == 0)
+	{
+		b3Scalar tmp = min;
+		min = max;
+		max = tmp;
+	}
+	min += offset;
+	max += offset;
+static bool TestSepAxis(const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, 	// Returns false when the projections of the two hulls on sep_axis are disjoint; otherwise writes the smaller overlap to 'depth'
+	const float4& posA,const b3Quaternion& ornA,
+	const float4& posB,const b3Quaternion& ornB,
+	float4& sep_axis, const b3AlignedObjectArray<b3Vector3>& verticesA,const b3AlignedObjectArray<b3Vector3>& verticesB,b3Scalar& depth)
+	b3Scalar Min0,Max0;
+	b3Scalar Min1,Max1;
+	project(hullA,posA,ornA,sep_axis,verticesA, Min0, Max0);
+	project(hullB,posB,ornB, sep_axis,verticesB, Min1, Max1);
+	if(Max0<Min1 || Max1<Min0)	// Intervals do not overlap; depth and sep_axis are left unchanged
+		return false;
+	b3Scalar d0 = Max0 - Min1;
+	b3Assert(d0>=0.0f);
+	b3Scalar d1 = Max1 - Min0;
+	b3Assert(d1>=0.0f);
+	if (d0<d1)
+	{
+		depth = d0;
+		sep_axis *=-1;	// The caller's axis is flipped in place when d0 is the smaller overlap
+	} else
+	{
+		depth = d1;
+	}
+	return true;
+bool getClosestPoints(b3GjkPairDetector* gjkDetector, const b3Transform&	transA, const b3Transform&	transB,	// GJK closest-point query with a penetration fallback; returns true (and writes the result* outputs) only for a valid result with negative distance
+	const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, 
+	const b3AlignedObjectArray<b3Vector3>& verticesA,
+	const b3AlignedObjectArray<b3Vector3>& verticesB,
+	b3Scalar maximumDistanceSquared,
+	b3Vector3& resultSepNormal,
+	float& resultSepDistance,
+	b3Vector3& resultPointOnB)
+	//resultSepDistance = maximumDistanceSquared;
+	gjkDetector->m_cachedSeparatingDistance = 0.f;
+	b3Scalar distance=b3Scalar(0.);
+	b3Vector3	normalInB= b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+	b3Vector3 pointOnA,pointOnB;
+	b3Transform localTransA = transA;
+	b3Transform localTransB = transB;
+	b3Vector3 positionOffset = b3MakeVector3(0,0,0);// = (localTransA.getOrigin() + localTransB.getOrigin()) * b3Scalar(0.5);
+	localTransA.getOrigin() -= positionOffset;	// positionOffset is zero, so the local transforms equal the inputs
+	localTransB.getOrigin() -= positionOffset;
+	bool check2d = false;//m_minkowskiA->isConvex2d() && m_minkowskiB->isConvex2d();
+	b3Scalar marginA = 0.f;//m_marginA;
+	b3Scalar marginB = 0.f;//m_marginB;
+	gNumGjkChecks2++;
+	//for CCD we don't use margins
+	if (gjkDetector->m_ignoreMargin)
+	{
+		marginA = b3Scalar(0.);
+		marginB = b3Scalar(0.);
+	}
+	gjkDetector->m_curIter = 0;
+	int gGjkMaxIter = 1000;//this is to catch invalid input, perhaps check for #NaN?
+	gjkDetector->m_cachedSeparatingAxis.setValue(1,1,1);//0,0,0);
+	bool isValid = false;
+	bool checkSimplex = false;
+	bool checkPenetration = true;
+	gjkDetector->m_degenerateSimplex = 0;	// Receives a numeric code naming the loop exit taken below
+	gjkDetector->m_lastUsedMethod = -1;	// Receives a numeric code naming the path that produced the result
+	{
+		b3Scalar squaredDistance = B3_LARGE_FLOAT;
+		b3Scalar delta = -1e30f;//b3Scalar(0.);
+		b3Scalar prevDelta = -1e30f;//b3Scalar(0.);
+		b3Scalar margin = marginA + marginB;
+		b3Scalar bestDeltaN = -1e30f;
+		b3Vector3 bestSepAxis= b3MakeVector3(0,0,0);
+		gjkDetector->m_simplexSolver->reset();
+		for ( ; ; )
+		//while (true)
+		{
+			b3Vector3 seperatingAxisInA = (-gjkDetector->m_cachedSeparatingAxis)* localTransA.getBasis();
+			b3Vector3 seperatingAxisInB = gjkDetector->m_cachedSeparatingAxis* localTransB.getBasis();
+			b3Vector3 pInA = localGetSupportVertexWithoutMargin(seperatingAxisInA,&hullA,verticesA);
+			b3Vector3 qInB = localGetSupportVertexWithoutMargin(seperatingAxisInB,&hullB,verticesB);
+			b3Vector3  pWorld = localTransA(pInA);	
+			b3Vector3  qWorld = localTransB(qInB);
+		spu_printf("got local supporting vertices\n");
+			if (check2d)
+			{
+				pWorld[2] = 0.f;
+				qWorld[2] = 0.f;
+			}
+			b3Vector3 w	= pWorld - qWorld;	// Difference of the two support points
+			delta = gjkDetector->m_cachedSeparatingAxis.dot(w);
+			// potential exit, they don't overlap
+			if ((delta > b3Scalar(0.0)) && (delta * delta > squaredDistance * maximumDistanceSquared)) 
+			{
+				gjkDetector->m_degenerateSimplex = 10;
+				checkSimplex=true;
+				//checkPenetration = false;
+				break;
+			}
+			//exit 0: the new point is already in the simplex, or we didn't come any closer
+			if (gjkDetector->m_simplexSolver->inSimplex(w))
+			{
+				gjkDetector->m_degenerateSimplex = 1;
+				checkSimplex = true;
+				break;
+			}
+			// are we getting any closer ?
+			b3Scalar f0 = squaredDistance - delta;
+			b3Scalar f1 = squaredDistance * REL_ERROR2;
+			if (f0 <= f1)
+			{
+				if (f0 <= b3Scalar(0.))
+				{
+					gjkDetector->m_degenerateSimplex = 2;
+				} else
+				{
+					gjkDetector->m_degenerateSimplex = 11;
+				}
+				checkSimplex = true;
+				break;
+			}
+		spu_printf("addVertex 1\n");
+			//add current vertex to simplex
+			gjkDetector->m_simplexSolver->addVertex(w, pWorld, qWorld);
+		spu_printf("addVertex 2\n");
+			b3Vector3 newCachedSeparatingAxis;
+			//calculate the closest point to the origin (update vector v)
+			if (!gjkDetector->m_simplexSolver->closest(newCachedSeparatingAxis))
+			{
+				gjkDetector->m_degenerateSimplex = 3;
+				checkSimplex = true;
+				break;
+			}
+			if(newCachedSeparatingAxis.length2()<REL_ERROR2)
+            {
+				gjkDetector->m_cachedSeparatingAxis = newCachedSeparatingAxis;
+                gjkDetector->m_degenerateSimplex = 6;
+                checkSimplex = true;
+                break;
+            }
+			b3Scalar previousSquaredDistance = squaredDistance;
+			squaredDistance = newCachedSeparatingAxis.length2();
+#if 0
+///warning: this termination condition leads to some problems in 2d test case see Bullet/Demos/Box2dDemo
+			if (squaredDistance>previousSquaredDistance)
+			{
+				gjkDetector->m_degenerateSimplex = 7;
+				squaredDistance = previousSquaredDistance;
+                checkSimplex = false;
+                break;
+			}
+#endif //
+			//redundant gjkDetector->m_simplexSolver->compute_points(pointOnA, pointOnB);
+			//are we getting any closer ?
+			if (previousSquaredDistance - squaredDistance <= B3_EPSILON * previousSquaredDistance) 
+			{ 
+//				gjkDetector->m_simplexSolver->backup_closest(gjkDetector->m_cachedSeparatingAxis);
+				checkSimplex = true;
+				gjkDetector->m_degenerateSimplex = 12;
+				break;
+			}
+			gjkDetector->m_cachedSeparatingAxis = newCachedSeparatingAxis;
+			  //degeneracy, this is typically due to invalid/uninitialized worldtransforms for a btCollisionObject   
+              if (gjkDetector->m_curIter++ > gGjkMaxIter)   
+              {   
+                      #if defined(DEBUG) || defined (_DEBUG) || defined (DEBUG_SPU_COLLISION_DETECTION)
+                              printf("btGjkPairDetector maxIter exceeded:%i\n",gjkDetector->m_curIter);   
+                              printf("sepAxis=(%f,%f,%f), squaredDistance = %f\n",   
+                              gjkDetector->m_cachedSeparatingAxis.getX(),   
+                              gjkDetector->m_cachedSeparatingAxis.getY(),   
+                              gjkDetector->m_cachedSeparatingAxis.getZ(),   
+                              squaredDistance);
+                      #endif   
+                      break;   
+              } 
+			bool check = (!gjkDetector->m_simplexSolver->fullSimplex());
+			//bool check = (!gjkDetector->m_simplexSolver->fullSimplex() && squaredDistance > B3_EPSILON * gjkDetector->m_simplexSolver->maxVertex());
+			if (!check)
+			{
+				//do we need this backup_closest here ?
+//				gjkDetector->m_simplexSolver->backup_closest(gjkDetector->m_cachedSeparatingAxis);
+				gjkDetector->m_degenerateSimplex = 13;
+				break;
+			}
+		}
+		if (checkSimplex)	// Derive points, normal and distance from the final simplex
+		{
+			gjkDetector->m_simplexSolver->compute_points(pointOnA, pointOnB);
+			normalInB = gjkDetector->m_cachedSeparatingAxis;
+			b3Scalar lenSqr =gjkDetector->m_cachedSeparatingAxis.length2();
+			//valid normal
+			if (lenSqr < 0.0001)
+			{
+				gjkDetector->m_degenerateSimplex = 5;
+			} 
+			if (lenSqr > B3_EPSILON*B3_EPSILON)
+			{
+				b3Scalar rlen = b3Scalar(1.) / b3Sqrt(lenSqr );
+				normalInB *= rlen; //normalize
+				b3Scalar s = b3Sqrt(squaredDistance);
+				b3Assert(s > b3Scalar(0.0));
+				pointOnA -= gjkDetector->m_cachedSeparatingAxis * (marginA / s);
+				pointOnB += gjkDetector->m_cachedSeparatingAxis * (marginB / s);
+				distance = ((b3Scalar(1.)/rlen) - margin);	// Axis length minus the combined margin
+				isValid = true;
+				gjkDetector->m_lastUsedMethod = 1;
+			} else
+			{
+				gjkDetector->m_lastUsedMethod = 2;
+			}
+		}
+		bool catchDegeneratePenetrationCase = 
+			(gjkDetector->m_catchDegeneracies && gjkDetector->m_penetrationDepthSolver && gjkDetector->m_degenerateSimplex && ((distance+margin) < 0.01));
+		//if (checkPenetration && !isValid)
+		if (checkPenetration && (!isValid || catchDegeneratePenetrationCase ))
+		{
+			//penetration case
+			//if there is no way to handle penetrations, bail out
+			if (gjkDetector->m_penetrationDepthSolver)
+			{
+				// Penetration depth case.
+				b3Vector3 tmpPointOnA,tmpPointOnB;
+				gNumDeepPenetrationChecks2++;
+				gjkDetector->m_cachedSeparatingAxis.setZero();
+				bool isValid2 = calcPenDepth( 
+					*gjkDetector->m_simplexSolver, 
+					transA,transB,hullA,hullB,verticesA,verticesB,
+					gjkDetector->m_cachedSeparatingAxis, tmpPointOnA, tmpPointOnB
+					);
+				if (isValid2)
+				{
+					b3Vector3 tmpNormalInB = tmpPointOnB-tmpPointOnA;
+					b3Scalar lenSqr = tmpNormalInB.length2();
+					if (lenSqr <= (B3_EPSILON*B3_EPSILON))	// Witness points coincide: use the axis returned by calcPenDepth instead
+					{
+						tmpNormalInB = gjkDetector->m_cachedSeparatingAxis;
+						lenSqr = gjkDetector->m_cachedSeparatingAxis.length2();
+					}
+					if (lenSqr > (B3_EPSILON*B3_EPSILON))
+					{
+						tmpNormalInB /= b3Sqrt(lenSqr);
+						b3Scalar distance2 = -(tmpPointOnA-tmpPointOnB).length();
+						//only replace valid penetrations when the result is deeper (check)
+						if (!isValid || (distance2 < distance))
+						{
+							distance = distance2;
+							pointOnA = tmpPointOnA;
+							pointOnB = tmpPointOnB;
+							normalInB = tmpNormalInB;
+							isValid = true;
+							gjkDetector->m_lastUsedMethod = 3;
+						} else
+						{
+							gjkDetector->m_lastUsedMethod = 8;
+						}
+					} else
+					{
+						gjkDetector->m_lastUsedMethod = 9;
+					}
+				} else
+				{
+					///this is another degenerate case, where the initial GJK calculation reports a degenerate case
+					///EPA reports no penetration, and the second GJK (using the supporting vector without margin)
+					///reports a valid positive distance. Use the results of the second GJK instead of failing.
+					///thanks to Jacob.Langford for the reproduction case
+					///http://code.google.com/p/bullet/issues/detail?id=250
+					if (gjkDetector->m_cachedSeparatingAxis.length2() > b3Scalar(0.))
+					{
+						b3Scalar distance2 = (tmpPointOnA-tmpPointOnB).length()-margin;
+						//only replace valid distances when the distance is less
+						if (!isValid || (distance2 < distance))
+						{
+							distance = distance2;
+							pointOnA = tmpPointOnA;
+							pointOnB = tmpPointOnB;
+							pointOnA -= gjkDetector->m_cachedSeparatingAxis * marginA ;
+							pointOnB += gjkDetector->m_cachedSeparatingAxis * marginB ;
+							normalInB = gjkDetector->m_cachedSeparatingAxis;
+							normalInB.normalize();
+							isValid = true;
+							gjkDetector->m_lastUsedMethod = 6;
+						} else
+						{
+							gjkDetector->m_lastUsedMethod = 5;
+						}
+					}
+				}
+			}
+		}
+	}
+	if (isValid && (distance < 0))	// Only penetrating results are reported to the caller
+	//if (isValid && ((distance < 0) || (distance*distance < maximumDistanceSquared)))
+	{
+		if (1)//m_fixContactNormalDirection)
+		{
+			///@workaround for sticky convex collisions
+			//in some degenerate cases (usually when the user uses very small margins) 
+			//the contact normal is pointing the wrong direction
+			//so fix it now (until we can deal with all degenerate cases in GJK and EPA)
+			//contact normals need to point from B to A in all cases, so we can simply check if the contact normal really points from B to A
+			//We like to use a dot product of the normal against the difference of the centroids, 
+			//once the centroid is available in the API
+			//until then we use the center of the aabb to approximate the centroid
+			b3Vector3 posA  = localTransA*hullA.m_localCenter;
+			b3Vector3 posB  = localTransB*hullB.m_localCenter;
+			b3Vector3 diff = posA-posB;
+			if (diff.dot(normalInB) < 0.f)
+				normalInB *= -1.f;
+		}
+		gjkDetector->m_cachedSeparatingAxis = normalInB;
+		gjkDetector->m_cachedSeparatingDistance = distance;
+		/*output.addContactPoint(
+			normalInB,
+			pointOnB+positionOffset,
+			distance);
+			*/
+		static float maxPenetrationDistance = 0.f;	// NOTE(review): function-level static shared by all calls; tracks the deepest penetration seen so far
+		if (distance<maxPenetrationDistance)
+		{
+			maxPenetrationDistance = distance;
+			printf("maxPenetrationDistance = %f\n",maxPenetrationDistance);
+		}
+		resultSepNormal = normalInB;
+		resultSepDistance = distance;
+		resultPointOnB = pointOnB+positionOffset;
+		return true;
+	}
+	return false;	// No valid penetrating result; the result* outputs were not written
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkPairDetector.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkPairDetector.h
new file mode 100644
index 00000000..2a0f85a8
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkPairDetector.h
@@ -0,0 +1,84 @@
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+class b3Transform;
+struct b3GjkEpaSolver2;
+class b3VoronoiSimplexSolver;
+struct b3ConvexPolyhedronData;
+B3_ATTRIBUTE_ALIGNED16(struct) b3GjkPairDetector // Per-detector state used by the free function getClosestPoints(): solver pointers, cached separating axis/distance and debug counters. NOTE(review): the braces of this declaration are not present in this excerpt.
+	b3Vector3	m_cachedSeparatingAxis; // overwritten by getClosestPoints() with the reported normal when it succeeds
+	b3GjkEpaSolver2*	m_penetrationDepthSolver; // replaceable through setPenetrationDepthSolver(); ownership is not shown here
+	b3VoronoiSimplexSolver* m_simplexSolver;
+	bool		m_ignoreMargin; // toggled by setIgnoreMargin()
+	b3Scalar	m_cachedSeparatingDistance; // overwritten by getClosestPoints() with the reported distance when it succeeds
+	//some debugging to fix degeneracy problems
+	int			m_lastUsedMethod; // integer code for the code path taken by the last query (getClosestPoints() stores e.g. 5 or 6)
+	int			m_curIter;
+	int			m_degenerateSimplex;
+	int			m_catchDegeneracies;
+	int			m_fixContactNormalDirection; // NOTE(review): the visible tail of getClosestPoints() uses `if (1)` instead of testing this flag
+	b3GjkPairDetector(b3VoronoiSimplexSolver* simplexSolver,b3GjkEpaSolver2*	penetrationDepthSolver);
+	virtual ~b3GjkPairDetector() {};
+	//void	getClosestPoints(,Result& output);
+	void setCachedSeperatingAxis(const b3Vector3& seperatingAxis) // "Seperating" is misspelled in the public name; renaming it would change the interface
+	{
+		m_cachedSeparatingAxis = seperatingAxis;
+	}
+	const b3Vector3& getCachedSeparatingAxis() const // returns a reference to the member, not a copy
+	{
+		return m_cachedSeparatingAxis;
+	}
+	b3Scalar	getCachedSeparatingDistance() const
+	{
+		return m_cachedSeparatingDistance;
+	}
+	void	setPenetrationDepthSolver(b3GjkEpaSolver2*	penetrationDepthSolver) // only stores the pointer
+	{
+		m_penetrationDepthSolver = penetrationDepthSolver;
+	}
+	///don't use setIgnoreMargin, it's for Bullet's internal use
+	void	setIgnoreMargin(bool ignoreMargin)
+	{
+		m_ignoreMargin = ignoreMargin;
+	}
+bool getClosestPoints(b3GjkPairDetector* gjkDetector, const b3Transform&	transA, const b3Transform&	transB, // Free-function query on two convex hulls; per the visible end of its definition it returns true only for a valid result with negative distance.
+	const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, 
+	const b3AlignedObjectArray<b3Vector3>& verticesA,
+	const b3AlignedObjectArray<b3Vector3>& verticesB,
+	b3Scalar maximumDistanceSquared, // NOTE(review): the acceptance test that used this value is commented out in the definition
+	b3Vector3& resultSepNormal, // out: written only when the function returns true
+	float& resultSepDistance, // out: written only when the function returns true
+	b3Vector3& resultPointOnB); // out: written only when the function returns true
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp
new file mode 100644
index 00000000..e9e51d5a
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp
@@ -0,0 +1,390 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3OptimizedBvh.h"
+#include "b3StridingMeshInterface.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax) // Collects one leaf node per triangle of 'triangles' and builds the tree over them; bvhAabbMin/Max are only used to set up quantization. NOTE(review): the function's braces are not present in this excerpt.
+	m_useQuantization = useQuantizedAabbCompression;
+	// NodeArray	triangleNodes;
+	struct	NodeTriangleCallback : public b3InternalTriangleIndexCallback // appends one unquantized leaf (triangle AABB, part id, triangle index) per reported triangle
+	{
+		NodeArray&	m_triangleNodes;
+		NodeTriangleCallback& operator=(NodeTriangleCallback& other) // copies the referenced array's contents; the reference itself is not rebound
+		{
+			m_triangleNodes.copyFromArray(other.m_triangleNodes);
+			return *this;
+		}
+		NodeTriangleCallback(NodeArray&	triangleNodes)
+			:m_triangleNodes(triangleNodes)
+		{
+		}
+		virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int  triangleIndex) // reads triangle[0..2]
+		{
+			b3OptimizedBvhNode node;
+			b3Vector3	aabbMin,aabbMax;
+			aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); // start from an inverted box so the first setMin/setMax wins
+			aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); 
+			aabbMin.setMin(triangle[0]);
+			aabbMax.setMax(triangle[0]);
+			aabbMin.setMin(triangle[1]);
+			aabbMax.setMax(triangle[1]);
+			aabbMin.setMin(triangle[2]);
+			aabbMax.setMax(triangle[2]);
+			//with quantization?
+			node.m_aabbMinOrg = aabbMin;
+			node.m_aabbMaxOrg = aabbMax;
+			node.m_escapeIndex = -1; // -1 is what walkStacklessTree() tests to recognise a leaf
+			//for child nodes
+			node.m_subPart = partId;
+			node.m_triangleIndex = triangleIndex;
+			m_triangleNodes.push_back(node);
+		}
+	};
+	struct	QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback // appends one quantized leaf per reported triangle
+	{
+		QuantizedNodeArray&	m_triangleNodes;
+		const b3QuantizedBvh* m_optimizedTree; // for quantization
+		QuantizedNodeTriangleCallback& operator=(QuantizedNodeTriangleCallback& other) // copies array contents and the tree pointer
+		{
+			m_triangleNodes.copyFromArray(other.m_triangleNodes);
+			m_optimizedTree = other.m_optimizedTree;
+			return *this;
+		}
+		QuantizedNodeTriangleCallback(QuantizedNodeArray&	triangleNodes,const b3QuantizedBvh* tree)
+			:m_triangleNodes(triangleNodes),m_optimizedTree(tree)
+		{
+		}
+		virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int  triangleIndex) // reads triangle[0..2]
+		{
+			// The partId and triangle index must fit in the same (positive) integer
+			b3Assert(partId < (1<<MAX_NUM_PARTS_IN_BITS));
+			b3Assert(triangleIndex < (1<<(31-MAX_NUM_PARTS_IN_BITS)));
+			//negative indices are reserved for escapeIndex
+			b3Assert(triangleIndex>=0);
+			b3QuantizedBvhNode node;
+			b3Vector3	aabbMin,aabbMax;
+			aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
+			aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); 
+			aabbMin.setMin(triangle[0]);
+			aabbMax.setMax(triangle[0]);
+			aabbMin.setMin(triangle[1]);
+			aabbMax.setMax(triangle[1]);
+			aabbMin.setMin(triangle[2]);
+			aabbMax.setMax(triangle[2]);
+			//PCK: add these checks for zero dimensions of aabb
+			const b3Scalar MIN_AABB_DIMENSION = b3Scalar(0.002); // extents below this are padded
+			const b3Scalar MIN_AABB_HALF_DIMENSION = b3Scalar(0.001); // padding added on each side of a too-thin axis
+			if (aabbMax.getX() - aabbMin.getX() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setX(aabbMax.getX() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setX(aabbMin.getX() - MIN_AABB_HALF_DIMENSION);
+			}
+			if (aabbMax.getY() - aabbMin.getY() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setY(aabbMax.getY() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setY(aabbMin.getY() - MIN_AABB_HALF_DIMENSION);
+			}
+			if (aabbMax.getZ() - aabbMin.getZ() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setZ(aabbMax.getZ() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setZ(aabbMin.getZ() - MIN_AABB_HALF_DIMENSION);
+			}
+			m_optimizedTree->quantize(&node.m_quantizedAabbMin[0],aabbMin,0);
+			m_optimizedTree->quantize(&node.m_quantizedAabbMax[0],aabbMax,1);
+			node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | triangleIndex; // part id in the high bits, triangle index in the low bits (ranges asserted above)
+			m_triangleNodes.push_back(node);
+		}
+	};
+	int numLeafNodes = 0;
+	if (m_useQuantization)
+	{
+		//initialize quantization values
+		setQuantizationValues(bvhAabbMin,bvhAabbMax);
+		QuantizedNodeTriangleCallback	callback(m_quantizedLeafNodes,this);
+		triangles->InternalProcessAllTriangles(&callback,m_bvhAabbMin,m_bvhAabbMax);
+		//now we have an array of leafnodes in m_quantizedLeafNodes
+		numLeafNodes = m_quantizedLeafNodes.size();
+		m_quantizedContiguousNodes.resize(2*numLeafNodes); // one slot per leaf plus one per internal node that buildTree() creates
+	} else
+	{
+		NodeTriangleCallback	callback(m_leafNodes);
+		b3Vector3 aabbMin=b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
+		b3Vector3 aabbMax=b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
+		triangles->InternalProcessAllTriangles(&callback,aabbMin,aabbMax);
+		//now we have an array of leafnodes in m_leafNodes
+		numLeafNodes = m_leafNodes.size();
+		m_contiguousNodes.resize(2*numLeafNodes);
+	}
+	m_curNodeIndex = 0;
+	buildTree(0,numLeafNodes); // NOTE(review): buildTree() asserts numIndices>0, so a mesh with no triangles is not handled here
+	///if the entire tree is smaller than the subtree size limit, we need to create a header info for the tree
+	if(m_useQuantization && !m_SubtreeHeaders.size())
+	{
+		b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
+		subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
+		subtree.m_rootNodeIndex = 0;
+		subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex();
+	}
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+	//PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary
+	m_quantizedLeafNodes.clear();
+	m_leafNodes.clear();
+void	b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax) // Resets quantization to [aabbMin,aabbMax], recomputes every node AABB from the mesh and refreshes all subtree headers; does nothing when quantization is off.
+	if (m_useQuantization)
+	{
+		setQuantizationValues(aabbMin,aabbMax);
+		updateBvhNodes(meshInterface,0,m_curNodeIndex,0); // whole node range
+		///now update all subtree headers
+		int i;
+		for (i=0;i<m_SubtreeHeaders.size();i++)
+		{
+			b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
+			subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
+		}
+	} else
+	{
+	} // intentionally empty in this version: unquantized trees are left untouched
+void	b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax) // Refits only the subtrees whose quantized AABB overlaps [aabbMin,aabbMax]; quantization values are left as they are.
+	//incrementally initialize quantization values
+	b3Assert(m_useQuantization);
+	b3Assert(aabbMin.getX() > m_bvhAabbMin.getX()); // the query box must lie strictly inside the current BVH bounds on every axis
+	b3Assert(aabbMin.getY() > m_bvhAabbMin.getY());
+	b3Assert(aabbMin.getZ() > m_bvhAabbMin.getZ());
+	b3Assert(aabbMax.getX() < m_bvhAabbMax.getX());
+	b3Assert(aabbMax.getY() < m_bvhAabbMax.getY());
+	b3Assert(aabbMax.getZ() < m_bvhAabbMax.getZ());
+	///we should update all quantization values, using updateBvhNodes(meshInterface);
+	///but we only update chunks that overlap the given aabb
+	unsigned short	quantizedQueryAabbMin[3];
+	unsigned short	quantizedQueryAabbMax[3];
+	quantize(&quantizedQueryAabbMin[0],aabbMin,0);
+	quantize(&quantizedQueryAabbMax[0],aabbMax,1);
+	int i;
+	for (i=0;i<this->m_SubtreeHeaders.size();i++)
+	{
+		b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
+		//PCK: unsigned instead of bool
+		unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+		if (overlap != 0)
+		{
+			updateBvhNodes(meshInterface,subtree.m_rootNodeIndex,subtree.m_rootNodeIndex+subtree.m_subtreeSize,i); // node range [root, root+size) of this subtree
+			subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]); // header AABB follows the refreshed root node
+		}
+	}
+// Recomputes the quantized AABBs of the nodes in [firstNode,endNode) from the current mesh data.
+// Nodes are visited from the highest index down: the children of an internal node are stored
+// after it, so they are already refreshed when the parent merges them.
+//   meshInterface  source of vertex/index buffers and of the mesh scaling
+//   firstNode      first node index to update (inclusive)
+//   endNode        one past the last node index to update
+//   index          unused
+// NOTE(review): the function's enclosing braces are not present in this excerpt.
+void	b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index)
+	(void)index;
+	b3Assert(m_useQuantization);
+	int curNodeSubPart=-1; // mesh part currently locked; -1 means none
+	//get access info to trianglemesh data
+		const unsigned char *vertexbase = 0;
+		int numverts = 0;
+		PHY_ScalarType type = PHY_INTEGER;
+		int stride = 0;
+		const unsigned char *indexbase = 0;
+		int indexstride = 0;
+		int numfaces = 0;
+		PHY_ScalarType indicestype = PHY_INTEGER;
+		b3Vector3	triangleVerts[3];
+		b3Vector3	aabbMin,aabbMax;
+		const b3Vector3& meshScaling = meshInterface->getScaling();
+		int i;
+		for (i=endNode-1;i>=firstNode;i--)
+		{
+			b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i];
+			if (curNode.isLeafNode())
+			{
+				//recalc aabb from triangle data
+				int nodeSubPart = curNode.getPartId();
+				int nodeTriangleIndex = curNode.getTriangleIndex();
+				if (nodeSubPart != curNodeSubPart)
+				{
+					// switch the locked part only when the leaf belongs to a different one
+					if (curNodeSubPart >= 0)
+						meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
+					meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,	type,stride,&indexbase,indexstride,numfaces,indicestype,nodeSubPart);
+					curNodeSubPart = nodeSubPart;
+					b3Assert(indicestype==PHY_INTEGER||indicestype==PHY_SHORT);
+				}
+				//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
+				unsigned int* gfxbase = (unsigned int*)(indexbase+nodeTriangleIndex*indexstride);
+				for (int j=2;j>=0;j--)
+				{
+					// indices are read as 16-bit when indicestype is PHY_SHORT, otherwise as 32-bit
+					int graphicsindex = indicestype==PHY_SHORT?((unsigned short*)gfxbase)[j]:gfxbase[j];
+					if (type == PHY_FLOAT)
+					{
+						float* graphicsbase = (float*)(vertexbase+graphicsindex*stride);
+						triangleVerts[j] = b3MakeVector3(
+							graphicsbase[0]*meshScaling.getX(),
+							graphicsbase[1]*meshScaling.getY(),
+							graphicsbase[2]*meshScaling.getZ());
+					}
+					else
+					{
+						// every vertex type other than PHY_FLOAT is read as double
+						double* graphicsbase = (double*)(vertexbase+graphicsindex*stride);
+						triangleVerts[j] = b3MakeVector3( b3Scalar(graphicsbase[0]*meshScaling.getX()), b3Scalar(graphicsbase[1]*meshScaling.getY()), b3Scalar(graphicsbase[2]*meshScaling.getZ()));
+					}
+				}
+				aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
+				aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); 
+				aabbMin.setMin(triangleVerts[0]);
+				aabbMax.setMax(triangleVerts[0]);
+				aabbMin.setMin(triangleVerts[1]);
+				aabbMax.setMax(triangleVerts[1]);
+				aabbMin.setMin(triangleVerts[2]);
+				aabbMax.setMax(triangleVerts[2]);
+				quantize(&curNode.m_quantizedAabbMin[0],aabbMin,0);
+				quantize(&curNode.m_quantizedAabbMax[0],aabbMax,1);
+			} else
+			{
+				//combine aabb from both children
+				b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i+1];
+				b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i+2] :
+					&m_quantizedContiguousNodes[i+1+leftChildNode->getEscapeIndex()];
+				{
+					// 'axis' replaces an inner 'i' that shadowed the node index of the outer loop
+					for (int axis=0;axis<3;axis++)
+					{
+						curNode.m_quantizedAabbMin[axis] = leftChildNode->m_quantizedAabbMin[axis];
+						if (curNode.m_quantizedAabbMin[axis]>rightChildNode->m_quantizedAabbMin[axis])
+							curNode.m_quantizedAabbMin[axis]=rightChildNode->m_quantizedAabbMin[axis];
+						curNode.m_quantizedAabbMax[axis] = leftChildNode->m_quantizedAabbMax[axis];
+						if (curNode.m_quantizedAabbMax[axis] < rightChildNode->m_quantizedAabbMax[axis])
+							curNode.m_quantizedAabbMax[axis] = rightChildNode->m_quantizedAabbMax[axis];
+					}
+				}
+			}
+		}
+		// release the part that is still locked, if any leaf was visited
+		if (curNodeSubPart >= 0)
+			meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
+///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'; all three arguments are forwarded unchanged to b3QuantizedBvh::deSerializeInPlace
+b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
+	b3QuantizedBvh* bvh = b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer,i_dataBufferSize,i_swapEndian);
+	//we don't add additional data so just do a static downcast (base b3QuantizedBvh* to derived b3OptimizedBvh*)
+	return static_cast<b3OptimizedBvh*>(bvh);
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h
new file mode 100644
index 00000000..0272ef83
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h
@@ -0,0 +1,65 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///Contains contributions from Disney Studios
+#include "b3QuantizedBvh.h"
+class b3StridingMeshInterface;
+///The b3OptimizedBvh extends the b3QuantizedBvh to create an AABB tree for triangle meshes, through the b3StridingMeshInterface.
+B3_ATTRIBUTE_ALIGNED16(class) b3OptimizedBvh : public b3QuantizedBvh // NOTE(review): braces and access specifiers of this declaration are not present in this excerpt
+	b3OptimizedBvh();
+	virtual ~b3OptimizedBvh();
+	void	build(b3StridingMeshInterface* triangles,bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax); // builds the tree from all triangles; the AABB arguments are used only when quantization is requested
+	void	refit(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin,const b3Vector3& aabbMax); // full refit; a no-op for unquantized trees
+	void	refitPartial(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin, const b3Vector3& aabbMax); // refits only subtrees overlapping the given box; asserts quantization is on
+	void	updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index); // recomputes node AABBs in [firstNode,endNode); 'index' is unused by the implementation
+	/// Data buffer MUST be 16 byte aligned
+	virtual bool serializeInPlace(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const // forwards to b3QuantizedBvh::serialize and returns its result
+	{
+		return b3QuantizedBvh::serialize(o_alignedDataBuffer,i_dataBufferSize,i_swapEndian);
+	}
+	///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
+	static b3OptimizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
+#endif //B3_OPTIMIZED_BVH_H
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp
new file mode 100644
index 00000000..52027e11
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp
@@ -0,0 +1,1301 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3QuantizedBvh.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+#define RAYAABB2
+b3QuantizedBvh::b3QuantizedBvh() : // Default state: quantization off, no subtree headers, unbounded BVH AABB.
+					m_bulletVersion(B3_BULLET_VERSION),
+					m_useQuantization(false), 
+					//m_traversalMode(TRAVERSAL_STACKLESS)
+					//m_traversalMode(TRAVERSAL_RECURSIVE)
+					,m_subtreeHeaderCount(0) //PCK: add this line; NOTE(review): the active m_traversalMode initializer that should precede this comma is not present in this excerpt
+	m_bvhAabbMin.setValue(-B3_INFINITY,-B3_INFINITY,-B3_INFINITY);
+	m_bvhAabbMax.setValue(B3_INFINITY,B3_INFINITY,B3_INFINITY);
+void b3QuantizedBvh::buildInternal() // Builds the quantized tree from leaves the caller already stored in m_quantizedLeafNodes; forces quantization on.
+	///assumes that caller filled in the m_quantizedLeafNodes
+	m_useQuantization = true;
+	int numLeafNodes = 0;
+	if (m_useQuantization) // always true here, it was set just above
+	{
+		//now we have an array of leafnodes in m_quantizedLeafNodes
+		numLeafNodes = m_quantizedLeafNodes.size();
+		m_quantizedContiguousNodes.resize(2*numLeafNodes); // one slot per leaf plus one per internal node that buildTree() creates
+	}
+	m_curNodeIndex = 0;
+	buildTree(0,numLeafNodes); // NOTE(review): buildTree() asserts numIndices>0, so an empty leaf array is not handled here
+	///if the entire tree is smaller than the subtree size limit, we need to create a header info for the tree
+	if(m_useQuantization && !m_SubtreeHeaders.size())
+	{
+		b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
+		subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
+		subtree.m_rootNodeIndex = 0;
+		subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex();
+	}
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+	//PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary
+	m_quantizedLeafNodes.clear();
+	m_leafNodes.clear();
+///just for debugging, to visualize the individual patches/subtrees
+b3Vector3 color[4]= // NOTE(review): the initializer braces (and any preprocessor guard) are not present in this excerpt; no use of this array is visible in the surrounding code
+	b3Vector3(1,0,0),
+	b3Vector3(0,1,0),
+	b3Vector3(0,0,1),
+	b3Vector3(0,1,1)
+void	b3QuantizedBvh::setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin) // Stores the BVH bounds grown by quantizationMargin on every side, derives the per-axis quantization scale and switches quantization on.
+	//enlarge the AABB to avoid division by zero when initializing the quantization values
+	b3Vector3 clampValue =b3MakeVector3(quantizationMargin,quantizationMargin,quantizationMargin);
+	m_bvhAabbMin = bvhAabbMin - clampValue;
+	m_bvhAabbMax = bvhAabbMax + clampValue;
+	b3Vector3 aabbSize = m_bvhAabbMax - m_bvhAabbMin;
+	m_bvhQuantization = b3MakeVector3(b3Scalar(65533.0),b3Scalar(65533.0),b3Scalar(65533.0)) / aabbSize; // per-axis scale: 65533 quantization steps across the enlarged extent
+	m_useQuantization = true;
+int gStackDepth = 0; // current buildTree() recursion depth; plain global with external linkage
+int gMaxStackDepth = 0; // deepest buildTree() recursion seen so far
+void	b3QuantizedBvh::buildTree	(int startIndex,int endIndex) // Recursively emits the nodes for leaves [startIndex,endIndex) into the contiguous node array, starting at m_curNodeIndex. NOTE(review): the function's braces are not present in this excerpt.
+	gStackDepth++;
+	if (gStackDepth > gMaxStackDepth)
+		gMaxStackDepth = gStackDepth;
+	int splitAxis, splitIndex, i;
+	int numIndices =endIndex-startIndex;
+	int curIndex = m_curNodeIndex; // node index at entry; used below to measure how many nodes this call emitted
+	b3Assert(numIndices>0);
+	if (numIndices==1)
+	{
+		gStackDepth--;
+		assignInternalNodeFromLeafNode(m_curNodeIndex,startIndex); // a single leaf becomes the next node; no internal node is created
+		m_curNodeIndex++;
+		return;	
+	}
+	//calculate Best Splitting Axis and where to split it. Sort the incoming 'leafNodes' array within range 'startIndex/endIndex'.
+	splitAxis = calcSplittingAxis(startIndex,endIndex);
+	splitIndex = sortAndCalcSplittingIndex(startIndex,endIndex,splitAxis);
+	int internalNodeIndex = m_curNodeIndex;
+	//set the min aabb to 'inf' or a max value, and set the max aabb to a -inf/minimum value.
+	//the aabb will be expanded during buildTree/mergeInternalNodeAabb with actual node values
+	setInternalNodeAabbMin(m_curNodeIndex,m_bvhAabbMax);//can't use b3Vector3(B3_INFINITY,B3_INFINITY,B3_INFINITY)) because of quantization
+	setInternalNodeAabbMax(m_curNodeIndex,m_bvhAabbMin);//can't use b3Vector3(-B3_INFINITY,-B3_INFINITY,-B3_INFINITY)) because of quantization
+	for (i=startIndex;i<endIndex;i++)
+	{
+		mergeInternalNodeAabb(m_curNodeIndex,getAabbMin(i),getAabbMax(i));
+	}
+	m_curNodeIndex++;
+	//internalNode->m_escapeIndex;
+	int leftChildNodexIndex = m_curNodeIndex; // the left child is emitted immediately after its parent
+	//build left child tree
+	buildTree(startIndex,splitIndex);
+	int rightChildNodexIndex = m_curNodeIndex; // the right child follows the whole left subtree
+	//build right child tree
+	buildTree(splitIndex,endIndex);
+	gStackDepth--;
+	int escapeIndex = m_curNodeIndex - curIndex; // node count of this subtree, i.e. the offset that skips it
+	if (m_useQuantization)
+	{
+		//escapeIndex is the number of nodes of this subtree
+		const int sizeQuantizedNode =sizeof(b3QuantizedBvhNode);
+		const int treeSizeInBytes = escapeIndex * sizeQuantizedNode;
+		if (treeSizeInBytes > MAX_SUBTREE_SIZE_IN_BYTES)
+		{
+			updateSubtreeHeaders(leftChildNodexIndex,rightChildNodexIndex); // this subtree is too big for one header, so try registering its children
+		}
+	} else
+	{
+	}
+	setInternalNodeEscapeIndex(internalNodeIndex,escapeIndex);
+void	b3QuantizedBvh::updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex) // Adds a subtree header for each of the two child subtrees whose byte size is at most MAX_SUBTREE_SIZE_IN_BYTES.
+	b3Assert(m_useQuantization);
+	b3QuantizedBvhNode& leftChildNode = m_quantizedContiguousNodes[leftChildNodexIndex];
+	int leftSubTreeSize = leftChildNode.isLeafNode() ? 1 : leftChildNode.getEscapeIndex(); // size in nodes
+	int leftSubTreeSizeInBytes =  leftSubTreeSize * static_cast<int>(sizeof(b3QuantizedBvhNode));
+	b3QuantizedBvhNode& rightChildNode = m_quantizedContiguousNodes[rightChildNodexIndex];
+	int rightSubTreeSize = rightChildNode.isLeafNode() ? 1 : rightChildNode.getEscapeIndex(); // size in nodes
+	int rightSubTreeSizeInBytes =  rightSubTreeSize *  static_cast<int>(sizeof(b3QuantizedBvhNode));
+	if(leftSubTreeSizeInBytes <= MAX_SUBTREE_SIZE_IN_BYTES)
+	{
+		b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
+		subtree.setAabbFromQuantizeNode(leftChildNode);
+		subtree.m_rootNodeIndex = leftChildNodexIndex;
+		subtree.m_subtreeSize = leftSubTreeSize;
+	}
+	if(rightSubTreeSizeInBytes <= MAX_SUBTREE_SIZE_IN_BYTES)
+	{
+		b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
+		subtree.setAabbFromQuantizeNode(rightChildNode);
+		subtree.m_rootNodeIndex = rightChildNodexIndex;
+		subtree.m_subtreeSize = rightSubTreeSize;
+	}
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+int	b3QuantizedBvh::sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis) // Partitions leaves [startIndex,endIndex) around the mean AABB centre on splitAxis and returns the index where the second half starts.
+	int i;
+	int splitIndex =startIndex;
+	int numIndices = endIndex - startIndex;
+	b3Scalar splitValue;
+	b3Vector3 means=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+	for (i=startIndex;i<endIndex;i++)
+	{
+		b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i));
+		means+=center;
+	}
+	means *= (b3Scalar(1.)/(b3Scalar)numIndices);
+	splitValue = means[splitAxis];
+	//partition leafNodes so all values larger than splitValue come first, and the remaining values start from 'splitIndex'.
+	for (i=startIndex;i<endIndex;i++)
+	{
+		b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i));
+		if (center[splitAxis] > splitValue)
+		{
+			//swap
+			swapLeafNodes(i,splitIndex);
+			splitIndex++;
+		}
+	}
+	//if the splitIndex causes unbalanced trees, fix this by using the center in between startIndex and endIndex
+	//otherwise the tree-building might fail due to stack-overflows in certain cases.
+	//unbalanced1 is unsafe: it can cause stack overflows
+	//bool unbalanced1 = ((splitIndex==startIndex) || (splitIndex == (endIndex-1)));
+	//unbalanced2 should work too: always use center (perfect balanced trees)	
+	//bool unbalanced2 = true;
+	//this should be safe too:
+	int rangeBalancedIndices = numIndices/3;
+	bool unbalanced = ((splitIndex<=(startIndex+rangeBalancedIndices)) || (splitIndex >=(endIndex-1-rangeBalancedIndices))); // split fell into roughly the first or last third of the range
+	if (unbalanced)
+	{
+		splitIndex = startIndex+ (numIndices>>1); // fall back to the midpoint
+	}
+	bool unbal = (splitIndex==startIndex) || (splitIndex == (endIndex)); // sanity check: neither side may be empty
+	(void)unbal;
+	b3Assert(!unbal);
+	return splitIndex;
+int	b3QuantizedBvh::calcSplittingAxis(int startIndex,int endIndex) // Returns the axis along which the AABB centres of leaves [startIndex,endIndex) have the largest variance.
+	int i;
+	b3Vector3 means=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+	b3Vector3 variance=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+	int numIndices = endIndex-startIndex;
+	for (i=startIndex;i<endIndex;i++)
+	{
+		b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i));
+		means+=center;
+	}
+	means *= (b3Scalar(1.)/(b3Scalar)numIndices);
+	for (i=startIndex;i<endIndex;i++)
+	{
+		b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i));
+		b3Vector3 diff2 = center-means;
+		diff2 = diff2 * diff2; // component-wise square
+		variance += diff2;
+	}
+	variance *= (b3Scalar(1.)/	((b3Scalar)numIndices-1)	); // divides by n-1: zero divisor if numIndices==1; buildTree() only calls this with at least two leaves
+	return variance.maxAxis();
+void	b3QuantizedBvh::reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const // Reports leaves overlapping [aabbMin,aabbMax] to nodeCallback, picking the traversal routine from m_useQuantization and m_traversalMode.
+	//either choose recursive traversal (walkTree) or stackless (walkStacklessTree)
+	if (m_useQuantization)
+	{
+		///quantize query AABB
+		unsigned short int quantizedQueryAabbMin[3];
+		unsigned short int quantizedQueryAabbMax[3];
+		quantizeWithClamp(quantizedQueryAabbMin,aabbMin,0);
+		quantizeWithClamp(quantizedQueryAabbMax,aabbMax,1);
+		switch (m_traversalMode) // NOTE(review): the `case` labels of this switch are not present in this excerpt
+		{
+				walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,0,m_curNodeIndex);
+			break;
+				walkStacklessQuantizedTreeCacheFriendly(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+			break;
+			{
+				const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[0];
+				walkRecursiveQuantizedTreeAgainstQueryAabb(rootNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+			}
+			break;
+		default:
+			//unsupported
+			b3Assert(0);
+		}
+	} else
+	{
+		walkStacklessTree(nodeCallback,aabbMin,aabbMax); // unquantized trees always use the stackless walk
+	}
+static int b3s_maxIterations = 0; // largest iteration count seen by the stackless walks in this file; written without synchronization
+void	b3QuantizedBvh::walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const // Linear walk over the unquantized node array; calls nodeCallback->processNode for every leaf whose AABB overlaps the query box.
+	b3Assert(!m_useQuantization);
+	const b3OptimizedBvhNode* rootNode = &m_contiguousNodes[0]; // cursor into the node array, advanced in step with curIndex
+	int escapeIndex, curIndex = 0;
+	int walkIterations = 0;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
+	while (curIndex < m_curNodeIndex)
+	{
+		//catch bugs in tree data
+		b3Assert (walkIterations < m_curNodeIndex);
+		walkIterations++;
+		aabbOverlap = b3TestAabbAgainstAabb2(aabbMin,aabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg);
+		isLeafNode = rootNode->m_escapeIndex == -1; // leaves carry escape index -1
+		//PCK: unsigned instead of bool
+		if (isLeafNode && (aabbOverlap != 0))
+		{
+			nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex);
+		} 
+		//PCK: unsigned instead of bool
+		if ((aabbOverlap != 0) || isLeafNode)
+		{
+			rootNode++; // step to the next node: first child of an overlapping internal node, or whatever follows a leaf
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->m_escapeIndex; // non-overlapping internal node: jump past its whole subtree
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (b3s_maxIterations < walkIterations)
+		b3s_maxIterations = walkIterations;
+///this was the original recursive traversal, before we optimized towards stackless traversal
+void	b3QuantizedBvh::walkTree(b3OptimizedBvhNode* rootNode,b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const // NOTE(review): uses names (TestAabbAgainstAabb2, m_leftChild, processNode(node)) that no other function in this excerpt uses; presumably disabled legacy code whose comment/#if delimiters are missing here — confirm
+	bool isLeafNode, aabbOverlap = TestAabbAgainstAabb2(aabbMin,aabbMax,rootNode->m_aabbMin,rootNode->m_aabbMax);
+	if (aabbOverlap)
+	{
+		isLeafNode = (!rootNode->m_leftChild && !rootNode->m_rightChild); // a node without children is a leaf
+		if (isLeafNode)
+		{
+			nodeCallback->processNode(rootNode);
+		} else
+		{
+			walkTree(rootNode->m_leftChild,nodeCallback,aabbMin,aabbMax);
+			walkTree(rootNode->m_rightChild,nodeCallback,aabbMin,aabbMax);
+		}
+	}
+void b3QuantizedBvh::walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const // Recursive walk of the quantized tree from currentNode; reports every overlapping leaf to nodeCallback.
+	b3Assert(m_useQuantization);
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
+	//PCK: unsigned instead of bool
+	aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,currentNode->m_quantizedAabbMin,currentNode->m_quantizedAabbMax);
+	isLeafNode = currentNode->isLeafNode();
+	//PCK: unsigned instead of bool
+	if (aabbOverlap != 0)
+	{
+		if (isLeafNode)
+		{
+			nodeCallback->processNode(currentNode->getPartId(),currentNode->getTriangleIndex());
+		} else
+		{
+			//process left and right children
+			const b3QuantizedBvhNode* leftChildNode = currentNode+1; // the left child is stored directly after its parent
+			walkRecursiveQuantizedTreeAgainstQueryAabb(leftChildNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+			const b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? leftChildNode+1:leftChildNode+leftChildNode->getEscapeIndex(); // the right child follows the entire left subtree
+			walkRecursiveQuantizedTreeAgainstQueryAabb(rightChildNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+		}
+	}		
+void	b3QuantizedBvh::walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const // Linear walk of the unquantized tree reporting leaves hit by the ray (or box cast with extents aabbMin/aabbMax). NOTE(review): startNodeIndex/endNodeIndex are not referenced in the visible body; the walk always covers nodes 0..m_curNodeIndex.
+	b3Assert(!m_useQuantization);
+	const b3OptimizedBvhNode* rootNode = &m_contiguousNodes[0];
+	int escapeIndex, curIndex = 0;
+	int walkIterations = 0;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap=0;
+	unsigned rayBoxOverlap=0;
+	b3Scalar lambda_max = 1.0;
+		/* Quick pruning by the AABB that encloses the ray segment (this tree is not quantized) */
+	b3Vector3 rayAabbMin = raySource;
+	b3Vector3 rayAabbMax = raySource;
+	rayAabbMin.setMin(rayTarget);
+	rayAabbMax.setMax(rayTarget);
+	/* Add box cast extents to bounding box */
+	rayAabbMin += aabbMin;
+	rayAabbMax += aabbMax;
+#ifdef RAYAABB2
+	b3Vector3 rayDir = (rayTarget-raySource);
+	rayDir.normalize (); // NOTE(review): raySource == rayTarget would normalize a zero vector — confirm callers exclude it
+	lambda_max = rayDir.dot(rayTarget-raySource); // unit direction dotted with the segment, i.e. the segment length
+	///what about division by zero? --> the inverse is set to B3_LARGE_FLOAT for zero components
+	b3Vector3 rayDirectionInverse;
+	rayDirectionInverse[0] = rayDir[0] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[0];
+	rayDirectionInverse[1] = rayDir[1] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[1];
+	rayDirectionInverse[2] = rayDir[2] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[2];
+	unsigned int sign[3] = { rayDirectionInverse[0] < 0.0, rayDirectionInverse[1] < 0.0, rayDirectionInverse[2] < 0.0}; // 1 where the direction component is negative
+	b3Vector3 bounds[2];
+	while (curIndex < m_curNodeIndex)
+	{
+		b3Scalar param = 1.0;
+		//catch bugs in tree data
+		b3Assert (walkIterations < m_curNodeIndex);
+		walkIterations++;
+		bounds[0] = rootNode->m_aabbMinOrg;
+		bounds[1] = rootNode->m_aabbMaxOrg;
+		/* Add box cast extents */
+		bounds[0] -= aabbMax;
+		bounds[1] -= aabbMin;
+		aabbOverlap = b3TestAabbAgainstAabb2(rayAabbMin,rayAabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg);
+		//perhaps profile if it is worth doing the aabbOverlap test first
+#ifdef RAYAABB2
+			///careful with this check: need to check division by zero (above) and fix the unQuantize method
+			///thanks Joerg/hiker for the reproduction case!
+			///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858
+		rayBoxOverlap = aabbOverlap ? b3RayAabb2 (raySource, rayDirectionInverse, sign, bounds, param, 0.0f, lambda_max) : false;
+		b3Vector3 normal; // NOTE(review): the #else/#endif lines that should separate the two rayBoxOverlap variants are not present in this excerpt
+		rayBoxOverlap = b3RayAabb(raySource, rayTarget,bounds[0],bounds[1],param, normal);
+		isLeafNode = rootNode->m_escapeIndex == -1; // leaves carry escape index -1
+		//PCK: unsigned instead of bool
+		if (isLeafNode && (rayBoxOverlap != 0))
+		{
+			nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex);
+		} 
+		//PCK: unsigned instead of bool
+		if ((rayBoxOverlap != 0) || isLeafNode)
+		{
+			rootNode++; // step to the next node in the array
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->m_escapeIndex; // missed internal node: jump past its whole subtree
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (b3s_maxIterations < walkIterations)
+		b3s_maxIterations = walkIterations;
+void	b3QuantizedBvh::walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const // Iterative ray / box-cast walk over quantized nodes in [startNodeIndex, endNodeIndex); leaves that pass the tests are reported through nodeCallback->processNode.
+	b3Assert(m_useQuantization); // this walk reads m_quantizedContiguousNodes
+	int curIndex = startNodeIndex;
+	int walkIterations = 0;
+	int subTreeSize = endNodeIndex - startNodeIndex; // only read by the iteration-count assert below
+	(void)subTreeSize;
+	const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex]; // despite the name this is the moving cursor over the node array
+	int escapeIndex;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned boxBoxOverlap = 0;
+	unsigned rayBoxOverlap = 0;
+	b3Scalar lambda_max = 1.0;
+#ifdef RAYAABB2 // NOTE(review): no matching #endif is visible in this excerpt - confirm against the upstream file
+	b3Vector3 rayDirection = (rayTarget-raySource);
+	rayDirection.normalize ();
+	lambda_max = rayDirection.dot(rayTarget-raySource); // segment projected onto its own normalized direction
+	///division by zero is avoided: a zero component is replaced by B3_LARGE_FLOAT; after these lines rayDirection holds the per-axis inverse
+	rayDirection[0] = rayDirection[0] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[0];
+	rayDirection[1] = rayDirection[1] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[1];
+	rayDirection[2] = rayDirection[2] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[2];
+	unsigned int sign[3] = { rayDirection[0] < 0.0, rayDirection[1] < 0.0, rayDirection[2] < 0.0}; // 1 for each axis whose inverse direction is negative
+	/* Quick pruning by quantized box */
+	b3Vector3 rayAabbMin = raySource;
+	b3Vector3 rayAabbMax = raySource;
+	rayAabbMin.setMin(rayTarget);
+	rayAabbMax.setMax(rayTarget);
+	/* Add box cast extents to bounding box */
+	rayAabbMin += aabbMin;
+	rayAabbMax += aabbMax;
+	unsigned short int quantizedQueryAabbMin[3];
+	unsigned short int quantizedQueryAabbMax[3];
+	quantizeWithClamp(quantizedQueryAabbMin,rayAabbMin,0); // presumably the last argument (0 / 1) selects the rounding direction - confirm in quantizeWithClamp
+	quantizeWithClamp(quantizedQueryAabbMax,rayAabbMax,1);
+	while (curIndex < endNodeIndex)
+	{
+		//some code snippet to debugDraw aabb, to visually analyze bvh structure
+		static int drawPatch = 0; // index of the single node whose box gets drawn
+		//need some global access to a debugDrawer
+		extern b3IDebugDraw* debugDrawerPtr;
+		if (curIndex==drawPatch)
+		{
+			b3Vector3 aabbMin,aabbMax; // these locals shadow the aabbMin/aabbMax parameters inside this scope
+			aabbMin = unQuantize(rootNode->m_quantizedAabbMin);
+			aabbMax = unQuantize(rootNode->m_quantizedAabbMax);
+			b3Vector3	color(1,0,0);
+			debugDrawerPtr->drawAabb(aabbMin,aabbMax,color);
+		}
+		//catch bugs in tree data
+		b3Assert (walkIterations < subTreeSize);
+		walkIterations++;
+		//PCK: unsigned instead of bool
+		// only interested if this is closer than any previous hit
+		b3Scalar param = 1.0;
+		rayBoxOverlap = 0; // stays 0 unless the quantized box test below succeeds
+		boxBoxOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
+		isLeafNode = rootNode->isLeafNode();
+		if (boxBoxOverlap)
+		{
+			b3Vector3 bounds[2];
+			bounds[0] = unQuantize(rootNode->m_quantizedAabbMin);
+			bounds[1] = unQuantize(rootNode->m_quantizedAabbMax);
+			/* Add box cast extents */
+			bounds[0] -= aabbMax; // min corner moved by the cast box's max extent
+			bounds[1] -= aabbMin; // max corner moved by the cast box's min extent
+#if 0 // NOTE(review): the end of this disabled comparison block is not visible in this excerpt - confirm
+			b3Vector3 normal;
+			bool ra2 = b3RayAabb2 (raySource, rayDirection, sign, bounds, param, 0.0, lambda_max);
+			bool ra = b3RayAabb (raySource, rayTarget, bounds[0], bounds[1], param, normal);
+			if (ra2 != ra)
+			{
+				printf("functions don't match\n");
+			}
+#ifdef RAYAABB2
+			///careful with this check: need to check division by zero (above) and fix the unQuantize method
+			///thanks Joerg/hiker for the reproduction case!
+			///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858
+			//B3_PROFILE("b3RayAabb2");
+			rayBoxOverlap = b3RayAabb2 (raySource, rayDirection, sign, bounds, param, 0.0f, lambda_max);
+			rayBoxOverlap = true;//b3RayAabb(raySource, rayTarget, bounds[0], bounds[1], param, normal); NOTE(review): as shown this overwrites the b3RayAabb2 result above; an #else/#endif pair appears to be missing - confirm
+		}
+		if (isLeafNode && rayBoxOverlap)
+		{
+			nodeCallback->processNode(rootNode->getPartId(),rootNode->getTriangleIndex());
+		}
+		//PCK: unsigned instead of bool
+		if ((rayBoxOverlap != 0) || isLeafNode) // hit or leaf: step to the next node in array order
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->getEscapeIndex(); // missed internal node: jump ahead by its escape index
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (b3s_maxIterations < walkIterations) // keep the largest iteration count seen so far
+		b3s_maxIterations = walkIterations;
+void	b3QuantizedBvh::walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const // Iterative AABB query over quantized nodes in [startNodeIndex, endNodeIndex); each overlapping leaf is reported through nodeCallback->processNode.
+	b3Assert(m_useQuantization); // this walk reads m_quantizedContiguousNodes
+	int curIndex = startNodeIndex;
+	int walkIterations = 0;
+	int subTreeSize = endNodeIndex - startNodeIndex; // only read by the iteration-count assert below
+	(void)subTreeSize;
+	const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex]; // despite the name this is the moving cursor over the node array
+	int escapeIndex;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
+	while (curIndex < endNodeIndex)
+	{
+		//some code snippet to debugDraw aabb, to visually analyze bvh structure
+		static int drawPatch = 0; // index of the single node whose box gets drawn
+		//need some global access to a debugDrawer
+		extern b3IDebugDraw* debugDrawerPtr;
+		if (curIndex==drawPatch)
+		{
+			b3Vector3 aabbMin,aabbMax;
+			aabbMin = unQuantize(rootNode->m_quantizedAabbMin);
+			aabbMax = unQuantize(rootNode->m_quantizedAabbMax);
+			b3Vector3	color(1,0,0);
+			debugDrawerPtr->drawAabb(aabbMin,aabbMax,color);
+		}
+		//catch bugs in tree data
+		b3Assert (walkIterations < subTreeSize);
+		walkIterations++;
+		//PCK: unsigned instead of bool
+		aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
+		isLeafNode = rootNode->isLeafNode();
+		if (isLeafNode && aabbOverlap)
+		{
+			nodeCallback->processNode(rootNode->getPartId(),rootNode->getTriangleIndex());
+		} 
+		//PCK: unsigned instead of bool
+		if ((aabbOverlap != 0) || isLeafNode) // overlap or leaf: step to the next node in array order
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->getEscapeIndex(); // non-overlapping internal node: jump ahead by its escape index
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (b3s_maxIterations < walkIterations) // keep the largest iteration count seen so far
+		b3s_maxIterations = walkIterations;
+//This traversal can be called from Playstation 3 SPU. It tests the query box against every subtree header and walks only the subtrees whose header box overlaps.
+void	b3QuantizedBvh::walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const
+	b3Assert(m_useQuantization); // subtree headers and the walk below both use quantized bounds
+	int i;
+	for (i=0;i<this->m_SubtreeHeaders.size();i++)
+	{
+		const b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
+		//PCK: unsigned instead of bool
+		unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+		if (overlap != 0)
+		{
+			walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,
+				subtree.m_rootNodeIndex, // first node of this subtree
+				subtree.m_rootNodeIndex+subtree.m_subtreeSize); // one past its last node
+		}
+	}
+void	b3QuantizedBvh::reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const // Plain ray query: forwards to the box-cast query with zero extents.
+	reportBoxCastOverlappingNodex(nodeCallback,raySource,rayTarget,b3MakeVector3(0,0,0),b3MakeVector3(0,0,0)); // both extent vectors are (0,0,0)
+void	b3QuantizedBvh::reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const // Box-cast query over the whole tree: picks the quantized or non-quantized stackless walk based on m_useQuantization.
+	//always use stackless
+	if (m_useQuantization)
+	{
+		walkStacklessQuantizedTreeAgainstRay(nodeCallback, raySource, rayTarget, aabbMin, aabbMax, 0, m_curNodeIndex); // node range [0, m_curNodeIndex)
+	}
+	else
+	{
+		walkStacklessTreeAgainstRay(nodeCallback, raySource, rayTarget, aabbMin, aabbMax, 0, m_curNodeIndex); // same range arguments for the non-quantized walk
+	}
+	/*
+	{
+		//recursive traversal
+		b3Vector3 qaabbMin = raySource;
+		b3Vector3 qaabbMax = raySource;
+		qaabbMin.setMin(rayTarget);
+		qaabbMax.setMax(rayTarget);
+		qaabbMin += aabbMin;
+		qaabbMax += aabbMax;
+		reportAabbOverlappingNodex(nodeCallback,qaabbMin,qaabbMax);
+	}
+	*/
+void	b3QuantizedBvh::swapLeafNodes(int i,int splitIndex) // Exchanges leaf entries i and splitIndex in whichever leaf array is active (quantized or not).
+	if (m_useQuantization)
+	{
+			b3QuantizedBvhNode tmp = m_quantizedLeafNodes[i]; // three-step swap through a temporary copy
+			m_quantizedLeafNodes[i] = m_quantizedLeafNodes[splitIndex];
+			m_quantizedLeafNodes[splitIndex] = tmp;
+	} else
+	{
+			b3OptimizedBvhNode tmp = m_leafNodes[i]; // same swap on the non-quantized leaf array
+			m_leafNodes[i] = m_leafNodes[splitIndex];
+			m_leafNodes[splitIndex] = tmp;
+	}
+void	b3QuantizedBvh::assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex) // Copies leaf entry leafNodeIndex into slot internalNode of the contiguous node array matching m_useQuantization.
+	if (m_useQuantization)
+	{
+		m_quantizedContiguousNodes[internalNode] = m_quantizedLeafNodes[leafNodeIndex]; // quantized leaf -> quantized contiguous array
+	} else
+	{
+		m_contiguousNodes[internalNode] = m_leafNodes[leafNodeIndex]; // non-quantized leaf -> non-quantized contiguous array
+	}
+//PCK: include
+#include <new> // placement new is used by the serialization code further down
+#if 0 // NOTE(review): no matching #endif is visible in this excerpt - confirm where this disabled region ends
+//PCK: consts (in the visible code these are referenced only from commented-out alignment expressions)
+static const unsigned BVH_ALIGNMENT = 16;
+static const unsigned BVH_ALIGNMENT_MASK = BVH_ALIGNMENT-1; // low-bit mask; works because BVH_ALIGNMENT is a power of two
+static const unsigned BVH_ALIGNMENT_BLOCKS = 2;
+unsigned int b3QuantizedBvh::getAlignmentSerializationPadding() // Extra bytes added to the serialized size; NOTE(review): no return statement is visible in this excerpt - confirm against the upstream file
+	// I changed this to 0 since the extra padding is not needed or used.
+unsigned b3QuantizedBvh::calculateSerializeBufferSize() const // Byte size of the flat layout: object header + padding + subtree headers + node array.
+	unsigned baseSize = sizeof(b3QuantizedBvh) + getAlignmentSerializationPadding();
+	baseSize += sizeof(b3BvhSubtreeInfo) * m_subtreeHeaderCount;
+	if (m_useQuantization)
+	{
+		return baseSize + m_curNodeIndex * sizeof(b3QuantizedBvhNode); // node array counted with the quantized node size
+	}
+	return baseSize + m_curNodeIndex * sizeof(b3OptimizedBvhNode); // otherwise counted with the non-quantized node size
+bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBufferSize */, bool i_swapEndian) const // Writes this BVH into o_alignedDataBuffer as header object, node array, then subtree headers; byte-swaps fields when i_swapEndian is set. Returns true.
+	b3Assert(m_subtreeHeaderCount == m_SubtreeHeaders.size());
+	m_subtreeHeaderCount = m_SubtreeHeaders.size(); // re-syncs the cached count (assigned from a const method); NOTE(review): the size/null/alignment check right below is commented out, so the buffer is not validated here
+/*	if (i_dataBufferSize < calculateSerializeBufferSize() || o_alignedDataBuffer == NULL || (((unsigned)o_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0))
+	{
+		///check alignedment for buffer?
+		b3Assert(0);
+		return false;
+	}
+	b3QuantizedBvh *targetBvh = (b3QuantizedBvh *)o_alignedDataBuffer;
+	// construct the class so the virtual function table, etc will be set up
+	// Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor
+	new (targetBvh) b3QuantizedBvh;
+	if (i_swapEndian)
+	{
+		targetBvh->m_curNodeIndex = static_cast<int>(b3SwapEndian(m_curNodeIndex));
+		b3SwapVector3Endian(m_bvhAabbMin,targetBvh->m_bvhAabbMin);
+		b3SwapVector3Endian(m_bvhAabbMax,targetBvh->m_bvhAabbMax);
+		b3SwapVector3Endian(m_bvhQuantization,targetBvh->m_bvhQuantization);
+		targetBvh->m_traversalMode = (b3TraversalMode)b3SwapEndian(m_traversalMode);
+		targetBvh->m_subtreeHeaderCount = static_cast<int>(b3SwapEndian(m_subtreeHeaderCount));
+	}
+	else
+	{
+		targetBvh->m_curNodeIndex = m_curNodeIndex;
+		targetBvh->m_bvhAabbMin = m_bvhAabbMin;
+		targetBvh->m_bvhAabbMax = m_bvhAabbMax;
+		targetBvh->m_bvhQuantization = m_bvhQuantization;
+		targetBvh->m_traversalMode = m_traversalMode;
+		targetBvh->m_subtreeHeaderCount = m_subtreeHeaderCount;
+	}
+	targetBvh->m_useQuantization = m_useQuantization; // copied without swapping in either mode
+	unsigned char *nodeData = (unsigned char *)targetBvh;
+	nodeData += sizeof(b3QuantizedBvh); // node data starts right after the header object
+	unsigned sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+	int nodeCount = m_curNodeIndex;
+	if (m_useQuantization)
+	{
+		targetBvh->m_quantizedContiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount); // point the target array at the output buffer so the element writes below land in it
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = static_cast<int>(b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex));
+			}
+		}
+		else
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex;
+			}
+		}
+		nodeData += sizeof(b3QuantizedBvhNode) * nodeCount; // advance past the node array just written
+		// this clears the pointer in the member variable it doesn't really do anything to the data
+		// it does call the destructor on the contained objects, but they are all classes with no destructor defined
+		// so the memory (which is not freed) is left alone
+		targetBvh->m_quantizedContiguousNodes.initializeFromBuffer(NULL, 0, 0);
+	}
+	else
+	{
+		targetBvh->m_contiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount); // same scheme for the non-quantized node array
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				b3SwapVector3Endian(m_contiguousNodes[nodeIndex].m_aabbMinOrg, targetBvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg);
+				b3SwapVector3Endian(m_contiguousNodes[nodeIndex].m_aabbMaxOrg, targetBvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg);
+				targetBvh->m_contiguousNodes[nodeIndex].m_escapeIndex = static_cast<int>(b3SwapEndian(m_contiguousNodes[nodeIndex].m_escapeIndex));
+				targetBvh->m_contiguousNodes[nodeIndex].m_subPart = static_cast<int>(b3SwapEndian(m_contiguousNodes[nodeIndex].m_subPart));
+				targetBvh->m_contiguousNodes[nodeIndex].m_triangleIndex = static_cast<int>(b3SwapEndian(m_contiguousNodes[nodeIndex].m_triangleIndex));
+			}
+		}
+		else
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				targetBvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg = m_contiguousNodes[nodeIndex].m_aabbMinOrg;
+				targetBvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg = m_contiguousNodes[nodeIndex].m_aabbMaxOrg;
+				targetBvh->m_contiguousNodes[nodeIndex].m_escapeIndex = m_contiguousNodes[nodeIndex].m_escapeIndex;
+				targetBvh->m_contiguousNodes[nodeIndex].m_subPart = m_contiguousNodes[nodeIndex].m_subPart;
+				targetBvh->m_contiguousNodes[nodeIndex].m_triangleIndex = m_contiguousNodes[nodeIndex].m_triangleIndex;
+			}
+		}
+		nodeData += sizeof(b3OptimizedBvhNode) * nodeCount; // advance past the node array just written
+		// this clears the pointer in the member variable it doesn't really do anything to the data
+		// it does call the destructor on the contained objects, but they are all classes with no destructor defined
+		// so the memory (which is not freed) is left alone
+		targetBvh->m_contiguousNodes.initializeFromBuffer(NULL, 0, 0);
+	}
+	sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+	// Now serialize the subtree headers
+	targetBvh->m_SubtreeHeaders.initializeFromBuffer(nodeData, m_subtreeHeaderCount, m_subtreeHeaderCount);
+	if (i_swapEndian) // NOTE(review): unlike the non-swapped branch below, this branch does not zero m_padding - confirm whether that is intended
+	{
+		for (int i = 0; i < m_subtreeHeaderCount; i++)
+		{
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+			targetBvh->m_SubtreeHeaders[i].m_rootNodeIndex = static_cast<int>(b3SwapEndian(m_SubtreeHeaders[i].m_rootNodeIndex));
+			targetBvh->m_SubtreeHeaders[i].m_subtreeSize = static_cast<int>(b3SwapEndian(m_SubtreeHeaders[i].m_subtreeSize));
+		}
+	}
+	else
+	{
+		for (int i = 0; i < m_subtreeHeaderCount; i++)
+		{
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = (m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = (m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = (m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = (m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = (m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = (m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+			targetBvh->m_SubtreeHeaders[i].m_rootNodeIndex = (m_SubtreeHeaders[i].m_rootNodeIndex);
+			targetBvh->m_SubtreeHeaders[i].m_subtreeSize = (m_SubtreeHeaders[i].m_subtreeSize);
+			// need to clear padding in destination buffer
+			targetBvh->m_SubtreeHeaders[i].m_padding[0] = 0;
+			targetBvh->m_SubtreeHeaders[i].m_padding[1] = 0;
+			targetBvh->m_SubtreeHeaders[i].m_padding[2] = 0;
+		}
+	}
+	nodeData += sizeof(b3BvhSubtreeInfo) * m_subtreeHeaderCount;
+	// this clears the pointer in the member variable it doesn't really do anything to the data
+	// it does call the destructor on the contained objects, but they are all classes with no destructor defined
+	// so the memory (which is not freed) is left alone
+	targetBvh->m_SubtreeHeaders.initializeFromBuffer(NULL, 0, 0);
+	// this wipes the virtual function table pointer at the start of the buffer for the class
+	*((void**)o_alignedDataBuffer) = NULL;
+	return true;
+b3QuantizedBvh *b3QuantizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian) // Turns a buffer laid out as header object, node array, subtree headers into a usable b3QuantizedBvh in place. Returns NULL for a NULL buffer or when the computed size exceeds i_dataBufferSize.
+	if (i_alignedDataBuffer == NULL)// || (((unsigned)i_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0))
+	{
+		return NULL;
+	}
+	b3QuantizedBvh *bvh = (b3QuantizedBvh *)i_alignedDataBuffer;
+	if (i_swapEndian) // header fields are byte-swapped inside the buffer itself
+	{
+		bvh->m_curNodeIndex = static_cast<int>(b3SwapEndian(bvh->m_curNodeIndex));
+		b3UnSwapVector3Endian(bvh->m_bvhAabbMin);
+		b3UnSwapVector3Endian(bvh->m_bvhAabbMax);
+		b3UnSwapVector3Endian(bvh->m_bvhQuantization);
+		bvh->m_traversalMode = (b3TraversalMode)b3SwapEndian(bvh->m_traversalMode);
+		bvh->m_subtreeHeaderCount = static_cast<int>(b3SwapEndian(bvh->m_subtreeHeaderCount));
+	}
+	unsigned int calculatedBufSize = bvh->calculateSerializeBufferSize(); // NOTE(review): computed from counts read out of the buffer, in unsigned arithmetic; if buffers can be untrusted, confirm negative or huge counts are rejected
+	b3Assert(calculatedBufSize <= i_dataBufferSize);
+	if (calculatedBufSize > i_dataBufferSize)
+	{
+		return NULL;
+	}
+	unsigned char *nodeData = (unsigned char *)bvh;
+	nodeData += sizeof(b3QuantizedBvh); // node data starts right after the header object
+	unsigned sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+	int nodeCount = bvh->m_curNodeIndex;
+	// Must call placement new to fill in virtual function table, etc, but we don't want to overwrite most data, so call a special version of the constructor
+	// Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor
+	new (bvh) b3QuantizedBvh(*bvh, false);
+	if (bvh->m_useQuantization)
+	{
+		bvh->m_quantizedContiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount); // the array now refers to the node data inside the buffer
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = static_cast<int>(b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex));
+			}
+		}
+		nodeData += sizeof(b3QuantizedBvhNode) * nodeCount; // advance past the node array
+	}
+	else
+	{
+		bvh->m_contiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount); // same scheme for the non-quantized node array
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				b3UnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg);
+				b3UnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg);
+				bvh->m_contiguousNodes[nodeIndex].m_escapeIndex = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_escapeIndex));
+				bvh->m_contiguousNodes[nodeIndex].m_subPart = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_subPart));
+				bvh->m_contiguousNodes[nodeIndex].m_triangleIndex = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_triangleIndex));
+			}
+		}
+		nodeData += sizeof(b3OptimizedBvhNode) * nodeCount; // advance past the node array
+	}
+	sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+	// Now deserialize the subtree headers, which follow the node array in the buffer
+	bvh->m_SubtreeHeaders.initializeFromBuffer(nodeData, bvh->m_subtreeHeaderCount, bvh->m_subtreeHeaderCount);
+	if (i_swapEndian)
+	{
+		for (int i = 0; i < bvh->m_subtreeHeaderCount; i++)
+		{
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+			bvh->m_SubtreeHeaders[i].m_rootNodeIndex = static_cast<int>(b3SwapEndian(bvh->m_SubtreeHeaders[i].m_rootNodeIndex));
+			bvh->m_SubtreeHeaders[i].m_subtreeSize = static_cast<int>(b3SwapEndian(bvh->m_SubtreeHeaders[i].m_subtreeSize));
+		}
+	}
+	return bvh;
+// Constructor that prevents b3Vector3's default constructor from being called; the second argument is unnamed and unused in this signature
+b3QuantizedBvh::b3QuantizedBvh(b3QuantizedBvh &self, bool /* ownsMemory */) : // NOTE(review): the member-initializer list and body are not visible in this excerpt - confirm against the upstream file
+void b3QuantizedBvh::deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData) // Fills this BVH from the float-precision data struct: bounds, flags, both node arrays and the subtree headers are copied element by element.
+	m_bvhAabbMax.deSerializeFloat(quantizedBvhFloatData.m_bvhAabbMax);
+	m_bvhAabbMin.deSerializeFloat(quantizedBvhFloatData.m_bvhAabbMin);
+	m_bvhQuantization.deSerializeFloat(quantizedBvhFloatData.m_bvhQuantization);
+	m_curNodeIndex = quantizedBvhFloatData.m_curNodeIndex;
+	m_useQuantization = quantizedBvhFloatData.m_useQuantization!=0; // stored value is turned into a bool: any non-zero means true
+	{
+		int numElem = quantizedBvhFloatData.m_numContiguousLeafNodes; // this count sizes m_contiguousNodes, despite the "LeafNodes" in its name
+		m_contiguousNodes.resize(numElem);
+		if (numElem)
+		{
+			b3OptimizedBvhNodeFloatData* memPtr = quantizedBvhFloatData.m_contiguousNodesPtr; // NOTE(review): dereferenced without a null check whenever numElem is non-zero
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_contiguousNodes[i].m_aabbMaxOrg.deSerializeFloat(memPtr->m_aabbMaxOrg);
+				m_contiguousNodes[i].m_aabbMinOrg.deSerializeFloat(memPtr->m_aabbMinOrg);
+				m_contiguousNodes[i].m_escapeIndex = memPtr->m_escapeIndex;
+				m_contiguousNodes[i].m_subPart = memPtr->m_subPart;
+				m_contiguousNodes[i].m_triangleIndex = memPtr->m_triangleIndex;
+			}
+		}
+	}
+	{
+		int numElem = quantizedBvhFloatData.m_numQuantizedContiguousNodes;
+		m_quantizedContiguousNodes.resize(numElem);
+		if (numElem)
+		{
+			b3QuantizedBvhNodeData* memPtr = quantizedBvhFloatData.m_quantizedContiguousNodesPtr; // quantized nodes are copied field by field with no conversion
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_quantizedContiguousNodes[i].m_escapeIndexOrTriangleIndex = memPtr->m_escapeIndexOrTriangleIndex;
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+			}
+		}
+	}
+	m_traversalMode = b3TraversalMode(quantizedBvhFloatData.m_traversalMode);
+	{
+		int numElem = quantizedBvhFloatData.m_numSubtreeHeaders;
+		m_SubtreeHeaders.resize(numElem);
+		if (numElem)
+		{
+			b3BvhSubtreeInfoData* memPtr = quantizedBvhFloatData.m_subTreeInfoPtr; // bounds, root index and size are copied; no other header field is written here
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0] ;
+				m_SubtreeHeaders[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+				m_SubtreeHeaders[i].m_rootNodeIndex = memPtr->m_rootNodeIndex;
+				m_SubtreeHeaders[i].m_subtreeSize = memPtr->m_subtreeSize;
+			}
+		}
+	}
+void b3QuantizedBvh::deSerializeDouble(struct b3QuantizedBvhDoubleData& quantizedBvhDoubleData) // Double-precision counterpart of deSerializeFloat: fills this BVH from the double data struct, copying bounds, flags, both node arrays and the subtree headers.
+	m_bvhAabbMax.deSerializeDouble(quantizedBvhDoubleData.m_bvhAabbMax);
+	m_bvhAabbMin.deSerializeDouble(quantizedBvhDoubleData.m_bvhAabbMin);
+	m_bvhQuantization.deSerializeDouble(quantizedBvhDoubleData.m_bvhQuantization);
+	m_curNodeIndex = quantizedBvhDoubleData.m_curNodeIndex;
+	m_useQuantization = quantizedBvhDoubleData.m_useQuantization!=0; // stored value is turned into a bool: any non-zero means true
+	{
+		int numElem = quantizedBvhDoubleData.m_numContiguousLeafNodes; // this count sizes m_contiguousNodes, despite the "LeafNodes" in its name
+		m_contiguousNodes.resize(numElem);
+		if (numElem)
+		{
+			b3OptimizedBvhNodeDoubleData* memPtr = quantizedBvhDoubleData.m_contiguousNodesPtr; // NOTE(review): dereferenced without a null check whenever numElem is non-zero
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_contiguousNodes[i].m_aabbMaxOrg.deSerializeDouble(memPtr->m_aabbMaxOrg);
+				m_contiguousNodes[i].m_aabbMinOrg.deSerializeDouble(memPtr->m_aabbMinOrg);
+				m_contiguousNodes[i].m_escapeIndex = memPtr->m_escapeIndex;
+				m_contiguousNodes[i].m_subPart = memPtr->m_subPart;
+				m_contiguousNodes[i].m_triangleIndex = memPtr->m_triangleIndex;
+			}
+		}
+	}
+	{
+		int numElem = quantizedBvhDoubleData.m_numQuantizedContiguousNodes;
+		m_quantizedContiguousNodes.resize(numElem);
+		if (numElem)
+		{
+			b3QuantizedBvhNodeData* memPtr = quantizedBvhDoubleData.m_quantizedContiguousNodesPtr; // quantized nodes use the same data type as in the float variant and are copied field by field
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_quantizedContiguousNodes[i].m_escapeIndexOrTriangleIndex = memPtr->m_escapeIndexOrTriangleIndex;
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+			}
+		}
+	}
+	m_traversalMode = b3TraversalMode(quantizedBvhDoubleData.m_traversalMode);
+	{
+		int numElem = quantizedBvhDoubleData.m_numSubtreeHeaders;
+		m_SubtreeHeaders.resize(numElem);
+		if (numElem)
+		{
+			b3BvhSubtreeInfoData* memPtr = quantizedBvhDoubleData.m_subTreeInfoPtr; // bounds, root index and size are copied; no other header field is written here
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0] ;
+				m_SubtreeHeaders[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+				m_SubtreeHeaders[i].m_rootNodeIndex = memPtr->m_rootNodeIndex;
+				m_SubtreeHeaders[i].m_subtreeSize = memPtr->m_subtreeSize;
+			}
+		}
+	}
+///intended to fill the dataBuffer and return the struct name (and 0 on failure); as written here it is a stub that fills nothing and always returns 0
+const char*	b3QuantizedBvh::serialize(void* dataBuffer, b3Serializer* serializer) const
+	b3Assert(0); // reaching this overload is treated as a programming error
+	return 0;
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h
new file mode 100644
index 00000000..629a0fce
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h
@@ -0,0 +1,556 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+class b3Serializer;
+#ifdef __SPU__
+#define printf spu_printf
+#endif //__SPU__
+#include <stdio.h>
+#include <stdlib.h>
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3AlignedAllocator.h"
+#define b3QuantizedBvhData b3QuantizedBvhDoubleData
+#define b3OptimizedBvhNodeData b3OptimizedBvhNodeDoubleData
+#define b3QuantizedBvhDataName "b3QuantizedBvhDoubleData"
+#define b3QuantizedBvhData b3QuantizedBvhFloatData
+#define b3OptimizedBvhNodeData b3OptimizedBvhNodeFloatData
+#define b3QuantizedBvhDataName "b3QuantizedBvhFloatData"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
+//Note: currently we have 16 bytes per quantized node
+// 10 gives the potential for 1024 parts, with at most 2^21 (2097152) (minus one
+// actually) triangles each (since the sign bit is reserved).
+///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
+///A node is either a leaf node or an internal node, told apart by the sign of m_escapeIndexOrTriangleIndex. Leaf nodes hold a non-negative 32-bit value that packs the part id (high bits) and the triangle index (low bits).
+B3_ATTRIBUTE_ALIGNED16	(struct) b3QuantizedBvhNode : public b3QuantizedBvhNodeData
+	bool isLeafNode() const	//true when the shared index field is non-negative
+	{
+		//skipindex is negative (internal node), triangleindex >=0 (leafnode)
+		return (m_escapeIndexOrTriangleIndex >= 0);
+	}
+	int getEscapeIndex() const	//internal nodes only (asserted)
+	{
+		b3Assert(!isLeafNode());
+		return -m_escapeIndexOrTriangleIndex;	//the field holds the escape index negated, so negate it back
+	}
+	int	getTriangleIndex() const	//leaf nodes only (asserted)
+	{
+		b3Assert(isLeafNode());
+		unsigned int x=0;
+		unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);	//all ones shifted left: mask of every bit from position (31-MAX_NUM_PARTS_IN_BITS) upwards
+		// Get only the lower bits where the triangle index is stored
+		return (m_escapeIndexOrTriangleIndex&~(y));
+	}
+	int	getPartId() const	//leaf nodes only (asserted)
+	{
+		b3Assert(isLeafNode());
+		// Get only the highest bits where the part index is stored
+		return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));
+	}
+/// b3OptimizedBvhNode contains both internal and leaf node information.
+/// The payload is 44 bytes / node (32 + 4 + 8, per the member comments below) and is padded to 64 bytes. You can use the compressed version of 16 bytes.
+B3_ATTRIBUTE_ALIGNED16 (struct) b3OptimizedBvhNode
+	//32 bytes
+	b3Vector3	m_aabbMinOrg;	//unquantized lower bound
+	b3Vector3	m_aabbMaxOrg;	//unquantized upper bound
+	//4
+	int	m_escapeIndex;
+	//8
+	//for child nodes
+	int	m_subPart;
+	int	m_triangleIndex;
+//pad the size to 64 bytes
+	char	m_padding[20];	//44 + 20 = 64
+///b3BvhSubtreeInfo provides info to gather a subtree of limited size
+B3_ATTRIBUTE_ALIGNED16(class) b3BvhSubtreeInfo : public b3BvhSubtreeInfoData
+	b3BvhSubtreeInfo()	//empty body: no member is initialised here
+	{
+		//memset(&m_padding[0], 0, sizeof(m_padding));
+	}
+	void	setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode)	//copies the node's quantized min/max bounds, component by component
+	{
+		m_quantizedAabbMin[0] = quantizedNode.m_quantizedAabbMin[0];
+		m_quantizedAabbMin[1] = quantizedNode.m_quantizedAabbMin[1];
+		m_quantizedAabbMin[2] = quantizedNode.m_quantizedAabbMin[2];
+		m_quantizedAabbMax[0] = quantizedNode.m_quantizedAabbMax[0];
+		m_quantizedAabbMax[1] = quantizedNode.m_quantizedAabbMax[1];
+		m_quantizedAabbMax[2] = quantizedNode.m_quantizedAabbMax[2];
+	}
+class b3NodeOverlapCallback	//callback interface taken by the walk*/report* functions of b3QuantizedBvh
+	virtual ~b3NodeOverlapCallback() {};	//virtual so implementations can be destroyed through this interface
+	virtual void processNode(int subPart, int triangleIndex) = 0;	//pure virtual: receives one (subPart, triangleIndex) pair per call
+#include "Bullet3Common/b3AlignedAllocator.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+///for code readability: shorthand names for the aligned arrays held by b3QuantizedBvh
+typedef b3AlignedObjectArray<b3OptimizedBvhNode>	NodeArray;
+typedef b3AlignedObjectArray<b3QuantizedBvhNode>	QuantizedNodeArray;
+typedef b3AlignedObjectArray<b3BvhSubtreeInfo>		BvhSubtreeInfoArray;
+///The b3QuantizedBvh class stores an AABB tree that can be quickly traversed on CPU and Cell SPU.
+///It is used by the b3BvhTriangleMeshShape as midphase, and by the b3MultiSapBroadphase.
+///It is recommended to use quantization for better performance and lower memory requirements.
+B3_ATTRIBUTE_ALIGNED16(class) b3QuantizedBvh
+	enum b3TraversalMode	//traversal strategy selector, see setTraversalMode
+	{
+	};
+	b3Vector3			m_bvhAabbMin;	//origin of the quantized space: subtracted in quantize, added back in unQuantize
+	b3Vector3			m_bvhAabbMax;	//upper limit that quantize asserts against and quantizeWithClamp clamps to
+	b3Vector3			m_bvhQuantization;	//per-axis scale: multiplied in quantize, divided out in unQuantize
+	int					m_bulletVersion;	//for serialization versioning. It could also be used to detect endianness.
+	int					m_curNodeIndex;
+	//quantization data
+	bool				m_useQuantization;	//true: the quantized node arrays are used, false: the b3OptimizedBvhNode arrays
+	NodeArray			m_leafNodes;
+	NodeArray			m_contiguousNodes;
+	QuantizedNodeArray	m_quantizedLeafNodes;
+	QuantizedNodeArray	m_quantizedContiguousNodes;
+	b3TraversalMode	m_traversalMode;
+	BvhSubtreeInfoArray		m_SubtreeHeaders;
+	//This is only used for serialization so we don't have to add serialization directly to b3AlignedObjectArray
+	mutable int m_subtreeHeaderCount;
+	///two versions, one for quantized and normal nodes. This allows code-reuse while maintaining readability (no template/macro!)
+	///this might be refactored into a virtual, it is usually not calculated at run-time
+	void	setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin)	//stores aabbMin as the lower bound of internal node nodeIndex
+	{
+		if (m_useQuantization)
+		{
+			quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] ,aabbMin,0);	//isMax=0: quantized as a lower bound
+		} else
+		{
+			m_contiguousNodes[nodeIndex].m_aabbMinOrg = aabbMin;
+		}
+	}
+	void	setInternalNodeAabbMax(int nodeIndex,const b3Vector3& aabbMax)	//stores aabbMax as the upper bound of internal node nodeIndex
+	{
+		if (m_useQuantization)
+		{
+			quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0],aabbMax,1);	//isMax=1: quantized as an upper bound
+		} else
+		{
+			m_contiguousNodes[nodeIndex].m_aabbMaxOrg = aabbMax;
+		}
+	}
+	b3Vector3 getAabbMin(int nodeIndex) const	//lower bound of leaf node nodeIndex; reads the leaf arrays, not the contiguous ones
+	{
+		if (m_useQuantization)
+		{
+			return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMin[0]);	//converted back from the quantized form
+		}
+		//non-quantized
+		return m_leafNodes[nodeIndex].m_aabbMinOrg;
+	}
+	b3Vector3 getAabbMax(int nodeIndex) const	//upper bound of leaf node nodeIndex; reads the leaf arrays, not the contiguous ones
+	{
+		if (m_useQuantization)
+		{
+			return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMax[0]);	//converted back from the quantized form
+		} 
+		//non-quantized
+		return m_leafNodes[nodeIndex].m_aabbMaxOrg;
+	}
+	void	setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex)	//records the escape index of internal node nodeIndex
+	{
+		if (m_useQuantization)
+		{
+			m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = -escapeIndex;	//stored negated; b3QuantizedBvhNode::isLeafNode treats negative values as internal nodes
+		} 
+		else
+		{
+			m_contiguousNodes[nodeIndex].m_escapeIndex = escapeIndex;
+		}
+	}
+	void mergeInternalNodeAabb(int nodeIndex,const b3Vector3& newAabbMin,const b3Vector3& newAabbMax) 	//grows the bounds of internal node nodeIndex so they also enclose [newAabbMin, newAabbMax]
+	{
+		if (m_useQuantization)
+		{
+			unsigned short int quantizedAabbMin[3];
+			unsigned short int quantizedAabbMax[3];
+			quantize(quantizedAabbMin,newAabbMin,0);
+			quantize(quantizedAabbMax,newAabbMax,1);
+			for (int i=0;i<3;i++)	//per axis: keep the smaller min and the larger max
+			{
+				if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] > quantizedAabbMin[i])
+					m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] = quantizedAabbMin[i];
+				if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] < quantizedAabbMax[i])
+					m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] = quantizedAabbMax[i];
+			}
+		} else
+		{
+			//non-quantized
+			m_contiguousNodes[nodeIndex].m_aabbMinOrg.setMin(newAabbMin);
+			m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax);		
+		}
+	}
+	void	swapLeafNodes(int firstIndex,int secondIndex);
+	void	assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex);
+	void	buildTree	(int startIndex,int endIndex);
+	int	calcSplittingAxis(int startIndex,int endIndex);
+	int	sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis);
+	void	walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
+	void	walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
+	void	walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const;
+	void	walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
+	///tree traversal designed for small-memory processors like PS3 SPU
+	void	walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
+	///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal against a quantized query aabb
+	void	walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
+	///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal against a second quantized tree
+	void	walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA,const b3QuantizedBvhNode* treeNodeB,b3NodeOverlapCallback* nodeCallback) const;
+	void	updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex);
+	b3QuantizedBvh();
+	virtual ~b3QuantizedBvh();
+	///***************************************** expert/internal use only *************************
+	void	setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin=b3Scalar(1.0));	//quantizationMargin defaults to 1.0
+	QuantizedNodeArray&	getLeafNodeArray() {			return	m_quantizedLeafNodes;	}	//mutable access to the quantized leaf array
+	///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized
+	void	buildInternal();
+	///***************************************** expert/internal use only *************************
+	void	reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
+	void	reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const;
+	void	reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
+	///Quantizes point into three unsigned shorts, relative to m_bvhAabbMin and scaled by m_bvhQuantization.
+	///@param out    receives the three quantized components
+	///@param point  point to quantize; asserted to lie inside [m_bvhAabbMin, m_bvhAabbMax] (use quantizeWithClamp otherwise)
+	///@param isMax  non-zero: add one before truncating and set the lowest bit; zero: truncate and clear the lowest bit
+		B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point,int isMax) const
+	{
+		b3Assert(m_useQuantization);
+		b3Assert(point.getX() <= m_bvhAabbMax.getX());
+		b3Assert(point.getY() <= m_bvhAabbMax.getY());
+		b3Assert(point.getZ() <= m_bvhAabbMax.getZ());
+		b3Assert(point.getX() >= m_bvhAabbMin.getX());
+		b3Assert(point.getY() >= m_bvhAabbMin.getY());
+		b3Assert(point.getZ() >= m_bvhAabbMin.getZ());
+		b3Vector3 v = (point - m_bvhAabbMin) * m_bvhQuantization;
+		///Make sure rounding is done in a way that unQuantize(quantizeWithClamp(...)) is conservative
+		///end-points always set the first bit, so that they are sorted properly (so that neighbouring AABBs overlap properly)
+		///@todo: double-check this
+		if (isMax)
+		{
+			out[0] = (unsigned short) (((unsigned short)(v.getX()+b3Scalar(1.)) | 1));
+			out[1] = (unsigned short) (((unsigned short)(v.getY()+b3Scalar(1.)) | 1));
+			out[2] = (unsigned short) (((unsigned short)(v.getZ()+b3Scalar(1.)) | 1));
+		} else
+		{
+			out[0] = (unsigned short) (((unsigned short)(v.getX()) & 0xfffe));
+			out[1] = (unsigned short) (((unsigned short)(v.getY()) & 0xfffe));
+			out[2] = (unsigned short) (((unsigned short)(v.getZ()) & 0xfffe));
+		}
+		//Diagnostics: dequantize again and report any axis on which the result is not conservative.
+		//"old" is the input point, "new" is the dequantized value, diff = new - old.
+		b3Vector3 newPoint = unQuantize(out);
+		if (isMax)
+		{
+			if (newPoint.getX() < point.getX())
+			{
+				printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), point.getX(),newPoint.getX());
+			}
+			if (newPoint.getY() < point.getY())
+			{
+				printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), point.getY(),newPoint.getY());
+			}
+			if (newPoint.getZ() < point.getZ())
+			{
+				printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), point.getZ(),newPoint.getZ());
+			}
+		} else
+		{
+			if (newPoint.getX() > point.getX())
+			{
+				printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), point.getX(),newPoint.getX());
+			}
+			if (newPoint.getY() > point.getY())
+			{
+				printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), point.getY(),newPoint.getY());
+			}
+			if (newPoint.getZ() > point.getZ())
+			{
+				printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), point.getZ(),newPoint.getZ());
+			}
+		}
+	}
+	B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2,int isMax) const	//like quantize, but first clamps point2 into [m_bvhAabbMin, m_bvhAabbMax]
+	{
+		b3Assert(m_useQuantization);
+		b3Vector3 clampedPoint(point2);
+		clampedPoint.setMax(m_bvhAabbMin);	//raise components that lie below the lower limit
+		clampedPoint.setMin(m_bvhAabbMax);	//lower components that lie above the upper limit
+		quantize(out,clampedPoint,isMax);
+	}
+	B3_FORCE_INLINE b3Vector3	unQuantize(const unsigned short* vecIn) const	//inverse of quantize: vecIn[0..2] / m_bvhQuantization + m_bvhAabbMin
+	{
+			b3Vector3	vecOut;
+			vecOut.setValue(
+			(b3Scalar)(vecIn[0]) / (m_bvhQuantization.getX()),
+			(b3Scalar)(vecIn[1]) / (m_bvhQuantization.getY()),
+			(b3Scalar)(vecIn[2]) / (m_bvhQuantization.getZ()));
+			vecOut += m_bvhAabbMin;	//shift back from the quantization origin
+			return vecOut;
+	}
+	///setTraversalMode lets you choose between stackless, recursive or stackless cache friendly tree traversal. Note this is only implemented for quantized trees.
+	void	setTraversalMode(b3TraversalMode	traversalMode)
+	{
+		m_traversalMode = traversalMode;
+	}
+	B3_FORCE_INLINE QuantizedNodeArray&	getQuantizedNodeArray()	//mutable access to the quantized contiguous node array
+	{	
+		return	m_quantizedContiguousNodes;
+	}
+	B3_FORCE_INLINE BvhSubtreeInfoArray&	getSubtreeInfoArray()	//mutable access to the subtree header array
+	{
+		return m_SubtreeHeaders;
+	}
+	///Calculate space needed to store BVH for serialization
+	unsigned calculateSerializeBufferSize() const;
+	/// Data buffer MUST be 16 byte aligned
+	virtual bool serialize(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const;
+	///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
+	static b3QuantizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
+	static unsigned int getAlignmentSerializationPadding();
+	virtual	int	calculateSerializeBufferSizeNew() const;	//defined inline after the class: sizeof(b3QuantizedBvhData)
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+	virtual	const char*	serialize(void* dataBuffer, b3Serializer* serializer) const;
+	virtual	void deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData);	//loads this bvh from the single-precision serialization struct
+	virtual	void deSerializeDouble(struct b3QuantizedBvhDoubleData& quantizedBvhDoubleData);	//loads this bvh from the double-precision serialization struct
+	B3_FORCE_INLINE bool isQuantized()	//reports m_useQuantization; NOTE(review): not declared const, so it cannot be called on a const b3QuantizedBvh
+	{
+		return m_useQuantization;
+	}
+	// Special "copy" constructor that allows for in-place deserialization
+	// Prevents b3Vector3's default constructor from being called, but doesn't initialize much else
+	// ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need)
+	b3QuantizedBvh(b3QuantizedBvh &other, bool ownsMemory);
+struct b3OptimizedBvhNodeFloatData	//single-precision serialization layout; field names mirror b3OptimizedBvhNode
+	b3Vector3FloatData	m_aabbMinOrg;
+	b3Vector3FloatData	m_aabbMaxOrg;
+	int	m_escapeIndex;
+	int	m_subPart;
+	int	m_triangleIndex;
+	char m_pad[4];	//explicit padding bytes
+struct b3OptimizedBvhNodeDoubleData	//double-precision serialization layout; field names mirror b3OptimizedBvhNode
+	b3Vector3DoubleData	m_aabbMinOrg;
+	b3Vector3DoubleData	m_aabbMaxOrg;
+	int	m_escapeIndex;
+	int	m_subPart;
+	int	m_triangleIndex;
+	char	m_pad[4];	//explicit padding bytes
+struct	b3QuantizedBvhFloatData	//single-precision serialization layout of b3QuantizedBvh, consumed by deSerializeFloat
+	b3Vector3FloatData			m_bvhAabbMin;
+	b3Vector3FloatData			m_bvhAabbMax;
+	b3Vector3FloatData			m_bvhQuantization;
+	int					m_curNodeIndex;
+	int					m_useQuantization;	//the class member is a bool; stored here as int
+	int					m_numContiguousLeafNodes;
+	int					m_numQuantizedContiguousNodes;
+	b3OptimizedBvhNodeFloatData	*m_contiguousNodesPtr;
+	b3QuantizedBvhNodeData		*m_quantizedContiguousNodesPtr;
+	b3BvhSubtreeInfoData	*m_subTreeInfoPtr;	//NOTE(review): placed before m_traversalMode here, but last in b3QuantizedBvhDoubleData
+	int					m_traversalMode;
+	int					m_numSubtreeHeaders;
+struct	b3QuantizedBvhDoubleData	//double-precision serialization layout of b3QuantizedBvh, consumed by deSerializeDouble
+	b3Vector3DoubleData			m_bvhAabbMin;
+	b3Vector3DoubleData			m_bvhAabbMax;
+	b3Vector3DoubleData			m_bvhQuantization;
+	int							m_curNodeIndex;
+	int							m_useQuantization;	//the class member is a bool; stored here as int
+	int							m_numContiguousLeafNodes;
+	int							m_numQuantizedContiguousNodes;
+	b3OptimizedBvhNodeDoubleData	*m_contiguousNodesPtr;
+	b3QuantizedBvhNodeData			*m_quantizedContiguousNodesPtr;
+	int							m_traversalMode;	//converted to b3TraversalMode in deSerializeDouble
+	int							m_numSubtreeHeaders;	//element count behind m_subTreeInfoPtr
+	b3BvhSubtreeInfoData		*m_subTreeInfoPtr;	//NOTE(review): placed last here, but before m_traversalMode in b3QuantizedBvhFloatData
+B3_FORCE_INLINE	int	b3QuantizedBvh::calculateSerializeBufferSizeNew() const	//size of the serialization struct selected by the b3QuantizedBvhData macro
+	return sizeof(b3QuantizedBvhData);
+#endif //B3_QUANTIZED_BVH_H
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp
new file mode 100644
index 00000000..4d97f7f6
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp
@@ -0,0 +1,214 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3StridingMeshInterface.h"
+void	b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const	//feeds every triangle of every sub part, scaled by getScaling(), to callback
+	(void)aabbMin;	//unused: no culling against the aabb happens here
+	(void)aabbMax;	//unused, see above
+	int numtotalphysicsverts = 0;	//accumulated below but not read again in this function
+	int part,graphicssubparts = getNumSubParts();
+	const unsigned char * vertexbase;
+	const unsigned char * indexbase;
+	int indexstride;
+	PHY_ScalarType type;
+	PHY_ScalarType gfxindextype;
+	int stride,numverts,numtriangles;
+	int gfxindex;
+	b3Vector3 triangle[3];
+	b3Vector3 meshScaling = getScaling();
+	///if the number of parts is big, the performance might drop due to the innerloop switch on indextype
+	for (part=0;part<graphicssubparts ;part++)
+	{
+		getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,type,stride,&indexbase,indexstride,numtriangles,gfxindextype,part);	//paired with unLockReadOnlyVertexBase(part) at the end of this iteration
+		numtotalphysicsverts+=numtriangles*3; //upper bound
+		///unlikely that developers want to pass in double-precision meshes in single-precision Bullet build
+		///so disable this feature by default
+		///see patch http://code.google.com/p/bullet/issues/detail?id=213
+		switch (type)	//outer switch: vertex scalar type, inner switch: index type
+		{
+		case PHY_FLOAT:
+		 {
+			 float* graphicsbase;
+			 switch (gfxindextype)
+			 {
+			 case PHY_INTEGER:
+				 {
+					 for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+					 {
+						 unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);	//the three indices of triangle gfxindex
+						 graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);	//index*stride is the byte offset of the vertex
+						 triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
+						 triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
+						 triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+					 }
+					 break;
+				 }
+			 case PHY_SHORT:
+				 {
+					 for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+					 {
+						 unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
+						 graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
+						 triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
+						 triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
+						 triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+					 }
+					 break;
+				 }
+			case PHY_UCHAR:
+				 {
+					 for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+					 {
+						 unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
+						 graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
+						 triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
+						 triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
+						 triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+					 }
+					 break;
+				 }
+			 default:
+				 b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));	//unsupported index type: always false here; PHY_UCHAR is handled above although the condition does not list it
+			 }
+			 break;
+		 }
+		case PHY_DOUBLE:
+			{
+				double* graphicsbase;	//same three index-type cases as PHY_FLOAT, each coordinate cast to b3Scalar before scaling
+				switch (gfxindextype)
+				{
+				case PHY_INTEGER:
+					{
+						for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+						{
+							unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
+							graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
+							triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
+							triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
+							triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+						}
+						break;
+					}
+				case PHY_SHORT:
+					{
+						for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+						{
+							unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
+							graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
+							triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
+							triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
+							triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+						}
+						break;
+					}
+				case PHY_UCHAR:
+					{
+						for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+						{
+							unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
+							graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
+							triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
+							triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
+							triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+						}
+						break;
+					}
+				default:
+					b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));	//unsupported index type: always false here; PHY_UCHAR is handled above although the condition does not list it
+				}
+				break;
+			}
+		default:
+			b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE));	//unsupported vertex scalar type: always false here
+		}
+		unLockReadOnlyVertexBase(part);	//releases the lock taken at the top of this iteration
+	}
+void	b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax)	//visits every triangle and writes the enclosing bounds of all vertices to aabbMin/aabbMax
+	struct	AabbCalculationCallback : public b3InternalTriangleIndexCallback	//accumulates the min/max over every vertex it is handed
+	{
+		b3Vector3	m_aabbMin;
+		b3Vector3	m_aabbMax;
+		AabbCalculationCallback()
+		{
+			m_aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));	//start inverted (min=+large, max=-large) so the first vertex replaces both
+			m_aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
+		}
+		virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int  triangleIndex)
+		{
+			(void)partId;
+			(void)triangleIndex;
+			m_aabbMin.setMin(triangle[0]);
+			m_aabbMax.setMax(triangle[0]);
+			m_aabbMin.setMin(triangle[1]);
+			m_aabbMax.setMax(triangle[1]);
+			m_aabbMin.setMin(triangle[2]);
+			m_aabbMax.setMax(triangle[2]);
+		}
+	};
+	//first calculate the total aabb for all triangles
+	AabbCalculationCallback	aabbCallback;
+	aabbMin.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));	//query range spanning -large..+large; InternalProcessAllTriangles in this file ignores it
+	aabbMax.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
+	InternalProcessAllTriangles(&aabbCallback,aabbMin,aabbMax);
+	aabbMin = aabbCallback.m_aabbMin;	//if no triangle was visited these keep the inverted initial values
+	aabbMax = aabbCallback.m_aabbMax;
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h
new file mode 100644
index 00000000..9513f68f
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h
@@ -0,0 +1,167 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Vector3.h"
+#include "b3TriangleCallback.h"
+//#include "b3ConcaveShape.h"
+enum  	PHY_ScalarType { //tags the element type of vertex and index data in b3StridingMeshInterface (see the type/indicestype parameters)
+///	The b3StridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with b3BvhTriangleMeshShape and some other collision shapes.
+/// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips.
+/// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory.
+B3_ATTRIBUTE_ALIGNED16(class ) b3StridingMeshInterface
+	protected:
+		b3Vector3 m_scaling;	//per-axis scale, multiplied into every vertex by InternalProcessAllTriangles
+	public:
+		b3StridingMeshInterface() :m_scaling(b3MakeVector3(b3Scalar(1.),b3Scalar(1.),b3Scalar(1.)))	//scaling starts at (1,1,1)
+		{
+		}
+		virtual ~b3StridingMeshInterface();
+		virtual void	InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
+		///brute force method to calculate aabb
+		void	calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax);
+		/// get read and write access to a subpart of a triangle mesh
+		/// this subpart has a continuous array of vertices and indices
+		/// in this way the mesh can be handled as chunks of memory with striding
+		/// very similar to OpenGL vertexarray support
+		/// make a call to unLockVertexBase when the read and write access is finished	
+		virtual void	getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0)=0;
+		virtual void	getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const=0;	//read-only variant; pair with unLockReadOnlyVertexBase
+		/// unLockVertexBase finishes the access to a subpart of the triangle mesh
+		/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
+		virtual void	unLockVertexBase(int subpart)=0;
+		virtual void	unLockReadOnlyVertexBase(int subpart) const=0;
+		/// getNumSubParts returns the number of separate subparts
+		/// each subpart has a continuous array of vertices and indices
+		virtual int		getNumSubParts() const=0;
+		virtual void	preallocateVertices(int numverts)=0;
+		virtual void	preallocateIndices(int numindices)=0;
+		virtual bool	hasPremadeAabb() const { return false; }	//default: no premade aabb is available
+		virtual void	setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const	//default implementation ignores both arguments
+                {
+                        (void) aabbMin;
+                        (void) aabbMax;
+                }
+		virtual void	getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const	//default implementation writes nothing to the outputs
+        {
+            (void) aabbMin;
+            (void) aabbMax;
+        }
+		const b3Vector3&	getScaling() const {
+			return m_scaling;
+		}
+		void	setScaling(const b3Vector3& scaling)
+		{
+			m_scaling = scaling;
+		}
+		virtual	int	calculateSerializeBufferSize() const;	//defined inline after the class: sizeof(b3StridingMeshInterfaceData)
+		///fills the dataBuffer and returns the struct name (and 0 on failure)
+		//virtual	const char*	serialize(void* dataBuffer, b3Serializer* serializer) const;
+struct	b3IntIndexData	//serialization wrapper around a single int index
+	int	m_value;
+struct	b3ShortIntIndexData	//serialization wrapper around a single short index
+	short m_value;
+	char m_pad[2];	//explicit padding bytes
+struct	b3ShortIntIndexTripletData	//serialization wrapper around three short indices
+	short	m_values[3];
+	char	m_pad[2];	//explicit padding bytes
+struct	b3CharIndexTripletData	//serialization wrapper around three unsigned char indices
+	unsigned char m_values[3];
+	char	m_pad;	//explicit padding byte
+///do not change these serialization structures: any change requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	b3MeshPartData
+	b3Vector3FloatData			*m_vertices3f;	//single-precision vertices
+	b3Vector3DoubleData			*m_vertices3d;	//double-precision vertices
+	b3IntIndexData				*m_indices32;
+	b3ShortIntIndexTripletData	*m_3indices16;
+	b3CharIndexTripletData		*m_3indices8;
+	b3ShortIntIndexData			*m_indices16;//backwards compatibility
+	int                     m_numTriangles;//length of m_indices = m_numTriangles
+	int                     m_numVertices;
+///do not change these serialization structures: any change requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	b3StridingMeshInterfaceData
+	b3MeshPartData	*m_meshPartsPtr;
+	b3Vector3FloatData	m_scaling;
+	int	m_numMeshParts;	//presumably the element count behind m_meshPartsPtr - TODO confirm against the serializer
+	char m_padding[4];	//explicit padding bytes
+B3_FORCE_INLINE	int	b3StridingMeshInterface::calculateSerializeBufferSize() const	//size of the top-level serialization struct only
+	return sizeof(b3StridingMeshInterfaceData);
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h
new file mode 100644
index 00000000..d073ee57
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h
@@ -0,0 +1,38 @@
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "b3VectorFloat4.h"
+struct b3GjkPairDetector;
+/// Picks one vertex of the hull using b3Vector3::maxDot with supportVec over the hull's vertex range,
+/// or returns the zero vector when the hull has no vertices.
+/// NOTE(review): presumably maxDot returns the index of the vertex with the largest dot product — confirm in b3Vector3.
+/// @param supportVec vector on which maxDot is invoked
+/// @param hull       supplies m_vertexOffset and m_numVertices, i.e. the hull's range inside verticesA
+/// @param verticesA  vertex array; the result is verticesA[hull->m_vertexOffset + index]
+/// @param margin     not read anywhere in this function
+/// @return the selected vertex, or (0,0,0) if hull->m_numVertices is not positive
+inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull, 
+	const b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin)
+	b3Vector3 supVec = b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+	b3Scalar maxDot = b3Scalar(-B3_LARGE_FLOAT);
+    // Here we take advantage of dot(a, b*c) = dot(a*b, c).  Note: This is true mathematically, but not numerically. 
+    if( 0 < hull->m_numVertices )
+    {
+        // "scaled" is a plain copy of supportVec; no scaling is applied in this function.
+        const b3Vector3 scaled = supportVec;
+		int index = (int) scaled.maxDot( &verticesA[hull->m_vertexOffset], hull->m_numVertices, maxDot); 
+        return verticesA[hull->m_vertexOffset+index];
+    }
+    return supVec;
+/// Convenience wrapper: forwards to localGetSupportVertexWithMargin with a margin of 0.f.
+/// @return whatever localGetSupportVertexWithMargin returns for the same supportVec, hull and verticesA
+inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull, 
+	const b3AlignedObjectArray<b3Vector3>& verticesA)
+	return localGetSupportVertexWithMargin(supportVec,hull,verticesA,0.f);
diff --git a/src/bullet/BulletMultiThreaded/btThreadSupportInterface.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp
similarity index 80%
rename from src/bullet/BulletMultiThreaded/btThreadSupportInterface.cpp
rename to src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp
index 8192aa46..90664518 100644
--- a/src/bullet/BulletMultiThreaded/btThreadSupportInterface.cpp
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp
@@ -1,6 +1,6 @@
 Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
@@ -13,9 +13,15 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
-#include "btThreadSupportInterface.h"
+#include "b3TriangleCallback.h"
diff --git a/src/bullet/BulletMultiThreaded/SpuCollisionObjectWrapper.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h
similarity index 51%
rename from src/bullet/BulletMultiThreaded/SpuCollisionObjectWrapper.cpp
rename to src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h
index 182aa269..3059fa4f 100644
--- a/src/bullet/BulletMultiThreaded/SpuCollisionObjectWrapper.cpp
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h
@@ -1,6 +1,6 @@
 Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
@@ -13,36 +13,30 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
-#include "SpuCollisionObjectWrapper.h"
-#include "BulletCollision/CollisionShapes/btCollisionShape.h"
-SpuCollisionObjectWrapper::SpuCollisionObjectWrapper ()
+#include "Bullet3Common/b3Vector3.h"
-#ifndef __SPU__
-SpuCollisionObjectWrapper::SpuCollisionObjectWrapper (const btCollisionObject* collisionObject)
-	m_shapeType = collisionObject->getCollisionShape()->getShapeType ();
-	m_collisionObjectPtr = (ppu_address_t)collisionObject;
-	m_margin = collisionObject->getCollisionShape()->getMargin ();
-SpuCollisionObjectWrapper::getShapeType () const
-	return m_shapeType;
-SpuCollisionObjectWrapper::getCollisionMargin () const
+///The b3TriangleCallback provides a callback for each overlapping triangle when calling processAllTriangles.
+///This callback is called by processAllTriangles for all b3ConcaveShape derived classes, such as b3BvhTriangleMeshShape, b3StaticPlaneShape and b3HeightfieldTerrainShape.
+class b3TriangleCallback
-	return m_margin;
+	virtual ~b3TriangleCallback();
+	/// Pure virtual hook invoked per triangle.
+	/// @param triangle      pointer to the triangle's vertex data (presumably three b3Vector3 — confirm with callers)
+	/// @param partId        identifier of the mesh part the triangle belongs to
+	/// @param triangleIndex index of the triangle within that part
+	virtual void processTriangle(b3Vector3* triangle, int partId, int triangleIndex) = 0;
-SpuCollisionObjectWrapper::getCollisionObjectPtr () const
+/// Abstract callback interface with a single pure virtual per-triangle hook, internalProcessTriangleIndex.
+class b3InternalTriangleIndexCallback
-	return m_collisionObjectPtr;
+	virtual ~b3InternalTriangleIndexCallback();
+	/// Pure virtual hook invoked per triangle.
+	/// @param triangle      pointer to the triangle's vertex data (presumably three b3Vector3 — confirm with callers)
+	/// @param partId        identifier of the mesh part the triangle belongs to
+	/// @param triangleIndex index of the triangle within that part
+	virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int  triangleIndex) = 0;
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp
new file mode 100644
index 00000000..a0f59bab
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp
@@ -0,0 +1,95 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3TriangleIndexVertexArray.h"
+/// Convenience constructor: wraps one index array and one vertex array in a b3IndexedMesh and
+/// registers it through addIndexedMesh. Only the base pointers are stored; no data is copied here.
+/// @param numTriangles        number of triangles described by the index array
+/// @param triangleIndexBase   start of the index data
+/// @param triangleIndexStride stride between the indices of consecutive triangles
+/// @param numVertices         number of vertices in the vertex array
+/// @param vertexBase          start of the vertex data
+/// @param vertexStride        stride between consecutive vertices
+b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride)
+: m_hasAabb(0)
+	b3IndexedMesh mesh;
+	mesh.m_numTriangles = numTriangles;
+	mesh.m_triangleIndexBase = (const unsigned char *)triangleIndexBase;
+	mesh.m_triangleIndexStride = triangleIndexStride;
+	mesh.m_numVertices = numVertices;
+	mesh.m_vertexBase = (const unsigned char *)vertexBase;
+	mesh.m_vertexStride = vertexStride;
+	addIndexedMesh(mesh);
+/// Exposes the raw vertex and index buffers of one sub-mesh for read/write access.
+/// All reference and pointer parameters except subpart are outputs filled from m_indexedMeshes[subpart]:
+/// vertex base pointer, vertex count, vertex scalar type, vertex stride, index base pointer,
+/// index stride, triangle count and index scalar type.
+/// The const qualification of the stored base pointers is cast away here.
+/// @param subpart index into m_indexedMeshes; asserted to be below getNumSubParts()
+void	b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart)
+	b3Assert(subpart< getNumSubParts() );
+	b3IndexedMesh& mesh = m_indexedMeshes[subpart];
+	numverts = mesh.m_numVertices;
+	(*vertexbase) = (unsigned char *) mesh.m_vertexBase;
+   type = mesh.m_vertexType;
+	vertexStride = mesh.m_vertexStride;
+	numfaces = mesh.m_numTriangles;
+	(*indexbase) = (unsigned char *)mesh.m_triangleIndexBase;
+	indexstride = mesh.m_triangleIndexStride;
+	indicestype = mesh.m_indexType;
+/// Exposes the raw vertex and index buffers of one sub-mesh for read-only access.
+/// All reference and pointer parameters except subpart are outputs filled from m_indexedMeshes[subpart]:
+/// vertex base pointer, vertex count, vertex scalar type, vertex stride, index base pointer,
+/// index stride, triangle count and index scalar type.
+/// @param subpart index into m_indexedMeshes; asserted to be below getNumSubParts()
+void	b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart) const
+	// Same range check as the writable getLockedVertexIndexBase, so an out-of-range subpart is caught before indexing.
+	b3Assert(subpart< getNumSubParts() );
+	const b3IndexedMesh& mesh = m_indexedMeshes[subpart];
+	numverts = mesh.m_numVertices;
+	(*vertexbase) = (const unsigned char *)mesh.m_vertexBase;
+   type = mesh.m_vertexType;
+	vertexStride = mesh.m_vertexStride;
+	numfaces = mesh.m_numTriangles;
+	(*indexbase) = (const unsigned char *)mesh.m_triangleIndexBase;
+	indexstride = mesh.m_triangleIndexStride;
+	indicestype = mesh.m_indexType;
+/// Returns true only when the m_hasAabb flag equals 1 (the value stored by setPremadeAabb).
+bool	b3TriangleIndexVertexArray::hasPremadeAabb() const
+	return (m_hasAabb == 1);
+/// Stores the given bounds in m_aabbMin/m_aabbMax and sets m_hasAabb to 1.
+/// The method is const; it writes members that the class declares mutable.
+/// @param aabbMin minimum corner to cache
+/// @param aabbMax maximum corner to cache
+void	b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const
+	m_aabbMin = aabbMin;
+	m_aabbMax = aabbMax;
+	m_hasAabb = 1; // this is intentionally an int see notes in header
+/// Copies the cached m_aabbMin/m_aabbMax into the two output pointers.
+/// Neither m_hasAabb nor the pointers are checked here; both pointers are dereferenced unconditionally.
+/// @param aabbMin receives m_aabbMin
+/// @param aabbMax receives m_aabbMax
+void	b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const
+	*aabbMin = m_aabbMin;
+	*aabbMax = m_aabbMax;
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h
new file mode 100644
index 00000000..d26b2893
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h
@@ -0,0 +1,133 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3StridingMeshInterface.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Scalar.h"
+///The b3IndexedMesh indexes a single vertex and index array. Multiple b3IndexedMesh objects can be passed into a b3TriangleIndexVertexArray using addIndexedMesh.
+///Instead of the number of indices, we pass the number of triangles.
+/// The struct stores only base pointers, counts, strides and scalar types; it holds no copy of the data.
+B3_ATTRIBUTE_ALIGNED16( struct)	b3IndexedMesh
+   int                     m_numTriangles;
+   const unsigned char *   m_triangleIndexBase;
+   // Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed)
+   int                     m_triangleIndexStride;
+   int                     m_numVertices;
+   const unsigned char *   m_vertexBase;
+   // Size of a vertex, in bytes
+   int                     m_vertexStride;
+   // The index type is set when adding an indexed mesh to the
+   // b3TriangleIndexVertexArray, do not set it manually
+   PHY_ScalarType m_indexType;
+   // The vertex type has a default type similar to Bullet's precision mode (float or double)
+   // but can be set manually if you for example run Bullet with double precision but have
+   // mesh data in single precision..
+   PHY_ScalarType m_vertexType;
+   // Default constructor: sets the index type to PHY_INTEGER and the vertex type to a default.
+   // The counts, strides and pointers are not initialized here.
+   // NOTE(review): two initializers for m_vertexType appear back to back below; presumably a
+   // precision-mode preprocessor conditional (not visible in this excerpt) selects one — confirm.
+   b3IndexedMesh()
+	   :m_indexType(PHY_INTEGER),
+      m_vertexType(PHY_DOUBLE)
+      m_vertexType(PHY_FLOAT)
+      {
+      }
+/// Array type used by b3TriangleIndexVertexArray to hold its registered b3IndexedMesh entries.
+typedef b3AlignedObjectArray<b3IndexedMesh>	IndexedMeshArray;
+///The b3TriangleIndexVertexArray allows to access multiple triangle meshes, by indexing into existing triangle/index arrays.
+///Additional meshes can be added using addIndexedMesh
+///No duplicate is made of the vertex/index data, it only indexes into external vertex/index arrays.
+///So keep those arrays around during the lifetime of this b3TriangleIndexVertexArray.
+B3_ATTRIBUTE_ALIGNED16( class) b3TriangleIndexVertexArray : public b3StridingMeshInterface
+	/// Registered sub-meshes; the subpart argument of the accessors below indexes into this array.
+	IndexedMeshArray	m_indexedMeshes;
+	int m_pad[2];
+	mutable int m_hasAabb; // using int instead of bool to maintain alignment
+	/// Cached bounds; mutable so that the const setPremadeAabb() can store them.
+	mutable b3Vector3 m_aabbMin;
+	mutable b3Vector3 m_aabbMax;
+	/// Creates an empty array with no cached AABB (m_hasAabb = 0).
+	b3TriangleIndexVertexArray() : m_hasAabb(0)
+	{
+	}
+	virtual ~b3TriangleIndexVertexArray();
+	//just to be backwards compatible
+	b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride);
+	/// Appends a copy of mesh to m_indexedMeshes and then overwrites the stored copy's m_indexType
+	/// with indexType, so any m_indexType set on the argument is ignored.
+	void	addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER)
+	{
+		m_indexedMeshes.push_back(mesh);
+		m_indexedMeshes[m_indexedMeshes.size()-1].m_indexType = indexType;
+	}
+	/// Fills the output parameters with the buffers, counts, strides and scalar types of sub-mesh subpart (writable pointers).
+	virtual void	getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0);
+	/// Same as getLockedVertexIndexBase, but hands out pointers to const data.
+	virtual void	getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const;
+	/// unLockVertexBase finishes the access to a subpart of the triangle mesh
+	/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
+	/// Both unlock methods are no-ops in this class.
+	virtual void	unLockVertexBase(int subpart) {(void)subpart;}
+	virtual void	unLockReadOnlyVertexBase(int subpart) const {(void)subpart;}
+	/// getNumSubParts returns the number of separate subparts
+	/// each subpart has a continuous array of vertices and indices
+	virtual int		getNumSubParts() const { 
+		return (int)m_indexedMeshes.size();
+	}
+	/// Direct mutable access to the registered sub-meshes.
+	IndexedMeshArray&	getIndexedMeshArray()
+	{
+		return m_indexedMeshes;
+	}
+	/// Read-only access to the registered sub-meshes.
+	const IndexedMeshArray&	getIndexedMeshArray() const
+	{
+		return m_indexedMeshes;
+	}
+	/// The two preallocate methods are no-ops in this class.
+	virtual void	preallocateVertices(int numverts){(void) numverts;}
+	virtual void	preallocateIndices(int numindices){(void) numindices;}
+	/// Premade-AABB accessors; declared here and defined out of line.
+	virtual bool	hasPremadeAabb() const;
+	virtual void	setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const;
+	virtual void	getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const;
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h
new file mode 100644
index 00000000..f6f65f77
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h
@@ -0,0 +1,11 @@
+#ifndef B3_VECTOR_FLOAT4_H
+#define B3_VECTOR_FLOAT4_H
+#include "Bullet3Common/b3Transform.h"
+//#define cross3(a,b) (a.cross(b))
+// Textual alias: after this header is included, every "float4" token is replaced by b3Vector3.
+#define float4 b3Vector3
+//#define make_float4(x,y,z,w) b3Vector4(x,y,z,w)
+#endif //B3_VECTOR_FLOAT4_H
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp
new file mode 100644
index 00000000..cf3d5ef4
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp
@@ -0,0 +1,609 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+	Elsevier CDROM license agreements grants nonexclusive license to use the software
+	for any purpose, commercial or non-commercial as long as the following credit is included
+	identifying the original source of the software:
+	Parts of the source are "from the book Real-Time Collision Detection by
+	Christer Ericson, published by Morgan Kaufmann Publishers,
+	(c) 2005 Elsevier Inc."
+#include "b3VoronoiSimplexSolver.h"
+// Slot numbers used to index m_barycentricCoords of a triangle result when it is remapped
+// onto tetrahedron vertices in closestPtPointTetrahedron. VERTD is defined but not used in this file's visible code.
+#define VERTA  0
+#define VERTB  1
+#define VERTC  2
+#define VERTD  3
+/// Removes simplex vertex index by decrementing the vertex count and copying the (former) last
+/// vertex's W, P and Q entries into slot index. Vertex order is therefore not preserved.
+/// @param index slot to remove; not range-checked here beyond the assert that the simplex is non-empty
+void	b3VoronoiSimplexSolver::removeVertex(int index)
+	b3Assert(m_numVertices>0);
+	m_numVertices--;
+	m_simplexVectorW[index] = m_simplexVectorW[m_numVertices];
+	m_simplexPointsP[index] = m_simplexPointsP[m_numVertices];
+	m_simplexPointsQ[index] = m_simplexPointsQ[m_numVertices];
+/// Drops every simplex vertex whose usage flag in usedVerts is clear.
+/// Slots are examined from D (3) down to A (0); each check re-reads numVertices(), which
+/// removeVertex may have just decremented.
+/// @param usedVerts per-vertex usage flags (A..D)
+void	b3VoronoiSimplexSolver::reduceVertices (const b3UsageBitfield& usedVerts)
+	if ((numVertices() >= 4) && (!usedVerts.usedVertexD))
+		removeVertex(3);
+	if ((numVertices() >= 3) && (!usedVerts.usedVertexC))
+		removeVertex(2);
+	if ((numVertices() >= 2) && (!usedVerts.usedVertexB))
+		removeVertex(1);
+	if ((numVertices() >= 1) && (!usedVerts.usedVertexA))
+		removeVertex(0);
+//clear the simplex, remove all the vertices
+/// Also invalidates the cached closest result, marks the cache as needing an update,
+/// sets m_lastW to (B3_LARGE_FLOAT, B3_LARGE_FLOAT, B3_LARGE_FLOAT) and resets m_cachedBC.
+void b3VoronoiSimplexSolver::reset()
+	m_cachedValidClosest = false;
+	m_numVertices = 0;
+	m_needsUpdate = true;
+	m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
+	m_cachedBC.reset();
+	//add a vertex
+/// Appends one vertex to the simplex: stores w, p and q at slot m_numVertices, remembers w in
+/// m_lastW, flags the cache for recomputation and increments the vertex count.
+/// NOTE(review): no capacity check is performed before writing slot m_numVertices; presumably
+/// callers keep the count below the array size declared in the header — confirm.
+/// @param w vector stored in m_simplexVectorW
+/// @param p point stored in m_simplexPointsP
+/// @param q point stored in m_simplexPointsQ
+void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q)
+	m_lastW = w;
+	m_needsUpdate = true;
+	m_simplexVectorW[m_numVertices] = w;
+	m_simplexPointsP[m_numVertices] = p;
+	m_simplexPointsQ[m_numVertices] = q;
+	m_numVertices++;
+/// Recomputes the cached closest-point data (m_cachedP1, m_cachedP2, m_cachedV, m_cachedBC) when
+/// m_needsUpdate is set, dispatching on the current vertex count (0 to 4), and removes simplex
+/// vertices that the sub-simplex result reports as unused. When no update is pending the cached
+/// state is left untouched.
+/// @return m_cachedValidClosest, i.e. whether the cached result is considered valid
+bool	b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
+	if (m_needsUpdate)
+	{
+		m_cachedBC.reset();
+		m_needsUpdate = false;
+		switch (numVertices())
+		{
+		case 0:
+				m_cachedValidClosest = false;
+				break;
+		case 1:
+			{
+				// Single vertex: the closest points are the stored P/Q pair itself.
+				m_cachedP1 = m_simplexPointsP[0];
+				m_cachedP2 = m_simplexPointsQ[0];
+				m_cachedV = m_cachedP1-m_cachedP2; //== m_simplexVectorW[0]
+				m_cachedBC.reset();
+				m_cachedBC.setBarycentricCoordinates(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+				m_cachedValidClosest = m_cachedBC.isValid();
+				break;
+			};
+		case 2:
+			{
+			//closest point origin from line segment
+					const b3Vector3& from = m_simplexVectorW[0];
+					const b3Vector3& to = m_simplexVectorW[1];
+					b3Vector3 nearest;
+					b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+					b3Vector3 diff = p - from;
+					b3Vector3 v = to - from;
+					// t is the (initially unnormalized) projection parameter of the origin onto the segment.
+					b3Scalar t = v.dot(diff);
+					if (t > 0) {
+						b3Scalar dotVV = v.dot(v);
+						if (t < dotVV) {
+							t /= dotVV;
+							diff -= t*v;
+							m_cachedBC.m_usedVertices.usedVertexA = true;
+							m_cachedBC.m_usedVertices.usedVertexB = true;
+						} else {
+							t = 1;
+							diff -= v;
+							//reduce to 1 point
+							m_cachedBC.m_usedVertices.usedVertexB = true;
+						}
+					} else
+					{
+						t = 0;
+						//reduce to 1 point
+						m_cachedBC.m_usedVertices.usedVertexA = true;
+					}
+					m_cachedBC.setBarycentricCoordinates(1-t,t);
+					// "nearest" is assigned here but not read again in this function.
+					nearest = from + t*v;
+					m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]);
+					m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]);
+					m_cachedV = m_cachedP1 - m_cachedP2;
+					reduceVertices(m_cachedBC.m_usedVertices);
+					m_cachedValidClosest = m_cachedBC.isValid();
+					break;
+			}
+		case 3: 
+			{ 
+				//closest point origin from triangle 
+				b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); 
+				const b3Vector3& a = m_simplexVectorW[0]; 
+				const b3Vector3& b = m_simplexVectorW[1]; 
+				const b3Vector3& c = m_simplexVectorW[2]; 
+				closestPtPointTriangle(p,a,b,c,m_cachedBC); 
+				// Blend the stored P and Q points with the barycentric weights of the result.
+				m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] + 
+				m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] + 
+				m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2]; 
+				m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] + 
+				m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] + 
+				m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2]; 
+				m_cachedV = m_cachedP1-m_cachedP2; 
+				reduceVertices (m_cachedBC.m_usedVertices); 
+				m_cachedValidClosest = m_cachedBC.isValid(); 
+				break; 
+			}
+		case 4:
+			{
+				b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+				const b3Vector3& a = m_simplexVectorW[0];
+				const b3Vector3& b = m_simplexVectorW[1];
+				const b3Vector3& c = m_simplexVectorW[2];
+				const b3Vector3& d = m_simplexVectorW[3];
+				bool hasSeperation = closestPtPointTetrahedron(p,a,b,c,d,m_cachedBC);
+				if (hasSeperation)
+				{
+					m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
+						m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
+						m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] +
+						m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3];
+					m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
+						m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
+						m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] +
+						m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3];
+					m_cachedV = m_cachedP1-m_cachedP2;
+					reduceVertices (m_cachedBC.m_usedVertices);
+				} else
+				{
+//					printf("sub distance got penetration\n");
+					// closestPtPointTetrahedron returned false: either a degenerate tetrahedron
+					// (result invalid) or the origin is not outside any face (valid, zero vector).
+					// This branch leaves the switch before the isValid() check below.
+					if (m_cachedBC.m_degenerate)
+					{
+						m_cachedValidClosest = false;
+					} else
+					{
+						m_cachedValidClosest = true;
+						//degenerate case == false, penetration = true + zero
+						m_cachedV.setValue(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+					}
+					break;
+				}
+				m_cachedValidClosest = m_cachedBC.isValid();
+				//closest point origin from tetrahedron
+				break;
+			}
+		default:
+			{
+				m_cachedValidClosest = false;
+			}
+		};
+	}
+	return m_cachedValidClosest;
+//return/calculate the closest vertex
+/// Refreshes the cache via updateClosestVectorAndPoints() and copies m_cachedV into v.
+/// v is written even when the update reports failure.
+/// @param v receives m_cachedV
+/// @return the result of updateClosestVectorAndPoints()
+bool b3VoronoiSimplexSolver::closest(b3Vector3& v)
+	bool succes = updateClosestVectorAndPoints();
+	v = m_cachedV;
+	return succes;
+/// Returns the largest squared length (length2) among the current simplex W vectors,
+/// or 0 when the simplex is empty.
+b3Scalar b3VoronoiSimplexSolver::maxVertex()
+	int i, numverts = numVertices();
+	b3Scalar maxV = b3Scalar(0.);
+	for (i=0;i<numverts;i++)
+	{
+		b3Scalar curLen2 = m_simplexVectorW[i].length2();
+		if (maxV < curLen2)
+			maxV = curLen2;
+	}
+	return maxV;
+	//return the current simplex
+/// Copies the current simplex into three caller-provided arrays, one entry per vertex.
+/// The buffers are not size-checked; each must have room for numVertices() elements.
+/// @param pBuf receives m_simplexPointsP
+/// @param qBuf receives m_simplexPointsQ
+/// @param yBuf receives m_simplexVectorW
+/// @return the number of vertices copied
+int b3VoronoiSimplexSolver::getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const
+	int i;
+	for (i=0;i<numVertices();i++)
+	{
+		yBuf[i] = m_simplexVectorW[i];
+		pBuf[i] = m_simplexPointsP[i];
+		qBuf[i] = m_simplexPointsQ[i];
+	}
+	return numVertices();
+/// Reports whether w matches a vertex of the current simplex or equals m_lastW.
+/// The loop does not stop at the first match; it scans all vertices.
+/// @param w vector to look up
+/// @return true if a simplex vertex matched or w == m_lastW
+bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w)
+	bool found = false;
+	int i, numverts = numVertices();
+	//b3Scalar maxV = b3Scalar(0.);
+	//w is in the current (reduced) simplex
+	for (i=0;i<numverts;i++)
+	{
+		// NOTE(review): two conditions appear back to back here (distance threshold, then exact
+		// equality); presumably they are alternatives selected by a preprocessor conditional that
+		// is not visible in this excerpt — confirm. As written they nest.
+		if ( m_simplexVectorW[i].distance2(w) <= m_equalVertexThreshold)
+		if (m_simplexVectorW[i] == w)
+			found = true;
+	}
+	//check in case lastW is already removed
+	if (w == m_lastW)
+		return true;
+	return found;
+/// Copies the cached vector m_cachedV into v without triggering a cache update.
+/// @param v receives m_cachedV
+void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v) 
+	v = m_cachedV;
+/// Returns true when the simplex currently holds no vertices.
+bool b3VoronoiSimplexSolver::emptySimplex() const 
+	return (numVertices() == 0);
+/// Refreshes the cache via updateClosestVectorAndPoints() and returns the cached point pair.
+/// The validity result of the update is discarded; the outputs are written regardless.
+/// @param p1 receives m_cachedP1
+/// @param p2 receives m_cachedP2
+void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2) 
+	updateClosestVectorAndPoints();
+	p1 = m_cachedP1;
+	p2 = m_cachedP2;
+/// Computes the point on triangle abc closest to p by classifying p into a vertex, edge or
+/// face region, and records the outcome in result: m_closestPointOnSimplex, the usage flags
+/// of the vertices that support the closest point, and the barycentric coordinates.
+/// @param p      query point
+/// @param a      first triangle vertex
+/// @param b      second triangle vertex
+/// @param c      third triangle vertex
+/// @param result output; its usage flags are reset on entry
+/// @return always true
+bool	b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result)
+	result.m_usedVertices.reset();
+    // Check if P in vertex region outside A
+    b3Vector3 ab = b - a;
+    b3Vector3 ac = c - a;
+    b3Vector3 ap = p - a;
+    b3Scalar d1 = ab.dot(ap);
+    b3Scalar d2 = ac.dot(ap);
+    if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0)) 
+	{
+		result.m_closestPointOnSimplex = a;
+		result.m_usedVertices.usedVertexA = true;
+		result.setBarycentricCoordinates(1,0,0);
+		return true;// a; // barycentric coordinates (1,0,0)
+	}
+    // Check if P in vertex region outside B
+    b3Vector3 bp = p - b;
+    b3Scalar d3 = ab.dot(bp);
+    b3Scalar d4 = ac.dot(bp);
+    if (d3 >= b3Scalar(0.0) && d4 <= d3) 
+	{
+		result.m_closestPointOnSimplex = b;
+		result.m_usedVertices.usedVertexB = true;
+		result.setBarycentricCoordinates(0,1,0);
+		return true; // b; // barycentric coordinates (0,1,0)
+	}
+    // Check if P in edge region of AB, if so return projection of P onto AB
+    b3Scalar vc = d1*d4 - d3*d2;
+    if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0)) {
+        b3Scalar v = d1 / (d1 - d3);
+		result.m_closestPointOnSimplex = a + v * ab;
+		result.m_usedVertices.usedVertexA = true;
+		result.m_usedVertices.usedVertexB = true;
+		result.setBarycentricCoordinates(1-v,v,0);
+		return true;
+        //return a + v * ab; // barycentric coordinates (1-v,v,0)
+    }
+    // Check if P in vertex region outside C
+    b3Vector3 cp = p - c;
+    b3Scalar d5 = ab.dot(cp);
+    b3Scalar d6 = ac.dot(cp);
+    if (d6 >= b3Scalar(0.0) && d5 <= d6) 
+	{
+		result.m_closestPointOnSimplex = c;
+		result.m_usedVertices.usedVertexC = true;
+		result.setBarycentricCoordinates(0,0,1);
+		return true;//c; // barycentric coordinates (0,0,1)
+	}
+    // Check if P in edge region of AC, if so return projection of P onto AC
+    b3Scalar vb = d5*d2 - d1*d6;
+    if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0)) {
+        b3Scalar w = d2 / (d2 - d6);
+		result.m_closestPointOnSimplex = a + w * ac;
+		result.m_usedVertices.usedVertexA = true;
+		result.m_usedVertices.usedVertexC = true;
+		result.setBarycentricCoordinates(1-w,0,w);
+		return true;
+        //return a + w * ac; // barycentric coordinates (1-w,0,w)
+    }
+    // Check if P in edge region of BC, if so return projection of P onto BC
+    b3Scalar va = d3*d6 - d5*d4;
+    if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0)) {
+        b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6));
+		result.m_closestPointOnSimplex = b + w * (c - b);
+		result.m_usedVertices.usedVertexB = true;
+		result.m_usedVertices.usedVertexC = true;
+		result.setBarycentricCoordinates(0,1-w,w);
+		return true;		
+       // return b + w * (c - b); // barycentric coordinates (0,1-w,w)
+    }
+    // P inside face region. Compute Q through its barycentric coordinates (u,v,w)
+    // NOTE(review): va + vb + vc is not checked against zero before the division below;
+    // presumably a zero-area triangle is caught by one of the earlier regions — confirm.
+    b3Scalar denom = b3Scalar(1.0) / (va + vb + vc);
+    b3Scalar v = vb * denom;
+    b3Scalar w = vc * denom;
+	result.m_closestPointOnSimplex = a + ab * v + ac * w;
+	result.m_usedVertices.usedVertexA = true;
+	result.m_usedVertices.usedVertexB = true;
+	result.m_usedVertices.usedVertexC = true;
+	result.setBarycentricCoordinates(1-v-w,v,w);
+	return true;
+//	return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w
+/// Test if point p and d lie on opposite sides of plane through abc
+/// @param p query point
+/// @param a first point defining the plane
+/// @param b second point defining the plane
+/// @param c third point defining the plane
+/// @param d reference point whose side of the plane is compared with p's
+/// @return -1 when d is (nearly) in the plane abc, which is treated as degenerate; otherwise 1 if
+///         p and d are on opposite sides and 0 if not
+int b3VoronoiSimplexSolver::pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d)
+	b3Vector3 normal = (b-a).cross(c-a);
+    b3Scalar signp = (p - a).dot(normal); // [AP AB AC]
+    b3Scalar signd = (d - a).dot( normal); // [AD AB AC]
+// NOTE(review): two degeneracy thresholds (1e-8 and 1e-4) appear in sequence; presumably they are
+// alternatives selected by a preprocessor conditional that is not visible in this excerpt — confirm.
+if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8)))
+	{
+		return -1;
+	}
+	if (signd * signd < (b3Scalar(1e-4) * b3Scalar(1e-4)))
+	{
+//		printf("affine dependent/degenerate\n");//
+		return -1;
+	}
+	// Points on opposite sides if expression signs are opposite
+    return signp * signd < b3Scalar(0.);
+/// Finds the point on tetrahedron abcd closest to p. For every face that p lies outside of, the
+/// closest point on that face is computed with closestPtPointTriangle and the nearest candidate
+/// is kept; its usage flags and barycentric coordinates are remapped from the triangle's
+/// (A,B,C) slots to the tetrahedron's (A,B,C,D) slots.
+/// @param p           query point
+/// @param a           tetrahedron vertex A
+/// @param b           tetrahedron vertex B
+/// @param c           tetrahedron vertex C
+/// @param d           tetrahedron vertex D
+/// @param finalResult output; starts as "p itself, all four vertices used"
+/// @return false when any face test reports a degenerate configuration (m_degenerate is set) or
+///         when p is not outside any face; true otherwise
+bool	b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult)
+	b3SubSimplexClosestResult tempResult;
+    // Start out assuming point inside all halfspaces, so closest to itself
+	finalResult.m_closestPointOnSimplex = p;
+	finalResult.m_usedVertices.reset();
+    finalResult.m_usedVertices.usedVertexA = true;
+	finalResult.m_usedVertices.usedVertexB = true;
+	finalResult.m_usedVertices.usedVertexC = true;
+	finalResult.m_usedVertices.usedVertexD = true;
+    // Each test compares p against one face, using the remaining vertex as the inside reference.
+    int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d);
+	int pointOutsideACD = pointOutsideOfPlane(p, a, c, d, b);
+  	int	pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c);
+	int	pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a);
+   if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0)
+   {
+	   finalResult.m_degenerate = true;
+	   return false;
+   }
+   if (!pointOutsideABC  && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC)
+	 {
+		 return false;
+	 }
+    b3Scalar bestSqDist = FLT_MAX;
+    // If point outside face abc then compute closest point on abc
+	if (pointOutsideABC) 
+	{
+        closestPtPointTriangle(p, a, b, c,tempResult);
+		b3Vector3 q = tempResult.m_closestPointOnSimplex;
+        b3Scalar sqDist = (q - p).dot( q - p);
+        // Update best closest point if (squared) distance is less than current best
+        if (sqDist < bestSqDist) {
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			//convert result bitmask!
+			finalResult.m_usedVertices.reset();
+			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexB;
+			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
+			finalResult.setBarycentricCoordinates(
+					tempResult.m_barycentricCoords[VERTA],
+					tempResult.m_barycentricCoords[VERTB],
+					tempResult.m_barycentricCoords[VERTC],
+					0
+			);
+		}
+    }
+	// Repeat test for face acd
+	if (pointOutsideACD) 
+	{
+        closestPtPointTriangle(p, a, c, d,tempResult);
+		b3Vector3 q = tempResult.m_closestPointOnSimplex;
+		//convert result bitmask!
+        b3Scalar sqDist = (q - p).dot( q - p);
+        if (sqDist < bestSqDist) 
+		{
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			finalResult.m_usedVertices.reset();
+			// Triangle slots (A,B,C) correspond to tetrahedron vertices (a,c,d) for this face.
+			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexB;
+			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexC;
+			finalResult.setBarycentricCoordinates(
+					tempResult.m_barycentricCoords[VERTA],
+					0,
+					tempResult.m_barycentricCoords[VERTB],
+					tempResult.m_barycentricCoords[VERTC]
+			);
+		}
+    }
+    // Repeat test for face adb
+	if (pointOutsideADB)
+	{
+		closestPtPointTriangle(p, a, d, b,tempResult);
+		b3Vector3 q = tempResult.m_closestPointOnSimplex;
+		//convert result bitmask!
+        b3Scalar sqDist = (q - p).dot( q - p);
+        if (sqDist < bestSqDist) 
+		{
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			finalResult.m_usedVertices.reset();
+			// Triangle slots (A,B,C) correspond to tetrahedron vertices (a,d,b) for this face.
+			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexC;
+			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
+			finalResult.setBarycentricCoordinates(
+					tempResult.m_barycentricCoords[VERTA],
+					tempResult.m_barycentricCoords[VERTC],
+					0,
+					tempResult.m_barycentricCoords[VERTB]
+			);
+		}
+    }
+    // Repeat test for face bdc
+	if (pointOutsideBDC)
+	{
+        closestPtPointTriangle(p, b, d, c,tempResult);
+		b3Vector3 q = tempResult.m_closestPointOnSimplex;
+		//convert result bitmask!
+        b3Scalar sqDist = (q - p).dot( q - p);
+        if (sqDist < bestSqDist) 
+		{
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			finalResult.m_usedVertices.reset();
+			//
+			// Triangle slots (A,B,C) correspond to tetrahedron vertices (b,d,c) for this face.
+			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
+			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
+			finalResult.setBarycentricCoordinates(
+					0,
+					tempResult.m_barycentricCoords[VERTA],
+					tempResult.m_barycentricCoords[VERTC],
+					tempResult.m_barycentricCoords[VERTB]
+			);
+		}
+    }
+	//help! we ended up full !
+	// Both exits below return true, so this check does not change the result.
+	if (finalResult.m_usedVertices.usedVertexA &&
+		finalResult.m_usedVertices.usedVertexB &&
+		finalResult.m_usedVertices.usedVertexC &&
+		finalResult.m_usedVertices.usedVertexD) 
+	{
+		return true;
+	}
+    return true;
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h
new file mode 100644
index 00000000..a6e27667
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h
@@ -0,0 +1,177 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Vector3.h"
+///disable next define, or use defaultCollisionConfiguration->getSimplexSolver()->setEqualVertexThreshold(0.f) to disable/configure
+struct b3UsageBitfield{ // One "vertex is used" flag per simplex vertex (A..D), packed as 1-bit fields.
+	b3UsageBitfield() // Starts with all four vertex flags cleared.
+	{
+		reset();
+	}
+	void reset() // Clears usedVertexA..D; the unused1..4 bits are left untouched.
+	{
+		usedVertexA = false;
+		usedVertexB = false;
+		usedVertexC = false;
+		usedVertexD = false;
+	}
+	unsigned short usedVertexA	: 1; // simplex vertex A is used
+	unsigned short usedVertexB	: 1; // simplex vertex B is used
+	unsigned short usedVertexC	: 1; // simplex vertex C is used
+	unsigned short usedVertexD	: 1; // simplex vertex D is used
+	unsigned short unused1		: 1; // unused1..4: spare bits, never written by the constructor or reset()
+	unsigned short unused2		: 1;
+	unsigned short unused3		: 1;
+	unsigned short unused4		: 1;
+struct	b3SubSimplexClosestResult // Result record of a closest-point query on a (sub)simplex.
+	b3Vector3	m_closestPointOnSimplex; // the closest point that was found
+	// Vertex-usage flags of the result.
+	// m_usedVertices is a b3UsageBitfield: one named 1-bit flag per simplex vertex (A..D);
+	// a set flag means the related vertex is used.
+	b3UsageBitfield	m_usedVertices;
+	b3Scalar	m_barycentricCoords[4]; // slots 0..3 hold the a, b, c, d values passed to setBarycentricCoordinates()
+	bool m_degenerate; // cleared by reset(); not written anywhere else inside this struct
+	void	reset() // Clears the degenerate flag, zeroes all four barycentric coordinates and clears the usage flags.
+	{
+		m_degenerate = false;
+		setBarycentricCoordinates(); // all-default arguments, i.e. four zeros
+		m_usedVertices.reset();
+	}
+	bool	isValid() // True when none of the four barycentric coordinates is negative.
+	{
+		bool valid = (m_barycentricCoords[0] >= b3Scalar(0.)) &&
+			(m_barycentricCoords[1] >= b3Scalar(0.)) &&
+			(m_barycentricCoords[2] >= b3Scalar(0.)) &&
+			(m_barycentricCoords[3] >= b3Scalar(0.));
+		return valid;
+	}
+	void	setBarycentricCoordinates(b3Scalar a=b3Scalar(0.),b3Scalar b=b3Scalar(0.),b3Scalar c=b3Scalar(0.),b3Scalar d=b3Scalar(0.)) // Stores a, b, c, d in slots 0..3; omitted arguments default to 0.
+	{
+		m_barycentricCoords[0] = a;
+		m_barycentricCoords[1] = b;
+		m_barycentricCoords[2] = c;
+		m_barycentricCoords[3] = d;
+	}
+/// b3VoronoiSimplexSolver implements the closest-point distance algorithm from a simplex of 1 to 4 points to the origin.
+/// It can be used with GJK, as an alternative to the Johnson distance algorithm.
+B3_ATTRIBUTE_ALIGNED16(class) b3VoronoiSimplexSolver 
+	int	m_numVertices; // current number of simplex vertices (returned by numVertices(); 4 means fullSimplex())
+	b3Vector3	m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS]; // per-vertex storage; presumably the w/p/q arguments of addVertex() - confirm in the .cpp
+	b3Vector3	m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS];
+	b3Vector3	m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS];
+	b3Vector3	m_cachedP1;
+	b3Vector3	m_cachedP2;
+	b3Vector3	m_cachedV;
+	b3Vector3	m_lastW;
+	b3Scalar	m_equalVertexThreshold; // read/written through getEqualVertexThreshold()/setEqualVertexThreshold()
+	bool		m_cachedValidClosest;
+	b3SubSimplexClosestResult m_cachedBC; // cached closest-point record (point, usage flags, barycentric coordinates)
+	bool	m_needsUpdate;
+	void	removeVertex(int index);
+	void	reduceVertices (const b3UsageBitfield& usedVerts); // presumably drops the vertices whose flag is not set in usedVerts - confirm in the .cpp
+	bool	updateClosestVectorAndPoints();
+	bool	closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult); // closest point to p on tetrahedron a,b,c,d; the answer is written to finalResult
+	int		pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d);
+	bool	closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result); // closest point to p on triangle a,b,c; the answer is written to result
+	b3VoronoiSimplexSolver() // NOTE(review): no initializer list or body statement is visible in this hunk - confirm m_equalVertexThreshold and m_numVertices get defined values before use
+	{
+	}
+	 void reset();
+	 void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q);
+	 void	setEqualVertexThreshold(b3Scalar threshold) // Plain setter, no range check.
+	 {
+		 m_equalVertexThreshold = threshold;
+	 }
+	 b3Scalar	getEqualVertexThreshold() const
+	 {
+		 return m_equalVertexThreshold;
+	 }
+	 bool closest(b3Vector3& v);
+	 b3Scalar maxVertex();
+	 bool fullSimplex() const // True once the simplex holds exactly four vertices.
+	 {
+		 return (m_numVertices == 4);
+	 }
+	 int getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const;
+	 bool inSimplex(const b3Vector3& w);
+	 void backup_closest(b3Vector3& v) ;
+	 bool emptySimplex() const ;
+	 void compute_points(b3Vector3& p1, b3Vector3& p2) ;
+	 int numVertices() const 
+	 {
+		 return m_numVertices;
+	 }
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl
new file mode 100644
index 00000000..faa41344
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl
@@ -0,0 +1,283 @@
+// Shape-type ids used by the kernel below; keep them in sync with the CPU version (in btCollidable.h).
+// Written by Erwin Coumans.
+#define SHAPE_SPHERE 7 // NOTE(review): SHAPE_CONVEX_HULL, SHAPE_CONCAVE_TRIMESH, SHAPE_COMPOUND_OF_CONVEX_HULLS and MAX_NUM_PARTS_IN_BITS are used further down but have no #define among the lines shown for this file - confirm they are provided
+typedef unsigned int u32; // shorthand used for BodyData.m_collidableIdx
+/// btQuantizedBvhNode is a compressed AABB node, 16 bytes in total.
+/// A node is either a leaf or an internal node: a leaf stores a non-negative triangle index, an internal node a negated escape index (see isLeaf / getEscapeIndex).
+typedef struct
+	// 12 bytes: quantized AABB corners, three unsigned shorts each
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	// 4 bytes: >= 0 means leaf (triangle index in the low bits), < 0 means internal node (negated escape index)
+	int	m_escapeIndexOrTriangleIndex;
+} btQuantizedBvhNode;
+typedef struct // Per-BVH record: quantization frame plus where this BVH's nodes and subtree headers start in the shared arrays.
+	float4		m_aabbMin; // query points are clamped to [m_aabbMin, m_aabbMax] before quantization
+	float4		m_aabbMax;
+	float4		m_quantization; // per-component scale applied to (point - m_aabbMin)
+	int			m_numNodes; // not read by bvhTraversalKernel
+	int			m_numSubTrees; // number of subtree headers the kernel loops over
+	int			m_nodeOffset; // first node of this BVH inside quantizedNodesRoot
+	int			m_subTreeOffset; // first subtree header of this BVH inside subtreeHeadersRoot
+} b3BvhInfo;
+int	getTriangleIndex(const btQuantizedBvhNode* rootNode) // Returns the triangle index held in the node's shared index field.
+	unsigned int x=0;
+	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS); // all ones shifted left: a mask of the bits ABOVE the triangle index
+	// Keep only the low (31-MAX_NUM_PARTS_IN_BITS) bits, where the triangle index is stored; everything covered by y is cleared.
+	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));
+int isLeaf(const btQuantizedBvhNode* rootNode) // Returns 1 for a leaf node and 0 for an internal node.
+	// The shared index field is negative for an internal node (negated skip/escape index) and >= 0 for a leaf (triangle index).
+	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;
+int getEscapeIndex(const btQuantizedBvhNode* rootNode) // Returns the negated index field; the traversal adds this value to its node cursor, so it is only meaningful for internal nodes (negative field).
+	return -rootNode->m_escapeIndexOrTriangleIndex;
+typedef struct // Header of one BVH subtree: its quantized bounds and the node range it covers.
+	// 12 bytes: quantized AABB of the whole subtree
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	// 4 bytes: index of the subtree's root node
+	int			m_rootNodeIndex;
+	// 4 bytes: node count; the kernel walks [m_rootNodeIndex, m_rootNodeIndex+m_subtreeSize)
+	int			m_subtreeSize;
+	int			m_padding[3]; // three unused ints
+} btBvhSubtreeInfo;
+/// Keep this layout in sync with btCollidable.h.
+typedef struct
+	int m_numChildShapes; // child count when read for body B; NOTE(review): bvhTraversalKernel also uses this field of the trimesh collidable (body A) as its index into bvhInfos
+	int blaat2; // not read by bvhTraversalKernel
+	int m_shapeType; // compared against the SHAPE_* ids
+	int m_shapeIndex; // the kernel forms child shape indices as m_shapeIndex+b
+} btCollidableGpu;
+typedef struct // Presumably one child of a compound shape (position, orientation, shape index) - not referenced by bvhTraversalKernel; confirm against the CPU-side struct.
+	float4	m_childPosition;
+	float4	m_childOrientation;
+	int m_shapeIndex;
+	int m_unused0; // m_unused0..2: three spare ints
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+typedef struct // Per-rigid-body record as read by the kernel.
+	float4 m_pos;
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+	u32 m_collidableIdx; // index into the collidables array
+	float m_invMass; // the kernel skips a pair when both bodies have m_invMass==0 (static-static)
+	float m_restituitionCoeff; // identifier spelling is part of the layout; left as is
+	float m_frictionCoeff;
+} BodyData;
+typedef struct // AABB whose min and max corners can each be viewed as a float4, as four floats, or as four ints (the union members share storage).
+	union
+	{
+		float4	m_min; // the view read by bvhTraversalKernel
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max; // the view read by bvhTraversalKernel
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+int testQuantizedAabbAgainstQuantizedAabb( // Returns 1 when the two quantized boxes overlap on all three axes, 0 as soon as one axis separates them.
+								const unsigned short int* aabbMin1,
+								const unsigned short int* aabbMax1,
+								const unsigned short int* aabbMin2,
+								const unsigned short int* aabbMax2)
+	//int overlap = 1;
+	if (aabbMin1[0] > aabbMax2[0]) // axis 0: box 1 lies entirely above box 2
+		return 0;
+	if (aabbMax1[0] < aabbMin2[0]) // axis 0: box 1 lies entirely below box 2
+		return 0;
+	if (aabbMin1[1] > aabbMax2[1]) // same two tests for axis 1
+		return 0;
+	if (aabbMax1[1] < aabbMin2[1])
+		return 0;
+	if (aabbMin1[2] > aabbMax2[2]) // same two tests for axis 2
+		return 0;
+	if (aabbMax1[2] < aabbMin2[2])
+		return 0;
+	return 1; // touching boxes (equal bounds) count as overlapping
+	//overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;
+	//overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;
+	//overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;
+	//return overlap;
+void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization) // Clamps point2 to the BVH bounds and writes its three quantized coordinates to out[0..2].
+	float4 clampedPoint = max(point2,bvhAabbMin);
+	clampedPoint = min (clampedPoint, bvhAabbMax);
+	float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization; // position relative to the BVH minimum, scaled into quantized units
+	if (isMax)
+	{
+		out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1)); // max corner: add one and force the low bit on, so the value is odd
+		out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));
+		out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));
+	} else
+	{
+		out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe)); // min corner: truncate and clear the low bit, so the value is even
+		out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));
+		out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));
+	}
+// Work in progress. One work-item per input pair: when body A is a concave trimesh and body B passes the shape-type filter, B's AABB is tested against A's quantized BVH and one int4 (bodyIndexA, bodyIndexB, triangleIndex, child shape index or 0) is appended to concavePairsOut per overlapping leaf.
+__kernel void   bvhTraversalKernel( __global const int4* pairs, 
+									__global const BodyData* rigidBodies, 
+									__global const btCollidableGpu* collidables,
+									__global btAabbCL* aabbs,
+									__global int4* concavePairsOut,
+									__global volatile int* numConcavePairsOut, // shared output counter, advanced atomically
+									__global const btBvhSubtreeInfo* subtreeHeadersRoot,
+									__global const btQuantizedBvhNode* quantizedNodesRoot,
+									__global const b3BvhInfo* bvhInfos,
+									int numPairs,
+									int maxNumConcavePairsCapacity) // writes to concavePairsOut are limited to indices below this value
+	int id = get_global_id(0);
+	if (id>=numPairs) // surplus work-items have nothing to do
+		return;
+	int bodyIndexA = pairs[id].x;
+	int bodyIndexB = pairs[id].y;
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	//once the broadphase avoids static-static pairs, we can remove this test
+	if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+	{
+		return;
+	}
+	if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) // only pairs whose first body is a concave trimesh are handled
+		return;
+	int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+	if (shapeTypeB!=SHAPE_CONVEX_HULL &&
+		shapeTypeB!=SHAPE_SPHERE	&&
+		) // NOTE(review): the condition ends in a dangling '&&' - a third operand appears to be missing on the line before this one
+		return;
+	b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes]; // for the trimesh, m_numChildShapes is used as the index of its BVH record
+	float4 bvhAabbMin = bvhInfo.m_aabbMin;
+	float4 bvhAabbMax = bvhInfo.m_aabbMax;
+	float4 bvhQuantization = bvhInfo.m_quantization;
+	int numSubtreeHeaders = bvhInfo.m_numSubTrees;
+	__global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];
+	__global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];
+	unsigned short int quantizedQueryAabbMin[3];
+	unsigned short int quantizedQueryAabbMax[3];
+	quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization); // body B's AABB expressed in this BVH's quantized units
+	quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);
+	for (int i=0;i<numSubtreeHeaders;i++)
+	{
+		btBvhSubtreeInfo subtree = subtreeHeaders[i];
+		int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+		if (overlap != 0) // subtrees whose bounds miss the query box are skipped entirely
+		{
+			int startNodeIndex = subtree.m_rootNodeIndex;
+			int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;
+			int curIndex = startNodeIndex;
+			int escapeIndex;
+			int isLeafNode;
+			int aabbOverlap;
+			while (curIndex < endNodeIndex) // walk the subtree's node range with a single cursor (no stack)
+			{
+				btQuantizedBvhNode rootNode = quantizedNodes[curIndex];
+				aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);
+				isLeafNode = isLeaf(&rootNode);
+				if (aabbOverlap)
+				{
+					if (isLeafNode)
+					{
+						int triangleIndex = getTriangleIndex(&rootNode);
+						{ // NOTE(review): this block is closed by "} else" below but no 'if' precedes it here; the stringified copy of this kernel has "if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)" at this spot
+								int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+								int pairIdx = atomic_add(numConcavePairsOut,numChildrenB); // reserves numChildrenB consecutive slots; the counter advances even when they lie beyond the capacity
+								for (int b=0;b<numChildrenB;b++)
+								{
+									if ((pairIdx+b)<maxNumConcavePairsCapacity) // only the write is bounds-checked
+									{
+										int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+										int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);
+										concavePairsOut[pairIdx+b] = newPair;
+									}
+								}
+						} else
+						{
+							int pairIdx = atomic_inc(numConcavePairsOut); // reserves one slot; as above the counter can exceed the capacity
+							if (pairIdx<maxNumConcavePairsCapacity)
+							{
+								int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);
+								concavePairsOut[pairIdx] = newPair;
+							}
+						}
+					} 
+					curIndex++; // overlapping node (leaf or internal): continue with the next node in array order
+				} else
+				{
+					if (isLeafNode)
+					{
+						curIndex++;
+					} else
+					{
+						escapeIndex = getEscapeIndex(&rootNode);
+						curIndex += escapeIndex; // internal node without overlap: jump ahead by its escape index
+					}
+				}
+			}
+		}
+	}
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h
new file mode 100644
index 00000000..4b3b49ea
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h
@@ -0,0 +1,258 @@
+// This file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project; the string below holds the OpenCL source of bvhTraversalKernel, one quoted line per source line.
+static const char* bvhTraversalKernelCL= \
+"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
+"//written by Erwin Coumans\n"
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define SHAPE_SPHERE 7\n"
+"typedef unsigned int u32;\n"
+"#define MAX_NUM_PARTS_IN_BITS 10\n"
+"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
+"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
+"typedef struct\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes\n"
+"	int	m_escapeIndexOrTriangleIndex;\n"
+"} btQuantizedBvhNode;\n"
+"typedef struct\n"
+"	float4		m_aabbMin;\n"
+"	float4		m_aabbMax;\n"
+"	float4		m_quantization;\n"
+"	int			m_numNodes;\n"
+"	int			m_numSubTrees;\n"
+"	int			m_nodeOffset;\n"
+"	int			m_subTreeOffset;\n"
+"} b3BvhInfo;\n"
+"int	getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
+"	unsigned int x=0;\n"
+"	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
+"	// Get only the lower bits where the triangle index is stored\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
+"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
+"	//skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
+"	\n"
+"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
+"	return -rootNode->m_escapeIndexOrTriangleIndex;\n"
+"typedef struct\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes, points to the root of the subtree\n"
+"	int			m_rootNodeIndex;\n"
+"	//4 bytes\n"
+"	int			m_subtreeSize;\n"
+"	int			m_padding[3];\n"
+"} btBvhSubtreeInfo;\n"
+"///keep this in sync with btCollidable.h\n"
+"typedef struct\n"
+"	int m_numChildShapes;\n"
+"	int blaat2;\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"typedef struct\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"typedef struct\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_collidableIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"typedef struct \n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"int testQuantizedAabbAgainstQuantizedAabb(\n"
+"								const unsigned short int* aabbMin1,\n"
+"								const unsigned short int* aabbMax1,\n"
+"								const unsigned short int* aabbMin2,\n"
+"								const unsigned short int* aabbMax2)\n"
+"	//int overlap = 1;\n"
+"	if (aabbMin1[0] > aabbMax2[0])\n"
+"		return 0;\n"
+"	if (aabbMax1[0] < aabbMin2[0])\n"
+"		return 0;\n"
+"	if (aabbMin1[1] > aabbMax2[1])\n"
+"		return 0;\n"
+"	if (aabbMax1[1] < aabbMin2[1])\n"
+"		return 0;\n"
+"	if (aabbMin1[2] > aabbMax2[2])\n"
+"		return 0;\n"
+"	if (aabbMax1[2] < aabbMin2[2])\n"
+"		return 0;\n"
+"	return 1;\n"
+"	//overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n"
+"	//overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n"
+"	//overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n"
+"	//return overlap;\n"
+"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n"
+"	float4 clampedPoint = max(point2,bvhAabbMin);\n"
+"	clampedPoint = min (clampedPoint, bvhAabbMax);\n"
+"	float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n"
+"	if (isMax)\n"
+"	{\n"
+"		out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n"
+"		out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n"
+"		out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n"
+"	} else\n"
+"	{\n"
+"		out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n"
+"		out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n"
+"		out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n"
+"	}\n"
+"// work-in-progress\n"
+"__kernel void   bvhTraversalKernel( __global const int4* pairs, \n"
+"									__global const BodyData* rigidBodies, \n"
+"									__global const btCollidableGpu* collidables,\n"
+"									__global btAabbCL* aabbs,\n"
+"									__global int4* concavePairsOut,\n"
+"									__global volatile int* numConcavePairsOut,\n"
+"									__global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
+"									__global const btQuantizedBvhNode* quantizedNodesRoot,\n"
+"									__global const b3BvhInfo* bvhInfos,\n"
+"									int numPairs,\n"
+"									int maxNumConcavePairsCapacity)\n"
+"	int id = get_global_id(0);\n"
+"	if (id>=numPairs)\n"
+"		return;\n"
+"	\n"
+"	int bodyIndexA = pairs[id].x;\n"
+"	int bodyIndexB = pairs[id].y;\n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"	//once the broadphase avoids static-static pairs, we can remove this test\n"
+"	if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
+"	{\n"
+"		return;\n"
+"	}\n"
+"		\n"
+"	if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n"
+"		return;\n"
+"	int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
+"		\n"
+"	if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n"
+"		shapeTypeB!=SHAPE_SPHERE	&&\n"
+"		)\n" // NOTE(review): the embedded condition ends in a dangling '&&' - a third operand appears to be missing before this line
+"		return;\n"
+"	b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n"
+"	float4 bvhAabbMin = bvhInfo.m_aabbMin;\n"
+"	float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
+"	float4 bvhQuantization = bvhInfo.m_quantization;\n"
+"	int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
+"	__global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
+"	__global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
+"	\n"
+"	unsigned short int quantizedQueryAabbMin[3];\n"
+"	unsigned short int quantizedQueryAabbMax[3];\n"
+"	quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
+"	quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
+"	\n"
+"	for (int i=0;i<numSubtreeHeaders;i++)\n"
+"	{\n"
+"		btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
+"				\n"
+"		int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
+"		if (overlap != 0)\n"
+"		{\n"
+"			int startNodeIndex = subtree.m_rootNodeIndex;\n"
+"			int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n"
+"			int curIndex = startNodeIndex;\n"
+"			int escapeIndex;\n"
+"			int isLeafNode;\n"
+"			int aabbOverlap;\n"
+"			while (curIndex < endNodeIndex)\n"
+"			{\n"
+"				btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
+"				aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
+"				isLeafNode = isLeaf(&rootNode);\n"
+"				if (aabbOverlap)\n"
+"				{\n"
+"					if (isLeafNode)\n"
+"					{\n"
+"						int triangleIndex = getTriangleIndex(&rootNode);\n"
+"						if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"						{\n"
+"								int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
+"								int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n"
+"								for (int b=0;b<numChildrenB;b++)\n"
+"								{\n"
+"									if ((pairIdx+b)<maxNumConcavePairsCapacity)\n"
+"									{\n"
+"										int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
+"										int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n"
+"										concavePairsOut[pairIdx+b] = newPair;\n"
+"									}\n"
+"								}\n"
+"						} else\n"
+"						{\n"
+"							int pairIdx = atomic_inc(numConcavePairsOut);\n"
+"							if (pairIdx<maxNumConcavePairsCapacity)\n"
+"							{\n"
+"								int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n"
+"								concavePairsOut[pairIdx] = newPair;\n"
+"							}\n"
+"						}\n"
+"					} \n"
+"					curIndex++;\n"
+"				} else\n"
+"				{\n"
+"					if (isLeafNode)\n"
+"					{\n"
+"						curIndex++;\n"
+"					} else\n"
+"					{\n"
+"						escapeIndex = getEscapeIndex(&rootNode);\n"
+"						curIndex += escapeIndex;\n"
+"					}\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl
new file mode 100644
index 00000000..e754f4e1
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl
@@ -0,0 +1,311 @@
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#define AppendInc(x, out) out = atomic_inc(x) // atomically bumps counter x and stores the value atomic_inc returns in 'out'
+#define GET_NPOINTS(x) (x).m_worldNormalOnB.w // the contact's point count lives in the w component of its normal
+#ifdef cl_ext_atomic_counters_32
+	#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+	#define counter32_t volatile __global int* // type of the nGlobalContactsOut kernel argument; NOTE(review): no #else/#endif follows in the lines shown - confirm the conditional is closed
+__kernel void   mprPenetrationKernel( __global int4* pairs, // One work-item per pair; for convex-hull/convex-hull pairs where b3MprPenetration returns 0, one single-point contact is appended and its index stored in pairs[i].z.
+																					__global const b3RigidBodyData_t* rigidBodies, 
+																					__global const b3Collidable_t* collidables,
+																					__global const b3ConvexPolyhedronData_t* convexShapes, 
+																					__global const float4* vertices,
+																					__global float4* separatingNormals,
+																					__global int* hasSeparatingAxis,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut, // shared contact counter, advanced with AppendInc
+																					int contactCapacity, // contacts are only written at indices below this value
+																					int numPairs)
+	int i = get_global_id(0);
+	int pairIndex = i;
+	if (i<numPairs)
+	{
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; // shapeIndexA/B are not used again in this kernel
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL)) // both shapes must be convex hulls
+		{
+			return;
+		}
+		float depthOut;
+		b3Float4 dirOut;
+		b3Float4 posOut;
+		int res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB,rigidBodies,convexShapes,collidables,vertices,separatingNormals,hasSeparatingAxis,&depthOut, &dirOut, &posOut);
+		if (res==0) // a zero result is the case in which a contact is emitted
+		{
+			//add a contact
+			int dstIdx;
+			AppendInc( nGlobalContactsOut, dstIdx ); // the counter advances even when dstIdx ends up beyond contactCapacity
+			if (dstIdx<contactCapacity)
+			{
+				pairs[pairIndex].z = dstIdx; // remember which contact belongs to this pair
+				__global struct b3Contact4Data* c = globalContactsOut + dstIdx;
+				c->m_worldNormalOnB = -dirOut;//normal;
+				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // hard-coded restitution 0 and friction 0.7, scaled by 0xffff
+				c->m_batchIdx = pairIndex;
+				int bodyA = pairs[pairIndex].x; // same values as bodyIndexA/bodyIndexB read above
+				int bodyB = pairs[pairIndex].y;
+				c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA; // bodies with zero inverse mass are stored negated; NOTE(review): index 0 cannot carry that sign
+				c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;
+				c->m_childIndexA = -1;
+				c->m_childIndexB = -1;
+				//for (int i=0;i<nContacts;i++)
+				posOut.w = -depthOut; // the negated depth travels in the w component of the contact position
+				c->m_worldPosB[0] = posOut;//localPoints[contactIdx[i]];
+				GET_NPOINTS(*c) = 1;//nContacts;
+			}
+		}
+	}
+typedef float4 Quaternion; // quaternion stored as (x, y, z, w): the helpers below treat .xyz as the vector part and .w as the scalar part
+#define make_float4 (float4) // lets make_float4(...) expand to an OpenCL (float4)(...) vector literal
+float dot3F4(float4 a, float4 b) // Dot product of the xyz parts only; both w components are replaced by 0 first.
+	float4 a1 = make_float4(a.xyz,0.f);
+	float4 b1 = make_float4(b.xyz,0.f);
+	return dot(a1, b1);
+float4 cross3(float4 a, float4 b) // Thin wrapper around the built-in cross() for float4 arguments.
+	return cross(a,b);
+Quaternion qtMul(Quaternion a, Quaternion b) // Quaternion product a*b with the scalar part kept in .w.
+	Quaternion ans;
+	ans = cross3( a, b ); // vector part: a.xyz x b.xyz ...
+	ans += a.w*b+b.w*a; // ... + a.w*b.xyz + b.w*a.xyz (this also touches .w, which is overwritten below)
+//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w*b.w - dot3F4(a, b); // scalar part
+	return ans;
+Quaternion qtInvert(Quaternion q) // Returns the conjugate (xyz negated, w kept); this equals the inverse only when q has unit length - assumes callers pass normalized quaternions.
+	return (Quaternion)(-q.xyz, q.w);
+float4 qtRotate(Quaternion q, float4 vec) // Rotates vec by q, computed as q * (vec.xyz, 0) * conjugate(q).
+	Quaternion qInv = qtInvert( q );
+	float4 vcpy = vec;
+	vcpy.w = 0.f; // the input's w is ignored
+	float4 out = qtMul(qtMul(q,vcpy),qInv);
+	return out;
+float4 transform(const float4* p, const float4* translation, const Quaternion* orientation) // Rotates *p by *orientation, then adds *translation (all four components of the translation are added).
+	return qtRotate( *orientation, *p ) + (*translation);
+float4 qtInvRotate(const Quaternion q, float4 vec) // Rotates vec by the conjugate of q, i.e. applies the opposite rotation of qtRotate(q, vec).
+	return qtRotate( qtInvert( q ), vec );
+inline void project(__global const b3ConvexPolyhedronData_t* hull,  const float4 pos, const float4 orn, 
+const float4* dir, __global const float4* vertices, float* min, float* max) // Writes to min[0]/max[0] the extent of the hull's vertices along *dir, for a hull located at pos with orientation orn.
+	min[0] = FLT_MAX;
+	max[0] = -FLT_MAX;
+	int numVerts = hull->m_numVertices;
+	const float4 localDir = qtInvRotate(orn,*dir); // direction rotated by the inverse orientation, so the stored vertices can be used as they are
+	float offset = dot(pos,*dir); // four-component dot: the caller in this file zeroes pos.w before calling
+	for(int i=0;i<numVerts;i++)
+	{
+		float dp = dot(vertices[hull->m_vertexOffset+i],localDir); // the hull's vertices start at m_vertexOffset in the shared array
+		if(dp < min[0])	
+			min[0] = dp;
+		if(dp > max[0])	
+			max[0] = dp;
+	}
+	if(min[0]>max[0]) // can only hold when no vertex updated the range (for example numVerts <= 0)
+	{
+		float tmp = min[0];
+		min[0] = max[0];
+		max[0] = tmp;
+	}
+	min[0] += offset; // shift the local extent by the position's component along dir
+	max[0] += offset;
+bool findSeparatingAxisUnitSphere(	__global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, // Tests every given direction: returns false as soon as one of them separates the hulls, otherwise true with *sep/*dmin updated to the direction of smallest overlap found.
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices,
+	__global const float4* unitSphereDirections,
+	int numUnitSphereDirections,
+	float4* sep, // in/out: best axis so far
+	float* dmin) // in/out: smallest overlap so far
+	float4 posA = posA1;
+	posA.w = 0.f; // w is cleared because project() uses a four-component dot with the position
+	float4 posB = posB1;
+	posB.w = 0.f;
+	int curPlaneTests=0; // curPlaneTests and curEdgeEdge are not used in this function
+	int curEdgeEdge = 0;
+	// Test unit sphere directions
+	for (int i=0;i<numUnitSphereDirections;i++)
+	{
+		float4 crossje;
+		crossje = unitSphereDirections[i];	
+		if (dot3F4(DeltaC2,crossje)>0) // flip the candidate so its dot product with DeltaC2 is not positive
+			crossje *= -1.f;
+		{
+			float dist;
+			bool result = true; // written but never read
+			float Min0,Max0;
+			float Min1,Max1;
+			project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
+			project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
+			if(Max0<Min1 || Max1<Min0) // the projected intervals are disjoint: this direction separates the hulls
+				return false;
+			float d0 = Max0 - Min1;
+			float d1 = Max1 - Min0;
+			dist = d0<d1 ? d0:d1; // overlap depth along this direction
+			result = true;
+			if(dist<*dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f) // final orientation fix: make dot(-DeltaC2, *sep) non-positive
+	{
+		*sep = -(*sep);
+	}
+	return true;
+__kernel void   findSeparatingAxisUnitSphereKernel( __global const int4* pairs, // One work-item per pair: for pairs still flagged in hasSeparatingAxis, runs the unit-sphere direction test and updates the flag and the stored normal.
+																					__global const b3RigidBodyData_t* rigidBodies, 
+																					__global const b3Collidable_t* collidables,
+																					__global const b3ConvexPolyhedronData_t* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* unitSphereDirections,
+																					__global  float4* separatingNormals, // in/out, one entry per pair
+																					__global  int* hasSeparatingAxis, // in/out, one entry per pair; entries that are 0 are left alone
+																					__global  float* dmins, // read only here: the updated dmin is not written back
+																					int numUnitSphereDirections,
+																					int numPairs
+																					)
+	int i = get_global_id(0);
+	if (i<numPairs)
+	{
+		if (hasSeparatingAxis[i])
+		{
+			int bodyIndexA = pairs[i].x;
+			int bodyIndexB = pairs[i].y;
+			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+			int numFacesA = convexShapes[shapeIndexA].m_numFaces; // not used below
+			float dmin = dmins[i];
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			posA.w = 0.f;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+			posB.w = 0.f;
+			float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+			float4 ornA = rigidBodies[bodyIndexA].m_quat;
+			float4 c0 = transform(&c0local, &posA, &ornA); // hull A's local center moved to A's pose
+			float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+			float4 ornB =rigidBodies[bodyIndexB].m_quat;
+			float4 c1 = transform(&c1local,&posB,&ornB);
+			const float4 DeltaC2 = c0 - c1; // vector from B's center to A's center
+			float4 sepNormal = separatingNormals[i];
+			int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;
+			if (numEdgeEdgeDirections>numUnitSphereDirections) // the unit-sphere test only runs when the edge-edge direction count exceeds the number of sphere directions
+			{
+				bool sepEE = findSeparatingAxisUnitSphere(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																										posB,ornB,
+																										DeltaC2,
+																										vertices,unitSphereDirections,numUnitSphereDirections,&sepNormal,&dmin);
+				if (!sepEE) // false means a separating direction was found
+				{
+					hasSeparatingAxis[i] = 0;
+				} else
+				{
+					hasSeparatingAxis[i] = 1;
+					separatingNormals[i] = sepNormal;
+				}
+			}
+		}		//if (hasSeparatingAxis[i])
+	}//(i<numPairs)
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h
new file mode 100644
index 00000000..7ed4b382
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h
@@ -0,0 +1,1446 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* mprKernelsCL= \
+" * ---------------------------------\n"
+" * Copyright (c)2012 Daniel Fiser <danfis@danfis.cz>\n"
+" *\n"
+" *  This file was ported from mpr.c file, part of libccd.\n"
+" *  The Minkoski Portal Refinement implementation was ported \n"
+" *  to OpenCL by Erwin Coumans for the Bullet 3 Physics library.\n"
+" *  at http://github.com/erwincoumans/bullet3\n"
+" *\n"
+" *  Distributed under the OSI-approved BSD License (the \"License\");\n"
+" *  see <http://www.opensource.org/licenses/bsd-license.php>.\n"
+" *  This software is distributed WITHOUT ANY WARRANTY; without even the\n"
+" *  See the License for more information.\n"
+" */\n"
+"#ifndef B3_MPR_PENETRATION_H\n"
+"#define B3_MPR_PENETRATION_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_RIGIDBODY_DATA_H\n"
+"#define B3_RIGIDBODY_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
+"	b3Quat q;\n"
+"	q=in;\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"typedef struct\n"
+"	b3Float4 m_row[3];\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n"
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"b3Mat3x3 mtZero();\n"
+"b3Mat3x3 mtIdentity();\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"b3Mat3x3 mtZero()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"b3Mat3x3 mtIdentity()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
+"struct b3RigidBodyData\n"
+"	b3Float4				m_pos;\n"
+"	b3Quat					m_quat;\n"
+"	b3Float4				m_linVel;\n"
+"	b3Float4				m_angVel;\n"
+"	int 					m_collidableIdx;\n"
+"	float 				m_invMass;\n"
+"	float 				m_restituitionCoeff;\n"
+"	float 				m_frictionCoeff;\n"
+"typedef struct b3InertiaData b3InertiaData_t;\n"
+"struct b3InertiaData\n"
+"	b3Mat3x3 m_invInertiaWorld;\n"
+"	b3Mat3x3 m_initInvInertia;\n"
+"#endif //B3_RIGIDBODY_DATA_H\n"
+"	\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"typedef struct b3GpuFace b3GpuFace_t;\n"
+"struct b3GpuFace\n"
+"	b3Float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"	int m_unusedPadding1;\n"
+"	int m_unusedPadding2;\n"
+"typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;\n"
+"struct b3ConvexPolyhedronData\n"
+"	b3Float4		m_localCenter;\n"
+"	b3Float4		m_extents;\n"
+"	b3Float4		mC;\n"
+"	b3Float4		mE;\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"#ifndef B3_COLLIDABLE_H\n"
+"#define B3_COLLIDABLE_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"enum b3ShapeTypes\n"
+"	SHAPE_PLANE=4,\n"
+"typedef struct b3Collidable b3Collidable_t;\n"
+"struct b3Collidable\n"
+"	union {\n"
+"		int m_numChildShapes;\n"
+"		int m_bvhIndex;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float m_radius;\n"
+"		int	m_compoundBvhIndex;\n"
+"	};\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"typedef struct b3GpuChildShape b3GpuChildShape_t;\n"
+"struct b3GpuChildShape\n"
+"	b3Float4	m_childPosition;\n"
+"	b3Quat		m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"struct b3CompoundOverlappingPair\n"
+"	int m_bodyIndexA;\n"
+"	int m_bodyIndexB;\n"
+"//	int	m_pairType;\n"
+"	int m_childShapeIndexA;\n"
+"	int m_childShapeIndexB;\n"
+"#endif //B3_COLLIDABLE_H\n"
+"#ifdef __cplusplus\n"
+"#define B3_MPR_SQRT sqrt\n"
+"#define B3_MPR_FMIN(x, y) ((x) < (y) ? (x) : (y))\n"
+"#define B3_MPR_FABS fabs\n"
+"#define B3_MPR_TOLERANCE 1E-6f\n"
+"#define B3_MPR_MAX_ITERATIONS 1000\n"
+"struct _b3MprSupport_t \n"
+"    b3Float4 v;  //!< Support point in minkowski sum\n"
+"    b3Float4 v1; //!< Support point in obj1\n"
+"    b3Float4 v2; //!< Support point in obj2\n"
+"typedef struct _b3MprSupport_t b3MprSupport_t;\n"
+"struct _b3MprSimplex_t \n"
+"    b3MprSupport_t ps[4];\n"
+"    int last; //!< index of last added point\n"
+"typedef struct _b3MprSimplex_t b3MprSimplex_t;\n"
+"inline b3MprSupport_t* b3MprSimplexPointW(b3MprSimplex_t *s, int idx)\n"
+"    return &s->ps[idx];\n"
+"inline void b3MprSimplexSetSize(b3MprSimplex_t *s, int size)\n"
+"    s->last = size - 1;\n"
+"inline int b3MprSimplexSize(const b3MprSimplex_t *s)\n"
+"    return s->last + 1;\n"
+"inline const b3MprSupport_t* b3MprSimplexPoint(const b3MprSimplex_t* s, int idx)\n"
+"    // here is no check on boundaries\n"
+"    return &s->ps[idx];\n"
+"inline void b3MprSupportCopy(b3MprSupport_t *d, const b3MprSupport_t *s)\n"
+"    *d = *s;\n"
+"inline void b3MprSimplexSet(b3MprSimplex_t *s, size_t pos, const b3MprSupport_t *a)\n"
+"    b3MprSupportCopy(s->ps + pos, a);\n"
+"inline void b3MprSimplexSwap(b3MprSimplex_t *s, size_t pos1, size_t pos2)\n"
+"    b3MprSupport_t supp;\n"
+"    b3MprSupportCopy(&supp, &s->ps[pos1]);\n"
+"    b3MprSupportCopy(&s->ps[pos1], &s->ps[pos2]);\n"
+"    b3MprSupportCopy(&s->ps[pos2], &supp);\n"
+"inline int b3MprIsZero(float val)\n"
+"    return B3_MPR_FABS(val) < FLT_EPSILON;\n"
+"inline int b3MprEq(float _a, float _b)\n"
+"    float ab;\n"
+"    float a, b;\n"
+"    ab = B3_MPR_FABS(_a - _b);\n"
+"    if (B3_MPR_FABS(ab) < FLT_EPSILON)\n"
+"        return 1;\n"
+"    a = B3_MPR_FABS(_a);\n"
+"    b = B3_MPR_FABS(_b);\n"
+"    if (b > a){\n"
+"        return ab < FLT_EPSILON * b;\n"
+"    }else{\n"
+"        return ab < FLT_EPSILON * a;\n"
+"    }\n"
+"inline int b3MprVec3Eq(const b3Float4* a, const b3Float4 *b)\n"
+"    return b3MprEq((*a).x, (*b).x)\n"
+"            && b3MprEq((*a).y, (*b).y)\n"
+"            && b3MprEq((*a).z, (*b).z);\n"
+"inline b3Float4 b3LocalGetSupportVertex(b3Float4ConstArg supportVec,__global const b3ConvexPolyhedronData_t* hull, 	b3ConstArray(b3Float4) verticesA)\n"
+"	b3Float4 supVec = b3MakeFloat4(0,0,0,0);\n"
+"	float maxDot = -B3_LARGE_FLOAT;\n"
+"    if( 0 < hull->m_numVertices )\n"
+"    {\n"
+"        const b3Float4 scaled = supportVec;\n"
+"		int index = b3MaxDot(scaled, &verticesA[hull->m_vertexOffset], hull->m_numVertices, &maxDot);\n"
+"        return verticesA[hull->m_vertexOffset+index];\n"
+"    }\n"
+"    return supVec;\n"
+"B3_STATIC void b3MprConvexSupport(int pairIndex,int bodyIndex,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n"
+"													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"													b3ConstArray(b3Collidable_t)				cpuCollidables,\n"
+"													b3ConstArray(b3Float4)					cpuVertices,\n"
+"													__global b3Float4* sepAxis,\n"
+"														const b3Float4* _dir, b3Float4* outp, int logme)\n"
+"	//dir is in worldspace, move to local space\n"
+"	\n"
+"	b3Float4 pos = cpuBodyBuf[bodyIndex].m_pos;\n"
+"	b3Quat orn = cpuBodyBuf[bodyIndex].m_quat;\n"
+"	\n"
+"	b3Float4 dir = b3MakeFloat4((*_dir).x,(*_dir).y,(*_dir).z,0.f);\n"
+"	\n"
+"	const b3Float4 localDir = b3QuatRotate(b3QuatInverse(orn),dir);\n"
+"	\n"
+"	//find local support vertex\n"
+"	int colIndex = cpuBodyBuf[bodyIndex].m_collidableIdx;\n"
+"	\n"
+"	b3Assert(cpuCollidables[colIndex].m_shapeType==SHAPE_CONVEX_HULL);\n"
+"	__global const b3ConvexPolyhedronData_t* hull = &cpuConvexData[cpuCollidables[colIndex].m_shapeIndex];\n"
+"	\n"
+"	b3Float4 pInA;\n"
+"	if (logme)\n"
+"	{\n"
+"		b3Float4 supVec = b3MakeFloat4(0,0,0,0);\n"
+"		float maxDot = -B3_LARGE_FLOAT;\n"
+"		if( 0 < hull->m_numVertices )\n"
+"		{\n"
+"			const b3Float4 scaled = localDir;\n"
+"			int index = b3MaxDot(scaled, &cpuVertices[hull->m_vertexOffset], hull->m_numVertices, &maxDot);\n"
+"			pInA = cpuVertices[hull->m_vertexOffset+index];\n"
+"			\n"
+"		}\n"
+"	} else\n"
+"	{\n"
+"		pInA = b3LocalGetSupportVertex(localDir,hull,cpuVertices);\n"
+"	}\n"
+"	//move vertex to world space\n"
+"	*outp = b3TransformPoint(pInA,pos,orn);\n"
+"	\n"
+"inline void b3MprSupport(int pairIndex,int bodyIndexA, int bodyIndexB,   b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n"
+"													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"													b3ConstArray(b3Collidable_t)				cpuCollidables,\n"
+"													b3ConstArray(b3Float4)					cpuVertices,\n"
+"													__global b3Float4* sepAxis,\n"
+"													const b3Float4* _dir, b3MprSupport_t *supp)\n"
+"    b3Float4 dir;\n"
+"	dir = *_dir;\n"
+"	b3MprConvexSupport(pairIndex,bodyIndexA,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v1,0);\n"
+"    dir = *_dir*-1.f;\n"
+"	b3MprConvexSupport(pairIndex,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v2,0);\n"
+"    supp->v = supp->v1 - supp->v2;\n"
+"inline void b3FindOrigin(int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, b3MprSupport_t *center)\n"
+"    center->v1 = cpuBodyBuf[bodyIndexA].m_pos;\n"
+"	center->v2 = cpuBodyBuf[bodyIndexB].m_pos;\n"
+"    center->v = center->v1 - center->v2;\n"
+"inline void b3MprVec3Set(b3Float4 *v, float x, float y, float z)\n"
+"	(*v).x = x;\n"
+"	(*v).y = y;\n"
+"	(*v).z = z;\n"
+"	(*v).w = 0.f;\n"
+"inline void b3MprVec3Add(b3Float4 *v, const b3Float4 *w)\n"
+"    (*v).x += (*w).x;\n"
+"    (*v).y += (*w).y;\n"
+"    (*v).z += (*w).z;\n"
+"inline void b3MprVec3Copy(b3Float4 *v, const b3Float4 *w)\n"
+"    *v = *w;\n"
+"inline void b3MprVec3Scale(b3Float4 *d, float k)\n"
+"    *d *= k;\n"
+"inline float b3MprVec3Dot(const b3Float4 *a, const b3Float4 *b)\n"
+"    float dot;\n"
+"	dot = b3Dot3F4(*a,*b);\n"
+"    return dot;\n"
+"inline float b3MprVec3Len2(const b3Float4 *v)\n"
+"    return b3MprVec3Dot(v, v);\n"
+"inline void b3MprVec3Normalize(b3Float4 *d)\n"
+"    float k = 1.f / B3_MPR_SQRT(b3MprVec3Len2(d));\n"
+"    b3MprVec3Scale(d, k);\n"
+"inline void b3MprVec3Cross(b3Float4 *d, const b3Float4 *a, const b3Float4 *b)\n"
+"	*d = b3Cross3(*a,*b);\n"
+"	\n"
+"inline void b3MprVec3Sub2(b3Float4 *d, const b3Float4 *v, const b3Float4 *w)\n"
+"	*d = *v - *w;\n"
+"inline void b3PortalDir(const b3MprSimplex_t *portal, b3Float4 *dir)\n"
+"    b3Float4 v2v1, v3v1;\n"
+"    b3MprVec3Sub2(&v2v1, &b3MprSimplexPoint(portal, 2)->v,\n"
+"                       &b3MprSimplexPoint(portal, 1)->v);\n"
+"    b3MprVec3Sub2(&v3v1, &b3MprSimplexPoint(portal, 3)->v,\n"
+"                       &b3MprSimplexPoint(portal, 1)->v);\n"
+"    b3MprVec3Cross(dir, &v2v1, &v3v1);\n"
+"    b3MprVec3Normalize(dir);\n"
+"inline int portalEncapsulesOrigin(const b3MprSimplex_t *portal,\n"
+"                                       const b3Float4 *dir)\n"
+"    float dot;\n"
+"    dot = b3MprVec3Dot(dir, &b3MprSimplexPoint(portal, 1)->v);\n"
+"    return b3MprIsZero(dot) || dot > 0.f;\n"
+"inline int portalReachTolerance(const b3MprSimplex_t *portal,\n"
+"                                     const b3MprSupport_t *v4,\n"
+"                                     const b3Float4 *dir)\n"
+"    float dv1, dv2, dv3, dv4;\n"
+"    float dot1, dot2, dot3;\n"
+"    // find the smallest dot product of dir and {v1-v4, v2-v4, v3-v4}\n"
+"    dv1 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, dir);\n"
+"    dv2 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, dir);\n"
+"    dv3 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, dir);\n"
+"    dv4 = b3MprVec3Dot(&v4->v, dir);\n"
+"    dot1 = dv4 - dv1;\n"
+"    dot2 = dv4 - dv2;\n"
+"    dot3 = dv4 - dv3;\n"
+"    dot1 = B3_MPR_FMIN(dot1, dot2);\n"
+"    dot1 = B3_MPR_FMIN(dot1, dot3);\n"
+"    return b3MprEq(dot1, B3_MPR_TOLERANCE) || dot1 < B3_MPR_TOLERANCE;\n"
+"inline int portalCanEncapsuleOrigin(const b3MprSimplex_t *portal,   \n"
+"                                         const b3MprSupport_t *v4,\n"
+"                                         const b3Float4 *dir)\n"
+"    float dot;\n"
+"    dot = b3MprVec3Dot(&v4->v, dir);\n"
+"    return b3MprIsZero(dot) || dot > 0.f;\n"
+"inline void b3ExpandPortal(b3MprSimplex_t *portal,\n"
+"                              const b3MprSupport_t *v4)\n"
+"    float dot;\n"
+"    b3Float4 v4v0;\n"
+"    b3MprVec3Cross(&v4v0, &v4->v, &b3MprSimplexPoint(portal, 0)->v);\n"
+"    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &v4v0);\n"
+"    if (dot > 0.f){\n"
+"        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &v4v0);\n"
+"        if (dot > 0.f){\n"
+"            b3MprSimplexSet(portal, 1, v4);\n"
+"        }else{\n"
+"            b3MprSimplexSet(portal, 3, v4);\n"
+"        }\n"
+"    }else{\n"
+"        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &v4v0);\n"
+"        if (dot > 0.f){\n"
+"            b3MprSimplexSet(portal, 2, v4);\n"
+"        }else{\n"
+"            b3MprSimplexSet(portal, 1, v4);\n"
+"        }\n"
+"    }\n"
+"B3_STATIC int b3DiscoverPortal(int pairIndex, int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n"
+"													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"													b3ConstArray(b3Collidable_t)				cpuCollidables,\n"
+"													b3ConstArray(b3Float4)					cpuVertices,\n"
+"													__global b3Float4* sepAxis,\n"
+"													__global int*	hasSepAxis,\n"
+"													b3MprSimplex_t *portal)\n"
+"    b3Float4 dir, va, vb;\n"
+"    float dot;\n"
+"    int cont;\n"
+"	\n"
+"	\n"
+"    // vertex 0 is center of portal\n"
+"    b3FindOrigin(bodyIndexA,bodyIndexB,cpuBodyBuf, b3MprSimplexPointW(portal, 0));\n"
+"    // vertex 0 is center of portal\n"
+"    b3MprSimplexSetSize(portal, 1);\n"
+"	\n"
+"	b3Float4 zero = b3MakeFloat4(0,0,0,0);\n"
+"	b3Float4* b3mpr_vec3_origin = &zero;\n"
+"    if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 0)->v, b3mpr_vec3_origin)){\n"
+"        // Portal's center lies on origin (0,0,0) => we know that objects\n"
+"        // intersect but we would need to know penetration info.\n"
+"        // So move center little bit...\n"
+"        b3MprVec3Set(&va, FLT_EPSILON * 10.f, 0.f, 0.f);\n"
+"        b3MprVec3Add(&b3MprSimplexPointW(portal, 0)->v, &va);\n"
+"    }\n"
+"    // vertex 1 = support in direction of origin\n"
+"    b3MprVec3Copy(&dir, &b3MprSimplexPoint(portal, 0)->v);\n"
+"    b3MprVec3Scale(&dir, -1.f);\n"
+"    b3MprVec3Normalize(&dir);\n"
+"    b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 1));\n"
+"    b3MprSimplexSetSize(portal, 2);\n"
+"    // test if origin isn't outside of v1\n"
+"    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &dir);\n"
+"	\n"
+"    if (b3MprIsZero(dot) || dot < 0.f)\n"
+"        return -1;\n"
+"    // vertex 2\n"
+"    b3MprVec3Cross(&dir, &b3MprSimplexPoint(portal, 0)->v,\n"
+"                       &b3MprSimplexPoint(portal, 1)->v);\n"
+"    if (b3MprIsZero(b3MprVec3Len2(&dir))){\n"
+"        if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 1)->v, b3mpr_vec3_origin)){\n"
+"            // origin lies on v1\n"
+"            return 1;\n"
+"        }else{\n"
+"            // origin lies on v0-v1 segment\n"
+"            return 2;\n"
+"        }\n"
+"    }\n"
+"    b3MprVec3Normalize(&dir);\n"
+"	 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 2));\n"
+"    \n"
+"    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &dir);\n"
+"    if (b3MprIsZero(dot) || dot < 0.f)\n"
+"        return -1;\n"
+"    b3MprSimplexSetSize(portal, 3);\n"
+"    // vertex 3 direction\n"
+"    b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,\n"
+"                     &b3MprSimplexPoint(portal, 0)->v);\n"
+"    b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,\n"
+"                     &b3MprSimplexPoint(portal, 0)->v);\n"
+"    b3MprVec3Cross(&dir, &va, &vb);\n"
+"    b3MprVec3Normalize(&dir);\n"
+"    // it is better to form portal faces to be oriented \"outside\" origin\n"
+"    dot = b3MprVec3Dot(&dir, &b3MprSimplexPoint(portal, 0)->v);\n"
+"    if (dot > 0.f){\n"
+"        b3MprSimplexSwap(portal, 1, 2);\n"
+"        b3MprVec3Scale(&dir, -1.f);\n"
+"    }\n"
+"    while (b3MprSimplexSize(portal) < 4){\n"
+"		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 3));\n"
+"        \n"
+"        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &dir);\n"
+"        if (b3MprIsZero(dot) || dot < 0.f)\n"
+"            return -1;\n"
+"        cont = 0;\n"
+"        // test if origin is outside (v1, v0, v3) - set v2 as v3 and\n"
+"        // continue\n"
+"        b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 1)->v,\n"
+"                          &b3MprSimplexPoint(portal, 3)->v);\n"
+"        dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);\n"
+"        if (dot < 0.f && !b3MprIsZero(dot)){\n"
+"            b3MprSimplexSet(portal, 2, b3MprSimplexPoint(portal, 3));\n"
+"            cont = 1;\n"
+"        }\n"
+"        if (!cont){\n"
+"            // test if origin is outside (v3, v0, v2) - set v1 as v3 and\n"
+"            // continue\n"
+"            b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 3)->v,\n"
+"                              &b3MprSimplexPoint(portal, 2)->v);\n"
+"            dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);\n"
+"            if (dot < 0.f && !b3MprIsZero(dot)){\n"
+"                b3MprSimplexSet(portal, 1, b3MprSimplexPoint(portal, 3));\n"
+"                cont = 1;\n"
+"            }\n"
+"        }\n"
+"        if (cont){\n"
+"            b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,\n"
+"                             &b3MprSimplexPoint(portal, 0)->v);\n"
+"            b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,\n"
+"                             &b3MprSimplexPoint(portal, 0)->v);\n"
+"            b3MprVec3Cross(&dir, &va, &vb);\n"
+"            b3MprVec3Normalize(&dir);\n"
+"        }else{\n"
+"            b3MprSimplexSetSize(portal, 4);\n"
+"        }\n"
+"    }\n"
+"    return 0;\n"
+"B3_STATIC int b3RefinePortal(int pairIndex,int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n"
+"													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"													b3ConstArray(b3Collidable_t)				cpuCollidables,\n"
+"													b3ConstArray(b3Float4)					cpuVertices,\n"
+"													__global b3Float4* sepAxis,\n"
+"													b3MprSimplex_t *portal)\n"
+"    b3Float4 dir;\n"
+"    b3MprSupport_t v4;\n"
+"	for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)\n"
+"    //while (1)\n"
+"	{\n"
+"        // compute direction outside the portal (from v0 throught v1,v2,v3\n"
+"        // face)\n"
+"        b3PortalDir(portal, &dir);\n"
+"        // test if origin is inside the portal\n"
+"        if (portalEncapsulesOrigin(portal, &dir))\n"
+"            return 0;\n"
+"        // get next support point\n"
+"        \n"
+"		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);\n"
+"        // test if v4 can expand portal to contain origin and if portal\n"
+"        // expanding doesn't reach given tolerance\n"
+"        if (!portalCanEncapsuleOrigin(portal, &v4, &dir)\n"
+"                || portalReachTolerance(portal, &v4, &dir))\n"
+"		{\n"
+"            return -1;\n"
+"        }\n"
+"        // v1-v2-v3 triangle must be rearranged to face outside Minkowski\n"
+"        // difference (direction from v0).\n"
+"        b3ExpandPortal(portal, &v4);\n"
+"    }\n"
+"    return -1;\n"
+"B3_STATIC void b3FindPos(const b3MprSimplex_t *portal, b3Float4 *pos)\n"
+"	b3Float4 zero = b3MakeFloat4(0,0,0,0);\n"
+"	b3Float4* b3mpr_vec3_origin = &zero;\n"
+"    b3Float4 dir;\n"
+"    size_t i;\n"
+"    float b[4], sum, inv;\n"
+"    b3Float4 vec, p1, p2;\n"
+"    b3PortalDir(portal, &dir);\n"
+"    // use barycentric coordinates of tetrahedron to find origin\n"
+"    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,\n"
+"                       &b3MprSimplexPoint(portal, 2)->v);\n"
+"    b[0] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);\n"
+"    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,\n"
+"                       &b3MprSimplexPoint(portal, 2)->v);\n"
+"    b[1] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);\n"
+"    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 0)->v,\n"
+"                       &b3MprSimplexPoint(portal, 1)->v);\n"
+"    b[2] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);\n"
+"    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,\n"
+"                       &b3MprSimplexPoint(portal, 1)->v);\n"
+"    b[3] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);\n"
+"	sum = b[0] + b[1] + b[2] + b[3];\n"
+"    if (b3MprIsZero(sum) || sum < 0.f){\n"
+"		b[0] = 0.f;\n"
+"        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,\n"
+"                           &b3MprSimplexPoint(portal, 3)->v);\n"
+"        b[1] = b3MprVec3Dot(&vec, &dir);\n"
+"        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,\n"
+"                           &b3MprSimplexPoint(portal, 1)->v);\n"
+"        b[2] = b3MprVec3Dot(&vec, &dir);\n"
+"        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,\n"
+"                           &b3MprSimplexPoint(portal, 2)->v);\n"
+"        b[3] = b3MprVec3Dot(&vec, &dir);\n"
+"		sum = b[1] + b[2] + b[3];\n"
+"	}\n"
+"	inv = 1.f / sum;\n"
+"    b3MprVec3Copy(&p1, b3mpr_vec3_origin);\n"
+"    b3MprVec3Copy(&p2, b3mpr_vec3_origin);\n"
+"    for (i = 0; i < 4; i++){\n"
+"        b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v1);\n"
+"        b3MprVec3Scale(&vec, b[i]);\n"
+"        b3MprVec3Add(&p1, &vec);\n"
+"        b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v2);\n"
+"        b3MprVec3Scale(&vec, b[i]);\n"
+"        b3MprVec3Add(&p2, &vec);\n"
+"    }\n"
+"    b3MprVec3Scale(&p1, inv);\n"
+"    b3MprVec3Scale(&p2, inv);\n"
+"    b3MprVec3Copy(pos, &p1);\n"
+"    b3MprVec3Add(pos, &p2);\n"
+"    b3MprVec3Scale(pos, 0.5);\n"
+"inline float b3MprVec3Dist2(const b3Float4 *a, const b3Float4 *b)\n"
+"    b3Float4 ab;\n"
+"    b3MprVec3Sub2(&ab, a, b);\n"
+"    return b3MprVec3Len2(&ab);\n"
+"inline float _b3MprVec3PointSegmentDist2(const b3Float4 *P,\n"
+"                                                  const b3Float4 *x0,\n"
+"                                                  const b3Float4 *b,\n"
+"                                                  b3Float4 *witness)\n"
+"    // The computation comes from solving equation of segment:\n"
+"    //      S(t) = x0 + t.d\n"
+"    //          where - x0 is initial point of segment\n"
+"    //                - d is direction of segment from x0 (|d| > 0)\n"
+"    //                - t belongs to <0, 1> interval\n"
+"    // \n"
+"    // Than, distance from a segment to some point P can be expressed:\n"
+"    //      D(t) = |x0 + t.d - P|^2\n"
+"    //          which is distance from any point on segment. Minimization\n"
+"    //          of this function brings distance from P to segment.\n"
+"    // Minimization of D(t) leads to simple quadratic equation that's\n"
+"    // solving is straightforward.\n"
+"    //\n"
+"    // Bonus of this method is witness point for free.\n"
+"    float dist, t;\n"
+"    b3Float4 d, a;\n"
+"    // direction of segment\n"
+"    b3MprVec3Sub2(&d, b, x0);\n"
+"    // precompute vector from P to x0\n"
+"    b3MprVec3Sub2(&a, x0, P);\n"
+"    t  = -1.f * b3MprVec3Dot(&a, &d);\n"
+"    t /= b3MprVec3Len2(&d);\n"
+"    if (t < 0.f || b3MprIsZero(t)){\n"
+"        dist = b3MprVec3Dist2(x0, P);\n"
+"        if (witness)\n"
+"            b3MprVec3Copy(witness, x0);\n"
+"    }else if (t > 1.f || b3MprEq(t, 1.f)){\n"
+"        dist = b3MprVec3Dist2(b, P);\n"
+"        if (witness)\n"
+"            b3MprVec3Copy(witness, b);\n"
+"    }else{\n"
+"        if (witness){\n"
+"            b3MprVec3Copy(witness, &d);\n"
+"            b3MprVec3Scale(witness, t);\n"
+"            b3MprVec3Add(witness, x0);\n"
+"            dist = b3MprVec3Dist2(witness, P);\n"
+"        }else{\n"
+"            // recycling variables\n"
+"            b3MprVec3Scale(&d, t);\n"
+"            b3MprVec3Add(&d, &a);\n"
+"            dist = b3MprVec3Len2(&d);\n"
+"        }\n"
+"    }\n"
+"    return dist;\n"
+"inline float b3MprVec3PointTriDist2(const b3Float4 *P,\n"
+"                                const b3Float4 *x0, const b3Float4 *B,\n"
+"                                const b3Float4 *C,\n"
+"                                b3Float4 *witness)\n"
+"    // Computation comes from analytic expression for triangle (x0, B, C)\n"
+"    //      T(s, t) = x0 + s.d1 + t.d2, where d1 = B - x0 and d2 = C - x0 and\n"
+"    // Then equation for distance is:\n"
+"    //      D(s, t) = | T(s, t) - P |^2\n"
+"    // This leads to minimization of quadratic function of two variables.\n"
+"    // The solution from is taken only if s is between 0 and 1, t is\n"
+"    // between 0 and 1 and t + s < 1, otherwise distance from segment is\n"
+"    // computed.\n"
+"    b3Float4 d1, d2, a;\n"
+"    float u, v, w, p, q, r;\n"
+"    float s, t, dist, dist2;\n"
+"    b3Float4 witness2;\n"
+"    b3MprVec3Sub2(&d1, B, x0);\n"
+"    b3MprVec3Sub2(&d2, C, x0);\n"
+"    b3MprVec3Sub2(&a, x0, P);\n"
+"    u = b3MprVec3Dot(&a, &a);\n"
+"    v = b3MprVec3Dot(&d1, &d1);\n"
+"    w = b3MprVec3Dot(&d2, &d2);\n"
+"    p = b3MprVec3Dot(&a, &d1);\n"
+"    q = b3MprVec3Dot(&a, &d2);\n"
+"    r = b3MprVec3Dot(&d1, &d2);\n"
+"    s = (q * r - w * p) / (w * v - r * r);\n"
+"    t = (-s * r - q) / w;\n"
+"    if ((b3MprIsZero(s) || s > 0.f)\n"
+"            && (b3MprEq(s, 1.f) || s < 1.f)\n"
+"            && (b3MprIsZero(t) || t > 0.f)\n"
+"            && (b3MprEq(t, 1.f) || t < 1.f)\n"
+"            && (b3MprEq(t + s, 1.f) || t + s < 1.f)){\n"
+"        if (witness){\n"
+"            b3MprVec3Scale(&d1, s);\n"
+"            b3MprVec3Scale(&d2, t);\n"
+"            b3MprVec3Copy(witness, x0);\n"
+"            b3MprVec3Add(witness, &d1);\n"
+"            b3MprVec3Add(witness, &d2);\n"
+"            dist = b3MprVec3Dist2(witness, P);\n"
+"        }else{\n"
+"            dist  = s * s * v;\n"
+"            dist += t * t * w;\n"
+"            dist += 2.f * s * t * r;\n"
+"            dist += 2.f * s * p;\n"
+"            dist += 2.f * t * q;\n"
+"            dist += u;\n"
+"        }\n"
+"    }else{\n"
+"        dist = _b3MprVec3PointSegmentDist2(P, x0, B, witness);\n"
+"        dist2 = _b3MprVec3PointSegmentDist2(P, x0, C, &witness2);\n"
+"        if (dist2 < dist){\n"
+"            dist = dist2;\n"
+"            if (witness)\n"
+"                b3MprVec3Copy(witness, &witness2);\n"
+"        }\n"
+"        dist2 = _b3MprVec3PointSegmentDist2(P, B, C, &witness2);\n"
+"        if (dist2 < dist){\n"
+"            dist = dist2;\n"
+"            if (witness)\n"
+"                b3MprVec3Copy(witness, &witness2);\n"
+"        }\n"
+"    }\n"
+"    return dist;\n"
+"B3_STATIC void b3FindPenetr(int pairIndex,int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n"
+"													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"													b3ConstArray(b3Collidable_t)				cpuCollidables,\n"
+"													b3ConstArray(b3Float4)					cpuVertices,\n"
+"													__global b3Float4* sepAxis,\n"
+"                       b3MprSimplex_t *portal,\n"
+"                       float *depth, b3Float4 *pdir, b3Float4 *pos)\n"
+"    b3Float4 dir;\n"
+"    b3MprSupport_t v4;\n"
+"    unsigned long iterations;\n"
+"	b3Float4 zero = b3MakeFloat4(0,0,0,0);\n"
+"	b3Float4* b3mpr_vec3_origin = &zero;\n"
+"    iterations = 1UL;\n"
+"	for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)\n"
+"    //while (1)\n"
+"	{\n"
+"        // compute portal direction and obtain next support point\n"
+"        b3PortalDir(portal, &dir);\n"
+"        \n"
+"		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);\n"
+"        // reached tolerance -> find penetration info\n"
+"        if (portalReachTolerance(portal, &v4, &dir)\n"
+"                || iterations ==B3_MPR_MAX_ITERATIONS)\n"
+"		{\n"
+"            *depth = b3MprVec3PointTriDist2(b3mpr_vec3_origin,&b3MprSimplexPoint(portal, 1)->v,&b3MprSimplexPoint(portal, 2)->v,&b3MprSimplexPoint(portal, 3)->v,pdir);\n"
+"            *depth = B3_MPR_SQRT(*depth);\n"
+"			\n"
+"			if (b3MprIsZero((*pdir).x) && b3MprIsZero((*pdir).y) && b3MprIsZero((*pdir).z))\n"
+"			{\n"
+"				\n"
+"				*pdir = dir;\n"
+"			} \n"
+"			b3MprVec3Normalize(pdir);\n"
+"			\n"
+"            // barycentric coordinates:\n"
+"            b3FindPos(portal, pos);\n"
+"            return;\n"
+"        }\n"
+"        b3ExpandPortal(portal, &v4);\n"
+"        iterations++;\n"
+"    }\n"
+"B3_STATIC void b3FindPenetrTouch(b3MprSimplex_t *portal,float *depth, b3Float4 *dir, b3Float4 *pos)\n"
+"    // Touching contact on portal's v1 - so depth is zero and direction\n"
+"    // is unimportant and pos can be guessed\n"
+"    *depth = 0.f;\n"
+"    b3Float4 zero = b3MakeFloat4(0,0,0,0);\n"
+"	b3Float4* b3mpr_vec3_origin = &zero;\n"
+"	b3MprVec3Copy(dir, b3mpr_vec3_origin);\n"
+"    b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);\n"
+"    b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);\n"
+"    b3MprVec3Scale(pos, 0.5);\n"
+"B3_STATIC void b3FindPenetrSegment(b3MprSimplex_t *portal,\n"
+"                              float *depth, b3Float4 *dir, b3Float4 *pos)\n"
+"    \n"
+"    // Origin lies on v0-v1 segment.\n"
+"    // Depth is distance to v1, direction also and position must be\n"
+"    // computed\n"
+"    b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);\n"
+"    b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);\n"
+"    b3MprVec3Scale(pos, 0.5f);\n"
+"    \n"
+"    b3MprVec3Copy(dir, &b3MprSimplexPoint(portal, 1)->v);\n"
+"    *depth = B3_MPR_SQRT(b3MprVec3Len2(dir));\n"
+"    b3MprVec3Normalize(dir);\n"
+"inline int b3MprPenetration(int pairIndex, int bodyIndexA, int bodyIndexB,\n"
+"					b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,\n"
+"					b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"					b3ConstArray(b3Collidable_t)	cpuCollidables,\n"
+"					b3ConstArray(b3Float4)	cpuVertices,\n"
+"					__global b3Float4* sepAxis,\n"
+"					__global int*	hasSepAxis,\n"
+"					float *depthOut, b3Float4* dirOut, b3Float4* posOut)\n"
+"	\n"
+"	 b3MprSimplex_t portal;\n"
+"	 \n"
+"//	if (!hasSepAxis[pairIndex])\n"
+"	//	return -1;\n"
+"	\n"
+"	hasSepAxis[pairIndex] = 0;\n"
+"	 int res;\n"
+"    // Phase 1: Portal discovery\n"
+"    res = b3DiscoverPortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,hasSepAxis, &portal);\n"
+"	\n"
+"	  \n"
+"	//sepAxis[pairIndex] = *pdir;//or -dir?\n"
+"	switch (res)\n"
+"	{\n"
+"	case 0:\n"
+"		{\n"
+"			// Phase 2: Portal refinement\n"
+"		\n"
+"			res = b3RefinePortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal);\n"
+"			if (res < 0)\n"
+"				return -1;\n"
+"			// Phase 3. Penetration info\n"
+"			b3FindPenetr(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal, depthOut, dirOut, posOut);\n"
+"			hasSepAxis[pairIndex] = 1;\n"
+"			sepAxis[pairIndex] = -*dirOut;\n"
+"			break;\n"
+"		}\n"
+"	case 1:\n"
+"		{\n"
+"			 // Touching contact on portal's v1.\n"
+"			b3FindPenetrTouch(&portal, depthOut, dirOut, posOut);\n"
+"			break;\n"
+"		}\n"
+"	case 2:\n"
+"		{\n"
+"			\n"
+"			b3FindPenetrSegment( &portal, depthOut, dirOut, posOut);\n"
+"			break;\n"
+"		}\n"
+"	default:\n"
+"		{\n"
+"			hasSepAxis[pairIndex]=0;\n"
+"			//if (res < 0)\n"
+"			//{\n"
+"				// Origin isn't inside portal - no collision.\n"
+"				return -1;\n"
+"			//}\n"
+"		}\n"
+"	};\n"
+"	\n"
+"	return 0;\n"
+"#endif //B3_MPR_PENETRATION_H\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"	#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"	#define counter32_t volatile __global int*\n"
+"__kernel void   mprPenetrationKernel( __global int4* pairs,\n"
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const b3ConvexPolyhedronData_t* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global float4* separatingNormals,\n"
+"																					__global int* hasSeparatingAxis,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int contactCapacity,\n"
+"																					int numPairs)\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		\n"
+"		\n"
+"		//once the broadphase avoids static-static pairs, we can remove this test\n"
+"		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
+"		{\n"
+"			return;\n"
+"		}\n"
+"		\n"
+"		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n"
+"		{\n"
+"			return;\n"
+"		}\n"
+"		float depthOut;\n"
+"		b3Float4 dirOut;\n"
+"		b3Float4 posOut;\n"
+"		int res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB,rigidBodies,convexShapes,collidables,vertices,separatingNormals,hasSeparatingAxis,&depthOut, &dirOut, &posOut);\n"
+"		\n"
+"		\n"
+"		\n"
+"		\n"
+"		if (res==0)\n"
+"		{\n"
+"			//add a contact\n"
+"			int dstIdx;\n"
+"			AppendInc( nGlobalContactsOut, dstIdx );\n"
+"			if (dstIdx<contactCapacity)\n"
+"			{\n"
+"				pairs[pairIndex].z = dstIdx;\n"
+"				__global struct b3Contact4Data* c = globalContactsOut + dstIdx;\n"
+"				c->m_worldNormalOnB = -dirOut;//normal;\n"
+"				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"				c->m_batchIdx = pairIndex;\n"
+"				int bodyA = pairs[pairIndex].x;\n"
+"				int bodyB = pairs[pairIndex].y;\n"
+"				c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n"
+"				c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n"
+"				c->m_childIndexA = -1;\n"
+"				c->m_childIndexB = -1;\n"
+"				//for (int i=0;i<nContacts;i++)\n"
+"				posOut.w = -depthOut;\n"
+"				c->m_worldPosB[0] = posOut;//localPoints[contactIdx[i]];\n"
+"				GET_NPOINTS(*c) = 1;//nContacts;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"typedef float4 Quaternion;\n"
+"#define make_float4 (float4)\n"
+"float dot3F4(float4 a, float4 b)\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"float4 cross3(float4 a, float4 b)\n"
+"	return cross(a,b);\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"	return qtRotate( *orientation, *p ) + (*translation);\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"inline void project(__global const b3ConvexPolyhedronData_t* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, __global const float4* vertices, float* min, float* max)\n"
+"	min[0] = FLT_MAX;\n"
+"	max[0] = -FLT_MAX;\n"
+"	int numVerts = hull->m_numVertices;\n"
+"	const float4 localDir = qtInvRotate(orn,*dir);\n"
+"	float offset = dot(pos,*dir);\n"
+"	for(int i=0;i<numVerts;i++)\n"
+"	{\n"
+"		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n"
+"		if(dp < min[0])	\n"
+"			min[0] = dp;\n"
+"		if(dp > max[0])	\n"
+"			max[0] = dp;\n"
+"	}\n"
+"	if(min[0]>max[0])\n"
+"	{\n"
+"		float tmp = min[0];\n"
+"		min[0] = max[0];\n"
+"		max[0] = tmp;\n"
+"	}\n"
+"	min[0] += offset;\n"
+"	max[0] += offset;\n"
+"bool findSeparatingAxisUnitSphere(	__global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* vertices,\n"
+"	__global const float4* unitSphereDirections,\n"
+"	int numUnitSphereDirections,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"	\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n"
+"	int curEdgeEdge = 0;\n"
+"	// Test unit sphere directions\n"
+"	for (int i=0;i<numUnitSphereDirections;i++)\n"
+"	{\n"
+"		float4 crossje;\n"
+"		crossje = unitSphereDirections[i];	\n"
+"		if (dot3F4(DeltaC2,crossje)>0)\n"
+"			crossje *= -1.f;\n"
+"		{\n"
+"			float dist;\n"
+"			bool result = true;\n"
+"			float Min0,Max0;\n"
+"			float Min1,Max1;\n"
+"			project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n"
+"			project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n"
+"		\n"
+"			if(Max0<Min1 || Max1<Min0)\n"
+"				return false;\n"
+"		\n"
+"			float d0 = Max0 - Min1;\n"
+"			float d1 = Max1 - Min0;\n"
+"			dist = d0<d1 ? d0:d1;\n"
+"			result = true;\n"
+"	\n"
+"			if(dist<*dmin)\n"
+"			{\n"
+"				*dmin = dist;\n"
+"				*sep = crossje;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"__kernel void   findSeparatingAxisUnitSphereKernel( __global const int4* pairs, \n"
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const b3ConvexPolyhedronData_t* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* unitSphereDirections,\n"
+"																					__global  float4* separatingNormals,\n"
+"																					__global  int* hasSeparatingAxis,\n"
+"																					__global  float* dmins,\n"
+"																					int numUnitSphereDirections,\n"
+"																					int numPairs\n"
+"																					)\n"
+"	int i = get_global_id(0);\n"
+"	\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"	\n"
+"			int bodyIndexA = pairs[i].x;\n"
+"			int bodyIndexB = pairs[i].y;\n"
+"	\n"
+"			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"			\n"
+"			\n"
+"			int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
+"	\n"
+"			float dmin = dmins[i];\n"
+"	\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			posA.w = 0.f;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			posB.w = 0.f;\n"
+"			float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"			float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"			float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"			float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"			float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"			const float4 DeltaC2 = c0 - c1;\n"
+"			float4 sepNormal = separatingNormals[i];\n"
+"			\n"
+"			int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n"
+"			if (numEdgeEdgeDirections>numUnitSphereDirections)\n"
+"			{\n"
+"				bool sepEE = findSeparatingAxisUnitSphere(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"																										posB,ornB,\n"
+"																										DeltaC2,\n"
+"																										vertices,unitSphereDirections,numUnitSphereDirections,&sepNormal,&dmin);\n"
+"				if (!sepEE)\n"
+"				{\n"
+"					hasSeparatingAxis[i] = 0;\n"
+"				} else\n"
+"				{\n"
+"					hasSeparatingAxis[i] = 1;\n"
+"					separatingNormals[i] = sepNormal;\n"
+"				}\n"
+"			}\n"
+"		}		//if (hasSeparatingAxis[i])\n"
+"	}//(i<numPairs)\n"
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl
new file mode 100644
index 00000000..9c9e920f
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl
@@ -0,0 +1,1374 @@
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#define SHAPE_PLANE 4
+#define SHAPE_SPHERE 7
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#define counter32_t volatile __global int*
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+#define max2 max
+#define min2 min
+typedef unsigned int u32;
+typedef struct 
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+///keep this in sync with btCollidable.h
+typedef struct
+	int m_numChildShapes;
+	float m_radius;
+	int m_shapeType;
+	int m_shapeIndex;
+} btCollidableGpu;
+typedef struct
+	float4	m_childPosition;
+	float4	m_childOrientation;
+	int m_shapeIndex;
+	int m_unused0;
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+#define GET_NPOINTS(x) (x).m_worldNormalOnB.w
+typedef struct
+	float4 m_pos;
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+	u32 m_collidableIdx;	
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+typedef struct  
+	float4		m_localCenter;
+	float4		m_extents;
+	float4		mC;
+	float4		mE;
+	float			m_radius;
+	int	m_faceOffset;
+	int m_numFaces;
+	int	m_numVertices;
+	int m_vertexOffset;
+	int	m_uniqueEdgesOffset;
+	int	m_numUniqueEdges;
+	int m_unused;
+} ConvexPolyhedronCL;
+typedef struct
+	float4 m_plane;
+	int m_indexOffset;
+	int m_numIndices;
+} btGpuFace;
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+#define make_float4 (float4)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+float fastDiv(float numerator, float denominator)
+	return native_divide(numerator, denominator);	
+//	return numerator/denominator;	
+float4 fastDiv4(float4 numerator, float4 denominator)
+	return native_divide(numerator, denominator);	
+float4 cross3(float4 a, float4 b)
+	return cross(a,b);
+//#define dot3F4 dot
+float dot3F4(float4 a, float4 b)
+	float4 a1 = make_float4(a.xyz,0.f);
+	float4 b1 = make_float4(b.xyz,0.f);
+	return dot(a1, b1);
+float4 fastNormalize4(float4 v)
+	return fast_normalize(v);
+//	Quaternion
+typedef float4 Quaternion;
+Quaternion qtMul(Quaternion a, Quaternion b);
+Quaternion qtNormalize(Quaternion in);
+float4 qtRotate(Quaternion q, float4 vec);
+Quaternion qtInvert(Quaternion q);
+Quaternion qtMul(Quaternion a, Quaternion b)
+	Quaternion ans;
+	ans = cross3( a, b );
+	ans += a.w*b+b.w*a;
+//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w*b.w - dot3F4(a, b);
+	return ans;
+Quaternion qtNormalize(Quaternion in)
+	return fastNormalize4(in);
+//	in /= length( in );
+//	return in;
+float4 qtRotate(Quaternion q, float4 vec)
+	Quaternion qInv = qtInvert( q );
+	float4 vcpy = vec;
+	vcpy.w = 0.f;
+	float4 out = qtMul(qtMul(q,vcpy),qInv);
+	return out;
+Quaternion qtInvert(Quaternion q)
+	return (Quaternion)(-q.xyz, q.w);
+float4 qtInvRotate(const Quaternion q, float4 vec)
+	return qtRotate( qtInvert( q ), vec );
+float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
+	return qtRotate( *orientation, *p ) + (*translation);
+void	trInverse(float4 translationIn, Quaternion orientationIn,
+		float4* translationOut, Quaternion* orientationOut)
+	*orientationOut = qtInvert(orientationIn);
+	*translationOut = qtRotate(*orientationOut, -translationIn);
+void	trMul(float4 translationA, Quaternion orientationA,
+						float4 translationB, Quaternion orientationB,
+		float4* translationOut, Quaternion* orientationOut)
+	*orientationOut = qtMul(orientationA,orientationB);
+	*translationOut = transform(&translationB,&translationA,&orientationA);
+float4 normalize3(const float4 a)
+	float4 n = make_float4(a.x, a.y, a.z, 0.f);
+	return fastNormalize4( n );
+__inline float4 lerp3(const float4 a,const float4 b, float  t)
+	return make_float4(	a.x + (b.x - a.x) * t,
+						a.y + (b.y - a.y) * t,
+						a.z + (b.z - a.z) * t,
+						0.f);
+float signedDistanceFromPointToPlane(float4 point, float4 planeEqn, float4* closestPointOnFace)
+	float4 n = (float4)(planeEqn.x, planeEqn.y, planeEqn.z, 0);
+	float dist = dot3F4(n, point) + planeEqn.w;
+	*closestPointOnFace = point - dist * n;
+	return dist;
+// Tests point p against every edge of one face polygon.
+//   p             : query point, in the same space as the face vertices
+//   face          : face whose m_plane.xyz is used as the polygon normal and whose
+//                   m_indexOffset/m_numIndices select entries of convexIndices
+//   baseVertex    : vertex array that convexIndices indexes into
+//   convexIndices : index buffer for all faces
+//   out           : receives the closest point on the first edge that p is outside of
+// Returns true when p is on the inner side of every edge (*out is not written).
+// Returns false as soon as dot(p-a, cross3(b-a, normal)) > 0 for an edge (a,b); *out then
+// holds the point of that edge closest to p. Also returns false, without writing *out,
+// when the face has fewer than 2 indices.
+inline bool IsPointInPolygon(float4 p, 
+							const btGpuFace* face,
+							__global const float4* baseVertex,
+							__global const  int* convexIndices,
+							float4* out)
+    float4 a;
+    float4 b;
+    float4 ab;
+    float4 ap;
+    float4 v;
+	float4 plane = make_float4(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f);
+	if (face->m_numIndices<2)
+		return false;
+	// start with the last vertex so the first edge tested is (last, first)
+	float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]];
+	b = v0;
+    for(unsigned i=0; i != face->m_numIndices; ++i)
+    {
+		a = b;
+		float4 vi = baseVertex[convexIndices[face->m_indexOffset + i]];
+		b = vi;
+        ab = b-a;
+        ap = p-a;
+        // in-plane perpendicular of the edge
+        v = cross3(ab,plane);
+        if (dot(ap, v) > 0.f)
+        {
+            // p is outside this edge: project it onto the segment a-b, clamped to the end points
+            float ab_m2 = dot(ab, ab);
+            float rt = ab_m2 != 0.f ? dot(ab, ap) / ab_m2 : 0.f;
+            if (rt <= 0.f)
+            {
+                *out = a;
+            }
+            else if (rt >= 1.f) 
+            {
+                *out = b;
+            }
+            else
+            {
+            	float s = 1.f - rt;
+				out[0].x = s * a.x + rt * b.x;
+				out[0].y = s * a.y + rt * b.y;
+				out[0].z = s * a.z + rt * b.z;
+				// also define w (interpolated like the end-point branches copy it); callers take a
+				// 4-component dot() of (p - *out) and previously read an uninitialised w here
+				out[0].w = s * a.w + rt * b.w;
+            }
+            return false;
+        }
+    }
+    return true;
+// Sphere (body A) versus convex hull (body B) contact generation.
+//   spherePos2 / radius : sphere centre (world space) and radius
+//   pos / quat          : world transform of the convex hull
+// The sphere centre is moved into the hull's local frame and tested against every face
+// plane. A face whose plane is further than `radius` away separates the shapes. Otherwise
+// the face (or, when the centre projects outside the polygon, its nearest edge point) with
+// the largest signed distance is kept. If the resulting depth (minDist - radius) is <= 0 a
+// single-point contact is appended to globalContactsOut, provided the slot index returned
+// by the atomic counter is below maxContactCapacity.
+void	computeContactSphereConvex(int pairIndex,
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																__global const BodyData* rigidBodies, 
+																__global const btCollidableGpu* collidables,
+																__global const ConvexPolyhedronCL* convexShapes,
+																__global const float4* convexVertices,
+																__global const int* convexIndices,
+																__global const btGpuFace* faces,
+																__global struct b3Contact4Data* restrict globalContactsOut,
+																counter32_t nGlobalContactsOut,
+																int maxContactCapacity,
+																float4 spherePos2,
+																float radius,
+																float4 pos,
+																float4 quat
+																)
+	float4 invPos;
+	float4 invOrn;
+	trInverse(pos,quat, &invPos,&invOrn);
+	// sphere centre expressed in the local frame of the convex hull
+	float4 spherePos = transform(&spherePos2,&invPos,&invOrn);
+	int shapeIndex = collidables[collidableIndexB].m_shapeIndex;
+	int numFaces = convexShapes[shapeIndex].m_numFaces;
+	float4 closestPnt = (float4)(0, 0, 0, 0);
+	float4 hitNormalWorld = (float4)(0, 0, 0, 0);
+	float minDist = -1000000.f;
+	bool bCollide = true;
+	for ( int f = 0; f < numFaces; f++ )
+	{
+		btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];
+		// set up a plane equation 
+		float4 planeEqn;
+		float4 n1 = face.m_plane;
+		n1.w = 0.f;
+		planeEqn = n1;
+		planeEqn.w = face.m_plane.w;
+		// signed distance from the sphere centre to the face plane, plus its projection onto the plane
+		float4 pntReturn;
+		float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);
+		// If the distance exceeds the radius, the plane is a separating plane. 
+		if ( dist > radius )
+		{
+			bCollide = false;
+			break;
+		}
+		if (dist>0)
+		{
+			//might hit an edge or vertex
+			float4 out;
+			bool isInPoly = IsPointInPolygon(spherePos,
+					&face,
+					&convexVertices[convexShapes[shapeIndex].m_vertexOffset],
+					convexIndices,
+           &out);
+			if (isInPoly)
+			{
+				if (dist>minDist)
+				{
+					minDist = dist;
+					closestPnt = pntReturn;
+					hitNormalWorld = planeEqn;
+				}
+			} else
+			{
+				// centre projects outside the polygon: use the closest edge point instead
+				float4 tmp = spherePos-out;
+				float l2 = dot(tmp,tmp);
+				if (l2<radius*radius)
+				{
+					dist  = sqrt(l2);
+					if (dist>minDist)
+					{
+						minDist = dist;
+						closestPnt = out;
+						hitNormalWorld = tmp/dist;
+					}
+				} else
+				{
+					bCollide = false;
+					break;
+				}
+			}
+		} else
+		{
+			if ( dist > minDist )
+			{
+				minDist = dist;
+				closestPnt = pntReturn;
+				hitNormalWorld.xyz = planeEqn.xyz;
+			}
+		}
+	}
+	if (bCollide && minDist > -10000)
+	{
+		// back to world space
+		float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);
+		float4 pOnB1 = transform(&closestPnt,&pos,&quat);
+		float actualDepth = minDist-radius;
+		if (actualDepth<=0.f)
+		{
+			// w of the contact position carries the (negative) penetration depth
+			pOnB1.w = actualDepth;
+			int dstIdx;
+			AppendInc( nGlobalContactsOut, dstIdx );
+			// never write past the end of the contact buffer (same check as the other contact writers)
+			if (dstIdx < maxContactCapacity)
+			{
+				__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+				c->m_worldNormalOnB = -normalOnSurfaceB1;
+				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
+				c->m_batchIdx = pairIndex;
+				c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+				c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+				c->m_worldPosB[0] = pOnB1;
+				c->m_childIndexA = -1;
+				c->m_childIndexB = -1;
+				GET_NPOINTS(*c) = 1;
+			} 
+		}
+	}//if (hasCollision)
+int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int4* contactIdx) // reduces a contact point set to at most 4 indices written to *contactIdx; returns the number of points to use
+	if( nPoints == 0 )
+        return 0;
+    if (nPoints <=4)
+        return nPoints; // nothing to reduce; *contactIdx is left as the caller initialised it
+    if (nPoints >64)
+        nPoints = 64; // only the first 64 points are considered
+	float4 center = make_float4(0.f);
+	{
+		for (int i=0;i<nPoints;i++)
+			center += p[i];
+		center /= (float)nPoints; // mean of all four components, including w
+	}
+	//	sample 4 directions: +u, -u, +v, -v, all perpendicular to nearNormal
+    float4 aVector = p[0] - center;
+    float4 u = cross3( nearNormal, aVector );
+    float4 v = cross3( nearNormal, u );
+    u = normalize3( u );
+    v = normalize3( v );
+    //keep point with deepest penetration (smallest w)
+    float minW= FLT_MAX;
+    int minIndex=-1;
+    float4 maxDots; // NOTE(review): despite the name each component tracks a running MINIMUM (updated with '<' below)
+    maxDots.x = FLT_MIN; // FLT_MIN is the smallest positive normalised float, not the most negative value
+    maxDots.y = FLT_MIN;
+    maxDots.z = FLT_MIN;
+    maxDots.w = FLT_MIN;
+    //	idx, distance
+    for(int ie = 0; ie<nPoints; ie++ )
+    {
+        if (p[ie].w<minW)
+        {
+            minW = p[ie].w;
+            minIndex=ie;
+        }
+        float f;
+        float4 r = p[ie]-center; // offset from the centroid; dot3F4 ignores its w
+        f = dot3F4( u, r );
+        if (f<maxDots.x)
+        {
+            maxDots.x = f;
+            contactIdx[0].x = ie; // point with the smallest projection on u
+        }
+        f = dot3F4( -u, r );
+        if (f<maxDots.y)
+        {
+            maxDots.y = f;
+            contactIdx[0].y = ie; // point with the smallest projection on -u
+        }
+        f = dot3F4( v, r );
+        if (f<maxDots.z)
+        {
+            maxDots.z = f;
+            contactIdx[0].z = ie; // point with the smallest projection on v
+        }
+        f = dot3F4( -v, r );
+        if (f<maxDots.w)
+        {
+            maxDots.w = f;
+            contactIdx[0].w = ie; // point with the smallest projection on -v
+        }
+    }
+    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)
+    {
+        //replace the first contact with minimum (todo: replace contact with least penetration)
+        contactIdx[0].x = minIndex;
+    }
+    return 4; // more than 4 input points always reduce to exactly 4
+// Plane (body A) versus convex hull (body B) contact generation.
+//   posB / ornB : world transform of the convex hull; the plane transform is read from rigidBodies[bodyIndexA]
+// Every hull vertex is transformed into the plane's frame; vertices below the plane
+// (dist < 0) are collected as world-space contact points whose w holds that signed distance.
+// At most MAX_PLANE_CONVEX_POINTS candidates are stored; when more than 4 are found they are
+// reduced with extractManifoldSequential. Returns the index of the contact written to
+// globalContactsOut, or -1 when no contact was produced or the buffer is full.
+int computeContactPlaneConvex(int pairIndex,
+								int bodyIndexA, int bodyIndexB, 
+								int collidableIndexA, int collidableIndexB, 
+								__global const BodyData* rigidBodies, 
+								__global const btCollidableGpu*collidables,
+								__global const ConvexPolyhedronCL* convexShapes,
+								__global const float4* convexVertices,
+								__global const int* convexIndices,
+								__global const btGpuFace* faces,
+								__global struct b3Contact4Data* restrict globalContactsOut,
+								counter32_t nGlobalContactsOut,
+								int maxContactCapacity,
+								float4 posB,
+								Quaternion ornB
+								)
+	int resultIndex=-1;
+		int shapeIndex = collidables[collidableIndexB].m_shapeIndex;
+	__global const ConvexPolyhedronCL* hullB = &convexShapes[shapeIndex];
+	float4 posA;
+	posA = rigidBodies[bodyIndexA].m_pos;
+	Quaternion ornA;
+	ornA = rigidBodies[bodyIndexA].m_quat;
+	int numContactsOut = 0;
+	int numWorldVertsB1= 0;
+	float4 planeEq;
+	 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;
+	float4 planeNormal = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);
+	float4 planeNormalWorld;
+	planeNormalWorld = qtRotate(ornA,planeNormal);
+	float planeConstant = planeEq.w;
+	float4 invPosA;Quaternion invOrnA;
+	float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;
+	{
+		trInverse(posA,ornA,&invPosA,&invOrnA);
+		trMul(invPosA,invOrnA,posB,ornB,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);
+	}
+	float4 invPosB;Quaternion invOrnB;
+	float4 planeInConvexPos1;	Quaternion planeInConvexOrn1;
+	{
+		trInverse(posB,ornB,&invPosB,&invOrnB);
+		trMul(invPosB,invOrnB,posA,ornA,&planeInConvexPos1,&planeInConvexOrn1);	
+	}
+	// negated plane normal expressed in the hull's local frame
+	float4 planeNormalInConvex = qtRotate(planeInConvexOrn1,-planeNormal);
+	float maxDot = -1e30;
+	int hitVertex=-1;
+	float4 hitVtx;
+	float4 contactPoints[MAX_PLANE_CONVEX_POINTS];
+	int numPoints = 0;
+	int4 contactIdx;
+	contactIdx=make_int4(0,1,2,3);
+	for (int i=0;i<hullB->m_numVertices;i++)
+	{
+		float4 vtx = convexVertices[hullB->m_vertexOffset+i];
+		float curDot = dot(vtx,planeNormalInConvex);
+		if (curDot>maxDot)
+		{
+			hitVertex=i;
+			maxDot=curDot;
+			hitVtx = vtx;
+			//make sure the deepest points is always included
+			if (numPoints==MAX_PLANE_CONVEX_POINTS)
+				numPoints--;
+		}
+		// contactPoints has room for MAX_PLANE_CONVEX_POINTS entries only: once it is full,
+		// further vertices are skipped unless the branch above just freed the last slot
+		if (numPoints<MAX_PLANE_CONVEX_POINTS)
+		{
+			float4 vtxWorld = transform(&vtx, &posB, &ornB);
+			float4 vtxInPlane = transform(&vtxWorld, &invPosA, &invOrnA);//oplaneTransform.inverse()*vtxWorld;
+			float dist = dot(planeNormal,vtxInPlane)-planeConstant;
+			if (dist<0.f)
+			{
+				// store the signed distance to the plane in w
+				vtxWorld.w = dist;
+				contactPoints[numPoints] = vtxWorld;
+				numPoints++;
+			}
+		}
+	}
+	int numReducedPoints  = numPoints;
+	if (numPoints>4)
+	{
+		numReducedPoints = extractManifoldSequential( contactPoints, numPoints, planeNormalInConvex, &contactIdx);
+	}
+	if (numReducedPoints>0)
+	{
+		int dstIdx;
+	    AppendInc( nGlobalContactsOut, dstIdx );
+		if (dstIdx < maxContactCapacity)
+		{
+			resultIndex = dstIdx;
+			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+			c->m_worldNormalOnB = -planeNormalWorld;
+			//c->setFrictionCoeff(0.7);
+			//c->setRestituitionCoeff(0.f);
+			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
+			c->m_batchIdx = pairIndex;
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+			c->m_childIndexA = -1;
+			c->m_childIndexB = -1;
+			// intentional fall-through: case N copies points N-1 .. 0
+			switch (numReducedPoints)
+            {
+                case 4:
+                    c->m_worldPosB[3] = contactPoints[contactIdx.w];
+                case 3:
+                    c->m_worldPosB[2] = contactPoints[contactIdx.z];
+                case 2:
+                    c->m_worldPosB[1] = contactPoints[contactIdx.y];
+                case 1:
+                    c->m_worldPosB[0] = contactPoints[contactIdx.x];
+                default:
+                {
+                }
+            };
+			GET_NPOINTS(*c) = numReducedPoints;
+		}//if (dstIdx < numPairs)
+	}	
+	return resultIndex;
+void	computeContactPlaneSphere(int pairIndex, // Plane (shape A) vs sphere (shape B): appends at most one contact to globalContactsOut.
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																__global const BodyData* rigidBodies, 
+																__global const btCollidableGpu* collidables,
+																__global const btGpuFace* faces,
+																__global struct b3Contact4Data* restrict globalContactsOut,
+																counter32_t nGlobalContactsOut, // contact counter handed to AppendInc
+																int maxContactCapacity) // slot indices >= this value are never written
+	float4 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane; // plane of shape A: xyz = normal, w = plane constant
+	float radius = collidables[collidableIndexB].m_radius; // radius of shape B
+	float4 posA1 = rigidBodies[bodyIndexA].m_pos;
+	float4 ornA1 = rigidBodies[bodyIndexA].m_quat;
+	float4 posB1 = rigidBodies[bodyIndexB].m_pos;
+	float4 ornB1 = rigidBodies[bodyIndexB].m_quat;
+	bool hasCollision = false;
+	float4 planeNormal1 = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f); // plane normal with w cleared
+	float planeConstant = planeEq.w;
+	float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1; // receives inverse(A) composed with B
+	{
+		float4 invPosA;Quaternion invOrnA;
+		trInverse(posA1,ornA1,&invPosA,&invOrnA);
+		trMul(invPosA,invOrnA,posB1,ornB1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);
+	}
+	float4 planeInConvexPos1;	Quaternion planeInConvexOrn1; // receives inverse(B) composed with A
+	{
+		float4 invPosB;Quaternion invOrnB;
+		trInverse(posB1,ornB1,&invPosB,&invOrnB);
+		trMul(invPosB,invOrnB,posA1,ornA1,&planeInConvexPos1,&planeInConvexOrn1);	
+	}
+	float4 vtx1 = qtRotate(planeInConvexOrn1,-planeNormal1)*radius; // negated plane normal rotated by planeInConvexOrn1, scaled by the radius
+	float4 vtxInPlane1 = transform(&vtx1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1); // that point mapped through the inverse(A)*B transform
+	float distance = dot3F4(planeNormal1,vtxInPlane1) - planeConstant; // signed distance of the point to the plane
+	hasCollision = distance < 0.f;//m_manifoldPtr->getContactBreakingThreshold();
+	if (hasCollision)
+	{
+		float4 vtxInPlaneProjected1 = vtxInPlane1 -   distance*planeNormal1; // slide the point onto the plane along the normal
+		float4 vtxInPlaneWorld1 = transform(&vtxInPlaneProjected1,&posA1,&ornA1); // apply A's position/orientation
+		float4 normalOnSurfaceB1 = qtRotate(ornA1,planeNormal1); // plane normal rotated by A's orientation
+		float4 pOnB1 = vtxInPlaneWorld1+normalOnSurfaceB1*distance; // offset back along the normal by the (negative) distance
+		pOnB1.w = distance; // w component carries the signed distance
+		int dstIdx;
+    AppendInc( nGlobalContactsOut, dstIdx ); // dstIdx receives the output slot index
+		if (dstIdx < maxContactCapacity) // the contact is dropped when the slot lies past capacity
+		{
+			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+			c->m_worldNormalOnB = -normalOnSurfaceB1;
+			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // fixed restitution 0 and friction 0.7, each scaled by 0xffff
+			c->m_batchIdx = pairIndex;
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; // index is negated when m_invMass == 0
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; // NOTE(review): index 0 cannot carry a sign - confirm consumers cope
+			c->m_worldPosB[0] = pOnB1;
+			c->m_childIndexA = -1;
+			c->m_childIndexB = -1;
+			GET_NPOINTS(*c) = 1; // exactly one contact point
+		}//if (dstIdx < maxContactCapacity)
+	}//if (hasCollision)
+__kernel void   primitiveContactsKernel( __global int4* pairs, // One work-item per pair: dispatches on the two shape types and emits primitive contacts.
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int numPairs, int maxContactCapacity)
+	int i = get_global_id(0);
+	int pairIndex = i;
+	float4 worldVertsB1[64]; // NOTE(review): this and the locals down to maxDist are not referenced in the visible kernel body
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;	
+	float4 localContactsOut[64];
+	int localContactCapacity=64;
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+	if (i<numPairs) // work-items past the pair count do nothing
+	{
+		int bodyIndexA = pairs[i].x; // x/y components of a pair hold the two body indices
+		int bodyIndexB = pairs[i].y;
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE && // plane (A) vs convex hull (B)
+			collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			float4 posB;
+			posB = rigidBodies[bodyIndexB].m_pos;
+			Quaternion ornB;
+			ornB = rigidBodies[bodyIndexB].m_quat;
+			int contactIndex = computeContactPlaneConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, 
+																rigidBodies,collidables,convexShapes,vertices,indices,
+																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity, posB,ornB);
+			if (contactIndex>=0) // keep the returned index in the pair's z component when it is non-negative
+				pairs[pairIndex].z = contactIndex;
+			return;
+		}
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL && // convex hull (A) vs plane (B): same routine, A/B arguments swapped
+			collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)
+		{
+			float4 posA;
+			posA = rigidBodies[bodyIndexA].m_pos;
+			Quaternion ornA;
+			ornA = rigidBodies[bodyIndexA].m_quat;
+			int contactIndex = computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, 
+																rigidBodies,collidables,convexShapes,vertices,indices,
+																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);
+			if (contactIndex>=0)
+				pairs[pairIndex].z = contactIndex;
+			return;
+		}
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE && // plane (A) vs sphere (B)
+			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)
+		{
+			computeContactPlaneSphere(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, 
+																rigidBodies,collidables,faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity);
+			return;
+		}
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE && // sphere (A) vs plane (B): A/B arguments swapped so the plane comes first
+			collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)
+		{
+			computeContactPlaneSphere( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, 
+																rigidBodies,collidables,
+																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity);
+			return;
+		}
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE && // sphere (A) vs convex hull (B)
+			collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			float4 spherePos = rigidBodies[bodyIndexA].m_pos;
+			float sphereRadius = collidables[collidableIndexA].m_radius;
+			float4 convexPos = rigidBodies[bodyIndexB].m_pos;
+			float4 convexOrn = rigidBodies[bodyIndexB].m_quat;
+			computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, 
+																rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,
+																spherePos,sphereRadius,convexPos,convexOrn);
+			return;
+		}
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL && // convex hull (A) vs sphere (B): A/B arguments swapped so the sphere comes first
+			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)
+		{
+			float4 spherePos = rigidBodies[bodyIndexB].m_pos;
+			float sphereRadius = collidables[collidableIndexB].m_radius;
+			float4 convexPos = rigidBodies[bodyIndexA].m_pos;
+			float4 convexOrn = rigidBodies[bodyIndexA].m_quat;
+			computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, 
+																rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,
+																spherePos,sphereRadius,convexPos,convexOrn);
+			return;
+		}
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&
+			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)
+		{
+			//sphere-sphere
+			float radiusA = collidables[collidableIndexA].m_radius;
+			float radiusB = collidables[collidableIndexB].m_radius;
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+			float4 diff = posA-posB; // vector from B's position to A's position
+			float len = length(diff);
+			///iff distance positive, don't generate a new contact
+			if ( len <= (radiusA+radiusB))
+			{
+				///distance (negative means penetration)
+				float dist = len - (radiusA+radiusB);
+				float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f); // fallback normal (+X) used when the centers nearly coincide
+				if (len > 0.00001)
+				{
+					normalOnSurfaceB = diff / len;
+				}
+				float4 contactPosB = posB + normalOnSurfaceB*radiusB; // B's position pushed out by radiusB along the normal
+				contactPosB.w = dist; // w component carries the signed distance
+				int dstIdx;
+				 AppendInc( nGlobalContactsOut, dstIdx ); // dstIdx receives the output slot index
+				if (dstIdx < maxContactCapacity) // the contact is dropped when the slot lies past capacity
+				{
+					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+					c->m_worldNormalOnB = normalOnSurfaceB;
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // fixed restitution 0 and friction 0.7, each scaled by 0xffff
+					c->m_batchIdx = pairIndex;
+					int bodyA = pairs[pairIndex].x; // re-reads the same pair entry as bodyIndexA/bodyIndexB (pairIndex == i)
+					int bodyB = pairs[pairIndex].y;
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; // index is negated when m_invMass == 0
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+					c->m_worldPosB[0] = contactPosB;
+					c->m_childIndexA = -1;
+					c->m_childIndexB = -1;
+					GET_NPOINTS(*c) = 1; // exactly one contact point
+				}//if (dstIdx < maxContactCapacity)
+			}//if ( len <= (radiusA+radiusB))
+			return;
+	}//	if (i<numPairs)
+// work-in-progress
+__kernel void   processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs, // One work-item per compound pair: resolves child shapes, then dispatches plane/convex/sphere contact routines.
+													__global const BodyData* rigidBodies, 
+													__global const btCollidableGpu* collidables,
+													__global const ConvexPolyhedronCL* convexShapes, 
+													__global const float4* vertices,
+													__global const float4* uniqueEdges,
+													__global const btGpuFace* faces,
+													__global const int* indices,
+													__global btAabbCL* aabbs,
+													__global const btGpuChildShape* gpuChildShapes,
+													__global struct b3Contact4Data* restrict globalContactsOut,
+													counter32_t nGlobalContactsOut,
+													int numCompoundPairs, int maxContactCapacity
+													)
+	int i = get_global_id(0);
+	if (i<numCompoundPairs) // work-items past the pair count do nothing
+	{
+		int bodyIndexA = gpuCompoundPairs[i].x; // x/y: body indices
+		int bodyIndexB = gpuCompoundPairs[i].y;
+		int childShapeIndexA = gpuCompoundPairs[i].z; // z/w: child shape indices; a negative value selects the body's own collidable
+		int childShapeIndexB = gpuCompoundPairs[i].w;
+		int collidableIndexA = -1;
+		int collidableIndexB = -1;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		float4 ornB = rigidBodies[bodyIndexB].m_quat;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		if (childShapeIndexA >= 0)
+		{
+			collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+			float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+			float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+			float4 newPosA = qtRotate(ornA,childPosA)+posA; // child position rotated by the body orientation, then offset by the body position
+			float4 newOrnA = qtMul(ornA,childOrnA); // body orientation composed with the child orientation
+			posA = newPosA;
+			ornA = newOrnA;
+		} else
+		{
+			collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		}
+		if (childShapeIndexB>=0)
+		{
+			collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB); // presumably the same composition as for A, written via transform() - confirm against its definition
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+		} else
+		{
+			collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+		}
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; // shapeIndexA/shapeIndexB are not referenced below
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		int shapeTypeA = collidables[collidableIndexA].m_shapeType;
+		int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+		int pairIndex = i;
+		if ((shapeTypeA == SHAPE_PLANE) && (shapeTypeB==SHAPE_CONVEX_HULL)) // plane (A) vs convex hull (B); the returned contact index is ignored here
+		{
+			computeContactPlaneConvex( pairIndex, bodyIndexA,bodyIndexB,  collidableIndexA,collidableIndexB, 
+																rigidBodies,collidables,convexShapes,vertices,indices,
+																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posB,ornB);
+			return;
+		}
+		if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB==SHAPE_PLANE)) // convex hull (A) vs plane (B): A/B arguments swapped
+		{
+			computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, 
+																rigidBodies,collidables,convexShapes,vertices,indices,
+																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);
+			return;
+		}
+		if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB == SHAPE_SPHERE)) // convex hull (A) vs sphere (B): sphere passed first
+		{
+			float4 spherePos = rigidBodies[bodyIndexB].m_pos; // NOTE(review): uses the body position, not the child-adjusted posB - confirm this is intended
+			float sphereRadius = collidables[collidableIndexB].m_radius;
+			float4 convexPos = posA; // child-adjusted transform of A
+			float4 convexOrn = ornA;
+			computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA , collidableIndexB,collidableIndexA, 
+										rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,
+										spherePos,sphereRadius,convexPos,convexOrn);
+			return;
+		}
+		if ((shapeTypeA == SHAPE_SPHERE) && (shapeTypeB == SHAPE_CONVEX_HULL)) // sphere (A) vs convex hull (B)
+		{
+			float4 spherePos = rigidBodies[bodyIndexA].m_pos; // NOTE(review): uses the body position, not the child-adjusted posA - confirm this is intended
+			float sphereRadius = collidables[collidableIndexA].m_radius;
+			float4 convexPos = posB; // child-adjusted transform of B
+			float4 convexOrn = ornB;
+			computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, 
+										rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,
+										spherePos,sphereRadius,convexPos,convexOrn);
+			return;
+		}
+	}//	if (i<numCompoundPairs)
+bool pointInTriangle(const float4* vertices, const float4* normal, float4 *p ) // True when *p lies on one consistent side of all three edges of vertices[0..2].
+	const float4* p1 = &vertices[0];
+	const float4* p2 = &vertices[1];
+	const float4* p3 = &vertices[2];
+	float4 edge1;	edge1 = (*p2 - *p1); // edges taken in order p1->p2, p2->p3, p3->p1
+	float4 edge2;	edge2 = ( *p3 - *p2 );
+	float4 edge3;	edge3 = ( *p1 - *p3 );
+	float4 p1_to_p; p1_to_p = ( *p - *p1 ); // vector from each edge's start vertex to the query point
+	float4 p2_to_p; p2_to_p = ( *p - *p2 );
+	float4 p3_to_p; p3_to_p = ( *p - *p3 );
+	float4 edge1_normal; edge1_normal = ( cross(edge1,*normal)); // direction perpendicular to both the edge and *normal
+	float4 edge2_normal; edge2_normal = ( cross(edge2,*normal));
+	float4 edge3_normal; edge3_normal = ( cross(edge3,*normal));
+	float r1, r2, r3;
+	r1 = dot(edge1_normal,p1_to_p ); // sign tells which side of the edge the point is on
+	r2 = dot(edge2_normal,p2_to_p );
+	r3 = dot(edge3_normal,p3_to_p );
+	if ( r1 > 0 && r2 > 0 && r3 > 0 ) // strictly positive side of every edge
+		return true;
+    if ( r1 <= 0 && r2 <= 0 && r3 <= 0 ) // non-positive side of every edge (zero is accepted here)
+		return true;
+	return false; // mixed signs
+float segmentSqrDistance(float4 from, float4 to,float4 p, float4* nearest) // Returns the squared distance from p to segment from->to; *nearest receives the closest point on the segment.
+	float4 diff = p - from;
+	float4 v = to - from;
+	float t = dot(v,diff); // un-normalised projection of diff onto the segment direction
+	if (t > 0) 
+	{
+		float dotVV = dot(v,v);
+		if (t < dotVV) // closest point falls strictly between the endpoints
+		{
+			t /= dotVV; // normalise to the 0..1 segment parameter
+			diff -= t*v; // diff now runs from the closest point to p
+		} else 
+		{
+			t = 1; // clamp to the 'to' endpoint
+			diff -= v;
+		}
+	} else
+	{
+		t = 0; // clamp to the 'from' endpoint; diff already starts at 'from'
+	}
+	*nearest = from + t*v;
+	return dot(diff,diff);	
+void	computeContactSphereTriangle(int pairIndex, // Sphere vs one triangle given in the local frame of (pos, quat): appends at most one contact to globalContactsOut.
+									int bodyIndexA, int bodyIndexB,
+									int collidableIndexA, int collidableIndexB, 
+									__global const BodyData* rigidBodies, 
+									__global const btCollidableGpu* collidables,
+									const float4* triangleVertices,
+									__global struct b3Contact4Data* restrict globalContactsOut,
+									counter32_t nGlobalContactsOut,
+									int maxContactCapacity,
+									float4 spherePos2,
+									float radius,
+									float4 pos,
+									float4 quat,
+									int faceIndex
+									)
+	float4 invPos;
+	float4 invOrn;
+	trInverse(pos,quat, &invPos,&invOrn);
+	float4 spherePos = transform(&spherePos2,&invPos,&invOrn); // sphere position mapped through the inverse of (pos, quat)
+	int numFaces = 3; // not referenced below
+	float4 closestPnt = (float4)(0, 0, 0, 0);
+	float4 hitNormalWorld = (float4)(0, 0, 0, 0);
+	float minDist = -1000000.f; // placeholder until a contact distance is computed
+	bool bCollide = false;
+	//////////////////////////////////////
+	float4 sphereCenter;
+	sphereCenter = spherePos;
+	const float4* vertices = triangleVertices;
+	float contactBreakingThreshold = 0.f;//todo?
+	float radiusWithThreshold = radius + contactBreakingThreshold;
+	float4 edge10;
+	edge10 = vertices[1]-vertices[0];
+	edge10.w = 0.f;//is this needed?
+	float4 edge20;
+	edge20 = vertices[2]-vertices[0];
+	edge20.w = 0.f;//is this needed?
+	float4 normal = cross3(edge10,edge20); // triangle normal from two edges sharing vertex 0
+	normal = normalize(normal);
+	float4 p1ToCenter;
+	p1ToCenter = sphereCenter - vertices[0];
+	float distanceFromPlane = dot(p1ToCenter,normal); // signed distance of the sphere center to the triangle's plane
+	if (distanceFromPlane < 0.f)
+	{
+		//triangle facing the other way
+		distanceFromPlane *= -1.f;
+		normal *= -1.f;
+	}
+	hitNormalWorld = normal;
+	bool isInsideContactPlane = distanceFromPlane < radiusWithThreshold; // the sphere reaches the triangle's plane
+	// Check for contact / intersection
+	bool hasContact = false;
+	float4 contactPoint;
+	if (isInsideContactPlane) 
+	{
+		if (pointInTriangle(vertices,&normal, &sphereCenter)) 
+		{
+			// Inside the contact wedge - touches a point on the shell plane
+			hasContact = true;
+			contactPoint = sphereCenter - normal*distanceFromPlane; // sphere center projected onto the plane
+		} else {
+			// Could be inside one of the contact capsules
+			float contactCapsuleRadiusSqr = radiusWithThreshold*radiusWithThreshold;
+			float4 nearestOnEdge;
+			int numEdges = 3;
+			for (int i = 0; i < numEdges; i++) // no break: when several edges qualify, the last one tested supplies the contact point
+			{
+				float4 pa =vertices[i];
+				float4 pb = vertices[(i+1)%3];
+				float distanceSqr = segmentSqrDistance(pa,pb,sphereCenter, &nearestOnEdge);
+				if (distanceSqr < contactCapsuleRadiusSqr) 
+				{
+					// Yep, we're inside a capsule
+					hasContact = true;
+					contactPoint = nearestOnEdge;
+				}
+			}
+		}
+	}
+	if (hasContact) 
+	{
+		closestPnt = contactPoint;
+		float4 contactToCenter = sphereCenter - contactPoint;
+		minDist = length(contactToCenter); // distance from the contact point to the sphere center
+		if (minDist>FLT_EPSILON) // a center sitting on the contact point leaves bCollide false
+		{
+			hitNormalWorld = normalize(contactToCenter);//*(1./minDist);
+			bCollide  = true;
+		}
+	}
+	/////////////////////////////////////
+	if (bCollide && minDist > -10000)
+	{
+		float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld); // negated hit normal rotated by quat
+		float4 pOnB1 = transform(&closestPnt,&pos,&quat); // contact point mapped through (pos, quat)
+		float actualDepth = minDist-radius; // gap between the contact point and the sphere surface; <= 0 means touching or overlapping
+		if (actualDepth<=0.f)
+		{
+			pOnB1.w = actualDepth; // w component carries the depth
+			int dstIdx;
+			float lenSqr = dot3F4(normalOnSurfaceB1,normalOnSurfaceB1);
+			if (lenSqr>FLT_EPSILON) // skip contacts whose normal is degenerate
+			{
+				AppendInc( nGlobalContactsOut, dstIdx ); // dstIdx receives the output slot index
+				if (dstIdx < maxContactCapacity) // the contact is dropped when the slot lies past capacity
+				{
+					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+					c->m_worldNormalOnB = -normalOnSurfaceB1; // second negation: stores the rotated hitNormalWorld
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // fixed restitution 0 and friction 0.7, each scaled by 0xffff
+					c->m_batchIdx = pairIndex;
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; // index is negated when m_invMass == 0
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+					c->m_worldPosB[0] = pOnB1;
+					c->m_childIndexA = -1;
+					c->m_childIndexB = faceIndex; // the triangle's face index is recorded as child index B
+					GET_NPOINTS(*c) = 1; // exactly one contact point
+				} 
+			}
+		}
+	}//if (bCollide && minDist > -10000)
+// work-in-progress
+__kernel void   findConcaveSphereContactsKernel( __global int4* concavePairs, // One work-item per concave pair: tests a sphere (body B) against one triangle face of body A.
+												__global const BodyData* rigidBodies,
+												__global const btCollidableGpu* collidables,
+												__global const ConvexPolyhedronCL* convexShapes, 
+												__global const float4* vertices,
+												__global const float4* uniqueEdges,
+												__global const btGpuFace* faces,
+												__global const int* indices,
+												__global btAabbCL* aabbs,
+												__global struct b3Contact4Data* restrict globalContactsOut,
+												counter32_t nGlobalContactsOut,
+													int numConcavePairs, int maxContactCapacity
+												)
+	int i = get_global_id(0);
+	if (i>=numConcavePairs) // work-items past the pair count do nothing
+		return;
+	int pairIdx = i; // not referenced below
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; // not referenced below
+	if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE) // only a sphere as shape B is handled in the visible code
+	{
+		int f = concavePairs[i].z; // face index within shape A, read from the pair's z component
+		btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+		float4 verticesA[3];
+		for (int i=0;i<3;i++) // this inner i shadows the work-item index for the duration of the loop
+		{
+			int index = indices[face.m_indexOffset+i];
+			float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+			verticesA[i] = vert; // first three indexed vertices of the face, taken as the triangle
+		}
+		float4 spherePos = rigidBodies[bodyIndexB].m_pos;
+		float sphereRadius = collidables[collidableIndexB].m_radius;
+		float4 convexPos = rigidBodies[bodyIndexA].m_pos;
+		float4 convexOrn = rigidBodies[bodyIndexA].m_quat;
+		computeContactSphereTriangle(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, // sphere passed first; i is the work-item index again here
+																rigidBodies,collidables,
+																verticesA,
+																globalContactsOut, nGlobalContactsOut,maxContactCapacity,
+																spherePos,sphereRadius,convexPos,convexOrn, f);
+		return;
+	}
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h
new file mode 100644
index 00000000..b0103fe6
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h
@@ -0,0 +1,1289 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* primitiveContactsKernelsCL= \
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define SHAPE_PLANE 4\n"
+"#define SHAPE_SPHERE 7\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#define counter32_t volatile __global int*\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"typedef unsigned int u32;\n"
+"typedef struct \n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"///keep this in sync with btCollidable.h\n"
+"typedef struct\n"
+"	int m_numChildShapes;\n"
+"	float m_radius;\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"typedef struct\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n"
+"typedef struct\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_collidableIdx;	\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"typedef struct  \n"
+"	float4		m_localCenter;\n"
+"	float4		m_extents;\n"
+"	float4		mC;\n"
+"	float4		mE;\n"
+"	\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"} ConvexPolyhedronCL;\n"
+"typedef struct\n"
+"	float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"} btGpuFace;\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"float fastDiv(float numerator, float denominator)\n"
+"	return native_divide(numerator, denominator);	\n"
+"//	return numerator/denominator;	\n"
+"float4 fastDiv4(float4 numerator, float4 denominator)\n"
+"	return native_divide(numerator, denominator);	\n"
+"float4 cross3(float4 a, float4 b)\n"
+"	return cross(a,b);\n"
+"//#define dot3F4 dot\n"
+"float dot3F4(float4 a, float4 b)\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"float4 fastNormalize4(float4 v)\n"
+"	return fast_normalize(v);\n"
+"//	Quaternion\n"
+"typedef float4 Quaternion;\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"	return fastNormalize4(in);\n"
+"//	in /= length( in );\n"
+"//	return in;\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"	return qtRotate( *orientation, *p ) + (*translation);\n"
+"void	trInverse(float4 translationIn, Quaternion orientationIn,\n"
+"		float4* translationOut, Quaternion* orientationOut)\n"
+"	*orientationOut = qtInvert(orientationIn);\n"
+"	*translationOut = qtRotate(*orientationOut, -translationIn);\n"
+"void	trMul(float4 translationA, Quaternion orientationA,\n"
+"						float4 translationB, Quaternion orientationB,\n"
+"		float4* translationOut, Quaternion* orientationOut)\n"
+"	*orientationOut = qtMul(orientationA,orientationB);\n"
+"	*translationOut = transform(&translationB,&translationA,&orientationA);\n"
+"float4 normalize3(const float4 a)\n"
+"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"__inline float4 lerp3(const float4 a,const float4 b, float  t)\n"
+"	return make_float4(	a.x + (b.x - a.x) * t,\n"
+"						a.y + (b.y - a.y) * t,\n"
+"						a.z + (b.z - a.z) * t,\n"
+"						0.f);\n"
+"float signedDistanceFromPointToPlane(float4 point, float4 planeEqn, float4* closestPointOnFace)\n"
+"	float4 n = (float4)(planeEqn.x, planeEqn.y, planeEqn.z, 0);\n"
+"	float dist = dot3F4(n, point) + planeEqn.w;\n"
+"	*closestPointOnFace = point - dist * n;\n"
+"	return dist;\n"
+"inline bool IsPointInPolygon(float4 p, \n" // Returns true when p is on the inner side of every face edge; otherwise writes the closest point on the first failing edge to *out and returns false.
+"							const btGpuFace* face,\n"
+"							__global const float4* baseVertex,\n"
+"							__global const  int* convexIndices,\n"
+"							float4* out)\n"
+"    float4 a;\n"
+"    float4 b;\n"
+"    float4 ab;\n"
+"    float4 ap;\n"
+"    float4 v;\n"
+"	float4 plane = make_float4(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f);\n" // face normal only, plane constant dropped
+"	\n"
+"	if (face->m_numIndices<2)\n" // degenerate face: reports 'outside' and leaves *out unwritten
+"		return false;\n"
+"	\n"
+"	float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]];\n" // start at the last vertex so the first edge is (last, first)
+"	\n"
+"	b = v0;\n"
+"    for(unsigned i=0; i != face->m_numIndices; ++i)\n"
+"    {\n"
+"		a = b;\n"
+"		float4 vi = baseVertex[convexIndices[face->m_indexOffset + i]];\n"
+"		b = vi;\n"
+"        ab = b-a;\n"
+"        ap = p-a;\n"
+"        v = cross3(ab,plane);\n" // vector perpendicular to both the edge and the face normal
+"        if (dot(ap, v) > 0.f)\n" // p is on the positive side of this edge: treated as outside
+"        {\n"
+"            float ab_m2 = dot(ab, ab);\n"
+"            float rt = ab_m2 != 0.f ? dot(ab, ap) / ab_m2 : 0.f;\n" // parameter of p projected on the edge; 0 for a zero-length edge
+"            if (rt <= 0.f)\n"
+"            {\n"
+"                *out = a;\n"
+"            }\n"
+"            else if (rt >= 1.f) \n"
+"            {\n"
+"                *out = b;\n"
+"            }\n"
+"            else\n"
+"            {\n"
+"            	float s = 1.f - rt;\n"
+"				out[0].x = s * a.x + rt * b.x;\n"
+"				out[0].y = s * a.y + rt * b.y;\n"
+"				out[0].z = s * a.z + rt * b.z;\n"
+"				out[0].w = s * a.w + rt * b.w;\n" // keep w defined like the endpoint cases; callers take a 4-lane dot() of (p - out)
+"            }\n"
+"            return false;\n"
+"        }\n"
+"    }\n"
+"    return true;\n"
+"void	computeContactSphereConvex(int pairIndex,\n" // Sphere vs. convex hull: walks the hull faces and appends at most one contact to globalContactsOut.
+"																int bodyIndexA, int bodyIndexB, \n"
+"																int collidableIndexA, int collidableIndexB, \n"
+"																__global const BodyData* rigidBodies, \n"
+"																__global const btCollidableGpu* collidables,\n"
+"																__global const ConvexPolyhedronCL* convexShapes,\n"
+"																__global const float4* convexVertices,\n"
+"																__global const int* convexIndices,\n"
+"																__global const btGpuFace* faces,\n"
+"																__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																counter32_t nGlobalContactsOut,\n"
+"																int maxContactCapacity,\n"
+"																float4 spherePos2,\n" // sphere center before the inverse (pos, quat) transform
+"																float radius,\n"
+"																float4 pos,\n" // transform of the convex shape (collidableIndexB)
+"																float4 quat\n"
+"																)\n"
+"	float4 invPos;\n"
+"	float4 invOrn;\n"
+"	trInverse(pos,quat, &invPos,&invOrn);\n"
+"	float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n" // presumably the sphere center in the hull's local frame (helpers defined elsewhere)
+"	int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n"
+"	int numFaces = convexShapes[shapeIndex].m_numFaces;\n"
+"	float4 closestPnt = (float4)(0, 0, 0, 0);\n"
+"	float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n"
+"	float minDist = -1000000.f;\n" // sentinel; the largest accepted distance over all faces is tracked here
+"	bool bCollide = true;\n"
+"	for ( int f = 0; f < numFaces; f++ )\n"
+"	{\n"
+"		btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n"
+"		// set up a plane equation \n"
+"		float4 planeEqn;\n"
+"		float4 n1 = face.m_plane;\n"
+"		n1.w = 0.f;\n"
+"		planeEqn = n1;\n"
+"		planeEqn.w = face.m_plane.w;\n"
+"		\n"
+"	\n"
+"		// compute a signed distance from the vertex in cloth to the face of rigidbody.\n"
+"		float4 pntReturn;\n"
+"		float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);\n"
+"		// If the distance is positive, the plane is a separating plane. \n"
+"		if ( dist > radius )\n" // sphere lies entirely in front of this face: no contact possible
+"		{\n"
+"			bCollide = false;\n"
+"			break;\n"
+"		}\n"
+"		if (dist>0)\n" // center is in front of the face but within one radius of its plane
+"		{\n"
+"			//might hit an edge or vertex\n"
+"			float4 out;\n"
+"			float4 zeroPos = make_float4(0,0,0,0);\n"
+"			bool isInPoly = IsPointInPolygon(spherePos,\n"
+"					&face,\n"
+"					&convexVertices[convexShapes[shapeIndex].m_vertexOffset],\n"
+"					convexIndices,\n"
+"           &out);\n"
+"			if (isInPoly)\n"
+"			{\n"
+"				if (dist>minDist)\n"
+"				{\n"
+"					minDist = dist;\n"
+"					closestPnt = pntReturn;\n"
+"					hitNormalWorld = planeEqn;\n"
+"					\n"
+"				}\n"
+"			} else\n"
+"			{\n"
+"				float4 tmp = spherePos-out;\n" // out = closest point on the failing edge
+"				float l2 = dot(tmp,tmp);\n"
+"				if (l2<radius*radius)\n"
+"				{\n"
+"					dist  = sqrt(l2);\n"
+"					if (dist>minDist)\n"
+"					{\n"
+"						minDist = dist;\n"
+"						closestPnt = out;\n"
+"						hitNormalWorld = tmp/dist;\n"
+"						\n"
+"					}\n"
+"					\n"
+"				} else\n"
+"				{\n"
+"					bCollide = false;\n"
+"					break;\n"
+"				}\n"
+"			}\n"
+"		} else\n"
+"		{\n"
+"			if ( dist > minDist )\n" // center is behind the face: keep the face with the largest (least negative) distance
+"			{\n"
+"				minDist = dist;\n"
+"				closestPnt = pntReturn;\n"
+"				hitNormalWorld.xyz = planeEqn.xyz;\n"
+"			}\n"
+"		}\n"
+"		\n"
+"	}\n"
+"	\n"
+"	if (bCollide && minDist > -10000)\n" // also rejects the untouched -1000000 sentinel (e.g. numFaces == 0)
+"	{\n"
+"		float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);\n"
+"		float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n"
+"		\n"
+"		float actualDepth = minDist-radius;\n"
+"		if (actualDepth<=0.f)\n"
+"		{\n"
+"			\n"
+"			pOnB1.w = actualDepth;\n" // depth travels in the w lane of the contact position
+"			int dstIdx;\n"
+"			AppendInc( nGlobalContactsOut, dstIdx );\n"
+"		\n"
+"			\n"
+"			if (dstIdx < maxContactCapacity)\n" // guard restored: never write past the end of globalContactsOut
+"			{\n"
+"				__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"				c->m_worldNormalOnB = -normalOnSurfaceB1;\n"
+"				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" // restitution 0 and friction 0.7, scaled by 0xffff
+"				c->m_batchIdx = pairIndex;\n"
+"				c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" // bodies with zero inverse mass are stored with a negated index
+"				c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"				c->m_worldPosB[0] = pOnB1;\n"
+"				c->m_childIndexA = -1;\n"
+"				c->m_childIndexB = -1;\n"
+"				GET_NPOINTS(*c) = 1;\n"
+"			} \n"
+"		}\n"
+"	}//if (hasCollision)\n"
+"							\n"
+"int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n" // Picks up to 4 of the points in p; returns nPoints unchanged when nPoints <= 4 (contactIdx untouched), otherwise 4 with the chosen indices in contactIdx[0].
+"	if( nPoints == 0 )\n"
+"        return 0;\n"
+"    \n"
+"    if (nPoints <=4)\n"
+"        return nPoints;\n"
+"    \n"
+"    \n"
+"    if (nPoints >64)\n" // only the first 64 points are considered
+"        nPoints = 64;\n"
+"    \n"
+"	float4 center = make_float4(0.f);\n"
+"	{\n"
+"		\n"
+"		for (int i=0;i<nPoints;i++)\n"
+"			center += p[i];\n"
+"		center /= (float)nPoints;\n" // mean of the points (all four lanes, including w)
+"	}\n"
+"    \n"
+"	\n"
+"    \n"
+"	//	sample 4 directions\n"
+"    \n"
+"    float4 aVector = p[0] - center;\n"
+"    float4 u = cross3( nearNormal, aVector );\n" // u and v are built perpendicular to nearNormal
+"    float4 v = cross3( nearNormal, u );\n"
+"    u = normalize3( u );\n"
+"    v = normalize3( v );\n"
+"    \n"
+"    \n"
+"    //keep point with deepest penetration\n"
+"    float minW= FLT_MAX;\n"
+"    \n"
+"    int minIndex=-1;\n"
+"    \n"
+"    float4 maxDots;\n" // NOTE(review): FLT_MIN is the smallest positive float and the tests below use '<', so each lane ends up holding the smallest projection seen, not the largest - confirm this is intended
+"    maxDots.x = FLT_MIN;\n"
+"    maxDots.y = FLT_MIN;\n"
+"    maxDots.z = FLT_MIN;\n"
+"    maxDots.w = FLT_MIN;\n"
+"    \n"
+"    //	idx, distance\n"
+"    for(int ie = 0; ie<nPoints; ie++ )\n"
+"    {\n"
+"        if (p[ie].w<minW)\n" // w carries the signed distance; the smallest one is the deepest point
+"        {\n"
+"            minW = p[ie].w;\n"
+"            minIndex=ie;\n"
+"        }\n"
+"        float f;\n"
+"        float4 r = p[ie]-center;\n"
+"        f = dot3F4( u, r );\n"
+"        if (f<maxDots.x)\n"
+"        {\n"
+"            maxDots.x = f;\n"
+"            contactIdx[0].x = ie;\n"
+"        }\n"
+"        \n"
+"        f = dot3F4( -u, r );\n"
+"        if (f<maxDots.y)\n"
+"        {\n"
+"            maxDots.y = f;\n"
+"            contactIdx[0].y = ie;\n"
+"        }\n"
+"        \n"
+"        \n"
+"        f = dot3F4( v, r );\n"
+"        if (f<maxDots.z)\n"
+"        {\n"
+"            maxDots.z = f;\n"
+"            contactIdx[0].z = ie;\n"
+"        }\n"
+"        \n"
+"        f = dot3F4( -v, r );\n"
+"        if (f<maxDots.w)\n"
+"        {\n"
+"            maxDots.w = f;\n"
+"            contactIdx[0].w = ie;\n"
+"        }\n"
+"        \n"
+"    }\n"
+"    \n"
+"    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n" // make sure the deepest point is one of the four
+"    {\n"
+"        //replace the first contact with minimum (todo: replace contact with least penetration)\n"
+"        contactIdx[0].x = minIndex;\n"
+"    }\n"
+"    \n"
+"    return 4;\n"
+"    \n"
+"int computeContactPlaneConvex(int pairIndex,\n" // Plane (A) vs. convex hull (B): collects hull vertices behind the plane, reduces them to at most 4 and appends one contact; returns its index in globalContactsOut, or -1 if nothing was written.
+"								int bodyIndexA, int bodyIndexB, \n"
+"								int collidableIndexA, int collidableIndexB, \n"
+"								__global const BodyData* rigidBodies, \n"
+"								__global const btCollidableGpu*collidables,\n"
+"								__global const ConvexPolyhedronCL* convexShapes,\n"
+"								__global const float4* convexVertices,\n"
+"								__global const int* convexIndices,\n"
+"								__global const btGpuFace* faces,\n"
+"								__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"								counter32_t nGlobalContactsOut,\n"
+"								int maxContactCapacity,\n"
+"								float4 posB,\n" // transform of the hull is passed in; the plane's transform is read from rigidBodies[bodyIndexA]
+"								Quaternion ornB\n"
+"								)\n"
+"	int resultIndex=-1;\n"
+"		int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n"
+"	__global const ConvexPolyhedronCL* hullB = &convexShapes[shapeIndex];\n"
+"	\n"
+"	float4 posA;\n"
+"	posA = rigidBodies[bodyIndexA].m_pos;\n"
+"	Quaternion ornA;\n"
+"	ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"	int numContactsOut = 0;\n"
+"	int numWorldVertsB1= 0;\n"
+"	float4 planeEq;\n"
+"	 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n" // the plane shape's m_shapeIndex is used directly as an index into faces
+"	float4 planeNormal = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n"
+"	float4 planeNormalWorld;\n"
+"	planeNormalWorld = qtRotate(ornA,planeNormal);\n"
+"	float planeConstant = planeEq.w;\n"
+"	\n"
+"	float4 invPosA;Quaternion invOrnA;\n"
+"	float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n"
+"	{\n"
+"		\n"
+"		trInverse(posA,ornA,&invPosA,&invOrnA);\n"
+"		trMul(invPosA,invOrnA,posB,ornB,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n"
+"	}\n"
+"	float4 invPosB;Quaternion invOrnB;\n"
+"	float4 planeInConvexPos1;	Quaternion planeInConvexOrn1;\n"
+"	{\n"
+"		\n"
+"		trInverse(posB,ornB,&invPosB,&invOrnB);\n"
+"		trMul(invPosB,invOrnB,posA,ornA,&planeInConvexPos1,&planeInConvexOrn1);	\n"
+"	}\n"
+"	\n"
+"	float4 planeNormalInConvex = qtRotate(planeInConvexOrn1,-planeNormal);\n" // negated plane normal rotated by planeInConvexOrn1
+"	float maxDot = -1e30;\n"
+"	int hitVertex=-1;\n"
+"	float4 hitVtx;\n"
+"	float4 contactPoints[MAX_PLANE_CONVEX_POINTS];\n"
+"	int numPoints = 0;\n"
+"	int4 contactIdx;\n"
+"	contactIdx=make_int4(0,1,2,3);\n" // default selection, used as-is when 4 or fewer points are collected
+"    \n"
+"	\n"
+"	for (int i=0;i<hullB->m_numVertices;i++)\n"
+"	{\n"
+"		float4 vtx = convexVertices[hullB->m_vertexOffset+i];\n"
+"		float curDot = dot(vtx,planeNormalInConvex);\n"
+"		if (curDot>maxDot)\n"
+"		{\n"
+"			hitVertex=i;\n"
+"			maxDot=curDot;\n"
+"			hitVtx = vtx;\n"
+"			//make sure the deepest points is always included\n"
+"			if (numPoints==MAX_PLANE_CONVEX_POINTS)\n" // buffer full: free the last slot so this vertex can replace it below
+"				numPoints--;\n"
+"		}\n"
+"		if (numPoints<MAX_PLANE_CONVEX_POINTS)\n"
+"		{\n"
+"			float4 vtxWorld = transform(&vtx, &posB, &ornB);\n"
+"			float4 vtxInPlane = transform(&vtxWorld, &invPosA, &invOrnA);//oplaneTransform.inverse()*vtxWorld;\n"
+"			float dist = dot(planeNormal,vtxInPlane)-planeConstant;\n"
+"			if (dist<0.f)\n" // only vertices behind the plane become contact candidates
+"			{\n"
+"				vtxWorld.w = dist;\n" // signed distance is stored in the w lane
+"				contactPoints[numPoints] = vtxWorld;\n"
+"				numPoints++;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	int numReducedPoints  = numPoints;\n"
+"	if (numPoints>4)\n"
+"	{\n"
+"		numReducedPoints = extractManifoldSequential( contactPoints, numPoints, planeNormalInConvex, &contactIdx);\n"
+"	}\n"
+"	if (numReducedPoints>0)\n"
+"	{\n"
+"		int dstIdx;\n"
+"	    AppendInc( nGlobalContactsOut, dstIdx );\n"
+"		if (dstIdx < maxContactCapacity)\n" // skip the write when the output buffer is full
+"		{\n"
+"			resultIndex = dstIdx;\n"
+"			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"			c->m_worldNormalOnB = -planeNormalWorld;\n"
+"			//c->setFrictionCoeff(0.7);\n"
+"			//c->setRestituitionCoeff(0.f);\n"
+"			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"			c->m_batchIdx = pairIndex;\n"
+"			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" // zero inverse mass is encoded as a negated body index
+"			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"			c->m_childIndexA = -1;\n"
+"			c->m_childIndexB = -1;\n"
+"			switch (numReducedPoints)\n" // no break statements: case N falls through and fills slots N-1 .. 0
+"            {\n"
+"                case 4:\n"
+"                    c->m_worldPosB[3] = contactPoints[contactIdx.w];\n"
+"                case 3:\n"
+"                    c->m_worldPosB[2] = contactPoints[contactIdx.z];\n"
+"                case 2:\n"
+"                    c->m_worldPosB[1] = contactPoints[contactIdx.y];\n"
+"                case 1:\n"
+"                    c->m_worldPosB[0] = contactPoints[contactIdx.x];\n"
+"                default:\n"
+"                {\n"
+"                }\n"
+"            };\n"
+"			\n"
+"			GET_NPOINTS(*c) = numReducedPoints;\n"
+"		}//if (dstIdx < numPairs)\n"
+"	}	\n"
+"	return resultIndex;\n"
+"void	computeContactPlaneSphere(int pairIndex,\n" // Plane (A) vs. sphere (B): appends a single contact when the sphere point nearest the plane has a negative plane distance.
+"																int bodyIndexA, int bodyIndexB, \n"
+"																int collidableIndexA, int collidableIndexB, \n"
+"																__global const BodyData* rigidBodies, \n"
+"																__global const btCollidableGpu* collidables,\n"
+"																__global const btGpuFace* faces,\n"
+"																__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																counter32_t nGlobalContactsOut,\n"
+"																int maxContactCapacity)\n"
+"	float4 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n" // the plane shape's m_shapeIndex indexes faces directly
+"	float radius = collidables[collidableIndexB].m_radius;\n"
+"	float4 posA1 = rigidBodies[bodyIndexA].m_pos;\n"
+"	float4 ornA1 = rigidBodies[bodyIndexA].m_quat;\n"
+"	float4 posB1 = rigidBodies[bodyIndexB].m_pos;\n"
+"	float4 ornB1 = rigidBodies[bodyIndexB].m_quat;\n"
+"	\n"
+"	bool hasCollision = false;\n"
+"	float4 planeNormal1 = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n"
+"	float planeConstant = planeEq.w;\n"
+"	float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n"
+"	{\n"
+"		float4 invPosA;Quaternion invOrnA;\n"
+"		trInverse(posA1,ornA1,&invPosA,&invOrnA);\n"
+"		trMul(invPosA,invOrnA,posB1,ornB1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n" // inverse(A) combined with B
+"	}\n"
+"	float4 planeInConvexPos1;	Quaternion planeInConvexOrn1;\n"
+"	{\n"
+"		float4 invPosB;Quaternion invOrnB;\n"
+"		trInverse(posB1,ornB1,&invPosB,&invOrnB);\n"
+"		trMul(invPosB,invOrnB,posA1,ornA1,&planeInConvexPos1,&planeInConvexOrn1);	\n" // inverse(B) combined with A
+"	}\n"
+"	float4 vtx1 = qtRotate(planeInConvexOrn1,-planeNormal1)*radius;\n" // sphere surface point one radius along the negated plane normal
+"	float4 vtxInPlane1 = transform(&vtx1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n"
+"	float distance = dot3F4(planeNormal1,vtxInPlane1) - planeConstant;\n"
+"	hasCollision = distance < 0.f;//m_manifoldPtr->getContactBreakingThreshold();\n"
+"	if (hasCollision)\n"
+"	{\n"
+"		float4 vtxInPlaneProjected1 = vtxInPlane1 -   distance*planeNormal1;\n"
+"		float4 vtxInPlaneWorld1 = transform(&vtxInPlaneProjected1,&posA1,&ornA1);\n"
+"		float4 normalOnSurfaceB1 = qtRotate(ornA1,planeNormal1);\n"
+"		float4 pOnB1 = vtxInPlaneWorld1+normalOnSurfaceB1*distance;\n"
+"		pOnB1.w = distance;\n" // signed distance is stored in the w lane of the contact position
+"		int dstIdx;\n"
+"    AppendInc( nGlobalContactsOut, dstIdx );\n"
+"		\n"
+"		if (dstIdx < maxContactCapacity)\n" // skip the write when the output buffer is full
+"		{\n"
+"			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"			c->m_worldNormalOnB = -normalOnSurfaceB1;\n"
+"			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"			c->m_batchIdx = pairIndex;\n"
+"			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" // zero inverse mass is encoded as a negated body index
+"			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"			c->m_worldPosB[0] = pOnB1;\n"
+"			c->m_childIndexA = -1;\n"
+"			c->m_childIndexB = -1;\n"
+"			GET_NPOINTS(*c) = 1;\n"
+"		}//if (dstIdx < numPairs)\n"
+"	}//if (hasCollision)\n"
+"__kernel void   primitiveContactsKernel( __global int4* pairs, \n" // One work-item per pair: dispatches on the two shape types to the plane/convex, plane/sphere and sphere/convex helpers, and handles sphere/sphere inline.
+"																					__global const BodyData* rigidBodies, \n"
+"																					__global const btCollidableGpu* collidables,\n"
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const btGpuFace* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int numPairs, int maxContactCapacity)\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	float4 worldVertsB1[64];\n" // NOTE(review): the locals declared from here to maxDist are not referenced anywhere in the visible kernel body
+"	float4 worldVertsB2[64];\n"
+"	int capacityWorldVerts = 64;	\n"
+"	float4 localContactsOut[64];\n"
+"	int localContactCapacity=64;\n"
+"	\n"
+"	float minDist = -1e30f;\n"
+"	float maxDist = 0.02f;\n"
+"	if (i<numPairs)\n" // work-items past the end of the pair list do nothing
+"	{\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"			\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)\n"
+"		{\n"
+"			float4 posB;\n"
+"			posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			Quaternion ornB;\n"
+"			ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"			int contactIndex = computeContactPlaneConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
+"																rigidBodies,collidables,convexShapes,vertices,indices,\n"
+"																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity, posB,ornB);\n"
+"			if (contactIndex>=0)\n"
+"				pairs[pairIndex].z = contactIndex;\n" // record where this pair's contact was written
+"			return;\n"
+"		}\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n"
+"		{\n"
+"			float4 posA;\n"
+"			posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			Quaternion ornA;\n"
+"			ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			int contactIndex = computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, \n" // arguments swapped so the plane is passed in the A slot
+"																rigidBodies,collidables,convexShapes,vertices,indices,\n"
+"																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n"
+"			if (contactIndex>=0)\n"
+"				pairs[pairIndex].z = contactIndex;\n"
+"			return;\n"
+"		}\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n"
+"		{\n"
+"			computeContactPlaneSphere(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
+"																rigidBodies,collidables,faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n"
+"			return;\n"
+"		}\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n"
+"		{\n"
+"			computeContactPlaneSphere( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, \n" // arguments swapped so the plane is passed in the A slot
+"																rigidBodies,collidables,\n"
+"																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n"
+"			return;\n"
+"		}\n"
+"		\n"
+"	\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)\n"
+"		{\n"
+"		\n"
+"			float4 spherePos = rigidBodies[bodyIndexA].m_pos;\n"
+"			float sphereRadius = collidables[collidableIndexA].m_radius;\n"
+"			float4 convexPos = rigidBodies[bodyIndexB].m_pos;\n"
+"			float4 convexOrn = rigidBodies[bodyIndexB].m_quat;\n"
+"			computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
+"																rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
+"																spherePos,sphereRadius,convexPos,convexOrn);\n"
+"			return;\n"
+"		}\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n"
+"		{\n"
+"		\n"
+"			float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n"
+"			float sphereRadius = collidables[collidableIndexB].m_radius;\n"
+"			float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n"
+"			float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n"
+"			computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n" // arguments swapped so the sphere is passed in the A slot
+"																rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
+"																spherePos,sphereRadius,convexPos,convexOrn);\n"
+"			return;\n"
+"		}\n"
+"	\n"
+"	\n"
+"	\n"
+"		\n"
+"	\n"
+"	\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n"
+"		{\n"
+"			//sphere-sphere\n"
+"			float radiusA = collidables[collidableIndexA].m_radius;\n"
+"			float radiusB = collidables[collidableIndexB].m_radius;\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			float4 diff = posA-posB;\n"
+"			float len = length(diff);\n" // length() covers all four lanes, so any difference in w contributes
+"			\n"
+"			///iff distance positive, don't generate a new contact\n"
+"			if ( len <= (radiusA+radiusB))\n"
+"			{\n"
+"				///distance (negative means penetration)\n"
+"				float dist = len - (radiusA+radiusB);\n"
+"				float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n" // fallback normal for (nearly) coincident centers
+"				if (len > 0.00001)\n"
+"				{\n"
+"					normalOnSurfaceB = diff / len;\n"
+"				}\n"
+"				float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n"
+"				contactPosB.w = dist;\n" // signed distance is stored in the w lane
+"								\n"
+"				int dstIdx;\n"
+"				 AppendInc( nGlobalContactsOut, dstIdx );\n"
+"				\n"
+"				if (dstIdx < maxContactCapacity)\n" // skip the write when the output buffer is full
+"				{\n"
+"					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"					c->m_worldNormalOnB = normalOnSurfaceB;\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = pairs[pairIndex].x;\n"
+"					int bodyB = pairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" // zero inverse mass is encoded as a negated body index
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"					c->m_worldPosB[0] = contactPosB;\n"
+"					c->m_childIndexA = -1;\n"
+"					c->m_childIndexB = -1;\n"
+"					GET_NPOINTS(*c) = 1;\n"
+"				}//if (dstIdx < numPairs)\n"
+"			}//if ( len <= (radiusA+radiusB))\n"
+"			return;\n"
+"	}//	if (i<numPairs)\n"
+"// work-in-progress\n"
+"__kernel void   processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs,\n" // One work-item per compound pair: resolves optional child shapes (z/w >= 0) to their own collidable and transform, then dispatches plane/convex and sphere/convex tests.
+"													__global const BodyData* rigidBodies, \n"
+"													__global const btCollidableGpu* collidables,\n"
+"													__global const ConvexPolyhedronCL* convexShapes, \n"
+"													__global const float4* vertices,\n"
+"													__global const float4* uniqueEdges,\n"
+"													__global const btGpuFace* faces,\n"
+"													__global const int* indices,\n"
+"													__global btAabbCL* aabbs,\n"
+"													__global const btGpuChildShape* gpuChildShapes,\n"
+"													__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"													counter32_t nGlobalContactsOut,\n"
+"													int numCompoundPairs, int maxContactCapacity\n"
+"													)\n"
+"	int i = get_global_id(0);\n"
+"	if (i<numCompoundPairs)\n"
+"	{\n"
+"		int bodyIndexA = gpuCompoundPairs[i].x;\n"
+"		int bodyIndexB = gpuCompoundPairs[i].y;\n"
+"		int childShapeIndexA = gpuCompoundPairs[i].z;\n"
+"		int childShapeIndexB = gpuCompoundPairs[i].w;\n"
+"		\n"
+"		int collidableIndexA = -1;\n"
+"		int collidableIndexB = -1;\n"
+"		\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		\n"
+"		float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"							\n"
+"		if (childShapeIndexA >= 0)\n" // A is a child shape: compose the child's local transform with the body transform
+"		{\n"
+"			collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"
+"			float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
+"			float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
+"			float4 newPosA = qtRotate(ornA,childPosA)+posA;\n"
+"			float4 newOrnA = qtMul(ornA,childOrnA);\n"
+"			posA = newPosA;\n"
+"			ornA = newOrnA;\n"
+"		} else\n"
+"		{\n"
+"			collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		}\n"
+"		\n"
+"		if (childShapeIndexB>=0)\n" // same for B
+"		{\n"
+"			collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"		} else\n"
+"		{\n"
+"			collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	\n"
+"		}\n"
+"	\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"	\n"
+"		int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n"
+"		int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
+"		int pairIndex = i;\n"
+"		if ((shapeTypeA == SHAPE_PLANE) && (shapeTypeB==SHAPE_CONVEX_HULL))\n"
+"		{\n"
+"			computeContactPlaneConvex( pairIndex, bodyIndexA,bodyIndexB,  collidableIndexA,collidableIndexB, \n"
+"																rigidBodies,collidables,convexShapes,vertices,indices,\n"
+"																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posB,ornB);\n"
+"			return;\n"
+"		}\n"
+"		if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB==SHAPE_PLANE))\n"
+"		{\n"
+"			computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, \n" // arguments swapped so the plane is passed in the A slot
+"																rigidBodies,collidables,convexShapes,vertices,indices,\n"
+"																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n"
+"			return;\n"
+"		}\n"
+"		if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB == SHAPE_SPHERE))\n"
+"		{\n"
+"			float4 spherePos = posB;\n" // use the child-adjusted position; equals rigidBodies[bodyIndexB].m_pos when B is not a child shape
+"			float sphereRadius = collidables[collidableIndexB].m_radius;\n"
+"			float4 convexPos = posA;\n"
+"			float4 convexOrn = ornA;\n"
+"			\n"
+"			computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA , collidableIndexB,collidableIndexA, \n" // arguments swapped so the sphere is passed in the A slot
+"										rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
+"										spherePos,sphereRadius,convexPos,convexOrn);\n"
+"	\n"
+"			return;\n"
+"		}\n"
+"		if ((shapeTypeA == SHAPE_SPHERE) && (shapeTypeB == SHAPE_CONVEX_HULL))\n"
+"		{\n"
+"			float4 spherePos = posA;\n" // use the child-adjusted position; equals rigidBodies[bodyIndexA].m_pos when A is not a child shape
+"			float sphereRadius = collidables[collidableIndexA].m_radius;\n"
+"			float4 convexPos = posB;\n"
+"			float4 convexOrn = ornB;\n"
+"			\n"
+"			computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
+"										rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
+"										spherePos,sphereRadius,convexPos,convexOrn);\n"
+"	\n"
+"			return;\n"
+"		}\n"
+"	}//	if (i<numCompoundPairs)\n"
+"bool pointInTriangle(const float4* vertices, const float4* normal, float4 *p )\n" // True when *p is on the same side of all three edge planes of the triangle vertices[0..2] (either winding).
+"	const float4* p1 = &vertices[0];\n"
+"	const float4* p2 = &vertices[1];\n"
+"	const float4* p3 = &vertices[2];\n"
+"	float4 edge1;	edge1 = (*p2 - *p1);\n"
+"	float4 edge2;	edge2 = ( *p3 - *p2 );\n"
+"	float4 edge3;	edge3 = ( *p1 - *p3 );\n"
+"	\n"
+"	float4 p1_to_p; p1_to_p = ( *p - *p1 );\n"
+"	float4 p2_to_p; p2_to_p = ( *p - *p2 );\n"
+"	float4 p3_to_p; p3_to_p = ( *p - *p3 );\n"
+"	float4 edge1_normal; edge1_normal = ( cross(edge1,*normal));\n" // vector perpendicular to the edge and the supplied normal
+"	float4 edge2_normal; edge2_normal = ( cross(edge2,*normal));\n"
+"	float4 edge3_normal; edge3_normal = ( cross(edge3,*normal));\n"
+"	\n"
+"	\n"
+"	float r1, r2, r3;\n"
+"	r1 = dot(edge1_normal,p1_to_p );\n" // sign tells which side of edge 1 the point is on
+"	r2 = dot(edge2_normal,p2_to_p );\n"
+"	r3 = dot(edge3_normal,p3_to_p );\n"
+"	\n"
+"	if ( r1 > 0 && r2 > 0 && r3 > 0 )\n" // all positive: inside for one winding
+"		return true;\n"
+"    if ( r1 <= 0 && r2 <= 0 && r3 <= 0 ) \n" // all non-positive: inside for the other winding (edges included)
+"		return true;\n"
+"	return false;\n"
+"float segmentSqrDistance(float4 from, float4 to,float4 p, float4* nearest) \n" // Returns the squared distance from p to the segment [from, to] and writes the closest segment point to *nearest.
+"	float4 diff = p - from;\n"
+"	float4 v = to - from;\n"
+"	float t = dot(v,diff);\n" // unnormalized projection of p onto the segment direction
+"	\n"
+"	if (t > 0) \n"
+"	{\n"
+"		float dotVV = dot(v,v);\n"
+"		if (t < dotVV) \n" // projection falls strictly inside the segment
+"		{\n"
+"			t /= dotVV;\n"
+"			diff -= t*v;\n"
+"		} else \n" // at or beyond 'to': clamp to the end point
+"		{\n"
+"			t = 1;\n"
+"			diff -= v;\n"
+"		}\n"
+"	} else\n" // at or before 'from': clamp to the start point, diff stays p - from
+"	{\n"
+"		t = 0;\n"
+"	}\n"
+"	*nearest = from + t*v;\n"
+"	return dot(diff,diff);	\n"
+"void	computeContactSphereTriangle(int pairIndex,\n" // Sphere vs. a single triangle: appends at most one contact, tagging it with faceIndex in m_childIndexB.
+"									int bodyIndexA, int bodyIndexB,\n"
+"									int collidableIndexA, int collidableIndexB, \n"
+"									__global const BodyData* rigidBodies, \n"
+"									__global const btCollidableGpu* collidables,\n"
+"									const float4* triangleVertices,\n" // three vertices, read as [0], [1], [2]
+"									__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"									counter32_t nGlobalContactsOut,\n"
+"									int maxContactCapacity,\n"
+"									float4 spherePos2,\n" // sphere center before the inverse (pos, quat) transform
+"									float radius,\n"
+"									float4 pos,\n"
+"									float4 quat,\n"
+"									int faceIndex\n"
+"									)\n"
+"	float4 invPos;\n"
+"	float4 invOrn;\n"
+"	trInverse(pos,quat, &invPos,&invOrn);\n"
+"	float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n" // presumably the sphere center in the triangle's local frame (helpers defined elsewhere)
+"	int numFaces = 3;\n"
+"	float4 closestPnt = (float4)(0, 0, 0, 0);\n"
+"	float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n"
+"	float minDist = -1000000.f;\n"
+"	bool bCollide = false;\n"
+"	\n"
+"	//////////////////////////////////////\n"
+"	float4 sphereCenter;\n"
+"	sphereCenter = spherePos;\n"
+"	const float4* vertices = triangleVertices;\n"
+"	float contactBreakingThreshold = 0.f;//todo?\n"
+"	float radiusWithThreshold = radius + contactBreakingThreshold;\n"
+"	float4 edge10;\n"
+"	edge10 = vertices[1]-vertices[0];\n"
+"	edge10.w = 0.f;//is this needed?\n"
+"	float4 edge20;\n"
+"	edge20 = vertices[2]-vertices[0];\n"
+"	edge20.w = 0.f;//is this needed?\n"
+"	float4 normal = cross3(edge10,edge20);\n"
+"	normal = normalize(normal);\n"
+"	float4 p1ToCenter;\n"
+"	p1ToCenter = sphereCenter - vertices[0];\n"
+"	\n"
+"	float distanceFromPlane = dot(p1ToCenter,normal);\n"
+"	if (distanceFromPlane < 0.f)\n"
+"	{\n"
+"		//triangle facing the other way\n"
+"		distanceFromPlane *= -1.f;\n"
+"		normal *= -1.f;\n"
+"	}\n"
+"	hitNormalWorld = normal;\n"
+"	bool isInsideContactPlane = distanceFromPlane < radiusWithThreshold;\n" // center is within one radius of the triangle's plane
+"	\n"
+"	// Check for contact / intersection\n"
+"	bool hasContact = false;\n"
+"	float4 contactPoint;\n"
+"	if (isInsideContactPlane) \n"
+"	{\n"
+"	\n"
+"		if (pointInTriangle(vertices,&normal, &sphereCenter)) \n"
+"		{\n"
+"			// Inside the contact wedge - touches a point on the shell plane\n"
+"			hasContact = true;\n"
+"			contactPoint = sphereCenter - normal*distanceFromPlane;\n"
+"			\n"
+"		} else {\n"
+"			// Could be inside one of the contact capsules\n"
+"			float contactCapsuleRadiusSqr = radiusWithThreshold*radiusWithThreshold;\n"
+"			float4 nearestOnEdge;\n"
+"			int numEdges = 3;\n"
+"			for (int i = 0; i < numEdges; i++) \n"
+"			{\n"
+"				float4 pa =vertices[i];\n"
+"				float4 pb = vertices[(i+1)%3];\n"
+"				float distanceSqr = segmentSqrDistance(pa,pb,sphereCenter, &nearestOnEdge);\n"
+"				if (distanceSqr < contactCapsuleRadiusSqr) \n"
+"				{\n"
+"					// Yep, we're inside a capsule\n"
+"					hasContact = true;\n"
+"					contactPoint = nearestOnEdge;\n" // no break: a later edge within range overwrites this point
+"					\n"
+"				}\n"
+"				\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if (hasContact) \n"
+"	{\n"
+"		closestPnt = contactPoint;\n"
+"		float4 contactToCenter = sphereCenter - contactPoint;\n"
+"		minDist = length(contactToCenter);\n"
+"		if (minDist>FLT_EPSILON)\n" // a center lying on the contact point gives no usable normal, so no contact is reported
+"		{\n"
+"			hitNormalWorld = normalize(contactToCenter);//*(1./minDist);\n"
+"			bCollide  = true;\n"
+"		}\n"
+"		\n"
+"	}\n"
+"	/////////////////////////////////////\n"
+"	if (bCollide && minDist > -10000)\n"
+"	{\n"
+"		\n"
+"		float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);\n"
+"		float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n"
+"		float actualDepth = minDist-radius;\n"
+"		\n"
+"		if (actualDepth<=0.f)\n"
+"		{\n"
+"			pOnB1.w = actualDepth;\n" // depth travels in the w lane of the contact position
+"			int dstIdx;\n"
+"			\n"
+"			float lenSqr = dot3F4(normalOnSurfaceB1,normalOnSurfaceB1);\n"
+"			if (lenSqr>FLT_EPSILON)\n" // skip degenerate normals before reserving an output slot
+"			{\n"
+"				AppendInc( nGlobalContactsOut, dstIdx );\n"
+"			\n"
+"				if (dstIdx < maxContactCapacity)\n" // skip the write when the output buffer is full
+"				{\n"
+"					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"					c->m_worldNormalOnB = -normalOnSurfaceB1;\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" // zero inverse mass is encoded as a negated body index
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"					c->m_worldPosB[0] = pOnB1;\n"
+"					c->m_childIndexA = -1;\n"
+"					c->m_childIndexB = faceIndex;\n"
+"					GET_NPOINTS(*c) = 1;\n"
+"				} \n"
+"			}\n"
+"		}\n"
+"	}//if (hasCollision)\n"
+"// work-in-progress\n"
+"__kernel void   findConcaveSphereContactsKernel( __global int4* concavePairs,\n" // One work-item per concave pair (x = body A, y = body B, z = face index on A); only acts when B is a sphere.
+"												__global const BodyData* rigidBodies,\n"
+"												__global const btCollidableGpu* collidables,\n"
+"												__global const ConvexPolyhedronCL* convexShapes, \n"
+"												__global const float4* vertices,\n"
+"												__global const float4* uniqueEdges,\n"
+"												__global const btGpuFace* faces,\n"
+"												__global const int* indices,\n"
+"												__global btAabbCL* aabbs,\n"
+"												__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"												counter32_t nGlobalContactsOut,\n"
+"													int numConcavePairs, int maxContactCapacity\n"
+"												)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numConcavePairs)\n"
+"		return;\n"
+"	int pairIdx = i;\n"
+"	int bodyIndexA = concavePairs[i].x;\n"
+"	int bodyIndexB = concavePairs[i].y;\n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"	if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE)\n"
+"	{\n"
+"		int f = concavePairs[i].z;\n"
+"		btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"		\n"
+"		float4 verticesA[3];\n"
+"		for (int i=0;i<3;i++)\n" // this loop index shadows the work-item index i inside the loop only
+"		{\n"
+"			int index = indices[face.m_indexOffset+i];\n"
+"			float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"			verticesA[i] = vert;\n" // only the first three indices of the face are read
+"		}\n"
+"		float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n"
+"		float sphereRadius = collidables[collidableIndexB].m_radius;\n"
+"		float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n"
+"		float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n"
+"		computeContactSphereTriangle(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n" // here i is the work-item index again; the sphere goes in the A slot
+"																rigidBodies,collidables,\n"
+"																verticesA,\n"
+"																globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
+"																spherePos,sphereRadius,convexPos,convexOrn, f);\n"
+"		return;\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl
new file mode 100644
index 00000000..a6565fd6
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl
@@ -0,0 +1,2018 @@
+//keep this enum in sync with the CPU version (in btCollidable.h)
+//written by Erwin Coumans
+//number of entries in the per-work-item nodeStack array used by findCompoundPairsKernel
+#define B3_MAX_STACK_DEPTH 256
+//unsigned shorthand; used below for BodyData::m_collidableIdx
+typedef unsigned int u32;
+///keep this in sync with btCollidable.h
+///GPU-side collidable record: two anonymous unions of per-shape payload, then the shape type tag and a shape index.
+typedef struct
+	//first payload slot: the two ints below share storage
+	union {
+		int m_numChildShapes;
+		int m_bvhIndex;
+	};
+	//second payload slot: a float radius and an int compound-BVH index share storage
+	union
+	{
+		float m_radius;
+		int	m_compoundBvhIndex;
+	};
+	//shape type tag; the kernels in this file compare it against the SHAPE_* constants
+	int m_shapeType;
+	//the kernels use this to index convexShapes, or as the first child-shape index of a compound
+	int m_shapeIndex;
+} btCollidableGpu;
+///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
+///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
+typedef struct
+	//12 bytes
+	//quantized AABB corners; MyUnQuantize/MyUnQuantizeGlobal turn them back into floats
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes
+	//>= 0 marks a leaf (see isLeafNode/getTriangleIndex); a negative value is negated by getEscapeIndex
+	int	m_escapeIndexOrTriangleIndex;
+} b3QuantizedBvhNode;
+//Per-BVH header: dequantization parameters plus counts/offsets into the node and subtree arrays.
+typedef struct
+	//added back after dequantization (passed as bvhAabbMin to MyUnQuantize)
+	float4		m_aabbMin;
+	float4		m_aabbMax;
+	//per-axis divisor applied to quantized coordinates (see MyUnQuantize)
+	float4		m_quantization;
+	int			m_numNodes;
+	int			m_numSubTrees;
+	//findCompoundPairsKernel adds this to a subtree's m_rootNodeIndex before indexing quantizedNodes
+	int			m_nodeOffset;
+	//findCompoundPairsKernel uses this as the index of the BVH's first entry in the subtrees array
+	int			m_subTreeOffset;
+} b3BvhInfo;
+//Returns the low bits of rootNode->m_escapeIndexOrTriangleIndex, i.e. the stored value with
+//every bit at or above position (31-MAX_NUM_PARTS_IN_BITS) cleared.
+int	getTriangleIndex(const b3QuantizedBvhNode* rootNode)
+	//all ones shifted left: selects the upper bits that must be discarded
+	const unsigned int upperBitsMask = (~0u)<<(31-MAX_NUM_PARTS_IN_BITS);
+	// Keep only the lower bits where the triangle index is stored
+	const unsigned int lowerBitsMask = ~upperBitsMask;
+	return (rootNode->m_escapeIndexOrTriangleIndex&lowerBitsMask);
+//__global-pointer variant of getTriangleIndex: returns the stored index with every bit at or
+//above position (31-MAX_NUM_PARTS_IN_BITS) cleared.
+int	getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)
+	//all ones shifted left: selects the upper bits that must be discarded
+	const unsigned int upperBitsMask = (~0u)<<(31-MAX_NUM_PARTS_IN_BITS);
+	// Keep only the lower bits where the triangle index is stored
+	const unsigned int lowerBitsMask = ~upperBitsMask;
+	return (rootNode->m_escapeIndexOrTriangleIndex&lowerBitsMask);
+//Returns 1 when the node's stored index is non-negative (leaf), 0 otherwise (internal node).
+int isLeafNode(const b3QuantizedBvhNode* rootNode)
+	//skipindex is negative (internal node), triangleindex >=0 (leafnode)
+	if (rootNode->m_escapeIndexOrTriangleIndex < 0)
+		return 0;
+	return 1;
+//__global-pointer variant of isLeafNode: 1 when the stored index is non-negative, else 0.
+int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)
+	//skipindex is negative (internal node), triangleindex >=0 (leafnode)
+	if (rootNode->m_escapeIndexOrTriangleIndex < 0)
+		return 0;
+	return 1;
+//Returns the negation of the node's stored index (positive for internal nodes, which store a negative value).
+int getEscapeIndex(const b3QuantizedBvhNode* rootNode)
+	const int storedIndex = rootNode->m_escapeIndexOrTriangleIndex;
+	return -storedIndex;
+//__global-pointer variant of getEscapeIndex: returns the negation of the node's stored index.
+int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)
+	const int storedIndex = rootNode->m_escapeIndexOrTriangleIndex;
+	return -storedIndex;
+//Header for one subtree of a quantized BVH: quantized bounds, root node index and node count.
+typedef struct
+	//12 bytes
+	//quantized AABB corners of the whole subtree; dequantized with MyUnQuantize
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes, points to the root of the subtree
+	int			m_rootNodeIndex;
+	//4 bytes
+	//findCompoundPairsKernel adds this to the start node index to obtain the end node index
+	int			m_subtreeSize;
+	int			m_padding[3];
+} b3BvhSubtreeInfo;
+//One child of a compound shape: its pose relative to the owning body plus the child's collidable index.
+typedef struct
+	//child translation; the kernels rotate it by the body orientation and add the body position
+	float4	m_childPosition;
+	//child rotation; the kernels compose it as qtMul(bodyOrientation, m_childOrientation)
+	float4	m_childOrientation;
+	//the kernels use this value to index the collidables array
+	int m_shapeIndex;
+	int m_unused0;
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+//Rigid body state as read by the narrowphase kernels.
+typedef struct
+	//world position; the kernels zero .w before using it in projections
+	float4 m_pos;
+	//world orientation quaternion (xyz = vector part, w = scalar part, as used by qtMul/qtInvert)
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+	//index into the collidables array
+	u32 m_collidableIdx;
+	//findCompoundPairsKernel skips a pair when both bodies have m_invMass==0 (static-static)
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+//Convex hull descriptor: bounds data plus offsets/counts into the shared face, vertex and unique-edge arrays.
+typedef struct  
+	//hull-space centre; the kernels transform it to world space to build DeltaC2
+	float4		m_localCenter;
+	float4		m_extents;
+	float4		mC;
+	float4		mE;
+	float			m_radius;
+	//first face of this hull in the faces array, and how many follow
+	int	m_faceOffset;
+	int m_numFaces;
+	//number of vertices, starting at m_vertexOffset in the vertices array
+	int	m_numVertices;
+	int m_vertexOffset;
+	//first unique edge of this hull in the uniqueEdges array, and how many follow
+	int	m_uniqueEdgesOffset;
+	int	m_numUniqueEdges;
+	int m_unused;
+} ConvexPolyhedronCL;
+//Axis-aligned bounding box; each corner can be viewed as a float4, four floats or four ints (the views share storage).
+typedef struct 
+	//minimum corner
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	//maximum corner
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Common/shared/b3Int2.h"
+//One polygonal face of a convex hull.
+typedef struct
+	//the findSeparatingAxis* functions rotate this as the face normal (qtRotate zeroes .w first)
+	float4 m_plane;
+	//first entry of this face in the indices array, and how many entries follow
+	int m_indexOffset;
+	int m_numIndices;
+} btGpuFace;
+//make_float4(a,b,c,d) expands to the OpenCL vector literal (float4)(a,b,c,d)
+#define make_float4 (float4)
+//Cross product of a and b, delegated to the OpenCL built-in cross().
+//Earlier hand-written variants (zeroing .w first, or expanding the component formula) were
+//disabled in favour of the built-in.
+float4 cross3(float4 a, float4 b)
+	const float4 product = cross(a,b);
+	return product;
+//Dot product over the x, y and z components only; .w of both inputs is replaced by 0 first.
+float dot3F4(float4 a, float4 b)
+	return dot(make_float4(a.xyz,0.f), make_float4(b.xyz,0.f));
+//Zeroes v.w, then normalizes the result with the built-in fast_normalize().
+float4 fastNormalize4(float4 v)
+	const float4 xyzOnly = make_float4(v.xyz,0.f);
+	return fast_normalize(xyzOnly);
+//	Quaternion
+//Stored in a float4; qtMul and qtInvert treat .xyz as the vector part and .w as the scalar part.
+typedef float4 Quaternion;
+//prototypes for the quaternion helpers defined below (qtRotate calls qtInvert before its definition)
+Quaternion qtMul(Quaternion a, Quaternion b);
+Quaternion qtNormalize(Quaternion in);
+float4 qtRotate(Quaternion q, float4 vec);
+Quaternion qtInvert(Quaternion q);
+//Quaternion product a*b: vector part = cross(a,b) + a.w*b + b.w*a, scalar part = a.w*b.w - dot3(a,b).
+Quaternion qtMul(Quaternion a, Quaternion b)
+	Quaternion product = cross3( a, b ) + (a.w*b+b.w*a);
+	//the sum above also wrote .w; overwrite it with the proper scalar part
+	product.w = a.w*b.w - dot3F4(a, b);
+	return product;
+//Returns `in` scaled to unit length over all four components.
+//The previous body called fastNormalize4, which zeroes .w before normalizing and therefore
+//threw away the quaternion's scalar part; normalize the full float4 instead.
+Quaternion qtNormalize(Quaternion in)
+	return fast_normalize(in);
+//Rotates vec by q, computed as q * (vec.xyz, 0) * qtInvert(q). vec.w is ignored.
+float4 qtRotate(Quaternion q, float4 vec)
+	const Quaternion conjugate = qtInvert( q );
+	float4 pureVec = vec;
+	pureVec.w = 0.f;
+	const Quaternion halfRotated = qtMul(q,pureVec);
+	return qtMul(halfRotated,conjugate);
+//Returns q with its vector part negated and .w unchanged (no division by the squared length is performed).
+Quaternion qtInvert(Quaternion q)
+	Quaternion conjugate = q;
+	conjugate.xyz = -q.xyz;
+	return conjugate;
+//Rotates vec by qtInvert(q), i.e. applies the opposite rotation of qtRotate(q, vec).
+float4 qtInvRotate(const Quaternion q, float4 vec)
+	const Quaternion inverse = qtInvert( q );
+	return qtRotate( inverse, vec );
+//Applies a rigid transform to *p: rotate by *orientation, then add *translation.
+float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
+	const float4 rotated = qtRotate( *orientation, *p );
+	return rotated + (*translation);
+//Normalizes the xyz part of a via fastNormalize4; a.w is replaced by 0 beforehand.
+float4 normalize3(const float4 a)
+	return fastNormalize4( make_float4(a.x, a.y, a.z, 0.f) );
+//Projects every vertex of a hull held in private memory onto the world-space axis *dir and
+//writes the resulting interval to *min / *max.
+//The axis is rotated into hull space with qtInvRotate(orn,*dir); dot(pos,*dir) is added afterwards
+//to shift the interval to world space. That dot is 4-component, so pos.w takes part in it
+//(the callers in this file zero pos.w first).
+inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, const float4* vertices, float* min, float* max)
+	*min = FLT_MAX;
+	*max = -FLT_MAX;
+	const int vertexCount = hull->m_numVertices;
+	const float4 dirInHullSpace = qtInvRotate(orn,*dir);
+	const float centerProjection = dot(pos,*dir);
+	const int firstVertex = hull->m_vertexOffset;
+	for(int v=0; v<vertexCount; v++)
+	{
+		const float projected = dot(vertices[firstVertex+v],dirInHullSpace);
+		if(projected < *min)
+			*min = projected;
+		if(projected > *max)
+			*max = projected;
+	}
+	//only reachable when no vertex was visited; swaps the two sentinels
+	if(*min > *max)
+	{
+		const float previousMin = *min;
+		*min = *max;
+		*max = previousMin;
+	}
+	*min += centerProjection;
+	*max += centerProjection;
+//__global-memory variant of projectLocal: projects every vertex of the hull onto the world-space
+//axis *dir and writes the resulting interval to *min / *max.
+//The axis is rotated into hull space with qtInvRotate(orn,*dir); dot(pos,*dir) is added afterwards
+//to shift the interval to world space. That dot is 4-component, so pos.w takes part in it
+//(the callers in this file zero pos.w first).
+inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, __global const float4* vertices, float* min, float* max)
+	*min = FLT_MAX;
+	*max = -FLT_MAX;
+	const int vertexCount = hull->m_numVertices;
+	const float4 dirInHullSpace = qtInvRotate(orn,*dir);
+	const float centerProjection = dot(pos,*dir);
+	const int firstVertex = hull->m_vertexOffset;
+	for(int v=0; v<vertexCount; v++)
+	{
+		const float projected = dot(vertices[firstVertex+v],dirInHullSpace);
+		if(projected < *min)
+			*min = projected;
+		if(projected > *max)
+			*max = projected;
+	}
+	//only reachable when no vertex was visited; swaps the two sentinels
+	if(*min > *max)
+	{
+		const float previousMin = *min;
+		*min = *max;
+		*max = previousMin;
+	}
+	*min += centerProjection;
+	*max += centerProjection;
+//Tests one candidate axis with hullA in private memory and hullB in __global memory.
+//Returns false when the two projection intervals are disjoint (the axis separates the hulls);
+//otherwise stores the smaller of the two overlap amounts in *depth and returns true.
+//*depth is left untouched on a false return.
+inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA,const float4 ornA,
+	const float4 posB,const float4 ornB,
+	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)
+	float minA,maxA;
+	float minB,maxB;
+	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &minA, &maxA);
+	project(hullB,posB,ornB, sep_axis,verticesB, &minB, &maxB);
+	const bool disjoint = (maxA<minB) || (maxB<minA);
+	if(disjoint)
+		return false;
+	const float overlapAB = maxA - minB;
+	const float overlapBA = maxB - minA;
+	*depth = (overlapAB<overlapBA) ? overlapAB : overlapBA;
+	return true;
+//True when none of |v.x|, |v.y|, |v.z| exceeds 1e-6f. v.w is not examined.
+inline bool IsAlmostZero(const float4 v)
+	const bool anyComponentLarge = fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f;
+	return !anyComponentLarge;
+//Face-normal SAT pass with hull A in private memory and hull B in __global memory.
+//Every face normal of hullA, rotated by ornA and flipped to have a non-negative dot with DeltaC2,
+//is tested as a candidate axis.
+//Returns false as soon as one axis separates the hulls. Otherwise returns true; *dmin and *sep
+//are overwritten only when a shallower overlap than the incoming *dmin is found, so the caller
+//must initialise *dmin. Before returning true, *sep is negated if dot3F4(-DeltaC2,*sep) > 0.
+//uniqueEdgesA, indicesA, uniqueEdgesB, facesB and indicesB are accepted but not read.
+bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+	float4* sep,
+	float* dmin)
+	//project() uses a 4-component dot with the position, so clear .w first
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	int numFacesA = hullA->m_numFaces;
+	// Test normals from hullA
+	for(int i=0;i<numFacesA;i++)
+	{
+		const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;
+		float4 faceANormalWS = qtRotate(ornA,normal);
+		if (dot3F4(DeltaC2,faceANormalWS)<0)
+			faceANormalWS*=-1.f;
+		float d;
+		if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))
+			return false;
+		if(d<*dmin)
+		{
+			*dmin = d;
+			*sep = faceANormalWS;
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+//Face-normal SAT pass with hull A in __global memory and hull B in private memory.
+//Every face normal of hullA, rotated by ornA and flipped to have a non-negative dot with DeltaC2,
+//is tested as a candidate axis. Because TestSepAxisLocalA expects the private-memory hull first,
+//the hulls are passed to it in swapped order.
+//Returns false as soon as one axis separates the hulls. Otherwise returns true; *dmin and *sep
+//are overwritten only when a shallower overlap than the incoming *dmin is found, so the caller
+//must initialise *dmin. Before returning true, *sep is negated if dot3F4(-DeltaC2,*sep) > 0.
+//uniqueEdgesA, indicesA, uniqueEdgesB, facesB and indicesB are accepted but not read.
+bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* verticesA, 
+	__global const float4* uniqueEdgesA, 
+	__global const btGpuFace* facesA,
+	__global const int*  indicesA,
+	const float4* verticesB,
+	const float4* uniqueEdgesB, 
+	const btGpuFace* facesB,
+	const int*  indicesB,
+	float4* sep,
+	float* dmin)
+	//project() uses a 4-component dot with the position, so clear .w first
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	int numFacesA = hullA->m_numFaces;
+	// Test normals from hullA
+	for(int i=0;i<numFacesA;i++)
+	{
+		const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;
+		float4 faceANormalWS = qtRotate(ornA,normal);
+		if (dot3F4(DeltaC2,faceANormalWS)<0)
+			faceANormalWS *= -1.f;
+		float d;
+		if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))
+			return false;
+		if(d<*dmin)
+		{
+			*dmin = d;
+			*sep = faceANormalWS;
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+//Edge-edge SAT pass with hull A in private memory and hull B in __global memory.
+//For every pair (unique edge of A, unique edge of B) the normalized cross product of the two
+//world-space edges is tested as a candidate axis; near-zero cross products are skipped.
+//Returns false as soon as one axis separates the hulls. Otherwise returns true; *dmin and *sep
+//are overwritten only when a shallower overlap than the incoming *dmin is found, so the caller
+//must initialise *dmin. Before returning true, *sep is negated if dot3F4(-DeltaC2,*sep) > 0.
+//facesA, indicesA, facesB and indicesB are accepted but not read.
+//
+//Fix: a separating axis used to set a local flag that was immediately overwritten and never
+//read, so this function could not report separation. It now returns false, matching
+//findSeparatingAxisEdgeEdge.
+bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+		float4* sep,
+	float* dmin)
+	//project() uses a 4-component dot with the position, so clear .w first
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	// Test edges
+	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)
+	{
+		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];
+		float4 edge0World = qtRotate(ornA,edge0);
+		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)
+		{
+			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];
+			float4 edge1World = qtRotate(ornB,edge1);
+			float4 crossje = cross3(edge0World,edge1World);
+			if(!IsAlmostZero(crossje))
+			{
+				crossje = normalize3(crossje);
+				if (dot3F4(DeltaC2,crossje)<0)
+					crossje *= -1.f;
+				float Min0,Max0;
+				float Min1,Max1;
+				projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);
+				project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);
+				//disjoint intervals: crossje is a separating axis, the hulls do not overlap
+				if(Max0<Min1 || Max1<Min0)
+					return false;
+				float d0 = Max0 - Min1;
+				float d1 = Max1 - Min0;
+				float dist = d0<d1 ? d0:d1;
+				if(dist<*dmin)
+				{
+					*dmin = dist;
+					*sep = crossje;
+				}
+			}
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+//Tests one candidate axis with both hulls in __global memory (sharing one vertex array).
+//Returns false when the two projection intervals are disjoint (the axis separates the hulls);
+//otherwise stores the smaller of the two overlap amounts in *depth and returns true.
+//*depth is left untouched on a false return.
+inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA,const float4 ornA,
+	const float4 posB,const float4 ornB,
+	float4* sep_axis, __global const float4* vertices,float* depth)
+	float minA,maxA;
+	float minB,maxB;
+	project(hullA,posA,ornA,sep_axis,vertices, &minA, &maxA);
+	project(hullB,posB,ornB, sep_axis,vertices, &minB, &maxB);
+	const bool disjoint = (maxA<minB) || (maxB<minA);
+	if(disjoint)
+		return false;
+	const float overlapAB = maxA - minB;
+	const float overlapBA = maxB - minA;
+	*depth = (overlapAB<overlapBA) ? overlapAB : overlapBA;
+	return true;
+//Face-normal SAT pass with both hulls in __global memory.
+//Every face normal of hullA, rotated by ornA and flipped to have a non-negative dot with DeltaC2,
+//is tested as a candidate axis.
+//Returns false as soon as one axis separates the hulls. Otherwise returns true; *dmin and *sep
+//are overwritten only when a shallower overlap than the incoming *dmin is found, so the caller
+//must initialise *dmin. Before returning true, *sep is negated if dot3F4(-DeltaC2,*sep) > 0.
+//uniqueEdges and indices are accepted but not read.
+bool findSeparatingAxis(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices, 
+	__global const float4* uniqueEdges, 
+	__global const btGpuFace* faces,
+	__global const int*  indices,
+	float4* sep,
+	float* dmin)
+	//project() uses a 4-component dot with the position, so clear .w first
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	int numFacesA = hullA->m_numFaces;
+	// Test normals from hullA
+	for(int i=0;i<numFacesA;i++)
+	{
+		const float4 normal = faces[hullA->m_faceOffset+i].m_plane;
+		float4 faceANormalWS = qtRotate(ornA,normal);
+		if (dot3F4(DeltaC2,faceANormalWS)<0)
+			faceANormalWS*=-1.f;
+		float d;
+		if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d))
+			return false;
+		if(d<*dmin)
+		{
+			*dmin = d;
+			*sep = faceANormalWS;
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+//SAT pass over a caller-supplied list of candidate directions, both hulls in __global memory.
+//Each of the numUnitSphereDirections entries of unitSphereDirections is used as an axis as-is
+//(no normalization is applied here).
+//Returns false as soon as one axis separates the hulls. Otherwise returns true; *dmin and *sep
+//are overwritten only when a shallower overlap than the incoming *dmin is found, so the caller
+//must initialise *dmin. Before returning true, *sep is negated if dot3F4(-DeltaC2,*sep) > 0.
+bool findSeparatingAxisUnitSphere(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices,
+	__global const float4* unitSphereDirections,
+	int numUnitSphereDirections,
+	float4* sep,
+	float* dmin)
+	//project() uses a 4-component dot with the position, so clear .w first
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	// Test unit sphere directions
+	for (int i=0;i<numUnitSphereDirections;i++)
+	{
+		float4 crossje;
+		crossje = unitSphereDirections[i];	
+		//NOTE(review): this flips when the dot is > 0, whereas the face and edge passes in this
+		//file flip when it is < 0 -- confirm the opposite orientation is intended
+		if (dot3F4(DeltaC2,crossje)>0)
+			crossje *= -1.f;
+		float Min0,Max0;
+		float Min1,Max1;
+		project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
+		project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
+		//disjoint intervals: crossje is a separating axis
+		if(Max0<Min1 || Max1<Min0)
+			return false;
+		float d0 = Max0 - Min1;
+		float d1 = Max1 - Min0;
+		float dist = d0<d1 ? d0:d1;
+		if(dist<*dmin)
+		{
+			*dmin = dist;
+			*sep = crossje;
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+//Edge-edge SAT pass with both hulls in __global memory.
+//For every pair (unique edge of A, unique edge of B) the normalized cross product of the two
+//world-space edges is tested as a candidate axis; near-zero cross products are skipped.
+//Returns false as soon as one axis separates the hulls. Otherwise returns true; *dmin and *sep
+//are overwritten only when a shallower overlap than the incoming *dmin is found, so the caller
+//must initialise *dmin. Before returning true, *sep is negated if dot3F4(-DeltaC2,*sep) > 0.
+//faces and indices are accepted but not read.
+bool findSeparatingAxisEdgeEdge(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices, 
+	__global const float4* uniqueEdges, 
+	__global const btGpuFace* faces,
+	__global const int*  indices,
+	float4* sep,
+	float* dmin)
+	//project() uses a 4-component dot with the position, so clear .w first
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	// Test edges
+	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)
+	{
+		const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];
+		float4 edge0World = qtRotate(ornA,edge0);
+		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)
+		{
+			const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];
+			float4 edge1World = qtRotate(ornB,edge1);
+			float4 crossje = cross3(edge0World,edge1World);
+			if(!IsAlmostZero(crossje))
+			{
+				crossje = normalize3(crossje);
+				if (dot3F4(DeltaC2,crossje)<0)
+					crossje*=-1.f;
+				float Min0,Max0;
+				float Min1,Max1;
+				project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
+				project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
+				//disjoint intervals: crossje is a separating axis
+				if(Max0<Min1 || Max1<Min0)
+					return false;
+				float d0 = Max0 - Min1;
+				float d1 = Max1 - Min0;
+				float dist = d0<d1 ? d0:d1;
+				if(dist<*dmin)
+				{
+					*dmin = dist;
+					*sep = crossje;
+				}
+			}
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+// work-in-progress
+//One work-item per entry of gpuCompoundPairs (x = body A, y = body B, z/w = child-shape index
+//of A/B, where a negative child index means "use the body's own collidable").
+//Resolves the world pose of each (child) shape, then runs the face-normal SAT pass in both
+//directions followed by the edge-edge pass. gpuHasCompoundSepNormalsOut[i] is first set to 0;
+//it is set to 1, and gpuCompoundSepNormalsOut[i] to the resulting axis, only when both shapes
+//are SHAPE_CONVEX_HULL and none of the three passes found a separating axis.
+__kernel void   processCompoundPairsKernel( __global const int4* gpuCompoundPairs,
+	__global const BodyData* rigidBodies, 
+	__global const btCollidableGpu* collidables,
+	__global const ConvexPolyhedronCL* convexShapes, 
+	__global const float4* vertices,
+	__global const float4* uniqueEdges,
+	__global const btGpuFace* faces,
+	__global const int* indices,
+	__global btAabbCL* aabbs,
+	__global const btGpuChildShape* gpuChildShapes,
+	__global volatile float4* gpuCompoundSepNormalsOut,
+	__global volatile int* gpuHasCompoundSepNormalsOut,
+	int numCompoundPairs
+	)
+	int i = get_global_id(0);
+	//surplus work-items beyond the pair count have nothing to do
+	if (i>=numCompoundPairs)
+		return;
+	int bodyIndexA = gpuCompoundPairs[i].x;
+	int bodyIndexB = gpuCompoundPairs[i].y;
+	int childShapeIndexA = gpuCompoundPairs[i].z;
+	int childShapeIndexB = gpuCompoundPairs[i].w;
+	int collidableIndexA = -1;
+	int collidableIndexB = -1;
+	float4 ornA = rigidBodies[bodyIndexA].m_quat;
+	float4 posA = rigidBodies[bodyIndexA].m_pos;
+	float4 ornB = rigidBodies[bodyIndexB].m_quat;
+	float4 posB = rigidBodies[bodyIndexB].m_pos;
+	if (childShapeIndexA >= 0)
+	{
+		//compose the child's pose with the body's pose
+		collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+		float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+		float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+		float4 newPosA = qtRotate(ornA,childPosA)+posA;
+		float4 newOrnA = qtMul(ornA,childOrnA);
+		posA = newPosA;
+		ornA = newOrnA;
+	} else
+	{
+		collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	}
+	if (childShapeIndexB>=0)
+	{
+		//compose the child's pose with the body's pose
+		collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+		float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+		float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+		float4 newPosB = transform(&childPosB,&posB,&ornB);
+		float4 newOrnB = qtMul(ornB,childOrnB);
+		posB = newPosB;
+		ornB = newOrnB;
+	} else
+	{
+		collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+	}
+	//default result: no axis stored for this pair
+	gpuHasCompoundSepNormalsOut[i] = 0;
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	int shapeTypeA = collidables[collidableIndexA].m_shapeType;
+	int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+	//only convex hull against convex hull is handled
+	if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL))
+	{
+		return;
+	}
+	float dmin = FLT_MAX;
+	posA.w = 0.f;
+	posB.w = 0.f;
+	//world-space hull centres; their difference orients every candidate axis
+	float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+	float4 c0 = transform(&c0local, &posA, &ornA);
+	float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+	float4 c1 = transform(&c1local,&posB,&ornB);
+	const float4 DeltaC2 = c0 - c1;
+	float4 sepNormal = make_float4(1,0,0,0);
+	//face normals of A
+	bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+	if (!sepA)
+		return;
+	//face normals of B (hulls swapped, same DeltaC2)
+	bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+	if (!sepB)
+		return;
+	//edge-edge cross products
+	bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+	if (sepEE)
+	{
+		gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);
+		gpuHasCompoundSepNormalsOut[i] = 1;
+	}
+//Converts a quantized point back to floats: each of the first three entries of vecIn is divided
+//by the matching component of quantization, then bvhAabbMin is added. The .w lane of the
+//quotient vector is 0 before bvhAabbMin is added.
+inline b3Float4 MyUnQuantize(const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)
+	const float scaledX = (float)(vecIn[0]) / (quantization.x);
+	const float scaledY = (float)(vecIn[1]) / (quantization.y);
+	const float scaledZ = (float)(vecIn[2]) / (quantization.z);
+	const b3Float4 scaled = b3MakeFloat4(scaledX, scaledY, scaledZ, 0.f);
+	return scaled + bvhAabbMin;
+//__global-pointer variant of MyUnQuantize: each of the first three entries of vecIn is divided
+//by the matching component of quantization, then bvhAabbMin is added. The .w lane of the
+//quotient vector is 0 before bvhAabbMin is added.
+inline b3Float4 MyUnQuantizeGlobal(__global const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)
+	const float scaledX = (float)(vecIn[0]) / (quantization.x);
+	const float scaledY = (float)(vecIn[1]) / (quantization.y);
+	const float scaledZ = (float)(vecIn[2]) / (quantization.z);
+	const b3Float4 scaled = b3MakeFloat4(scaledX, scaledY, scaledZ, 0.f);
+	return scaled + bvhAabbMin;
+// work-in-progress
+__kernel void   findCompoundPairsKernel( __global const int4* pairs, 
+	__global const BodyData* rigidBodies, 
+	__global const btCollidableGpu* collidables,
+	__global const ConvexPolyhedronCL* convexShapes, 
+	__global const float4* vertices,
+	__global const float4* uniqueEdges,
+	__global const btGpuFace* faces,
+	__global const int* indices,
+	__global b3Aabb_t* aabbLocalSpace,
+	__global const btGpuChildShape* gpuChildShapes,
+	__global volatile int4* gpuCompoundPairsOut,
+	__global volatile int* numCompoundPairsOut,
+	__global const b3BvhSubtreeInfo* subtrees,
+	__global const b3QuantizedBvhNode* quantizedNodes,
+	__global const b3BvhInfo* bvhInfos,
+	int numPairs,
+	int maxNumCompoundPairsCapacity
+	)
+	int i = get_global_id(0);
+	if (i<numPairs)
+	{
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+			int bvhA = collidables[collidableIndexA].m_compoundBvhIndex;
+			int bvhB = collidables[collidableIndexB].m_compoundBvhIndex;
+			int numSubTreesA = bvhInfos[bvhA].m_numSubTrees;
+			int subTreesOffsetA = bvhInfos[bvhA].m_subTreeOffset;
+			int subTreesOffsetB = bvhInfos[bvhB].m_subTreeOffset;
+			int numSubTreesB = bvhInfos[bvhB].m_numSubTrees;
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+			b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+			for (int p=0;p<numSubTreesA;p++)
+			{
+				b3BvhSubtreeInfo subtreeA = subtrees[subTreesOffsetA+p];
+				//bvhInfos[bvhA].m_quantization
+				b3Float4 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+				b3Float4 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+				b3Float4 aabbAMinOut,aabbAMaxOut;
+				float margin=0.f;
+				b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);
+				for (int q=0;q<numSubTreesB;q++)
+				{
+					b3BvhSubtreeInfo subtreeB = subtrees[subTreesOffsetB+q];
+					b3Float4 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+					b3Float4 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+					b3Float4 aabbBMinOut,aabbBMaxOut;
+					float margin=0.f;
+					b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);
+					bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+					if (aabbOverlap)
+					{
+						int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfos[bvhA].m_nodeOffset;
+						int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize;
+						int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfos[bvhB].m_nodeOffset;
+						int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize;
+						b3Int2 nodeStack[B3_MAX_STACK_DEPTH];
+						b3Int2 node0;
+						node0.x = startNodeIndexA;
+						node0.y = startNodeIndexB;
+						int maxStackDepth = B3_MAX_STACK_DEPTH;
+						int depth=0;
+						nodeStack[depth++]=node0;
+						do
+						{
+							b3Int2 node = nodeStack[--depth];
+							b3Float4 aMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+							b3Float4 aMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+							b3Float4 bMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+							b3Float4 bMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+							float margin=0.f;
+							b3Float4 aabbAMinOut,aabbAMaxOut;
+							b3TransformAabb2(aMinLocal,aMaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);
+							b3Float4 aabbBMinOut,aabbBMaxOut;
+							b3TransformAabb2(bMinLocal,bMaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);
+							bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+							if (nodeOverlap)
+							{
+								bool isLeafA = isLeafNodeGlobal(&quantizedNodes[node.x]);
+								bool isLeafB = isLeafNodeGlobal(&quantizedNodes[node.y]);
+								bool isInternalA = !isLeafA;
+								bool isInternalB = !isLeafB;
+								//fail, even though it might hit two leaf nodes
+								if (depth+4>maxStackDepth && !(isLeafA && isLeafB))
+								{
+									//printf("Error: traversal exceeded maxStackDepth");
+									continue;
+								}
+								if(isInternalA)
+								{
+									int nodeAleftChild = node.x+1;
+									bool isNodeALeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.x+1]);
+									int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + getEscapeIndexGlobal(&quantizedNodes[node.x+1]);
+									if(isInternalB)
+									{					
+										int nodeBleftChild = node.y+1;
+										bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);
+										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild);
+									}
+									else
+									{
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y);
+									}
+								}
+								else
+								{
+									if(isInternalB)
+									{
+										int nodeBleftChild = node.y+1;
+										bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);
+										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);
+										nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild);
+									}
+									else
+									{
+										int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+										if (compoundPairIdx<maxNumCompoundPairsCapacity)
+										{
+											int childShapeIndexA = getTriangleIndexGlobal(&quantizedNodes[node.x]);
+											int childShapeIndexB = getTriangleIndexGlobal(&quantizedNodes[node.y]);
+											gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+										}
+									}
+								}
+							}
+						} while (depth);
+					}
+				}
+			}
+			return;
+		}
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+			if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) 
+			{
+				int numChildrenA = collidables[collidableIndexA].m_numChildShapes;
+				for (int c=0;c<numChildrenA;c++)
+				{
+					int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;
+					int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+					float4 posA = rigidBodies[bodyIndexA].m_pos;
+					float4 ornA = rigidBodies[bodyIndexA].m_quat;
+					float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+					float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+					float4 newPosA = qtRotate(ornA,childPosA)+posA;
+					float4 newOrnA = qtMul(ornA,childOrnA);
+					int shapeIndexA = collidables[childColIndexA].m_shapeIndex;
+					b3Aabb_t aabbAlocal = aabbLocalSpace[shapeIndexA];
+					float margin = 0.f;
+					b3Float4 aabbAMinWS;
+					b3Float4 aabbAMaxWS;
+					b3TransformAabb2(aabbAlocal.m_minVec,aabbAlocal.m_maxVec,margin,
+						newPosA,
+						newOrnA,
+						&aabbAMinWS,&aabbAMaxWS);
+					if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+						for (int b=0;b<numChildrenB;b++)
+						{
+							int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+							int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+							float4 ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+							float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+							float4 newPosB = transform(&childPosB,&posB,&ornB);
+							float4 newOrnB = qtMul(ornB,childOrnB);
+							int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+							b3Aabb_t aabbBlocal = aabbLocalSpace[shapeIndexB];
+							b3Float4 aabbBMinWS;
+							b3Float4 aabbBMaxWS;
+							b3TransformAabb2(aabbBlocal.m_minVec,aabbBlocal.m_maxVec,margin,
+								newPosB,
+								newOrnB,
+								&aabbBMinWS,&aabbBMaxWS);
+							bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinWS,aabbAMaxWS,aabbBMinWS,aabbBMaxWS);
+							if (aabbOverlap)
+							{
+								int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+								float dmin = FLT_MAX;
+								float4 posA = newPosA;
+								posA.w = 0.f;
+								float4 posB = newPosB;
+								posB.w = 0.f;
+								float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+								float4 ornA = newOrnA;
+								float4 c0 = transform(&c0local, &posA, &ornA);
+								float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+								float4 ornB =newOrnB;
+								float4 c1 = transform(&c1local,&posB,&ornB);
+								const float4 DeltaC2 = c0 - c1;
+								{//
+									int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+									if (compoundPairIdx<maxNumCompoundPairsCapacity)
+									{
+										gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+									}
+								}//
+							}//fi(1)
+						} //for (int b=0
+					}//if (collidables[collidableIndexB].
+					else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						if (1)
+						{
+							int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+							float dmin = FLT_MAX;
+							float4 posA = newPosA;
+							posA.w = 0.f;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							posB.w = 0.f;
+							float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+							float4 ornA = newOrnA;
+							float4 c0 = transform(&c0local, &posA, &ornA);
+							float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+							float4 ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 c1 = transform(&c1local,&posB,&ornB);
+							const float4 DeltaC2 = c0 - c1;
+							{
+								int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+								if (compoundPairIdx<maxNumCompoundPairsCapacity)
+								{
+									gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1);
+								}//if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							}//
+						}//fi (1)
+					}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+				}//for (int b=0;b<numChildrenB;b++)	
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) 
+				&& (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+			{
+				int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+				for (int b=0;b<numChildrenB;b++)
+				{
+					int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+					int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+					float4 ornB = rigidBodies[bodyIndexB].m_quat;
+					float4 posB = rigidBodies[bodyIndexB].m_pos;
+					float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+					float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+					float4 newPosB = qtRotate(ornB,childPosB)+posB;
+					float4 newOrnB = qtMul(ornB,childOrnB);
+					int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+					//////////////////////////////////////
+					if (1)
+					{
+						int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+						float dmin = FLT_MAX;
+						float4 posA = rigidBodies[bodyIndexA].m_pos;
+						posA.w = 0.f;
+						float4 posB = newPosB;
+						posB.w = 0.f;
+						float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+						float4 ornA = rigidBodies[bodyIndexA].m_quat;
+						float4 c0 = transform(&c0local, &posA, &ornA);
+						float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+						float4 ornB =newOrnB;
+						float4 c1 = transform(&c1local,&posB,&ornB);
+						const float4 DeltaC2 = c0 - c1;
+						{//
+							int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+							if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							{
+								gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB);
+							}//fi (compoundPairIdx<maxNumCompoundPairsCapacity)
+						}//
+					}//fi (1)	
+				}//for (int b=0;b<numChildrenB;b++)
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			return;
+		}//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+	}//i<numPairs
+// work-in-progress
+__kernel void   findSeparatingAxisKernel( __global const int4* pairs, // Kernel: work-item i (< numPairs) runs findSeparatingAxis (A vs B, then B vs A) and findSeparatingAxisEdgeEdge for pair i and stores the result flag and normal. pairs[i].x/.y are the two body indices.
+																					__global const BodyData* rigidBodies, // m_collidableIdx, m_invMass, m_pos and m_quat are read here
+																					__global const btCollidableGpu* collidables, // m_shapeIndex and m_shapeType are read here
+																					__global const ConvexPolyhedronCL* convexShapes, // indexed with the collidable's m_shapeIndex
+																					__global const float4* vertices, // forwarded unchanged to the findSeparatingAxis* helpers
+																					__global const float4* uniqueEdges, // forwarded unchanged to the helpers
+																					__global const btGpuFace* faces, // forwarded unchanged to the helpers
+																					__global const int* indices, // forwarded unchanged to the helpers
+																					__global btAabbCL* aabbs, // not referenced in this kernel body
+																					__global volatile float4* separatingNormals, // out: written only when all three tests return true
+																					__global volatile int* hasSeparatingAxis, // out: 0 or 1 for every processed pair
+																					int numPairs
+																					)
+	int i = get_global_id(0); // pair index handled by this work-item
+	if (i<numPairs)
+	{
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			hasSeparatingAxis[i] = 0;
+			return;
+		}
+		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL)) // only hull-vs-hull pairs are tested by this kernel
+		{
+			hasSeparatingAxis[i] = 0;
+			return;
+		}
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH)) // NOTE(review): presumably redundant, the previous test already returned for any shape A that is not SHAPE_CONVEX_HULL
+		{
+			hasSeparatingAxis[i] = 0;
+			return;
+		}
+		int numFacesA = convexShapes[shapeIndexA].m_numFaces; // not used further in this kernel
+		float dmin = FLT_MAX; // updated through the &dmin argument of the helper calls below
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f; // w is cleared before the position is used as a translation
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 c0 = transform(&c0local, &posA, &ornA); // local center of A transformed by body A's position/orientation
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		float4 c1 = transform(&c1local,&posB,&ornB); // local center of B transformed by body B's position/orientation
+		const float4 DeltaC2 = c0 - c1; // difference of the two transformed centers, handed to every helper
+		float4 sepNormal;
+		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																								posB,ornB,
+																								DeltaC2,
+																								vertices,uniqueEdges,faces,
+																								indices,&sepNormal,&dmin);
+		hasSeparatingAxis[i] = 4; // provisional value; every path below overwrites it with 0 or 1
+		if (!sepA)
+		{
+			hasSeparatingAxis[i] = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB, // same helper with the roles of A and B swapped
+																									posA,ornA,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+			if (!sepB)
+			{
+				hasSeparatingAxis[i] = 0;
+			} else
+			{
+				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+				if (!sepEE)
+				{
+					hasSeparatingAxis[i] = 0;
+				} else
+				{
+					hasSeparatingAxis[i] = 1; // all three helper calls returned true
+					separatingNormals[i] = sepNormal;
+				}
+			}
+		}
+	}
+__kernel void   findSeparatingAxisVertexFaceKernel( __global const int4* pairs, // Kernel: work-item i (< numPairs) runs only the two face-normal passes (A vs B, then B vs A) for pair i; the running minimum is kept in dmins[i] for a later edge-edge kernel. pairs[i].x/.y are the two body indices.
+																					__global const BodyData* rigidBodies, // m_collidableIdx, m_invMass, m_pos and m_quat are read here
+																					__global const btCollidableGpu* collidables, // m_shapeIndex and m_shapeType are read here
+																					__global const ConvexPolyhedronCL* convexShapes, // indexed with the collidable's m_shapeIndex
+																					__global const float4* vertices, // forwarded unchanged to findSeparatingAxis
+																					__global const float4* uniqueEdges, // forwarded unchanged to findSeparatingAxis
+																					__global const btGpuFace* faces, // forwarded unchanged to findSeparatingAxis
+																					__global const int* indices, // forwarded unchanged to findSeparatingAxis
+																					__global btAabbCL* aabbs, // not referenced in this kernel body
+																					__global volatile float4* separatingNormals, // out: written only when both passes return true
+																					__global volatile int* hasSeparatingAxis, // out: 1 when both passes return true, otherwise 0
+																					__global  float* dmins, // out: FLT_MAX once the shape checks pass, replaced by dmin when both passes return true
+																					int numPairs
+																					)
+	int i = get_global_id(0); // pair index handled by this work-item
+	if (i<numPairs)
+	{
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		hasSeparatingAxis[i] = 0;	// default result, so the early returns below leave the pair rejected
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL)) // only hull-vs-hull pairs are tested by this kernel
+		{
+			return;
+		}
+		int numFacesA = convexShapes[shapeIndexA].m_numFaces; // not used further in this kernel
+		float dmin = FLT_MAX;
+		dmins[i] = dmin;
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f; // w is cleared before the position is used as a translation
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1; // difference of the two transformed local centers
+		float4 sepNormal;
+		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																								posB,ornB,
+																								DeltaC2,
+																								vertices,uniqueEdges,faces,
+																								indices,&sepNormal,&dmin);
+		hasSeparatingAxis[i] = 0; // provisional result must be zero: if the B pass below returns false no branch rewrites the flag, and a non-zero leftover would be picked up by any later kernel that tests hasSeparatingAxis[i]
+		if (!sepA)
+		{
+			hasSeparatingAxis[i] = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB, // same helper with the roles of A and B swapped
+																									posA,ornA,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+			if (sepB)
+			{
+				dmins[i] = dmin; // publish the minimum found by the two face passes
+				hasSeparatingAxis[i] = 1;
+				separatingNormals[i] = sepNormal;
+			}
+		}
+	}
+__kernel void   findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs, // Kernel: for pairs whose hasSeparatingAxis[i] is non-zero, continues from dmins[i]/separatingNormals[i] with the edge-edge test, but only when the edge-pair count does not exceed numUnitSphereDirections. pairs[i].x/.y are the two body indices.
+																					__global const BodyData* rigidBodies, // m_collidableIdx, m_pos and m_quat are read here
+																					__global const btCollidableGpu* collidables, // m_shapeIndex is read here
+																					__global const ConvexPolyhedronCL* convexShapes, // m_numFaces, m_localCenter and m_numUniqueEdges are read here
+																					__global const float4* vertices, // forwarded unchanged to findSeparatingAxisEdgeEdge
+																					__global const float4* uniqueEdges, // forwarded unchanged to findSeparatingAxisEdgeEdge
+																					__global const btGpuFace* faces, // forwarded unchanged to findSeparatingAxisEdgeEdge
+																					__global const int* indices, // forwarded unchanged to findSeparatingAxisEdgeEdge
+																					__global btAabbCL* aabbs, // not referenced in this kernel body
+																					__global  float4* separatingNormals, // in/out: starting normal, rewritten when the edge-edge test returns true
+																					__global  int* hasSeparatingAxis, // in/out: gate on entry, 0 or 1 after the edge-edge test ran
+																					__global  float* dmins, // in: starting value for dmin
+																					__global const float4* unitSphereDirections, // only referenced by the commented-out branch below
+																					int numUnitSphereDirections, // threshold for the edge-pair count
+																					int numPairs
+																					)
+	int i = get_global_id(0); // pair index handled by this work-item
+	if (i<numPairs)
+	{
+		if (hasSeparatingAxis[i]) // skip pairs that carry a zero flag
+		{
+			int bodyIndexA = pairs[i].x;
+			int bodyIndexB = pairs[i].y;
+			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+			int numFacesA = convexShapes[shapeIndexA].m_numFaces; // not used further in this kernel
+			float dmin = dmins[i];
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			posA.w = 0.f; // w is cleared before the position is used as a translation
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+			posB.w = 0.f;
+			float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+			float4 ornA = rigidBodies[bodyIndexA].m_quat;
+			float4 c0 = transform(&c0local, &posA, &ornA);
+			float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+			float4 ornB =rigidBodies[bodyIndexB].m_quat;
+			float4 c1 = transform(&c1local,&posB,&ornB);
+			const float4 DeltaC2 = c0 - c1; // difference of the two transformed local centers
+			float4 sepNormal = separatingNormals[i];
+			bool sepEE = false;
+			int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges; // number of edge pairs (A edges times B edges)
+			if (numEdgeEdgeDirections<=numUnitSphereDirections) // pairs above the threshold are left untouched by this kernel
+			{
+				sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+					if (!sepEE)
+					{
+						hasSeparatingAxis[i] = 0;
+					} else
+					{
+						hasSeparatingAxis[i] = 1;
+						separatingNormals[i] = sepNormal;
+					}
+			}
+			/*
+			///else case is a separate kernel, to make Mac OSX OpenCL compiler happy
+			else
+			{
+				sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,unitSphereDirections,numUnitSphereDirections,
+																									&sepNormal,&dmin);
+					if (!sepEE)
+					{
+						hasSeparatingAxis[i] = 0;
+					} else
+					{
+						hasSeparatingAxis[i] = 1;
+						separatingNormals[i] = sepNormal;
+					}
+			}
+			*/
+		}		//if (hasSeparatingAxis[i])
+	}//(i<numPairs)
+inline int	findClippingFaces(const float4 separatingNormal, // Picks one face on each hull relative to separatingNormal, writes the transformed vertices of both faces into the per-pair slots of the output arrays and records the choice in clippingFaces[pairIndex]. Always returns 0.
+                      const ConvexPolyhedronCL* hullA, 
+					  __global const ConvexPolyhedronCL* hullB,
+                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,
+                       __global float4* worldVertsA1, // out: face-A vertices, slot starts at pairIndex*capacityWorldVerts
+                      __global float4* worldNormalsA1, // out: rotated normal of the chosen A face, one entry per pair
+                      __global float4* worldVertsB1, // out: face-B vertices, slot starts at pairIndex*capacityWorldVerts
+                      int capacityWorldVerts, // maximum vertices stored per pair in each vertex array
+                      const float minDist, float maxDist, // neither value is referenced in this function
+					  const float4* verticesA,
+                      const btGpuFace* facesA,
+                      const int* indicesA,
+					  __global const float4* verticesB,
+                      __global const btGpuFace* facesB,
+                      __global const int* indicesB,
+                      __global int4* clippingFaces, int pairIndex) // clippingFaces[pairIndex] receives (faceA, faceB, vertex count A, vertex count B)
+	int numContactsOut = 0; // never incremented; this is the return value
+	int numWorldVertsB1= 0;
+	int closestFaceB=0;
+	float dmax = -FLT_MAX;
+	{
+		for(int face=0;face<hullB->m_numFaces;face++) // choose the B face whose rotated normal has the largest dot product with separatingNormal
+		{
+			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,
+                                              facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);
+			const float4 WorldNormal = qtRotate(ornB, Normal);
+			float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+	{
+		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];
+		int numVertices = polyB.m_numIndices;
+        if (numVertices>capacityWorldVerts) // clamp to the per-pair slot size
+            numVertices = capacityWorldVerts;
+		for(int e0=0;e0<numVertices;e0++)
+		{
+            if (e0<capacityWorldVerts) // always true after the clamp above
+            {
+                const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+                worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);
+            }
+		}
+	}
+    int closestFaceA=0;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++) // choose the A face whose rotated normal has the smallest dot product with separatingNormal
+		{
+			const float4 Normal = make_float4(
+                                              facesA[hullA->m_faceOffset+face].m_plane.x,
+                                              facesA[hullA->m_faceOffset+face].m_plane.y,
+                                              facesA[hullA->m_faceOffset+face].m_plane.z,
+                                              0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+                worldNormalsA1[pairIndex] = faceANormalWS; // rewritten on every new minimum; the last write belongs to closestFaceA
+			}
+		}
+	}
+    int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;
+    if (numVerticesA>capacityWorldVerts) // clamp to the per-pair slot size
+       numVerticesA = capacityWorldVerts;
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+        if (e0<capacityWorldVerts) // always true after the clamp above
+        {
+            const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];
+            worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);
+        }
+    }
+    clippingFaces[pairIndex].x = closestFaceA;
+    clippingFaces[pairIndex].y = closestFaceB;
+    clippingFaces[pairIndex].z = numVerticesA;
+    clippingFaces[pairIndex].w = numWorldVertsB1;
+	return numContactsOut;
+// work-in-progress
+__kernel void   findConcaveSeparatingAxisKernel( __global int4* concavePairs, // Kernel: work-item i (< numConcavePairs) wraps face concavePairs[i].z of shape A as a small local polyhedron, tests it against shape B (or one compound child of B) and, on success, stores the axis and the clipping faces. in/out: .x/.y body indices, .z face index (set to -1 at the end), .w compound child index (set to -1 to deactivate the pair).
+																					__global const BodyData* rigidBodies, // m_collidableIdx, m_pos and m_quat are read here
+																					__global const btCollidableGpu* collidables, // m_shapeIndex and m_shapeType are read here
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global const btGpuChildShape* gpuChildShapes, // read only when shape B is a compound
+																					__global btAabbCL* aabbs, // only aabbs[bodyIndexB] is read
+																					__global float4* concaveSeparatingNormalsOut, // out: axis in xyz, dmin in w
+																					__global int* concaveHasSeparatingNormals, // out: 0, or 1 when an axis was stored
+																					__global int4* clippingFacesOut, // out: filled by findClippingFaces
+																					__global float4* worldVertsA1GPU, // out: filled by findClippingFaces
+																					__global float4*  worldNormalsAGPU, // out: filled by findClippingFaces
+																					__global float4* worldVertsB1GPU, // out: filled by findClippingFaces
+																					int vertexFaceCapacity, // passed to findClippingFaces as capacityWorldVerts
+																					int numConcavePairs
+																					)
+	int i = get_global_id(0); // pair index handled by this work-item
+	if (i>=numConcavePairs)
+		return;
+	concaveHasSeparatingNormals[i] = 0;
+	int pairIdx = i;
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&
+		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)
+	{
+		concavePairs[pairIdx].w = -1; // unsupported shape B: deactivate; note this path returns before .z is reset
+		return;
+	}
+	int numFacesA = convexShapes[shapeIndexA].m_numFaces; // not used further in this kernel
+	int numActualConcaveConvexTests = 0; // incremented below but never read
+	int f = concavePairs[i].z; // face index inside shape A
+	bool overlap = false;
+	ConvexPolyhedronCL convexPolyhedronA; // local polyhedron assembled from the selected face
+	//add 3 vertices of the triangle
+	convexPolyhedronA.m_numVertices = 3;
+	convexPolyhedronA.m_vertexOffset = 0;
+	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+	float4 triMinAabb, triMaxAabb; // declared but not used
+	btAabbCL triAabb;
+	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);
+	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);
+	float4 verticesA[3];
+	for (int i=0;i<3;i++) // this loop variable shadows the pair index i inside the loop body
+	{
+		int index = indices[face.m_indexOffset+i];
+		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+		verticesA[i] = vert;
+		localCenter += vert;
+		triAabb.m_min = min(triAabb.m_min,vert);		
+		triAabb.m_max = max(triAabb.m_max,vert);		
+	}
+	overlap = true; // NOTE(review): the triangle box is built from untransformed vertices while aabbs[bodyIndexB] is used as stored; confirm both are in the same space
+	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;
+	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;
+	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;
+	if (overlap)
+	{
+		float dmin = FLT_MAX;
+		int hasSeparatingAxis=5; // placeholder, overwritten after the first helper call
+		float4 sepAxis=make_float4(1,2,3,4); // placeholder, expected to be filled through &sepAxis
+		int localCC=0; // not used
+		numActualConcaveConvexTests++;
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		float4 uniqueEdgesA[3];
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+		convexPolyhedronA.m_faceOffset = 0;
+		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+		int indicesA[3+3+2+2+2]; // two 3-index triangle faces plus three 2-index edge faces; NOTE(review): no declaration of facesA is visible in this text, confirm it exists in the real file
+		int curUsedIndices=0;
+		int fidx=0;
+		//front side of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back side of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2; // reversed winding relative to the front face
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			float c = dot(normal,verticesA[0]);
+			float c1 = -face.m_plane.w; // not used; the plane constant stored below is c
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int i=0;i<numVertices;i++) // one 2-index face per triangle edge (prevVertex -> i)
+			{
+				float4 v0 = verticesA[i];
+				float4 v1 = verticesA[prevVertex];
+				float4 edgeNormal = normalize(cross(normal,v1-v0)); // perpendicular to both the triangle normal and the edge
+				float c = -dot(edgeNormal,v0);
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=i;
+				indicesA[curUsedIndices++]=prevVertex;
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f); // average of the three vertices
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		///////////////////
+		///compound shape support
+		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			int compoundChild = concavePairs[pairIdx].w; // used directly as an index into gpuChildShapes
+			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB); // child position carried through body B's transform
+			float4 newOrnB = qtMul(ornB,childOrnB); // body orientation composed with the child orientation
+			posB = newPosB;
+			ornB = newOrnB;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex; // from here on B refers to the child hull
+		}
+		//////////////////
+		float4 c0local = convexPolyhedronA.m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1; // difference of the two transformed centers
+		bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],
+												posA,ornA,
+												posB,ornB,
+												DeltaC2,
+												verticesA,uniqueEdgesA,facesA,indicesA,
+												vertices,uniqueEdges,faces,indices,
+												&sepAxis,&dmin);
+		hasSeparatingAxis = 4; // provisional value; every path below overwrites it with 0 or 1
+		if (!sepA)
+		{
+			hasSeparatingAxis = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,
+												posB,ornB,
+												posA,ornA,
+												DeltaC2,
+												vertices,uniqueEdges,faces,indices,
+												verticesA,uniqueEdgesA,facesA,indicesA,
+												&sepAxis,&dmin);
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			} else
+			{
+				bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],
+															posA,ornA,
+															posB,ornB,
+															DeltaC2,
+															verticesA,uniqueEdgesA,facesA,indicesA,
+															vertices,uniqueEdges,faces,indices,
+															&sepAxis,&dmin);
+				if (!sepEE)
+				{
+					hasSeparatingAxis = 0;
+				} else
+				{
+					hasSeparatingAxis = 1;
+				}
+			}
+		}	
+		if (hasSeparatingAxis)
+		{
+			sepAxis.w = dmin; // dmin is packed into the w component of the stored axis
+			concaveSeparatingNormalsOut[pairIdx]=sepAxis;
+			concaveHasSeparatingNormals[i]=1;
+			float minDist = -1e30f;
+			float maxDist = 0.02f;
+			findClippingFaces(sepAxis, // return value is ignored
+                     &convexPolyhedronA,
+					 &convexShapes[shapeIndexB],
+					 posA,ornA,
+					 posB,ornB,
+                      worldVertsA1GPU,
+                      worldNormalsAGPU,
+                      worldVertsB1GPU,
+					  vertexFaceCapacity,
+                      minDist, maxDist,
+                      verticesA,
+                      facesA,
+                      indicesA,
+ 					  vertices,
+                      faces,
+                      indices,
+                      clippingFacesOut, pairIdx);
+		} else
+		{	
+			//mark this pair as inactive
+			concavePairs[pairIdx].w = -1;
+		}
+	}
+	else
+	{	
+		//mark this pair as inactive
+		concavePairs[pairIdx].w = -1;
+	}
+	concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl
new file mode 100644
index 00000000..f4339717
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl
@@ -0,0 +1,1888 @@
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+#ifdef cl_ext_atomic_counters_32 // NOTE(review): no matching #else/#endif is visible in this text; confirm the conditional is closed in the real file
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#define counter32_t volatile __global int* // counter type spelled as a plain global int pointer
+#define GET_GROUP_IDX get_group_id(0) // shorthands for the dimension-0 work-item queries
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define AtomInc(x) atom_inc(&(x)) // atomic wrappers take an lvalue and pass its address
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x) // unlike the wrappers above, x is passed as given (already a pointer)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+#define max2 max
+#define min2 min
+typedef unsigned int u32;
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#define GET_NPOINTS(x) (x).m_worldNormalOnB.w // reads the w component of m_worldNormalOnB; presumably the contact point count is stored there, verify against the struct
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+#define make_float4 (float4) // make_xxx(a,b,...) expands to the vector literal (xxx)(a,b,...)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+float fastDiv(float numerator, float denominator) // Scalar division through the OpenCL native_divide built-in instead of the '/' operator.
+	return native_divide(numerator, denominator);	
+//	plain-division alternative, kept for reference: return numerator/denominator;	
+float4 fastDiv4(float4 numerator, float4 denominator) // float4 counterpart of fastDiv: component-wise native_divide.
+	return native_divide(numerator, denominator);	
+float4 cross3(float4 a, float4 b) // Thin wrapper around the built-in cross() for float4 operands.
+	return cross(a,b);
+//#define dot3F4 dot   (disabled alias: a plain dot() would also include the w components)
+float dot3F4(float4 a, float4 b) // Dot product of the xyz parts only; w of both operands is replaced by 0 first.
+	float4 a1 = make_float4(a.xyz,0.f);
+	float4 b1 = make_float4(b.xyz,0.f);
+	return dot(a1, b1);
+float4 fastNormalize4(float4 v) // Normalizes v with the fast_normalize built-in; all four components take part.
+	return fast_normalize(v);
+//	Quaternion: stored in a float4 (the functions below treat xyz as the vector part and w as the scalar part)
+typedef float4 Quaternion;
+Quaternion qtMul(Quaternion a, Quaternion b); // forward declarations; the definitions follow below
+Quaternion qtNormalize(Quaternion in);
+float4 qtRotate(Quaternion q, float4 vec);
+Quaternion qtInvert(Quaternion q);
+Quaternion qtMul(Quaternion a, Quaternion b) // Product a*b: xyz = cross(a,b) + a.w*b + b.w*a, w = a.w*b.w - dot(a.xyz,b.xyz).
+	Quaternion ans;
+	ans = cross3( a, b );
+	ans += a.w*b+b.w*a; // this also adds into ans.w, which the assignment below replaces
+//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w*b.w - dot3F4(a, b);
+	return ans;
+Quaternion qtNormalize(Quaternion in) // Returns in scaled to unit length via fastNormalize4 (four-component normalization).
+	return fastNormalize4(in);
+//	previous form, kept for reference:
+//	in /= length( in ); return in;
+float4 qtRotate(Quaternion q, float4 vec) // Rotates vec by q, computed as q * (vec.xyz, 0) * qtInvert(q).
+	Quaternion qInv = qtInvert( q );
+	float4 vcpy = vec;
+	vcpy.w = 0.f; // the input's w is discarded before the products
+	float4 out = qtMul(qtMul(q,vcpy),qInv);
+	return out;
+Quaternion qtInvert(Quaternion q) // Returns (-q.xyz, q.w), i.e. the conjugate; no division by the squared length is performed.
+	return (Quaternion)(-q.xyz, q.w);
+float4 qtInvRotate(const Quaternion q, float4 vec) // Rotates vec by qtInvert(q).
+	return qtRotate( qtInvert( q ), vec );
+float4 transform(const float4* p, const float4* translation, const Quaternion* orientation) // Returns *p rotated by *orientation, then offset by *translation.
+	return qtRotate( *orientation, *p ) + (*translation);
+float4 normalize3(const float4 a) // Normalizes the xyz part of a; w is set to 0 before normalizing.
+	float4 n = make_float4(a.x, a.y, a.z, 0.f);
+	return fastNormalize4( n );
+__inline float4 lerp3(const float4 a,const float4 b, float  t) // Linear interpolation a + (b - a) * t on x, y and z; the result's w is 0.
+	return make_float4(	a.x + (b.x - a.x) * t,
+						a.y + (b.y - a.y) * t,
+						a.z + (b.z - a.z) * t,
+						0.f);
+// Clips a polygon against one plane, keeping the part where dot(planeNormalWS, v) + planeEqWS < 0. Returns the number of vertices written to ppVtxOut. Input and output arrays are in __global memory.
+int clipFaceGlobal(__global const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, __global float4* ppVtxOut) // no output capacity is passed in; the caller must size ppVtxOut
+	int ve;
+	float ds, de; // signed plane distances of the current edge's start and end vertex
+	int numVertsOut = 0;
+    //double-check next test
+    	if (numVertsIn < 2)
+    		return 0;
+	float4 firstVertex=pVtxIn[numVertsIn-1]; // start with the closing edge (last vertex -> vertex 0)
+	float4 endVertex = pVtxIn[0];
+	ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;
+	for (ve = 0; ve < numVertsIn; ve++)
+	{
+		endVertex=pVtxIn[ve];
+		de = dot3F4(planeNormalWS,endVertex)+planeEqWS;
+		if (ds<0)
+		{
+			if (de<0)
+			{
+				// Start < 0, end < 0, so output endVertex
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+			else
+			{
+				// Start < 0, end >= 0, so output intersection
+				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );
+			}
+		}
+		else
+		{
+			if (de<0)
+			{
+				// Start >= 0, end < 0 so output intersection and end
+				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+		}
+		firstVertex = endVertex; // the end vertex becomes the start of the next edge
+		ds = de;
+	}
+	return numVertsOut;
+// Clips a polygon against one plane, keeping the part where dot(planeNormalWS, v) + planeEqWS < 0. Returns the number of vertices written to ppVtxOut. Same steps as clipFaceGlobal, but the pointers carry no __global qualifier.
+int clipFace(const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, float4* ppVtxOut) // no output capacity is passed in; the caller must size ppVtxOut
+	int ve;
+	float ds, de; // signed plane distances of the current edge's start and end vertex
+	int numVertsOut = 0;
+//double-check next test
+	if (numVertsIn < 2)
+		return 0;
+	float4 firstVertex=pVtxIn[numVertsIn-1]; // start with the closing edge (last vertex -> vertex 0)
+	float4 endVertex = pVtxIn[0];
+	ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;
+	for (ve = 0; ve < numVertsIn; ve++)
+	{
+		endVertex=pVtxIn[ve];
+		de = dot3F4(planeNormalWS,endVertex)+planeEqWS;
+		if (ds<0)
+		{
+			if (de<0)
+			{
+				// Start < 0, end < 0, so output endVertex
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+			else
+			{
+				// Start < 0, end >= 0, so output intersection
+				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );
+			}
+		}
+		else
+		{
+			if (de<0)
+			{
+				// Start >= 0, end < 0 so output intersection and end
+				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+		}
+		firstVertex = endVertex; // the end vertex becomes the start of the next edge
+		ds = de;
+	}
+	return numVertsOut;
+int clipFaceAgainstHull(const float4 separatingNormal, __global const b3ConvexPolyhedronData_t* hullA,  // Clips the polygon in worldVertsB1 against the side planes of the hull-A face whose rotated normal has the smallest dot product with separatingNormal, then emits the surviving points as (x,y,z,depth). Returns the number of contacts written, never more than contactCapacity.
+	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1, // worldVertsB1 and worldVertsB2 are used alternately as clip input and output, so both are overwritten
+	float4* worldVertsB2, int capacityWorldVertsB2, // NOTE(review): capacityWorldVertsB2 is not referenced; clipFace takes no capacity, so both scratch buffers must be sized by the caller
+	const float minDist, float maxDist, // depth is clamped up to minDist; points with depth above maxDist are dropped
+	__global const float4* vertices,
+	__global const b3GpuFace_t* faces,
+	__global const int* indices,
+	float4* contactsOut, // out: xyz = clipped point, w = depth
+	int contactCapacity) // number of float4 slots available in contactsOut
+	int numContactsOut = 0;
+	float4* pVtxIn = worldVertsB1;
+	float4* pVtxOut = worldVertsB2;
+	int numVertsIn = numWorldVertsB1;
+	int numVertsOut = 0;
+	int closestFaceA=-1;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++) // pick the A face with the smallest dot(rotated normal, separatingNormal)
+		{
+			const float4 Normal = make_float4(
+				faces[hullA->m_faceOffset+face].m_plane.x, 
+				faces[hullA->m_faceOffset+face].m_plane.y, 
+				faces[hullA->m_faceOffset+face].m_plane.z,0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	if (closestFaceA<0) // no face was selected (e.g. the hull has no faces): nothing to clip against
+		return numContactsOut;
+	b3GpuFace_t polyA = faces[hullA->m_faceOffset+closestFaceA];
+	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	int numVerticesA = polyA.m_numIndices;
+	for(int e0=0;e0<numVerticesA;e0++) // one clip plane per edge (e0 -> e0+1, wrapping) of the selected face
+	{
+		const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]];
+		const float4 b = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+((e0+1)%numVerticesA)]];
+		const float4 edge0 = a - b;
+		const float4 WorldEdge0 = qtRotate(ornA,edge0);
+		float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+		float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);
+		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1); // plane containing the edge, perpendicular to the face
+		float4 worldA1 = transform(&a,&posA,&ornA);
+		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1); // plane constant chosen so the transformed vertex a lies on the plane
+		float4 planeNormalWS = planeNormalWS1;
+		float planeEqWS=planeEqWS1;
+		//clip face
+		//clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);
+		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);
+		//btSwap(pVtxIn,pVtxOut);
+		float4* tmp = pVtxOut; // swap buffers: this clip's output is the next clip's input
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsIn = numVertsOut;
+		numVertsOut = 0;
+	}
+	// only keep points that are behind the witness face
+	{
+		float4 localPlaneNormal  = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+		float localPlaneEq = polyA.m_plane.w;
+		float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);
+		float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA); // plane constant adjusted for body A's translation
+		for (int i=0;i<numVertsIn;i++)
+		{
+			float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS; // signed distance of the clipped vertex to the selected face plane
+			if (depth <=minDist)
+			{
+				depth = minDist;
+			}
+			if (depth <=maxDist && numContactsOut<contactCapacity) // capacity guard: never write past the end of contactsOut; surplus points are dropped
+			{
+				float4 pointInWorld = pVtxIn[i];
+				//resultOut.addContactPoint(separatingNormal,point,depth);
+				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);
+			}
+		}
+	}
+	return numContactsOut;
+// Variant of clipFaceAgainstHull in which hull A and its vertex/face/index arrays are passed
+// through unqualified (non-__global) pointers.
+//  - The witness face of hull A is the face whose rotated normal has the smallest dot product
+//    with separatingNormal.
+//  - The polygon in worldVertsB1 is clipped once per edge of that face; worldVertsB1 and
+//    worldVertsB2 are swapped after every clipFace call.
+//  - Every surviving vertex whose signed distance to the witness plane is <= maxDist is written
+//    to contactsOut as (x, y, z, depth); depth is clamped from below to minDist.
+// Returns the number of entries written to contactsOut; never more than contactCapacity.
+// verticesB, facesB and indicesB are accepted but not read by this function.
+// NOTE(review): capacityWorldVertsB2 is not forwarded to clipFace, so the two clip buffers are
+// not bounds-checked by this function.
+int clipFaceAgainstHullLocalA(const float4 separatingNormal, const b3ConvexPolyhedronData_t* hullA,  
+	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,
+	float4* worldVertsB2, int capacityWorldVertsB2,
+	const float minDist, float maxDist,
+	const float4* verticesA,
+	const b3GpuFace_t* facesA,
+	const int* indicesA,
+	__global const float4* verticesB,
+	__global const b3GpuFace_t* facesB,
+	__global const int* indicesB,
+	float4* contactsOut,
+	int contactCapacity)
+	int numContactsOut = 0;
+	float4* pVtxIn = worldVertsB1;
+	float4* pVtxOut = worldVertsB2;
+	int numVertsIn = numWorldVertsB1;
+	int numVertsOut = 0;
+	int closestFaceA=-1;
+	{
+		// pick the face of A whose world-space normal is most opposed to separatingNormal
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(
+				facesA[hullA->m_faceOffset+face].m_plane.x, 
+				facesA[hullA->m_faceOffset+face].m_plane.y, 
+				facesA[hullA->m_faceOffset+face].m_plane.z,0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	// no face was selected (e.g. the hull has no faces): nothing to clip against
+	if (closestFaceA<0)
+		return numContactsOut;
+	b3GpuFace_t polyA = facesA[hullA->m_faceOffset+closestFaceA];
+	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	int numVerticesA = polyA.m_numIndices;
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+		const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];
+		const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];
+		const float4 edge0 = a - b;
+		const float4 WorldEdge0 = qtRotate(ornA,edge0);
+		float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+		float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);
+		// side plane: contains edge (a,b) and is perpendicular to the witness face
+		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);
+		float4 worldA1 = transform(&a,&posA,&ornA);
+		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);
+		float4 planeNormalWS = planeNormalWS1;
+		float planeEqWS=planeEqWS1;
+		//clip face
+		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);
+		// swap the buffers: this pass's output is the next pass's input
+		float4* tmp = pVtxOut;
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsIn = numVertsOut;
+		numVertsOut = 0;
+	}
+	// only keep points that are behind the witness face
+	{
+		float4 localPlaneNormal  = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+		float localPlaneEq = polyA.m_plane.w;
+		float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);
+		float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);
+		for (int i=0;i<numVertsIn;i++)
+		{
+			float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;
+			if (depth <=minDist)
+			{
+				depth = minDist;
+			}
+			// the capacity test keeps contactsOut from being written past its end
+			if (depth <=maxDist && numContactsOut < contactCapacity)
+			{
+				float4 pointInWorld = pVtxIn[i];
+				// xyz = world position, w = signed distance to the witness plane
+				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);
+			}
+		}
+	}
+	return numContactsOut;
+// Generates contact points between two convex hulls along separatingNormal.
+//  1. Selects the face of hull B whose rotated normal has the largest dot product with
+//     separatingNormal.
+//  2. Transforms that face's vertices by (posB, ornB) into worldVertsB1, writing at most
+//     capacityWorldVerts of them.
+//  3. Hands the polygon to clipFaceAgainstHull, which clips it against hull A and fills
+//     localContactsOut.
+// Returns the number of contacts written to localContactsOut, or 0 when no face of B was selected.
+int	clipHullAgainstHull(const float4 separatingNormal,
+	__global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, 
+	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, 
+	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,
+	const float minDist, float maxDist,
+	__global const float4* vertices,
+	__global const b3GpuFace_t* faces,
+	__global const int* indices,
+	float4*	localContactsOut,
+	int localContactCapacity)
+	int numContactsOut = 0;
+	int numWorldVertsB1= 0;
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+	{
+		for(int face=0;face<hullB->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, 
+				faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);
+			const float4 WorldNormal = qtRotate(ornB, Normal);
+			float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+	// No face selected (e.g. hull B has no faces): return before faces[] is indexed with -1.
+	if (closestFaceB<0)
+		return numContactsOut;
+	{
+		const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];
+		const int numVertices = polyB.m_numIndices;
+		// Stop at capacityWorldVerts so worldVertsB1 is never written past its end.
+		for(int e0=0;e0<numVertices && numWorldVertsB1<capacityWorldVerts;e0++)
+		{
+			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];
+			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);
+		}
+	}
+	numContactsOut = clipFaceAgainstHull(separatingNormal, hullA, 
+			posA,ornA,
+			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,vertices,
+			faces,
+			indices,localContactsOut,localContactCapacity);
+	return numContactsOut;
+// Variant of clipHullAgainstHull in which hull A and its vertex/face/index arrays are passed
+// through unqualified (non-__global) pointers, while hull B still lives in __global arrays.
+//  1. Selects the face of hull B whose rotated normal has the largest dot product with
+//     separatingNormal.
+//  2. Transforms that face's vertices by (posB, ornB) into worldVertsB1, writing at most
+//     capacityWorldVerts of them.
+//  3. Hands the polygon to clipFaceAgainstHullLocalA, which clips it against hull A and fills
+//     localContactsOut.
+// Returns the number of contacts written to localContactsOut, or 0 when no face of B was selected.
+int	clipHullAgainstHullLocalA(const float4 separatingNormal,
+	const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, 
+	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, 
+	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,
+	const float minDist, float maxDist,
+	const float4* verticesA,
+	const b3GpuFace_t* facesA,
+	const int* indicesA,
+	__global const float4* verticesB,
+	__global const b3GpuFace_t* facesB,
+	__global const int* indicesB,
+	float4*	localContactsOut,
+	int localContactCapacity)
+	int numContactsOut = 0;
+	int numWorldVertsB1= 0;
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+	{
+		for(int face=0;face<hullB->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, 
+				facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);
+			const float4 WorldNormal = qtRotate(ornB, Normal);
+			float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+	// No face selected (e.g. hull B has no faces): return before facesB[] is indexed with -1.
+	if (closestFaceB<0)
+		return numContactsOut;
+	{
+		const b3GpuFace_t polyB = facesB[hullB->m_faceOffset+closestFaceB];
+		const int numVertices = polyB.m_numIndices;
+		// Stop at capacityWorldVerts so worldVertsB1 is never written past its end.
+		for(int e0=0;e0<numVertices && numWorldVertsB1<capacityWorldVerts;e0++)
+		{
+			const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);
+		}
+	}
+	numContactsOut = clipFaceAgainstHullLocalA(separatingNormal, hullA, 
+			posA,ornA,
+			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,
+			verticesA,facesA,indicesA,
+			verticesB,facesB,indicesB,
+			localContactsOut,localContactCapacity);
+	return numContactsOut;
+#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j]; // plain sequential loop: accumulates v[1..n-1] into v[0]
+#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;} // runs 'execution' n times; the loop index is visible to it as 'ie'
+#define REDUCE_MAX(v, n) {int i=0;\
+for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; } // i stays 0: leaves in v[0] the element with the largest .y among v[0..n-1]
+#define REDUCE_MIN(v, n) {int i=0;\
+for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; } // i stays 0: leaves in v[0] the element with the smallest .y among v[0..n-1]
+int extractManifoldSequentialGlobal(__global const float4* p, int nPoints, float4 nearNormal, int4* contactIdx) // Chooses 4 point indices from p[0..nPoints-1] and stores them in contactIdx[0].xyzw; returns the number of points kept
+	if( nPoints == 0 )
+        return 0;
+    if (nPoints <=4) // small sets are kept whole; contactIdx is NOT written on this path
+        return nPoints;
+    if (nPoints >64) // only the first 64 points are considered
+        nPoints = 64;
+	float4 center = make_float4(0.f);
+	{
+		for (int i=0;i<nPoints;i++)
+			center += p[i];
+		center /= (float)nPoints; // mean of the considered points (all four components, including w)
+	}
+	//	sample 4 directions
+    float4 aVector = p[0] - center;
+    float4 u = cross3( nearNormal, aVector ); // u and v are both built from cross products with nearNormal
+    float4 v = cross3( nearNormal, u );
+    u = normalize3( u );
+    v = normalize3( v );
+    //keep point with deepest penetration
+    float minW= FLT_MAX;
+    int minIndex=-1;
+    float4 maxDots; // NOTE(review): despite the name, the '<' tests below keep the SMALLEST dot product seen per direction
+    maxDots.x = FLT_MIN; // FLT_MIN is the smallest positive normalized float, so only values below it can replace it
+    maxDots.y = FLT_MIN;
+    maxDots.z = FLT_MIN;
+    maxDots.w = FLT_MIN;
+    //	idx, distance
+    for(int ie = 0; ie<nPoints; ie++ )
+    {
+        if (p[ie].w<minW) // track the point with the smallest w
+        {
+            minW = p[ie].w;
+            minIndex=ie;
+        }
+        float f;
+        float4 r = p[ie]-center;
+        f = dot3F4( u, r ); // direction +u
+        if (f<maxDots.x)
+        {
+            maxDots.x = f;
+            contactIdx[0].x = ie;
+        }
+        f = dot3F4( -u, r ); // direction -u
+        if (f<maxDots.y)
+        {
+            maxDots.y = f;
+            contactIdx[0].y = ie;
+        }
+        f = dot3F4( v, r ); // direction +v
+        if (f<maxDots.z)
+        {
+            maxDots.z = f;
+            contactIdx[0].z = ie;
+        }
+        f = dot3F4( -v, r ); // direction -v
+        if (f<maxDots.w)
+        {
+            maxDots.w = f;
+            contactIdx[0].w = ie;
+        }
+    }
+    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex) // the smallest-w point was not picked by any direction
+    {
+        //replace the first contact with minimum (todo: replace contact with least penetration)
+        contactIdx[0].x = minIndex;
+    }
+    return 4;
+int extractManifoldSequentialGlobalFake(__global const float4* p, int nPoints, float4 nearNormal, int* contactIdx) // Stand-in reduction: always selects indices 0..3; p and nearNormal are not read
+    contactIdx[0] = 0;
+    contactIdx[1] = 1;
+    contactIdx[2] = 2;
+    contactIdx[3] = 3;
+	if( nPoints == 0 ) return 0;
+	nPoints = min2( nPoints, 4 ); // returned count is capped at 4
+    return nPoints;
+int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int* contactIdx) // Reduces p[0..nPoints-1] (at most the first 64) to at most 4 indices written to contactIdx; returns how many were written
+	if( nPoints == 0 ) return 0;
+	nPoints = min2( nPoints, 64 ); // the scratch arrays below hold 64 entries
+	float4 center = make_float4(0.f);
+	{
+		float4 v[64]; // scratch copy so PARALLEL_SUM can accumulate into v[0] without touching p
+		for (int i=0;i<nPoints;i++)
+			v[i] = p[i];
+		//memcpy( v, p, nPoints*sizeof(float4) );
+		PARALLEL_SUM( v, nPoints );
+		center = v[0]/(float)nPoints; // mean of the considered points
+	}
+	{	//	sample 4 directions
+		if( nPoints < 4 ) // fewer than 4 points: keep them all, in input order
+		{
+			for(int i=0; i<nPoints; i++) 
+				contactIdx[i] = i;
+			return nPoints;
+		}
+		float4 aVector = p[0] - center;
+		float4 u = cross3( nearNormal, aVector ); // u and v are both built from cross products with nearNormal
+		float4 v = cross3( nearNormal, u );
+		u = normalize3( u );
+		v = normalize3( v );
+		int idx[4];
+		float2 max00 = make_float2(0,FLT_MAX); // assigned further down but never used for the returned indices
+		{
+			//	idx, distance
+			{
+				{
+					int4 a[64]; // per point: four packed (dot product bits | point index) keys, one per direction
+					for(int ie = 0; ie<nPoints; ie++ )
+					{
+						float f;
+						float4 r = p[ie]-center;
+						f = dot3F4( u, r );
+						a[ie].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie); // keep the float's upper 24 bits, store the point index in the low 8 bits
+						f = dot3F4( -u, r );
+						a[ie].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);
+						f = dot3F4( v, r );
+						a[ie].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);
+						f = dot3F4( -v, r );
+						a[ie].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);
+					}
+					for(int ie=0; ie<nPoints; ie++) // component-wise maximum of the packed keys, compared as ints; result accumulates in a[0]
+					{
+						a[0].x = (a[0].x > a[ie].x )? a[0].x: a[ie].x;
+						a[0].y = (a[0].y > a[ie].y )? a[0].y: a[ie].y;
+						a[0].z = (a[0].z > a[ie].z )? a[0].z: a[ie].z;
+						a[0].w = (a[0].w > a[ie].w )? a[0].w: a[ie].w;
+					}
+					idx[0] = (int)a[0].x & 0xff; // recover the winning point index from the low 8 bits
+					idx[1] = (int)a[0].y & 0xff;
+					idx[2] = (int)a[0].z & 0xff;
+					idx[3] = (int)a[0].w & 0xff;
+				}
+			}
+			{
+				float2 h[64];
+				PARALLEL_DO( h[ie] = make_float2((float)ie, p[ie].w), nPoints ); // h[ie] = (index, w)
+				REDUCE_MIN( h, nPoints ); // h[0] becomes the entry with the smallest w
+				max00 = h[0];
+			}
+		}
+		contactIdx[0] = idx[0];
+		contactIdx[1] = idx[1];
+		contactIdx[2] = idx[2];
+		contactIdx[3] = idx[3];
+		return 4;
+	}
+// Kernel: one work-item per entry idx < numPairs.
+// Copies that entry's contact points (contactCounts[idx] points starting at
+// closestPointsWorld[contactOffsets[idx]]) into a private array, reduces them to at most 4 with
+// extractManifoldSequential, reserves a slot through AppendInc and, when the slot index is below
+// contactCapacity, fills one b3Contact4Data in contactsOut.
+// NOTE(review): the body indices are read from pairs[pairIndex], where pairIndex is a kernel
+// argument shared by all work-items, not from pairs[idx] - confirm this is intended.
+__kernel void   extractManifoldAndAddContactKernel(__global const int4* pairs, 
+																	__global const b3RigidBodyData_t* rigidBodies, 
+																	__global const float4* closestPointsWorld,
+																	__global const float4* separatingNormalsWorld,
+																	__global const int* contactCounts,
+																	__global const int* contactOffsets,
+																	__global struct b3Contact4Data* restrict contactsOut,
+																	counter32_t nContactsOut,
+																	int contactCapacity,
+																	int numPairs,
+																	int pairIndex
+																	)
+	int idx = get_global_id(0);
+	if (idx<numPairs)
+	{
+		float4 normal = separatingNormalsWorld[idx];
+		// localPoints holds 64 entries: clamp the count so the copy below cannot run past it.
+		// extractManifoldSequential applies the same 64-point cap, so the result is unchanged.
+		int nPoints = min2( contactCounts[idx], 64 );
+		__global const float4* pointsIn = &closestPointsWorld[contactOffsets[idx]];
+		float4 localPoints[64];
+		for (int i=0;i<nPoints;i++)
+		{
+			localPoints[i] = pointsIn[i];
+		}
+		int contactIdx[4];// = {-1,-1,-1,-1};
+		contactIdx[0] = -1;
+		contactIdx[1] = -1;
+		contactIdx[2] = -1;
+		contactIdx[3] = -1;
+		int nContacts = extractManifoldSequential(localPoints, nPoints, normal, contactIdx);
+		int dstIdx;
+		// AppendInc is defined elsewhere; presumably it atomically reserves the next output slot
+		AppendInc( nContactsOut, dstIdx );
+		if (dstIdx<contactCapacity)
+		{
+			__global struct b3Contact4Data* c = contactsOut + dstIdx;
+			c->m_worldNormalOnB = -normal;
+			// hard-coded restitution 0 and friction 0.7, both scaled by 0xffff
+			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
+			c->m_batchIdx = idx;
+			int bodyA = pairs[pairIndex].x;
+			int bodyB = pairs[pairIndex].y;
+			// the body index is stored negated when the body's inverse mass is zero
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;
+			c->m_childIndexA = -1;
+			c->m_childIndexB = -1;
+			for (int i=0;i<nContacts;i++)
+			{
+				c->m_worldPosB[i] = localPoints[contactIdx[i]];
+			}
+			GET_NPOINTS(*c) = nContacts;
+		}
+	}
+void	trInverse(float4 translationIn, Quaternion orientationIn, // Inverts a transform given as translation + quaternion; results are written through the two output pointers
+		float4* translationOut, Quaternion* orientationOut)
+	*orientationOut = qtInvert(orientationIn);
+	*translationOut = qtRotate(*orientationOut, -translationIn); // negated input translation, rotated by the inverted orientation
+void	trMul(float4 translationA, Quaternion orientationA, // Composes transform A with transform B; results are written through the two output pointers
+						float4 translationB, Quaternion orientationB,
+		float4* translationOut, Quaternion* orientationOut)
+	*orientationOut = qtMul(orientationA,orientationB);
+	*translationOut = transform(&translationB,&translationA,&orientationA); // B's translation mapped through transform A (transform() is defined elsewhere)
+__kernel void   clipHullHullKernel( __global int4* pairs, // Kernel: one work-item per pair; clips hull B against hull A, reduces to <=4 contacts and writes one b3Contact4Data. pairs[].z is read and updated as the contact slot index
+																					__global const b3RigidBodyData_t* rigidBodies, 
+																					__global const b3Collidable_t* collidables,
+																					__global const b3ConvexPolyhedronData_t* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges, // not read in this kernel
+																					__global const b3GpuFace_t* faces,
+																					__global const int* indices,
+																					__global const float4* separatingNormals,
+																					__global const int* hasSeparatingAxis, // pairs whose entry is zero are skipped
+																					__global struct b3Contact4Data* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int numPairs,
+																					int contactCapacity)
+	int i = get_global_id(0);
+	int pairIndex = i;
+	float4 worldVertsB1[64]; // clip buffers handed to clipHullAgainstHull
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;	
+	float4 localContactsOut[64];
+	int localContactCapacity=64;
+	float minDist = -1e30f; // effectively no lower clamp on contact depth
+	float maxDist = 0.02f; // contacts farther than this from the witness face are dropped
+	if (i<numPairs)
+	{
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+		if (hasSeparatingAxis[i])
+		{
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+			int numLocalContactsOut = clipHullAgainstHull(separatingNormals[i],
+														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],
+														rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,
+													  rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,
+													  worldVertsB1,worldVertsB2,capacityWorldVerts,
+														minDist, maxDist,
+														vertices,faces,indices,
+														localContactsOut,localContactCapacity);
+		if (numLocalContactsOut>0)
+		{
+				float4 normal = -separatingNormals[i]; // negated here and negated again when stored in m_worldNormalOnB
+				int nPoints = numLocalContactsOut;
+				float4* pointsIn = localContactsOut;
+				int contactIdx[4];// = {-1,-1,-1,-1};
+				contactIdx[0] = -1;
+				contactIdx[1] = -1;
+				contactIdx[2] = -1;
+				contactIdx[3] = -1;
+				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);
+				int mprContactIndex = pairs[pairIndex].z; // existing contact slot for this pair; negative means none yet
+				int dstIdx = mprContactIndex;
+				if (dstIdx<0)
+				{
+					AppendInc( nGlobalContactsOut, dstIdx ); // AppendInc is defined elsewhere; presumably reserves the next output slot
+				}
+				if (dstIdx<contactCapacity)
+				{
+					pairs[pairIndex].z = dstIdx; // remember the slot on the pair
+					__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;
+					c->m_worldNormalOnB = -normal;
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // hard-coded restitution 0 and friction 0.7, scaled by 0xffff
+					c->m_batchIdx = pairIndex;
+					int bodyA = pairs[pairIndex].x;
+					int bodyB = pairs[pairIndex].y;
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; // index stored negated when inverse mass is zero
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+					c->m_childIndexA = -1;
+					c->m_childIndexB = -1;
+					for (int i=0;i<nReducedContacts;i++) // this i shadows the work-item index declared above
+					{
+					//this condition means: overwrite contact point, unless at index i==0 we have a valid 'mpr' contact
+						if (i>0||(mprContactIndex<0))
+						{
+							c->m_worldPosB[i] = pointsIn[contactIdx[i]];
+						}
+					}
+					GET_NPOINTS(*c) = nReducedContacts;
+				}
+			}//		if (numLocalContactsOut>0)
+		}//		if (hasSeparatingAxis[i])
+	}//	if (i<numPairs)
+__kernel void   clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, // Kernel: one work-item per compound pair (x,y = body indices, z,w = child shape indices or negative); clips the selected hulls and appends one reduced contact
+																					__global const b3RigidBodyData_t* rigidBodies, 
+																					__global const b3Collidable_t* collidables,
+																					__global const b3ConvexPolyhedronData_t* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges, // not read in this kernel
+																					__global const b3GpuFace_t* faces,
+																					__global const int* indices,
+																					__global const b3GpuChildShape_t* gpuChildShapes,
+																					__global const float4* gpuCompoundSepNormalsOut, // read-only here despite the 'Out' suffix
+																					__global const int* gpuHasCompoundSepNormalsOut, // pairs whose entry is zero are skipped
+																					__global struct b3Contact4Data* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int numCompoundPairs, int maxContactCapacity)
+	int i = get_global_id(0);
+	int pairIndex = i;
+	float4 worldVertsB1[64]; // clip buffers handed to clipHullAgainstHull
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;	
+	float4 localContactsOut[64];
+	int localContactCapacity=64;
+	float minDist = -1e30f; // effectively no lower clamp on contact depth
+	float maxDist = 0.02f; // contacts farther than this from the witness face are dropped
+	if (i<numCompoundPairs)
+	{
+		if (gpuHasCompoundSepNormalsOut[i])
+		{
+			int bodyIndexA = gpuCompoundPairs[i].x;
+			int bodyIndexB = gpuCompoundPairs[i].y;
+			int childShapeIndexA = gpuCompoundPairs[i].z;
+			int childShapeIndexB = gpuCompoundPairs[i].w;
+			int collidableIndexA = -1;
+			int collidableIndexB = -1;
+			float4 ornA = rigidBodies[bodyIndexA].m_quat;
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			float4 ornB = rigidBodies[bodyIndexB].m_quat;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+			if (childShapeIndexA >= 0) // A is a child shape: compose the child's local transform with the body transform
+			{
+				collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+				float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+				float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+				float4 newPosA = qtRotate(ornA,childPosA)+posA;
+				float4 newOrnA = qtMul(ornA,childOrnA);
+				posA = newPosA;
+				ornA = newOrnA;
+			} else
+			{
+				collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; // no child: use the body's own collidable
+			}
+			if (childShapeIndexB>=0) // same for B, using transform() for the position
+			{
+				collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+				float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+				float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+				float4 newPosB = transform(&childPosB,&posB,&ornB);
+				float4 newOrnB = qtMul(ornB,childOrnB);
+				posB = newPosB;
+				ornB = newOrnB;
+			} else
+			{
+				collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+			}
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+			int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i],
+														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],
+														posA,ornA,
+													  posB,ornB,
+													  worldVertsB1,worldVertsB2,capacityWorldVerts,
+														minDist, maxDist,
+														vertices,faces,indices,
+														localContactsOut,localContactCapacity);
+		if (numLocalContactsOut>0)
+		{
+				float4 normal = -gpuCompoundSepNormalsOut[i]; // negated here and negated again when stored in m_worldNormalOnB
+				int nPoints = numLocalContactsOut;
+				float4* pointsIn = localContactsOut;
+				int contactIdx[4];// = {-1,-1,-1,-1};
+				contactIdx[0] = -1;
+				contactIdx[1] = -1;
+				contactIdx[2] = -1;
+				contactIdx[3] = -1;
+				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);
+				int dstIdx;
+				AppendInc( nGlobalContactsOut, dstIdx ); // AppendInc is defined elsewhere; presumably reserves the next output slot
+				if ((dstIdx+nReducedContacts) < maxContactCapacity) // NOTE(review): adds the point count to the slot index before comparing; confirm this stricter test is intended
+				{
+					__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;
+					c->m_worldNormalOnB = -normal;
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // hard-coded restitution 0 and friction 0.7, scaled by 0xffff
+					c->m_batchIdx = pairIndex;
+					int bodyA = gpuCompoundPairs[pairIndex].x;
+					int bodyB = gpuCompoundPairs[pairIndex].y;
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; // index stored negated when inverse mass is zero
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+					c->m_childIndexA = childShapeIndexA;
+					c->m_childIndexB = childShapeIndexB;
+					for (int i=0;i<nReducedContacts;i++) // this i shadows the work-item index declared above
+					{
+						c->m_worldPosB[i] = pointsIn[contactIdx[i]];
+					}
+					GET_NPOINTS(*c) = nReducedContacts;
+				}
+			}//		if (numLocalContactsOut>0)
+		}//		if (gpuHasCompoundSepNormalsOut[i])
+	}//	if (i<numCompoundPairs)
+__kernel void   sphereSphereCollisionKernel( __global const int4* pairs, // Kernel: one work-item per pair; when both collidables are SHAPE_SPHERE and the spheres touch or overlap, appends a single-point contact
+																					__global const b3RigidBodyData_t* rigidBodies, 
+																					__global const b3Collidable_t* collidables,
+																					__global const float4* separatingNormals, // not read in this kernel
+																					__global const int* hasSeparatingAxis, // not read in this kernel
+																					__global struct b3Contact4Data* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int contactCapacity,
+																					int numPairs)
+	int i = get_global_id(0);
+	int pairIndex = i;
+	if (i<numPairs)
+	{
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&
+			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)
+		{
+			//sphere-sphere
+			float radiusA = collidables[collidableIndexA].m_radius;
+			float radiusB = collidables[collidableIndexB].m_radius;
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+			float4 diff = posA-posB;
+			float len = length(diff); // NOTE(review): length() of a float4 includes the w component; presumably w is equal for both positions, confirm
+			///iff distance positive, don't generate a new contact
+			if ( len <= (radiusA+radiusB))
+			{
+				///distance (negative means penetration)
+				float dist = len - (radiusA+radiusB);
+				float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f); // fallback direction when the centres (nearly) coincide
+				if (len > 0.00001)
+				{
+					normalOnSurfaceB = diff / len; // unit vector pointing from B's centre towards A's centre
+				}
+				float4 contactPosB = posB + normalOnSurfaceB*radiusB; // point on B's surface facing A
+				contactPosB.w = dist; // w carries the signed distance
+				int dstIdx;
+				AppendInc( nGlobalContactsOut, dstIdx ); // AppendInc is defined elsewhere; presumably reserves the next output slot
+				if (dstIdx < contactCapacity)
+				{
+					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+					c->m_worldNormalOnB = -normalOnSurfaceB;
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // hard-coded restitution 0 and friction 0.7, scaled by 0xffff
+					c->m_batchIdx = pairIndex;
+					int bodyA = pairs[pairIndex].x;
+					int bodyB = pairs[pairIndex].y;
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; // index stored negated when inverse mass is zero
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+					c->m_worldPosB[0] = contactPosB;
+					c->m_childIndexA = -1;
+					c->m_childIndexB = -1;
+					GET_NPOINTS(*c) = 1;
+				}//if (dstIdx < contactCapacity)
+			}//if ( len <= (radiusA+radiusB))
+	}//if (i<numPairs)
+__kernel void   clipHullHullConcaveConvexKernel( __global int4* concavePairsIn, // Kernel: one work-item per concave pair (x,y = body indices, z = face index in shape A, w = child shape index of B or negative for invalid); builds a small polyhedron from that triangle, clips B against it and appends one reduced contact
+																					__global const b3RigidBodyData_t* rigidBodies, 
+																					__global const b3Collidable_t* collidables,
+																					__global const b3ConvexPolyhedronData_t* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges, // not read in this kernel
+																					__global const b3GpuFace_t* faces,
+																					__global const int* indices,
+																					__global const b3GpuChildShape_t* gpuChildShapes,
+																					__global const float4* separatingNormals,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int contactCapacity,
+																					int numConcavePairs)
+	int i = get_global_id(0);
+	int pairIndex = i;
+	float4 worldVertsB1[64]; // clip buffers handed to clipHullAgainstHullLocalA
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;	
+	float4 localContactsOut[64];
+	int localContactCapacity=64;
+	float minDist = -1e30f; // effectively no lower clamp on contact depth
+	float maxDist = 0.02f; // contacts farther than this from the witness face are dropped
+	if (i<numConcavePairs)
+	{
+		//negative value means that the pair is invalid
+		if (concavePairsIn[i].w<0)
+			return;
+		int bodyIndexA = concavePairsIn[i].x;
+		int bodyIndexB = concavePairsIn[i].y;
+		int f = concavePairsIn[i].z; // face (triangle) index within shape A
+		int childShapeIndexA = f; // the face index is also reported as A's child index in the contact
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		///////////////////////////////////////////////////////////////
+		bool overlap = false; // not used below in this excerpt
+		b3ConvexPolyhedronData_t convexPolyhedronA; // temporary polyhedron describing the single triangle; all offsets are 0 because it indexes the local arrays below
+	//add 3 vertices of the triangle
+		convexPolyhedronA.m_numVertices = 3;
+		convexPolyhedronA.m_vertexOffset = 0;
+		float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+		b3GpuFace_t face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+		float4 verticesA[3];
+		for (int i=0;i<3;i++) // gather the triangle's vertices and sum them for the centre
+		{
+			int index = indices[face.m_indexOffset+i];
+			float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+			verticesA[i] = vert;
+			localCenter += vert;
+		}
+		float dmin = FLT_MAX; // not used below in this excerpt
+		int localCC=0; // not used below in this excerpt
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		float4 uniqueEdgesA[3];
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+		convexPolyhedronA.m_faceOffset = 0;
+		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+		int indicesA[3+3+2+2+2]; // 3 indices for the front face, 3 for the back face, 2 for each of the 3 edge faces
+		int curUsedIndices=0;
+		int fidx=0; // NOTE(review): facesA is written below but its declaration is not visible in this excerpt
+		//front side of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back side of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2; // reversed winding
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			float c = dot3F4(normal,verticesA[0]); // plane constant for the flipped normal, computed from vertex 0
+			float c1 = -face.m_plane.w; // computed but not used
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int i=0;i<numVertices;i++) // one 2-index face per triangle edge (i, prevVertex)
+			{
+				float4 v0 = verticesA[i];
+				float4 v1 = verticesA[prevVertex];
+				float4 edgeNormal = normalize(cross(normal,v1-v0)); // perpendicular to both the triangle normal and the edge
+				float c = -dot3F4(edgeNormal,v0);
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=i;
+				indicesA[curUsedIndices++]=prevVertex;
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES; // constant defined elsewhere
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f); // average of the three vertices
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		float4 sepAxis = separatingNormals[i];
+		int shapeTypeB = collidables[collidableIndexB].m_shapeType; // not tested anywhere in this excerpt
+		int childShapeIndexB =-1;
+		{
+			///////////////////
+			///compound shape support
+			childShapeIndexB = concavePairsIn[pairIndex].w; // child shape of B; replaces shapeIndexB and B's transform below
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+		}
+		////////////////////////////////////////
+		int numLocalContactsOut = clipHullAgainstHullLocalA(sepAxis,
+														&convexPolyhedronA, &convexShapes[shapeIndexB],
+														posA,ornA,
+													  posB,ornB,
+													  worldVertsB1,worldVertsB2,capacityWorldVerts,
+														minDist, maxDist,
+														&verticesA,&facesA,&indicesA, // the locally built triangle data stands in for hull A
+														vertices,faces,indices,
+														localContactsOut,localContactCapacity);
+		if (numLocalContactsOut>0)
+		{
+			float4 normal = -separatingNormals[i]; // shadows the triangle normal above; negated again when stored in m_worldNormalOnB
+			int nPoints = numLocalContactsOut;
+			float4* pointsIn = localContactsOut;
+			int contactIdx[4];// = {-1,-1,-1,-1};
+			contactIdx[0] = -1;
+			contactIdx[1] = -1;
+			contactIdx[2] = -1;
+			contactIdx[3] = -1;
+			int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);
+			int dstIdx;
+			AppendInc( nGlobalContactsOut, dstIdx ); // AppendInc is defined elsewhere; presumably reserves the next output slot
+			if (dstIdx<contactCapacity)
+			{
+				__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;
+				c->m_worldNormalOnB = -normal;
+				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // hard-coded restitution 0 and friction 0.7, scaled by 0xffff
+				c->m_batchIdx = pairIndex;
+				int bodyA = concavePairsIn[pairIndex].x;
+				int bodyB = concavePairsIn[pairIndex].y;
+				c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; // index stored negated when inverse mass is zero
+				c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+				c->m_childIndexA = childShapeIndexA;
+				c->m_childIndexB = childShapeIndexB;
+				for (int i=0;i<nReducedContacts;i++)
+				{
+					c->m_worldPosB[i] = pointsIn[contactIdx[i]];
+				}
+				GET_NPOINTS(*c) = nReducedContacts;
+			}
+		}//		if (numLocalContactsOut>0)
+	}//	if (i<numConcavePairs)
+int	findClippingFaces(const float4 separatingNormal, // Picks one face on each hull relative to separatingNormal and stores both faces' transformed vertices for pair `pairIndex`.
+                      __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,
+                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, // translation/orientation applied to hull A and hull B data
+                       __global float4* worldVertsA1, // out: transformed vertices of the chosen face of A, starting at pairIndex*capacityWorldVerts
+                      __global float4* worldNormalsA1, // out: rotated normal of the chosen face of A, one entry per pair
+                      __global float4* worldVertsB1, // out: transformed vertices of the chosen face of B, starting at pairIndex*capacityWorldVerts
+                      int capacityWorldVerts, // per-pair stride of worldVertsA1/worldVertsB1; never compared with the face vertex counts in this function
+                      const float minDist, float maxDist, // not read anywhere in this function
+                      __global const float4* vertices,
+                      __global const b3GpuFace_t* faces,
+                      __global const int* indices,
+                      __global int4* clippingFaces, int pairIndex) // out: clippingFaces[pairIndex] = (faceA, faceB, numVertsA, numVertsB)
+	int numContactsOut = 0; // never modified below, so the return value is always 0
+	int numWorldVertsB1= 0;
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+	{
+		for(int face=0;face<hullB->m_numFaces;face++) // choose the face of B whose rotated normal has the largest dot product with separatingNormal
+		{
+			const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x,
+                                              faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);
+			const float4 WorldNormal = qtRotate(ornB, Normal);
+			float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+	{
+		const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB]; // NOTE(review): closestFaceB is still -1 here if the loop above never assigned it (e.g. m_numFaces == 0)
+		const int numVertices = polyB.m_numIndices;
+		for(int e0=0;e0<numVertices;e0++)
+		{
+			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];
+			worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB); // NOTE(review): no bound check of numWorldVertsB1 against capacityWorldVerts
+		}
+	}
+    int closestFaceA=-1;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++) // choose the face of A whose rotated normal has the smallest dot product with separatingNormal
+		{
+			const float4 Normal = make_float4(
+                                              faces[hullA->m_faceOffset+face].m_plane.x,
+                                              faces[hullA->m_faceOffset+face].m_plane.y,
+                                              faces[hullA->m_faceOffset+face].m_plane.z,
+                                              0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+                worldNormalsA1[pairIndex] = faceANormalWS; // rewritten on every new minimum; the last write is the normal of the chosen face
+			}
+		}
+	}
+    int numVerticesA = faces[hullA->m_faceOffset+closestFaceA].m_numIndices; // NOTE(review): closestFaceA may still be -1 here; only the later clipping stage tests for that
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+        const float4 a = vertices[hullA->m_vertexOffset+indices[faces[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];
+        worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA); // NOTE(review): no bound check of e0 against capacityWorldVerts
+    }
+    clippingFaces[pairIndex].x = closestFaceA; // publish the chosen faces and their vertex counts for the clipping stage
+    clippingFaces[pairIndex].y = closestFaceB;
+    clippingFaces[pairIndex].z = numVerticesA;
+    clippingFaces[pairIndex].w = numWorldVertsB1;
+	return numContactsOut;
+int clipFaces(__global float4* worldVertsA1, // Clips the stored face-B polygon of pair `pairIndex` against the edge planes of its face-A polygon, then emits (x,y,z,depth) points; returns the number emitted.
+              __global float4* worldNormalsA1, // in: normal of the chosen face of A, one entry per pair
+              __global float4* worldVertsB1, // in/out: face-B polygon; also used as one of the two ping-pong buffers
+              __global float4* worldVertsB2, 
+              int capacityWorldVertsB2, // per-pair stride, applied to worldVertsA1, worldVertsB1 and worldVertsB2 alike
+              const float minDist, float maxDist, // depth is raised to at least minDist; points deeper than maxDist are dropped
+              __global int4* clippingFaces, // in: (faceA, faceB, numVertsA, numVertsB); .w is overwritten with the emitted point count
+              int pairIndex)
+	int numContactsOut = 0;
+    int closestFaceA = clippingFaces[pairIndex].x;
+    int closestFaceB = clippingFaces[pairIndex].y; // read but not used below
+	int numVertsInA = clippingFaces[pairIndex].z;
+	int numVertsInB = clippingFaces[pairIndex].w;
+	int numVertsOut = 0;
+	if (closestFaceA<0) // no face was selected on A: return 0 without touching clippingFaces[pairIndex].w
+		return numContactsOut;
+    __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];
+    __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];
+	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	for(int e0=0;e0<numVertsInA;e0++)
+	{
+		const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];
+		const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)]; // next vertex of face A, wrapping to the first
+		const float4 WorldEdge0 = aw - bw;
+		float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];
+		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1); // plane normal built from the edge and the face-A normal (not normalized here)
+		float4 worldA1 = aw;
+		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1); // plane offset chosen so the plane passes through vertex aw
+		float4 planeNormalWS = planeNormalWS1;
+		float planeEqWS=planeEqWS1;
+		numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);
+		__global float4* tmp = pVtxOut; // swap buffers: this pass's output becomes the next pass's input
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsInB = numVertsOut;
+		numVertsOut = 0;
+	}
+    //float4 planeNormalWS = worldNormalsA1[pairIndex];
+    //float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);
+    /*for (int i=0;i<numVertsInB;i++)
+    {
+        pVtxOut[i] = pVtxIn[i];
+    }*/
+    //numVertsInB=0;
+    float4 planeNormalWS = worldNormalsA1[pairIndex];
+    float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]); // face-A plane through its first stored vertex
+    for (int i=0;i<numVertsInB;i++)
+    {
+        float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS; // signed plane-equation value of the clipped vertex against the face-A plane
+        if (depth <=minDist)
+        {
+            depth = minDist;
+        }
+        if (depth <=maxDist)
+        {
+            float4 pointInWorld = pVtxIn[i];
+            pVtxOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth); // pVtxOut is the worldVertsB2 slice after an even number of swaps, the worldVertsB1 slice after an odd number
+        }
+    }
+    clippingFaces[pairIndex].w =numContactsOut;
+	return numContactsOut;
+__kernel void   findClippingFacesKernel(  __global const int4* pairs, // Kernel: for each pair flagged in hasSeparatingAxis, runs findClippingFaces on the two bodies' convex shapes. pairs[i].x/.y are the body indices.
+                                        __global const b3RigidBodyData_t* rigidBodies,
+                                        __global const b3Collidable_t* collidables,
+                                        __global const b3ConvexPolyhedronData_t* convexShapes,
+                                        __global const float4* vertices,
+                                        __global const float4* uniqueEdges, // not read in this kernel
+                                        __global const b3GpuFace_t* faces,
+                                        __global const int* indices,
+                                        __global const float4* separatingNormals,
+                                        __global const int* hasSeparatingAxis, // pairs whose entry is zero are skipped
+                                        __global int4* clippingFacesOut,
+                                        __global float4* worldVertsA1,
+                                        __global float4* worldNormalsA1,
+                                        __global float4* worldVertsB1,
+                                        int capacityWorldVerts,
+                                        int numPairs
+                                        )
+	int i = get_global_id(0); // the global work-item id is used directly as the pair index
+	int pairIndex = i; // declared but not used below; `i` is passed instead
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+	if (i<numPairs) // work-items with an id at or beyond numPairs do nothing
+	{
+		if (hasSeparatingAxis[i])
+		{
+			int bodyIndexA = pairs[i].x;
+			int bodyIndexB = pairs[i].y;
+			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; // body -> collidable -> convex shape lookup
+			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+			int numLocalContactsOut = findClippingFaces(separatingNormals[i], // the returned count is stored but never read; results are delivered through the output buffers
+                                                        &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],
+                                                        rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,
+                                                        rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,
+                                                        worldVertsA1,
+                                                        worldNormalsA1,
+                                                        worldVertsB1,capacityWorldVerts,
+                                                        minDist, maxDist,
+                                                        vertices,faces,indices,
+                                                        clippingFacesOut,i);
+		}//		if (hasSeparatingAxis[i])
+	}//	if (i<numPairs)
+__kernel void   clipFacesAndFindContactsKernel(    __global const float4* separatingNormals, // Kernel: for each flagged pair, clips the stored face-B polygon against face A's edge planes and emits (x,y,z,depth) points. separatingNormals is not read here.
+                                                   __global const int* hasSeparatingAxis, // pairs whose entry is zero are skipped
+                                                   __global int4* clippingFacesOut, // in: (faceA, faceB, numVertsA, numVertsB); .w is overwritten with the emitted point count
+                                                   __global float4* worldVertsA1,
+                                                   __global float4* worldNormalsA1,
+                                                   __global float4* worldVertsB1,
+                                                   __global float4* worldVertsB2,
+                                                    int vertexFaceCapacity, // per-pair stride of worldVertsA1, worldVertsB1 and worldVertsB2
+                                                   int numPairs,
+					                                        int debugMode // not read in this kernel
+                                                   )
+    int i = get_global_id(0); // the global work-item id is used directly as the pair index
+	int pairIndex = i;
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+	if (i<numPairs)
+	{
+		if (hasSeparatingAxis[i])
+		{
+//			int bodyIndexA = pairs[i].x;
+	//		int bodyIndexB = pairs[i].y;
+            int numLocalContactsOut = 0;
+            int capacityWorldVertsB2 = vertexFaceCapacity;
+            __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];
+            __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];
+            {
+                __global int4* clippingFaces = clippingFacesOut;
+                int closestFaceA = clippingFaces[pairIndex].x;
+                int closestFaceB = clippingFaces[pairIndex].y; // read but not used below
+                int numVertsInA = clippingFaces[pairIndex].z;
+                int numVertsInB = clippingFaces[pairIndex].w;
+                int numVertsOut = 0;
+                if (closestFaceA>=0) // when no face was selected on A the point count below stays 0
+                {
+                    // clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+                    for(int e0=0;e0<numVertsInA;e0++)
+                    {
+                        const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];
+                        const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)]; // next vertex of face A, wrapping to the first
+                        const float4 WorldEdge0 = aw - bw;
+                        float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];
+                        float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1); // plane normal built from the edge and the face-A normal (not normalized here)
+                        float4 worldA1 = aw;
+                        float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1); // plane offset chosen so the plane passes through vertex aw
+                        float4 planeNormalWS = planeNormalWS1;
+                        float planeEqWS=planeEqWS1;
+                        numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);
+                        __global float4* tmp = pVtxOut; // swap buffers: this pass's output becomes the next pass's input
+                        pVtxOut = pVtxIn;
+                        pVtxIn = tmp;
+                        numVertsInB = numVertsOut;
+                        numVertsOut = 0;
+                    }
+                    float4 planeNormalWS = worldNormalsA1[pairIndex];
+                    float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]); // face-A plane through its first stored vertex
+                    for (int i=0;i<numVertsInB;i++) // this `i` shadows the work-item index declared at the top of the kernel
+                    {
+                        float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;
+                        if (depth <=minDist) // raise depth to at least minDist
+                        {
+                            depth = minDist;
+                        }
+                        if (depth <=maxDist) // keep only points whose depth does not exceed maxDist
+                        {
+                            float4 pointInWorld = pVtxIn[i];
+                            pVtxOut[numLocalContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);
+                        }
+                    }
+                }
+                clippingFaces[pairIndex].w =numLocalContactsOut;
+            }
+            for (int i=0;i<numLocalContactsOut;i++) // copy the emitted points into the other buffer too, so both per-pair slices hold them whatever the swap parity
+                pVtxIn[i] = pVtxOut[i];
+		}//		if (hasSeparatingAxis[i])
+	}//	if (i<numPairs)
+__kernel void   newContactReductionKernel( __global int4* pairs, // Kernel: for each flagged pair with stored points, selects up to four of them and fills one b3Contact4Data entry. pairs: .x/.y body indices, .z read as an existing contact index, .w written with the destination index.
+                                                   __global const b3RigidBodyData_t* rigidBodies,
+                                                   __global const float4* separatingNormals,
+                                                   __global const int* hasSeparatingAxis, // pairs whose entry is zero are skipped
+                                                   __global struct b3Contact4Data* globalContactsOut,
+                                                   __global int4* clippingFaces, // only .w is read here, as the number of stored points for the pair
+                                                   __global float4* worldVertsB2, // per-pair point lists, stride vertexFaceCapacity
+                                                   volatile __global int* nGlobalContactsOut, // counter passed to AppendInc to obtain a new output slot
+                                                   int vertexFaceCapacity,
+												   int contactCapacity, // entries with dstIdx >= contactCapacity are not written
+                                                   int numPairs
+                                                   )
+    int i = get_global_id(0); // the global work-item id is used directly as the pair index
+	int pairIndex = i;
+    int4 contactIdx;
+    contactIdx=make_int4(0,1,2,3); // initial selection, passed by pointer to extractManifoldSequentialGlobal
+	if (i<numPairs)
+	{
+		if (hasSeparatingAxis[i])
+		{
+			int nPoints = clippingFaces[pairIndex].w;
+            if (nPoints>0)
+            {
+                 __global float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity];
+                float4 normal = -separatingNormals[i];
+                int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx); // helper defined outside this view; presumably returns how many contactIdx entries are valid - TODO confirm
+				int mprContactIndex = pairs[pairIndex].z;
+                int dstIdx = mprContactIndex; // a non-negative .z is reused as the destination slot
+				if (dstIdx<0)
+				{
+	                AppendInc( nGlobalContactsOut, dstIdx ); // NOTE(review): runs before the capacity test below, so a slot is requested even if dstIdx ends up >= contactCapacity
+				}
+//#if 0
+				if (dstIdx < contactCapacity)
+				{
+					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+					c->m_worldNormalOnB = -normal; // double negation: equals separatingNormals[i]
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); // fixed restitution 0 and friction 0.7, each scaled by 0xffff
+					c->m_batchIdx = pairIndex;
+					int bodyA = pairs[pairIndex].x;
+					int bodyB = pairs[pairIndex].y;
+					pairs[pairIndex].w = dstIdx; // record which contact slot this pair wrote
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; // index is negated when m_invMass==0; NOTE(review): negating index 0 leaves it unchanged
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+                    c->m_childIndexA =-1;
+					c->m_childIndexB =-1;
+                    switch (nReducedContacts) // no break statements: each case also executes the cases below it
+                    {
+                        case 4:
+                            c->m_worldPosB[3] = pointsIn[contactIdx.w];
+                        case 3:
+                            c->m_worldPosB[2] = pointsIn[contactIdx.z];
+                        case 2:
+                            c->m_worldPosB[1] = pointsIn[contactIdx.y];
+                        case 1:
+							if (mprContactIndex<0)//test
+	                            c->m_worldPosB[0] = pointsIn[contactIdx.x]; // slot 0 is written only when a new slot was requested (pairs[].z was negative)
+                        default:
+                        {
+                        }
+                    };
+					GET_NPOINTS(*c) = nReducedContacts; // macro defined outside this view; presumably stores the count in m_worldNormalOnB.w - TODO confirm
+                 }
+			}//		if (nPoints>0)
+		}//		if (hasSeparatingAxis[i])
+	}//	if (i<numPairs)
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h
new file mode 100644
index 00000000..f0ecfc78
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h
@@ -0,0 +1,2099 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* satClipKernelsCL= \
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#define counter32_t volatile __global int*\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"typedef unsigned int u32;\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
+"	b3Quat q;\n"
+"	q=in;\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"typedef struct b3GpuFace b3GpuFace_t;\n"
+"struct b3GpuFace\n"
+"	b3Float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"	int m_unusedPadding1;\n"
+"	int m_unusedPadding2;\n"
+"typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;\n"
+"struct b3ConvexPolyhedronData\n"
+"	b3Float4		m_localCenter;\n"
+"	b3Float4		m_extents;\n"
+"	b3Float4		mC;\n"
+"	b3Float4		mE;\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"#ifndef B3_COLLIDABLE_H\n"
+"#define B3_COLLIDABLE_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"enum b3ShapeTypes\n"
+"	SHAPE_PLANE=4,\n"
+"typedef struct b3Collidable b3Collidable_t;\n"
+"struct b3Collidable\n"
+"	union {\n"
+"		int m_numChildShapes;\n"
+"		int m_bvhIndex;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float m_radius;\n"
+"		int	m_compoundBvhIndex;\n"
+"	};\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"typedef struct b3GpuChildShape b3GpuChildShape_t;\n"
+"struct b3GpuChildShape\n"
+"	b3Float4	m_childPosition;\n"
+"	b3Quat		m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"struct b3CompoundOverlappingPair\n"
+"	int m_bodyIndexA;\n"
+"	int m_bodyIndexB;\n"
+"//	int	m_pairType;\n"
+"	int m_childShapeIndexA;\n"
+"	int m_childShapeIndexB;\n"
+"#endif //B3_COLLIDABLE_H\n"
+"#ifndef B3_RIGIDBODY_DATA_H\n"
+"#define B3_RIGIDBODY_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"typedef struct\n"
+"	b3Float4 m_row[3];\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n"
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"b3Mat3x3 mtZero();\n"
+"b3Mat3x3 mtIdentity();\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"b3Mat3x3 mtZero()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"b3Mat3x3 mtIdentity()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
+"struct b3RigidBodyData\n"
+"	b3Float4				m_pos;\n"
+"	b3Quat					m_quat;\n"
+"	b3Float4				m_linVel;\n"
+"	b3Float4				m_angVel;\n"
+"	int 					m_collidableIdx;\n"
+"	float 				m_invMass;\n"
+"	float 				m_restituitionCoeff;\n"
+"	float 				m_frictionCoeff;\n"
+"typedef struct b3InertiaData b3InertiaData_t;\n"
+"struct b3InertiaData\n"
+"	b3Mat3x3 m_invInertiaWorld;\n"
+"	b3Mat3x3 m_initInvInertia;\n"
+"#endif //B3_RIGIDBODY_DATA_H\n"
+"	\n"
+"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"float fastDiv(float numerator, float denominator)\n"
+"	return native_divide(numerator, denominator);	\n"
+"//	return numerator/denominator;	\n"
+"float4 fastDiv4(float4 numerator, float4 denominator)\n"
+"	return native_divide(numerator, denominator);	\n"
+"float4 cross3(float4 a, float4 b)\n"
+"	return cross(a,b);\n"
+"//#define dot3F4 dot\n"
+"float dot3F4(float4 a, float4 b)\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"float4 fastNormalize4(float4 v)\n"
+"	return fast_normalize(v);\n"
+"//	Quaternion\n"
+"typedef float4 Quaternion;\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"	return fastNormalize4(in);\n"
+"//	in /= length( in );\n"
+"//	return in;\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"	return qtRotate( *orientation, *p ) + (*translation);\n"
+"float4 normalize3(const float4 a)\n"
+"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"__inline float4 lerp3(const float4 a,const float4 b, float  t)\n"
+"	return make_float4(	a.x + (b.x - a.x) * t,\n"
+"						a.y + (b.y - a.y) * t,\n"
+"						a.z + (b.z - a.z) * t,\n"
+"						0.f);\n"
+"// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n"
+"int clipFaceGlobal(__global const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, __global float4* ppVtxOut)\n"
+"	\n"
+"	int ve;\n"
+"	float ds, de;\n"
+"	int numVertsOut = 0;\n"
+"    //double-check next test\n"
+"    	if (numVertsIn < 2)\n"
+"    		return 0;\n"
+"    \n"
+"	float4 firstVertex=pVtxIn[numVertsIn-1];\n"
+"	float4 endVertex = pVtxIn[0];\n"
+"	\n"
+"	ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n"
+"    \n"
+"	for (ve = 0; ve < numVertsIn; ve++)\n"
+"	{\n"
+"		endVertex=pVtxIn[ve];\n"
+"		de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n"
+"		if (ds<0)\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start < 0, end < 0, so output endVertex\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				// Start < 0, end >= 0, so output intersection\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"			}\n"
+"		}\n"
+"		else\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start >= 0, end < 0 so output intersection and end\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"		}\n"
+"		firstVertex = endVertex;\n"
+"		ds = de;\n"
+"	}\n"
+"	return numVertsOut;\n"
+"// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n"
+"int clipFace(const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, float4* ppVtxOut)\n" // clipFace: same edge walk as clipFaceGlobal, but both pointers are unqualified (no __global); a vertex v counts as kept when dot3F4(planeNormalWS, v) + planeEqWS < 0.
+"	\n"
+"	int ve;\n"
+"	float ds, de;\n" // plane value at the start (ds) and end (de) of the current edge
+"	int numVertsOut = 0;\n"
+"//double-check next test\n"
+"	if (numVertsIn < 2)\n" // fewer than two vertices: nothing is written and 0 is returned
+"		return 0;\n"
+"	float4 firstVertex=pVtxIn[numVertsIn-1];\n" // the first edge runs from the last vertex to vertex 0
+"	float4 endVertex = pVtxIn[0];\n" // overwritten on the first loop iteration
+"	\n"
+"	ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n"
+"	for (ve = 0; ve < numVertsIn; ve++)\n" // NOTE(review): there is no capacity argument for ppVtxOut and one edge can emit two vertices, so the caller must size the output accordingly - confirm
+"	{\n"
+"		endVertex=pVtxIn[ve];\n"
+"		de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n"
+"		if (ds<0)\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start < 0, end < 0, so output endVertex\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				// Start < 0, end >= 0, so output intersection\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" // t = ds/(ds-de) is where the linearly interpolated plane value reaches 0
+"			}\n"
+"		}\n"
+"		else\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start >= 0, end < 0 so output intersection and end\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"		}\n" // start >= 0 and end >= 0: the edge produces no output
+"		firstVertex = endVertex;\n" // the end of this edge is the start of the next one
+"		ds = de;\n"
+"	}\n"
+"	return numVertsOut;\n"
+"// Clips the world-space polygon in worldVertsB1 against hull A and writes the resulting contacts to contactsOut.\n"
+"//  - The reference face is the face of hullA whose rotated normal has the smallest dot3F4 with separatingNormal.\n"
+"//  - The polygon is clipped with clipFace() once per edge of that face, ping-ponging between worldVertsB1 and\n"
+"//    worldVertsB2 (both buffers are overwritten).\n"
+"//  - Each remaining vertex whose plane value against the reference face is <= maxDist is stored as\n"
+"//    (x, y, z, depth), where depth is raised to minDist when it is below it.\n"
+"// Returns the number of contacts written, which never exceeds contactCapacity.\n"
+"// NOTE(review): capacityWorldVertsB2 is not consulted and clipFace() has no capacity argument, so both scratch\n"
+"// buffers are assumed large enough for every intermediate polygon - confirm against the callers.\n"
+"int clipFaceAgainstHull(const float4 separatingNormal, __global const b3ConvexPolyhedronData_t* hullA,  \n"
+"	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n"
+"	float4* worldVertsB2, int capacityWorldVertsB2,\n"
+"	const float minDist, float maxDist,\n"
+"	__global const float4* vertices,\n"
+"	__global const b3GpuFace_t* faces,\n"
+"	__global const int* indices,\n"
+"	float4* contactsOut,\n"
+"	int contactCapacity)\n"
+"	int numContactsOut = 0;\n"
+"	float4* pVtxIn = worldVertsB1;\n"
+"	float4* pVtxOut = worldVertsB2;\n"
+"	\n"
+"	int numVertsIn = numWorldVertsB1;\n"
+"	int numVertsOut = 0;\n"
+"	int closestFaceA=-1;\n"
+"	{\n"
+"		// pick the face of A whose world-space normal gives the smallest dot product with separatingNormal\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"				faces[hullA->m_faceOffset+face].m_plane.x, \n"
+"				faces[hullA->m_faceOffset+face].m_plane.y, \n"
+"				faces[hullA->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"		\n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	// no face selected: nothing to clip against\n"
+"	if (closestFaceA<0)\n"
+"		return numContactsOut;\n"
+"	b3GpuFace_t polyA = faces[hullA->m_faceOffset+closestFaceA];\n"
+"	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"	int numVerticesA = polyA.m_numIndices;\n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"		const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]];\n"
+"		const float4 b = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n"
+"		const float4 edge0 = a - b;\n"
+"		const float4 WorldEdge0 = qtRotate(ornA,edge0);\n"
+"		float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"		float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n"
+"		// side plane through this edge: its normal is -(edge x face normal), and it passes through vertex a\n"
+"		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n"
+"		float4 worldA1 = transform(&a,&posA,&ornA);\n"
+"		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n"
+"		\n"
+"		float4 planeNormalWS = planeNormalWS1;\n"
+"		float planeEqWS=planeEqWS1;\n"
+"		\n"
+"		//clip face\n"
+"		//clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n"
+"		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
+"		//btSwap(pVtxIn,pVtxOut);\n"
+"		float4* tmp = pVtxOut;\n"
+"		pVtxOut = pVtxIn;\n"
+"		pVtxIn = tmp;\n"
+"		numVertsIn = numVertsOut;\n"
+"		numVertsOut = 0;\n"
+"	}\n"
+"	\n"
+"	// only keep points that are behind the witness face\n"
+"	{\n"
+"		float4 localPlaneNormal  = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"		float localPlaneEq = polyA.m_plane.w;\n"
+"		float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n"
+"		float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n"
+"		// stop as soon as contactsOut is full so it is never written past contactCapacity\n"
+"		for (int i=0;i<numVertsIn && numContactsOut<contactCapacity;i++)\n"
+"		{\n"
+"			float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"			if (depth <=minDist)\n"
+"			{\n"
+"				depth = minDist;\n"
+"			}\n"
+"			if (depth <=maxDist)\n"
+"			{\n"
+"				float4 pointInWorld = pVtxIn[i];\n"
+"				//resultOut.addContactPoint(separatingNormal,point,depth);\n"
+"				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	return numContactsOut;\n"
+"// Variant of clipFaceAgainstHull() in which hull A and its vertex/face/index arrays (verticesA, facesA,\n"
+"// indicesA) are passed through unqualified pointers instead of __global ones.\n"
+"//  - The reference face is the face of hullA whose rotated normal has the smallest dot3F4 with separatingNormal.\n"
+"//  - The polygon in worldVertsB1 is clipped with clipFace() once per edge of that face, ping-ponging between\n"
+"//    worldVertsB1 and worldVertsB2 (both buffers are overwritten).\n"
+"//  - Each remaining vertex whose plane value against the reference face is <= maxDist is stored as\n"
+"//    (x, y, z, depth), where depth is raised to minDist when it is below it.\n"
+"// verticesB, facesB and indicesB are accepted but not read here.\n"
+"// Returns the number of contacts written, which never exceeds contactCapacity.\n"
+"// NOTE(review): capacityWorldVertsB2 is not consulted and clipFace() has no capacity argument, so both scratch\n"
+"// buffers are assumed large enough for every intermediate polygon - confirm against the callers.\n"
+"int clipFaceAgainstHullLocalA(const float4 separatingNormal, const b3ConvexPolyhedronData_t* hullA,  \n"
+"	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n"
+"	float4* worldVertsB2, int capacityWorldVertsB2,\n"
+"	const float minDist, float maxDist,\n"
+"	const float4* verticesA,\n"
+"	const b3GpuFace_t* facesA,\n"
+"	const int* indicesA,\n"
+"	__global const float4* verticesB,\n"
+"	__global const b3GpuFace_t* facesB,\n"
+"	__global const int* indicesB,\n"
+"	float4* contactsOut,\n"
+"	int contactCapacity)\n"
+"	int numContactsOut = 0;\n"
+"	float4* pVtxIn = worldVertsB1;\n"
+"	float4* pVtxOut = worldVertsB2;\n"
+"	\n"
+"	int numVertsIn = numWorldVertsB1;\n"
+"	int numVertsOut = 0;\n"
+"	int closestFaceA=-1;\n"
+"	{\n"
+"		// pick the face of A whose world-space normal gives the smallest dot product with separatingNormal\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"				facesA[hullA->m_faceOffset+face].m_plane.x, \n"
+"				facesA[hullA->m_faceOffset+face].m_plane.y, \n"
+"				facesA[hullA->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"		\n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	// no face selected: nothing to clip against\n"
+"	if (closestFaceA<0)\n"
+"		return numContactsOut;\n"
+"	b3GpuFace_t polyA = facesA[hullA->m_faceOffset+closestFaceA];\n"
+"	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"	int numVerticesA = polyA.m_numIndices;\n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"		const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];\n"
+"		const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n"
+"		const float4 edge0 = a - b;\n"
+"		const float4 WorldEdge0 = qtRotate(ornA,edge0);\n"
+"		float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"		float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n"
+"		// side plane through this edge: its normal is -(edge x face normal), and it passes through vertex a\n"
+"		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n"
+"		float4 worldA1 = transform(&a,&posA,&ornA);\n"
+"		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n"
+"		\n"
+"		float4 planeNormalWS = planeNormalWS1;\n"
+"		float planeEqWS=planeEqWS1;\n"
+"		\n"
+"		//clip face\n"
+"		//clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n"
+"		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
+"		//btSwap(pVtxIn,pVtxOut);\n"
+"		float4* tmp = pVtxOut;\n"
+"		pVtxOut = pVtxIn;\n"
+"		pVtxIn = tmp;\n"
+"		numVertsIn = numVertsOut;\n"
+"		numVertsOut = 0;\n"
+"	}\n"
+"	\n"
+"	// only keep points that are behind the witness face\n"
+"	{\n"
+"		float4 localPlaneNormal  = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"		float localPlaneEq = polyA.m_plane.w;\n"
+"		float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n"
+"		float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n"
+"		// stop as soon as contactsOut is full so it is never written past contactCapacity\n"
+"		for (int i=0;i<numVertsIn && numContactsOut<contactCapacity;i++)\n"
+"		{\n"
+"			float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"			if (depth <=minDist)\n"
+"			{\n"
+"				depth = minDist;\n"
+"			}\n"
+"			if (depth <=maxDist)\n"
+"			{\n"
+"				float4 pointInWorld = pVtxIn[i];\n"
+"				//resultOut.addContactPoint(separatingNormal,point,depth);\n"
+"				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	return numContactsOut;\n"
+"// Finds the face of hullB whose rotated normal has the largest dot3F4 with separatingNormal, transforms the\n"
+"// vertices of that face with (posB, ornB) into worldVertsB1 and clips the polygon against hull A through\n"
+"// clipFaceAgainstHull(). Returns the number of contacts written to localContactsOut.\n"
+"// Returns 0 without reading any face when no face of hullB was selected (closestFaceB stays -1).\n"
+"// At most capacityWorldVerts vertices of the selected face are copied; a larger face is truncated.\n"
+"int	clipHullAgainstHull(const float4 separatingNormal,\n"
+"	__global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n"
+"	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n"
+"	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
+"	const float minDist, float maxDist,\n"
+"	__global const float4* vertices,\n"
+"	__global const b3GpuFace_t* faces,\n"
+"	__global const int* indices,\n"
+"	float4*	localContactsOut,\n"
+"	int localContactCapacity)\n"
+"	int numContactsOut = 0;\n"
+"	int numWorldVertsB1= 0;\n"
+"	int closestFaceB=-1;\n"
+"	float dmax = -FLT_MAX;\n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, \n"
+"				faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	// no face was selected (for example m_numFaces == 0): indexing faces[] with -1 would read out of bounds\n"
+"	if (closestFaceB<0)\n"
+"		return numContactsOut;\n"
+"	{\n"
+"		const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
+"		const int numVertices = polyB.m_numIndices;\n"
+"		// never write past the end of worldVertsB1\n"
+"		for(int e0=0;e0<numVertices && numWorldVertsB1<capacityWorldVerts;e0++)\n"
+"		{\n"
+"			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n"
+"			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"		}\n"
+"	}\n"
+"	numContactsOut = clipFaceAgainstHull(separatingNormal, hullA, \n"
+"			posA,ornA,\n"
+"			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,vertices,\n"
+"			faces,\n"
+"			indices,localContactsOut,localContactCapacity);\n"
+"	return numContactsOut;\n"
+"// Variant of clipHullAgainstHull() in which hull A and its arrays (verticesA, facesA, indicesA) are passed\n"
+"// through unqualified pointers, while hull B is read from the __global arrays verticesB, facesB and indicesB.\n"
+"// Finds the face of hullB whose rotated normal has the largest dot3F4 with separatingNormal, transforms the\n"
+"// vertices of that face with (posB, ornB) into worldVertsB1 and clips the polygon against hull A through\n"
+"// clipFaceAgainstHullLocalA(). Returns the number of contacts written to localContactsOut.\n"
+"// Returns 0 without reading any face when no face of hullB was selected (closestFaceB stays -1).\n"
+"// At most capacityWorldVerts vertices of the selected face are copied; a larger face is truncated.\n"
+"int	clipHullAgainstHullLocalA(const float4 separatingNormal,\n"
+"	const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n"
+"	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n"
+"	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
+"	const float minDist, float maxDist,\n"
+"	const float4* verticesA,\n"
+"	const b3GpuFace_t* facesA,\n"
+"	const int* indicesA,\n"
+"	__global const float4* verticesB,\n"
+"	__global const b3GpuFace_t* facesB,\n"
+"	__global const int* indicesB,\n"
+"	float4*	localContactsOut,\n"
+"	int localContactCapacity)\n"
+"	int numContactsOut = 0;\n"
+"	int numWorldVertsB1= 0;\n"
+"	int closestFaceB=-1;\n"
+"	float dmax = -FLT_MAX;\n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, \n"
+"				facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	// no face was selected (for example m_numFaces == 0): indexing facesB[] with -1 would read out of bounds\n"
+"	if (closestFaceB<0)\n"
+"		return numContactsOut;\n"
+"	{\n"
+"		const b3GpuFace_t polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
+"		const int numVertices = polyB.m_numIndices;\n"
+"		// never write past the end of worldVertsB1\n"
+"		for(int e0=0;e0<numVertices && numWorldVertsB1<capacityWorldVerts;e0++)\n"
+"		{\n"
+"			const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n"
+"			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"		}\n"
+"	}\n"
+"	numContactsOut = clipFaceAgainstHullLocalA(separatingNormal, hullA, \n"
+"			posA,ornA,\n"
+"			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,\n"
+"			verticesA,facesA,indicesA,\n"
+"			verticesB,facesB,indicesB,\n"
+"			localContactsOut,localContactCapacity);\n"
+"	return numContactsOut;\n"
+"#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j];\n" // sequential loop: adds v[1]..v[n-1] into v[0]; expands to a bare for statement that declares j
+"#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}\n" // runs `execution` n times; the loop variable ie is visible inside `execution`
+"#define REDUCE_MAX(v, n) {int i=0; for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; }\n" // leaves in v[0] an element with the largest .y among v[0..n-1]; i stays 0, and the first pass compares v[0] with itself
+"#define REDUCE_MIN(v, n) {int i=0; for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; }\n" // same as REDUCE_MAX but keeps an element with the smallest .y
+"int extractManifoldSequentialGlobal(__global const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n" // extractManifoldSequentialGlobal: chooses four indices into p and stores them in contactIdx[0].x/.y/.z/.w. Returns 0 for nPoints == 0, nPoints itself when nPoints <= 4 (contactIdx is not written in either case), otherwise 4.
+"	if( nPoints == 0 )\n"
+"        return 0;\n"
+"    \n"
+"    if (nPoints <=4)\n"
+"        return nPoints;\n"
+"    \n"
+"    \n"
+"    if (nPoints >64)\n" // only the first 64 points are considered
+"        nPoints = 64;\n"
+"    \n"
+"	float4 center = make_float4(0.f);\n"
+"	{\n"
+"		\n"
+"		for (int i=0;i<nPoints;i++)\n"
+"			center += p[i];\n" // all four components are summed, w included
+"		center /= (float)nPoints;\n"
+"	}\n"
+"    \n"
+"	\n"
+"    \n"
+"	//	sample 4 directions\n"
+"    \n"
+"    float4 aVector = p[0] - center;\n"
+"    float4 u = cross3( nearNormal, aVector );\n" // u and v are built from nearNormal with cross3 and then normalized
+"    float4 v = cross3( nearNormal, u );\n"
+"    u = normalize3( u );\n"
+"    v = normalize3( v );\n"
+"    \n"
+"    \n"
+"    //keep point with deepest penetration\n"
+"    float minW= FLT_MAX;\n"
+"    \n"
+"    int minIndex=-1;\n"
+"    \n"
+"    float4 maxDots;\n" // NOTE(review): despite the name, the updates below use '<', so each lane keeps the smallest projection seen
+"    maxDots.x = FLT_MIN;\n" // NOTE(review): FLT_MIN is the smallest positive normalized float, not the most negative one; a lane whose projections are never below it leaves its contactIdx component unwritten - confirm this is intended
+"    maxDots.y = FLT_MIN;\n"
+"    maxDots.z = FLT_MIN;\n"
+"    maxDots.w = FLT_MIN;\n"
+"    \n"
+"    //	idx, distance\n"
+"    for(int ie = 0; ie<nPoints; ie++ )\n"
+"    {\n"
+"        if (p[ie].w<minW)\n" // remember the index of the point with the smallest w
+"        {\n"
+"            minW = p[ie].w;\n"
+"            minIndex=ie;\n"
+"        }\n"
+"        float f;\n"
+"        float4 r = p[ie]-center;\n"
+"        f = dot3F4( u, r );\n" // lane x: projection on u
+"        if (f<maxDots.x)\n"
+"        {\n"
+"            maxDots.x = f;\n"
+"            contactIdx[0].x = ie;\n"
+"        }\n"
+"        \n"
+"        f = dot3F4( -u, r );\n" // lane y: projection on -u
+"        if (f<maxDots.y)\n"
+"        {\n"
+"            maxDots.y = f;\n"
+"            contactIdx[0].y = ie;\n"
+"        }\n"
+"        \n"
+"        \n"
+"        f = dot3F4( v, r );\n" // lane z: projection on v
+"        if (f<maxDots.z)\n"
+"        {\n"
+"            maxDots.z = f;\n"
+"            contactIdx[0].z = ie;\n"
+"        }\n"
+"        \n"
+"        f = dot3F4( -v, r );\n" // lane w: projection on -v
+"        if (f<maxDots.w)\n"
+"        {\n"
+"            maxDots.w = f;\n"
+"            contactIdx[0].w = ie;\n"
+"        }\n"
+"        \n"
+"    }\n"
+"    \n"
+"    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n" // the smallest-w point is not among the four picks
+"    {\n"
+"        //replace the first contact with minimum (todo: replace contact with least penetration)\n"
+"        contactIdx[0].x = minIndex;\n"
+"    }\n"
+"    \n"
+"    return 4;\n"
+"    \n"
+"int extractManifoldSequentialGlobalFake(__global const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n" // extractManifoldSequentialGlobalFake: writes the identity indices 0..3 to contactIdx and returns min2(nPoints, 4) (0 when nPoints == 0); p and nearNormal are not read.
+"    contactIdx[0] = 0;\n" // the four entries are written before nPoints is looked at
+"    contactIdx[1] = 1;\n"
+"    contactIdx[2] = 2;\n"
+"    contactIdx[3] = 3;\n"
+"    \n"
+"	if( nPoints == 0 ) return 0;\n"
+"    \n"
+"	nPoints = min2( nPoints, 4 );\n"
+"    return nPoints;\n"
+"    \n"
+"int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n" // extractManifoldSequential: writes up to four indices into p to contactIdx. Returns 0 for nPoints == 0, nPoints (identity indices) when nPoints < 4, otherwise 4. At most the first 64 points are used.
+"	if( nPoints == 0 ) return 0;\n"
+"	nPoints = min2( nPoints, 64 );\n" // the fixed-size arrays below hold 64 entries
+"	float4 center = make_float4(0.f);\n"
+"	{\n"
+"		float4 v[64];\n"
+"		for (int i=0;i<nPoints;i++)\n"
+"			v[i] = p[i];\n"
+"		//memcpy( v, p, nPoints*sizeof(float4) );\n"
+"		PARALLEL_SUM( v, nPoints );\n" // sums the copied points into v[0]
+"		center = v[0]/(float)nPoints;\n"
+"	}\n"
+"	\n"
+"	{	//	sample 4 directions\n"
+"		if( nPoints < 4 )\n"
+"		{\n"
+"			for(int i=0; i<nPoints; i++) \n"
+"				contactIdx[i] = i;\n"
+"			return nPoints;\n"
+"		}\n"
+"		float4 aVector = p[0] - center;\n"
+"		float4 u = cross3( nearNormal, aVector );\n" // u and v are built from nearNormal with cross3 and then normalized
+"		float4 v = cross3( nearNormal, u );\n"
+"		u = normalize3( u );\n"
+"		v = normalize3( v );\n"
+"		int idx[4];\n"
+"		float2 max00 = make_float2(0,FLT_MAX);\n" // assigned again below but never read afterwards
+"		{\n"
+"			//	idx, distance\n"
+"			{\n"
+"				{\n"
+"					int4 a[64];\n"
+"					for(int ie = 0; ie<nPoints; ie++ )\n"
+"					{\n"
+"						\n"
+"						\n"
+"						float f;\n"
+"						float4 r = p[ie]-center;\n"
+"						f = dot3F4( u, r );\n"
+"						a[ie].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" // key = bit pattern of f with its low 8 bits replaced by the point index, so the index travels with the value through the max below
+"						f = dot3F4( -u, r );\n"
+"						a[ie].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
+"						f = dot3F4( v, r );\n"
+"						a[ie].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
+"						f = dot3F4( -v, r );\n"
+"						a[ie].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
+"					}\n"
+"					for(int ie=0; ie<nPoints; ie++)\n" // per-lane integer maximum of the keys, accumulated in a[0]
+"					{\n"
+"						a[0].x = (a[0].x > a[ie].x )? a[0].x: a[ie].x;\n"
+"						a[0].y = (a[0].y > a[ie].y )? a[0].y: a[ie].y;\n"
+"						a[0].z = (a[0].z > a[ie].z )? a[0].z: a[ie].z;\n"
+"						a[0].w = (a[0].w > a[ie].w )? a[0].w: a[ie].w;\n"
+"					}\n"
+"					idx[0] = (int)a[0].x & 0xff;\n" // the low byte of each winning key is the point index
+"					idx[1] = (int)a[0].y & 0xff;\n"
+"					idx[2] = (int)a[0].z & 0xff;\n"
+"					idx[3] = (int)a[0].w & 0xff;\n"
+"				}\n"
+"			}\n"
+"			{\n"
+"				float2 h[64];\n"
+"				PARALLEL_DO( h[ie] = make_float2((float)ie, p[ie].w), nPoints );\n" // h[ie] = (index, w)
+"				REDUCE_MIN( h, nPoints );\n"
+"				max00 = h[0];\n" // NOTE(review): the element with the smallest w is computed here but not used for the result
+"			}\n"
+"		}\n"
+"		contactIdx[0] = idx[0];\n"
+"		contactIdx[1] = idx[1];\n"
+"		contactIdx[2] = idx[2];\n"
+"		contactIdx[3] = idx[3];\n"
+"		return 4;\n"
+"	}\n"
+"// One work-item per pair idx (idx < numPairs): copies the candidate points of the pair into private memory,\n"
+"// reduces them to at most four with extractManifoldSequential() and appends one b3Contact4Data to contactsOut.\n"
+"// The candidates are the contactCounts[idx] points starting at closestPointsWorld[contactOffsets[idx]];\n"
+"// at most the first 64 of them are used.\n"
+"// Nothing is stored when the slot obtained through AppendInc is not below contactCapacity.\n"
+"// NOTE(review): the body indices are read from pairs[pairIndex] (the kernel argument), not pairs[idx] - confirm.\n"
+"__kernel void   extractManifoldAndAddContactKernel(__global const int4* pairs, \n"
+"																	__global const b3RigidBodyData_t* rigidBodies, \n"
+"																	__global const float4* closestPointsWorld,\n"
+"																	__global const float4* separatingNormalsWorld,\n"
+"																	__global const int* contactCounts,\n"
+"																	__global const int* contactOffsets,\n"
+"																	__global struct b3Contact4Data* restrict contactsOut,\n"
+"																	counter32_t nContactsOut,\n"
+"																	int contactCapacity,\n"
+"																	int numPairs,\n"
+"																	int pairIndex\n"
+"																	)\n"
+"	int idx = get_global_id(0);\n"
+"	\n"
+"	if (idx<numPairs)\n"
+"	{\n"
+"		float4 normal = separatingNormalsWorld[idx];\n"
+"		// localPoints holds 64 entries, and extractManifoldSequential() only uses the first 64 points anyway,\n"
+"		// so clamp here to keep the copy below inside the array\n"
+"		int nPoints = min2( contactCounts[idx], 64 );\n"
+"		__global const float4* pointsIn = &closestPointsWorld[contactOffsets[idx]];\n"
+"		float4 localPoints[64];\n"
+"		for (int i=0;i<nPoints;i++)\n"
+"		{\n"
+"			localPoints[i] = pointsIn[i];\n"
+"		}\n"
+"		int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"		contactIdx[0] = -1;\n"
+"		contactIdx[1] = -1;\n"
+"		contactIdx[2] = -1;\n"
+"		contactIdx[3] = -1;\n"
+"		int nContacts = extractManifoldSequential(localPoints, nPoints, normal, contactIdx);\n"
+"		int dstIdx;\n"
+"		AppendInc( nContactsOut, dstIdx );\n"
+"		if (dstIdx<contactCapacity)\n"
+"		{\n"
+"			__global struct b3Contact4Data* c = contactsOut + dstIdx;\n"
+"			c->m_worldNormalOnB = -normal;\n"
+"			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"			c->m_batchIdx = idx;\n"
+"			int bodyA = pairs[pairIndex].x;\n"
+"			int bodyB = pairs[pairIndex].y;\n"
+"			// a body whose m_invMass is 0 is stored with a negated index\n"
+"			c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n"
+"			c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n"
+"			c->m_childIndexA = -1;\n"
+"			c->m_childIndexB = -1;\n"
+"			for (int i=0;i<nContacts;i++)\n"
+"			{\n"
+"				c->m_worldPosB[i] = localPoints[contactIdx[i]];\n"
+"			}\n"
+"			GET_NPOINTS(*c) = nContacts;\n"
+"		}\n"
+"	}\n"
+"void	trInverse(float4 translationIn, Quaternion orientationIn,\n" // trInverse: writes qtInvert(orientationIn) to *orientationOut and that rotation applied to -translationIn to *translationOut.
+"		float4* translationOut, Quaternion* orientationOut)\n"
+"	*orientationOut = qtInvert(orientationIn);\n"
+"	*translationOut = qtRotate(*orientationOut, -translationIn);\n" // uses the orientation written on the previous line, so orientationOut must be assigned first
+"void	trMul(float4 translationA, Quaternion orientationA,\n" // trMul: combines two (translation, orientation) pairs; the orientation is qtMul(A, B) and the translation is translationB passed through transform() with A's translation and orientation.
+"						float4 translationB, Quaternion orientationB,\n"
+"		float4* translationOut, Quaternion* orientationOut)\n"
+"	*orientationOut = qtMul(orientationA,orientationB);\n"
+"	*translationOut = transform(&translationB,&translationA,&orientationA);\n" // transform() is not defined in this part of the file
+"__kernel void   clipHullHullKernel( __global int4* pairs, \n" // clipHullHullKernel: one work-item per pair i. For pairs flagged in hasSeparatingAxis it clips the two hulls with clipHullAgainstHull(), reduces the contacts to at most four with extractManifoldSequential() and stores one b3Contact4Data in globalContactsOut.
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const b3ConvexPolyhedronData_t* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n" // not read in this kernel
+"																					__global const b3GpuFace_t* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global const float4* separatingNormals,\n"
+"																					__global const int* hasSeparatingAxis,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int numPairs,\n"
+"																					int contactCapacity)\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	float4 worldVertsB1[64];\n" // scratch polygon buffers handed to clipHullAgainstHull
+"	float4 worldVertsB2[64];\n"
+"	int capacityWorldVerts = 64;	\n"
+"	float4 localContactsOut[64];\n"
+"	int localContactCapacity=64;\n"
+"	\n"
+"	float minDist = -1e30f;\n" // depth clamp and cut-off passed to clipHullAgainstHull
+"	float maxDist = 0.02f;\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"			\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"			\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"			\n"
+"		\n"
+"			int numLocalContactsOut = clipHullAgainstHull(separatingNormals[i],\n"
+"														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n"
+"														rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n"
+"													  rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n"
+"													  worldVertsB1,worldVertsB2,capacityWorldVerts,\n"
+"														minDist, maxDist,\n"
+"														vertices,faces,indices,\n"
+"														localContactsOut,localContactCapacity);\n"
+"												\n"
+"		if (numLocalContactsOut>0)\n"
+"		{\n"
+"				float4 normal = -separatingNormals[i];\n" // negated here and negated again when stored in m_worldNormalOnB below
+"				int nPoints = numLocalContactsOut;\n"
+"				float4* pointsIn = localContactsOut;\n"
+"				int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"				contactIdx[0] = -1;\n"
+"				contactIdx[1] = -1;\n"
+"				contactIdx[2] = -1;\n"
+"				contactIdx[3] = -1;\n"
+"		\n"
+"				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n"
+"		\n"
+"				\n"
+"				int mprContactIndex = pairs[pairIndex].z;\n" // a non-negative value is reused as the destination slot; a negative one requests a new slot below
+"				int dstIdx = mprContactIndex;\n"
+"				if (dstIdx<0)\n"
+"				{\n"
+"					AppendInc( nGlobalContactsOut, dstIdx );\n" // AppendInc is a macro defined outside this part of the file
+"				}\n"
+"				if (dstIdx<contactCapacity)\n"
+"				{\n"
+"					pairs[pairIndex].z = dstIdx;\n" // record the slot in the pair
+"					__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n"
+"					c->m_worldNormalOnB = -normal;\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" // hard-coded restitution 0 and friction 0.7, each scaled by 0xffff
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = pairs[pairIndex].x;\n"
+"					int bodyB = pairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" // a body whose m_invMass is 0 is stored with a negated index (index 0 is unchanged by the negation)
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"					c->m_childIndexA = -1;\n"
+"					c->m_childIndexB = -1;\n"
+"					for (int i=0;i<nReducedContacts;i++)\n" // this i shadows the work-item index i declared at the top
+"					{\n"
+"					//this condition means: overwrite contact point, unless at index i==0 we have a valid 'mpr' contact\n"
+"						if (i>0||(mprContactIndex<0))\n"
+"						{\n"
+"							c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n"
+"						}\n"
+"					}\n"
+"					GET_NPOINTS(*c) = nReducedContacts;\n" // GET_NPOINTS is a macro defined outside this part of the file
+"				}\n"
+"				\n"
+"			}//		if (numContactsOut>0)\n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"__kernel void   clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, \n" // clipCompoundsHullHullKernel: one work-item per compound pair i. For pairs flagged in gpuHasCompoundSepNormalsOut it resolves the (optional) child shape of each body, clips the two hulls with clipHullAgainstHull(), reduces the contacts with extractManifoldSequential() and appends one b3Contact4Data.
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const b3ConvexPolyhedronData_t* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n" // not read in this kernel
+"																					__global const b3GpuFace_t* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global const b3GpuChildShape_t* gpuChildShapes,\n"
+"																					__global const float4* gpuCompoundSepNormalsOut,\n"
+"																					__global const int* gpuHasCompoundSepNormalsOut,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int numCompoundPairs, int maxContactCapacity)\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	float4 worldVertsB1[64];\n" // scratch polygon buffers handed to clipHullAgainstHull
+"	float4 worldVertsB2[64];\n"
+"	int capacityWorldVerts = 64;	\n"
+"	float4 localContactsOut[64];\n"
+"	int localContactCapacity=64;\n"
+"	\n"
+"	float minDist = -1e30f;\n" // depth clamp and cut-off passed to clipHullAgainstHull
+"	float maxDist = 0.02f;\n"
+"	if (i<numCompoundPairs)\n"
+"	{\n"
+"		if (gpuHasCompoundSepNormalsOut[i])\n"
+"		{\n"
+"			int bodyIndexA = gpuCompoundPairs[i].x;\n"
+"			int bodyIndexB = gpuCompoundPairs[i].y;\n"
+"			\n"
+"			int childShapeIndexA = gpuCompoundPairs[i].z;\n" // a negative child index means the body's own collidable is used
+"			int childShapeIndexB = gpuCompoundPairs[i].w;\n"
+"			\n"
+"			int collidableIndexA = -1;\n"
+"			int collidableIndexB = -1;\n"
+"			\n"
+"			float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			\n"
+"			float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"								\n"
+"			if (childShapeIndexA >= 0)\n"
+"			{\n"
+"				collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"
+"				float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
+"				float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
+"				float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" // the child's position and orientation are composed with the body's
+"				float4 newOrnA = qtMul(ornA,childOrnA);\n"
+"				posA = newPosA;\n"
+"				ornA = newOrnA;\n"
+"			} else\n"
+"			{\n"
+"				collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"			}\n"
+"			\n"
+"			if (childShapeIndexB>=0)\n"
+"			{\n"
+"				collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"				float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"				float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"				float4 newPosB = transform(&childPosB,&posB,&ornB);\n" // body A composes its child position with qtRotate(...)+posA, body B with transform(); presumably equivalent - confirm
+"				float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"				posB = newPosB;\n"
+"				ornB = newOrnB;\n"
+"			} else\n"
+"			{\n"
+"				collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	\n"
+"			}\n"
+"			\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		\n"
+"			int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i],\n"
+"														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n"
+"														posA,ornA,\n"
+"													  posB,ornB,\n"
+"													  worldVertsB1,worldVertsB2,capacityWorldVerts,\n"
+"														minDist, maxDist,\n"
+"														vertices,faces,indices,\n"
+"														localContactsOut,localContactCapacity);\n"
+"												\n"
+"		if (numLocalContactsOut>0)\n"
+"		{\n"
+"				float4 normal = -gpuCompoundSepNormalsOut[i];\n" // negated here and negated again when stored in m_worldNormalOnB below
+"				int nPoints = numLocalContactsOut;\n"
+"				float4* pointsIn = localContactsOut;\n"
+"				int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"				contactIdx[0] = -1;\n"
+"				contactIdx[1] = -1;\n"
+"				contactIdx[2] = -1;\n"
+"				contactIdx[3] = -1;\n"
+"		\n"
+"				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n"
+"		\n"
+"				int dstIdx;\n"
+"				AppendInc( nGlobalContactsOut, dstIdx );\n" // AppendInc is a macro defined outside this part of the file
+"				if ((dstIdx+nReducedContacts) < maxContactCapacity)\n" // NOTE(review): the other kernels in this file test dstIdx < capacity; here the point count is added although dstIdx indexes whole b3Contact4Data entries - confirm which is intended
+"				{\n"
+"					__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n"
+"					c->m_worldNormalOnB = -normal;\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" // hard-coded restitution 0 and friction 0.7, each scaled by 0xffff
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = gpuCompoundPairs[pairIndex].x;\n"
+"					int bodyB = gpuCompoundPairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" // a body whose m_invMass is 0 is stored with a negated index (index 0 is unchanged by the negation)
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"					c->m_childIndexA = childShapeIndexA;\n"
+"					c->m_childIndexB = childShapeIndexB;\n"
+"					for (int i=0;i<nReducedContacts;i++)\n" // this i shadows the work-item index i declared at the top
+"					{\n"
+"						c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n"
+"					}\n"
+"					GET_NPOINTS(*c) = nReducedContacts;\n" // GET_NPOINTS is a macro defined outside this part of the file
+"				}\n"
+"				\n"
+"			}//		if (numContactsOut>0)\n"
+"		}//		if (gpuHasCompoundSepNormalsOut[i])\n"
+"	}//	if (i<numCompoundPairs)\n"
+"__kernel void   sphereSphereCollisionKernel( __global const int4* pairs, \n" // sphereSphereCollisionKernel: one work-item per pair i. When both collidables have m_shapeType == SHAPE_SPHERE and the centre distance is at most radiusA + radiusB, it appends a one-point b3Contact4Data to globalContactsOut.
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const float4* separatingNormals,\n" // not read in this kernel
+"																					__global const int* hasSeparatingAxis,\n" // not read in this kernel
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int contactCapacity,\n"
+"																					int numPairs)\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"			\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n"
+"		{\n"
+"			//sphere-sphere\n"
+"			float radiusA = collidables[collidableIndexA].m_radius;\n"
+"			float radiusB = collidables[collidableIndexB].m_radius;\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			float4 diff = posA-posB;\n"
+"			float len = length(diff);\n" // NOTE(review): length() of a float4 includes the w component; assumes m_pos.w is equal for both bodies - confirm
+"			\n"
+"			///iff distance positive, don't generate a new contact\n"
+"			if ( len <= (radiusA+radiusB))\n"
+"			{\n"
+"				///distance (negative means penetration)\n"
+"				float dist = len - (radiusA+radiusB);\n"
+"				float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n" // fallback direction used when the centres (almost) coincide
+"				if (len > 0.00001)\n"
+"				{\n"
+"					normalOnSurfaceB = diff / len;\n"
+"				}\n"
+"				float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n" // point on sphere B's surface towards A
+"				contactPosB.w = dist;\n" // w carries the distance
+"								\n"
+"				int dstIdx;\n"
+"				AppendInc( nGlobalContactsOut, dstIdx );\n" // AppendInc is a macro defined outside this part of the file
+"				if (dstIdx < contactCapacity)\n"
+"				{\n"
+"					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"					c->m_worldNormalOnB = -normalOnSurfaceB;\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" // hard-coded restitution 0 and friction 0.7, each scaled by 0xffff
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = pairs[pairIndex].x;\n"
+"					int bodyB = pairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" // a body whose m_invMass is 0 is stored with a negated index (index 0 is unchanged by the negation)
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"					c->m_worldPosB[0] = contactPosB;\n"
+"					c->m_childIndexA = -1;\n"
+"					c->m_childIndexB = -1;\n"
+"					GET_NPOINTS(*c) = 1;\n" // GET_NPOINTS is a macro defined outside this part of the file
+"				}//if (dstIdx < numPairs)\n" // this brace closes `if (dstIdx < contactCapacity)`; the label inside the kernel text is stale
+"			}//if ( len <= (radiusA+radiusB))\n"
+"	}//if (i<numPairs)\n" // by brace count this closes the SHAPE_SPHERE test, not `if (i<numPairs)`
+"}				\n" // by brace count this closes `if (i<numPairs)`
+"__kernel void   clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,\n" // Kernel, one work item per concave pair: builds a local polyhedron (front, back, three edge planes) from one triangle of shape A, calls clipHullAgainstHullLocalA against hull B, reduces the result and appends one contact entry.
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const b3ConvexPolyhedronData_t* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const b3GpuFace_t* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global const b3GpuChildShape_t* gpuChildShapes,\n"
+"																					__global const float4* separatingNormals,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int contactCapacity,\n"
+"																					int numConcavePairs)\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	float4 worldVertsB1[64];\n" // scratch buffers handed to clipHullAgainstHullLocalA; capacity is passed as capacityWorldVerts
+"	float4 worldVertsB2[64];\n"
+"	int capacityWorldVerts = 64;	\n"
+"	float4 localContactsOut[64];\n" // receives the clipped contact points; capacity is passed as localContactCapacity
+"	int localContactCapacity=64;\n"
+"	\n"
+"	float minDist = -1e30f;\n" // minDist and maxDist are forwarded unchanged to clipHullAgainstHullLocalA
+"	float maxDist = 0.02f;\n"
+"	if (i<numConcavePairs)\n"
+"	{\n"
+"		//negative value means that the pair is invalid\n"
+"		if (concavePairsIn[i].w<0)\n"
+"			return;\n"
+"		int bodyIndexA = concavePairsIn[i].x;\n"
+"		int bodyIndexB = concavePairsIn[i].y;\n"
+"		int f = concavePairsIn[i].z;\n" // .z of the pair is used below as the face index within shape A
+"		int childShapeIndexA = f;\n"
+"		\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		\n"
+"		///////////////////////////////////////////////////////////////\n"
+"		\n"
+"	\n"
+"		bool overlap = false;\n"
+"		\n"
+"		b3ConvexPolyhedronData_t convexPolyhedronA;\n" // local polyhedron describing the single triangle; all of its offsets are 0 because it indexes the local arrays below
+"	//add 3 vertices of the triangle\n"
+"		convexPolyhedronA.m_numVertices = 3;\n"
+"		convexPolyhedronA.m_vertexOffset = 0;\n"
+"		float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
+"		b3GpuFace_t face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"		\n"
+"		float4 verticesA[3];\n"
+"		for (int i=0;i<3;i++)\n" // NOTE: this loop index shadows the work-item index i for the duration of the loop
+"		{\n"
+"			int index = indices[face.m_indexOffset+i];\n"
+"			float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"			verticesA[i] = vert;\n"
+"			localCenter += vert;\n" // accumulates the vertex sum; scaled by 1/3 further down
+"		}\n"
+"		float dmin = FLT_MAX;\n"
+"		int localCC=0;\n"
+"		//a triangle has 3 unique edges\n"
+"		convexPolyhedronA.m_numUniqueEdges = 3;\n"
+"		convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
+"		float4 uniqueEdgesA[3];\n"
+"		\n"
+"		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n"
+"		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
+"		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
+"		convexPolyhedronA.m_faceOffset = 0;\n"
+"                                  \n"
+"		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
+"                             \n"
+"		b3GpuFace_t facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
+"		int indicesA[3+3+2+2+2];\n" // 3 indices for the front face, 3 for the back face, 2 for each of the three edge planes
+"		int curUsedIndices=0;\n"
+"		int fidx=0;\n"
+"		//front size of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[0] = 0;\n"
+"			indicesA[1] = 1;\n"
+"			indicesA[2] = 2;\n"
+"			curUsedIndices+=3;\n"
+"			float c = face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = normal.x;\n"
+"			facesA[fidx].m_plane.y = normal.y;\n"
+"			facesA[fidx].m_plane.z = normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		//back size of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[3]=2;\n" // same three vertices as the front face, in reverse order
+"			indicesA[4]=1;\n"
+"			indicesA[5]=0;\n"
+"			curUsedIndices+=3;\n"
+"			float c = dot3F4(normal,verticesA[0]);\n"
+"			float c1 = -face.m_plane.w;\n" // NOTE(review): c1 is not referenced in the visible lines; the plane constant below uses c
+"			facesA[fidx].m_plane.x = -normal.x;\n"
+"			facesA[fidx].m_plane.y = -normal.y;\n"
+"			facesA[fidx].m_plane.z = -normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		bool addEdgePlanes = true;\n"
+"		if (addEdgePlanes)\n"
+"		{\n"
+"			int numVertices=3;\n"
+"			int prevVertex = numVertices-1;\n"
+"			for (int i=0;i<numVertices;i++)\n" // one extra two-index face per triangle edge (i, prevVertex)
+"			{\n"
+"				float4 v0 = verticesA[i];\n"
+"				float4 v1 = verticesA[prevVertex];\n"
+"                                            \n"
+"				float4 edgeNormal = normalize(cross(normal,v1-v0));\n" // edge plane normal: triangle normal crossed with the edge direction
+"				float c = -dot3F4(edgeNormal,v0);\n" // plane constant chosen so that v0 lies on the plane
+"				facesA[fidx].m_numIndices = 2;\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[curUsedIndices++]=i;\n"
+"				indicesA[curUsedIndices++]=prevVertex;\n"
+"                                            \n"
+"				facesA[fidx].m_plane.x = edgeNormal.x;\n"
+"				facesA[fidx].m_plane.y = edgeNormal.y;\n"
+"				facesA[fidx].m_plane.z = edgeNormal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				fidx++;\n"
+"				prevVertex = i;\n"
+"			}\n"
+"		}\n"
+"		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" // NOTE(review): fidx reaches 5 here (front, back, three edges); presumably TRIANGLE_NUM_CONVEX_FACES is 5 - confirm
+"		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" // average of the three triangle vertices
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 sepAxis = separatingNormals[i];\n"
+"		\n"
+"		int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
+"		int childShapeIndexB =-1;\n"
+"		{\n"
+"			///////////////////\n"
+"			///compound shape support\n"
+"			\n"
+"			childShapeIndexB = concavePairsIn[pairIndex].w;\n" // .w of the pair is used as the child-shape index of B
+"			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n" // from here on posB and ornB hold the body transform composed with the child transform
+"			ornB = newOrnB;\n"
+"			\n"
+"		}\n"
+"		\n"
+"		////////////////////////////////////////\n"
+"		\n"
+"		\n"
+"		\n"
+"		int numLocalContactsOut = clipHullAgainstHullLocalA(sepAxis,\n"
+"														&convexPolyhedronA, &convexShapes[shapeIndexB],\n"
+"														posA,ornA,\n"
+"													  posB,ornB,\n"
+"													  worldVertsB1,worldVertsB2,capacityWorldVerts,\n"
+"														minDist, maxDist,\n"
+"														&verticesA,&facesA,&indicesA,\n"
+"														vertices,faces,indices,\n"
+"														localContactsOut,localContactCapacity);\n"
+"												\n"
+"		if (numLocalContactsOut>0)\n"
+"		{\n"
+"			float4 normal = -separatingNormals[i];\n" // NOTE: shadows the triangle normal declared earlier in this kernel
+"			int nPoints = numLocalContactsOut;\n"
+"			float4* pointsIn = localContactsOut;\n"
+"			int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"			contactIdx[0] = -1;\n"
+"			contactIdx[1] = -1;\n"
+"			contactIdx[2] = -1;\n"
+"			contactIdx[3] = -1;\n"
+"	\n"
+"			int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n"
+"	\n"
+"			int dstIdx;\n"
+"			AppendInc( nGlobalContactsOut, dstIdx );\n" // presumably reserves the next output slot via the shared counter - confirm against the AppendInc macro
+"			if (dstIdx<contactCapacity)\n" // the contact is only written when the reserved slot fits in the output buffer
+"			{\n"
+"				__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n"
+"				c->m_worldNormalOnB = -normal;\n" // normal was negated above, so this stores separatingNormals[i]
+"				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"				c->m_batchIdx = pairIndex;\n"
+"				int bodyA = concavePairsIn[pairIndex].x;\n"
+"				int bodyB = concavePairsIn[pairIndex].y;\n"
+"				c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" // a zero inverse mass is flagged by negating the index; NOTE(review): index 0 cannot carry that flag
+"				c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"				c->m_childIndexA = childShapeIndexA;\n"
+"				c->m_childIndexB = childShapeIndexB;\n"
+"				for (int i=0;i<nReducedContacts;i++)\n" // assumes nReducedContacts <= 4 (size of contactIdx) - TODO confirm against extractManifoldSequential
+"				{\n"
+"					c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n"
+"				}\n"
+"				GET_NPOINTS(*c) = nReducedContacts;\n"
+"			}\n"
+"				\n"
+"		}//		if (numContactsOut>0)\n"
+"	}//	if (i<numPairs)\n"
+"int	findClippingFaces(const float4 separatingNormal,\n" // Picks the face of hullB whose world normal has the largest dot with separatingNormal and the face of hullA with the smallest, writes both faces as world-space vertices into the per-pair slices and records indices and counts in clippingFaces[pairIndex].
+"                      __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,\n"
+"                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n"
+"                       __global float4* worldVertsA1,\n"
+"                      __global float4* worldNormalsA1,\n"
+"                      __global float4* worldVertsB1,\n"
+"                      int capacityWorldVerts,\n"
+"                      const float minDist, float maxDist,\n"
+"                      __global const float4* vertices,\n"
+"                      __global const b3GpuFace_t* faces,\n"
+"                      __global const int* indices,\n"
+"                      __global int4* clippingFaces, int pairIndex)\n"
+"	int numContactsOut = 0;\n" // never modified in the visible lines, so the value returned at the end is 0
+"	int numWorldVertsB1= 0;\n"
+"    \n"
+"    \n"
+"	int closestFaceB=-1;\n"
+"	float dmax = -FLT_MAX;\n"
+"    \n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x,\n"
+"                                              faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"	{\n"
+"		const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];\n" // NOTE(review): closestFaceB is still -1 here if hullB has no faces
+"		const int numVertices = polyB.m_numIndices;\n"
+"		for(int e0=0;e0<numVertices;e0++)\n"
+"		{\n"
+"			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n"
+"			worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" // NOTE(review): no check against capacityWorldVerts in the visible lines; assumes the face has at most that many vertices
+"		}\n"
+"	}\n"
+"    \n"
+"    int closestFaceA=-1;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"                                              faces[hullA->m_faceOffset+face].m_plane.x,\n"
+"                                              faces[hullA->m_faceOffset+face].m_plane.y,\n"
+"                                              faces[hullA->m_faceOffset+face].m_plane.z,\n"
+"                                              0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"            \n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"                worldNormalsA1[pairIndex] = faceANormalWS;\n" // keeps the world normal of the best face found so far for this pair
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int numVerticesA = faces[hullA->m_faceOffset+closestFaceA].m_numIndices;\n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"        const float4 a = vertices[hullA->m_vertexOffset+indices[faces[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n"
+"        worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n"
+"    }\n"
+"    \n"
+"    clippingFaces[pairIndex].x = closestFaceA;\n" // x: face index on A, y: face index on B, z: vertex count of face A, w: vertex count of face B
+"    clippingFaces[pairIndex].y = closestFaceB;\n"
+"    clippingFaces[pairIndex].z = numVerticesA;\n"
+"    clippingFaces[pairIndex].w = numWorldVertsB1;\n"
+"    \n"
+"    \n"
+"	return numContactsOut;\n"
+"int clipFaces(__global float4* worldVertsA1,\n" // Clips the stored face-B polygon of pairIndex against one side plane per edge of face A, then keeps the points whose depth against the face-A plane is <= maxDist. Stores the count in clippingFaces[pairIndex].w and returns it.
+"              __global float4* worldNormalsA1,\n"
+"              __global float4* worldVertsB1,\n"
+"              __global float4* worldVertsB2, \n"
+"              int capacityWorldVertsB2,\n"
+"              const float minDist, float maxDist,\n"
+"              __global int4* clippingFaces,\n"
+"              int pairIndex)\n"
+"	int numContactsOut = 0;\n"
+"    \n"
+"    int closestFaceA = clippingFaces[pairIndex].x;\n"
+"    int closestFaceB = clippingFaces[pairIndex].y;\n"
+"	int numVertsInA = clippingFaces[pairIndex].z;\n"
+"	int numVertsInB = clippingFaces[pairIndex].w;\n"
+"    \n"
+"	int numVertsOut = 0;\n"
+"    \n"
+"	if (closestFaceA<0)\n"
+"		return numContactsOut;\n"
+"    \n"
+"    __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n" // each pair owns a slice of capacityWorldVertsB2 entries in both buffers
+"    __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n"
+"    \n"
+"    \n"
+"	\n"
+"	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"    \n"
+"	for(int e0=0;e0<numVertsInA;e0++)\n"
+"	{\n"
+"		const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n"
+"		const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n" // next vertex of face A, wrapping around to the first
+"		const float4 WorldEdge0 = aw - bw;\n"
+"		float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n"
+"		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" // side plane normal built from the edge and the face-A normal
+"		float4 worldA1 = aw;\n"
+"		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" // plane constant chosen so that aw lies on the plane
+"		float4 planeNormalWS = planeNormalWS1;\n"
+"		float planeEqWS=planeEqWS1;\n"
+"		numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n"
+"		__global float4* tmp = pVtxOut;\n" // swap the two buffers so the clipped output feeds the next edge
+"		pVtxOut = pVtxIn;\n"
+"		pVtxIn = tmp;\n"
+"		numVertsInB = numVertsOut;\n"
+"		numVertsOut = 0;\n"
+"	}\n"
+"    \n"
+"    //float4 planeNormalWS = worldNormalsA1[pairIndex];\n"
+"    //float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n"
+"    \n"
+"    /*for (int i=0;i<numVertsInB;i++)\n"
+"    {\n"
+"        pVtxOut[i] = pVtxIn[i];\n"
+"    }*/\n"
+"    \n"
+"    \n"
+"    \n"
+"    \n"
+"    //numVertsInB=0;\n"
+"	\n"
+"    float4 planeNormalWS = worldNormalsA1[pairIndex];\n"
+"    float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n" // face-A plane through the first stored vertex of face A
+"    for (int i=0;i<numVertsInB;i++)\n"
+"    {\n"
+"        float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"        if (depth <=minDist)\n"
+"        {\n"
+"            depth = minDist;\n" // depth is clamped from below to minDist
+"        }\n"
+"        \n"
+"        if (depth <=maxDist)\n"
+"        {\n"
+"            float4 pointInWorld = pVtxIn[i];\n"
+"            pVtxOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" // the depth travels in the w component of the stored point
+"        }\n"
+"    }\n"
+"   \n"
+"    clippingFaces[pairIndex].w =numContactsOut;\n" // overwrites the face-B vertex count with the number of contacts
+"   \n"
+"    \n"
+"	return numContactsOut;\n"
+"__kernel void   findClippingFacesKernel(  __global const int4* pairs,\n" // Kernel, one work item per pair: when hasSeparatingAxis[i] is nonzero, looks up both shapes and calls findClippingFaces for pair i.
+"                                        __global const b3RigidBodyData_t* rigidBodies,\n"
+"                                        __global const b3Collidable_t* collidables,\n"
+"                                        __global const b3ConvexPolyhedronData_t* convexShapes,\n"
+"                                        __global const float4* vertices,\n"
+"                                        __global const float4* uniqueEdges,\n"
+"                                        __global const b3GpuFace_t* faces,\n"
+"                                        __global const int* indices,\n"
+"                                        __global const float4* separatingNormals,\n"
+"                                        __global const int* hasSeparatingAxis,\n"
+"                                        __global int4* clippingFacesOut,\n"
+"                                        __global float4* worldVertsA1,\n"
+"                                        __global float4* worldNormalsA1,\n"
+"                                        __global float4* worldVertsB1,\n"
+"                                        int capacityWorldVerts,\n"
+"                                        int numPairs\n"
+"                                        )\n"
+"    \n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"    \n"
+"	\n"
+"	float minDist = -1e30f;\n" // minDist and maxDist are only forwarded to findClippingFaces
+"	float maxDist = 0.02f;\n"
+"    \n"
+"	if (i<numPairs)\n"
+"	{\n"
+"        \n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"            \n"
+"			int bodyIndexA = pairs[i].x;\n"
+"			int bodyIndexB = pairs[i].y;\n"
+"			\n"
+"			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"			\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"			\n"
+"            \n"
+"            \n"
+"			int numLocalContactsOut = findClippingFaces(separatingNormals[i],\n" // the return value is not read again in the visible lines; results are delivered through the output buffers
+"                                                        &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n"
+"                                                        rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n"
+"                                                        rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n"
+"                                                        worldVertsA1,\n"
+"                                                        worldNormalsA1,\n"
+"                                                        worldVertsB1,capacityWorldVerts,\n"
+"                                                        minDist, maxDist,\n"
+"                                                        vertices,faces,indices,\n"
+"                                                        clippingFacesOut,i);\n"
+"            \n"
+"            \n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"    \n"
+"__kernel void   clipFacesAndFindContactsKernel(    __global const float4* separatingNormals,\n" // Kernel, one work item per pair: for pairs with hasSeparatingAxis set, performs the same edge-plane clipping and depth filtering as clipFaces (written inline) and stores the contact count in clippingFacesOut[pair].w.
+"                                                   __global const int* hasSeparatingAxis,\n"
+"                                                   __global int4* clippingFacesOut,\n"
+"                                                   __global float4* worldVertsA1,\n"
+"                                                   __global float4* worldNormalsA1,\n"
+"                                                   __global float4* worldVertsB1,\n"
+"                                                   __global float4* worldVertsB2,\n"
+"                                                    int vertexFaceCapacity,\n"
+"                                                   int numPairs,\n"
+"					                                        int debugMode\n" // not referenced in the visible lines of this kernel
+"                                                   )\n"
+"    int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"    \n"
+"	float minDist = -1e30f;\n"
+"	float maxDist = 0.02f;\n"
+"    \n"
+"	if (i<numPairs)\n"
+"	{\n"
+"        \n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"            \n"
+"//			int bodyIndexA = pairs[i].x;\n"
+"	//		int bodyIndexB = pairs[i].y;\n"
+"		    \n"
+"            int numLocalContactsOut = 0;\n"
+"            int capacityWorldVertsB2 = vertexFaceCapacity;\n"
+"            \n"
+"            __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n" // per-pair slices of the two vertex buffers; they are swapped after every clip below
+"            __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n"
+"            \n"
+"            {\n"
+"                __global int4* clippingFaces = clippingFacesOut;\n"
+"            \n"
+"                \n"
+"                int closestFaceA = clippingFaces[pairIndex].x;\n"
+"                int closestFaceB = clippingFaces[pairIndex].y;\n"
+"                int numVertsInA = clippingFaces[pairIndex].z;\n"
+"                int numVertsInB = clippingFaces[pairIndex].w;\n"
+"                \n"
+"                int numVertsOut = 0;\n"
+"                \n"
+"                if (closestFaceA>=0)\n"
+"                {\n"
+"                    \n"
+"                    \n"
+"                    \n"
+"                    // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"                    \n"
+"                    for(int e0=0;e0<numVertsInA;e0++)\n"
+"                    {\n"
+"                        const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n"
+"                        const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n" // next vertex of face A, wrapping around to the first
+"                        const float4 WorldEdge0 = aw - bw;\n"
+"                        float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n"
+"                        float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n"
+"                        float4 worldA1 = aw;\n"
+"                        float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n"
+"                        float4 planeNormalWS = planeNormalWS1;\n"
+"                        float planeEqWS=planeEqWS1;\n"
+"                        numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n"
+"                        __global float4* tmp = pVtxOut;\n" // swap buffers so the clipped polygon is the input for the next edge
+"                        pVtxOut = pVtxIn;\n"
+"                        pVtxIn = tmp;\n"
+"                        numVertsInB = numVertsOut;\n"
+"                        numVertsOut = 0;\n"
+"                    }\n"
+"                    \n"
+"                    float4 planeNormalWS = worldNormalsA1[pairIndex];\n"
+"                    float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n"
+"                    \n"
+"                    for (int i=0;i<numVertsInB;i++)\n" // NOTE: this loop index shadows the work-item index i
+"                    {\n"
+"                        float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"                        if (depth <=minDist)\n"
+"                        {\n"
+"                            depth = minDist;\n"
+"                        }\n"
+"                        \n"
+"                        if (depth <=maxDist)\n"
+"                        {\n"
+"                            float4 pointInWorld = pVtxIn[i];\n"
+"                            pVtxOut[numLocalContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" // the depth travels in the w component of the stored point
+"                        }\n"
+"                    }\n"
+"                    \n"
+"                }\n"
+"                clippingFaces[pairIndex].w =numLocalContactsOut;\n"
+"                \n"
+"            }\n"
+"            \n"
+"            for (int i=0;i<numLocalContactsOut;i++)\n" // copy the contacts into the other buffer as well, so both per-pair slices hold them whichever way the swaps ended
+"                pVtxIn[i] = pVtxOut[i];\n"
+"                \n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"    \n"
+"__kernel void   newContactReductionKernel( __global int4* pairs,\n" // Kernel, one work item per pair: when the pair has clipped points, reduces them with extractManifoldSequentialGlobal and writes a b3Contact4Data entry, reusing the slot in pairs[].z when it is non-negative.
+"                                                   __global const b3RigidBodyData_t* rigidBodies,\n"
+"                                                   __global const float4* separatingNormals,\n"
+"                                                   __global const int* hasSeparatingAxis,\n"
+"                                                   __global struct b3Contact4Data* globalContactsOut,\n"
+"                                                   __global int4* clippingFaces,\n"
+"                                                   __global float4* worldVertsB2,\n"
+"                                                   volatile __global int* nGlobalContactsOut,\n"
+"                                                   int vertexFaceCapacity,\n"
+"												   int contactCapacity,\n"
+"                                                   int numPairs\n"
+"                                                   )\n"
+"    int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"    int4 contactIdx;\n"
+"    contactIdx=make_int4(0,1,2,3);\n" // initial selection before extractManifoldSequentialGlobal receives a pointer to it
+"    \n"
+"	if (i<numPairs)\n"
+"	{\n"
+"        \n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"            \n"
+"			\n"
+"            \n"
+"            \n"
+"			int nPoints = clippingFaces[pairIndex].w;\n" // contact count stored by the clipping stage
+"           \n"
+"            if (nPoints>0)\n"
+"            {\n"
+"                 __global float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity];\n"
+"                float4 normal = -separatingNormals[i];\n"
+"                \n"
+"                int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);\n"
+"            \n"
+"				int mprContactIndex = pairs[pairIndex].z;\n" // a non-negative value is used directly as the destination slot
+"                int dstIdx = mprContactIndex;\n"
+"				if (dstIdx<0)\n"
+"				{\n"
+"	                AppendInc( nGlobalContactsOut, dstIdx );\n" // presumably reserves a new output slot via the shared counter - confirm against the AppendInc macro
+"				}\n"
+"//#if 0\n"
+"                \n"
+"				if (dstIdx < contactCapacity)\n"
+"				{\n"
+"					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"					c->m_worldNormalOnB = -normal;\n" // normal was negated above, so this stores separatingNormals[i]
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = pairs[pairIndex].x;\n"
+"					int bodyB = pairs[pairIndex].y;\n"
+"					pairs[pairIndex].w = dstIdx;\n" // records which contact slot belongs to this pair
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" // a zero inverse mass is flagged by negating the index; NOTE(review): index 0 cannot carry that flag
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"                    c->m_childIndexA =-1;\n"
+"					c->m_childIndexB =-1;\n"
+"                    switch (nReducedContacts)\n" // cases have no break: a count of k falls through and copies points k-1 down to 0
+"                    {\n"
+"                        case 4:\n"
+"                            c->m_worldPosB[3] = pointsIn[contactIdx.w];\n"
+"                        case 3:\n"
+"                            c->m_worldPosB[2] = pointsIn[contactIdx.z];\n"
+"                        case 2:\n"
+"                            c->m_worldPosB[1] = pointsIn[contactIdx.y];\n"
+"                        case 1:\n"
+"							if (mprContactIndex<0)//test\n"
+"	                            c->m_worldPosB[0] = pointsIn[contactIdx.x];\n"
+"                        default:\n"
+"                        {\n"
+"                        }\n"
+"                    };\n"
+"                    \n"
+"					GET_NPOINTS(*c) = nReducedContacts;\n"
+"                    \n"
+"                 }\n"
+"                 \n"
+"                \n"
+"				\n"
+"			}//		if (numContactsOut>0)\n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"    \n"
+"    \n"
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl
new file mode 100644
index 00000000..31ca43b8
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl
@@ -0,0 +1,1220 @@
+//keep this enum in sync with the CPU version (in btCollidable.h)
+//written by Erwin Coumans
+#define B3_MAX_STACK_DEPTH 256 // NOTE(review): not referenced in the visible part of this file
+typedef unsigned int u32; // shorthand used for BodyData.m_collidableIdx further down
+///keep this in sync with btCollidable.h
+typedef struct // collidable record: two anonymous unions followed by the shape type and shape index
+	union { // the two ints below share the same storage
+		int m_numChildShapes;
+		int m_bvhIndex;
+	};
+	union
+	{ // the float radius and the int index below share the same storage
+		float m_radius;
+		int	m_compoundBvhIndex;
+	};
+	int m_shapeType;
+	int m_shapeIndex;
+} btCollidableGpu;
+///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
+///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
+typedef struct
+	//12 bytes
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes
+	int	m_escapeIndexOrTriangleIndex; // >= 0 is treated as a leaf by isLeafNode; otherwise getEscapeIndex returns the negated value
+} b3QuantizedBvhNode;
+typedef struct // per-BVH header: bounds, quantization vector, node and subtree counts and offsets
+	float4		m_aabbMin;
+	float4		m_aabbMax;
+	float4		m_quantization;
+	int			m_numNodes;
+	int			m_numSubTrees;
+	int			m_nodeOffset; // presumably the start of this BVH inside a shared node array - TODO confirm against the host code
+	int			m_subTreeOffset; // presumably the start inside a shared subtree array - TODO confirm
+} b3BvhInfo;
+int	getTriangleIndex(const b3QuantizedBvhNode* rootNode) // returns m_escapeIndexOrTriangleIndex with its upper bits masked off
+	unsigned int x=0;
+	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS); // x&0 is 0, so this is all ones shifted left by 31-MAX_NUM_PARTS_IN_BITS
+	// Get only the lower bits where the triangle index is stored
+	return (rootNode->m_escapeIndexOrTriangleIndex&~(y)); // ~y keeps only the low 31-MAX_NUM_PARTS_IN_BITS bits
+int	getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode) // same masking as getTriangleIndex, for a node pointer qualified __global
+	unsigned int x=0;
+	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS); // x&0 is 0, so this is all ones shifted left by 31-MAX_NUM_PARTS_IN_BITS
+	// Get only the lower bits where the triangle index is stored
+	return (rootNode->m_escapeIndexOrTriangleIndex&~(y)); // ~y keeps only the low 31-MAX_NUM_PARTS_IN_BITS bits
+int isLeafNode(const b3QuantizedBvhNode* rootNode) // returns 1 when the node stores a non-negative index, 0 otherwise
+	//skipindex is negative (internal node), triangleindex >=0 (leafnode)
+	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;
+int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode) // same test as isLeafNode, for a node pointer qualified __global
+	//skipindex is negative (internal node), triangleindex >=0 (leafnode)
+	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;
+int getEscapeIndex(const b3QuantizedBvhNode* rootNode) // returns the stored index negated; the value is only meaningful when the stored index is negative (see isLeafNode)
+	return -rootNode->m_escapeIndexOrTriangleIndex;
+int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode) // same as getEscapeIndex, for a node pointer qualified __global
+	return -rootNode->m_escapeIndexOrTriangleIndex;
+typedef struct // quantized bounds of one subtree plus the index of its root node and its size
+	//12 bytes
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes, points to the root of the subtree
+	int			m_rootNodeIndex;
+	//4 bytes
+	int			m_subtreeSize;
+	int			m_padding[3]; // three ints of padding after the 20 bytes counted in the comments above
+} b3BvhSubtreeInfo;
+typedef struct // one child of a compound shape: a position, an orientation and a shape index
+	float4	m_childPosition;
+	float4	m_childOrientation;
+	int m_shapeIndex;
+	int m_unused0; // the three unused ints bring the int block to four entries
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+typedef struct // rigid body record: pose, velocities, collidable index and material scalars
+	float4 m_pos;
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+	u32 m_collidableIdx; // u32 is the unsigned int typedef declared near the top of this file
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+typedef struct  // convex hull header: the offsets and counts index arrays that are passed separately to the functions below
+	float4		m_localCenter;
+	float4		m_extents;
+	float4		mC;
+	float4		mE;
+	float			m_radius;
+	int	m_faceOffset; // added to a face number when indexing the faces array (see findSeparatingAxisLocalA)
+	int m_numFaces;
+	int	m_numVertices; // loop bound in project and projectLocal
+	int m_vertexOffset; // added to a vertex number when indexing the vertices array (see project)
+	int	m_uniqueEdgesOffset;
+	int	m_numUniqueEdges;
+	int m_unused;
+} ConvexPolyhedronCL;
+typedef struct  // axis-aligned box: each corner can be viewed as a float4, four floats or four ints through a union
+	union
+	{ // three views of the same 16 bytes for the minimum corner
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{ // three views of the same 16 bytes for the maximum corner
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Common/shared/b3Int2.h"
+typedef struct // one polygon face: a plane plus a range in the shared index array
+	float4 m_plane; // read as the face normal by findSeparatingAxisLocalA; w is presumably the plane constant - TODO confirm
+	int m_indexOffset;
+	int m_numIndices;
+} btGpuFace;
+#define make_float4 (float4) // make_float4(a,b,c,d) expands to the vector literal (float4)(a,b,c,d)
+float4 cross3(float4 a, float4 b) // thin wrapper over the built-in cross; the commented lines below are earlier variants that are no longer compiled
+	return cross(a,b);
+//	float4 a1 = make_float4(a.xyz,0.f);
+//	float4 b1 = make_float4(b.xyz,0.f);
+//	return cross(a1,b1);
+//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);
+	//	float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);
+	//return c;
+float dot3F4(float4 a, float4 b) // dot product of the xyz parts only
+	float4 a1 = make_float4(a.xyz,0.f); // w is zeroed in both operands so it cannot contribute
+	float4 b1 = make_float4(b.xyz,0.f);
+	return dot(a1, b1);
+float4 fastNormalize4(float4 v) // normalizes the xyz part with fast_normalize; the incoming w is discarded
+	v = make_float4(v.xyz,0.f); // w is forced to 0 before normalizing
+	return fast_normalize(v);
+//	Quaternion stored in a float4; qtMul and qtInvert below treat xyz as the vector part and w as the scalar part
+typedef float4 Quaternion;
+Quaternion qtMul(Quaternion a, Quaternion b);
+Quaternion qtNormalize(Quaternion in);
+float4 qtRotate(Quaternion q, float4 vec);
+Quaternion qtInvert(Quaternion q);
+Quaternion qtMul(Quaternion a, Quaternion b) // quaternion product a*b
+	Quaternion ans;
+	ans = cross3( a, b );
+	ans += a.w*b+b.w*a; // this sum also writes w, but w is assigned explicitly two lines below
+//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w*b.w - dot3F4(a, b);
+	return ans;
+Quaternion qtNormalize(Quaternion in) // NOTE(review): delegates to fastNormalize4, which zeroes w first, so the scalar part of the input is dropped - confirm this is intended
+	return fastNormalize4(in);
+//	in /= length( in );
+//	return in;
+float4 qtRotate(Quaternion q, float4 vec) // rotates vec by q, computed as q * (vec with w=0) * qtInvert(q)
+	Quaternion qInv = qtInvert( q );
+	float4 vcpy = vec;
+	vcpy.w = 0.f; // the vector is treated as a quaternion with zero scalar part
+	float4 out = qtMul(qtMul(q,vcpy),qInv);
+	return out;
+Quaternion qtInvert(Quaternion q) // returns the conjugate: xyz negated, w kept; no renormalization is done here
+	return (Quaternion)(-q.xyz, q.w);
+float4 qtInvRotate(const Quaternion q, float4 vec) // rotates vec by qtInvert(q)
+	return qtRotate( qtInvert( q ), vec );
+float4 transform(const float4* p, const float4* translation, const Quaternion* orientation) // rotates *p by *orientation, then adds *translation
+	return qtRotate( *orientation, *p ) + (*translation);
+float4 normalize3(const float4 a) // normalizes the xyz part of a through fastNormalize4
+	float4 n = make_float4(a.x, a.y, a.z, 0.f);
+	return fastNormalize4( n );
+inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, const float4* vertices, float* min, float* max) // writes to min[0] and max[0] the extent of the hull vertices along *dir, offset by dot(pos,*dir)
+	min[0] = FLT_MAX;
+	max[0] = -FLT_MAX;
+	int numVerts = hull->m_numVertices;
+	const float4 localDir = qtInvRotate(orn,*dir); // the direction is rotated into the hull frame so vertices are not transformed one by one
+	float offset = dot(pos,*dir); // NOTE(review): 4-component dot, so this relies on pos.w or dir->w being zero - confirm at the call sites
+	for(int i=0;i<numVerts;i++)
+	{
+		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);
+		if(dp < min[0])	
+			min[0] = dp;
+		if(dp > max[0])	
+			max[0] = dp;
+	}
+	if(min[0]>max[0])
+	{ // only true when the loop ran zero times; the two sentinels are then swapped
+		float tmp = min[0];
+		min[0] = max[0];
+		max[0] = tmp;
+	}
+	min[0] += offset;
+	max[0] += offset;
+inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, __global const float4* vertices, float* min, float* max) // same computation as projectLocal, with hull and vertices qualified __global
+	min[0] = FLT_MAX;
+	max[0] = -FLT_MAX;
+	int numVerts = hull->m_numVertices;
+	const float4 localDir = qtInvRotate(orn,*dir); // the direction is rotated into the hull frame so vertices are not transformed one by one
+	float offset = dot(pos,*dir); // NOTE(review): 4-component dot, so this relies on pos.w or dir->w being zero - confirm at the call sites
+	for(int i=0;i<numVerts;i++)
+	{
+		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);
+		if(dp < min[0])	
+			min[0] = dp;
+		if(dp > max[0])	
+			max[0] = dp;
+	}
+	if(min[0]>max[0])
+	{ // only true when the loop ran zero times; the two sentinels are then swapped
+		float tmp = min[0];
+		min[0] = max[0];
+		max[0] = tmp;
+	}
+	min[0] += offset;
+	max[0] += offset;
+// Tests a single candidate axis: projects hull A (non-__global data) and hull B
+// (__global data) onto *sep_axis and compares the two intervals.
+// Returns false when the intervals are disjoint (*depth is left untouched).
+// Otherwise returns true and stores the smaller of the two overlaps in *depth.
+inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA,const float4 ornA,
+	const float4 posB,const float4 ornB,
+	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)
+{
+	float minA,maxA;
+	float minB,maxB;
+	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &minA, &maxA);
+	project(hullB,posB,ornB, sep_axis,verticesB, &minB, &maxB);
+	const bool disjoint = (maxA<minB) || (maxB<minA);
+	if(disjoint)
+		return false;
+	const float overlapAB = maxA - minB;
+	const float overlapBA = maxB - minA;
+	if(overlapAB<overlapBA)
+		*depth = overlapAB;
+	else
+		*depth = overlapBA;
+	return true;
+}
+// True when none of |x|, |y|, |z| exceeds 1e-6; the w lane is not inspected.
+inline bool IsAlmostZero(const float4 v)
+{
+	const float eps = 1e-6f;
+	const bool anyLarge = fabs(v.x)>eps || fabs(v.y)>eps || fabs(v.z)>eps;
+	return !anyLarge;
+}
+// Face-normal stage of the separating-axis search, using the face normals of hull A
+// (non-__global data) against hull B (__global data).
+//   posA1/ornA, posB1/ornB : poses of the two hulls (the w lane of each position is ignored)
+//   DeltaC2                : vector used to orient every candidate axis (axis is flipped
+//                            when its dot with DeltaC2 is negative)
+//   sep, dmin              : in/out; updated whenever an axis with a smaller overlap than
+//                            *dmin is found
+// uniqueEdgesA, indicesA, uniqueEdgesB, facesB and indicesB are accepted but not read here.
+// Returns false as soon as one face normal separates the hulls, true otherwise.
+bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+	float4* sep,
+	float* dmin)
+{
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	// Test normals from hullA
+	const int faceBase = hullA->m_faceOffset;
+	const int faceCount = hullA->m_numFaces;
+	for(int f=0;f<faceCount;f++)
+	{
+		// qtRotate ignores the w lane, so the plane constant does not leak into the axis.
+		float4 faceANormalWS = qtRotate(ornA,facesA[faceBase+f].m_plane);
+		if (dot3F4(DeltaC2,faceANormalWS)<0)
+			faceANormalWS*=-1.f;
+		float depth;
+		if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&depth))
+			return false;
+		if(depth<*dmin)
+		{
+			*dmin = depth;
+			*sep = faceANormalWS;
+		}
+	}
+	// Final orientation of the reported axis relative to DeltaC2.
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+// Face-normal stage of the separating-axis search with the memory spaces mirrored:
+// here hullA/verticesA/facesA are the __global hull whose face normals are tested,
+// and hullB/verticesB are the non-__global hull.
+// TestSepAxisLocalA expects the non-__global hull first, so the two hulls (and their
+// poses and vertex arrays) are passed to it in swapped order.
+//   DeltaC2   : vector used to orient every candidate axis
+//   sep, dmin : in/out; updated whenever an axis with a smaller overlap than *dmin is found
+// uniqueEdgesA, indicesA, uniqueEdgesB, facesB and indicesB are accepted but not read here.
+// Returns false as soon as one face normal separates the hulls, true otherwise.
+bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* verticesA, 
+	__global const float4* uniqueEdgesA, 
+	__global const btGpuFace* facesA,
+	__global const int*  indicesA,
+	const float4* verticesB,
+	const float4* uniqueEdgesB, 
+	const btGpuFace* facesB,
+	const int*  indicesB,
+	float4* sep,
+	float* dmin)
+{
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	// Test normals from hullA
+	const int faceBase = hullA->m_faceOffset;
+	const int faceCount = hullA->m_numFaces;
+	for(int f=0;f<faceCount;f++)
+	{
+		// qtRotate ignores the w lane, so the plane constant does not leak into the axis.
+		float4 faceANormalWS = qtRotate(ornA,facesA[faceBase+f].m_plane);
+		if (dot3F4(DeltaC2,faceANormalWS)<0)
+			faceANormalWS *= -1.f;
+		float depth;
+		// Swapped argument order: the non-__global hull (hullB) goes first.
+		if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &depth))
+			return false;
+		if(depth<*dmin)
+		{
+			*dmin = depth;
+			*sep = faceANormalWS;
+		}
+	}
+	// Final orientation of the reported axis relative to DeltaC2.
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+// Edge-edge stage of the separating-axis search: for every pair (unique edge of hull A,
+// unique edge of hull B) the normalized cross product of the two world-space edges is
+// used as a candidate axis, and *sep / *dmin are updated when its overlap is smaller
+// than the current *dmin.
+//   hullA, verticesA, uniqueEdgesA : non-__global hull data
+//   hullB, verticesB, uniqueEdgesB : __global hull data
+//   DeltaC2                        : vector used to orient every candidate axis
+// facesA, indicesA, facesB and indicesB are accepted but not read here.
+// Always returns true.
+// NOTE(review): unlike the face-normal stages, a disjoint projection is not an early-out
+// here; the (then negative) overlap is simply folded into *dmin. The previous code carried
+// a flag that was cleared on separation and then unconditionally set again, so it never
+// had any effect. Confirm whether edge-edge separation should return false; if so the
+// calling kernel must also reset its has-separating-normal flag for the pair.
+bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+		float4* sep,
+	float* dmin)
+{
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	const int numEdgesA = hullA->m_numUniqueEdges;
+	const int numEdgesB = hullB->m_numUniqueEdges;
+	// Test edges
+	for(int e0=0;e0<numEdgesA;e0++)
+	{
+		const float4 edge0World = qtRotate(ornA,uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0]);
+		for(int e1=0;e1<numEdgesB;e1++)
+		{
+			const float4 edge1World = qtRotate(ornB,uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1]);
+			float4 crossje = cross3(edge0World,edge1World);
+			// Near-zero cross product: no usable axis from this edge pair.
+			if(IsAlmostZero(crossje))
+				continue;
+			crossje = normalize3(crossje);
+			if (dot3F4(DeltaC2,crossje)<0)
+				crossje *= -1.f;
+			float Min0,Max0;
+			float Min1,Max1;
+			projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);
+			project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);
+			// Smaller of the two overlaps; negative when the intervals are disjoint.
+			const float d0 = Max0 - Min1;
+			const float d1 = Max1 - Min0;
+			const float dist = d0<d1 ? d0:d1;
+			if(dist<*dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+	}
+	// Final orientation of the reported axis relative to DeltaC2.
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+// Selects the pair of faces to clip against each other for one collision pair and
+// stores their world-space data in the per-pair output slots.
+//   - face of B: the one whose world normal has the largest dot with separatingNormal;
+//     its vertices are transformed by (posB, ornB) into worldVertsB1
+//   - face of A: the one whose world normal has the smallest dot with separatingNormal;
+//     its world normal goes to worldNormalsA1[pairIndex] and its transformed vertices
+//     into worldVertsA1
+//   - clippingFaces[pairIndex] = (face index A, face index B, #verts A, #verts B)
+// Vertex output for the pair starts at pairIndex*capacityWorldVerts and at most
+// capacityWorldVerts vertices are written per face.
+// hullA/verticesA/facesA/indicesA are non-__global, the B counterparts are __global.
+// minDist and maxDist are accepted but not read. The return value is always 0.
+inline int	findClippingFaces(const float4 separatingNormal,
+                      const ConvexPolyhedronCL* hullA, 
+					  __global const ConvexPolyhedronCL* hullB,
+                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,
+                       __global float4* worldVertsA1,
+                      __global float4* worldNormalsA1,
+                      __global float4* worldVertsB1,
+                      int capacityWorldVerts,
+                      const float minDist, float maxDist,
+					  const float4* verticesA,
+                      const btGpuFace* facesA,
+                      const int* indicesA,
+					  __global const float4* verticesB,
+                      __global const btGpuFace* facesB,
+                      __global const int* indicesB,
+                      __global int4* clippingFaces, int pairIndex)
+{
+	// First output slot of this pair in the worldVerts buffers.
+	const int outBase = pairIndex*capacityWorldVerts;
+	// Face of B most aligned with the separating normal.
+	int closestFaceB=0;
+	{
+		float dmax = -FLT_MAX;
+		const int faceCountB = hullB->m_numFaces;
+		for(int face=0;face<faceCountB;face++)
+		{
+			// Read the plane once; only its xyz (the normal) is used.
+			const float4 plane = facesB[hullB->m_faceOffset+face].m_plane;
+			const float4 Normal = make_float4(plane.x, plane.y, plane.z,0.f);
+			const float4 WorldNormal = qtRotate(ornB, Normal);
+			const float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+	int numWorldVertsB1= 0;
+	{
+		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];
+		// Clamp to [0, capacityWorldVerts] so the writes stay inside this pair's slot.
+		int numVertices = polyB.m_numIndices;
+		if (numVertices>capacityWorldVerts)
+			numVertices = capacityWorldVerts;
+		if (numVertices<0)
+			numVertices = 0;
+		for(int e0=0;e0<numVertices;e0++)
+		{
+			const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+			worldVertsB1[outBase+numWorldVertsB1++] = transform(&b,&posB,&ornB);
+		}
+	}
+	// Face of A whose normal points most against the separating normal.
+	int closestFaceA=0;
+	{
+		float dmin = FLT_MAX;
+		const int faceCountA = hullA->m_numFaces;
+		for(int face=0;face<faceCountA;face++)
+		{
+			const float4 plane = facesA[hullA->m_faceOffset+face].m_plane;
+			const float4 Normal = make_float4(plane.x, plane.y, plane.z, 0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+			const float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+				// Written on every improvement so the last write is the best face's normal.
+				worldNormalsA1[pairIndex] = faceANormalWS;
+			}
+		}
+	}
+	const btGpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];
+	// Same clamp as for B.
+	int numVerticesA = polyA.m_numIndices;
+	if (numVerticesA>capacityWorldVerts)
+		numVerticesA = capacityWorldVerts;
+	if (numVerticesA<0)
+		numVerticesA=0;
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+		const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];
+		worldVertsA1[outBase+e0] = transform(&a, &posA,&ornA);
+	}
+	clippingFaces[pairIndex].x = closestFaceA;
+	clippingFaces[pairIndex].y = closestFaceB;
+	clippingFaces[pairIndex].z = numVerticesA;
+	clippingFaces[pairIndex].w = numWorldVertsB1;
+	// No contacts are produced by this function; the count has always been 0.
+	return 0;
+}
+// work-in-progress
+// Face-normal stage of the triangle-vs-convex separating-axis test; one work-item per
+// entry of concavePairs.
+//   concavePairs[i].x / .y : indices of body A and body B
+//   concavePairs[i].z      : face index within shape A; that face's first three indices
+//                            form the triangle that is tested
+//   concavePairs[i].w      : child-shape index when B is a compound of convex hulls;
+//                            set to -1 by this kernel to mark the pair inactive
+// When both face-normal tests find no separating axis, the kernel writes dmins[i] and
+// concaveSeparatingNormalsOut[i] and sets concaveHasSeparatingNormals[i] to 1.
+// clippingFacesOut, worldVertsA1GPU, worldNormalsAGPU, worldVertsB1GPU and
+// vertexFaceCapacity are accepted but not referenced in this kernel.
+__kernel void   findConcaveSeparatingAxisVertexFaceKernel( __global int4* concavePairs,
+                                                __global const BodyData* rigidBodies,
+                                                __global const btCollidableGpu* collidables,
+                                                __global const ConvexPolyhedronCL* convexShapes,
+                                                __global const float4* vertices,
+                                                __global const float4* uniqueEdges,
+                                                __global const btGpuFace* faces,
+                                                __global const int* indices,
+                                                __global const btGpuChildShape* gpuChildShapes,
+                                                __global btAabbCL* aabbs,
+                                                __global float4* concaveSeparatingNormalsOut,
+                                                __global int* concaveHasSeparatingNormals,
+                                                __global int4* clippingFacesOut,
+                                                __global float4* worldVertsA1GPU,
+                                                __global float4*  worldNormalsAGPU,
+                                                __global float4* worldVertsB1GPU,
+                                                __global float* dmins,
+                                                int vertexFaceCapacity,
+                                                int numConcavePairs
+                                                )
+	int i = get_global_id(0);
+	if (i>=numConcavePairs)
+		return;
+	// Cleared up front; only set back to 1 on the success path at the end.
+	concaveHasSeparatingNormals[i] = 0;
+	int pairIdx = i;
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	// Only convex hulls and compounds of convex hulls are handled for body B.
+	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&
+		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)
+	{
+		concavePairs[pairIdx].w = -1;
+		return;
+	}
+	// NOTE(review): numFacesA, numActualConcaveConvexTests, triMinAabb, triMaxAabb,
+	// localCC and c1 below are never read.
+	int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+	int numActualConcaveConvexTests = 0;
+	int f = concavePairs[i].z;
+	bool overlap = false;
+	ConvexPolyhedronCL convexPolyhedronA;
+	//add 3 vertices of the triangle
+	convexPolyhedronA.m_numVertices = 3;
+	convexPolyhedronA.m_vertexOffset = 0;
+	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+	float4 triMinAabb, triMaxAabb;
+	btAabbCL triAabb;
+	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);
+	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);
+	float4 verticesA[3];
+	// Gather the triangle's vertices, their sum (for the centroid) and their bounds.
+	// Note: this loop's i shadows the work-item index i declared above.
+	for (int i=0;i<3;i++)
+	{
+		int index = indices[face.m_indexOffset+i];
+		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+		verticesA[i] = vert;
+		localCenter += vert;
+		triAabb.m_min = min(triAabb.m_min,vert);
+		triAabb.m_max = max(triAabb.m_max,vert);
+	}
+	// Per-axis AABB rejection of the triangle against body B's AABB.
+	// NOTE(review): triAabb is built from the untransformed vertices of shape A, while
+	// aabbs[] is presumably world-space; this looks correct only if body A's transform
+	// is identity - confirm.
+	overlap = true;
+	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;
+	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;
+	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;
+	if (overlap)
+	{
+		float dmin = FLT_MAX;
+		int hasSeparatingAxis=5;
+		float4 sepAxis=make_float4(1,2,3,4);
+		int localCC=0;
+		numActualConcaveConvexTests++;
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		float4 uniqueEdgesA[3];
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+		convexPolyhedronA.m_faceOffset = 0;
+		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+		// NOTE(review): facesA is written below but its declaration is not visible in this
+		// chunk. It needs room for the 5 faces built here (front, back, 3 edge planes),
+		// presumably TRIANGLE_NUM_CONVEX_FACES entries - confirm.
+		// indicesA holds 3 (front) + 3 (back) + 2 per edge plane.
+		int indicesA[3+3+2+2+2];
+		int curUsedIndices=0;
+		int fidx=0;
+		//front side of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back side of triangle (reversed winding, negated normal)
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2;
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			// Plane constant for the negated normal through verticesA[0]: -(-normal . v0).
+			float c = dot(normal,verticesA[0]);
+			float c1 = -face.m_plane.w;
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			// One extra two-index face per triangle edge; its normal is
+			// normalize(cross(normal, v1-v0)) and its plane passes through v0.
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int i=0;i<numVertices;i++)
+			{
+				float4 v0 = verticesA[i];
+				float4 v1 = verticesA[prevVertex];
+				float4 edgeNormal = normalize(cross(normal,v1-v0));
+				float c = -dot(edgeNormal,v0);
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=i;
+				indicesA[curUsedIndices++]=prevVertex;
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;
+		// Centroid of the triangle (sum of the three vertices divided by 3).
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		///////////////////
+		///compound shape support
+		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			// Replace B's pose and shape by those of the selected child: the child's local
+			// transform is composed with the body's transform.
+			int compoundChild = concavePairs[pairIdx].w;
+			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+		}
+		//////////////////
+		// World-space centers of both shapes; their difference orients the candidate axes.
+		float4 c0local = convexPolyhedronA.m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+		// Face normals of the triangle polyhedron against shape B.
+		bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],
+                                             posA,ornA,
+                                             posB,ornB,
+                                             DeltaC2,
+                                             verticesA,uniqueEdgesA,facesA,indicesA,
+                                             vertices,uniqueEdges,faces,indices,
+                                             &sepAxis,&dmin);
+		hasSeparatingAxis = 4;
+		if (!sepA)
+		{
+			hasSeparatingAxis = 0;
+		} else
+		{
+			// Face normals of shape B against the triangle polyhedron (arguments swapped).
+			bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,
+                                                 posB,ornB,
+                                                 posA,ornA,
+                                                 DeltaC2,
+                                                 vertices,uniqueEdges,faces,indices,
+                                                 verticesA,uniqueEdgesA,facesA,indicesA,
+                                                 &sepAxis,&dmin);
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			} else
+			{
+				hasSeparatingAxis = 1;
+			}
+		}	
+		// hasSeparatingAxis != 0 here means neither face-normal test separated the shapes.
+		if (hasSeparatingAxis)
+		{
+            dmins[i] = dmin;
+			concaveSeparatingNormalsOut[pairIdx]=sepAxis;
+			concaveHasSeparatingNormals[i]=1;
+		} else
+		{	
+			//mark this pair as in-active
+			concavePairs[pairIdx].w = -1;
+		}
+	}
+	else
+	{	
+		//mark this pair as in-active
+		concavePairs[pairIdx].w = -1;
+	}
+// work-in-progress
+// Edge-edge stage of the triangle-vs-convex separating-axis test; one work-item per
+// entry of concavePairs. Pairs whose concaveHasSeparatingNormals[i] is 0 are skipped.
+// The kernel rebuilds the same triangle polyhedron as the vertex-face kernel, resumes
+// from dmins[i] and concaveSeparatingNormalsOut[i], runs the edge-edge tests, then
+// stores the axis (with dmin in its w lane) and calls findClippingFaces for the pair.
+//   concavePairs[i].x / .y : indices of body A and body B
+//   concavePairs[i].z      : face index within shape A on entry; set to -1 on exit
+//   concavePairs[i].w      : child-shape index when B is a compound of convex hulls;
+//                            set to -1 to mark the pair inactive
+__kernel void   findConcaveSeparatingAxisEdgeEdgeKernel( __global int4* concavePairs,
+                                                          __global const BodyData* rigidBodies,
+                                                          __global const btCollidableGpu* collidables,
+                                                          __global const ConvexPolyhedronCL* convexShapes,
+                                                          __global const float4* vertices,
+                                                          __global const float4* uniqueEdges,
+                                                          __global const btGpuFace* faces,
+                                                          __global const int* indices,
+                                                          __global const btGpuChildShape* gpuChildShapes,
+                                                          __global btAabbCL* aabbs,
+                                                          __global float4* concaveSeparatingNormalsOut,
+                                                          __global int* concaveHasSeparatingNormals,
+                                                          __global int4* clippingFacesOut,
+                                                          __global float4* worldVertsA1GPU,
+                                                          __global float4*  worldNormalsAGPU,
+                                                          __global float4* worldVertsB1GPU,
+                                                          __global float* dmins,
+                                                          int vertexFaceCapacity,
+                                                          int numConcavePairs
+                                                          )
+	int i = get_global_id(0);
+	if (i>=numConcavePairs)
+		return;
+	// Nothing to do for pairs that carry no separating-normal result.
+	if (!concaveHasSeparatingNormals[i])
+        return;
+	int pairIdx = i;
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	// NOTE(review): numFacesA, numActualConcaveConvexTests, triMinAabb, triMaxAabb,
+	// localCC and c1 below are never read.
+	int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+	int numActualConcaveConvexTests = 0;
+	int f = concavePairs[i].z;
+	bool overlap = false;
+	ConvexPolyhedronCL convexPolyhedronA;
+	//add 3 vertices of the triangle
+	convexPolyhedronA.m_numVertices = 3;
+	convexPolyhedronA.m_vertexOffset = 0;
+	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+	float4 triMinAabb, triMaxAabb;
+	btAabbCL triAabb;
+	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);
+	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);
+	float4 verticesA[3];
+	// Gather the triangle's vertices, their sum (for the centroid) and their bounds.
+	// Note: this loop's i shadows the work-item index i declared above.
+	for (int i=0;i<3;i++)
+	{
+		int index = indices[face.m_indexOffset+i];
+		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+		verticesA[i] = vert;
+		localCenter += vert;
+		triAabb.m_min = min(triAabb.m_min,vert);
+		triAabb.m_max = max(triAabb.m_max,vert);
+	}
+	// Per-axis AABB rejection of the triangle against body B's AABB.
+	// NOTE(review): triAabb is built from the untransformed vertices of shape A, while
+	// aabbs[] is presumably world-space; this looks correct only if body A's transform
+	// is identity - confirm.
+	overlap = true;
+	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;
+	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;
+	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;
+	if (overlap)
+	{
+		// Resume from the smallest overlap and axis stored for this pair.
+		float dmin = dmins[i];
+		int hasSeparatingAxis=5;
+		float4 sepAxis=make_float4(1,2,3,4);
+        sepAxis = concaveSeparatingNormalsOut[pairIdx];
+		int localCC=0;
+		numActualConcaveConvexTests++;
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		float4 uniqueEdgesA[3];
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+		convexPolyhedronA.m_faceOffset = 0;
+		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+		// NOTE(review): facesA is written below but its declaration is not visible in this
+		// chunk. It needs room for the 5 faces built here (front, back, 3 edge planes),
+		// presumably TRIANGLE_NUM_CONVEX_FACES entries - confirm.
+		// indicesA holds 3 (front) + 3 (back) + 2 per edge plane.
+		int indicesA[3+3+2+2+2];
+		int curUsedIndices=0;
+		int fidx=0;
+		//front side of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back side of triangle (reversed winding, negated normal)
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2;
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			// Plane constant for the negated normal through verticesA[0]: -(-normal . v0).
+			float c = dot(normal,verticesA[0]);
+			float c1 = -face.m_plane.w;
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			// One extra two-index face per triangle edge; its normal is
+			// normalize(cross(normal, v1-v0)) and its plane passes through v0.
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int i=0;i<numVertices;i++)
+			{
+				float4 v0 = verticesA[i];
+				float4 v1 = verticesA[prevVertex];
+				float4 edgeNormal = normalize(cross(normal,v1-v0));
+				float c = -dot(edgeNormal,v0);
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=i;
+				indicesA[curUsedIndices++]=prevVertex;
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;
+		// Centroid of the triangle (sum of the three vertices divided by 3).
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		///////////////////
+		///compound shape support
+		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			// Replace B's pose and shape by those of the selected child: the child's local
+			// transform is composed with the body's transform.
+			int compoundChild = concavePairs[pairIdx].w;
+			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+		}
+		//////////////////
+		// World-space centers of both shapes; their difference orients the candidate axes.
+		float4 c0local = convexPolyhedronA.m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+		{
+			// NOTE(review): findSeparatingAxisEdgeEdgeLocalA as defined in this file always
+			// returns true, so the hasSeparatingAxis = 0 branch below is currently unreachable.
+			bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],
+                                                              posA,ornA,
+                                                              posB,ornB,
+                                                              DeltaC2,
+                                                              verticesA,uniqueEdgesA,facesA,indicesA,
+                                                              vertices,uniqueEdges,faces,indices,
+                                                              &sepAxis,&dmin);
+			if (!sepEE)
+			{
+				hasSeparatingAxis = 0;
+			} else
+			{
+				hasSeparatingAxis = 1;
+			}
+		}
+		if (hasSeparatingAxis)
+		{
+			// The smallest overlap travels with the axis in its w lane.
+			sepAxis.w = dmin;
+            dmins[i] = dmin;
+			concaveSeparatingNormalsOut[pairIdx]=sepAxis;
+			concaveHasSeparatingNormals[i]=1;
+			// minDist / maxDist are passed through; findClippingFaces does not read them.
+ 	float minDist = -1e30f;
+			float maxDist = 0.02f;
+            findClippingFaces(sepAxis,
+                              &convexPolyhedronA,
+                              &convexShapes[shapeIndexB],
+                              posA,ornA,
+                              posB,ornB,
+                              worldVertsA1GPU,
+                              worldNormalsAGPU,
+                              worldVertsB1GPU,
+                              vertexFaceCapacity,
+                              minDist, maxDist,
+                              verticesA,
+                              facesA,
+                              indicesA,
+                              vertices,
+                              faces,
+                              indices,
+                              clippingFacesOut, pairIdx);
+		} else
+		{	
+			//mark this pair as in-active
+			concavePairs[pairIdx].w = -1;
+		}
+	}
+	else
+	{	
+		//mark this pair as in-active
+		concavePairs[pairIdx].w = -1;
+	}
+	concavePairs[i].z = -1;//for the next stage, z is used to determine existing contact points
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h
new file mode 100644
index 00000000..611569ca
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h
@@ -0,0 +1,1457 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* satConcaveKernelsCL= \
+"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
+"//written by Erwin Coumans\n"
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define B3_MAX_STACK_DEPTH 256\n"
+"typedef unsigned int u32;\n"
+"///keep this in sync with btCollidable.h\n"
+"typedef struct\n"
+"	union {\n"
+"		int m_numChildShapes;\n"
+"		int m_bvhIndex;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float m_radius;\n"
+"		int	m_compoundBvhIndex;\n"
+"	};\n"
+"	\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"#define MAX_NUM_PARTS_IN_BITS 10\n"
+"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
+"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
+"typedef struct\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes\n"
+"	int	m_escapeIndexOrTriangleIndex;\n"
+"} b3QuantizedBvhNode;\n"
+"typedef struct\n"
+"	float4		m_aabbMin;\n"
+"	float4		m_aabbMax;\n"
+"	float4		m_quantization;\n"
+"	int			m_numNodes;\n"
+"	int			m_numSubTrees;\n"
+"	int			m_nodeOffset;\n"
+"	int			m_subTreeOffset;\n"
+"} b3BvhInfo;\n"
+"int	getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n"
+"	unsigned int x=0;\n"
+"	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
+"	// Get only the lower bits where the triangle index is stored\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
+"int	getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"	unsigned int x=0;\n"
+"	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
+"	// Get only the lower bits where the triangle index is stored\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
+"int isLeafNode(const b3QuantizedBvhNode* rootNode)\n"
+"	//skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
+"int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"	//skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
+"	\n"
+"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n"
+"	return -rootNode->m_escapeIndexOrTriangleIndex;\n"
+"int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"	return -rootNode->m_escapeIndexOrTriangleIndex;\n"
+"typedef struct\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes, points to the root of the subtree\n"
+"	int			m_rootNodeIndex;\n"
+"	//4 bytes\n"
+"	int			m_subtreeSize;\n"
+"	int			m_padding[3];\n"
+"} b3BvhSubtreeInfo;\n"
+"typedef struct\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"typedef struct\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_collidableIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"typedef struct  \n"
+"	float4		m_localCenter;\n"
+"	float4		m_extents;\n"
+"	float4		mC;\n"
+"	float4		mE;\n"
+"	\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"} ConvexPolyhedronCL;\n"
+"typedef struct \n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"#ifndef B3_AABB_H\n"
+"#define B3_AABB_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v) // true when |x|, |y| and |z| are all <= 1e-6; w is not inspected\n"
+"	if(b3Fabs(v.x)>1e-6f || b3Fabs(v.y)>1e-6f || b3Fabs(v.z)>1e-6f) // float literals, matching IsAlmostZero; avoids double-precision constants\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
+"	b3Quat q;\n"
+"	q=in;\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"typedef struct\n"
+"	b3Float4 m_row[3];\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n"
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"b3Mat3x3 mtZero();\n"
+"b3Mat3x3 mtIdentity();\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"b3Mat3x3 mtZero()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"b3Mat3x3 mtIdentity()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b) // row-vector times matrix: dots a.xyz with each column of b\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans = b3MakeFloat4(0.f, 0.f, 0.f, 0.f); // w is not computed below; start from zero so it is defined, as in mtMul1\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3Aabb b3Aabb_t;\n"
+"struct b3Aabb\n"
+"	union\n"
+"	{\n"
+"		float m_min[4];\n"
+"		b3Float4 m_minVec;\n"
+"		int m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float	m_max[4];\n"
+"		b3Float4 m_maxVec;\n"
+"		int m_signedMaxIndices[4];\n"
+"	};\n"
+"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n"
+"						b3Float4ConstArg pos,\n"
+"						b3QuatConstArg orn,\n"
+"						b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n"
+"		b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n"
+"		localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n"
+"		b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n"
+"		b3Mat3x3 m;\n"
+"		m = b3QuatGetRotationMatrix(orn);\n"
+"		b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n"
+"		b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n"
+"		\n"
+"		b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n"
+"										 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n"
+"										 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n"
+"										 0.f);\n"
+"		*aabbMinOut = center-extent;\n"
+"		*aabbMaxOut = center+extent;\n"
+"/// conservative test for overlap between two aabbs\n"
+"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n"
+"								b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n"
+"	bool overlap = true;\n"
+"	overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n"
+"	overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n"
+"	overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n"
+"	return overlap;\n"
+"#endif //B3_AABB_H\n"
+"Bullet Continuous Collision Detection and Physics Library\n"
+"Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose,\n"
+"including commercial applications, and to alter it and redistribute it freely,\n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"#ifndef B3_INT2_H\n"
+"#define B3_INT2_H\n"
+"#ifdef __cplusplus\n"
+"#define b3UnsignedInt2 uint2\n"
+"#define b3Int2 int2\n"
+"#define b3MakeInt2 (int2)\n"
+"#endif //__cplusplus\n"
+"typedef struct\n"
+"	float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"} btGpuFace;\n"
+"#define make_float4 (float4)\n"
+"float4 cross3(float4 a, float4 b)\n"
+"	return cross(a,b);\n"
+"	\n"
+"//	float4 a1 = make_float4(a.xyz,0.f);\n"
+"//	float4 b1 = make_float4(b.xyz,0.f);\n"
+"//	return cross(a1,b1);\n"
+"//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n"
+"	\n"
+"	//	float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n"
+"	\n"
+"	//return c;\n"
+"float dot3F4(float4 a, float4 b)\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"float4 fastNormalize4(float4 v)\n"
+"	v = make_float4(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"//	Quaternion\n"
+"typedef float4 Quaternion;\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"	return fastNormalize4(in);\n"
+"//	in /= length( in );\n"
+"//	return in;\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"	return qtRotate( *orientation, *p ) + (*translation);\n"
+"float4 normalize3(const float4 a)\n"
+"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, const float4* vertices, float* min, float* max)\n"
+"	min[0] = FLT_MAX;\n"
+"	max[0] = -FLT_MAX;\n"
+"	int numVerts = hull->m_numVertices;\n"
+"	const float4 localDir = qtInvRotate(orn,*dir);\n"
+"	float offset = dot(pos,*dir);\n"
+"	for(int i=0;i<numVerts;i++)\n"
+"	{\n"
+"		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n"
+"		if(dp < min[0])	\n"
+"			min[0] = dp;\n"
+"		if(dp > max[0])	\n"
+"			max[0] = dp;\n"
+"	}\n"
+"	if(min[0]>max[0])\n"
+"	{\n"
+"		float tmp = min[0];\n"
+"		min[0] = max[0];\n"
+"		max[0] = tmp;\n"
+"	}\n"
+"	min[0] += offset;\n"
+"	max[0] += offset;\n"
+"inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, __global const float4* vertices, float* min, float* max)\n"
+"	min[0] = FLT_MAX;\n"
+"	max[0] = -FLT_MAX;\n"
+"	int numVerts = hull->m_numVertices;\n"
+"	const float4 localDir = qtInvRotate(orn,*dir);\n"
+"	float offset = dot(pos,*dir);\n"
+"	for(int i=0;i<numVerts;i++)\n"
+"	{\n"
+"		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n"
+"		if(dp < min[0])	\n"
+"			min[0] = dp;\n"
+"		if(dp > max[0])	\n"
+"			max[0] = dp;\n"
+"	}\n"
+"	if(min[0]>max[0])\n"
+"	{\n"
+"		float tmp = min[0];\n"
+"		min[0] = max[0];\n"
+"		max[0] = tmp;\n"
+"	}\n"
+"	min[0] += offset;\n"
+"	max[0] += offset;\n"
+"inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA,const float4 ornA,\n"
+"	const float4 posB,const float4 ornB,\n"
+"	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n"
+"	float Min0,Max0;\n"
+"	float Min1,Max1;\n"
+"	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);\n"
+"	project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);\n"
+"	if(Max0<Min1 || Max1<Min0)\n"
+"		return false;\n"
+"	float d0 = Max0 - Min1;\n"
+"	float d1 = Max1 - Min0;\n"
+"	*depth = d0<d1 ? d0:d1;\n"
+"	return true;\n"
+"inline bool IsAlmostZero(const float4 v)\n"
+"	if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f)\n"
+"		return false;\n"
+"	return true;\n"
+"bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	\n"
+"	const float4* verticesA, \n"
+"	const float4* uniqueEdgesA, \n"
+"	const btGpuFace* facesA,\n"
+"	const int*  indicesA,\n"
+"	__global const float4* verticesB, \n"
+"	__global const float4* uniqueEdgesB, \n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int*  indicesB,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"	\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n"
+"	{\n"
+"		int numFacesA = hullA->m_numFaces;\n"
+"		// Test normals from hullA\n"
+"		for(int i=0;i<numFacesA;i++)\n"
+"		{\n"
+"			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n"
+"			float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"			if (dot3F4(DeltaC2,faceANormalWS)<0)\n"
+"				faceANormalWS*=-1.f;\n"
+"			curPlaneTests++;\n"
+"			float d;\n"
+"			if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n"
+"				return false;\n"
+"			if(d<*dmin)\n"
+"			{\n"
+"				*dmin = d;\n"
+"				*sep = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* verticesA, \n"
+"	__global const float4* uniqueEdgesA, \n"
+"	__global const btGpuFace* facesA,\n"
+"	__global const int*  indicesA,\n"
+"	const float4* verticesB,\n"
+"	const float4* uniqueEdgesB, \n"
+"	const btGpuFace* facesB,\n"
+"	const int*  indicesB,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n"
+"	{\n"
+"		int numFacesA = hullA->m_numFaces;\n"
+"		// Test normals from hullA\n"
+"		for(int i=0;i<numFacesA;i++)\n"
+"		{\n"
+"			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n"
+"			float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"			if (dot3F4(DeltaC2,faceANormalWS)<0)\n"
+"				faceANormalWS *= -1.f;\n"
+"			curPlaneTests++;\n"
+"			float d;\n"
+"			if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))\n"
+"				return false;\n"
+"			if(d<*dmin)\n"
+"			{\n"
+"				*dmin = d;\n"
+"				*sep = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, // SAT test over the cross products of every unique-edge pair of hullA and hullB\n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	const float4* verticesA, \n"
+"	const float4* uniqueEdgesA, \n"
+"	const btGpuFace* facesA,\n"
+"	const int*  indicesA,\n"
+"	__global const float4* verticesB, \n"
+"	__global const float4* uniqueEdgesB, \n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int*  indicesB,\n"
+"		float4* sep, // in/out: axis with the smallest overlap found so far\n"
+"	float* dmin) // in/out: smallest overlap depth found so far; the function returns false as soon as one axis separates the hulls\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n"
+"	int curEdgeEdge = 0;\n"
+"	// Test edges\n"
+"	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n"
+"	{\n"
+"		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n"
+"		float4 edge0World = qtRotate(ornA,edge0);\n"
+"		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n"
+"		{\n"
+"			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n"
+"			float4 edge1World = qtRotate(ornB,edge1);\n"
+"			float4 crossje = cross3(edge0World,edge1World);\n"
+"			curEdgeEdge++;\n"
+"			if(!IsAlmostZero(crossje)) // skip (near-)parallel edge pairs: their cross product gives no usable axis\n"
+"			{\n"
+"				crossje = normalize3(crossje);\n"
+"				if (dot3F4(DeltaC2,crossje)<0)\n"
+"					crossje *= -1.f;\n"
+"				float dist;\n"
+"				// project both hulls onto the candidate axis and compare the two intervals\n"
+"				{\n"
+"					float Min0,Max0;\n"
+"					float Min1,Max1;\n"
+"					projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);\n"
+"					project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);\n"
+"				\n"
+"					if(Max0<Min1 || Max1<Min0)\n"
+"						return false; // intervals are disjoint: crossje separates the hulls\n"
+"				\n"
+"					float d0 = Max0 - Min1;\n"
+"					float d1 = Max1 - Min0;\n"
+"					dist = d0<d1 ? d0:d1;\n"
+"					// dist is the smaller of the two overlap depths along crossje\n"
+"				}\n"
+"				\n"
+"				if(dist<*dmin)\n"
+"				{\n"
+"					*dmin = dist;\n"
+"					*sep = crossje;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f) // orient the reported axis so that dot(DeltaC2,*sep) is not negative\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"inline int	findClippingFaces(const float4 separatingNormal,\n"
+"                      const ConvexPolyhedronCL* hullA, \n"
+"					  __global const ConvexPolyhedronCL* hullB,\n"
+"                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n"
+"                       __global float4* worldVertsA1,\n"
+"                      __global float4* worldNormalsA1,\n"
+"                      __global float4* worldVertsB1,\n"
+"                      int capacityWorldVerts,\n"
+"                      const float minDist, float maxDist,\n"
+"					  const float4* verticesA,\n"
+"                      const btGpuFace* facesA,\n"
+"                      const int* indicesA,\n"
+"					  __global const float4* verticesB,\n"
+"                      __global const btGpuFace* facesB,\n"
+"                      __global const int* indicesB,\n"
+"                      __global int4* clippingFaces, int pairIndex)\n"
+"	int numContactsOut = 0;\n"
+"	int numWorldVertsB1= 0;\n"
+"    \n"
+"    \n"
+"	int closestFaceB=0;\n"
+"	float dmax = -FLT_MAX;\n"
+"    \n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,\n"
+"                                              facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"	{\n"
+"		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
+"		int numVertices = polyB.m_numIndices;\n"
+"        if (numVertices>capacityWorldVerts)\n"
+"            numVertices = capacityWorldVerts;\n"
+"        if (numVertices<0)\n"
+"            numVertices = 0;\n"
+"        \n"
+"		for(int e0=0;e0<numVertices;e0++)\n"
+"		{\n"
+"            if (e0<capacityWorldVerts)\n"
+"            {\n"
+"                const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n"
+"                worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"            }\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int closestFaceA=0;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.x,\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.y,\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.z,\n"
+"                                              0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"            \n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"                worldNormalsA1[pairIndex] = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;\n"
+"    if (numVerticesA>capacityWorldVerts)\n"
+"       numVerticesA = capacityWorldVerts;\n"
+"    if (numVerticesA<0)\n"
+"        numVerticesA=0;\n"
+"    \n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"        if (e0<capacityWorldVerts)\n"
+"        {\n"
+"            const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n"
+"            worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n"
+"        }\n"
+"    }\n"
+"    \n"
+"    clippingFaces[pairIndex].x = closestFaceA;\n"
+"    clippingFaces[pairIndex].y = closestFaceB;\n"
+"    clippingFaces[pairIndex].z = numVerticesA;\n"
+"    clippingFaces[pairIndex].w = numWorldVertsB1;\n"
+"    \n"
+"    \n"
+"	return numContactsOut;\n"
+"// work-in-progress\n"
+"__kernel void   findConcaveSeparatingAxisVertexFaceKernel( __global int4* concavePairs,\n"
+"                                                __global const BodyData* rigidBodies,\n"
+"                                                __global const btCollidableGpu* collidables,\n"
+"                                                __global const ConvexPolyhedronCL* convexShapes,\n"
+"                                                __global const float4* vertices,\n"
+"                                                __global const float4* uniqueEdges,\n"
+"                                                __global const btGpuFace* faces,\n"
+"                                                __global const int* indices,\n"
+"                                                __global const btGpuChildShape* gpuChildShapes,\n"
+"                                                __global btAabbCL* aabbs,\n"
+"                                                __global float4* concaveSeparatingNormalsOut,\n"
+"                                                __global int* concaveHasSeparatingNormals,\n"
+"                                                __global int4* clippingFacesOut,\n"
+"                                                __global float4* worldVertsA1GPU,\n"
+"                                                __global float4*  worldNormalsAGPU,\n"
+"                                                __global float4* worldVertsB1GPU,\n"
+"                                                __global float* dmins,\n"
+"                                                int vertexFaceCapacity,\n"
+"                                                int numConcavePairs\n"
+"                                                )\n"
+"    \n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numConcavePairs)\n"
+"		return;\n"
+"    \n"
+"	concaveHasSeparatingNormals[i] = 0;\n"
+"    \n"
+"	int pairIdx = i;\n"
+"    \n"
+"	int bodyIndexA = concavePairs[i].x;\n"
+"	int bodyIndexB = concavePairs[i].y;\n"
+"    \n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"    \n"
+"	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"    \n"
+"	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n"
+"		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"	{\n"
+"		concavePairs[pairIdx].w = -1;\n"
+"		return;\n"
+"	}\n"
+"    \n"
+"    \n"
+"    \n"
+"	int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
+"	int numActualConcaveConvexTests = 0;\n"
+"	\n"
+"	int f = concavePairs[i].z;\n"
+"	\n"
+"	bool overlap = false;\n"
+"	\n"
+"	ConvexPolyhedronCL convexPolyhedronA;\n"
+"    \n"
+"	//add 3 vertices of the triangle\n"
+"	convexPolyhedronA.m_numVertices = 3;\n"
+"	convexPolyhedronA.m_vertexOffset = 0;\n"
+"	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
+"    \n"
+"	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"	float4 triMinAabb, triMaxAabb;\n"
+"	btAabbCL triAabb;\n"
+"	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n"
+"	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n"
+"	\n"
+"	float4 verticesA[3];\n"
+"	for (int i=0;i<3;i++)\n"
+"	{\n"
+"		int index = indices[face.m_indexOffset+i];\n"
+"		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"		verticesA[i] = vert;\n"
+"		localCenter += vert;\n"
+"        \n"
+"		triAabb.m_min = min(triAabb.m_min,vert);\n"
+"		triAabb.m_max = max(triAabb.m_max,vert);\n"
+"        \n"
+"	}\n"
+"    \n"
+"	overlap = true;\n"
+"	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n"
+"	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n"
+"	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n"
+"    \n"
+"	if (overlap)\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		int hasSeparatingAxis=5;\n"
+"		float4 sepAxis=make_float4(1,2,3,4);\n"
+"        \n"
+"		int localCC=0;\n"
+"		numActualConcaveConvexTests++;\n"
+"        \n"
+"		//a triangle has 3 unique edges\n"
+"		convexPolyhedronA.m_numUniqueEdges = 3;\n"
+"		convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
+"		float4 uniqueEdgesA[3];\n"
+"		\n"
+"		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n"
+"		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
+"		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
+"        \n"
+"        \n"
+"		convexPolyhedronA.m_faceOffset = 0;\n"
+"        \n"
+"		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
+"        \n"
+"		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
+"		int indicesA[3+3+2+2+2];\n"
+"		int curUsedIndices=0;\n"
+"		int fidx=0;\n"
+"        \n"
+"		//front side of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[0] = 0;\n"
+"			indicesA[1] = 1;\n"
+"			indicesA[2] = 2;\n"
+"			curUsedIndices+=3;\n"
+"			float c = face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = normal.x;\n"
+"			facesA[fidx].m_plane.y = normal.y;\n"
+"			facesA[fidx].m_plane.z = normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		//back side of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[3]=2;\n"
+"			indicesA[4]=1;\n"
+"			indicesA[5]=0;\n"
+"			curUsedIndices+=3;\n"
+"			float c = dot(normal,verticesA[0]);\n"
+"			float c1 = -face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = -normal.x;\n"
+"			facesA[fidx].m_plane.y = -normal.y;\n"
+"			facesA[fidx].m_plane.z = -normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"        \n"
+"		bool addEdgePlanes = true;\n"
+"		if (addEdgePlanes)\n"
+"		{\n"
+"			int numVertices=3;\n"
+"			int prevVertex = numVertices-1;\n"
+"			for (int i=0;i<numVertices;i++)\n"
+"			{\n"
+"				float4 v0 = verticesA[i];\n"
+"				float4 v1 = verticesA[prevVertex];\n"
+"                \n"
+"				float4 edgeNormal = normalize(cross(normal,v1-v0));\n"
+"				float c = -dot(edgeNormal,v0);\n"
+"                \n"
+"				facesA[fidx].m_numIndices = 2;\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[curUsedIndices++]=i;\n"
+"				indicesA[curUsedIndices++]=prevVertex;\n"
+"                \n"
+"				facesA[fidx].m_plane.x = edgeNormal.x;\n"
+"				facesA[fidx].m_plane.y = edgeNormal.y;\n"
+"				facesA[fidx].m_plane.z = edgeNormal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				fidx++;\n"
+"				prevVertex = i;\n"
+"			}\n"
+"		}\n"
+"		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n"
+"		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n"
+"        \n"
+"        \n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"        \n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"        \n"
+"		\n"
+"        \n"
+"        \n"
+"		///////////////////\n"
+"		///compound shape support\n"
+"        \n"
+"		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"		{\n"
+"			int compoundChild = concavePairs[pairIdx].w;\n"
+"			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n"
+"			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"			shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"		}\n"
+"		//////////////////\n"
+"        \n"
+"		float4 c0local = convexPolyhedronA.m_localCenter;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n"
+"        \n"
+"        \n"
+"		bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],\n"
+"                                             posA,ornA,\n"
+"                                             posB,ornB,\n"
+"                                             DeltaC2,\n"
+"                                             verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"                                             vertices,uniqueEdges,faces,indices,\n"
+"                                             &sepAxis,&dmin);\n"
+"		hasSeparatingAxis = 4;\n"
+"		if (!sepA)\n"
+"		{\n"
+"			hasSeparatingAxis = 0;\n"
+"		} else\n"
+"		{\n"
+"			bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,\n"
+"                                                 posB,ornB,\n"
+"                                                 posA,ornA,\n"
+"                                                 DeltaC2,\n"
+"                                                 vertices,uniqueEdges,faces,indices,\n"
+"                                                 verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"                                                 &sepAxis,&dmin);\n"
+"            \n"
+"			if (!sepB)\n"
+"			{\n"
+"				hasSeparatingAxis = 0;\n"
+"			} else\n"
+"			{\n"
+"				hasSeparatingAxis = 1;\n"
+"			}\n"
+"		}	\n"
+"		\n"
+"		if (hasSeparatingAxis)\n"
+"		{\n"
+"            dmins[i] = dmin;\n"
+"			concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n"
+"			concaveHasSeparatingNormals[i]=1;\n"
+"            \n"
+"		} else\n"
+"		{	\n"
+"			//mark this pair as in-active\n"
+"			concavePairs[pairIdx].w = -1;\n"
+"		}\n"
+"	}\n"
+"	else\n"
+"	{	\n"
+"		//mark this pair as in-active\n"
+"		concavePairs[pairIdx].w = -1;\n"
+"	}\n"
+"// work-in-progress\n"
+"__kernel void   findConcaveSeparatingAxisEdgeEdgeKernel( __global int4* concavePairs,\n"
+"                                                          __global const BodyData* rigidBodies,\n"
+"                                                          __global const btCollidableGpu* collidables,\n"
+"                                                          __global const ConvexPolyhedronCL* convexShapes,\n"
+"                                                          __global const float4* vertices,\n"
+"                                                          __global const float4* uniqueEdges,\n"
+"                                                          __global const btGpuFace* faces,\n"
+"                                                          __global const int* indices,\n"
+"                                                          __global const btGpuChildShape* gpuChildShapes,\n"
+"                                                          __global btAabbCL* aabbs,\n"
+"                                                          __global float4* concaveSeparatingNormalsOut,\n"
+"                                                          __global int* concaveHasSeparatingNormals,\n"
+"                                                          __global int4* clippingFacesOut,\n"
+"                                                          __global float4* worldVertsA1GPU,\n"
+"                                                          __global float4*  worldNormalsAGPU,\n"
+"                                                          __global float4* worldVertsB1GPU,\n"
+"                                                          __global float* dmins,\n"
+"                                                          int vertexFaceCapacity,\n"
+"                                                          int numConcavePairs\n"
+"                                                          )\n"
+"    \n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numConcavePairs)\n"
+"		return;\n"
+"    \n"
+"	if (!concaveHasSeparatingNormals[i])\n"
+"        return;\n"
+"    \n"
+"	int pairIdx = i;\n"
+"    \n"
+"	int bodyIndexA = concavePairs[i].x;\n"
+"	int bodyIndexB = concavePairs[i].y;\n"
+"    \n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"    \n"
+"	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"    \n"
+"    \n"
+"	int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
+"	int numActualConcaveConvexTests = 0;\n"
+"	\n"
+"	int f = concavePairs[i].z;\n"
+"	\n"
+"	bool overlap = false;\n"
+"	\n"
+"	ConvexPolyhedronCL convexPolyhedronA;\n"
+"    \n"
+"	//add 3 vertices of the triangle\n"
+"	convexPolyhedronA.m_numVertices = 3;\n"
+"	convexPolyhedronA.m_vertexOffset = 0;\n"
+"	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
+"    \n"
+"	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"	float4 triMinAabb, triMaxAabb;\n"
+"	btAabbCL triAabb;\n"
+"	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n"
+"	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n"
+"	\n"
+"	float4 verticesA[3];\n"
+"	for (int i=0;i<3;i++)\n"
+"	{\n"
+"		int index = indices[face.m_indexOffset+i];\n"
+"		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"		verticesA[i] = vert;\n"
+"		localCenter += vert;\n"
+"        \n"
+"		triAabb.m_min = min(triAabb.m_min,vert);\n"
+"		triAabb.m_max = max(triAabb.m_max,vert);\n"
+"        \n"
+"	}\n"
+"    \n"
+"	overlap = true;\n"
+"	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n"
+"	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n"
+"	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n"
+"    \n"
+"	if (overlap)\n"
+"	{\n"
+"		float dmin = dmins[i];\n"
+"		int hasSeparatingAxis=5;\n"
+"		float4 sepAxis=make_float4(1,2,3,4);\n"
+"        sepAxis = concaveSeparatingNormalsOut[pairIdx];\n"
+"        \n"
+"		int localCC=0;\n"
+"		numActualConcaveConvexTests++;\n"
+"        \n"
+"		//a triangle has 3 unique edges\n"
+"		convexPolyhedronA.m_numUniqueEdges = 3;\n"
+"		convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
+"		float4 uniqueEdgesA[3];\n"
+"		\n"
+"		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n"
+"		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
+"		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
+"        \n"
+"        \n"
+"		convexPolyhedronA.m_faceOffset = 0;\n"
+"        \n"
+"		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
+"        \n"
+"		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
+"		int indicesA[3+3+2+2+2];\n"
+"		int curUsedIndices=0;\n"
+"		int fidx=0;\n"
+"        \n"
+"		//front size of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[0] = 0;\n"
+"			indicesA[1] = 1;\n"
+"			indicesA[2] = 2;\n"
+"			curUsedIndices+=3;\n"
+"			float c = face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = normal.x;\n"
+"			facesA[fidx].m_plane.y = normal.y;\n"
+"			facesA[fidx].m_plane.z = normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		//back size of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[3]=2;\n"
+"			indicesA[4]=1;\n"
+"			indicesA[5]=0;\n"
+"			curUsedIndices+=3;\n"
+"			float c = dot(normal,verticesA[0]);\n"
+"			float c1 = -face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = -normal.x;\n"
+"			facesA[fidx].m_plane.y = -normal.y;\n"
+"			facesA[fidx].m_plane.z = -normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"        \n"
+"		bool addEdgePlanes = true;\n"
+"		if (addEdgePlanes)\n"
+"		{\n"
+"			int numVertices=3;\n"
+"			int prevVertex = numVertices-1;\n"
+"			for (int i=0;i<numVertices;i++)\n"
+"			{\n"
+"				float4 v0 = verticesA[i];\n"
+"				float4 v1 = verticesA[prevVertex];\n"
+"                \n"
+"				float4 edgeNormal = normalize(cross(normal,v1-v0));\n"
+"				float c = -dot(edgeNormal,v0);\n"
+"                \n"
+"				facesA[fidx].m_numIndices = 2;\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[curUsedIndices++]=i;\n"
+"				indicesA[curUsedIndices++]=prevVertex;\n"
+"                \n"
+"				facesA[fidx].m_plane.x = edgeNormal.x;\n"
+"				facesA[fidx].m_plane.y = edgeNormal.y;\n"
+"				facesA[fidx].m_plane.z = edgeNormal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				fidx++;\n"
+"				prevVertex = i;\n"
+"			}\n"
+"		}\n"
+"		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n"
+"		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n"
+"        \n"
+"        \n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"        \n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"        \n"
+"		\n"
+"        \n"
+"        \n"
+"		///////////////////\n"
+"		///compound shape support\n"
+"        \n"
+"		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"		{\n"
+"			int compoundChild = concavePairs[pairIdx].w;\n"
+"			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n"
+"			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"			shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"		}\n"
+"		//////////////////\n"
+"        \n"
+"		float4 c0local = convexPolyhedronA.m_localCenter;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n"
+"        \n"
+"        \n"
+"		{\n"
+"			bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],\n"
+"                                                              posA,ornA,\n"
+"                                                              posB,ornB,\n"
+"                                                              DeltaC2,\n"
+"                                                              verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"                                                              vertices,uniqueEdges,faces,indices,\n"
+"                                                              &sepAxis,&dmin);\n"
+"                \n"
+"			if (!sepEE)\n"
+"			{\n"
+"				hasSeparatingAxis = 0;\n"
+"			} else\n"
+"			{\n"
+"				hasSeparatingAxis = 1;\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		\n"
+"		if (hasSeparatingAxis)\n"
+"		{\n"
+"			sepAxis.w = dmin;\n"
+"            dmins[i] = dmin;\n"
+"			concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n"
+"			concaveHasSeparatingNormals[i]=1;\n"
+"           \n"
+" 	float minDist = -1e30f;\n"
+"			float maxDist = 0.02f;\n"
+"            \n"
+"            findClippingFaces(sepAxis,\n"
+"                              &convexPolyhedronA,\n"
+"                              &convexShapes[shapeIndexB],\n"
+"                              posA,ornA,\n"
+"                              posB,ornB,\n"
+"                              worldVertsA1GPU,\n"
+"                              worldNormalsAGPU,\n"
+"                              worldVertsB1GPU,\n"
+"                              vertexFaceCapacity,\n"
+"                              minDist, maxDist,\n"
+"                              verticesA,\n"
+"                              facesA,\n"
+"                              indicesA,\n"
+"                              vertices,\n"
+"                              faces,\n"
+"                              indices,\n"
+"                              clippingFacesOut, pairIdx);\n"
+"	           \n"
+"            \n"
+"		} else\n"
+"		{	\n"
+"			//mark this pair as in-active\n"
+"			concavePairs[pairIdx].w = -1;\n"
+"		}\n"
+"	}\n"
+"	else\n"
+"	{	\n"
+"		//mark this pair as in-active\n"
+"		concavePairs[pairIdx].w = -1;\n"
+"	}\n"
+"	\n"
+"	concavePairs[i].z = -1;//for the next stage, z is used to determine existing contact points\n"
diff --git a/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
new file mode 100644
index 00000000..6f8b0a90
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
@@ -0,0 +1,2104 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* satKernelsCL= \
+"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
+"//written by Erwin Coumans\n"
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define B3_MAX_STACK_DEPTH 256\n"
+"typedef unsigned int u32;\n"
+"///keep this in sync with btCollidable.h\n"
+"typedef struct\n"
+"	union {\n"
+"		int m_numChildShapes;\n"
+"		int m_bvhIndex;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float m_radius;\n"
+"		int	m_compoundBvhIndex;\n"
+"	};\n"
+"	\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"#define MAX_NUM_PARTS_IN_BITS 10\n"
+"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
+"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
+"typedef struct\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes\n"
+"	int	m_escapeIndexOrTriangleIndex;\n"
+"} b3QuantizedBvhNode;\n"
+"typedef struct\n"
+"	float4		m_aabbMin;\n"
+"	float4		m_aabbMax;\n"
+"	float4		m_quantization;\n"
+"	int			m_numNodes;\n"
+"	int			m_numSubTrees;\n"
+"	int			m_nodeOffset;\n"
+"	int			m_subTreeOffset;\n"
+"} b3BvhInfo;\n"
+"int	getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n"
+"	unsigned int x=0;\n"
+"	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
+"	// Get only the lower bits where the triangle index is stored\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
+"int	getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"	unsigned int x=0;\n"
+"	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
+"	// Get only the lower bits where the triangle index is stored\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
+"int isLeafNode(const b3QuantizedBvhNode* rootNode)\n"
+"	//skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
+"int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"	//skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
+"	\n"
+"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n"
+"	return -rootNode->m_escapeIndexOrTriangleIndex;\n"
+"int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"	return -rootNode->m_escapeIndexOrTriangleIndex;\n"
+"typedef struct\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes, points to the root of the subtree\n"
+"	int			m_rootNodeIndex;\n"
+"	//4 bytes\n"
+"	int			m_subtreeSize;\n"
+"	int			m_padding[3];\n"
+"} b3BvhSubtreeInfo;\n"
+"typedef struct\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"typedef struct\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_collidableIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"typedef struct  \n"
+"	float4		m_localCenter;\n"
+"	float4		m_extents;\n"
+"	float4		mC;\n"
+"	float4		mE;\n"
+"	\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"} ConvexPolyhedronCL;\n"
+"typedef struct \n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"#ifndef B3_AABB_H\n"
+"#define B3_AABB_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
+"	b3Quat q;\n"
+"	q=in;\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"typedef struct\n"
+"	b3Float4 m_row[3];\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n"
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"b3Mat3x3 mtZero();\n"
+"b3Mat3x3 mtIdentity();\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"b3Mat3x3 mtZero()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"b3Mat3x3 mtIdentity()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3Aabb b3Aabb_t;\n"
+"struct b3Aabb\n"
+"	union\n"
+"	{\n"
+"		float m_min[4];\n"
+"		b3Float4 m_minVec;\n"
+"		int m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float	m_max[4];\n"
+"		b3Float4 m_maxVec;\n"
+"		int m_signedMaxIndices[4];\n"
+"	};\n"
+"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n"
+"						b3Float4ConstArg pos,\n"
+"						b3QuatConstArg orn,\n"
+"						b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n"
+"		b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n"
+"		localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n"
+"		b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n"
+"		b3Mat3x3 m;\n"
+"		m = b3QuatGetRotationMatrix(orn);\n"
+"		b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n"
+"		b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n"
+"		\n"
+"		b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n"
+"										 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n"
+"										 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n"
+"										 0.f);\n"
+"		*aabbMinOut = center-extent;\n"
+"		*aabbMaxOut = center+extent;\n"
+"/// conservative test for overlap between two aabbs\n"
+"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n"
+"								b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n"
+"	bool overlap = true;\n"
+"	overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n"
+"	overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n"
+"	overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n"
+"	return overlap;\n"
+"#endif //B3_AABB_H\n"
+"Bullet Continuous Collision Detection and Physics Library\n"
+"Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose,\n"
+"including commercial applications, and to alter it and redistribute it freely,\n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"#ifndef B3_INT2_H\n"
+"#define B3_INT2_H\n"
+"#ifdef __cplusplus\n"
+"#define b3UnsignedInt2 uint2\n"
+"#define b3Int2 int2\n"
+"#define b3MakeInt2 (int2)\n"
+"#endif //__cplusplus\n"
+"typedef struct\n"
+"	float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"} btGpuFace;\n"
+"#define make_float4 (float4)\n"
+"float4 cross3(float4 a, float4 b)\n"
+"	return cross(a,b);\n"
+"	\n"
+"//	float4 a1 = make_float4(a.xyz,0.f);\n"
+"//	float4 b1 = make_float4(b.xyz,0.f);\n"
+"//	return cross(a1,b1);\n"
+"//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n"
+"	\n"
+"	//	float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n"
+"	\n"
+"	//return c;\n"
+"float dot3F4(float4 a, float4 b)\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"float4 fastNormalize4(float4 v)\n"
+"	v = make_float4(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"//	Quaternion\n"
+"typedef float4 Quaternion;\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"	return fastNormalize4(in);\n"
+"//	in /= length( in );\n"
+"//	return in;\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"	return qtRotate( *orientation, *p ) + (*translation);\n"
+"float4 normalize3(const float4 a)\n"
+"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, const float4* vertices, float* min, float* max)\n"
+"	min[0] = FLT_MAX;\n"
+"	max[0] = -FLT_MAX;\n"
+"	int numVerts = hull->m_numVertices;\n"
+"	const float4 localDir = qtInvRotate(orn,*dir);\n"
+"	float offset = dot(pos,*dir);\n"
+"	for(int i=0;i<numVerts;i++)\n"
+"	{\n"
+"		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n"
+"		if(dp < min[0])	\n"
+"			min[0] = dp;\n"
+"		if(dp > max[0])	\n"
+"			max[0] = dp;\n"
+"	}\n"
+"	if(min[0]>max[0])\n"
+"	{\n"
+"		float tmp = min[0];\n"
+"		min[0] = max[0];\n"
+"		max[0] = tmp;\n"
+"	}\n"
+"	min[0] += offset;\n"
+"	max[0] += offset;\n"
+"inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, __global const float4* vertices, float* min, float* max)\n"
+"	min[0] = FLT_MAX;\n"
+"	max[0] = -FLT_MAX;\n"
+"	int numVerts = hull->m_numVertices;\n"
+"	const float4 localDir = qtInvRotate(orn,*dir);\n"
+"	float offset = dot(pos,*dir);\n"
+"	for(int i=0;i<numVerts;i++)\n"
+"	{\n"
+"		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n"
+"		if(dp < min[0])	\n"
+"			min[0] = dp;\n"
+"		if(dp > max[0])	\n"
+"			max[0] = dp;\n"
+"	}\n"
+"	if(min[0]>max[0])\n"
+"	{\n"
+"		float tmp = min[0];\n"
+"		min[0] = max[0];\n"
+"		max[0] = tmp;\n"
+"	}\n"
+"	min[0] += offset;\n"
+"	max[0] += offset;\n"
+"inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA,const float4 ornA,\n"
+"	const float4 posB,const float4 ornB,\n"
+"	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n"
+"	float Min0,Max0;\n"
+"	float Min1,Max1;\n"
+"	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);\n"
+"	project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);\n"
+"	if(Max0<Min1 || Max1<Min0)\n"
+"		return false;\n"
+"	float d0 = Max0 - Min1;\n"
+"	float d1 = Max1 - Min0;\n"
+"	*depth = d0<d1 ? d0:d1;\n"
+"	return true;\n"
+"inline bool IsAlmostZero(const float4 v)\n"
+"	if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f)\n"
+"		return false;\n"
+"	return true;\n"
+"bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	\n"
+"	const float4* verticesA, \n"
+"	const float4* uniqueEdgesA, \n"
+"	const btGpuFace* facesA,\n"
+"	const int*  indicesA,\n"
+"	__global const float4* verticesB, \n"
+"	__global const float4* uniqueEdgesB, \n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int*  indicesB,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"	\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n"
+"	{\n"
+"		int numFacesA = hullA->m_numFaces;\n"
+"		// Test normals from hullA\n"
+"		for(int i=0;i<numFacesA;i++)\n"
+"		{\n"
+"			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n"
+"			float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"			if (dot3F4(DeltaC2,faceANormalWS)<0)\n"
+"				faceANormalWS*=-1.f;\n"
+"			curPlaneTests++;\n"
+"			float d;\n"
+"			if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n"
+"				return false;\n"
+"			if(d<*dmin)\n"
+"			{\n"
+"				*dmin = d;\n"
+"				*sep = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, \n" // Face pass: tries each face normal of hullA as an axis; returns false as soon as TestSepAxisLocalA returns false, true otherwise.
+"	const float4 posA1,\n" // position of hull A; copied below so its w can be zeroed
+"	const float4 ornA,\n" // orientation of hull A as consumed by qtRotate
+"	const float4 posB1,\n" // position of hull B; copied below so its w can be zeroed
+"	const float4 ornB,\n" // orientation of hull B
+"	const float4 DeltaC2,\n" // reference direction used to orient each candidate axis and the final *sep
+"	__global const float4* verticesA, \n" // vertex buffer for hullA (forwarded to the helper)
+"	__global const float4* uniqueEdgesA, \n" // not referenced in this function
+"	__global const btGpuFace* facesA,\n" // face buffer for hullA, indexed from hullA->m_faceOffset
+"	__global const int*  indicesA,\n" // not referenced in this function
+"	const float4* verticesB,\n" // vertex buffer for hullB (forwarded to the helper)
+"	const float4* uniqueEdgesB, \n" // not referenced in this function
+"	const btGpuFace* facesB,\n" // not referenced in this function
+"	const int*  indicesB,\n" // not referenced in this function
+"	float4* sep,\n" // in/out: replaced by the axis with the smallest depth found here
+"	float* dmin)\n" // in/out: running minimum depth; only ever lowered
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n" // incremented below but never read
+"	{\n"
+"		int numFacesA = hullA->m_numFaces;\n"
+"		// Test normals from hullA\n"
+"		for(int i=0;i<numFacesA;i++)\n"
+"		{\n"
+"			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" // the face's m_plane value is used as the hull-local normal
+"			float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"			if (dot3F4(DeltaC2,faceANormalWS)<0)\n" // negate the axis when its dot3F4 with DeltaC2 is negative
+"				faceANormalWS *= -1.f;\n"
+"			curPlaneTests++;\n"
+"			float d;\n" // presumably written by the helper when it returns true - confirm
+"			if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))\n" // note the swapped order: hullB and its data are passed first
+"				return false;\n" // helper rejected this axis: stop immediately
+"			if(d<*dmin)\n" // keep the axis with the smallest depth so far
+"			{\n"
+"				*dmin = d;\n"
+"				*sep = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n" // final orientation: negate *sep when its dot3F4 with -DeltaC2 is positive
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" // Edge-edge pass: tests the normalized cross product of every (edge of A, edge of B) pair as an axis; returns false on the first axis whose projection intervals are disjoint, true otherwise.
+"	const float4 posA1,\n" // position of hull A; copied below so its w can be zeroed
+"	const float4 ornA,\n" // orientation of hull A as consumed by qtRotate
+"	const float4 posB1,\n" // position of hull B; copied below so its w can be zeroed
+"	const float4 ornB,\n" // orientation of hull B
+"	const float4 DeltaC2,\n" // reference direction used to orient each candidate axis and the final *sep
+"	const float4* verticesA, \n" // vertex buffer for hullA (read through projectLocal)
+"	const float4* uniqueEdgesA, \n" // edge directions of hullA, indexed from hullA->m_uniqueEdgesOffset
+"	const btGpuFace* facesA,\n" // not referenced in this function
+"	const int*  indicesA,\n" // not referenced in this function
+"	__global const float4* verticesB, \n" // vertex buffer for hullB (read through project)
+"	__global const float4* uniqueEdgesB, \n" // edge directions of hullB, indexed from hullB->m_uniqueEdgesOffset
+"	__global const btGpuFace* facesB,\n" // not referenced in this function
+"	__global const int*  indicesB,\n" // not referenced in this function
+"		float4* sep,\n" // in/out: replaced by the axis with the smallest depth found here
+"	float* dmin)\n" // in/out: running minimum depth; only ever lowered
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n" // never read
+"	int curEdgeEdge = 0;\n" // incremented below but never read
+"	// Test edges\n"
+"	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n"
+"	{\n"
+"		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n"
+"		float4 edge0World = qtRotate(ornA,edge0);\n"
+"		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n"
+"		{\n"
+"			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n"
+"			float4 edge1World = qtRotate(ornB,edge1);\n"
+"			float4 crossje = cross3(edge0World,edge1World);\n"
+"			curEdgeEdge++;\n"
+"			if(!IsAlmostZero(crossje))\n" // skip edge pairs whose cross product IsAlmostZero rejects
+"			{\n"
+"				crossje = normalize3(crossje);\n"
+"				if (dot3F4(DeltaC2,crossje)<0)\n" // negate the axis when its dot3F4 with DeltaC2 is negative
+"					crossje *= -1.f;\n"
+"				float dist;\n"
+"				{\n"
+"					float Min0,Max0;\n"
+"					float Min1,Max1;\n"
+"					projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);\n"
+"					project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);\n"
+"				\n"
+"					if(Max0<Min1 || Max1<Min0)\n" // projection intervals are disjoint on this axis
+"						return false;\n" // report the separating axis, the same early exit findSeparatingAxisEdgeEdge takes; a flag set here would be ignored
+"				\n"
+"					float d0 = Max0 - Min1;\n"
+"					float d1 = Max1 - Min0;\n"
+"					dist = d0<d1 ? d0:d1;\n" // smaller of the two interval overlaps
+"				}\n"
+"				\n"
+"				if(dist<*dmin)\n" // keep the axis with the smallest depth so far
+"				{\n"
+"					*dmin = dist;\n"
+"					*sep = crossje;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n" // final orientation: negate *sep when its dot3F4 with -DeltaC2 is positive
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" // Projects both hulls onto *sep_axis; returns false when the two [Min,Max] intervals are disjoint, otherwise stores the smaller overlap in *depth and returns true.
+"	const float4 posA,const float4 ornA,\n" // transform of hull A, forwarded to project
+"	const float4 posB,const float4 ornB,\n" // transform of hull B, forwarded to project
+"	float4* sep_axis, __global const float4* vertices,float* depth)\n" // both hulls read from the same vertices buffer; *depth is written only on the true path
+"	float Min0,Max0;\n"
+"	float Min1,Max1;\n"
+"	project(hullA,posA,ornA,sep_axis,vertices, &Min0, &Max0);\n" // project is defined outside this view; presumably fills Min0/Max0 - confirm
+"	project(hullB,posB,ornB, sep_axis,vertices, &Min1, &Max1);\n"
+"	if(Max0<Min1 || Max1<Min0)\n" // intervals are disjoint
+"		return false;\n"
+"	float d0 = Max0 - Min1;\n"
+"	float d1 = Max1 - Min0;\n"
+"	*depth = d0<d1 ? d0:d1;\n" // smaller of d0 and d1
+"	return true;\n"
+"bool findSeparatingAxis(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" // Face pass: tries each face normal of hullA as an axis via TestSepAxis; returns false as soon as one test returns false, true otherwise.
+"	const float4 posA1,\n" // position of hull A; copied below so its w can be zeroed
+"	const float4 ornA,\n" // orientation of hull A as consumed by qtRotate
+"	const float4 posB1,\n" // position of hull B; copied below so its w can be zeroed
+"	const float4 ornB,\n" // orientation of hull B
+"	const float4 DeltaC2,\n" // reference direction used to orient each candidate axis and the final *sep
+"	__global const float4* vertices, \n" // shared vertex buffer, forwarded to TestSepAxis
+"	__global const float4* uniqueEdges, \n" // not referenced in this function
+"	__global const btGpuFace* faces,\n" // shared face buffer, indexed from hullA->m_faceOffset
+"	__global const int*  indices,\n" // not referenced in this function
+"	float4* sep,\n" // in/out: replaced by the axis with the smallest depth found here; read at the end even if never written here
+"	float* dmin)\n" // in/out: running minimum depth; only ever lowered
+"	\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	\n"
+"	int curPlaneTests=0;\n" // incremented below but never read
+"	{\n"
+"		int numFacesA = hullA->m_numFaces;\n"
+"		// Test normals from hullA\n"
+"		for(int i=0;i<numFacesA;i++)\n"
+"		{\n"
+"			const float4 normal = faces[hullA->m_faceOffset+i].m_plane;\n" // the face's m_plane value is used as the hull-local normal
+"			float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"	\n"
+"			if (dot3F4(DeltaC2,faceANormalWS)<0)\n" // negate the axis when its dot3F4 with DeltaC2 is negative
+"				faceANormalWS*=-1.f;\n"
+"				\n"
+"			curPlaneTests++;\n"
+"	\n"
+"			float d;\n" // written by TestSepAxis on its true path
+"			if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d))\n"
+"				return false;\n" // intervals disjoint on this axis: stop immediately
+"	\n"
+"			if(d<*dmin)\n" // keep the axis with the smallest depth so far
+"			{\n"
+"				*dmin = d;\n"
+"				*sep = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"		if((dot3F4(-DeltaC2,*sep))>0.0f)\n" // final orientation: negate *sep when its dot3F4 with -DeltaC2 is positive
+"		{\n"
+"			*sep = -(*sep);\n"
+"		}\n"
+"	\n"
+"	return true;\n"
+"bool findSeparatingAxisUnitSphere(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" // Tests each of the numUnitSphereDirections supplied axes; returns false on the first axis whose projection intervals are disjoint, true otherwise.
+"	const float4 posA1,\n" // position of hull A; copied below so its w can be zeroed
+"	const float4 ornA,\n" // orientation of hull A, forwarded to project
+"	const float4 posB1,\n" // position of hull B; copied below so its w can be zeroed
+"	const float4 ornB,\n" // orientation of hull B, forwarded to project
+"	const float4 DeltaC2,\n" // reference direction used to orient each candidate axis and the final *sep
+"	__global const float4* vertices,\n" // shared vertex buffer for both hulls
+"	__global const float4* unitSphereDirections,\n" // candidate axes, used as given (no rotation or normalization is applied here)
+"	int numUnitSphereDirections,\n" // number of entries read from unitSphereDirections
+"	float4* sep,\n" // in/out: replaced by the axis with the smallest depth found here
+"	float* dmin)\n" // in/out: running minimum depth; only ever lowered
+"	\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n" // never read
+"	int curEdgeEdge = 0;\n" // never read
+"	// Test unit sphere directions\n"
+"	for (int i=0;i<numUnitSphereDirections;i++)\n"
+"	{\n"
+"		float4 crossje;\n"
+"		crossje = unitSphereDirections[i];	\n"
+"		if (dot3F4(DeltaC2,crossje)>0)\n" // note: negates when the dot is positive, the opposite sign test from the face and edge passes in this file
+"			crossje *= -1.f;\n"
+"		{\n"
+"			float dist;\n"
+"			bool result = true;\n" // assigned but never read
+"			float Min0,Max0;\n"
+"			float Min1,Max1;\n"
+"			project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n"
+"			project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n"
+"		\n"
+"			if(Max0<Min1 || Max1<Min0)\n" // projection intervals are disjoint on this axis
+"				return false;\n"
+"		\n"
+"			float d0 = Max0 - Min1;\n"
+"			float d1 = Max1 - Min0;\n"
+"			dist = d0<d1 ? d0:d1;\n" // smaller of the two interval overlaps
+"			result = true;\n"
+"	\n"
+"			if(dist<*dmin)\n" // keep the axis with the smallest depth so far
+"			{\n"
+"				*dmin = dist;\n"
+"				*sep = crossje;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n" // final orientation: negate *sep when its dot3F4 with -DeltaC2 is positive
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"bool findSeparatingAxisEdgeEdge(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" // Edge-edge pass: tests the normalized cross product of every (edge of A, edge of B) pair as an axis; returns false on the first axis whose projection intervals are disjoint, true otherwise.
+"	const float4 posA1,\n" // position of hull A; copied below so its w can be zeroed
+"	const float4 ornA,\n" // orientation of hull A as consumed by qtRotate
+"	const float4 posB1,\n" // position of hull B; copied below so its w can be zeroed
+"	const float4 ornB,\n" // orientation of hull B
+"	const float4 DeltaC2,\n" // reference direction used to orient each candidate axis and the final *sep
+"	__global const float4* vertices, \n" // shared vertex buffer for both hulls
+"	__global const float4* uniqueEdges, \n" // shared edge buffer, indexed from each hull's m_uniqueEdgesOffset
+"	__global const btGpuFace* faces,\n" // not referenced in this function
+"	__global const int*  indices,\n" // not referenced in this function
+"	float4* sep,\n" // in/out: replaced by the axis with the smallest depth found here
+"	float* dmin)\n" // in/out: running minimum depth; only ever lowered
+"	\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n" // never read
+"	int curEdgeEdge = 0;\n" // incremented below but never read
+"	// Test edges\n"
+"	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n"
+"	{\n"
+"		const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];\n"
+"		float4 edge0World = qtRotate(ornA,edge0);\n"
+"		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n"
+"		{\n"
+"			const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];\n"
+"			float4 edge1World = qtRotate(ornB,edge1);\n"
+"			float4 crossje = cross3(edge0World,edge1World);\n"
+"			curEdgeEdge++;\n"
+"			if(!IsAlmostZero(crossje))\n" // skip edge pairs whose cross product IsAlmostZero rejects
+"			{\n"
+"				crossje = normalize3(crossje);\n"
+"				if (dot3F4(DeltaC2,crossje)<0)\n" // negate the axis when its dot3F4 with DeltaC2 is negative
+"					crossje*=-1.f;\n"
+"					\n"
+"				float dist;\n"
+"				bool result = true;\n" // assigned but never read
+"				{\n"
+"					float Min0,Max0;\n"
+"					float Min1,Max1;\n"
+"					project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n"
+"					project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n"
+"				\n"
+"					if(Max0<Min1 || Max1<Min0)\n" // projection intervals are disjoint on this axis
+"						return false;\n"
+"				\n"
+"					float d0 = Max0 - Min1;\n"
+"					float d1 = Max1 - Min0;\n"
+"					dist = d0<d1 ? d0:d1;\n" // smaller of the two interval overlaps
+"					result = true;\n"
+"				}\n"
+"				\n"
+"				if(dist<*dmin)\n" // keep the axis with the smallest depth so far
+"				{\n"
+"					*dmin = dist;\n"
+"					*sep = crossje;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n" // final orientation: negate *sep when its dot3F4 with -DeltaC2 is positive
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"// work-in-progress\n"
+"__kernel void   processCompoundPairsKernel( __global const int4* gpuCompoundPairs,\n" // One work-item per compound pair: resolves the optional child transform on each side, then runs the face pass A-vs-B, the face pass B-vs-A and the edge-edge pass; the normal and a flag of 1 are written only when all three return true. Pair layout: x,y = body indices, z,w = child-shape indices (negative = use the body's own collidable).
+"																					__global const BodyData* rigidBodies, \n" // read for m_pos, m_quat and m_collidableIdx
+"																					__global const btCollidableGpu* collidables,\n" // read for m_shapeIndex and m_shapeType
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const btGpuFace* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global btAabbCL* aabbs,\n" // not referenced in this kernel
+"																					__global const btGpuChildShape* gpuChildShapes,\n" // read for m_shapeIndex, m_childPosition and m_childOrientation
+"																					__global volatile float4* gpuCompoundSepNormalsOut,\n" // out: entry i is written only when all three passes return true
+"																					__global volatile int* gpuHasCompoundSepNormalsOut,\n" // out: entry i is set to 0 first, then to 1 when all three passes return true
+"																					int numCompoundPairs\n" // work-items with a global id at or beyond this value do nothing
+"																					)\n"
+"	int i = get_global_id(0);\n"
+"	if (i<numCompoundPairs)\n"
+"	{\n"
+"		int bodyIndexA = gpuCompoundPairs[i].x;\n"
+"		int bodyIndexB = gpuCompoundPairs[i].y;\n"
+"		int childShapeIndexA = gpuCompoundPairs[i].z;\n"
+"		int childShapeIndexB = gpuCompoundPairs[i].w;\n"
+"		\n"
+"		int collidableIndexA = -1;\n"
+"		int collidableIndexB = -1;\n"
+"		\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		\n"
+"		float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"							\n"
+"		if (childShapeIndexA >= 0)\n" // side A refers to a child shape: compose the body transform with the child transform
+"		{\n"
+"			collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"
+"			float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
+"			float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
+"			float4 newPosA = qtRotate(ornA,childPosA)+posA;\n"
+"			float4 newOrnA = qtMul(ornA,childOrnA);\n"
+"			posA = newPosA;\n"
+"			ornA = newOrnA;\n"
+"		} else\n"
+"		{\n"
+"			collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		}\n"
+"		\n"
+"		if (childShapeIndexB>=0)\n" // same resolution for side B
+"		{\n"
+"			collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n" // NOTE(review): presumably equivalent to the qtRotate(...)+pos form used for side A - confirm
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"		} else\n"
+"		{\n"
+"			collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	\n"
+"		}\n"
+"	\n"
+"		gpuHasCompoundSepNormalsOut[i] = 0;\n" // cleared before the shape-type check, so the early return below leaves 0
+"	\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"	\n"
+"		int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n"
+"		int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
+"	\n"
+"		if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL))\n" // only hull-vs-hull pairs are processed
+"		{\n"
+"			return;\n"
+"		}\n"
+"		int hasSeparatingAxis = 5;\n" // local is assigned (5, 4, 0) but never read
+"							\n"
+"		int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" // never read
+"		float dmin = FLT_MAX;\n" // running minimum depth shared by the three passes
+"		posA.w = 0.f;\n"
+"		posB.w = 0.f;\n"
+"		float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n" // difference of the two transformed local centers
+"		float4 sepNormal = make_float4(1,0,0,0);\n" // defined starting value in case no pass ever writes it
+"		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" // face normals of A
+"		hasSeparatingAxis = 4;\n"
+"		if (!sepA)\n"
+"		{\n"
+"			hasSeparatingAxis = 0;\n"
+"		} else\n"
+"		{\n"
+"			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" // face normals of B: hulls and transforms swapped, DeltaC2 passed unchanged
+"			if (!sepB)\n"
+"			{\n"
+"				hasSeparatingAxis = 0;\n"
+"			} else//(!sepB)\n"
+"			{\n"
+"				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" // edge-edge axes
+"				if (sepEE)\n"
+"				{\n"
+"						gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);\n"
+"						gpuHasCompoundSepNormalsOut[i] = 1;\n" // all three passes returned true
+"				}//sepEE\n"
+"			}//(!sepB)\n"
+"		}//(!sepA)\n"
+"		\n"
+"		\n"
+"	}\n"
+"		\n"
+"inline b3Float4 MyUnQuantize(const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)\n" // Converts vecIn[0..2] to floats, divides each by the matching component of quantization, then adds bvhAabbMin.
+"		b3Float4 vecOut;\n"
+"		vecOut = b3MakeFloat4(\n"
+"			(float)(vecIn[0]) / (quantization.x),\n"
+"			(float)(vecIn[1]) / (quantization.y),\n"
+"			(float)(vecIn[2]) / (quantization.z),\n"
+"			0.f);\n" // fourth component starts at 0.f
+"		vecOut += bvhAabbMin;\n" // adds the whole bvhAabbMin vector, so its fourth component is included too
+"		return vecOut;\n"
+"inline b3Float4 MyUnQuantizeGlobal(__global const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)\n" // Same arithmetic as MyUnQuantize, but vecIn is a __global pointer.
+"		b3Float4 vecOut;\n"
+"		vecOut = b3MakeFloat4(\n"
+"			(float)(vecIn[0]) / (quantization.x),\n"
+"			(float)(vecIn[1]) / (quantization.y),\n"
+"			(float)(vecIn[2]) / (quantization.z),\n"
+"			0.f);\n" // fourth component starts at 0.f
+"		vecOut += bvhAabbMin;\n" // adds the whole bvhAabbMin vector, so its fourth component is included too
+"		return vecOut;\n"
+"// work-in-progress\n"
+"__kernel void   findCompoundPairsKernel( __global const int4* pairs, \n" // One work-item per input pair: when at least one body is a SHAPE_COMPOUND_OF_CONVEX_HULLS, appends child-level pairs (bodyA, bodyB, childA, childB) to gpuCompoundPairsOut; a child index of -1 marks a side that is not a compound child.
+"	__global const BodyData* rigidBodies, \n" // read for m_collidableIdx, m_invMass, m_pos and m_quat
+"	__global const btCollidableGpu* collidables,\n" // read for m_shapeIndex, m_shapeType, m_compoundBvhIndex and m_numChildShapes
+"	__global const ConvexPolyhedronCL* convexShapes, \n" // only read for values that are not used afterwards (see notes below)
+"	__global const float4* vertices,\n" // not referenced in this kernel
+"	__global const float4* uniqueEdges,\n" // not referenced in this kernel
+"	__global const btGpuFace* faces,\n" // not referenced in this kernel
+"	__global const int* indices,\n" // not referenced in this kernel
+"	__global b3Aabb_t* aabbLocalSpace,\n" // local-space AABB per shape, indexed by the collidable's m_shapeIndex
+"	__global const btGpuChildShape* gpuChildShapes,\n" // read for m_shapeIndex, m_childPosition and m_childOrientation
+"	__global volatile int4* gpuCompoundPairsOut,\n" // out: appended child-level pairs
+"	__global volatile int* numCompoundPairsOut,\n" // in/out: append counter, advanced with atomic_inc
+"	__global const b3BvhSubtreeInfo* subtrees,\n"
+"	__global const b3QuantizedBvhNode* quantizedNodes,\n"
+"	__global const b3BvhInfo* bvhInfos,\n" // per-BVH quantization, AABB minimum, node offset and subtree range
+"	int numPairs,\n" // work-items with a global id at or beyond this value do nothing
+"	int maxNumCompoundPairsCapacity\n" // writes to gpuCompoundPairsOut are skipped for slots at or beyond this value
+"	)\n"
+"	int i = get_global_id(0);\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		//once the broadphase avoids static-static pairs, we can remove this test\n"
+"		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" // both bodies have zero inverse mass: nothing to do
+"		{\n"
+"			return;\n"
+"		}\n"
+"		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" // case 1: both sides are compounds, so walk the two BVHs against each other
+"		{\n"
+"			int bvhA = collidables[collidableIndexA].m_compoundBvhIndex;\n"
+"			int bvhB = collidables[collidableIndexB].m_compoundBvhIndex;\n"
+"			int numSubTreesA = bvhInfos[bvhA].m_numSubTrees;\n"
+"			int subTreesOffsetA = bvhInfos[bvhA].m_subTreeOffset;\n"
+"			int subTreesOffsetB = bvhInfos[bvhB].m_subTreeOffset;\n"
+"			int numSubTreesB = bvhInfos[bvhB].m_numSubTrees;\n"
+"			\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			b3Quat ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			b3Quat ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			\n"
+"			for (int p=0;p<numSubTreesA;p++)\n" // every subtree of A against every subtree of B
+"			{\n"
+"				b3BvhSubtreeInfo subtreeA = subtrees[subTreesOffsetA+p];\n"
+"				//bvhInfos[bvhA].m_quantization\n"
+"				b3Float4 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n"
+"				b3Float4 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n"
+"				b3Float4 aabbAMinOut,aabbAMaxOut;\n"
+"				float margin=0.f;\n"
+"				b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);\n" // subtree bounds of A placed with body A's transform
+"				\n"
+"				for (int q=0;q<numSubTreesB;q++)\n"
+"				{\n"
+"					b3BvhSubtreeInfo subtreeB = subtrees[subTreesOffsetB+q];\n"
+"					b3Float4 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n"
+"					b3Float4 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n"
+"					b3Float4 aabbBMinOut,aabbBMaxOut;\n"
+"					float margin=0.f;\n"
+"					b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);\n"
+"					\n"
+"					\n"
+"					bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);\n"
+"					if (aabbOverlap)\n" // only overlapping subtree pairs are traversed
+"					{\n"
+"						\n"
+"						int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfos[bvhA].m_nodeOffset;\n"
+"						int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize;\n" // never read
+"						int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfos[bvhB].m_nodeOffset;\n"
+"						int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize;\n" // never read
+"						b3Int2 nodeStack[B3_MAX_STACK_DEPTH];\n" // explicit stack of (node in A, node in B) pairs still to visit
+"						b3Int2 node0;\n"
+"						node0.x = startNodeIndexA;\n"
+"						node0.y = startNodeIndexB;\n"
+"						int maxStackDepth = B3_MAX_STACK_DEPTH;\n"
+"						int depth=0;\n"
+"						nodeStack[depth++]=node0;\n" // seed with the two subtree roots
+"						do\n"
+"						{\n"
+"							b3Int2 node = nodeStack[--depth];\n" // pop the most recently pushed pair
+"							b3Float4 aMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n"
+"							b3Float4 aMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n"
+"							b3Float4 bMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n"
+"							b3Float4 bMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n"
+"							float margin=0.f;\n"
+"							b3Float4 aabbAMinOut,aabbAMaxOut;\n"
+"							b3TransformAabb2(aMinLocal,aMaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);\n"
+"							b3Float4 aabbBMinOut,aabbBMaxOut;\n"
+"							b3TransformAabb2(bMinLocal,bMaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);\n"
+"							\n"
+"							bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);\n"
+"							if (nodeOverlap)\n"
+"							{\n"
+"								bool isLeafA = isLeafNodeGlobal(&quantizedNodes[node.x]);\n"
+"								bool isLeafB = isLeafNodeGlobal(&quantizedNodes[node.y]);\n"
+"								bool isInternalA = !isLeafA;\n"
+"								bool isInternalB = !isLeafB;\n"
+"								//fail, even though it might hit two leaf nodes\n"
+"								if (depth+4>maxStackDepth && !(isLeafA && isLeafB))\n" // no room for the up to four pushes below: this node pair is dropped without any report
+"								{\n"
+"									//printf(\"Error: traversal exceeded maxStackDepth\");\n"
+"									continue;\n" // jumps to the while(depth) test
+"								}\n"
+"								if(isInternalA)\n"
+"								{\n"
+"									int nodeAleftChild = node.x+1;\n" // the left child is stored directly after its parent
+"									bool isNodeALeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.x+1]);\n"
+"									int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + getEscapeIndexGlobal(&quantizedNodes[node.x+1]);\n" // right child follows the left child: one node later if the left child is a leaf, otherwise the left child's escape index later
+"									if(isInternalB)\n"
+"									{					\n"
+"										int nodeBleftChild = node.y+1;\n"
+"										bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);\n"
+"										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);\n"
+"										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild);\n" // both internal: push all four child combinations
+"										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild);\n"
+"										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild);\n"
+"										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild);\n"
+"									}\n"
+"									else\n"
+"									{\n"
+"										nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y);\n" // only A is internal: descend A, keep B's leaf
+"										nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y);\n"
+"									}\n"
+"								}\n"
+"								else\n"
+"								{\n"
+"									if(isInternalB)\n"
+"									{\n"
+"										int nodeBleftChild = node.y+1;\n"
+"										bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);\n"
+"										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);\n"
+"										nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild);\n" // only B is internal: descend B, keep A's leaf
+"										nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild);\n"
+"									}\n"
+"									else\n"
+"									{\n"
+"										int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" // two overlapping leaves: reserve an output slot; the counter advances even when the slot is beyond capacity
+"										if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"										{\n"
+"											int childShapeIndexA = getTriangleIndexGlobal(&quantizedNodes[node.x]);\n" // the leaf's stored index is used as the child-shape index
+"											int childShapeIndexB = getTriangleIndexGlobal(&quantizedNodes[node.y]);\n"
+"											gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n"
+"										}\n"
+"									}\n"
+"								}\n"
+"							}\n"
+"						} while (depth);\n" // until the stack is empty
+"					}\n"
+"				}\n"
+"			}\n"
+"			\n"
+"			return;\n"
+"		}\n"
+"		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" // case 2: exactly one side is a compound here, since case 1 returned above
+"		{\n"
+"			if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) \n"
+"			{\n"
+"				int numChildrenA = collidables[collidableIndexA].m_numChildShapes;\n"
+"				for (int c=0;c<numChildrenA;c++)\n"
+"				{\n"
+"					int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;\n" // children of A are consecutive entries starting at the compound's m_shapeIndex
+"					int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"
+"					float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"					float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"					float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
+"					float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
+"					float4 newPosA = qtRotate(ornA,childPosA)+posA;\n"
+"					float4 newOrnA = qtMul(ornA,childOrnA);\n"
+"					int shapeIndexA = collidables[childColIndexA].m_shapeIndex;\n" // shadows the outer shapeIndexA for the rest of this loop body
+"					b3Aabb_t aabbAlocal = aabbLocalSpace[shapeIndexA];\n"
+"					float margin = 0.f;\n"
+"					\n"
+"					b3Float4 aabbAMinWS;\n"
+"					b3Float4 aabbAMaxWS;\n"
+"					\n"
+"					b3TransformAabb2(aabbAlocal.m_minVec,aabbAlocal.m_maxVec,margin,\n"
+"						newPosA,\n"
+"						newOrnA,\n"
+"						&aabbAMinWS,&aabbAMaxWS);\n"
+"						\n"
+"					\n"
+"					if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" // NOTE(review): appears unreachable, because the both-compound case returned earlier - confirm
+"					{\n"
+"						int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
+"						for (int b=0;b<numChildrenB;b++)\n"
+"						{\n"
+"							int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
+"							int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"							float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"							float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"							float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"							float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"							float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"							float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"							int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"							b3Aabb_t aabbBlocal = aabbLocalSpace[shapeIndexB];\n"
+"							\n"
+"							b3Float4 aabbBMinWS;\n"
+"							b3Float4 aabbBMaxWS;\n"
+"							\n"
+"							b3TransformAabb2(aabbBlocal.m_minVec,aabbBlocal.m_maxVec,margin,\n"
+"								newPosB,\n"
+"								newOrnB,\n"
+"								&aabbBMinWS,&aabbBMaxWS);\n"
+"								\n"
+"								\n"
+"							\n"
+"							bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinWS,aabbAMaxWS,aabbBMinWS,aabbBMaxWS);\n"
+"							if (aabbOverlap)\n"
+"							{\n"
+"								int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" // the locals from here through DeltaC2 are computed but not used afterwards
+"								float dmin = FLT_MAX;\n"
+"								float4 posA = newPosA;\n"
+"								posA.w = 0.f;\n"
+"								float4 posB = newPosB;\n"
+"								posB.w = 0.f;\n"
+"								float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"								float4 ornA = newOrnA;\n"
+"								float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"								float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"								float4 ornB =newOrnB;\n"
+"								float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"								const float4 DeltaC2 = c0 - c1;\n"
+"								{//\n"
+"									int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
+"									if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"									{\n"
+"										gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n"
+"									}\n"
+"								}//\n"
+"							}//fi(1)\n"
+"						} //for (int b=0\n"
+"					}//if (collidables[collidableIndexB].\n"
+"					else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"					{\n"
+"						if (1)\n" // B is not a compound: one pair is emitted per child of A, with no AABB test in this branch
+"						{\n"
+"							int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" // the locals from here through DeltaC2 are computed but not used afterwards
+"							float dmin = FLT_MAX;\n"
+"							float4 posA = newPosA;\n"
+"							posA.w = 0.f;\n"
+"							float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"							posB.w = 0.f;\n"
+"							float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"							float4 ornA = newOrnA;\n"
+"							float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"							float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"							float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"							float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"							const float4 DeltaC2 = c0 - c1;\n"
+"							{\n"
+"								int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
+"								if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"								{\n"
+"									gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1);\n" // -1: side B is the body's own shape, not a child
+"								}//if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"							}//\n"
+"						}//fi (1)\n"
+"					}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"				}//for (int b=0;b<numChildrenB;b++)	\n"
+"				return;\n"
+"			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"			if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) \n" // only B is a compound, and A must not be a concave trimesh
+"				&& (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n"
+"			{\n"
+"				int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
+"				for (int b=0;b<numChildrenB;b++)\n"
+"				{\n"
+"					int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
+"					int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"					float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"					float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"					float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"					float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"					float4 newPosB = qtRotate(ornB,childPosB)+posB;\n"
+"					float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"					int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"					//////////////////////////////////////\n"
+"					if (1)\n" // one pair is emitted per child of B, with no AABB test in this branch
+"					{\n"
+"						int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" // shapeIndexA is the outer value here; the locals through DeltaC2 are computed but not used afterwards
+"						float dmin = FLT_MAX;\n"
+"						float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"						posA.w = 0.f;\n"
+"						float4 posB = newPosB;\n"
+"						posB.w = 0.f;\n"
+"						float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"						float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"						float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"						float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"						float4 ornB =newOrnB;\n"
+"						float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"						const float4 DeltaC2 = c0 - c1;\n"
+"						{//\n"
+"							int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
+"							if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"							{\n"
+"								gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB);\n" // -1: side A is the body's own shape, not a child
+"							}//fi (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"						}//\n"
+"					}//fi (1)	\n"
+"				}//for (int b=0;b<numChildrenB;b++)\n"
+"				return;\n"
+"			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"			return;\n"
+"		}//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n"
+"	}//i<numPairs\n"
+"// work-in-progress\n"
+"__kernel void   findSeparatingAxisKernel( __global const int4* pairs, \n" // One work-item per pair: for two SHAPE_CONVEX_HULL bodies runs the face pass A-vs-B, the face pass B-vs-A and the edge-edge pass; hasSeparatingAxis[i] ends as 1 (with separatingNormals[i] written) only when all three return true, otherwise 0.
+"																					__global const BodyData* rigidBodies, \n" // read for m_collidableIdx, m_invMass, m_pos and m_quat
+"																					__global const btCollidableGpu* collidables,\n" // read for m_shapeIndex and m_shapeType
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const btGpuFace* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global btAabbCL* aabbs,\n" // not referenced in this kernel
+"																					__global volatile float4* separatingNormals,\n" // out: entry i is written only when all three passes return true
+"																					__global volatile int* hasSeparatingAxis,\n" // out: entry i ends as 0 or 1 (4 is stored only as an intermediate value)
+"																					int numPairs\n" // work-items with a global id at or beyond this value do nothing
+"																					)\n"
+"	int i = get_global_id(0);\n"
+"	\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"	\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		\n"
+"		\n"
+"		//once the broadphase avoids static-static pairs, we can remove this test\n"
+"		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" // both bodies have zero inverse mass: flag 0 and stop
+"		{\n"
+"			hasSeparatingAxis[i] = 0;\n"
+"			return;\n"
+"		}\n"
+"		\n"
+"		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n" // only hull-vs-hull pairs are processed here
+"		{\n"
+"			hasSeparatingAxis[i] = 0;\n"
+"			return;\n"
+"		}\n"
+"			\n"
+"		if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH))\n" // NOTE(review): presumably never true after the hull-only check above - confirm the two constants differ
+"		{\n"
+"			hasSeparatingAxis[i] = 0;\n"
+"			return;\n"
+"		}\n"
+"		int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" // never read
+"		float dmin = FLT_MAX;\n" // running minimum depth shared by the three passes
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"		float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n" // difference of the two transformed local centers
+"		float4 sepNormal = make_float4(1,0,0,0);\n" // defined start value (same as processCompoundPairsKernel): the passes read *sep even when no axis lowered dmin
+"		\n"
+"		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" // face normals of A
+"																								posB,ornB,\n"
+"																								DeltaC2,\n"
+"																								vertices,uniqueEdges,faces,\n"
+"																								indices,&sepNormal,&dmin);\n"
+"		hasSeparatingAxis[i] = 4;\n" // intermediate value; every branch below overwrites it with 0 or 1
+"		if (!sepA)\n"
+"		{\n"
+"			hasSeparatingAxis[i] = 0;\n"
+"		} else\n"
+"		{\n"
+"			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n" // face normals of B: hulls and transforms swapped, DeltaC2 passed unchanged
+"																									posA,ornA,\n"
+"																									DeltaC2,\n"
+"																									vertices,uniqueEdges,faces,\n"
+"																									indices,&sepNormal,&dmin);\n"
+"			if (!sepB)\n"
+"			{\n"
+"				hasSeparatingAxis[i] = 0;\n"
+"			} else\n"
+"			{\n"
+"				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" // edge-edge axes
+"																									posB,ornB,\n"
+"																									DeltaC2,\n"
+"																									vertices,uniqueEdges,faces,\n"
+"																									indices,&sepNormal,&dmin);\n"
+"				if (!sepEE)\n"
+"				{\n"
+"					hasSeparatingAxis[i] = 0;\n"
+"				} else\n"
+"				{\n"
+"					hasSeparatingAxis[i] = 1;\n" // all three passes returned true
+"					separatingNormals[i] = sepNormal;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"	}\n"
+// findSeparatingAxisVertexFaceKernel (stringified OpenCL C).
+// One work-item per entry of `pairs` (guarded by i<numPairs). For each pair it
+// runs the face-normal test twice (hull A against B, then B against A) and
+// records the outcome for the later edge-edge kernel:
+//   hasSeparatingAxis[i] : 0 = pair rejected, 1 = both face tests passed
+//   separatingNormals[i] : candidate axis found so far (only written with 1)
+//   dmins[i]             : running minimum handed on to the edge-edge kernel
+// Static-static pairs and pairs where either shape is not SHAPE_CONVEX_HULL
+// are rejected up front with the flag left at 0.
+// Fix: when the second face test fails the flag is now cleared to 0, matching
+// the combined kernel earlier in this string. Previously it kept the
+// placeholder value 4, which the edge-edge kernel treats as "still a candidate"
+// and then reads dmins[i]==FLT_MAX together with an unwritten normal.
+"__kernel void   findSeparatingAxisVertexFaceKernel( __global const int4* pairs, \n"
+"																					__global const BodyData* rigidBodies, \n"
+"																					__global const btCollidableGpu* collidables,\n"
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const btGpuFace* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global btAabbCL* aabbs,\n"
+"																					__global volatile float4* separatingNormals,\n"
+"																					__global volatile int* hasSeparatingAxis,\n"
+"																					__global  float* dmins,\n"
+"																					int numPairs\n"
+"																					)\n"
+"	int i = get_global_id(0);\n"
+"	\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"	\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"	\n"
+"		hasSeparatingAxis[i] = 0;	\n"
+"		\n"
+"		//once the broadphase avoids static-static pairs, we can remove this test\n"
+"		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
+"		{\n"
+"			return;\n"
+"		}\n"
+"		\n"
+"		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n"
+"		{\n"
+"			return;\n"
+"		}\n"
+"			\n"
+"		int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
+"		float dmin = FLT_MAX;\n"
+"		dmins[i] = dmin;\n"
+"		\n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"		float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n"
+"		float4 sepNormal;\n"
+"		\n"
+"		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"																								posB,ornB,\n"
+"																								DeltaC2,\n"
+"																								vertices,uniqueEdges,faces,\n"
+"																								indices,&sepNormal,&dmin);\n"
+"		hasSeparatingAxis[i] = 4;\n"
+"		if (!sepA)\n"
+"		{\n"
+"			hasSeparatingAxis[i] = 0;\n"
+"		} else\n"
+"		{\n"
+"			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n"
+"																									posA,ornA,\n"
+"																									DeltaC2,\n"
+"																									vertices,uniqueEdges,faces,\n"
+"																									indices,&sepNormal,&dmin);\n"
+"			if (sepB)\n"
+"			{\n"
+"				dmins[i] = dmin;\n"
+"				hasSeparatingAxis[i] = 1;\n"
+"				separatingNormals[i] = sepNormal;\n"
+"			} else\n"
+"			{\n"
+"				hasSeparatingAxis[i] = 0;\n"
+"			}\n"
+"		}\n"
+"		\n"
+"	}\n"
+"__kernel void   findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs, \n" // Edge-edge stage of the split separating-axis test; one work-item per pair, .x/.y are the two body indices.
+"																					__global const BodyData* rigidBodies, \n" // read for m_collidableIdx, m_pos and m_quat
+"																					__global const btCollidableGpu* collidables,\n" // read for m_shapeIndex only
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const btGpuFace* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global btAabbCL* aabbs,\n" // not referenced in this kernel body
+"																					__global  float4* separatingNormals,\n" // in/out: axis candidate from the earlier stage, overwritten when this test passes
+"																					__global  int* hasSeparatingAxis,\n" // in/out: pairs with 0 are skipped; rewritten to 0 or 1 when the edge test runs
+"																					__global  float* dmins,\n" // in: running minimum from the earlier stage (read only here)
+"																					__global const float4* unitSphereDirections,\n" // only used by the commented-out else branch below
+"																					int numUnitSphereDirections,\n" // threshold on the edge-pair count (see below)
+"																					int numPairs\n"
+"																					)\n"
+"	int i = get_global_id(0);\n"
+"	\n"
+"	if (i<numPairs)\n" // surplus work-items do nothing
+"	{\n"
+"		if (hasSeparatingAxis[i])\n" // any non-zero value counts as "still a candidate"
+"		{\n"
+"	\n"
+"			int bodyIndexA = pairs[i].x;\n"
+"			int bodyIndexB = pairs[i].y;\n"
+"	\n"
+"			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"			\n"
+"			\n"
+"			int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" // unused in this kernel
+"	\n"
+"			float dmin = dmins[i];\n" // continue from the value stored by the previous stage
+"	\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			posA.w = 0.f;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			posB.w = 0.f;\n"
+"			float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"			float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"			float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"			float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"			float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"			const float4 DeltaC2 = c0 - c1;\n" // difference of the two transformed local centers
+"			float4 sepNormal = separatingNormals[i];\n"
+"			\n"
+"			\n"
+"			\n"
+"			bool sepEE = false;\n"
+"			int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n"
+"			if (numEdgeEdgeDirections<=numUnitSphereDirections)\n" // pairs with more edge combinations than this are left untouched by this kernel
+"			{\n"
+"				sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"																									posB,ornB,\n"
+"																									DeltaC2,\n"
+"																									vertices,uniqueEdges,faces,\n"
+"																									indices,&sepNormal,&dmin);\n"
+"																									\n"
+"					if (!sepEE)\n"
+"					{\n"
+"						hasSeparatingAxis[i] = 0;\n"
+"					} else\n"
+"					{\n"
+"						hasSeparatingAxis[i] = 1;\n"
+"						separatingNormals[i] = sepNormal;\n" // dmin is not written back to dmins[i] here
+"					}\n"
+"			}\n"
+"			/*\n"
+"			///else case is a separate kernel, to make Mac OSX OpenCL compiler happy\n"
+"			else\n"
+"			{\n"
+"				sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"																									posB,ornB,\n"
+"																									DeltaC2,\n"
+"																									vertices,unitSphereDirections,numUnitSphereDirections,\n"
+"																									&sepNormal,&dmin);\n"
+"					if (!sepEE)\n"
+"					{\n"
+"						hasSeparatingAxis[i] = 0;\n"
+"					} else\n"
+"					{\n"
+"						hasSeparatingAxis[i] = 1;\n"
+"						separatingNormals[i] = sepNormal;\n"
+"					}\n"
+"			}\n"
+"			*/\n"
+"		}		//if (hasSeparatingAxis[i])\n"
+"	}//(i<numPairs)\n"
+"inline int	findClippingFaces(const float4 separatingNormal,\n" // Picks one face on each hull relative to separatingNormal and writes their transformed vertices to the per-pair output slots; always returns 0.
+"                      const ConvexPolyhedronCL* hullA, \n" // hull A and its vertex/face/index arrays are passed without __global; hull B's are __global
+"					  __global const ConvexPolyhedronCL* hullB,\n"
+"                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n"
+"                       __global float4* worldVertsA1,\n" // out: capacityWorldVerts slots per pair, vertices of the chosen A face
+"                      __global float4* worldNormalsA1,\n" // out: one entry per pair, rotated normal of the chosen A face
+"                      __global float4* worldVertsB1,\n" // out: capacityWorldVerts slots per pair, vertices of the chosen B face
+"                      int capacityWorldVerts,\n" // per-pair stride and upper bound for both vertex outputs
+"                      const float minDist, float maxDist,\n" // neither value is read in this function
+"					  const float4* verticesA,\n"
+"                      const btGpuFace* facesA,\n"
+"                      const int* indicesA,\n"
+"					  __global const float4* verticesB,\n"
+"                      __global const btGpuFace* facesB,\n"
+"                      __global const int* indicesB,\n"
+"                      __global int4* clippingFaces, int pairIndex)\n" // out: (faceA, faceB, vertex count A, vertex count B) for pairIndex
+"	int numContactsOut = 0;\n" // never modified below, so the return value is always 0
+"	int numWorldVertsB1= 0;\n"
+"    \n"
+"    \n"
+"	int closestFaceB=0;\n"
+"	float dmax = -FLT_MAX;\n"
+"    \n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n" // choose the B face whose rotated normal has the largest dot product with separatingNormal
+"		{\n"
+"			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,\n"
+"                                              facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"	{\n"
+"		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
+"		int numVertices = polyB.m_numIndices;\n"
+"        if (numVertices>capacityWorldVerts)\n" // clamp so the per-pair slot cannot be overrun
+"            numVertices = capacityWorldVerts;\n"
+"        \n"
+"		for(int e0=0;e0<numVertices;e0++)\n"
+"		{\n"
+"            if (e0<capacityWorldVerts)\n" // always true after the clamp above
+"            {\n"
+"                const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n"
+"                worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"            }\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int closestFaceA=0;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n" // choose the A face whose rotated normal has the smallest dot product with separatingNormal
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.x,\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.y,\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.z,\n"
+"                                              0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"            \n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"                worldNormalsA1[pairIndex] = faceANormalWS;\n" // rewritten on every new minimum; the last write belongs to closestFaceA
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;\n"
+"    if (numVerticesA>capacityWorldVerts)\n"
+"       numVerticesA = capacityWorldVerts;\n"
+"    \n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"        if (e0<capacityWorldVerts)\n" // always true after the clamp above
+"        {\n"
+"            const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n"
+"            worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n"
+"        }\n"
+"    }\n"
+"    \n"
+"    clippingFaces[pairIndex].x = closestFaceA;\n"
+"    clippingFaces[pairIndex].y = closestFaceB;\n"
+"    clippingFaces[pairIndex].z = numVerticesA;\n"
+"    clippingFaces[pairIndex].w = numWorldVertsB1;\n"
+"    \n"
+"    \n"
+"	return numContactsOut;\n"
+"// work-in-progress\n"
+"__kernel void   findConcaveSeparatingAxisKernel( __global int4* concavePairs,\n" // One work-item per concave pair: x/y = body indices, z = face index in A, w = compound child index of B; z and w are overwritten on exit.
+"																					__global const BodyData* rigidBodies,\n"
+"																					__global const btCollidableGpu* collidables,\n"
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const btGpuFace* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global const btGpuChildShape* gpuChildShapes,\n" // indexed directly with concavePairs[].w for compound shapes
+"																					__global btAabbCL* aabbs,\n" // indexed by body index; only B's entry is read
+"																					__global float4* concaveSeparatingNormalsOut,\n" // out: axis in xyz, dmin in w
+"																					__global int* concaveHasSeparatingNormals,\n" // out: cleared first, set to 1 when all three tests pass
+"																					__global int4* clippingFacesOut,\n" // filled by findClippingFaces
+"																					__global float4* worldVertsA1GPU,\n"
+"																					__global float4*  worldNormalsAGPU,\n"
+"																					__global float4* worldVertsB1GPU,\n"
+"																					int vertexFaceCapacity,\n" // forwarded as capacityWorldVerts
+"																					int numConcavePairs\n"
+"																					)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numConcavePairs)\n"
+"		return;\n"
+"	concaveHasSeparatingNormals[i] = 0;\n"
+"	int pairIdx = i;\n"
+"	int bodyIndexA = concavePairs[i].x;\n"
+"	int bodyIndexB = concavePairs[i].y;\n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n" // B must be a convex hull or a compound of convex hulls
+"		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"	{\n"
+"		concavePairs[pairIdx].w = -1;\n" // w = -1 marks the pair inactive; z is left unchanged on this path
+"		return;\n"
+"	}\n"
+"	int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" // unused below
+"	int numActualConcaveConvexTests = 0;\n" // incremented once below but never read
+"	\n"
+"	int f = concavePairs[i].z;\n" // face (triangle) of shape A handled by this work-item
+"	\n"
+"	bool overlap = false;\n"
+"	\n"
+"	ConvexPolyhedronCL convexPolyhedronA;\n" // temporary hull describing just this triangle
+"	//add 3 vertices of the triangle\n"
+"	convexPolyhedronA.m_numVertices = 3;\n"
+"	convexPolyhedronA.m_vertexOffset = 0;\n"
+"	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
+"	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"	float4 triMinAabb, triMaxAabb;\n" // unused
+"	btAabbCL triAabb;\n"
+"	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n"
+"	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n"
+"	\n"
+"	float4 verticesA[3];\n"
+"	for (int i=0;i<3;i++)\n" // note: this loop index shadows the work-item index i
+"	{\n"
+"		int index = indices[face.m_indexOffset+i];\n"
+"		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"		verticesA[i] = vert;\n"
+"		localCenter += vert;\n"
+"			\n"
+"		triAabb.m_min = min(triAabb.m_min,vert);		\n"
+"		triAabb.m_max = max(triAabb.m_max,vert);		\n"
+"	}\n"
+"	overlap = true;\n"
+"	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" // NOTE(review): the triangle box is built from untransformed vertices of A; assumes that frame matches aabbs[] - confirm
+"	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n"
+"	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n"
+"		\n"
+"	if (overlap)\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		int hasSeparatingAxis=5;\n" // placeholder, overwritten before it is tested
+"		float4 sepAxis=make_float4(1,2,3,4);\n" // placeholder passed by pointer to the axis tests
+"		int localCC=0;\n" // unused
+"		numActualConcaveConvexTests++;\n"
+"		//a triangle has 3 unique edges\n"
+"		convexPolyhedronA.m_numUniqueEdges = 3;\n"
+"		convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
+"		float4 uniqueEdgesA[3];\n"
+"		\n"
+"		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n"
+"		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
+"		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
+"		convexPolyhedronA.m_faceOffset = 0;\n"
+"                                  \n"
+"		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
+"                             \n"
+"		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" // five entries are written below: front, back and three edge planes
+"		int indicesA[3+3+2+2+2];\n" // 3 per triangle side plus 2 per edge plane
+"		int curUsedIndices=0;\n"
+"		int fidx=0;\n"
+"		//front size of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[0] = 0;\n"
+"			indicesA[1] = 1;\n"
+"			indicesA[2] = 2;\n"
+"			curUsedIndices+=3;\n"
+"			float c = face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = normal.x;\n"
+"			facesA[fidx].m_plane.y = normal.y;\n"
+"			facesA[fidx].m_plane.z = normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		//back size of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[3]=2;\n" // reversed winding for the back side
+"			indicesA[4]=1;\n"
+"			indicesA[5]=0;\n"
+"			curUsedIndices+=3;\n"
+"			float c = dot(normal,verticesA[0]);\n"
+"			float c1 = -face.m_plane.w;\n" // computed but not used
+"			facesA[fidx].m_plane.x = -normal.x;\n"
+"			facesA[fidx].m_plane.y = -normal.y;\n"
+"			facesA[fidx].m_plane.z = -normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		bool addEdgePlanes = true;\n"
+"		if (addEdgePlanes)\n"
+"		{\n"
+"			int numVertices=3;\n"
+"			int prevVertex = numVertices-1;\n"
+"			for (int i=0;i<numVertices;i++)\n" // one two-vertex face per triangle edge, normal = normalize(cross(normal, edge))
+"			{\n"
+"				float4 v0 = verticesA[i];\n"
+"				float4 v1 = verticesA[prevVertex];\n"
+"                                            \n"
+"				float4 edgeNormal = normalize(cross(normal,v1-v0));\n"
+"				float c = -dot(edgeNormal,v0);\n"
+"				facesA[fidx].m_numIndices = 2;\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[curUsedIndices++]=i;\n"
+"				indicesA[curUsedIndices++]=prevVertex;\n"
+"                                            \n"
+"				facesA[fidx].m_plane.x = edgeNormal.x;\n"
+"				facesA[fidx].m_plane.y = edgeNormal.y;\n"
+"				facesA[fidx].m_plane.z = edgeNormal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				fidx++;\n"
+"				prevVertex = i;\n"
+"			}\n"
+"		}\n"
+"		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n"
+"		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" // average of the three vertices
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"		\n"
+"		///////////////////\n"
+"		///compound shape support\n"
+"		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"		{\n"
+"			int compoundChild = concavePairs[pairIdx].w;\n"
+"			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n"
+"			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n" // from here on B refers to the selected child: composed pose and the child's own shape index
+"			ornB = newOrnB;\n"
+"			shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"		}\n"
+"		//////////////////\n"
+"		float4 c0local = convexPolyhedronA.m_localCenter;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n"
+"		bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],\n" // the three tests run in sequence; the first false result clears the flag
+"												posA,ornA,\n"
+"												posB,ornB,\n"
+"												DeltaC2,\n"
+"												verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"												vertices,uniqueEdges,faces,indices,\n"
+"												&sepAxis,&dmin);\n"
+"		hasSeparatingAxis = 4;\n"
+"		if (!sepA)\n"
+"		{\n"
+"			hasSeparatingAxis = 0;\n"
+"		} else\n"
+"		{\n"
+"			bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,\n"
+"												posB,ornB,\n"
+"												posA,ornA,\n"
+"												DeltaC2,\n"
+"												vertices,uniqueEdges,faces,indices,\n"
+"												verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"												&sepAxis,&dmin);\n"
+"			if (!sepB)\n"
+"			{\n"
+"				hasSeparatingAxis = 0;\n"
+"			} else\n"
+"			{\n"
+"				bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],\n"
+"															posA,ornA,\n"
+"															posB,ornB,\n"
+"															DeltaC2,\n"
+"															verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"															vertices,uniqueEdges,faces,indices,\n"
+"															&sepAxis,&dmin);\n"
+"	\n"
+"				if (!sepEE)\n"
+"				{\n"
+"					hasSeparatingAxis = 0;\n"
+"				} else\n"
+"				{\n"
+"					hasSeparatingAxis = 1;\n"
+"				}\n"
+"			}\n"
+"		}	\n"
+"		\n"
+"		if (hasSeparatingAxis)\n"
+"		{\n"
+"			sepAxis.w = dmin;\n" // dmin is stored in the w lane of the output axis
+"			concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n"
+"			concaveHasSeparatingNormals[i]=1;\n"
+"			float minDist = -1e30f;\n"
+"			float maxDist = 0.02f;\n"
+"		\n"
+"			findClippingFaces(sepAxis,\n" // return value is ignored; results are written through the output pointers
+"                     &convexPolyhedronA,\n"
+"					 &convexShapes[shapeIndexB],\n"
+"					 posA,ornA,\n"
+"					 posB,ornB,\n"
+"                      worldVertsA1GPU,\n"
+"                      worldNormalsAGPU,\n"
+"                      worldVertsB1GPU,\n"
+"					  vertexFaceCapacity,\n"
+"                      minDist, maxDist,\n"
+"                      verticesA,\n"
+"                      facesA,\n"
+"                      indicesA,\n"
+" 					  vertices,\n"
+"                      faces,\n"
+"                      indices,\n"
+"                      clippingFacesOut, pairIdx);\n"
+"		} else\n"
+"		{	\n"
+"			//mark this pair as in-active\n"
+"			concavePairs[pairIdx].w = -1;\n"
+"		}\n"
+"	}\n"
+"	else\n"
+"	{	\n"
+"		//mark this pair as in-active\n"
+"		concavePairs[pairIdx].w = -1;\n"
+"	}\n"
+"	\n"
+"	concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts\n"
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp
new file mode 100644
index 00000000..a4980f71
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp
@@ -0,0 +1,213 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+//Host-code rewritten by Erwin Coumans
+#define BOUNDSEARCH_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl"
+#define KERNEL0 "SearchSortDataLowerKernel"
+#define KERNEL1 "SearchSortDataUpperKernel"
+#define KERNEL2 "SubtractKernel"
+#include "b3BoundSearchCL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "b3LauncherCL.h"
+#include "kernels/BoundSearchKernelsCL.h"
+// Builds the bound-search program from the embedded kernel source and creates
+// the kernels used by execute().
+//   ctx, device, queue : OpenCL objects stored in the members and used for all launches
+//   maxSize            : element capacity of the scratch arrays used by the COUNT
+//                        option; 0 disables COUNT (no subtract kernel, no scratch arrays)
+// The kernel names come from the KERNEL0/1/2 macros defined at the top of this
+// file so each name is spelled in one place only.
+b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
+	:m_context(ctx),
+	m_device(device),
+	m_queue(queue)
+	const char* additionalMacros = "";
+	//const char* srcFileNameForCaching="";
+	cl_int pErrNum;
+	const char* kernelSource = boundSearchKernelsCL;
+	cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
+	b3Assert(boundSearchProg);
+	m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, KERNEL0, &pErrNum, boundSearchProg,additionalMacros );
+	b3Assert(m_lowerSortDataKernel );
+	m_upperSortDataKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, KERNEL1, &pErrNum, boundSearchProg,additionalMacros );
+	b3Assert(m_upperSortDataKernel);
+	m_subtractKernel = 0;
+	if( maxSize )
+	{
+		m_subtractKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, KERNEL2, &pErrNum, boundSearchProg,additionalMacros );
+		b3Assert(m_subtractKernel);
+	}
+	//m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST );
+	m_lower = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue,maxSize );
+	m_upper = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue, maxSize );
+	m_filler = new b3FillCL(ctx,device,queue);
+	// The kernels keep their own reference to the program, so drop ours here;
+	// otherwise the program object is never released.
+	if( boundSearchProg )
+	{
+		clReleaseProgram(boundSearchProg);
+	}
+	delete m_lower;
+	delete m_upper;
+	delete m_filler;
+	clReleaseKernel(m_lowerSortDataKernel);
+	clReleaseKernel(m_upperSortDataKernel);
+	clReleaseKernel(m_subtractKernel);
+// Runs one of the bound-search kernels on the device.
+//   src, nSrc : sorted key/value data (see the header: src[i].m_key <= src[i+1].m_key)
+//               and the number of entries to process
+//   dst, nDst : output array and the number of entries in it
+//   option    : BOUND_LOWER / BOUND_UPPER launch the matching kernel over nSrc
+//               work-items; COUNT zero-fills the two scratch arrays, runs both
+//               bound searches into them and launches the subtract kernel over
+//               nDst work-items to combine them into dst
+// COUNT requires the object to have been constructed with a non-zero size, and
+// the scratch arrays must hold at least nDst entries because nDst entries of
+// each are filled. The capacity asserts previously had the comparison
+// reversed and passed precisely when the fill would overrun.
+void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option )
+	if( option == BOUND_LOWER )
+	{
+		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL()) };
+		b3LauncherCL launcher( m_queue, m_lowerSortDataKernel,"m_lowerSortDataKernel" );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst( nSrc );
+        launcher.setConst( nDst );
+		launcher.launch1D( nSrc, 64 );
+	}
+	else if( option == BOUND_UPPER )
+	{
+		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
+		b3LauncherCL launcher(m_queue, m_upperSortDataKernel,"m_upperSortDataKernel" );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+        launcher.setConst( nSrc );
+        launcher.setConst( nDst );
+		launcher.launch1D( nSrc, 64 );
+	}
+	else if( option == COUNT )
+	{
+		b3Assert( m_lower );
+		b3Assert( m_upper );
+		// nDst entries of each scratch array are written below.
+		b3Assert( m_lower->capacity() >= (int)nDst );
+		b3Assert( m_upper->capacity() >= (int)nDst );
+		int zero = 0;
+		m_filler->execute( *m_lower, zero, nDst );
+		m_filler->execute( *m_upper, zero, nDst );
+		execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
+		execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );
+		{
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_upper->getBufferCL(), true ), b3BufferInfoCL( m_lower->getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
+			b3LauncherCL  launcher( m_queue, m_subtractKernel ,"m_subtractKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+            launcher.setConst( nSrc );
+            launcher.setConst( nDst );
+			launcher.launch1D( nDst, 64 );
+		}
+	}
+	else
+	{
+		// Unknown option value.
+		b3Assert( 0 );
+	}
+void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, 
+	b3AlignedObjectArray<unsigned int>& dst,  int nDst, Option option )
+	for(int i=0; i<nSrc-1; i++) 
+		b3Assert( src[i].m_key <= src[i+1].m_key );
+	b3SortData minData,zeroData,maxData;
+	minData.m_key = -1;
+	minData.m_value = -1;
+	zeroData.m_key=0;
+	zeroData.m_value=0;
+	maxData.m_key = nDst;
+	maxData.m_value = nDst;
+	if( option == BOUND_LOWER )
+	{
+		for(int i=0; i<nSrc; i++)
+		{
+			b3SortData& iData = (i==0)? minData: src[i-1];
+			b3SortData& jData = (i==nSrc)? maxData: src[i];
+			if( iData.m_key != jData.m_key )
+			{
+				int k = jData.m_key;
+				{
+					dst[k] = i;
+				}
+			}
+		}
+	}
+	else if( option == BOUND_UPPER )
+	{
+		for(int i=1; i<nSrc+1; i++)
+		{
+			b3SortData& iData = src[i-1];
+			b3SortData& jData = (i==nSrc)? maxData: src[i];
+			if( iData.m_key != jData.m_key )
+			{
+				int k = iData.m_key;
+				{
+					dst[k] = i;
+				}
+			}
+		}
+	}
+	else if( option == COUNT )
+	{
+		b3AlignedObjectArray<unsigned int> lower;
+		lower.resize(nDst );
+		b3AlignedObjectArray<unsigned int> upper;
+		upper.resize(nDst );
+		for(int i=0; i<nDst; i++) 
+		{ 
+			lower[i] = upper[i] = 0; 
+		}
+		executeHost( src, nSrc, lower, nDst, BOUND_LOWER );
+		executeHost( src, nSrc, upper, nDst, BOUND_UPPER );
+		for( int i=0; i<nDst; i++) 
+		{ 
+			dst[i] = upper[i] - lower[i]; 
+		}
+	}
+	else
+	{
+		b3Assert( 0 );
+	}
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h
new file mode 100644
index 00000000..7e294096
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h
@@ -0,0 +1,67 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+#pragma once
+/*#include <Adl/Adl.h>
+#include <AdlPrimitives/Math/Math.h>
+#include <AdlPrimitives/Sort/SortData.h>
+#include <AdlPrimitives/Fill/Fill.h>
+#include "b3OpenCLArray.h"
+#include "b3FillCL.h"
+#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
+class b3BoundSearchCL // Bound search over key-sorted b3SortData, with an OpenCL path (execute) and a CPU path (executeHost).
+	public:
+		enum Option // selects which search execute()/executeHost() perform
+		{
+			COUNT, // per-key count, computed as upper bound minus lower bound
+		};
+		cl_context m_context; // the three OpenCL handles are stored from the constructor arguments
+		cl_device_id m_device;
+		cl_command_queue m_queue;
+		cl_kernel m_lowerSortDataKernel; // created from "SearchSortDataLowerKernel"
+		cl_kernel m_upperSortDataKernel; // created from "SearchSortDataUpperKernel"
+		cl_kernel m_subtractKernel; // created from "SubtractKernel"; left 0 when constructed with size 0
+		b3OpenCLArray<b3Int4>* m_constbtOpenCLArray; // NOTE(review): not assigned anywhere in the visible .cpp - confirm it is unused
+		b3OpenCLArray<unsigned int>* m_lower; // COUNT scratch array; null when constructed with size 0
+		b3OpenCLArray<unsigned int>* m_upper; // COUNT scratch array; null when constructed with size 0
+		b3FillCL* m_filler; // zero-fills the scratch arrays before a COUNT run
+		b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size); // size = capacity of the COUNT scratch arrays, 0 disables COUNT
+		virtual ~b3BoundSearchCL();
+		//	src has to be src[i].m_key <= src[i+1].m_key
+		void execute( b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER ); // device path
+		void executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER); // CPU path over host arrays
+#endif //B3_BOUNDSEARCH_H
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h
new file mode 100644
index 00000000..52f219ae
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h
@@ -0,0 +1,19 @@
+#ifndef B3_BUFFER_INFO_CL_H
+#define B3_BUFFER_INFO_CL_H
+#include "b3OpenCLArray.h"
+struct b3BufferInfoCL // Pairs an OpenCL buffer handle with a read-only flag; arrays of these are handed to b3LauncherCL::setBuffers.
+	//b3BufferInfoCL(){}
+//	template<typename T>
+	b3BufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){} // buffers are treated as writable unless the caller passes true
+	cl_mem m_clBuffer; // handle only; this struct never retains or releases it
+	bool m_isReadOnly; // presumably a hint for the launcher - its use is not visible here
+#endif //B3_BUFFER_INFO_CL_H
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp
new file mode 100644
index 00000000..f05c2648
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp
@@ -0,0 +1,126 @@
+#include "b3FillCL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "b3BufferInfoCL.h"
+#include "b3LauncherCL.h"
+#define FILL_CL_PROGRAM_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl"
+#include "kernels/FillKernelsCL.h"
+// Compiles the embedded fill program (fillKernelsCL) and creates the four kernels used by
+// the execute() overloads: FillIntKernel, FillUnsignedIntKernel, FillFloatKernel, FillInt2Kernel.
+// Each creation is asserted individually so a missing kernel is caught at construction time.
+// NOTE(review): `queue` is not stored anywhere in the lines visible here although execute()
+// reads m_commandQueue - confirm the member initializer is present in the real source.
+b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
+	const char* kernelSource = fillKernelsCL;
+	cl_int pErrNum;
+	const char* additionalMacros = "";
+	cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
+	b3Assert(fillProg);
+	m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
+	b3Assert(m_fillIntKernel);
+	m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
+	// Check the kernel that was just created (the previous code re-checked m_fillIntKernel here).
+	b3Assert(m_fillUnsignedIntKernel);
+	m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
+	b3Assert(m_fillFloatKernel);
+	m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
+	b3Assert(m_fillKernelInt2);
+	clReleaseKernel(m_fillKernelInt2); // NOTE(review): no function header precedes these releases in this text; presumably the body of ~b3FillCL() declared in b3FillCL.h - confirm
+	clReleaseKernel(m_fillIntKernel); // one release per kernel created in the constructor
+	clReleaseKernel(m_fillUnsignedIntKernel);
+	clReleaseKernel(m_fillFloatKernel);
+void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset) // Enqueues m_fillFloatKernel on m_commandQueue with arguments (src buffer, n, value, offset).
+	b3Assert( n>0 ); // an empty fill is treated as a caller error
+	{
+		b3LauncherCL launcher( m_commandQueue, m_fillFloatKernel,"m_fillFloatKernel" );
+		launcher.setBuffer( src.getBufferCL());
+		launcher.setConst( n ); // argument order is n, value, offset; the kernel source is not part of this section
+		launcher.setConst( value );
+		launcher.setConst( offset);
+		launcher.launch1D( n ); // n work-items requested, using launch1D's default local size
+	}
+void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset) // Enqueues m_fillIntKernel on m_commandQueue with arguments (src buffer, n, value, offset).
+	b3Assert( n>0 ); // an empty fill is treated as a caller error
+	{
+		b3LauncherCL launcher( m_commandQueue, m_fillIntKernel ,"m_fillIntKernel");
+		launcher.setBuffer(src.getBufferCL());
+		launcher.setConst( n); // argument order is n, value, offset; the kernel source is not part of this section
+		launcher.setConst( value);
+		launcher.setConst( offset);
+		launcher.launch1D( n ); // n work-items requested, using launch1D's default local size
+	}
+void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset) // Enqueues m_fillUnsignedIntKernel on m_commandQueue with arguments (src buffer, n, value, offset).
+	b3Assert( n>0 ); // an empty fill is treated as a caller error
+	{
+		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) }; // single-entry list; this overload goes through setBuffers rather than setBuffer
+		b3LauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel,"m_fillUnsignedIntKernel" );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst( n ); // argument order is n, value, offset; the kernel source is not part of this section
+        launcher.setConst(value);
+		launcher.setConst(offset);
+		launcher.launch1D( n ); // n work-items requested, using launch1D's default local size
+	}
+void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset) // Host-side fill: assigns value to src[offset] .. src[offset+n-1].
+	for (int i=0;i<n;i++) // does nothing when n <= 0; no range check against src is made here
+	{
+		src[i+offset]=value;
+	}
+void b3FillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset) // Host-side fill: assigns value to src[offset] .. src[offset+n-1].
+	for (int i=0;i<n;i++) // does nothing when n <= 0; no range check against src is made here
+	{
+		src[i+offset]=value;
+	}
+void b3FillCL::execute(b3OpenCLArray<b3Int2> &src, const b3Int2 &value, int n, int offset) // Enqueues m_fillKernelInt2 on m_commandQueue with arguments (src buffer, n, value, offset).
+	b3Assert( n>0 ); // an empty fill is treated as a caller error
+	{
+		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) }; // single-entry list handed to setBuffers
+		b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2,"m_fillKernelInt2");
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(n); // argument order is n, value, offset; the kernel source is not part of this section
+		launcher.setConst(value); // the whole b3Int2 is passed by value as one kernel argument
+		launcher.setConst(offset);
+		//( constBuffer );
+		launcher.launch1D( n ); // n work-items requested, using launch1D's default local size
+	}
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h
new file mode 100644
index 00000000..1609676b
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h
@@ -0,0 +1,63 @@
+#ifndef B3_FILL_CL_H
+#define B3_FILL_CL_H
+#include "b3OpenCLArray.h"
+#include "Bullet3Common/b3Scalar.h"
+#include "Bullet3Common/shared/b3Int2.h"
+#include "Bullet3Common/shared/b3Int4.h"
+class b3FillCL // Fill primitive: writes one value into a range of an OpenCL array (execute) or a host array (executeHost).
+	cl_command_queue	m_commandQueue; // queue the execute() overloads launch their kernels on
+	cl_kernel			m_fillKernelInt2; // one kernel per supported element type
+	cl_kernel			m_fillIntKernel;
+	cl_kernel			m_fillUnsignedIntKernel;
+	cl_kernel			m_fillFloatKernel;
+	public:
+		struct b3ConstData // not referenced by any code visible in this patch section
+		{
+			union
+			{
+				b3Int4 m_data;
+				b3UnsignedInt4 m_UnsignedData;
+			};
+			int m_offset;
+			int m_n;
+			int m_padding[2];
+		};
+		b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
+		virtual ~b3FillCL();
+		void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0); // device fills: n must be > 0 (asserted in the .cpp)
+		void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
+		void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
+		void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
+		void executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset); // host fills: plain loop, offset has no default
+		void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset);
+	//	void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0);
+#endif //B3_FILL_CL_H
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp
new file mode 100644
index 00000000..94590d11
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp
@@ -0,0 +1,308 @@
+#include "b3LauncherCL.h"
+bool gDebugLauncherCL = false; // when true, launcher construction and teardown print trace lines
+b3LauncherCL::b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name) // NOTE(review): no member initializers for queue/kernel/name are visible in this text - confirm against the real source
+	if (gDebugLauncherCL)
+	{
+		static int counter = 0; // running number of launchers constructed while debugging is on
+		printf("[%d] Prepare to launch OpenCL kernel %s\n", counter++, name);
+	}
+      m_serializationSizeInBytes = sizeof(int); // a serialized argument list always starts with an int argument count
+  { // NOTE(review): no function header precedes this block in this text; presumably the body of ~b3LauncherCL() - confirm
+      for (int i=0;i<m_arrays.size();i++) // deserializeArgs allocates these arrays with new and stores them in m_arrays
+      {
+		  delete (m_arrays[i]);
+      }
+	  m_arrays.clear();
+	  if (gDebugLauncherCL)
+	  {
+		static int counter = 0; // separate counter from the one in the constructor
+		printf("[%d] Finished launching OpenCL kernel %s\n", counter++,m_name);
+	  }
+  }
+void b3LauncherCL::setBuffer( cl_mem clBuffer) // Binds clBuffer as the next kernel argument (index m_idx); when serialization is enabled, also records it.
+		if (m_enableSerialization)
+		{
+			b3KernelArgData kernelArg;
+			kernelArg.m_argIndex = m_idx;
+			kernelArg.m_isBuffer = 1;
+			kernelArg.m_clBuffer = clBuffer;
+			cl_mem_info param_name = CL_MEM_SIZE; // ask the runtime for the buffer's size in bytes
+			size_t param_value;
+			size_t sizeInBytes = sizeof(size_t);
+			size_t actualSizeInBytes;
+			cl_int err;
+			err = clGetMemObjectInfo (	kernelArg.m_clBuffer,
+									  param_name,
+									  sizeInBytes,
+									  &param_value,
+									  &actualSizeInBytes);
+			b3Assert( err == CL_SUCCESS );
+			kernelArg.m_argSizeInBytes = param_value; // size_t narrowed to int
+			m_kernelArguments.push_back(kernelArg);
+			m_serializationSizeInBytes+= sizeof(b3KernelArgData); // serialized form = header ...
+			m_serializationSizeInBytes+=param_value; // ... followed by the full buffer content
+            }
+            cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer); // the argument is always set, with or without serialization
+		b3Assert( status == CL_SUCCESS );
+// Binds n buffers, in order, as the next n kernel arguments.
+// Each entry is handed to setBuffer(), which records it for serialization when enabled and then
+// calls clSetKernelArg; this replaces a verbatim copy of that logic that used to live in this loop.
+//   buffInfo  array of at least n entries; only m_clBuffer of each entry is used
+//   n         number of entries to bind; nothing happens when n <= 0
+void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n )
+	for(int i=0; i<n; i++)
+	{
+		setBuffer( buffInfo[i].m_clBuffer );
+	}
+struct b3KernelArgDataUnaligned // Same member list as b3KernelArgData but without the 16-byte alignment attribute; deserializeArgs uses it to view headers inside a byte buffer.
+    int m_isBuffer;
+    int m_argIndex;
+    int m_argSizeInBytes;
+	int m_unusedPadding;
+    union
+    {
+        cl_mem m_clBuffer;
+        unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
+    };
+#include <string.h>
+int b3LauncherCL::deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx) // Rebuilds kernel arguments from a serialized blob, sets them on m_kernel and returns the number of bytes consumed.
+    int index=0;
+    int numArguments = *(int*) &buf[index]; // the blob starts with the argument count
+    index+=sizeof(int);
+    for (int i=0;i<numArguments;i++) // NOTE(review): bufSize is never consulted, so a truncated or corrupt blob is read past its end
+    {
+        b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index]; // header viewed in place inside the byte buffer
+        index+=sizeof(b3KernelArgData); // assumes the aligned and unaligned header structs have the same size - TODO confirm
+        if (arg->m_isBuffer)
+        {
+            b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
+            clData->resize(arg->m_argSizeInBytes);
+            clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes); // buffer content follows its header in the blob
+            arg->m_clBuffer = clData->getBufferCL(); // overwrites the handle stored in the caller's blob with the new buffer
+            m_arrays.push_back(clData); // kept so the array can be deleted later
+            cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
+		b3Assert( status == CL_SUCCESS );
+            index+=arg->m_argSizeInBytes;
+        } else 
+        {
+            cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData); // constants are stored inline in the header
+		b3Assert( status == CL_SUCCESS );
+        }
+		b3KernelArgData b;
+		memcpy(&b,arg,sizeof(b3KernelArgDataUnaligned)); // byte copy into an aligned instance before storing it
+	m_kernelArguments.push_back(b);
+    }
+m_serializationSizeInBytes = index;
+    return index;
+// Compares the arguments recorded by this launcher against a serialized "gold" blob.
+// Layout read from goldBuffer: an int argument count, then per argument a b3KernelArgData header
+// followed, for buffer arguments only, by m_argSizeInBytes bytes of expected buffer content.
+// Buffer arguments are read back from the device (blocking) and compared byte by byte;
+// constant arguments are compared against the bytes stored in the header.
+// Returns the number of gold bytes consumed on success, or a negative code:
+//   -1 argument count differs        -2 argument size differs
+//   -3 buffer/constant kind differs  -4 device buffer content differs
+//   -5 constant content differs
+// goldBufferCapacity and ctx are not consulted.
+int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
+  {
+	 int index=0;
+      int numArguments = *(int*) &goldBuffer[index];
+      index+=sizeof(int);
+	if (numArguments != m_kernelArguments.size())
+	{
+		printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
+		return -1;
+	}
+      for (int ii=0;ii<numArguments;ii++)
+      {
+          b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];
+		if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
+		{
+			printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
+			return -2;
+		}
+		{
+			int expected = argGold->m_isBuffer;
+			int found = m_kernelArguments[ii].m_isBuffer;
+			if (expected != found)
+			{
+				printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
+				return -3;
+			}
+		}
+		index+=sizeof(b3KernelArgData);
+		if (argGold->m_isBuffer)
+          {
+			const int numBytes = m_kernelArguments[ii].m_argSizeInBytes;
+			unsigned char* memBuf= (unsigned char*) malloc(numBytes);
+			unsigned char* goldBuf = &goldBuffer[index];
+			// Pre-fill with a known pattern so bytes the read-back does not touch are recognisable.
+			// The bound uses argument ii (the previous code indexed m_kernelArguments with the byte counter).
+			for (int j=0;j<numBytes;j++)
+			{
+				memBuf[j] = 0xaa;
+			}
+			cl_int status = 0;
+			status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, numBytes,
+                                           memBuf, 0,0,0 );
+              b3Assert( status==CL_SUCCESS );
+              clFinish(m_commandQueue);
+			bool contentMatches = true;
+			for (int b=0;b<numBytes;b++)
+			{
+				int expected = goldBuf[b];
+				int found = memBuf[b];
+				if (expected != found)
+				{
+					printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
+						ii, b, expected, found);
+					contentMatches = false;
+					break;
+				}
+			}
+			// Release the read-back copy on every path (it used to leak, including on the -4 return).
+			free(memBuf);
+			if (!contentMatches)
+			{
+				return -4;
+			}
+              index+=argGold->m_argSizeInBytes;
+          } else 
+          {
+			//compare content
+			for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
+			{
+				int expected = argGold->m_argData[b];
+				int found =m_kernelArguments[ii].m_argData[b];
+				if (expected != found)
+				{
+					printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
+						ii, b, expected, found);
+					return -5;
+				}
+			}
+          }
+      }
+      return index;
+// Writes the recorded kernel arguments into destBuffer and returns the number of bytes written.
+// Layout: an int argument count, then per argument a b3KernelArgData header followed, for buffer
+// arguments only, by the buffer's current device content (m_argSizeInBytes bytes).
+// destBufferCapacity must be at least getSerializationBufferSize(); this is only asserted.
+// Headers land at arbitrary byte offsets (the first one directly after the 4-byte count), so they
+// are copied with memcpy and their fields are read from the aligned source element. The previous
+// code assigned and dereferenced a 16-byte-aligned struct through a misaligned pointer.
+int b3LauncherCL::serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
+//initialize to known values
+for (int i=0;i<destBufferCapacity;i++)
+	destBuffer[i] = 0xec;
+    assert(destBufferCapacity>=m_serializationSizeInBytes);
+    //todo: use the b3Serializer for this to allow for 32/64bit, endianness etc        
+    int numArguments = m_kernelArguments.size();
+    int curBufferSize = 0;
+    memcpy(&destBuffer[curBufferSize], &numArguments, sizeof(int));
+    curBufferSize += sizeof(int);
+    for (int i=0;i<numArguments;i++)
+    {
+        const b3KernelArgData& arg = m_kernelArguments[i];
+        memcpy(&destBuffer[curBufferSize], &arg, sizeof(b3KernelArgData));
+        curBufferSize+=sizeof(b3KernelArgData);
+        if (arg.m_isBuffer==1)
+        {
+            //copy the OpenCL buffer content; the read is enqueued non-blocking and completed by clFinish
+            cl_int status = 0;
+            status = clEnqueueReadBuffer( m_commandQueue, arg.m_clBuffer, 0, 0, arg.m_argSizeInBytes,
+                                         &destBuffer[curBufferSize], 0,0,0 );
+            b3Assert( status==CL_SUCCESS );
+            clFinish(m_commandQueue);
+            curBufferSize+=arg.m_argSizeInBytes;
+        }
+    }
+    return curBufferSize;
+void b3LauncherCL::serializeToFile(const char* fileName, int numWorkItems)
+	int num = numWorkItems;
+	int buffSize = getSerializationBufferSize();
+	unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
+	for (int i=0;i<buffSize+1;i++)
+	{
+		unsigned char* ptr = (unsigned char*)&buf[i];
+		*ptr = 0xff;
+	}
+//	int actualWrite = serializeArguments(buf,buffSize);
+//	unsigned char* cptr = (unsigned char*)&buf[buffSize];
+//            printf("buf[buffSize] = %d\n",*cptr);
+	assert(buf[buffSize]==0xff);//check for buffer overrun
+	int* ptr = (int*)&buf[buffSize];
+	*ptr = num;
+	FILE* f = fopen(fileName,"wb");
+	fwrite(buf,buffSize+sizeof(int),1,f);
+	fclose(f);
+	delete[] buf;
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h
new file mode 100644
index 00000000..1b267b31
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h
@@ -0,0 +1,135 @@
+#ifndef B3_LAUNCHER_CL_H
+#define B3_LAUNCHER_CL_H
+#include "b3BufferInfoCL.h"
+#include "Bullet3Common/b3MinMax.h"
+#include "b3OpenCLArray.h"
+#include <stdio.h>
+#ifdef _WIN32
+#pragma warning(disable :4996)
+#define B3_CL_MAX_ARG_SIZE 16
+B3_ATTRIBUTE_ALIGNED16(struct) b3KernelArgData // One recorded kernel argument: either a buffer handle or up to B3_CL_MAX_ARG_SIZE bytes of constant data.
+    int m_isBuffer; // 1 when the union holds m_clBuffer, 0 when it holds m_argData
+    int m_argIndex; // kernel argument index at the time it was set
+    int m_argSizeInBytes; // buffer size in bytes for buffers, sizeof(T) for constants
+	int m_unusedPadding;
+    union
+    {
+        cl_mem m_clBuffer;
+        unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
+    };
+// Helper that sets the arguments of one OpenCL kernel in order and enqueues it.
+// Arguments are bound with setBuffer/setBuffers/setConst, each of which advances m_idx.
+// When serialization is enabled, every argument is also recorded in m_kernelArguments so the
+// launch can be written out (serializeArguments), replayed (deserializeArgs) or checked
+// against a reference blob (validateResults).
+class b3LauncherCL
+	cl_command_queue m_commandQueue;
+	cl_kernel m_kernel;
+	int m_idx; // index of the next kernel argument to set
+    b3AlignedObjectArray<b3KernelArgData> m_kernelArguments; // filled only while serialization is enabled, or by deserializeArgs
+    int m_serializationSizeInBytes; // running size of the serialized form
+	bool	m_enableSerialization;
+	const char* m_name; // kernel name used in debug output
+	public:
+     b3AlignedObjectArray<b3OpenCLArray<unsigned char>* > m_arrays; // device arrays created by deserializeArgs
+		b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
+    virtual ~b3LauncherCL();
+		// Bind one buffer as the next kernel argument.
+		void setBuffer( cl_mem clBuffer);
+		// Bind n buffers as the next n kernel arguments.
+		void setBuffers( b3BufferInfoCL* buffInfo, int n );
+    int getSerializationBufferSize() const 
+    {
+        return m_serializationSizeInBytes;
+    }
+    int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);
+	// Defined out of line in b3LauncherCL.cpp, so it must not be declared inline here:
+	// an inline member has to be defined in every translation unit that uses it.
+	int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx);
+    int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
+	int getNumArguments() const
+	{
+		return m_kernelArguments.size();
+	}
+	b3KernelArgData getArgument(int index)
+	{
+		return m_kernelArguments[index];
+	}
+	void serializeToFile(const char* fileName, int numWorkItems);
+	// Bind a by-value constant as the next kernel argument.
+	// sizeof(T) must not exceed B3_CL_MAX_ARG_SIZE (asserted), because the recorded copy is
+	// stored inline in b3KernelArgData::m_argData.
+	template<typename T>
+		inline void setConst( const T& consts )
+		{
+			int sz=sizeof(T);
+			b3Assert(sz<=B3_CL_MAX_ARG_SIZE);
+			if (m_enableSerialization)
+			{
+				b3KernelArgData kernelArg;
+				kernelArg.m_argIndex = m_idx;
+				kernelArg.m_isBuffer = 0;
+				T* destArg = (T*)kernelArg.m_argData;
+				*destArg = consts;
+				kernelArg.m_argSizeInBytes = sizeof(T);
+				m_kernelArguments.push_back(kernelArg);
+				m_serializationSizeInBytes+=sizeof(b3KernelArgData);
+			}
+			cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
+			b3Assert( status == CL_SUCCESS );
+		}
+		// Enqueue over numThreads work-items in one dimension.
+		inline void launch1D( int numThreads, int localSize = 64)
+		{
+			launch2D( numThreads, 1, localSize, 1 );
+		}
+		// Enqueue a 2D range. Each global dimension is rounded up to a whole number of
+		// work-groups (at least one), so kernels can be started with more work-items than requested.
+		inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
+		{
+			size_t gRange[3] = {1,1,1};
+			size_t lRange[3] = {1,1,1};
+			lRange[0] = localSizeX;
+			lRange[1] = localSizeY;
+			gRange[0] = b3Max((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
+			gRange[0] *= lRange[0];
+			gRange[1] = b3Max((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
+			gRange[1] *= lRange[1];
+			cl_int status = clEnqueueNDRangeKernel( m_commandQueue, 
+				m_kernel, 2, NULL, gRange, lRange, 0,0,0 );
+            if (status != CL_SUCCESS)
+            {
+                printf("Error: OpenCL status = %d\n",status);
+            }
+			b3Assert( status == CL_SUCCESS );
+		}
+		// Turn argument recording on or off; affects only arguments set afterwards.
+		void	enableSerialization(bool serialize)
+		{
+			m_enableSerialization = serialize;
+		}
+#endif //B3_LAUNCHER_CL_H
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h
new file mode 100644
index 00000000..d70c30f5
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h
@@ -0,0 +1,306 @@
+#ifndef B3_OPENCL_ARRAY_H
+#define B3_OPENCL_ARRAY_H
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+// Resizable array of T whose storage is an OpenCL buffer.
+// m_size is the number of elements in use and m_capacity the number the buffer can hold.
+// The array either owns its buffer (released in deallocate) or wraps an external one
+// (setFromOpenCLBuffer), in which case it never releases it and cannot grow.
+// Transfers are enqueued on m_commandQueue; methods taking waitForCompletion call clFinish
+// when it is true.
+template <typename T>
+class b3OpenCLArray
+	size_t	m_size;
+	size_t	m_capacity;
+	cl_mem	m_clBuffer;
+	cl_context		 m_clContext;
+	cl_command_queue m_commandQueue;
+	bool	m_ownsMemory;
+	bool	m_allowGrowingCapacity;
+	// Releases the buffer if this array owns it, then forgets the handle and the capacity.
+	void deallocate()
+	{
+		if (m_clBuffer && m_ownsMemory)
+		{
+			clReleaseMemObject(m_clBuffer);
+		}
+		m_clBuffer = 0;
+		m_capacity=0;
+	}
+	// Declared but not defined here: assignment between arrays is not supported.
+	b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src);
+	// Growth policy used by push_back: double the size, or 1 for an empty array.
+	B3_FORCE_INLINE	size_t	allocSize(size_t size)
+		{
+			return (size ? size*2 : 1);
+		}
+	// Creates an empty array; reserves initialCapacity elements when it is non-zero.
+	// Growth is allowed during that initial reserve and then set to allowGrowingCapacity.
+	b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity=0, bool allowGrowingCapacity=true)
+	:m_size(0),  m_capacity(0),m_clBuffer(0),
+	m_clContext(ctx),m_commandQueue(queue),
+	m_ownsMemory(true),m_allowGrowingCapacity(true)
+	{
+		if (initialCapacity)
+		{
+			reserve(initialCapacity);
+		}
+		m_allowGrowingCapacity = allowGrowingCapacity;
+	}
+	///this is an error-prone method with no error checking, be careful!
+	// Wraps an existing buffer: size and capacity become sizeInElements, the array stops
+	// owning its memory and can no longer grow.
+	void setFromOpenCLBuffer(cl_mem buffer, size_t sizeInElements)
+	{
+		deallocate();
+		m_ownsMemory = false;
+		m_allowGrowingCapacity = false;
+		m_clBuffer = buffer;
+		m_size = sizeInElements;
+		m_capacity = sizeInElements;
+	}
+// we could enable this assignment, but need to make sure to avoid accidental deep copies
+//	b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
+//	{
+//		copyFromArray(src);
+//		return *this;
+//	}
+	cl_mem	getBufferCL() const
+	{
+		return m_clBuffer;
+	}
+	virtual ~b3OpenCLArray()
+	{
+		deallocate();
+		m_size=0;
+		m_capacity=0;
+	}
+	// Appends one element, growing the buffer when it is full.
+	// Returns false when the buffer could not be grown; in that case nothing is written and the
+	// array is left empty, matching what resize() does on failure. The previous code still wrote
+	// through the (released) buffer and incremented the size.
+	B3_FORCE_INLINE	bool push_back(const T& _Val,bool waitForCompletion=true)
+	{
+		bool result = true;
+		size_t sz = size();
+		if( sz == capacity() )
+		{
+			result = reserve( allocSize(size()) );
+		}
+		if (result)
+		{
+			copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
+			m_size++;
+		} else
+		{
+			m_size = 0;
+		}
+		return result;
+	}
+	// Reads back element n; only checked against capacity, so n may lie beyond size().
+	B3_FORCE_INLINE T forcedAt(size_t n) const
+	{
+		b3Assert(n>=0);
+		b3Assert(n<capacity());
+		T elem;
+		copyToHostPointer(&elem,1,n,true);
+		return elem;
+	}
+	// Reads back element n (blocking); n must be below size().
+	B3_FORCE_INLINE T at(size_t n) const
+	{
+		b3Assert(n>=0);
+		b3Assert(n<size());
+		T elem;
+		copyToHostPointer(&elem,1,n,true);
+		return elem;
+	}
+	// Sets the element count. Shrinking keeps the buffer; growing reserves newsize elements and
+	// leaves the new elements uninitialized. On failure the size becomes 0 and false is returned.
+	B3_FORCE_INLINE	bool resize(size_t newsize, bool copyOldContents=true)
+	{
+		bool result = true;
+		size_t curSize = size();
+		if (newsize < curSize)
+		{
+			//leave the OpenCL memory for now
+		} else
+		{
+			if (newsize > size())
+			{
+				result = reserve(newsize,copyOldContents);
+			}
+			//leave new data uninitialized (init in debug mode?)
+			//for (size_t i=curSize;i<newsize;i++) ...
+		}
+		if (result)
+		{
+			m_size = newsize;
+		} else
+		{
+			m_size = 0;
+		}
+		return result;
+	}
+	B3_FORCE_INLINE size_t size() const
+	{
+		return m_size;
+	}
+	B3_FORCE_INLINE	size_t capacity() const
+	{
+		return m_capacity;
+	}
+	// Ensures room for at least _Count elements. When the buffer must grow, a new buffer is
+	// created, filled with the byte pattern 0xbb, optionally given the old contents, and swapped
+	// in. Returns false when the buffer could not be created or growing is not allowed; the old
+	// buffer is dropped and the capacity becomes 0 in both cases.
+	B3_FORCE_INLINE	bool reserve(size_t _Count, bool copyOldContents=true)
+	{
+		bool result=true;
+		// determine new minimum length of allocated storage
+		if (capacity() < _Count)
+		{	// not enough room, reallocate
+			if (m_allowGrowingCapacity)
+			{
+				cl_int ciErrNum;
+				//create a new OpenCL buffer
+				size_t memSizeInBytes = sizeof(T)*_Count;
+				cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
+				if (ciErrNum!=CL_SUCCESS)
+				{
+					b3Error("OpenCL out-of-memory\n");
+					_Count = 0;
+					result = false;
+				}
+				if (result)
+				{
+					// Only touch the new buffer when it was actually created; the previous code
+					// allocated the staging block and enqueued a write even after a failure.
+					unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
+					if (src)
+					{
+						for (size_t i=0;i<memSizeInBytes;i++)
+							src[i] = 0xbb;
+						ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
+						b3Assert(ciErrNum==CL_SUCCESS);
+						clFinish(m_commandQueue);
+						free(src);
+					}
+					if (copyOldContents)
+						copyToCL(buf, size());
+				}
+				//deallocate the old buffer
+				deallocate();
+				m_clBuffer = buf;
+				m_capacity = _Count;
+			} else
+			{
+				//fail: assert and
+				b3Assert(0);
+				deallocate();
+				result=false;
+			}
+		}
+		return result;
+	}
+	// Device-to-device copy of numElements elements, starting at firstElem, into destination at
+	// element offset dstOffsetInElems. Enqueued only; this method does not wait for completion.
+	void copyToCL(cl_mem destination, size_t numElements, size_t firstElem=0, size_t dstOffsetInElems=0) const
+	{
+		if (numElements<=0)
+			return;
+		b3Assert(m_clBuffer);
+		b3Assert(destination);
+		//likely some error, destination is same as source
+		b3Assert(m_clBuffer != destination);
+		b3Assert((firstElem+numElements)<=m_size);
+		cl_int status = 0;
+		b3Assert(numElements>0);
+		b3Assert(numElements<=m_size);
+		size_t srcOffsetBytes = sizeof(T)*firstElem;
+		size_t dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
+		status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
+			srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
+		b3Assert( status == CL_SUCCESS );
+	}
+	// Replaces the contents with a copy of srcArray (old device contents are not preserved).
+	void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
+	{
+		size_t newSize = srcArray.size();
+		bool copyOldContents = false;
+		resize (newSize,copyOldContents);
+		if (newSize)
+			copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
+	}
+	// Uploads numElems elements from src to element offset destFirstElem.
+	// The write is enqueued non-blocking, so src must stay valid until the queue has finished
+	// when waitForCompletion is false.
+	// NOTE(review): the runtime guard only rejects the all-zero case, while copyToHostPointer
+	// guards with "<= capacity()"; confirm which check is intended here.
+	void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem= 0, bool waitForCompletion=true)
+	{
+		b3Assert(numElems+destFirstElem <= capacity());
+		if (numElems+destFirstElem)
+		{
+			cl_int status = 0;
+			size_t sizeInBytes=sizeof(T)*numElems;
+			status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
+			src, 0,0,0 );
+			b3Assert(status == CL_SUCCESS );
+			if (waitForCompletion)
+				clFinish(m_commandQueue);
+		} else
+		{
+			b3Error("copyFromHostPointer invalid range\n");
+		}
+	}
+	// Resizes destArray to size() and downloads all elements into it.
+	void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
+	{
+		destArray.resize(this->size());
+		if (size())
+			copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
+	}
+	// Downloads numElem elements starting at srcFirstElem into destPtr. The read is enqueued
+	// non-blocking; destPtr is only guaranteed filled once the queue has finished.
+	void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem=0, bool waitForCompletion=true) const
+	{
+		b3Assert(numElem+srcFirstElem <= capacity());
+		if(numElem+srcFirstElem <= capacity())
+		{
+			cl_int status = 0;
+			status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
+			destPtr, 0,0,0 );
+			b3Assert( status==CL_SUCCESS );
+			if (waitForCompletion)
+				clFinish(m_commandQueue);
+		} else
+		{
+			b3Error("copyToHostPointer invalid range\n");
+		}
+	}
+	// Resizes this array to src.size() and copies src's elements into it on the device.
+	void copyFromOpenCLArray(const b3OpenCLArray& src)
+	{
+		size_t newSize = src.size();
+		resize(newSize);
+		if (size())
+		{
+			src.copyToCL(m_clBuffer,size());
+		}
+	}
+#endif //B3_OPENCL_ARRAY_H
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp
new file mode 100644
index 00000000..42cd1977
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp
@@ -0,0 +1,126 @@
+#include "b3PrefixScanCL.h"
+#include "b3FillCL.h"
+#define B3_PREFIXSCAN_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl"
+#include "b3LauncherCL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "kernels/PrefixScanKernelsCL.h"
+b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size) // Allocates the work buffer and builds the three scan kernels from the embedded program source. NOTE(review): no store of `queue` into m_commandQueue is visible in this text - confirm
+	const char* scanKernelSource = prefixScanKernelsCL;
+	cl_int pErrNum;
+	char* additionalMacros=0; // no extra preprocessor macros
+	m_workBuffer = new b3OpenCLArray<unsigned int>(ctx,queue,size); // initial capacity `size`; execute() resizes it as needed
+	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_PROG_PATH);
+	b3Assert(scanProg);
+	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
+	b3Assert(m_localScanKernel );
+	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros ); // member is named "blockSum" but the kernel entry point is TopLevelScanKernel
+	b3Assert(m_blockSumKernel );
+	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros ); // member is named "propagation" but the kernel entry point is AddOffsetKernel
+	b3Assert(m_propagationKernel );
+	delete m_workBuffer; // NOTE(review): no function header precedes these lines in this text; presumably the body of ~b3PrefixScanCL() declared in b3PrefixScanCL.h - confirm
+	clReleaseKernel(m_localScanKernel); // one release per kernel created in the constructor
+	clReleaseKernel(m_blockSumKernel);
+	clReleaseKernel(m_propagationKernel);
+template<class T>
+T b3NextPowerOf2(T n) // Rounds n up to a power of two; a value that already is one is returned unchanged. For unsigned T, n == 0 wraps around and yields 0.
+	n -= 1; // subtract first so exact powers of two map to themselves
+	for(int i=0; i<sizeof(T)*8; i++)
+		n = n | (n>>i); // copy the highest set bit into every lower bit position
+	return n+1;
+// Prefix scan of the first n elements of src into dst on the device, in up to three launches:
+// a per-block local scan, a scan over the per-block results held in m_workBuffer, and, when
+// there is more than one block, a pass that adds the block offsets back into dst.
+// (The host reference executeHost computes an exclusive scan; the kernels presumably match -
+// their source is not part of this section.)
+//   src, dst  device arrays; dst and the work buffer are resized to src.size()
+//   n         number of elements to scan
+//   sum       optional; receives dst[n-1] after waiting for the queue, or 0 when n <= 0.
+//             The previous code read element n-1 unconditionally, which for n == 0 wraps to
+//             a huge element index.
+void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
+//	b3Assert( data->m_option == EXCLUSIVE );
+	const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) ); // each block handles 2*BLOCK_SIZE elements
+	dst.resize(src.size());
+	m_workBuffer->resize(src.size());
+	b3Int4 constBuffer;
+	constBuffer.x = n;
+	constBuffer.y = numBlocks;
+	constBuffer.z = (int)b3NextPowerOf2( numBlocks );
+	b3OpenCLArray<unsigned int>* srcNative = &src;
+	b3OpenCLArray<unsigned int>* dstNative = &dst;
+	{
+		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3LauncherCL launcher( m_commandQueue, m_localScanKernel,"m_localScanKernel" );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(  constBuffer );
+		launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
+	}
+	{
+		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3LauncherCL launcher( m_commandQueue, m_blockSumKernel,"m_blockSumKernel" );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst( constBuffer );
+		launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE ); // a single work-group
+	}
+	if( numBlocks > 1 )
+	{
+		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3LauncherCL launcher( m_commandQueue, m_propagationKernel,"m_propagationKernel" );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst( constBuffer );
+		launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
+	}
+	if( sum )
+	{
+		clFinish(m_commandQueue);
+		if( n > 0 )
+		{
+			dstNative->copyToHostPointer(sum,1,n-1,true);
+		} else
+		{
+			*sum = 0;
+		}
+	}
+// Host reference implementation: exclusive prefix sum of src[0..n-1] into dst[0..n-1],
+// i.e. dst[i] = src[0] + ... + src[i-1] and dst[0] = 0.
+//   src, dst  host arrays holding at least n elements each (not checked here)
+//   n         number of elements to scan; nothing is written to dst when n <= 0
+//   sum       optional; receives dst[n-1] (the last exclusive prefix, which does not include
+//             src[n-1]), or 0 when n <= 0. The previous code read dst[n-1] unconditionally.
+void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
+	unsigned int s = 0;
+	//if( data->m_option == EXCLUSIVE )
+	{
+		for(int i=0; i<n; i++)
+		{
+			dst[i] = s;
+			s += src[i];
+		}
+	}
+	/*else
+	{
+		for(int i=0; i<n; i++)
+		{
+			s += hSrc[i];
+			hDst[i] = s;
+		}
+	}
+	*/
+	if( sum )
+	{
+		*sum = (n > 0) ? dst[n-1] : 0;
+	}
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h
new file mode 100644
index 00000000..a9a2e61b
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h
@@ -0,0 +1,37 @@
+#ifndef B3_PREFIX_SCAN_CL_H
+#define B3_PREFIX_SCAN_CL_H
+#include "b3OpenCLArray.h"
+#include "b3BufferInfoCL.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+class b3PrefixScanCL // Prefix-scan primitive over unsigned int arrays, with a device path (execute) and a host reference path (executeHost).
+	enum
+	{
+		BLOCK_SIZE = 128 // work-group size used for every launch in execute(); each block covers 2*BLOCK_SIZE elements
+	};
+//	Option m_option;
+	cl_command_queue	m_commandQueue;
+	cl_kernel m_localScanKernel; // created from "LocalScanKernel"
+	cl_kernel m_blockSumKernel; // created from "TopLevelScanKernel"
+	cl_kernel m_propagationKernel; // created from "AddOffsetKernel"
+	b3OpenCLArray<unsigned int>* m_workBuffer; // scratch array allocated in the constructor and resized in execute()
+	public:
+	b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0); // size is the initial capacity of the work buffer
+	virtual ~b3PrefixScanCL();
+	void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0); // sum, when given, receives element n-1 of dst
+	void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum=0); // exclusive scan on host arrays
+#endif //B3_PREFIX_SCAN_CL_H
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp
new file mode 100644
index 00000000..80560d79
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp
@@ -0,0 +1,126 @@
+#include "b3PrefixScanFloat4CL.h"
+#include "b3FillCL.h"
+#define B3_PREFIXSCAN_FLOAT4_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl"
+#include "b3LauncherCL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "kernels/PrefixScanKernelsFloat4CL.h"
+b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size) // Allocates the work buffer, compiles the scan program and creates its three kernels.
+	const char* scanKernelSource = prefixScanKernelsFloat4CL; // kernel source string; presumably provided by kernels/PrefixScanKernelsFloat4CL.h — TODO confirm
+	cl_int pErrNum; // receives the error code of each compile call; only the returned handles are asserted on
+	char* additionalMacros=0; // no extra macros are passed to the compile calls
+	m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx,queue,size); // owned by this object; deleted below
+	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH); // NOTE(review): scanProg is not released in the visible lines — confirm whether a clReleaseProgram is missing
+	b3Assert(scanProg);
+	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
+	b3Assert(m_localScanKernel );
+	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
+	b3Assert(m_blockSumKernel );
+	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
+	b3Assert(m_propagationKernel );
+	delete m_workBuffer; // NOTE(review): teardown starts here; presumably this is the destructor body, whose signature line is not visible in this chunk
+	clReleaseKernel(m_localScanKernel);
+	clReleaseKernel(m_blockSumKernel);
+	clReleaseKernel(m_propagationKernel);
+template<class T>
+T b3NextPowerOf2(T n) // Rounds n up to a power of two (a value that already is one is returned unchanged).
+	n -= 1; // so that exact powers of two map to themselves
+	for(int i=0; i<sizeof(T)*8; i++) // NOTE(review): signed i is compared with an unsigned sizeof expression
+		n = n | (n>>i); // smear the highest set bit into every lower bit position
+	return n+1; // all-ones below the top bit, plus one; NOTE(review): for an unsigned T, n == 0 wraps and yields 0
+void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum) // Launches the local-scan, block-sum and propagation kernels for the first n elements of src; optionally reads dst[n-1] back into *sum.
+//	b3Assert( data->m_option == EXCLUSIVE );
+	const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) ); // ceil(n / (2*BLOCK_SIZE))
+	dst.resize(src.size());
+	m_workBuffer->resize(src.size());
+	b3Int4 constBuffer; // constant arguments shared by all three launches
+	constBuffer.x = n;
+	constBuffer.y = numBlocks;
+	constBuffer.z = (int)b3NextPowerOf2( numBlocks ); // block count rounded up to a power of two
+	b3OpenCLArray<b3Vector3>* srcNative = &src;
+	b3OpenCLArray<b3Vector3>* dstNative = &dst;
+	{
+		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3LauncherCL launcher( m_commandQueue, m_localScanKernel ,"m_localScanKernel");
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(  constBuffer );
+		launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE ); // presumably (global size, local size): BLOCK_SIZE work-items per block — TODO confirm
+	}
+	{
+		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3LauncherCL launcher( m_commandQueue, m_blockSumKernel ,"m_blockSumKernel");
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst( constBuffer );
+		launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE ); // this launch only touches the work buffer
+	}
+	if( numBlocks > 1 ) // the propagation launch is skipped when everything fits in one block
+	{
+		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3LauncherCL launcher( m_commandQueue, m_propagationKernel ,"m_propagationKernel");
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst( constBuffer );
+		launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
+	}
+	if( sum ) // optional read-back
+	{
+		clFinish(m_commandQueue); // block until the queued kernels have completed
+		dstNative->copyToHostPointer(sum,1,n-1,true); // NOTE(review): with n == 0 this passes index -1 — confirm callers never do that
+	}
+void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum) // Host-side exclusive prefix sum of the first n vectors of src, written to dst.
+	b3Vector3 s=b3MakeVector3(0,0,0); // running total of the entries consumed so far
+	//if( data->m_option == EXCLUSIVE )
+	{
+		for(int i=0; i<n; i++)
+		{
+			dst[i] = s; // exclusive scan: store the total before src[i] is added
+			s += src[i];
+		}
+	}
+	/*else
+	{
+		for(int i=0; i<n; i++)
+		{
+			s += hSrc[i];
+			hDst[i] = s;
+		}
+	}
+	*/
+	if( sum ) // optional output, callers may pass a null pointer
+	{
+		*sum = (n > 0) ? dst[n-1] : s; // last scanned entry; for n <= 0 s is still the zero vector, so dst[-1] is never read
+	}
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h
new file mode 100644
index 00000000..2c8003c1
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h
@@ -0,0 +1,38 @@
+#ifndef B3_PREFIX_SCAN_FLOAT4_CL_H // guard must be unique: the previous name B3_PREFIX_SCAN_CL_H is also the guard of b3PrefixScanCL.h, so including both headers dropped one class
+#define B3_PREFIX_SCAN_FLOAT4_CL_H
+#include "b3OpenCLArray.h"
+#include "b3BufferInfoCL.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Vector3.h"
+class b3PrefixScanFloat4CL // Prefix-sum (scan) helper for b3Vector3 arrays with an OpenCL entry point (execute) and a host entry point (executeHost).
+	enum
+	{
+		BLOCK_SIZE = 128 // used by execute() to size its launch1D calls
+	};
+//	Option m_option;
+	cl_command_queue	m_commandQueue; // queue the kernels are launched on
+	cl_kernel m_localScanKernel; // created from "LocalScanKernel"
+	cl_kernel m_blockSumKernel; // created from "TopLevelScanKernel"
+	cl_kernel m_propagationKernel; // created from "AddOffsetKernel"
+	b3OpenCLArray<b3Vector3>* m_workBuffer; // owned scratch buffer, resized to src.size() by execute()
+	public:
+	b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
+	virtual ~b3PrefixScanFloat4CL();
+	void execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum = 0); // when sum is non-null, element n-1 of dst is read back into *sum
+	void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum); // CPU exclusive scan; sum may be null
+#endif //B3_PREFIX_SCAN_FLOAT4_CL_H
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
new file mode 100644
index 00000000..f11ae4bc
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
@@ -0,0 +1,710 @@
+#include "b3RadixSort32CL.h"
+#include "b3LauncherCL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "b3PrefixScanCL.h"
+#include "b3FillCL.h"
+#define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
+#include "kernels/RadixSort32KernelsCL.h"
+b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity) // Allocates the work buffers and helper objects, then compiles the radix-sort program and creates its kernels.
+	b3OpenCLDeviceInfo info;
+	b3OpenCLUtils::getDeviceInfo(device,&info);
+	m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0; // true when the device type includes the CPU bit
+	m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue);
+	m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue);
+	m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue);
+	m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue);
+	m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue);
+	m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue);
+	if (initialCapacity>0) // optional pre-sizing; m_workBuffer2 is not pre-sized here
+	{
+		m_workBuffer1->resize(initialCapacity);
+		m_workBuffer3->resize(initialCapacity);
+		m_workBuffer3a->resize(initialCapacity);
+		m_workBuffer4->resize(initialCapacity);
+		m_workBuffer4a->resize(initialCapacity);
+	}
+	m_scan = new b3PrefixScanCL(ctx,device,queue); // used by execute() when the fast scan kernel is not selected
+	m_fill = new b3FillCL(ctx,device,queue); // used by execute() to fill padding elements
+	const char* additionalMacros = "";
+	cl_int pErrNum; // receives the error code of each compile call; only the returned handles are asserted on
+	const char* kernelSource = radixSort32KernelsCL;
+	cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH); // NOTE(review): sortProg is not released in the visible lines — confirm whether a clReleaseProgram is missing
+	b3Assert(sortProg);
+	m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros );
+	b3Assert(m_streamCountSortDataKernel );
+	m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros );
+	b3Assert(m_streamCountKernel);
+	if (m_deviceCPU) // CPU devices get the "...Serial" variants of the scatter kernels
+	{
+		m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros );
+		b3Assert(m_sortAndScatterSortDataKernel);
+		m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros );
+		b3Assert(m_sortAndScatterKernel);
+	} else
+	{
+		m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros );
+		b3Assert(m_sortAndScatterSortDataKernel);
+		m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros );
+		b3Assert(m_sortAndScatterKernel);
+	}
+	m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros );
+	b3Assert(m_prefixScanKernel);
+	delete m_scan; // NOTE(review): teardown starts here; presumably this is the destructor body, whose signature line is not visible in this chunk
+	delete m_fill;
+	delete m_workBuffer1;
+	delete m_workBuffer2;
+	delete m_workBuffer3;
+	delete m_workBuffer3a;
+	delete m_workBuffer4;
+	delete m_workBuffer4a;
+	clReleaseKernel(m_streamCountSortDataKernel);
+	clReleaseKernel(m_streamCountKernel);
+	clReleaseKernel(m_sortAndScatterSortDataKernel);
+	clReleaseKernel(m_sortAndScatterKernel);
+	clReleaseKernel(m_prefixScanKernel);
+void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */) // CPU radix sort of inout by m_key, least-significant digit first, 8 bits per pass.
+	int n = inout.size();
+	const int BITS_PER_PASS = 8; // local constant; distinct from the class-level BITS_PER_PASS used by the OpenCL path
+	const int NUM_TABLES = (1<<BITS_PER_PASS);
+	int tables[NUM_TABLES]; // per-digit histogram, turned into start offsets by the prefix scan below
+	int counter[NUM_TABLES]; // per-digit count of elements already placed in the current pass
+	b3SortData* src = &inout[0]; // NOTE(review): takes the address of element 0 — confirm callers never pass an empty array
+	b3AlignedObjectArray<b3SortData> workbuffer;
+	workbuffer.resize(inout.size());
+	b3SortData* dst = &workbuffer[0];
+	int count=0; // number of passes executed
+	for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
+	{
+		for(int i=0; i<NUM_TABLES; i++)
+		{
+			tables[i] = 0;
+		}
+		for(int i=0; i<n; i++)
+		{
+			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1); // digit of this pass
+			tables[tableIdx]++;
+		}
+//#define TEST
+#ifdef TEST
+		printf("histogram size=%d\n",NUM_TABLES);
+		for (int i=0;i<NUM_TABLES;i++)
+		{
+			if (tables[i]!=0)
+			{
+				printf("tables[%d]=%d]\n",i,tables[i]);
+			}
+		}
+#endif //TEST
+		//	prefix scan
+		int sum = 0;
+		for(int i=0; i<NUM_TABLES; i++)
+		{
+			int iData = tables[i];
+			tables[i] = sum; // exclusive scan: first output slot of digit i
+			sum += iData;
+			counter[i] = 0;
+		}
+		//	distribute
+		for(int i=0; i<n; i++)
+		{
+			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
+			dst[tables[tableIdx] + counter[tableIdx]] = src[i]; // elements with equal digits keep their input order
+			counter[tableIdx] ++;
+		}
+		b3Swap( src, dst ); // the buffer just written becomes the input of the next pass
+		count++;
+	}
+	if (count&1) // after an odd number of passes the sorted data is in workbuffer, not in inout
+	{
+		b3Assert(0);//need to copy the result from workbuffer back into inout (not implemented)
+	}
+void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) // Sorts an OpenCL array with the host implementation: copy to host, sort, copy back.
+	b3AlignedObjectArray<b3SortData> inout; // temporary host copy
+	keyValuesInOut.copyToHost(inout);
+	executeHost(inout,sortBits);
+	keyValuesInOut.copyFromHost(inout);
+void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, 
+								b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
+void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) // OpenCL radix sort of key/value pairs, 4 bits per pass; the input is padded to a multiple of DATA_ALIGNMENT first.
+	int originalSize = keyValuesInOut.size();
+	int workingSize = originalSize;
+	int dataAlignment = DATA_ALIGNMENT;
+    b3AlignedObjectArray<b3SortData>   test2; // NOTE(review): test2 and the printf dumps in this function look like debug code whose preprocessor guards are not visible in this chunk
+    keyValuesInOut.copyToHost(test2);
+    printf("numElem = %d\n",test2.size());
+    for (int i=0;i<test2.size();i++)
+    {
+        printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
+        printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
+    }
+	b3OpenCLArray<b3SortData>* src = 0;
+	if (workingSize%dataAlignment) // size is not aligned: sort a padded copy held in m_workBuffer4
+	{
+		workingSize += dataAlignment-(workingSize%dataAlignment); // round up to the next multiple
+		m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
+		m_workBuffer4->resize(workingSize);
+		b3SortData fillValue;
+		fillValue.m_key = 0xffffffff; // all-ones key: padding has the largest digit in every pass
+		fillValue.m_value = 0xffffffff;
+#define USE_BTFILL
+#ifdef USE_BTFILL
+		m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize); // reinterprets b3SortData as b3Int2 to reuse the fill helper
+		//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
+		for (int i=originalSize; i<workingSize;i++) // NOTE(review): presumably the #else alternative to the fill call above; the #else/#endif lines are not visible in this chunk
+		{
+			m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
+		}
+		src = m_workBuffer4;
+	} else
+	{
+		src = &keyValuesInOut;
+		m_workBuffer4->resize(0); // size 0 marks "no padded copy in use" for the copy-back at the end
+	}
+	b3Assert( workingSize%DATA_ALIGNMENT == 0 );
+	int minCap = NUM_BUCKET*NUM_WGS;
+	int n = workingSize;
+	m_workBuffer1->resize(minCap);
+	m_workBuffer3->resize(workingSize);
+	b3Assert( BITS_PER_PASS == 4 );
+	b3Assert( WG_SIZE == 64 );
+	b3Assert( (sortBits&0x3) == 0 ); // sortBits must be a multiple of 4
+	b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
+	b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
+	b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
+	int nWGs = NUM_WGS;
+	b3ConstData cdata; // constant arguments passed to every kernel launch
+	{
+        int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
+     	int nBlocks = (n+blockSize-1)/(blockSize);
+		cdata.m_n = n;
+		cdata.m_nWGs = NUM_WGS;
+		cdata.m_startBit = 0;
+		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; // ceil(nBlocks / NUM_WGS)
+		if( nBlocks < NUM_WGS ) // fewer blocks than work-groups: one block each, launch fewer groups for the scatter
+		{
+			cdata.m_nBlocksPerWG = 1;
+			nWGs = nBlocks;
+		}
+	}
+	int count=0; // number of passes executed
+	for(int ib=0; ib<sortBits; ib+=4)
+	{
+        keyValuesInOut.copyToHost(test2);
+        printf("numElem = %d\n",test2.size());
+        for (int i=0;i<test2.size();i++)
+        {
+            if (test2[i].m_key != test2[i].m_value)
+            {
+                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
+                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
+            }
+        }
+		cdata.m_startBit = ib; // first bit of the digit handled by this pass
+		if (src->size())
+		{
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
+			b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel,"m_streamCountSortDataKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(  cdata );
+			int num = NUM_WGS*WG_SIZE;
+			launcher.launch1D( num, WG_SIZE );
+		}
+		b3AlignedObjectArray<unsigned int> testHist;
+		srcHisto->copyToHost(testHist);
+		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
+		for (int i=0;i<testHist.size();i++)
+		{
+			if (testHist[i]!=0)
+				printf("testHist[%d]=%d\n",i,testHist[i]);
+		}
+//fast prefix scan is not working properly on Mac OSX yet
+#ifdef __APPLE__
+	bool fastScan=false; // NOTE(review): the #else/#endif separating this from the next declaration are not visible in this chunk
+	bool fastScan=!m_deviceCPU;//only use fast scan on GPU
+		if (fastScan)
+		{//	prefix scan group histogram
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
+			b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(  cdata );
+			launcher.launch1D( 128, 128 );
+			destHisto = srcHisto; // this kernel is given only srcHisto, so the scatter step reads that buffer
+		}else
+		{
+			//unsigned int sum; //for debugging
+            m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); NOTE(review): 1920 presumably equals NUM_BUCKET*NUM_WGS — TODO confirm
+		}
+		destHisto->copyToHost(testHist);
+		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
+		for (int i=0;i<testHist.size();i++)
+		{
+			if (testHist[i]!=0)
+				printf("testHist[%d]=%d\n",i,testHist[i]);
+		}
+        for (int i=0;i<testHist.size();i+=NUM_WGS)
+		{
+				printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
+		}
+#define USE_GPU
+#ifdef USE_GPU
+		if (src->size())
+		{//	local sort and distribute
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
+			b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel,"m_sortAndScatterSortDataKernel" );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(  cdata );
+			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
+		}
+        { // NOTE(review): host-side re-implementation of the scatter step; presumably the alternative to the USE_GPU launch above — the separating preprocessor lines are not visible in this chunk
+#define NUM_TABLES 16
+//#define SEQUENTIAL
+            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+            int tables[NUM_TABLES];
+            int startBit = ib;
+            destHisto->copyToHost(testHist);
+            b3AlignedObjectArray<b3SortData> srcHost;
+            b3AlignedObjectArray<b3SortData> dstHost;
+            dstHost.resize(src->size());
+            src->copyToHost(srcHost);
+            for (int i=0;i<NUM_TABLES;i++)
+            {
+                tables[i] = testHist[i*NUM_WGS]; // start offset of digit i: the entry of work-group 0
+            }
+            //	distribute
+            for(int i=0; i<n; i++)
+            {
+                int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
+                dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
+                counter2[tableIdx] ++;
+            }
+            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+            int tables[NUM_TABLES];
+             b3AlignedObjectArray<b3SortData> dstHostOK;
+            dstHostOK.resize(src->size());
+            destHisto->copyToHost(testHist);
+            b3AlignedObjectArray<b3SortData> srcHost;
+            src->copyToHost(srcHost);
+            int blockSize = 256;
+            int nBlocksPerWG = cdata.m_nBlocksPerWG;
+            int startBit = ib;
+            {
+                for (int i=0;i<NUM_TABLES;i++)
+                {
+                    tables[i] = testHist[i*NUM_WGS];
+                }
+                //	distribute
+                for(int i=0; i<n; i++)
+                {
+                    int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
+                    dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
+                    counter2[tableIdx] ++;
+                }
+            }
+            b3AlignedObjectArray<b3SortData> dstHost;
+            dstHost.resize(src->size());
+            int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+            for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++) // walks the data work-group by work-group
+            {
+              int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+              int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
+              for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
+              {
+                for (int lIdx = 0;lIdx < 64;lIdx++)
+                {
+                    int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
+                    //	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
+                    //	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
+                    //	AMD: AtomInc performs better while NV prefers ++
+                    for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
+                    {
+                        if( addr+j < n )
+                        {
+                          //  printf ("addr+j=%d\n", addr+j);
+                            int i = addr+j;
+                            int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
+                            int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
+                            b3SortData ok = dstHostOK[destIndex]; // reference result from the sequential distribution above
+                            if (ok.m_key != srcHost[i].m_key)
+                            {
+                                printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
+                                printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
+                            }
+                            if (ok.m_value != srcHost[i].m_value)
+                            {
+                               printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
+                                printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );
+                            }
+                            dstHost[destIndex] = srcHost[i];
+                            counter[tableIdx] ++;
+                        }
+                    }
+                }
+              }
+            }
+#endif //SEQUENTIAL
+            dst->copyFromHost(dstHost);
+        }
+		destHisto->copyToHost(testHist);
+		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
+		for (int i=0;i<testHist.size();i++)
+		{
+			if (testHist[i]!=0)
+				printf("testHist[%d]=%d\n",i,testHist[i]);
+		}
+		b3Swap(src, dst ); // the buffer just written becomes the input of the next pass
+		b3Swap(srcHisto,destHisto);
+        keyValuesInOut.copyToHost(test2);
+        printf("numElem = %d\n",test2.size());
+        for (int i=0;i<test2.size();i++)
+        {
+            if (test2[i].m_key != test2[i].m_value)
+            {
+                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
+                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
+            }
+        }
+        count++;
+	}
+	if (count&1) // after an odd number of passes the result is in the work buffer
+	{
+		b3Assert(0);//need to copy from workbuffer to keyValuesInOut
+	}
+	if (m_workBuffer4->size()) // a padded copy was sorted: drop the padding and copy the result back
+	{
+		m_workBuffer4->resize(originalSize);
+		keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
+	}
+    keyValuesInOut.copyToHost(test2);
+    printf("numElem = %d\n",test2.size());
+    for (int i=0;i<test2.size();i++)
+    {
+        printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
+        printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
+    }
+void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */) // Keys-only OpenCL radix sort, 4 bits per pass; the input is padded to a multiple of DATA_ALIGNMENT first.
+	int originalSize = keysInOut.size();
+	int workingSize = originalSize;
+	int dataAlignment = DATA_ALIGNMENT;
+	b3OpenCLArray<unsigned int>* src = 0;
+	if (workingSize%dataAlignment) // size is not aligned: sort a padded copy held in m_workBuffer4a
+	{
+		workingSize += dataAlignment-(workingSize%dataAlignment); // round up to the next multiple
+		m_workBuffer4a->copyFromOpenCLArray(keysInOut);
+		m_workBuffer4a->resize(workingSize);
+		unsigned int fillValue = 0xffffffff; // all-ones key: padding has the largest digit in every pass
+		m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize);
+		src = m_workBuffer4a;
+	} else
+	{
+		src = &keysInOut;
+		m_workBuffer4a->resize(0); // size 0 marks "no padded copy in use" for the copy-back at the end
+	}
+	b3Assert( workingSize%DATA_ALIGNMENT == 0 );
+	int minCap = NUM_BUCKET*NUM_WGS;
+	int n = workingSize;
+	m_workBuffer1->resize(minCap);
+	m_workBuffer3->resize(workingSize);
+	m_workBuffer3a->resize(workingSize);
+	b3Assert( BITS_PER_PASS == 4 );
+	b3Assert( WG_SIZE == 64 );
+	b3Assert( (sortBits&0x3) == 0 ); // sortBits must be a multiple of 4
+	b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
+	b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
+	b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
+	int nWGs = NUM_WGS;
+	b3ConstData cdata; // constant arguments passed to every kernel launch
+	{
+        int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
+     	int nBlocks = (n+blockSize-1)/(blockSize);
+		cdata.m_n = n;
+		cdata.m_nWGs = NUM_WGS;
+		cdata.m_startBit = 0;
+		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; // ceil(nBlocks / NUM_WGS)
+		if( nBlocks < NUM_WGS ) // fewer blocks than work-groups: one block each, launch fewer groups for the scatter
+		{
+			cdata.m_nBlocksPerWG = 1;
+			nWGs = nBlocks;
+		}
+	}
+	int count=0; // number of passes executed
+	for(int ib=0; ib<sortBits; ib+=4)
+	{
+		cdata.m_startBit = ib; // first bit of the digit handled by this pass
+		if (src->size())
+		{
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
+			b3LauncherCL launcher(m_commandQueue, m_streamCountKernel,"m_streamCountKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(  cdata );
+			int num = NUM_WGS*WG_SIZE;
+			launcher.launch1D( num, WG_SIZE );
+		}
+//fast prefix scan is not working properly on Mac OSX yet
+#ifdef __APPLE__
+	bool fastScan=false;	// NOTE(review): the #else/#endif separating this from the next declaration are not visible in this chunk
+	bool fastScan=!m_deviceCPU;
+		if (fastScan)
+		{//	prefix scan group histogram
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
+			b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(  cdata );
+			launcher.launch1D( 128, 128 );
+			destHisto = srcHisto; // this kernel is given only srcHisto, so the scatter step reads that buffer
+		}else
+		{
+			//unsigned int sum; //for debugging
+            m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); NOTE(review): 1920 presumably equals NUM_BUCKET*NUM_WGS — TODO confirm
+		}
+		if (src->size())
+		{//	local sort and distribute
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
+			b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ,"m_sortAndScatterKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(  cdata );
+			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
+		}
+		b3Swap(src, dst ); // the buffer just written becomes the input of the next pass
+		b3Swap(srcHisto,destHisto);
+        count++;
+	}
+	if (count&1) // after an odd number of passes the result is in the work buffer
+	{
+		b3Assert(0);//need to copy from workbuffer to keysInOut
+	}
+	if (m_workBuffer4a->size()) // a padded copy was sorted: drop the padding and copy the result back
+	{
+		m_workBuffer4a->resize(originalSize);
+		keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
+	}
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
new file mode 100644
index 00000000..975bd80e
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
@@ -0,0 +1,95 @@
+#ifndef B3_RADIXSORT32_H
+#define B3_RADIXSORT32_H
+#include "b3OpenCLArray.h"
+struct b3SortData // Key/value pair handled by b3RadixSort32CL; each field is also reachable under a second name through an anonymous union.
+	union
+	{
+		unsigned int m_key; // the sort routines order elements by this field
+		unsigned int x; // alias of m_key
+	};
+	union
+	{
+		unsigned int m_value; // payload carried along with the key
+		unsigned int y; // alias of m_value
+	};
+#include "b3BufferInfoCL.h"
+class  b3RadixSort32CL // 32-bit radix sort with OpenCL entry points (execute) and host fallbacks (executeHost).
+		b3OpenCLArray<unsigned int>* m_workBuffer1; // histogram buffer (initial srcHisto in execute)
+		b3OpenCLArray<unsigned int>* m_workBuffer2; // second histogram buffer (initial destHisto in execute)
+		b3OpenCLArray<b3SortData>*	m_workBuffer3; // scatter destination for key/value sorting
+		b3OpenCLArray<b3SortData>*	m_workBuffer4; // padded copy of key/value input when its size is not aligned
+		b3OpenCLArray<unsigned int>* m_workBuffer3a; // scatter destination for keys-only sorting
+		b3OpenCLArray<unsigned int>* m_workBuffer4a; // padded copy of keys-only input when its size is not aligned
+		cl_command_queue	m_commandQueue;
+		cl_kernel m_streamCountSortDataKernel;
+		cl_kernel m_streamCountKernel;
+		cl_kernel m_prefixScanKernel;
+		cl_kernel m_sortAndScatterSortDataKernel;
+		cl_kernel m_sortAndScatterKernel;
+		bool	m_deviceCPU; // set by the constructor when the device type includes CL_DEVICE_TYPE_CPU
+		class b3PrefixScanCL* m_scan; // owned; scan helper used when the fast scan kernel is not selected
+		class b3FillCL*	m_fill; // owned; fills the padding elements
+	struct b3ConstData // constant block passed to every kernel launch via setConst
+		{
+			int m_n; // number of (padded) elements
+			int m_nWGs;
+			int m_startBit; // first bit of the digit handled by the current pass
+			int m_nBlocksPerWG;
+		};
+	enum
+		{
+			WG_SIZE = 64,
+            BLOCK_SIZE = 256,
+			BITS_PER_PASS = 4,
+			//	if you change this, change nPerWI in kernel as well
+			NUM_WGS = 20*6,	//	cypress
+//			NUM_WGS = 24*6,	//	cayman
+//			NUM_WGS = 32*4,	//	nv
+		};
+		b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
+		virtual ~b3RadixSort32CL();
+		void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, 
+								b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32); // NOTE(review): no body for this overload is visible in the .cpp portion of this chunk
+		///keys only
+		void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits  = 32 );
+		void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits  = 32 ); // key/value pairs, sorted by m_key
+		void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32); // copies to host, sorts there, copies back
+		void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32); // CPU implementation, 8 bits per pass
+#endif //B3_RADIXSORT32_H
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl
new file mode 100644
index 00000000..f3b4a1e8
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl
@@ -0,0 +1,106 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+typedef unsigned int u32;
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+typedef struct
+	u32 m_key; 
+	u32 m_value;
+typedef struct
+	u32 m_nSrc;
+	u32 m_nDst;
+	u32 m_padding[2];
+} ConstBuffer;
+void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, 
+					unsigned int nSrc, unsigned int nDst) // For each index where the key differs from the previous element's key, stores that index in dst[key]; presumably src is sorted by key — TODO confirm.
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < nSrc )
+	{
+		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1); // sentinel used as the "previous" element of index 0
+		SortData end; end.m_key = nDst; end.m_value = nDst;
+		SortData iData = (gIdx==0)? first: src[gIdx-1];
+		SortData jData = (gIdx==nSrc)? end: src[gIdx]; // gIdx==nSrc cannot hold inside this guard, so src[gIdx] is always used
+		if( iData.m_key != jData.m_key )
+		{
+//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
+			u32 k = jData.m_key;
+			{
+				dst[k] = gIdx; // NOTE(review): k is not checked against nDst here
+			}
+		}
+	}
+void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, 
+					unsigned int nSrc, unsigned int nDst) // For each index whose key differs from the next element's key, stores index+1 in dst[key]; presumably src is sorted by key — TODO confirm.
+	int gIdx = GET_GLOBAL_IDX+1; // one-based: compares element gIdx-1 with element gIdx
+	if( gIdx < nSrc+1 )
+	{
+		SortData first; first.m_key = 0; first.m_value = 0; // not used below
+		SortData end; end.m_key = nDst; end.m_value = nDst; // sentinel standing in for the element after the last one
+		SortData iData = src[gIdx-1];
+		SortData jData = (gIdx==nSrc)? end: src[gIdx];
+		if( iData.m_key != jData.m_key )
+		{
+			u32 k = iData.m_key;
+			{
+				dst[k] = gIdx; // NOTE(review): k is not checked against nDst here
+			}
+		}
+	}
+void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, 
+					unsigned int nSrc, unsigned int nDst) // Element-wise C = A - B over the first nDst entries; nSrc is not used.
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < nDst ) // work-items beyond nDst do nothing
+	{
+		C[gIdx] = A[gIdx] - B[gIdx];
+	}
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h
new file mode 100644
index 00000000..9c9e8471
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h
@@ -0,0 +1,87 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* boundSearchKernelsCL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Takahiro Harada\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"typedef struct\n"
+"	u32 m_key; \n"
+"	u32 m_value;\n"
+"typedef struct\n"
+"	u32 m_nSrc;\n"
+"	u32 m_nDst;\n"
+"	u32 m_padding[2];\n"
+"} ConstBuffer;\n"
+"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
+"					unsigned int nSrc, unsigned int nDst)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < nSrc )\n"
+"	{\n"
+"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
+"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
+"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
+"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
+"		if( iData.m_key != jData.m_key )\n"
+"		{\n"
+"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
+"			u32 k = jData.m_key;\n"
+"			{\n"
+"				dst[k] = gIdx;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
+"					unsigned int nSrc, unsigned int nDst)\n"
+"	int gIdx = GET_GLOBAL_IDX+1;\n"
+"	if( gIdx < nSrc+1 )\n"
+"	{\n"
+"		SortData first; first.m_key = 0; first.m_value = 0;\n"
+"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
+"		SortData iData = src[gIdx-1];\n"
+"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
+"		if( iData.m_key != jData.m_key )\n"
+"		{\n"
+"			u32 k = iData.m_key;\n"
+"			{\n"
+"				dst[k] = gIdx;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
+"					unsigned int nSrc, unsigned int nDst)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	\n"
+"	if( gIdx < nDst )\n"
+"	{\n"
+"		C[gIdx] = A[gIdx] - B[gIdx];\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl
new file mode 100644
index 00000000..2eee5752
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl
@@ -0,0 +1,128 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+typedef unsigned int u32; // 32-bit unsigned shorthand used by the kernel sources
+#define GET_GROUP_IDX get_group_id(0) // work-group index along dimension 0
+#define GET_LOCAL_IDX get_local_id(0) // work-item index inside its work-group (dimension 0)
+#define GET_GLOBAL_IDX get_global_id(0) // global work-item index along dimension 0
+#define GET_GROUP_SIZE get_local_size(0) // work-items per work-group (dimension 0)
+#define AtomInc(x) atom_inc(&(x)) // atomic increment of x, result discarded
+#define AtomInc1(x, out) out = atom_inc(&(x)) // atomic increment of x, atom_inc's return value stored in out
+#define make_uint4 (uint4) // make_uint4(a,b,c,d) expands to the vector literal (uint4)(a,b,c,d)
+#define make_uint2 (uint2) // same trick for uint2
+#define make_int2 (int2) // same trick for int2
+typedef struct // by-value argument block shared by all copy kernels in this file
+	int m_n; // element count; the kernels compare element indices against it
+	int m_padding[3]; // pads the struct to four ints; never read by the kernels in this file
+} ConstBuffer;
+void Copy1F4Kernel(__global float4* dst, __global float4* src, // copies cb.m_n float4 values from src to dst, one element per work-item
+					ConstBuffer cb)
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < cb.m_n ) // work-items with an index at or beyond m_n do nothing
+	{
+		float4 a0 = src[gIdx];
+		dst[ gIdx ] = a0;
+	}
+// Copies float4 values from src to dst, two consecutive elements per work-item.
+// cb.m_n is the number of float4 elements to copy. Each element index is
+// bounds-checked on its own: the former test `2*gIdx <= cb.m_n` let the last
+// active work-item touch index m_n (and m_n+1 when m_n is even), i.e. read and
+// write one or two elements past the end of both buffers.
+void Copy2F4Kernel(__global float4* dst, __global float4* src, 
+					ConstBuffer cb)
+	int gIdx = GET_GLOBAL_IDX;
+	int idx0 = gIdx*2+0;
+	int idx1 = gIdx*2+1;
+	if( idx0 < cb.m_n )
+	{
+		dst[ idx0 ] = src[ idx0 ];
+	}
+	// idx1 can be out of range while idx0 is still valid (odd m_n).
+	if( idx1 < cb.m_n )
+	{
+		dst[ idx1 ] = src[ idx1 ];
+	}
+// Copies float4 values from src to dst, four consecutive elements per work-item.
+// cb.m_n is the number of float4 elements to copy. Every element index is
+// checked against m_n: the former test `4*gIdx <= cb.m_n` allowed the last
+// active work-item to read and write up to four elements past index m_n-1.
+void Copy4F4Kernel(__global float4* dst, __global float4* src, 
+					ConstBuffer cb)
+	int gIdx = GET_GLOBAL_IDX;
+	int base = gIdx*4;
+	for( int k=0; k<4; k++ )
+	{
+		int idx = base+k;
+		// Handles a partially filled last group of four when m_n is not a multiple of 4.
+		if( idx < cb.m_n )
+		{
+			dst[ idx ] = src[ idx ];
+		}
+	}
+void CopyF1Kernel(__global float* dstF1, __global float* srcF1, // copies cb.m_n scalar floats from srcF1 to dstF1, one per work-item
+					ConstBuffer cb)
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < cb.m_n ) // work-items with an index at or beyond m_n do nothing
+	{
+		float a0 = srcF1[gIdx];
+		dstF1[ gIdx ] = a0;
+	}
+void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, // copies cb.m_n float2 values from srcF2 to dstF2, one per work-item
+					ConstBuffer cb)
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < cb.m_n ) // work-items with an index at or beyond m_n do nothing
+	{
+		float2 a0 = srcF2[gIdx];
+		dstF2[ gIdx ] = a0;
+	}
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h
new file mode 100644
index 00000000..e5670e3c
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h
@@ -0,0 +1,132 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+// copyKernelsCL holds the OpenCL C source of the copy kernels as one C string.
+// NOTE: Copy2F4Kernel and Copy4F4Kernel below bounds-check every element index
+// against cb.m_n. The previous guards (`2*gIdx <= cb.m_n`, `4*gIdx <= cb.m_n`)
+// let the last active work-item read and write past element m_n-1. The kernel
+// source this header is generated from (CopyKernels.cl) must carry the same
+// guards, otherwise regenerating the header brings the old ones back.
+static const char* copyKernelsCL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Takahiro Harada\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define make_uint4 (uint4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"typedef struct\n"
+"	int m_n;\n"
+"	int m_padding[3];\n"
+"} ConstBuffer;\n"
+"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
+"					ConstBuffer cb)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < cb.m_n )\n"
+"	{\n"
+"		float4 a0 = src[gIdx];\n"
+"		dst[ gIdx ] = a0;\n"
+"	}\n"
+"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
+"					ConstBuffer cb)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	int idx0 = gIdx*2+0;\n"
+"	int idx1 = gIdx*2+1;\n"
+"	if( idx0 < cb.m_n )\n"
+"	{\n"
+"		dst[ idx0 ] = src[ idx0 ];\n"
+"	}\n"
+"	if( idx1 < cb.m_n )\n"
+"	{\n"
+"		dst[ idx1 ] = src[ idx1 ];\n"
+"	}\n"
+"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
+"					ConstBuffer cb)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	int base = gIdx*4;\n"
+"	for( int k=0; k<4; k++ )\n"
+"	{\n"
+"		int idx = base+k;\n"
+"		if( idx < cb.m_n )\n"
+"		{\n"
+"			dst[ idx ] = src[ idx ];\n"
+"		}\n"
+"	}\n"
+"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
+"					ConstBuffer cb)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < cb.m_n )\n"
+"	{\n"
+"		float a0 = srcF1[gIdx];\n"
+"		dstF1[ gIdx ] = a0;\n"
+"	}\n"
+"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
+"					ConstBuffer cb)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < cb.m_n )\n"
+"	{\n"
+"		float2 a0 = srcF2[gIdx];\n"
+"		dstF2[ gIdx ] = a0;\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl
new file mode 100644
index 00000000..71c31075
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl
@@ -0,0 +1,107 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+typedef unsigned int u32; // 32-bit unsigned shorthand used by the kernel sources
+#define GET_GROUP_IDX get_group_id(0) // work-group index along dimension 0
+#define GET_LOCAL_IDX get_local_id(0) // work-item index inside its work-group (dimension 0)
+#define GET_GLOBAL_IDX get_global_id(0) // global work-item index along dimension 0
+#define GET_GROUP_SIZE get_local_size(0) // work-items per work-group (dimension 0)
+#define AtomInc(x) atom_inc(&(x)) // atomic increment of x, result discarded
+#define AtomInc1(x, out) out = atom_inc(&(x)) // atomic increment of x, atom_inc's return value stored in out
+#define make_uint4 (uint4) // make_uint4(a,b,c,d) expands to the vector literal (uint4)(a,b,c,d)
+#define make_uint2 (uint2) // same trick for uint2
+#define make_int2 (int2) // same trick for int2; used by FillInt2Kernel
+typedef struct // NOTE(review): none of the fill kernels visible in this file takes a ConstBuffer; they receive plain scalar arguments instead
+	union // the three members below overlay the same storage
+	{
+		int4 m_data;
+		uint4 m_unsignedData;
+		float	m_floatData;
+	};
+	int m_offset; // presumably the first destination index, like the kernels' `offset` argument -- confirm against the host code
+	int m_n; // presumably the element count, like the kernels' `num`/`num_elements` argument -- confirm against the host code
+	int m_padding[2];
+} ConstBuffer;
+void FillIntKernel(__global int* dstInt, 			int num_elements, int value, const int offset) // writes `value` to dstInt[offset .. offset+num_elements-1], one element per work-item
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < num_elements ) // work-items beyond the requested count do nothing
+	{
+		dstInt[ offset+gIdx ] = value;
+	}
+void FillFloatKernel(__global float* dstFloat, 	int num_elements, float value, const int offset) // writes `value` to dstFloat[offset .. offset+num_elements-1], one element per work-item
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < num_elements ) // work-items beyond the requested count do nothing
+	{
+		dstFloat[ offset+gIdx ] = value;
+	}
+void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset) // writes `value` to dstInt[offset .. offset+num-1], one element per work-item
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < num ) // work-items beyond the requested count do nothing
+	{
+		dstInt[ offset+gIdx ] = value;
+	}
+void FillInt2Kernel(__global int2* dstInt2, 	const int num, const int2 value, const int offset) // writes `value` to dstInt2[offset .. offset+num-1], one element per work-item
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < num ) // work-items beyond the requested count do nothing
+	{
+		dstInt2[ gIdx + offset] = make_int2( value.x, value.y ); // make_int2 expands to an (int2)(x, y) vector literal
+	}
+void FillInt4Kernel(__global int4* dstInt4, 		const int num, const int4 value, const int offset) // writes `value` to dstInt4[offset .. offset+num-1], one element per work-item
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < num ) // work-items beyond the requested count do nothing
+	{
+		dstInt4[ offset+gIdx ] = value;
+	}
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h
new file mode 100644
index 00000000..4f8b96e4
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h
@@ -0,0 +1,91 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -- fillKernelsCL holds the OpenCL C source of the fill kernels (FillIntKernel, FillFloatKernel, FillUnsignedIntKernel, FillInt2Kernel, FillInt4Kernel) as one C string; change the kernel source and regenerate rather than editing the string by hand
+static const char* fillKernelsCL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Takahiro Harada\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define make_uint4 (uint4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"typedef struct\n"
+"	union\n"
+"	{\n"
+"		int4 m_data;\n"
+"		uint4 m_unsignedData;\n"
+"		float	m_floatData;\n"
+"	};\n"
+"	int m_offset;\n"
+"	int m_n;\n"
+"	int m_padding[2];\n"
+"} ConstBuffer;\n"
+"void FillIntKernel(__global int* dstInt, 			int num_elements, int value, const int offset)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < num_elements )\n"
+"	{\n"
+"		dstInt[ offset+gIdx ] = value;\n"
+"	}\n"
+"void FillFloatKernel(__global float* dstFloat, 	int num_elements, float value, const int offset)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < num_elements )\n"
+"	{\n"
+"		dstFloat[ offset+gIdx ] = value;\n"
+"	}\n"
+"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < num )\n"
+"	{\n"
+"		dstInt[ offset+gIdx ] = value;\n"
+"	}\n"
+"void FillInt2Kernel(__global int2* dstInt2, 	const int num, const int2 value, const int offset)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < num )\n"
+"	{\n"
+"		dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
+"	}\n"
+"void FillInt4Kernel(__global int4* dstInt4, 		const int num, const int4 value, const int offset)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < num )\n"
+"	{\n"
+"		dstInt4[ offset+gIdx ] = value;\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl
new file mode 100644
index 00000000..c9da7985
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl
@@ -0,0 +1,154 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+typedef unsigned int u32; // 32-bit unsigned shorthand used by the kernel sources
+#define GET_GROUP_IDX get_group_id(0) // work-group index along dimension 0
+#define GET_LOCAL_IDX get_local_id(0) // work-item index inside its work-group (dimension 0)
+#define GET_GLOBAL_IDX get_global_id(0) // global work-item index along dimension 0
+#define GET_GROUP_SIZE get_local_size(0) // work-items per work-group (dimension 0)
+// takahiro end
+#define WG_SIZE 128 
+#define m_numElems x // cb.m_numElems reads component x of the uint4 `cb` kernel argument
+#define m_numBlocks y // cb.m_numBlocks reads component y
+#define m_numScanBlocks z // cb.m_numScanBlocks reads component z
+/*typedef struct
+	uint m_numElems;
+	uint m_numBlocks;
+	uint m_numScanBlocks;
+	uint m_padding[1];
+} ConstBuffer;
+// Work-group exclusive prefix sum (component-wise) over data[0..n-1], in place.
+//   data  : __local array with n entries; overwritten with the exclusive scan.
+//   n     : entry count -- presumably a power of two, given the halving and
+//           doubling loops; confirm against the host code.
+//   lIdx  : local index of the calling work-item.
+//   lSize : number of work-items cooperating on the scan (inner-loop stride).
+// Returns the sum of all n inputs to the work-item with lIdx == 0; every other
+// work-item receives zero. All work-items of the group must call this function
+// because it executes barriers.
+// Each level of both sweeps reads local memory written by other work-items on
+// the previous level, so a work-group barrier separates the levels; the
+// stringified copy of this kernel already carries barriers in both loops.
+float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)
+	float4 blocksum = (float4)(0.0f);
+    int offset = 1;
+    // Up-sweep: accumulate partial sums, halving the number of active pairs per level.
+    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
+        {
+            int ai = offset*(2*iIdx+1)-1;
+            int bi = offset*(2*iIdx+2)-1;
+            data[bi] += data[ai];
+        }
+	}
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if( lIdx == 0 )
+	{
+		// data[n-1] now holds the total: keep it, then seed the down-sweep with zero.
+		blocksum = data[ n-1 ];
+		data[ n-1 ] = 0;
+	}
+	offset >>= 1;
+    // Down-sweep: push the partial sums back down to obtain the exclusive scan.
+    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
+        {
+            int ai = offset*(2*iIdx+1)-1;
+            int bi = offset*(2*iIdx+2)-1;
+            float4 temp = data[ai];
+            data[ai] = data[bi];
+            data[bi] += temp;
+        }
+	}
+	// Make the finished scan visible to the whole group before the caller reads data[].
+	barrier(CLK_LOCAL_MEM_FENCE);
+	return blocksum;
+void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer,	uint4 cb) // per-work-group exclusive scan of src into dst; each group's total goes to sumBuffer[group index]; cb.m_numElems (cb.x) is the number of valid elements
+	__local float4 ldsData[WG_SIZE*2]; // holds two entries per work-item when the group size equals WG_SIZE -- presumably enforced at launch, confirm
+	int gIdx = GET_GLOBAL_IDX;
+	int lIdx = GET_LOCAL_IDX;
+	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0; // slots beyond m_numElems are filled with 0
+	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
+	float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE); // this kernel issues no barrier itself; any work-group synchronisation has to come from ScanExclusiveFloat4
+	if( lIdx == 0 ) // only work-item 0 publishes the group total
+		sumBuffer[GET_GROUP_IDX] = sum;
+	if( (2*gIdx) < cb.m_numElems ) // write back only the in-range results
+    {
+        dst[2*gIdx]     = ldsData[2*lIdx];
+	}
+	if( (2*gIdx + 1) < cb.m_numElems )
+	{
+        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
+    }
+void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb) // adds blockSum[b] to every element of block b of dst, where a block is WG_SIZE*2 consecutive elements
+	const u32 blockSize = WG_SIZE*2;
+	int myIdx = GET_GROUP_IDX+1; // work-group g processes block g+1; block 0 is never touched by this kernel
+	int lIdx = GET_LOCAL_IDX;
+	float4 iBlockSum = blockSum[myIdx];
+	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems); // clamp the block end to the element count (cb.x)
+	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE) // the group's work-items stride across the block
+	{
+		dst[i] += iBlockSum;
+	}
+void TopLevelScanKernel(__global float4* dst, uint4 cb) // exclusive scan, in place, of the first cb.m_numBlocks (cb.y) entries of dst; the total is stored at dst[cb.m_numBlocks]
+	__local float4 ldsData[2048]; // NOTE(review): indexed up to cb.m_numScanBlocks-1 with no check against 2048 here -- the caller must guarantee the bound
+	int gIdx = GET_GLOBAL_IDX;
+	int lIdx = GET_LOCAL_IDX;
+	int lSize = GET_GROUP_SIZE;
+	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize ) // load m_numBlocks values, padding with 0 up to m_numScanBlocks (cb.z)
+	{
+		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;
+	}
+	float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE); // this kernel issues no barrier itself; any work-group synchronisation has to come from ScanExclusiveFloat4
+	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize ) // write the scanned values back
+	{
+		dst[i] = ldsData[i];
+	}
+	if( gIdx == 0 ) // a single work-item stores the grand total one slot past the scanned range, so dst needs m_numBlocks+1 entries
+	{
+		dst[cb.m_numBlocks] = sum;
+	}
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl
new file mode 100644
index 00000000..963cc1e4
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl
@@ -0,0 +1,154 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+typedef unsigned int u32; // 32-bit unsigned shorthand; also the element type of the scans in this file
+#define GET_GROUP_IDX get_group_id(0) // work-group index along dimension 0
+#define GET_LOCAL_IDX get_local_id(0) // work-item index inside its work-group (dimension 0)
+#define GET_GLOBAL_IDX get_global_id(0) // global work-item index along dimension 0
+#define GET_GROUP_SIZE get_local_size(0) // work-items per work-group (dimension 0)
+// takahiro end
+#define WG_SIZE 128 
+#define m_numElems x // cb.m_numElems reads component x of the uint4 `cb` kernel argument
+#define m_numBlocks y // cb.m_numBlocks reads component y
+#define m_numScanBlocks z // cb.m_numScanBlocks reads component z
+/*typedef struct
+	uint m_numElems;
+	uint m_numBlocks;
+	uint m_numScanBlocks;
+	uint m_padding[1];
+} ConstBuffer;
+// Work-group exclusive prefix sum over data[0..n-1] in local memory, in place.
+//   data  : __local array with n entries; overwritten with the exclusive scan.
+//   n     : entry count -- presumably a power of two, given the halving and
+//           doubling loops; confirm against the host code.
+//   lIdx  : local index of the calling work-item.
+//   lSize : number of work-items cooperating on the scan (inner-loop stride).
+// Returns the sum of all n inputs to the work-item with lIdx == 0; every other
+// work-item receives 0. All work-items of the group must call this function
+// because it executes barriers.
+// Each level of both sweeps reads local memory written by other work-items on
+// the previous level, so a work-group barrier separates the levels; the
+// stringified copy of this kernel already carries barriers in both loops.
+u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)
+	u32 blocksum = 0;
+    int offset = 1;
+    // Up-sweep: accumulate partial sums, halving the number of active pairs per level.
+    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
+        {
+            int ai = offset*(2*iIdx+1)-1;
+            int bi = offset*(2*iIdx+2)-1;
+            data[bi] += data[ai];
+        }
+	}
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if( lIdx == 0 )
+	{
+		// data[n-1] now holds the total: keep it, then seed the down-sweep with 0.
+		blocksum = data[ n-1 ];
+		data[ n-1 ] = 0;
+	}
+	offset >>= 1;
+    // Down-sweep: push the partial sums back down to obtain the exclusive scan.
+    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
+        {
+            int ai = offset*(2*iIdx+1)-1;
+            int bi = offset*(2*iIdx+2)-1;
+            u32 temp = data[ai];
+            data[ai] = data[bi];
+            data[bi] += temp;
+        }
+	}
+	// Make the finished scan visible to the whole group before the caller reads data[].
+	barrier(CLK_LOCAL_MEM_FENCE);
+	return blocksum;
+void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer, // per-work-group exclusive scan of src into dst; each group's total goes to sumBuffer[group index]
+		uint4 cb) // cb.m_numElems (cb.x) is the number of valid elements in src/dst
+	__local u32 ldsData[WG_SIZE*2]; // holds two entries per work-item when the group size equals WG_SIZE -- presumably enforced at launch, confirm
+	int gIdx = GET_GLOBAL_IDX;
+	int lIdx = GET_LOCAL_IDX;
+	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0; // slots beyond m_numElems are filled with 0
+	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
+	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE); // this kernel issues no barrier itself; any work-group synchronisation has to come from ScanExclusive
+	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum; // only work-item 0 publishes the group total
+	if( (2*gIdx) < cb.m_numElems ) // write back only the in-range results
+    {
+        dst[2*gIdx]     = ldsData[2*lIdx];
+	}
+	if( (2*gIdx + 1) < cb.m_numElems )
+	{
+        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
+    }
+void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb) // adds blockSum[b] to every element of block b of dst, where a block is WG_SIZE*2 consecutive elements
+	const u32 blockSize = WG_SIZE*2;
+	int myIdx = GET_GROUP_IDX+1; // work-group g processes block g+1; block 0 is never touched by this kernel
+	int lIdx = GET_LOCAL_IDX;
+	u32 iBlockSum = blockSum[myIdx];
+	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems); // clamp the block end to the element count (cb.x)
+	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE) // the group's work-items stride across the block
+	{
+		dst[i] += iBlockSum;
+	}
+void TopLevelScanKernel(__global u32* dst, uint4 cb) // exclusive scan, in place, of the first cb.m_numBlocks (cb.y) entries of dst; the total is stored at dst[cb.m_numBlocks]
+	__local u32 ldsData[2048]; // NOTE(review): indexed up to cb.m_numScanBlocks-1 with no check against 2048 here -- the caller must guarantee the bound
+	int gIdx = GET_GLOBAL_IDX;
+	int lIdx = GET_LOCAL_IDX;
+	int lSize = GET_GROUP_SIZE;
+	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize ) // load m_numBlocks values, padding with 0 up to m_numScanBlocks (cb.z)
+	{
+		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;
+	}
+	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE); // this kernel issues no barrier itself; any work-group synchronisation has to come from ScanExclusive
+	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize ) // write the scanned values back
+	{
+		dst[i] = ldsData[i];
+	}
+	if( gIdx == 0 ) // a single work-item stores the grand total one slot past the scanned range, so dst needs m_numBlocks+1 entries
+	{
+		dst[cb.m_numBlocks] = sum;
+	}
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h
new file mode 100644
index 00000000..27baab83
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h
@@ -0,0 +1,129 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -- prefixScanKernelsCL holds the OpenCL C source of the u32 prefix-scan kernels (ScanExclusive, LocalScanKernel, AddOffsetKernel, TopLevelScanKernel) as one C string; change the kernel source and regenerate rather than editing the string by hand. NOTE(review): the text uses GROUP_LDS_BARRIER but no #define for it is visible in this string -- confirm it is defined before the kernels are built
+static const char* prefixScanKernelsCL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Takahiro Harada\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"// takahiro end\n"
+"#define WG_SIZE 128 \n"
+"#define m_numElems x\n"
+"#define m_numBlocks y\n"
+"#define m_numScanBlocks z\n"
+"/*typedef struct\n"
+"	uint m_numElems;\n"
+"	uint m_numBlocks;\n"
+"	uint m_numScanBlocks;\n"
+"	uint m_padding[1];\n"
+"} ConstBuffer;\n"
+"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
+"	u32 blocksum;\n"
+"    int offset = 1;\n"
+"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
+"    {\n"
+"        GROUP_LDS_BARRIER;\n"
+"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
+"        {\n"
+"            int ai = offset*(2*iIdx+1)-1;\n"
+"            int bi = offset*(2*iIdx+2)-1;\n"
+"            data[bi] += data[ai];\n"
+"        }\n"
+"	}\n"
+"    if( lIdx == 0 )\n"
+"	{\n"
+"		blocksum = data[ n-1 ];\n"
+"        data[ n-1 ] = 0;\n"
+"	}\n"
+"	offset >>= 1;\n"
+"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
+"    {\n"
+"        GROUP_LDS_BARRIER;\n"
+"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
+"        {\n"
+"            int ai = offset*(2*iIdx+1)-1;\n"
+"            int bi = offset*(2*iIdx+2)-1;\n"
+"            u32 temp = data[ai];\n"
+"            data[ai] = data[bi];\n"
+"            data[bi] += temp;\n"
+"        }\n"
+"	}\n"
+"	return blocksum;\n"
+"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
+"		uint4 cb)\n"
+"	__local u32 ldsData[WG_SIZE*2];\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
+"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
+"	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+"	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
+"	if( (2*gIdx) < cb.m_numElems )\n"
+"    {\n"
+"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
+"	}\n"
+"	if( (2*gIdx + 1) < cb.m_numElems )\n"
+"	{\n"
+"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
+"    }\n"
+"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
+"	const u32 blockSize = WG_SIZE*2;\n"
+"	int myIdx = GET_GROUP_IDX+1;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	u32 iBlockSum = blockSum[myIdx];\n"
+"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
+"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
+"	{\n"
+"		dst[i] += iBlockSum;\n"
+"	}\n"
+"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
+"	__local u32 ldsData[2048];\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	int lSize = GET_GROUP_SIZE;\n"
+"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
+"	{\n"
+"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
+"	}\n"
+"	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
+"	{\n"
+"		dst[i] = ldsData[i];\n"
+"	}\n"
+"	if( gIdx == 0 )\n"
+"	{\n"
+"		dst[cb.m_numBlocks] = sum;\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h
new file mode 100644
index 00000000..5b132547
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h
@@ -0,0 +1,129 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -- prefixScanKernelsFloat4CL holds the OpenCL C source of the float4 prefix-scan kernels (ScanExclusiveFloat4, LocalScanKernel, AddOffsetKernel, TopLevelScanKernel) as one C string; change the kernel source and regenerate rather than editing the string by hand. NOTE(review): the text uses GROUP_LDS_BARRIER but no #define for it is visible in this string -- confirm it is defined before the kernels are built
+static const char* prefixScanKernelsFloat4CL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Takahiro Harada\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"// takahiro end\n"
+"#define WG_SIZE 128 \n"
+"#define m_numElems x\n"
+"#define m_numBlocks y\n"
+"#define m_numScanBlocks z\n"
+"/*typedef struct\n"
+"	uint m_numElems;\n"
+"	uint m_numBlocks;\n"
+"	uint m_numScanBlocks;\n"
+"	uint m_padding[1];\n"
+"} ConstBuffer;\n"
+"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
+"	float4 blocksum;\n"
+"    int offset = 1;\n"
+"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
+"    {\n"
+"        GROUP_LDS_BARRIER;\n"
+"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
+"        {\n"
+"            int ai = offset*(2*iIdx+1)-1;\n"
+"            int bi = offset*(2*iIdx+2)-1;\n"
+"            data[bi] += data[ai];\n"
+"        }\n"
+"	}\n"
+"    if( lIdx == 0 )\n"
+"	{\n"
+"		blocksum = data[ n-1 ];\n"
+"    data[ n-1 ] = 0;\n"
+"	}\n"
+"	offset >>= 1;\n"
+"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
+"    {\n"
+"        GROUP_LDS_BARRIER;\n"
+"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
+"        {\n"
+"            int ai = offset*(2*iIdx+1)-1;\n"
+"            int bi = offset*(2*iIdx+2)-1;\n"
+"            float4 temp = data[ai];\n"
+"            data[ai] = data[bi];\n"
+"            data[bi] += temp;\n"
+"        }\n"
+"	}\n"
+"	return blocksum;\n"
+"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer,	uint4 cb)\n"
+"	__local float4 ldsData[WG_SIZE*2];\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
+"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
+"	float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+"	if( lIdx == 0 ) \n"
+"		sumBuffer[GET_GROUP_IDX] = sum;\n"
+"	if( (2*gIdx) < cb.m_numElems )\n"
+"    {\n"
+"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
+"	}\n"
+"	if( (2*gIdx + 1) < cb.m_numElems )\n"
+"	{\n"
+"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
+"    }\n"
+"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
+"	const u32 blockSize = WG_SIZE*2;\n"
+"	int myIdx = GET_GROUP_IDX+1;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	float4 iBlockSum = blockSum[myIdx];\n"
+"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
+"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
+"	{\n"
+"		dst[i] += iBlockSum;\n"
+"	}\n"
+"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
+"	__local float4 ldsData[2048];\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	int lSize = GET_GROUP_SIZE;\n"
+"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
+"	{\n"
+"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
+"	}\n"
+"	float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
+"	{\n"
+"		dst[i] = ldsData[i];\n"
+"	}\n"
+"	if( gIdx == 0 )\n"
+"	{\n"
+"		dst[cb.m_numBlocks] = sum;\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl
new file mode 100644
index 00000000..7402e2f3
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl
@@ -0,0 +1,1071 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Author Takahiro Harada
+//#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+typedef unsigned int u32; // 32-bit unsigned shorthand used by the functions below
+#define GET_GROUP_IDX get_group_id(0) // work-group index along dimension 0
+#define GET_LOCAL_IDX get_local_id(0) // work-item index inside its work-group
+#define GET_GLOBAL_IDX get_global_id(0) // work-item index across the whole dispatch
+#define GET_GROUP_SIZE get_local_size(0) // number of work-items per work-group
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x)) // keeps the value atom_inc returns
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) // arguments are forwarded in this order: b, a, condition
+#define make_uint4 (uint4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+#define WG_SIZE 64 // used to size the __local arrays in the kernels below
+#define BITS_PER_PASS 4 // bit-loop bound in the sort4Bits* helpers; the kernels mask digits with 0xf
+typedef uchar u8;
+//	This is not an optimization for VLIW; it just reduces the number of writes.
+#define USE_2LEVEL_REDUCE 1
+//#define CHECK_BOUNDARY 1
+//#define NV_GPU 1
+//	Cypress
+#define nPerWI 16 // number of histogram entries each work-item loads in PrefixScanKernel
+//	Cayman
+//#define nPerWI 20
+#define m_n x // the four m_* names alias the x/y/z/w components, so cb.m_n on an int4 argument reads cb.x
+#define m_nWGs y
+#define m_startBit z
+#define m_nBlocksPerWG w
+typedef struct // four-int parameter record; NOTE(review): the kernels visible in this file take an int4 cb instead — confirm who uses ConstBuffer
+	int m_n; // same name the kernels read from cb: the bound in their addr+i < n checks
+	int m_nWGs; // same name the kernels use as the row stride of the histogram tables (bucket*nWGs + wgIdx)
+	int m_startBit; // same name the kernels use as the right-shift applied to a key before masking its digit
+	int m_nBlocksPerWG; // same name the kernels use as the upper bound of the per-work-group block loop
+} ConstBuffer;
+typedef struct // key/value record read and written by the *SortData* kernels
+	unsigned int m_key; // the field those kernels shift and mask to obtain the digit
+	unsigned int m_value; // payload moved together with its key
+} SortDataCL;
+uint prefixScanVectorEx( uint4* data ) // in-place exclusive prefix sum over the x,y,z,w components of data[0]; returns the sum of the four original components
+	u32 sum = 0; // running total of the components processed so far
+	u32 tmp = data[0].x; // remember the original value before it is overwritten
+	data[0].x = sum; // each component is replaced by the sum of the components before it
+	sum += tmp;
+	tmp = data[0].y;
+	data[0].y = sum;
+	sum += tmp;
+	tmp = data[0].z;
+	data[0].z = sum;
+	sum += tmp;
+	tmp = data[0].w;
+	data[0].w = sum;
+	sum += tmp;
+	return sum;
+u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ ) // work-group prefix sum of pData through sorterSharedMemory; stores sorterSharedMemory[wgSize*2-1] in *totalSum and returns sorterSharedMemory[lIdx+wgSize-1]
+	{	//	Set data
+		sorterSharedMemory[lIdx] = 0; // lower half [0, wgSize) is zeroed; the idx-k reads below can reach into it
+		sorterSharedMemory[lIdx+wgSize] = pData; // upper half [wgSize, 2*wgSize) holds each work-item's input
+	}
+	{	//	Prefix sum
+		int idx = 2*lIdx + (wgSize+1); // slots wgSize+1, wgSize+3, ... (every second slot)
+#if defined(USE_2LEVEL_REDUCE) // NOTE(review): no #else/#endif for this conditional and no barrier between the steps are visible in this view — confirm against the complete file
+		if( lIdx < 64 )
+		{
+			u32 u0, u1, u2;
+			u0 = sorterSharedMemory[idx-3]; // first level: the three slots directly below idx
+			u1 = sorterSharedMemory[idx-2];
+			u2 = sorterSharedMemory[idx-1];
+			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			
+			u0 = sorterSharedMemory[idx-12]; // second level: slots at distances 4, 8 and 12
+			u1 = sorterSharedMemory[idx-8];
+			u2 = sorterSharedMemory[idx-4];
+			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			
+			u0 = sorterSharedMemory[idx-48]; // third level: slots at distances 16, 32 and 48
+			u1 = sorterSharedMemory[idx-32];
+			u2 = sorterSharedMemory[idx-16];
+			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			
+			if( wgSize > 64 )
+			{
+				sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; // extra step taken only for the 128-wide case
+			}
+			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; // fills in the slot between two idx positions
+		}
+		if( lIdx < 64 ) // NOTE(review): presumably the alternative branch of the conditional above; its #else is not visible here
+		{
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-1]; // doubling distances: 1, 2, 4, 8, 16, 32
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];			
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
+			if( wgSize > 64 )
+			{
+				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
+			}
+			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
+		}
+	}
+	*totalSum = sorterSharedMemory[wgSize*2-1]; // last slot of the upper half
+	u32 addValue = sorterSharedMemory[lIdx+wgSize-1]; // slot just before this work-item's own input slot
+	return addValue;
+uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) // scans the four components of pData, then offsets all of them by the value localPrefixSum returns for wgSize 128
+	u32 s4 = prefixScanVectorEx( &pData ); // pData now holds its own exclusive scan; s4 is the sum of its original components
+	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 ); // also fills *totalSum
+	return pData + make_uint4( rank, rank, rank, rank );
+uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) // scans the four components of pData, then offsets all of them by the value localPrefixSum returns for wgSize 64
+	u32 s4 = prefixScanVectorEx( &pData ); // pData now holds its own exclusive scan; s4 is the sum of its original components
+	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 ); // also fills *totalSum
+	return pData + make_uint4( rank, rank, rank, rank );
+u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;} // returns byte number keyIdx of key, counting from the least significant byte
+u32 bit8Scan(u32 v) // treats v as four packed bytes: byte k of the result is the sum of bytes 0..k-1 of v, as long as no byte sum carries into the next byte
+	return (v<<8) + (v<<16) + (v<<24);
+#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx] // the calling work-item's own counter for bucket idx
+void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb ) // counts, per work-group, how many keys of gSrc fall into each 4-bit bucket and writes the counts to histogramOut
+	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE]; // NUM_BUCKET rows of WG_SIZE counters; NUM_BUCKET and ELEMENTS_PER_WORK_ITEM are not defined in the visible lines of this file
+	u32 gIdx = GET_GLOBAL_IDX; // not used in the visible body
+	u32 lIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;
+	u32 wgSize = GET_GROUP_SIZE; // not used in the visible body
+	const int startBit = cb.m_startBit;
+	const int n = cb.m_n;
+	const int nWGs = cb.m_nWGs;
+	const int nBlocksPerWG = cb.m_nBlocksPerWG;
+	for(int i=0; i<NUM_BUCKET; i++)
+	{
+		MY_HISTOGRAM(i) = 0; // clear this work-item's counter in every bucket row
+	}
+	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; // elements consumed by one work-group per block
+	u32 localKey;
+	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; // whole blocks left counting from this group's first block; the integer division drops a trailing partial block
+	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; // first element this work-item reads
+	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
+	{
+		//	MY_HISTOGRAM( localKeys.x )++ is much more expensive than an atomic add because it requires a read and a write, while atomics can just add on AMD
+		//	Using registers didn't perform well. It seems that using localKeys as an address requires a lot of ALU ops
+		//	AMD: AtomInc performs better, while NV prefers ++
+		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+		{
+#if defined(CHECK_BOUNDARY) // NOTE(review): the #else/#endif lines of this and the NV_GPU conditional are not visible in this view — confirm against the complete file
+			if( addr+i < n )
+			{
+				localKey = (gSrc[addr+i]>>startBit) & 0xf; // 4-bit digit of the key starting at startBit
+#if defined(NV_GPU)
+				MY_HISTOGRAM( localKey )++;
+				AtomInc( MY_HISTOGRAM( localKey ) );
+			}
+		}
+	}
+	if( lIdx < NUM_BUCKET ) // one work-item per bucket totals that bucket's row
+	{
+		u32 sum = 0;
+		for(int i=0; i<GET_GROUP_SIZE; i++)
+		{
+			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE]; // each work-item starts at a different column (shifted by lIdx) and wraps around
+		}
+		histogramOut[lIdx*nWGs+wgIdx] = sum; // bucket-major layout: nWGs entries per bucket, one per work-group
+	}
+void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4  cb ) // counts, per work-group, how many m_key fields of gSrc fall into each 4-bit bucket and writes the counts to histogramOut
+	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE]; // NUM_BUCKET rows of WG_SIZE counters; NUM_BUCKET and ELEMENTS_PER_WORK_ITEM are not defined in the visible lines of this file
+	u32 gIdx = GET_GLOBAL_IDX; // not used in the visible body
+	u32 lIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;
+	u32 wgSize = GET_GROUP_SIZE; // not used in the visible body
+	const int startBit = cb.m_startBit;
+	const int n = cb.m_n;
+	const int nWGs = cb.m_nWGs;
+	const int nBlocksPerWG = cb.m_nBlocksPerWG;
+	for(int i=0; i<NUM_BUCKET; i++)
+	{
+		MY_HISTOGRAM(i) = 0; // clear this work-item's counter in every bucket row
+	}
+	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; // elements consumed by one work-group per block
+	u32 localKey;
+	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; // whole blocks left counting from this group's first block; the integer division drops a trailing partial block
+	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; // first record this work-item reads
+	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
+	{
+		//	MY_HISTOGRAM( localKeys.x )++ is much more expensive than an atomic add because it requires a read and a write, while atomics can just add on AMD
+		//	Using registers didn't perform well. It seems that using localKeys as an address requires a lot of ALU ops
+		//	AMD: AtomInc performs better, while NV prefers ++
+		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+		{
+#if defined(CHECK_BOUNDARY) // NOTE(review): the #else/#endif lines of this and the NV_GPU conditional are not visible in this view — confirm against the complete file
+			if( addr+i < n )
+			{
+				localKey = (gSrc[addr+i].m_key>>startBit) & 0xf; // 4-bit digit of the record's key starting at startBit
+#if defined(NV_GPU)
+				MY_HISTOGRAM( localKey )++;
+				AtomInc( MY_HISTOGRAM( localKey ) );
+			}
+		}
+	}
+	if( lIdx < NUM_BUCKET ) // one work-item per bucket totals that bucket's row
+	{
+		u32 sum = 0;
+		for(int i=0; i<GET_GROUP_SIZE; i++)
+		{
+			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE]; // each work-item starts at a different column (shifted by lIdx) and wraps around
+		}
+		histogramOut[lIdx*nWGs+wgIdx] = sum; // bucket-major layout: nWGs entries per bucket, one per work-group
+	}
+#define nPerLane (nPerWI/4) // entries that feed each of the four uint4 components
+//	NUM_BUCKET*nWGs < 128*nPerWI
+void PrefixScanKernel( __global u32* wHistogram1, int4  cb ) // rewrites the first NUM_BUCKET*nWGs entries of wHistogram1 with their prefix sums; indexing depends only on the local id
+	__local u32 ldsTopScanData[128*2]; // scratch for localPrefixSum128V, which is called with wgSize 128
+	u32 lIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX; // not used in the visible body
+	const int nWGs = cb.m_nWGs;
+	u32 data[nPerWI]; // this work-item's nPerWI consecutive histogram entries
+	for(int i=0; i<nPerWI; i++)
+	{
+		data[i] = 0; // entries past the end of the table count as zero
+		if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )
+			data[i] = wHistogram1[nPerWI*lIdx+i];
+	}
+	uint4 myData = make_uint4(0,0,0,0);
+	for(int i=0; i<nPerLane; i++)
+	{
+		myData.x += data[nPerLane*0+i]; // each component totals one run of nPerLane consecutive entries
+		myData.y += data[nPerLane*1+i];
+		myData.z += data[nPerLane*2+i];
+		myData.w += data[nPerLane*3+i];
+	}
+	uint totalSum; // filled by the scan below but not read afterwards in the visible body
+	uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData ); // per-lane starting offsets
+//	for(int j=0; j<4; j++) //	somehow it introduces a lot of branches
+	{	int j = 0; // exclusive scan inside lane j; the same block is written out for j = 1, 2, 3 instead of looping
+		u32 sum = 0;
+		for(int i=0; i<nPerLane; i++)
+		{
+			u32 tmp = data[nPerLane*j+i];
+			data[nPerLane*j+i] = sum;
+			sum += tmp;
+		}
+	}
+	{	int j = 1;
+		u32 sum = 0;
+		for(int i=0; i<nPerLane; i++)
+		{
+			u32 tmp = data[nPerLane*j+i];
+			data[nPerLane*j+i] = sum;
+			sum += tmp;
+		}
+	}
+	{	int j = 2;
+		u32 sum = 0;
+		for(int i=0; i<nPerLane; i++)
+		{
+			u32 tmp = data[nPerLane*j+i];
+			data[nPerLane*j+i] = sum;
+			sum += tmp;
+		}
+	}
+	{	int j = 3;
+		u32 sum = 0;
+		for(int i=0; i<nPerLane; i++)
+		{
+			u32 tmp = data[nPerLane*j+i];
+			data[nPerLane*j+i] = sum;
+			sum += tmp;
+		}
+	}
+	for(int i=0; i<nPerLane; i++)
+	{
+		data[nPerLane*0+i] += scanned.x; // add the lane's starting offset to its locally scanned entries
+		data[nPerLane*1+i] += scanned.y;
+		data[nPerLane*2+i] += scanned.z;
+		data[nPerLane*3+i] += scanned.w;
+	}
+	for(int i=0; i<nPerWI; i++)
+	{
+		int index = nPerWI*lIdx+i;
+		if (index < NUM_BUCKET*nWGs) // same bound as on load, so padding entries are never written back
+			wHistogram1[nPerWI*lIdx+i] = data[i];
+	}
+//	4 scan, 4 exchange
+void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData) // reorders the work-group's elements (four per work-item) through ldsSortData, one bit of the digit at startBit per iteration
+	for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)
+	{
+		u32 mask = (1<<bitIdx);
+		uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask ); // non-zero where the current bit is set
+		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) ); // 1 for elements whose bit is clear, 0 for those whose bit is set
+		u32 total;
+		prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData ); // presumably the rank among clear-bit elements, with total their count — depends on localPrefixSum's result being exclusive
+		{
+			uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3); // this work-item's four home slots
+			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total ); // slot for a set-bit element
+			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) ); // clear-bit elements use prefixSum as their slot instead
+			ldsSortData[dstAddr.x] = sortData[0]; // NOTE(review): the scan above used this same buffer and no barrier is visible in this view — confirm against the complete file
+			ldsSortData[dstAddr.y] = sortData[1];
+			ldsSortData[dstAddr.z] = sortData[2];
+			ldsSortData[dstAddr.w] = sortData[3];
+			sortData[0] = ldsSortData[localAddr.x]; // read back whatever landed in the home slots
+			sortData[1] = ldsSortData[localAddr.y];
+			sortData[2] = ldsSortData[localAddr.z];
+			sortData[3] = ldsSortData[localAddr.w];
+		}
+	}
+//	2 scan, 2 exchange
+void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData) // reorders the work-group's elements (four per work-item) through ldsSortData, two bits of the digit at startBit per iteration
+	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)
+	{
+		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, 
+			(sortData[1]>>(startBit+ibit)) & 0x3, 
+			(sortData[2]>>(startBit+ibit)) & 0x3, 
+			(sortData[3]>>(startBit+ibit)) & 0x3); // 2-bit value (0..3) of each of the four elements
+		u32 key4; // four 8-bit counters packed in one word, one per 2-bit value
+		u32 sKeyPacked[4] = { 0, 0, 0, 0 };
+		{
+			sKeyPacked[0] |= 1<<(8*b.x); // a 1 in the byte that corresponds to the element's 2-bit value
+			sKeyPacked[1] |= 1<<(8*b.y);
+			sKeyPacked[2] |= 1<<(8*b.z);
+			sKeyPacked[3] |= 1<<(8*b.w);
+			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];
+		}
+		u32 rankPacked; // packed counts returned by the work-group scan
+		u32 sumPacked; // packed totals for the whole work-group
+		{
+			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );
+		}
+		u32 newOffset[4] = { 0,0,0,0 };
+		{
+			u32 sumScanned = bit8Scan( sumPacked ); // byte k: total of the byte counters below k
+			u32 scannedKeys[4];
+			scannedKeys[0] = 1<<(8*b.x);
+			scannedKeys[1] = 1<<(8*b.y);
+			scannedKeys[2] = 1<<(8*b.z);
+			scannedKeys[3] = 1<<(8*b.w);
+			{	//	4 scans at once
+				u32 sum4 = 0;
+				for(int ie=0; ie<4; ie++)
+				{
+					u32 tmp = scannedKeys[ie]; // exclusive scan across this work-item's four elements, all four bytes at a time
+					scannedKeys[ie] = sum4;
+					sum4 += tmp;
+				}
+			}
+			{
+				u32 sumPlusRank = sumScanned + rankPacked;
+				{	u32 ie = b.x;
+					scannedKeys[0] += sumPlusRank;
+					newOffset[0] = unpack4Key( scannedKeys[0], ie ); // keep only the byte belonging to this element's 2-bit value
+				}
+				{	u32 ie = b.y;
+					scannedKeys[1] += sumPlusRank;
+					newOffset[1] = unpack4Key( scannedKeys[1], ie );
+				}
+				{	u32 ie = b.z;
+					scannedKeys[2] += sumPlusRank;
+					newOffset[2] = unpack4Key( scannedKeys[2], ie );
+				}
+				{	u32 ie = b.w;
+					scannedKeys[3] += sumPlusRank;
+					newOffset[3] = unpack4Key( scannedKeys[3], ie );
+				}
+			}
+		}
+		{
+			ldsSortData[newOffset[0]] = sortData[0]; // NOTE(review): the scan above used this same buffer and no barrier is visible in this view — confirm against the complete file
+			ldsSortData[newOffset[1]] = sortData[1];
+			ldsSortData[newOffset[2]] = sortData[2];
+			ldsSortData[newOffset[3]] = sortData[3];
+			u32 dstAddr = 4*lIdx; // this work-item's four home slots
+			sortData[0] = ldsSortData[dstAddr+0];
+			sortData[1] = ldsSortData[dstAddr+1];
+			sortData[2] = ldsSortData[dstAddr+2];
+			sortData[3] = ldsSortData[dstAddr+3];
+		}
+	}
+#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key] // counter for bucket key inside histogram set setIdx, stored in ldsSortData
+void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb ) // per block: loads keys from gSrc, sorts them locally on the 4-bit digit at startBit, then writes them to gDst at offsets taken from rHistogram
+	__local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; // shared by the local sort and, afterwards, by the SET_HISTOGRAM counters
+	__local u32 localHistogramToCarry[NUM_BUCKET]; // per-bucket destination offset, advanced after every block
+	__local u32 localHistogram[NUM_BUCKET*2]; // the upper half (index NUM_BUCKET+bucket) holds the per-block bucket data
+	u32 gIdx = GET_GLOBAL_IDX; // not used in the visible body
+	u32 lIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;
+	u32 wgSize = GET_GROUP_SIZE; // not used in the visible body
+	const int n = cb.m_n;
+	const int nWGs = cb.m_nWGs;
+	const int startBit = cb.m_startBit;
+	const int nBlocksPerWG = cb.m_nBlocksPerWG;
+	if( lIdx < (NUM_BUCKET) )
+	{
+		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; // starting offset of this work-group inside bucket lIdx
+	}
+	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
+	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; // whole blocks left counting from this group's first block; the integer division drops a trailing partial block
+	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; // first element this work-item reads
+	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
+	{
+		u32 myHistogram = 0;
+		u32 sortData[ELEMENTS_PER_WORK_ITEM]; // NOTE(review): handed to sort4Bits, which is declared with a 4-element array — assumes ELEMENTS_PER_WORK_ITEM is 4; its definition is not visible here
+		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+#if defined(CHECK_BOUNDARY) // NOTE(review): the #else/#endif lines of the conditionals in this kernel are not visible in this view — confirm against the complete file
+			sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff; // out-of-range slots are filled with the largest 32-bit key
+			sortData[i] = gSrc[ addr+i ];
+		sort4Bits(sortData, startBit, lIdx, ldsSortData);
+		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+			keys[i] = (sortData[i]>>startBit) & 0xf; // 4-bit digit of each element; the declaration of keys is not visible in this view
+		{	//	create histogram
+			u32 setIdx = lIdx/16; // 16 consecutive work-items share one histogram set
+			if( lIdx < NUM_BUCKET )
+			{
+				localHistogram[lIdx] = 0; // zero the lower half, which the hIdx-k reads below can reach
+			}
+			ldsSortData[lIdx] = 0; // clears the SET_HISTOGRAM counters, one per work-item
+			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+#if defined(CHECK_BOUNDARY)
+				if( addr+i < n )
+#if defined(NV_GPU)
+				SET_HISTOGRAM( setIdx, keys[i] )++;
+				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );
+			uint hIdx = NUM_BUCKET+lIdx;
+			if( lIdx < NUM_BUCKET )
+			{
+				u32 sum = 0;
+				for(int i=0; i<WG_SIZE/16; i++)
+				{
+					sum += SET_HISTOGRAM( i, lIdx ); // total of bucket lIdx over all histogram sets
+				}
+				myHistogram = sum; // kept to advance the carried offset after the scatter
+				localHistogram[hIdx] = sum;
+			}
+#if defined(USE_2LEVEL_REDUCE)
+			if( lIdx < NUM_BUCKET )
+			{
+				localHistogram[hIdx] = localHistogram[hIdx-1]; // take the previous bucket's count before accumulating
+				u32 u0, u1, u2;
+				u0 = localHistogram[hIdx-3];
+				u1 = localHistogram[hIdx-2];
+				u2 = localHistogram[hIdx-1];
+				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
+				u0 = localHistogram[hIdx-12];
+				u1 = localHistogram[hIdx-8];
+				u2 = localHistogram[hIdx-4];
+				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
+			}
+			if( lIdx < NUM_BUCKET ) // NOTE(review): presumably the alternative branch of the USE_2LEVEL_REDUCE conditional; its #else is not visible here
+			{
+				localHistogram[hIdx] = localHistogram[hIdx-1];
+				localHistogram[hIdx] += localHistogram[hIdx-1];
+				localHistogram[hIdx] += localHistogram[hIdx-2];
+				localHistogram[hIdx] += localHistogram[hIdx-4];
+				localHistogram[hIdx] += localHistogram[hIdx-8];
+			}
+		}
+		{
+			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
+			{
+				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie; // position of the element inside the block
+				int binIdx = keys[ie];
+				int groupOffset = localHistogramToCarry[binIdx]; // where this work-group currently writes for that bucket
+				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx]; // block position minus the value stored for the element's bucket
+#if defined(CHECK_BOUNDARY)
+				if( addr+ie < n )
+				gDst[ groupOffset + myIdx ] = sortData[ie];
+			}
+		}
+		if( lIdx < NUM_BUCKET )
+		{
+			localHistogramToCarry[lIdx] += myHistogram; // the next block continues after the elements just written
+		}
+	}
+//	2 scan, 2 exchange
+void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal) // reorders the work-group's keys through ldsSortData, two bits of the digit at startBit per iteration, and moves sortVal through ldsSortVal with the same offsets
+	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)
+	{
+		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, 
+			(sortData[1]>>(startBit+ibit)) & 0x3, 
+			(sortData[2]>>(startBit+ibit)) & 0x3, 
+			(sortData[3]>>(startBit+ibit)) & 0x3); // 2-bit value (0..3) of each of the four keys
+		u32 key4; // four 8-bit counters packed in one word, one per 2-bit value
+		u32 sKeyPacked[4] = { 0, 0, 0, 0 };
+		{
+			sKeyPacked[0] |= 1<<(8*b.x); // a 1 in the byte that corresponds to the key's 2-bit value
+			sKeyPacked[1] |= 1<<(8*b.y);
+			sKeyPacked[2] |= 1<<(8*b.z);
+			sKeyPacked[3] |= 1<<(8*b.w);
+			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];
+		}
+		u32 rankPacked; // packed counts returned by the work-group scan
+		u32 sumPacked; // packed totals for the whole work-group
+		{
+			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );
+		}
+		u32 newOffset[4] = { 0,0,0,0 };
+		{
+			u32 sumScanned = bit8Scan( sumPacked ); // byte k: total of the byte counters below k
+			u32 scannedKeys[4];
+			scannedKeys[0] = 1<<(8*b.x);
+			scannedKeys[1] = 1<<(8*b.y);
+			scannedKeys[2] = 1<<(8*b.z);
+			scannedKeys[3] = 1<<(8*b.w);
+			{	//	4 scans at once
+				u32 sum4 = 0;
+				for(int ie=0; ie<4; ie++)
+				{
+					u32 tmp = scannedKeys[ie]; // exclusive scan across this work-item's four keys, all four bytes at a time
+					scannedKeys[ie] = sum4;
+					sum4 += tmp;
+				}
+			}
+			{
+				u32 sumPlusRank = sumScanned + rankPacked;
+				{	u32 ie = b.x;
+					scannedKeys[0] += sumPlusRank;
+					newOffset[0] = unpack4Key( scannedKeys[0], ie ); // keep only the byte belonging to this key's 2-bit value
+				}
+				{	u32 ie = b.y;
+					scannedKeys[1] += sumPlusRank;
+					newOffset[1] = unpack4Key( scannedKeys[1], ie );
+				}
+				{	u32 ie = b.z;
+					scannedKeys[2] += sumPlusRank;
+					newOffset[2] = unpack4Key( scannedKeys[2], ie );
+				}
+				{	u32 ie = b.w;
+					scannedKeys[3] += sumPlusRank;
+					newOffset[3] = unpack4Key( scannedKeys[3], ie );
+				}
+			}
+		}
+		{
+			ldsSortData[newOffset[0]] = sortData[0]; // NOTE(review): the scan above used this same buffer and no barrier is visible in this view — confirm against the complete file
+			ldsSortData[newOffset[1]] = sortData[1];
+			ldsSortData[newOffset[2]] = sortData[2];
+			ldsSortData[newOffset[3]] = sortData[3];
+			ldsSortVal[newOffset[0]] = sortVal[0]; // values follow their keys to the same slots
+			ldsSortVal[newOffset[1]] = sortVal[1];
+			ldsSortVal[newOffset[2]] = sortVal[2];
+			ldsSortVal[newOffset[3]] = sortVal[3];
+			u32 dstAddr = 4*lIdx; // this work-item's four home slots
+			sortData[0] = ldsSortData[dstAddr+0];
+			sortData[1] = ldsSortData[dstAddr+1];
+			sortData[2] = ldsSortData[dstAddr+2];
+			sortData[3] = ldsSortData[dstAddr+3];
+			sortVal[0] = ldsSortVal[dstAddr+0];
+			sortVal[1] = ldsSortVal[dstAddr+1];
+			sortVal[2] = ldsSortVal[dstAddr+2];
+			sortVal[3] = ldsSortVal[dstAddr+3];
+		}
+	}
+void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb) // per block: loads key/value records from gSrc, sorts them locally on the 4-bit digit at startBit, then writes them to gDst at offsets taken from rHistogram
+	__local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; // keys during the local sort and, afterwards, the SET_HISTOGRAM counters
+	__local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; // values during the local sort
+	__local u32 localHistogramToCarry[NUM_BUCKET]; // per-bucket destination offset, advanced after every block
+	__local u32 localHistogram[NUM_BUCKET*2]; // the upper half (index NUM_BUCKET+bucket) holds the per-block bucket data
+	u32 gIdx = GET_GLOBAL_IDX; // not used in the visible body
+	u32 lIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;
+	u32 wgSize = GET_GROUP_SIZE; // not used in the visible body
+	const int n = cb.m_n;
+	const int nWGs = cb.m_nWGs;
+	const int startBit = cb.m_startBit;
+	const int nBlocksPerWG = cb.m_nBlocksPerWG;
+	if( lIdx < (NUM_BUCKET) )
+	{
+		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; // starting offset of this work-group inside bucket lIdx
+	}
+	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
+	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; // whole blocks left counting from this group's first block; the integer division drops a trailing partial block
+	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; // first record this work-item reads
+	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
+	{
+		u32 myHistogram = 0;
+		int sortData[ELEMENTS_PER_WORK_ITEM]; // NOTE(review): int here, while sort4Bits1KeyValue declares u32 sortData[4] — also assumes ELEMENTS_PER_WORK_ITEM is 4; the declarations of sortVal and keys are not visible in this view
+		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+#if defined(CHECK_BOUNDARY) // NOTE(review): the #else/#endif lines of the conditionals in this kernel are not visible in this view — confirm against the complete file
+		{
+			sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff; // out-of-range slots are filled with 0xffffffff
+			sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;
+		}
+		{
+			sortData[i] = gSrc[ addr+i ].m_key;
+			sortVal[i] = gSrc[ addr+i ].m_value;
+		}
+		sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);
+		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+			keys[i] = (sortData[i]>>startBit) & 0xf; // 4-bit digit of each key
+		{	//	create histogram
+			u32 setIdx = lIdx/16; // 16 consecutive work-items share one histogram set
+			if( lIdx < NUM_BUCKET )
+			{
+				localHistogram[lIdx] = 0; // zero the lower half, which the hIdx-k reads below can reach
+			}
+			ldsSortData[lIdx] = 0; // clears the SET_HISTOGRAM counters, one per work-item
+			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+#if defined(CHECK_BOUNDARY)
+				if( addr+i < n )
+#if defined(NV_GPU)
+				SET_HISTOGRAM( setIdx, keys[i] )++;
+				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );
+			uint hIdx = NUM_BUCKET+lIdx;
+			if( lIdx < NUM_BUCKET )
+			{
+				u32 sum = 0;
+				for(int i=0; i<WG_SIZE/16; i++)
+				{
+					sum += SET_HISTOGRAM( i, lIdx ); // total of bucket lIdx over all histogram sets
+				}
+				myHistogram = sum; // kept to advance the carried offset after the scatter
+				localHistogram[hIdx] = sum;
+			}
+#if defined(USE_2LEVEL_REDUCE)
+			if( lIdx < NUM_BUCKET )
+			{
+				localHistogram[hIdx] = localHistogram[hIdx-1]; // take the previous bucket's count before accumulating
+				u32 u0, u1, u2;
+				u0 = localHistogram[hIdx-3];
+				u1 = localHistogram[hIdx-2];
+				u2 = localHistogram[hIdx-1];
+				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
+				u0 = localHistogram[hIdx-12];
+				u1 = localHistogram[hIdx-8];
+				u2 = localHistogram[hIdx-4];
+				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
+			}
+			if( lIdx < NUM_BUCKET ) // NOTE(review): presumably the alternative branch of the USE_2LEVEL_REDUCE conditional; its #else is not visible here
+			{
+				localHistogram[hIdx] = localHistogram[hIdx-1];
+				localHistogram[hIdx] += localHistogram[hIdx-1];
+				localHistogram[hIdx] += localHistogram[hIdx-2];
+				localHistogram[hIdx] += localHistogram[hIdx-4];
+				localHistogram[hIdx] += localHistogram[hIdx-8];
+			}
+		}
+    	{
+			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
+			{
+				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie; // position of the record inside the block
+				int binIdx = keys[ie];
+				int groupOffset = localHistogramToCarry[binIdx]; // where this work-group currently writes for that bucket
+				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx]; // block position minus the value stored for the record's bucket
+#if defined(CHECK_BOUNDARY)
+				if( addr+ie < n )
+				{
+                    if ((groupOffset + myIdx)<n)
+                    {
+                        if (sortData[ie]==sortVal[ie]) // NOTE(review): this path stores a record only when its key equals its value — looks like leftover debugging logic; CHECK_BOUNDARY is commented out near the top of the file, confirm before enabling it
+                        {
+                            SortDataCL tmp;
+                            tmp.m_key = sortData[ie];
+                            tmp.m_value = sortVal[ie];
+                            if (tmp.m_key == tmp.m_value)
+                                gDst[groupOffset + myIdx ] = tmp;
+                        }
+                    }
+				}
+                if ((groupOffset + myIdx)<n) // never write at or beyond index n of gDst
+                {
+                    gDst[ groupOffset + myIdx ].m_key = sortData[ie];
+                    gDst[ groupOffset + myIdx ].m_value = sortVal[ie];
+                }
+			}
+		}
+		if( lIdx < NUM_BUCKET )
+		{
+			localHistogramToCarry[lIdx] += myHistogram; // the next block continues after the records just written
+		}
+	}
+void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb) // sequential scatter: one work-item per work-group copies that group's records from gSrc to gDst, bucketed by the 4-bit digit at startBit
+	u32 gIdx = GET_GLOBAL_IDX; // not used in the visible body
+	u32 realLocalIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;
+	u32 wgSize = GET_GROUP_SIZE; // not used in the visible body
+	const int startBit = cb.m_startBit;
+	const int n = cb.m_n;
+	const int nWGs = cb.m_nWGs;
+	const int nBlocksPerWG = cb.m_nBlocksPerWG;
+    int counter[NUM_BUCKET]; // records already written per bucket by this work-group
+    if (realLocalIdx>0) // only local work-item 0 continues; all others leave immediately
+        return;
+    for (int c=0;c<NUM_BUCKET;c++)
+        counter[c]=0;
+    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
+	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; // whole blocks left counting from this group's first block; the integer division drops a trailing partial block
+   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)
+  {
+     for (int lIdx=0;lIdx<WG_SIZE;lIdx++) // loops over the local indices one after another, so records are visited in increasing address order
+ 	{
+        int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; // first record for local index lIdx in this block
+		for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
+		{
+            int i = addr2+j;
+			if( i < n )
+			{
+                int tableIdx;
+				tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1
+                gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; // the group's offset for that bucket plus the number already written to it
+                counter[tableIdx] ++;
+			}
+		}
+	}
+  }
+void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb ) // sequential scatter: one work-item per work-group copies that group's keys from gSrc to gDst, bucketed by the 4-bit digit at startBit
+	u32 gIdx = GET_GLOBAL_IDX; // not used in the visible body
+	u32 realLocalIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;
+	u32 wgSize = GET_GROUP_SIZE; // not used in the visible body
+	const int startBit = cb.m_startBit;
+	const int n = cb.m_n;
+	const int nWGs = cb.m_nWGs;
+	const int nBlocksPerWG = cb.m_nBlocksPerWG;
+    int counter[NUM_BUCKET]; // keys already written per bucket by this work-group
+    if (realLocalIdx>0) // only local work-item 0 continues; all others leave immediately
+        return;
+    for (int c=0;c<NUM_BUCKET;c++)
+        counter[c]=0;
+    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
+	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; // whole blocks left counting from this group's first block; the integer division drops a trailing partial block
+   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)
+  {
+     for (int lIdx=0;lIdx<WG_SIZE;lIdx++) // loops over the local indices one after another, so keys are visited in increasing address order
+ 	{
+        int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; // first key for local index lIdx in this block
+		for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
+		{
+            int i = addr2+j;
+			if( i < n )
+			{
+                int tableIdx;
+				tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1
+                gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; // the group's offset for that bucket plus the number already written to it
+                counter[tableIdx] ++;
+			}
+		}
+	}
+  }
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h
new file mode 100644
index 00000000..8876c16a
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h
@@ -0,0 +1,910 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* radixSort32KernelsCL= \
+"Bullet Continuous Collision Detection and Physics Library\n"
+"Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Author Takahiro Harada\n"
+"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define make_uint4 (uint4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"#define WG_SIZE 64\n"
+"#define BITS_PER_PASS 4\n"
+"#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
+"typedef uchar u8;\n"
+"//	this isn't optimization for VLIW. But just reducing writes. \n"
+"#define USE_2LEVEL_REDUCE 1\n"
+"//#define CHECK_BOUNDARY 1\n"
+"//#define NV_GPU 1\n"
+"//	Cypress\n"
+"#define nPerWI 16\n"
+"//	Cayman\n"
+"//#define nPerWI 20\n"
+"#define m_n x\n"
+"#define m_nWGs y\n"
+"#define m_startBit z\n"
+"#define m_nBlocksPerWG w\n"
+"typedef struct\n"
+"	int m_n;\n"
+"	int m_nWGs;\n"
+"	int m_startBit;\n"
+"	int m_nBlocksPerWG;\n"
+"} ConstBuffer;\n"
+"typedef struct\n"
+"	unsigned int m_key;\n"
+"	unsigned int m_value;\n"
+"} SortDataCL;\n"
+"uint prefixScanVectorEx( uint4* data )\n"
+"	u32 sum = 0;\n"
+"	u32 tmp = data[0].x;\n"
+"	data[0].x = sum;\n"
+"	sum += tmp;\n"
+"	tmp = data[0].y;\n"
+"	data[0].y = sum;\n"
+"	sum += tmp;\n"
+"	tmp = data[0].z;\n"
+"	data[0].z = sum;\n"
+"	sum += tmp;\n"
+"	tmp = data[0].w;\n"
+"	data[0].w = sum;\n"
+"	sum += tmp;\n"
+"	return sum;\n"
+"u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n"
+"	{	//	Set data\n"
+"		sorterSharedMemory[lIdx] = 0;\n"
+"		sorterSharedMemory[lIdx+wgSize] = pData;\n"
+"	}\n"
+"	{	//	Prefix sum\n"
+"		int idx = 2*lIdx + (wgSize+1);\n"
+"#if defined(USE_2LEVEL_REDUCE)\n"
+"		if( lIdx < 64 )\n"
+"		{\n"
+"			u32 u0, u1, u2;\n"
+"			u0 = sorterSharedMemory[idx-3];\n"
+"			u1 = sorterSharedMemory[idx-2];\n"
+"			u2 = sorterSharedMemory[idx-1];\n"
+"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
+"			u0 = sorterSharedMemory[idx-12];\n"
+"			u1 = sorterSharedMemory[idx-8];\n"
+"			u2 = sorterSharedMemory[idx-4];\n"
+"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
+"			u0 = sorterSharedMemory[idx-48];\n"
+"			u1 = sorterSharedMemory[idx-32];\n"
+"			u2 = sorterSharedMemory[idx-16];\n"
+"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
+"			if( wgSize > 64 )\n"
+"			{\n"
+"				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
+"			}\n"
+"			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
+"		}\n"
+"		if( lIdx < 64 )\n"
+"		{\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];			\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n"
+"			if( wgSize > 64 )\n"
+"			{\n"
+"				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
+"			}\n"
+"			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
+"		}\n"
+"	}\n"
+"	*totalSum = sorterSharedMemory[wgSize*2-1];\n"
+"	u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n"
+"	return addValue;\n"
+"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
+"	u32 s4 = prefixScanVectorEx( &pData );\n"
+"	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n"
+"	return pData + make_uint4( rank, rank, rank, rank );\n"
+"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
+"	u32 s4 = prefixScanVectorEx( &pData );\n"
+"	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n"
+"	return pData + make_uint4( rank, rank, rank, rank );\n"
+"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n"
+"u32 bit8Scan(u32 v)\n"
+"	return (v<<8) + (v<<16) + (v<<24);\n"
+"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n" // Counter for bucket idx owned by the calling work-item; expands to a use of the local names localHistogramMat and lIdx.
+"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n" // Counts how many keys in this work-group's blocks fall into each bucket of the digit at startBit and stores one total per (bucket, work-group) in histogramOut.
+"	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" // NUM_BUCKET rows of WG_SIZE per-work-item counters
+"	u32 gIdx = GET_GLOBAL_IDX;\n" // not referenced again in the visible body
+"	u32 lIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n" // not referenced again in the visible body
+"	const int startBit = cb.m_startBit;\n" // m_startBit, m_n, m_nWGs and m_nBlocksPerWG are read from an int4, so they are presumably macro aliases for its components (defined outside this chunk - confirm)
+"	const int n = cb.m_n;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"	for(int i=0; i<NUM_BUCKET; i++)\n" // clear this work-item's own column of counters
+"	{\n"
+"		MY_HISTOGRAM(i) = 0;\n"
+"	}\n"
+"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" // keys consumed by the whole work-group per iteration
+"	u32 localKey;\n"
+"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" // whole blocks left from this work-group's first block onwards; the integer division drops any partial block at the end of the input
+"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" // first key this work-item reads; each work-item takes ELEMENTS_PER_WORK_ITEM consecutive keys
+"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" // at most nBlocksPerWG blocks; a non-positive nBlocks skips the loop
+"	{\n"
+"		//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
+"		//	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
+"		//	AMD: AtomInc performs better while NV prefers ++\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"		{\n"
+"#if defined(CHECK_BOUNDARY)\n" // NOTE(review): the #endif lines matching this test and the NV_GPU test below are not visible in this view - confirm against the .cl source
+"			if( addr+i < n )\n"
+"			{\n"
+"				localKey = (gSrc[addr+i]>>startBit) & 0xf;\n" // 4-bit digit, values 0..15, used as the bucket index
+"#if defined(NV_GPU)\n"
+"				MY_HISTOGRAM( localKey )++;\n" // plain increment of a counter in this work-item's own column
+"				AtomInc( MY_HISTOGRAM( localKey ) );\n" // atomic form of the same increment; NOTE(review): no #else separates it from the line above in this view
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if( lIdx < NUM_BUCKET )\n" // work-item k reduces row k, i.e. bucket k over all work-items; NOTE(review): no work-group barrier is visible before this cross-work-item read in this view - confirm against the .cl source
+"	{\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<GET_GROUP_SIZE; i++)\n"
+"		{\n"
+"			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" // start column is rotated by lIdx, presumably to spread simultaneous reads over local-memory banks (confirm); the row stride is WG_SIZE while the wrap uses the launched group size
+"		}\n"
+"		histogramOut[lIdx*nWGs+wgIdx] = sum;\n" // bucket-major layout: nWGs consecutive entries per bucket
+"	}\n"
+"void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4  cb )\n" // Same counting pass as StreamCountKernel, but the digit is taken from the m_key member of SortDataCL records.
+"	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" // NUM_BUCKET rows of WG_SIZE per-work-item counters, addressed through MY_HISTOGRAM
+"	u32 gIdx = GET_GLOBAL_IDX;\n" // not referenced again in the visible body
+"	u32 lIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n" // not referenced again in the visible body
+"	const int startBit = cb.m_startBit;\n" // the m_* names are read from an int4, so they are presumably macro aliases for its components (defined outside this chunk - confirm)
+"	const int n = cb.m_n;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"	for(int i=0; i<NUM_BUCKET; i++)\n" // clear this work-item's own column of counters
+"	{\n"
+"		MY_HISTOGRAM(i) = 0;\n"
+"	}\n"
+"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" // records consumed by the whole work-group per iteration
+"	u32 localKey;\n"
+"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" // whole blocks left from this work-group's first block onwards; the integer division drops any partial block at the end of the input
+"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" // first record this work-item reads
+"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" // at most nBlocksPerWG blocks; a non-positive nBlocks skips the loop
+"	{\n"
+"		//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
+"		//	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
+"		//	AMD: AtomInc performs better while NV prefers ++\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"		{\n"
+"#if defined(CHECK_BOUNDARY)\n" // NOTE(review): the #endif lines matching this test and the NV_GPU test below are not visible in this view - confirm against the .cl source
+"			if( addr+i < n )\n"
+"			{\n"
+"				localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n" // 4-bit digit of the record key, values 0..15, used as the bucket index
+"#if defined(NV_GPU)\n"
+"				MY_HISTOGRAM( localKey )++;\n" // plain increment of a counter in this work-item's own column
+"				AtomInc( MY_HISTOGRAM( localKey ) );\n" // atomic form of the same increment; NOTE(review): no #else separates it from the line above in this view
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if( lIdx < NUM_BUCKET )\n" // work-item k reduces row k, i.e. bucket k over all work-items; NOTE(review): no work-group barrier is visible before this cross-work-item read in this view - confirm against the .cl source
+"	{\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<GET_GROUP_SIZE; i++)\n"
+"		{\n"
+"			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" // start column is rotated by lIdx, presumably to spread simultaneous reads over local-memory banks (confirm)
+"		}\n"
+"		histogramOut[lIdx*nWGs+wgIdx] = sum;\n" // bucket-major layout: nWGs consecutive entries per bucket
+"	}\n"
+"#define nPerLane (nPerWI/4)\n" // the nPerWI values held by one work-item are treated as 4 lanes of nPerLane consecutive values; nPerWI is defined outside this chunk
+"//	NUM_BUCKET*nWGs < 128*nPerWI\n"
+"void PrefixScanKernel( __global u32* wHistogram1, int4  cb )\n" // Rewrites the first NUM_BUCKET*nWGs entries of wHistogram1 in place with running offsets (a prefix sum); the fixed 128-wide helper and scratch buffer suggest a single work-group of 128 work-items - confirm at the launch site.
+"	__local u32 ldsTopScanData[128*2];\n" // scratch buffer handed to localPrefixSum128V
+"	u32 lIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n" // not referenced again in the visible body
+"	const int nWGs = cb.m_nWGs;\n"
+"	u32 data[nPerWI];\n" // this work-item's nPerWI consecutive histogram entries
+"	for(int i=0; i<nPerWI; i++)\n"
+"	{\n"
+"		data[i] = 0;\n" // entries past the end of the histogram stay zero so they do not affect the sums
+"		if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n"
+"			data[i] = wHistogram1[nPerWI*lIdx+i];\n"
+"	}\n"
+"	uint4 myData = make_uint4(0,0,0,0);\n"
+"	for(int i=0; i<nPerLane; i++)\n" // per-lane totals of this work-item's values
+"	{\n"
+"		myData.x += data[nPerLane*0+i];\n"
+"		myData.y += data[nPerLane*1+i];\n"
+"		myData.z += data[nPerLane*2+i];\n"
+"		myData.w += data[nPerLane*3+i];\n"
+"	}\n"
+"	uint totalSum;\n" // filled by the call below and not read afterwards
+"	uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n" // per-lane offsets across the whole group; presumably exclusive, which the final result depends on (confirm against prefixScanVectorEx/localPrefixSum)
+"//	for(int j=0; j<4; j++) //	somehow it introduces a lot of branches\n"
+"	{	int j = 0;\n" // lane 0: exclusive scan in place (the running sum is stored before the current value is added); lanes 1-3 below repeat this with j fixed
+"		u32 sum = 0;\n"
+"		for(int i=0; i<nPerLane; i++)\n"
+"		{\n"
+"			u32 tmp = data[nPerLane*j+i];\n"
+"			data[nPerLane*j+i] = sum;\n"
+"			sum += tmp;\n"
+"		}\n"
+"	}\n"
+"	{	int j = 1;\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<nPerLane; i++)\n"
+"		{\n"
+"			u32 tmp = data[nPerLane*j+i];\n"
+"			data[nPerLane*j+i] = sum;\n"
+"			sum += tmp;\n"
+"		}\n"
+"	}\n"
+"	{	int j = 2;\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<nPerLane; i++)\n"
+"		{\n"
+"			u32 tmp = data[nPerLane*j+i];\n"
+"			data[nPerLane*j+i] = sum;\n"
+"			sum += tmp;\n"
+"		}\n"
+"	}\n"
+"	{	int j = 3;\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<nPerLane; i++)\n"
+"		{\n"
+"			u32 tmp = data[nPerLane*j+i];\n"
+"			data[nPerLane*j+i] = sum;\n"
+"			sum += tmp;\n"
+"		}\n"
+"	}\n"
+"	for(int i=0; i<nPerLane; i++)\n" // add the group-wide lane offsets to the lane-local scans
+"	{\n"
+"		data[nPerLane*0+i] += scanned.x;\n"
+"		data[nPerLane*1+i] += scanned.y;\n"
+"		data[nPerLane*2+i] += scanned.z;\n"
+"		data[nPerLane*3+i] += scanned.w;\n"
+"	}\n"
+"	for(int i=0; i<nPerWI; i++)\n" // write back with the same bounds test as the load
+"	{\n"
+"		int index = nPerWI*lIdx+i;\n"
+"		if (index < NUM_BUCKET*nWGs)\n"
+"			wHistogram1[nPerWI*lIdx+i] = data[i];\n"
+"	}\n"
+"//	4 scan, 4 exchange\n"
+"void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" // Reorders the work-group's keys (4 per work-item, exchanged through ldsSortData) by the BITS_PER_PASS bits starting at startBit, one bit per iteration, lowest bit first.
+"	for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n"
+"	{\n"
+"		u32 mask = (1<<bitIdx);\n" // bit examined in this iteration, relative to startBit
+"		uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n" // non-zero in the lanes whose key has the bit set
+"		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n" // assuming SELECT_UINT4(a,b,c) follows OpenCL select (b where c is true): 1 for bit-clear keys, 0 for bit-set keys - confirm against the macro
+"		u32 total;\n"
+"		prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n" // group-wide scan of the flags (group size fixed at 64 by this helper); total presumably receives the number of flagged keys
+"		{\n"
+"			uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n" // this work-item's four slots in ldsSortData
+"			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n" // slot behind all flagged keys: own index minus the flagged keys before it, plus total
+"			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n" // under the select assumption above: bit-set keys keep that slot, bit-clear keys go to their scan rank
+"			ldsSortData[dstAddr.x] = sortData[0];\n" // scatter to the new order; NOTE(review): no barrier is visible between these stores and the loads below in this view - confirm against the .cl source
+"			ldsSortData[dstAddr.y] = sortData[1];\n"
+"			ldsSortData[dstAddr.z] = sortData[2];\n"
+"			ldsSortData[dstAddr.w] = sortData[3];\n"
+"			sortData[0] = ldsSortData[localAddr.x];\n" // read back the four keys that now sit in this work-item's slots
+"			sortData[1] = ldsSortData[localAddr.y];\n"
+"			sortData[2] = ldsSortData[localAddr.z];\n"
+"			sortData[3] = ldsSortData[localAddr.w];\n"
+"		}\n"
+"	}\n"
+"//	2 scan, 2 exchange\n"
+"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" // Reorders the work-group's keys (4 per work-item, exchanged through ldsSortData) by BITS_PER_PASS bits from startBit, two bits per iteration, using four 8-bit counters packed into one u32.
+"	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
+"	{\n"
+"		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" // 2-bit digit (0..3) of each of the four keys
+"			(sortData[1]>>(startBit+ibit)) & 0x3, \n"
+"			(sortData[2]>>(startBit+ibit)) & 0x3, \n"
+"			(sortData[3]>>(startBit+ibit)) & 0x3);\n"
+"		u32 key4;\n"
+"		u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
+"		{\n"
+"			sKeyPacked[0] |= 1<<(8*b.x);\n" // a one in the byte field that belongs to the key's digit
+"			sKeyPacked[1] |= 1<<(8*b.y);\n"
+"			sKeyPacked[2] |= 1<<(8*b.z);\n"
+"			sKeyPacked[3] |= 1<<(8*b.w);\n"
+"			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" // per-digit counts of this work-item's keys, one count per byte
+"		}\n"
+"		u32 rankPacked;\n"
+"		u32 sumPacked;\n"
+"		{\n"
+"			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" // one scan handles all four digit counters at once; sumPacked presumably receives the group totals (confirm against localPrefixSum)
+"		}\n"
+"		u32 newOffset[4] = { 0,0,0,0 };\n"
+"		{\n"
+"			u32 sumScanned = bit8Scan( sumPacked );\n" // exclusive scan of the four packed totals: each byte is the sum of the totals of the lower digits
+"			u32 scannedKeys[4];\n"
+"			scannedKeys[0] = 1<<(8*b.x);\n"
+"			scannedKeys[1] = 1<<(8*b.y);\n"
+"			scannedKeys[2] = 1<<(8*b.z);\n"
+"			scannedKeys[3] = 1<<(8*b.w);\n"
+"			{	//	4 scans at once\n"
+"				u32 sum4 = 0;\n"
+"				for(int ie=0; ie<4; ie++)\n" // exclusive scan over this work-item's own four keys, still packed per digit
+"				{\n"
+"					u32 tmp = scannedKeys[ie];\n"
+"					scannedKeys[ie] = sum4;\n"
+"					sum4 += tmp;\n"
+"				}\n"
+"			}\n"
+"			{\n"
+"				u32 sumPlusRank = sumScanned + rankPacked;\n" // packed per-digit sum of the scanned totals and this work-item's rank
+"				{	u32 ie = b.x;\n"
+"					scannedKeys[0] += sumPlusRank;\n"
+"					newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" // keep only the byte of the key's own digit; the 8-bit field limits offsets to 0..255
+"				}\n"
+"				{	u32 ie = b.y;\n"
+"					scannedKeys[1] += sumPlusRank;\n"
+"					newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
+"				}\n"
+"				{	u32 ie = b.z;\n"
+"					scannedKeys[2] += sumPlusRank;\n"
+"					newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
+"				}\n"
+"				{	u32 ie = b.w;\n"
+"					scannedKeys[3] += sumPlusRank;\n"
+"					newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		{\n"
+"			ldsSortData[newOffset[0]] = sortData[0];\n" // scatter to the new order; NOTE(review): ldsSortData was also the scan scratch buffer and no barrier is visible around this exchange in this view - confirm against the .cl source
+"			ldsSortData[newOffset[1]] = sortData[1];\n"
+"			ldsSortData[newOffset[2]] = sortData[2];\n"
+"			ldsSortData[newOffset[3]] = sortData[3];\n"
+"			u32 dstAddr = 4*lIdx;\n" // this work-item's four consecutive slots
+"			sortData[0] = ldsSortData[dstAddr+0];\n"
+"			sortData[1] = ldsSortData[dstAddr+1];\n"
+"			sortData[2] = ldsSortData[dstAddr+2];\n"
+"			sortData[3] = ldsSortData[dstAddr+3];\n"
+"		}\n"
+"	}\n"
+"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n" // Reuses ldsSortData as sets of NUM_BUCKET counters; expands to a use of the local name ldsSortData.
+"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )\n" // Per block: loads keys, orders them inside the work-group by the 4-bit digit at startBit, histograms the digits and writes each key to gDst at the bucket's running offset taken from rHistogram.
+"	__local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" // exchange buffer for the local sort, later reused for the histogram sets
+"	__local u32 localHistogramToCarry[NUM_BUCKET];\n" // next free output position per bucket for this work-group
+"	__local u32 localHistogram[NUM_BUCKET*2];\n" // lower half is zeroed each block, upper half holds the per-bucket counts and their scan
+"	u32 gIdx = GET_GLOBAL_IDX;\n" // not referenced again in the visible body
+"	u32 lIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n" // not referenced again in the visible body
+"	const int n = cb.m_n;\n" // the m_* names are read from an int4, so they are presumably macro aliases for its components (defined outside this chunk - confirm)
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int startBit = cb.m_startBit;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"	if( lIdx < (NUM_BUCKET) )\n"
+"	{\n"
+"		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" // same bucket-major indexing as the count kernels use when writing their histogram
+"	}\n"
+"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+"	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" // whole blocks left from this work-group's first block onwards; the integer division drops any partial block at the end of the input
+"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" // first key this work-item reads
+"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
+"	{\n"
+"		u32 myHistogram = 0;\n"
+"		u32 sortData[ELEMENTS_PER_WORK_ITEM];\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"#if defined(CHECK_BOUNDARY)\n" // NOTE(review): no #else/#endif is visible between the two load variants in this view - confirm against the .cl source
+"			sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;\n" // out-of-range slots are padded with the largest key
+"			sortData[i] = gSrc[ addr+i ];\n"
+"		sort4Bits(sortData, startBit, lIdx, ldsSortData);\n" // the callee takes u32[4], so ELEMENTS_PER_WORK_ITEM is presumably 4 (confirm)
+"		u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"			keys[i] = (sortData[i]>>startBit) & 0xf;\n" // 4-bit digit of each key after the local reorder
+"		{	//	create histogram\n"
+"			u32 setIdx = lIdx/16;\n" // 16 consecutive work-items share one set of counters
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[lIdx] = 0;\n" // zero the lower half, which the look-back reads further down rely on
+"			}\n"
+"			ldsSortData[lIdx] = 0;\n" // clears one counter per work-item, i.e. the first group-size entries used by SET_HISTOGRAM
+"			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"#if defined(CHECK_BOUNDARY)\n" // NOTE(review): the #else/#endif lines for this test and the NV_GPU test are not visible in this view
+"				if( addr+i < n )\n"
+"#if defined(NV_GPU)\n"
+"				SET_HISTOGRAM( setIdx, keys[i] )++;\n" // NOTE(review): several work-items share a set, so this non-atomic form can target the same counter from different work-items - confirm the intent upstream
+"				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
+"			\n"
+"			\n"
+"			uint hIdx = NUM_BUCKET+lIdx;\n" // this work-item's entry in the upper half of localHistogram
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				u32 sum = 0;\n"
+"				for(int i=0; i<WG_SIZE/16; i++)\n" // add up bucket lIdx over all WG_SIZE/16 sets
+"				{\n"
+"					sum += SET_HISTOGRAM( i, lIdx );\n"
+"				}\n"
+"				myHistogram = sum;\n" // kept to advance the carry offset at the end of the block
+"				localHistogram[hIdx] = sum;\n"
+"			}\n"
+"#if defined(USE_2LEVEL_REDUCE)\n" // NOTE(review): two alternative scans of the upper half follow; the #else/#endif separating them is not visible in this view
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[hIdx] = localHistogram[hIdx-1];\n" // shift by one entry; for lIdx 0 this reads the zeroed lower half
+"				u32 u0, u1, u2;\n"
+"				u0 = localHistogram[hIdx-3];\n"
+"				u1 = localHistogram[hIdx-2];\n"
+"				u2 = localHistogram[hIdx-1];\n"
+"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+"				u0 = localHistogram[hIdx-12];\n"
+"				u1 = localHistogram[hIdx-8];\n"
+"				u2 = localHistogram[hIdx-4];\n"
+"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+"			}\n"
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[hIdx] = localHistogram[hIdx-1];\n" // shift by one entry, then add look-back distances 1, 2, 4 and 8
+"				localHistogram[hIdx] += localHistogram[hIdx-1];\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-2];\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-4];\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-8];\n"
+"			}\n"
+"		}\n"
+"		{\n"
+"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
+"			{\n"
+"				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" // position of the key inside the locally ordered block
+"				int binIdx = keys[ie];\n"
+"				int groupOffset = localHistogramToCarry[binIdx];\n" // where this work-group's next key of that bucket goes in gDst
+"				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" // position inside its bucket: block position minus the scanned count stored for that bucket
+"#if defined(CHECK_BOUNDARY)\n"
+"				if( addr+ie < n )\n"
+"				gDst[ groupOffset + myIdx ] = sortData[ie];\n"
+"			}\n"
+"		}\n"
+"		if( lIdx < NUM_BUCKET )\n"
+"		{\n"
+"			localHistogramToCarry[lIdx] += myHistogram;\n" // advance the bucket's output position by this block's count
+"		}\n"
+"	}\n"
+"//	2 scan, 2 exchange\n"
+"void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n" // Key/value form of sort4Bits1: keys are reordered two bits per iteration and each value is moved to the same slot as its key through ldsSortVal.
+"	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
+"	{\n"
+"		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" // 2-bit digit (0..3) of each of the four keys
+"			(sortData[1]>>(startBit+ibit)) & 0x3, \n"
+"			(sortData[2]>>(startBit+ibit)) & 0x3, \n"
+"			(sortData[3]>>(startBit+ibit)) & 0x3);\n"
+"		u32 key4;\n"
+"		u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
+"		{\n"
+"			sKeyPacked[0] |= 1<<(8*b.x);\n" // a one in the byte field that belongs to the key's digit
+"			sKeyPacked[1] |= 1<<(8*b.y);\n"
+"			sKeyPacked[2] |= 1<<(8*b.z);\n"
+"			sKeyPacked[3] |= 1<<(8*b.w);\n"
+"			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" // per-digit counts of this work-item's keys, one count per byte
+"		}\n"
+"		u32 rankPacked;\n"
+"		u32 sumPacked;\n"
+"		{\n"
+"			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" // one scan handles all four digit counters at once; sumPacked presumably receives the group totals (confirm against localPrefixSum)
+"		}\n"
+"		u32 newOffset[4] = { 0,0,0,0 };\n"
+"		{\n"
+"			u32 sumScanned = bit8Scan( sumPacked );\n" // exclusive scan of the four packed totals: each byte is the sum of the totals of the lower digits
+"			u32 scannedKeys[4];\n"
+"			scannedKeys[0] = 1<<(8*b.x);\n"
+"			scannedKeys[1] = 1<<(8*b.y);\n"
+"			scannedKeys[2] = 1<<(8*b.z);\n"
+"			scannedKeys[3] = 1<<(8*b.w);\n"
+"			{	//	4 scans at once\n"
+"				u32 sum4 = 0;\n"
+"				for(int ie=0; ie<4; ie++)\n" // exclusive scan over this work-item's own four keys, still packed per digit
+"				{\n"
+"					u32 tmp = scannedKeys[ie];\n"
+"					scannedKeys[ie] = sum4;\n"
+"					sum4 += tmp;\n"
+"				}\n"
+"			}\n"
+"			{\n"
+"				u32 sumPlusRank = sumScanned + rankPacked;\n" // packed per-digit sum of the scanned totals and this work-item's rank
+"				{	u32 ie = b.x;\n"
+"					scannedKeys[0] += sumPlusRank;\n"
+"					newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" // keep only the byte of the key's own digit; the 8-bit field limits offsets to 0..255
+"				}\n"
+"				{	u32 ie = b.y;\n"
+"					scannedKeys[1] += sumPlusRank;\n"
+"					newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
+"				}\n"
+"				{	u32 ie = b.z;\n"
+"					scannedKeys[2] += sumPlusRank;\n"
+"					newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
+"				}\n"
+"				{	u32 ie = b.w;\n"
+"					scannedKeys[3] += sumPlusRank;\n"
+"					newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		{\n"
+"			ldsSortData[newOffset[0]] = sortData[0];\n" // scatter keys; NOTE(review): ldsSortData was also the scan scratch buffer and no barrier is visible around this exchange in this view - confirm against the .cl source
+"			ldsSortData[newOffset[1]] = sortData[1];\n"
+"			ldsSortData[newOffset[2]] = sortData[2];\n"
+"			ldsSortData[newOffset[3]] = sortData[3];\n"
+"			ldsSortVal[newOffset[0]] = sortVal[0];\n" // values follow their keys to the same slots
+"			ldsSortVal[newOffset[1]] = sortVal[1];\n"
+"			ldsSortVal[newOffset[2]] = sortVal[2];\n"
+"			ldsSortVal[newOffset[3]] = sortVal[3];\n"
+"			u32 dstAddr = 4*lIdx;\n" // this work-item's four consecutive slots
+"			sortData[0] = ldsSortData[dstAddr+0];\n"
+"			sortData[1] = ldsSortData[dstAddr+1];\n"
+"			sortData[2] = ldsSortData[dstAddr+2];\n"
+"			sortData[3] = ldsSortData[dstAddr+3];\n"
+"			sortVal[0] = ldsSortVal[dstAddr+0];\n"
+"			sortVal[1] = ldsSortVal[dstAddr+1];\n"
+"			sortVal[2] = ldsSortVal[dstAddr+2];\n"
+"			sortVal[3] = ldsSortVal[dstAddr+3];\n"
+"		}\n"
+"	}\n"
+"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" // Key/value form of SortAndScatterKernel: reorders (m_key, m_value) records inside the work-group by the 4-bit digit at startBit and writes them to gDst at the bucket offsets taken from rHistogram.
+"	__local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" // NOTE(review): declared int but passed to a __local u32* parameter of sort4Bits1KeyValue below
+"	__local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
+"	__local u32 localHistogramToCarry[NUM_BUCKET];\n" // next free output position per bucket for this work-group
+"	__local u32 localHistogram[NUM_BUCKET*2];\n" // lower half is zeroed each block, upper half holds the per-bucket counts and their scan
+"	u32 gIdx = GET_GLOBAL_IDX;\n" // not referenced again in the visible body
+"	u32 lIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n" // not referenced again in the visible body
+"	const int n = cb.m_n;\n" // the m_* names are read from an int4, so they are presumably macro aliases for its components (defined outside this chunk - confirm)
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int startBit = cb.m_startBit;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"	if( lIdx < (NUM_BUCKET) )\n"
+"	{\n"
+"		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" // same bucket-major indexing as the count kernels use when writing their histogram
+"	}\n"
+"    \n"
+"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+"	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" // whole blocks left from this work-group's first block onwards; the integer division drops any partial block at the end of the input
+"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" // first record this work-item reads
+"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
+"	{\n"
+"		u32 myHistogram = 0;\n"
+"		int sortData[ELEMENTS_PER_WORK_ITEM];\n" // NOTE(review): int here, u32[4] in the callee signature; the right shifts on these values below are therefore signed
+"		int sortVal[ELEMENTS_PER_WORK_ITEM];\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"#if defined(CHECK_BOUNDARY)\n" // NOTE(review): no #else/#endif is visible between the two load variants in this view - confirm against the .cl source
+"		{\n"
+"			sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n" // out-of-range slots are padded with all-ones
+"			sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;\n"
+"		}\n"
+"		{\n"
+"			sortData[i] = gSrc[ addr+i ].m_key;\n"
+"			sortVal[i] = gSrc[ addr+i ].m_value;\n"
+"		}\n"
+"		sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n" // the callee takes 4-element arrays, so ELEMENTS_PER_WORK_ITEM is presumably 4 (confirm)
+"		u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"			keys[i] = (sortData[i]>>startBit) & 0xf;\n" // 4-bit digit of each key after the local reorder
+"		{	//	create histogram\n"
+"			u32 setIdx = lIdx/16;\n" // 16 consecutive work-items share one set of counters
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[lIdx] = 0;\n" // zero the lower half, which the look-back reads further down rely on
+"			}\n"
+"			ldsSortData[lIdx] = 0;\n" // clears one counter per work-item, i.e. the first group-size entries used by SET_HISTOGRAM
+"			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"#if defined(CHECK_BOUNDARY)\n" // NOTE(review): the #else/#endif lines for this test and the NV_GPU test are not visible in this view
+"				if( addr+i < n )\n"
+"#if defined(NV_GPU)\n"
+"				SET_HISTOGRAM( setIdx, keys[i] )++;\n" // NOTE(review): several work-items share a set, so this non-atomic form can target the same counter from different work-items - confirm the intent upstream
+"				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
+"			\n"
+"			\n"
+"			uint hIdx = NUM_BUCKET+lIdx;\n" // this work-item's entry in the upper half of localHistogram
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				u32 sum = 0;\n"
+"				for(int i=0; i<WG_SIZE/16; i++)\n" // add up bucket lIdx over all WG_SIZE/16 sets
+"				{\n"
+"					sum += SET_HISTOGRAM( i, lIdx );\n"
+"				}\n"
+"				myHistogram = sum;\n" // kept to advance the carry offset at the end of the block
+"				localHistogram[hIdx] = sum;\n"
+"			}\n"
+"#if defined(USE_2LEVEL_REDUCE)\n" // NOTE(review): two alternative scans of the upper half follow; the #else/#endif separating them is not visible in this view
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[hIdx] = localHistogram[hIdx-1];\n" // shift by one entry; for lIdx 0 this reads the zeroed lower half
+"				u32 u0, u1, u2;\n"
+"				u0 = localHistogram[hIdx-3];\n"
+"				u1 = localHistogram[hIdx-2];\n"
+"				u2 = localHistogram[hIdx-1];\n"
+"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+"				u0 = localHistogram[hIdx-12];\n"
+"				u1 = localHistogram[hIdx-8];\n"
+"				u2 = localHistogram[hIdx-4];\n"
+"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+"			}\n"
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[hIdx] = localHistogram[hIdx-1];\n" // shift by one entry, then add look-back distances 1, 2, 4 and 8
+"				localHistogram[hIdx] += localHistogram[hIdx-1];\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-2];\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-4];\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-8];\n"
+"			}\n"
+"		}\n"
+"    	{\n"
+"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
+"			{\n"
+"				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" // position of the record inside the locally ordered block
+"				int binIdx = keys[ie];\n"
+"				int groupOffset = localHistogramToCarry[binIdx];\n" // where this work-group's next record of that bucket goes in gDst
+"				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" // position inside its bucket: block position minus the scanned count stored for that bucket
+"#if defined(CHECK_BOUNDARY)\n" // NOTE(review): the #else/#endif separating the two store variants below is not visible in this view
+"				if( addr+ie < n )\n"
+"				{\n"
+"                    if ((groupOffset + myIdx)<n)\n"
+"                    {\n"
+"                        if (sortData[ie]==sortVal[ie])\n" // NOTE(review): this variant stores a record only when its key equals its value, which looks like leftover debugging - confirm before relying on CHECK_BOUNDARY builds
+"                        {\n"
+"                            \n"
+"                            SortDataCL tmp;\n"
+"                            tmp.m_key = sortData[ie];\n"
+"                            tmp.m_value = sortVal[ie];\n"
+"                            if (tmp.m_key == tmp.m_value)\n"
+"                                gDst[groupOffset + myIdx ] = tmp;\n"
+"                        }\n"
+"                        \n"
+"                    }\n"
+"				}\n"
+"                if ((groupOffset + myIdx)<n)\n" // guards the destination index instead of the source index
+"                {\n"
+"                    gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n"
+"                    gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n"
+"                }\n"
+"			}\n"
+"		}\n"
+"		if( lIdx < NUM_BUCKET )\n"
+"		{\n"
+"			localHistogramToCarry[lIdx] += myHistogram;\n" // advance the bucket's output position by this block's count
+"		}\n"
+"	}\n"
+"void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" // Serial scatter: work-item 0 of each work-group walks the group's records in input order and appends each one to its bucket in gDst.
+"    \n"
+"	u32 gIdx = GET_GLOBAL_IDX;\n" // not referenced again in the visible body
+"	u32 realLocalIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n" // not referenced again in the visible body
+"	const int startBit = cb.m_startBit;\n" // the m_* names are read from an int4, so they are presumably macro aliases for its components (defined outside this chunk - confirm)
+"	const int n = cb.m_n;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"    int counter[NUM_BUCKET];\n" // records already written per bucket by this work-group
+"    \n"
+"    if (realLocalIdx>0)\n" // every work-item except the first leaves immediately
+"        return;\n"
+"    \n"
+"    for (int c=0;c<NUM_BUCKET;c++)\n"
+"        counter[c]=0;\n"
+"    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+"	\n"
+"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" // whole blocks left from this work-group's first block onwards; the integer division drops any partial block at the end of the input
+"   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
+"  {\n"
+"     for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" // emulates the work-items of the parallel kernels one after another
+" 	{\n"
+"        int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+"        \n"
+"		for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
+"		{\n"
+"            int i = addr2+j;\n"
+"			if( i < n )\n" // only whole blocks are visited, so i stays below n here; a partial last block is never reached - presumably callers pad n to a multiple of blockSize (confirm)
+"			{\n"
+"                int tableIdx;\n"
+"				tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
+"                gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" // bucket start for this work-group plus the records already placed there, so input order is kept within a bucket
+"                counter[tableIdx] ++;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"  }\n"
+"    \n"
+"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )\n" // Serial scatter for bare u32 keys: work-item 0 of each work-group walks the group's keys in input order and appends each one to its bucket in gDst.
+"    \n"
+"	u32 gIdx = GET_GLOBAL_IDX;\n" // not referenced again in the visible body
+"	u32 realLocalIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n" // not referenced again in the visible body
+"	const int startBit = cb.m_startBit;\n" // the m_* names are read from an int4, so they are presumably macro aliases for its components (defined outside this chunk - confirm)
+"	const int n = cb.m_n;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"    int counter[NUM_BUCKET];\n" // keys already written per bucket by this work-group
+"    \n"
+"    if (realLocalIdx>0)\n" // every work-item except the first leaves immediately
+"        return;\n"
+"    \n"
+"    for (int c=0;c<NUM_BUCKET;c++)\n"
+"        counter[c]=0;\n"
+"    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+"	\n"
+"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" // whole blocks left from this work-group's first block onwards; the integer division drops any partial block at the end of the input
+"   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
+"  {\n"
+"     for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" // emulates the work-items of the parallel kernels one after another
+" 	{\n"
+"        int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+"        \n"
+"		for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
+"		{\n"
+"            int i = addr2+j;\n"
+"			if( i < n )\n" // only whole blocks are visited, so i stays below n here; a partial last block is never reached - presumably callers pad n to a multiple of blockSize (confirm)
+"			{\n"
+"                int tableIdx;\n"
+"				tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
+"                gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" // bucket start for this work-group plus the keys already placed there, so input order is kept within a bucket
+"                counter[tableIdx] ++;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"  }\n"
+"    \n"
diff --git a/src/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp b/src/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp
new file mode 100644
index 00000000..4ef38bd1
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp
@@ -0,0 +1,391 @@
+#include "b3GpuRaycast.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h"
+#include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h"
+#define B3_RAYCAST_PATH "src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl"
+struct b3GpuRaycastInternalData // Private state of b3GpuRaycast: OpenCL handles, kernels, helper objects and GPU buffers.
+	cl_context m_context; // context, device and queue are copied from the constructor arguments
+	cl_device_id m_device;
+	cl_command_queue  m_q;
+	cl_kernel m_raytraceKernel; // created from the kernel named rayCastKernel
+	cl_kernel m_raytracePairsKernel; // created from the kernel named rayCastPairsKernel
+	cl_kernel m_findRayRigidPairIndexRanges; // created from the kernel named findRayRigidPairIndexRanges
+	b3GpuParallelLinearBvh* m_plbvh; // the pointer members below are allocated with new in the constructor
+	b3RadixSort32CL* m_radixSorter;
+	b3FillCL* m_fill;
+	//1 element per ray
+	b3OpenCLArray<b3RayInfo>* m_gpuRays;
+	b3OpenCLArray<b3RayHit>* m_gpuHitResults;
+	b3OpenCLArray<int>* m_firstRayRigidPairIndexPerRay;
+	b3OpenCLArray<int>* m_numRayRigidPairsPerRay;
+	//1 element per (ray index, rigid index) pair, where the ray intersects with the rigid's AABB
+	b3OpenCLArray<int>* m_gpuNumRayRigidPairs; // resized to a single element in castRays and read back as the pair count
+	b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs;	//x == ray index, y == rigid index
+	int m_test; // not initialised by the constructor and not referenced in the visible part of this file
+b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue  q) // Allocates the internal data, creates the helper objects and GPU arrays on ctx/q and builds the three ray-cast kernels.
+	m_data = new b3GpuRaycastInternalData;
+	m_data->m_context = ctx; // handles are stored as given; no clRetain* call is visible here
+	m_data->m_device = device;
+	m_data->m_q = q;
+	m_data->m_raytraceKernel = 0;
+	m_data->m_raytracePairsKernel = 0;
+	m_data->m_findRayRigidPairIndexRanges = 0;
+	m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q);
+	m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q);
+	m_data->m_fill = new b3FillCL(ctx, device, q);
+	m_data->m_gpuRays = new b3OpenCLArray<b3RayInfo>(ctx, q);
+	m_data->m_gpuHitResults = new b3OpenCLArray<b3RayHit>(ctx, q);
+	m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray<int>(ctx, q);
+	m_data->m_numRayRigidPairsPerRay = new b3OpenCLArray<int>(ctx, q);
+	m_data->m_gpuNumRayRigidPairs = new b3OpenCLArray<int>(ctx, q);
+	m_data->m_gpuRayRigidPairs = new b3OpenCLArray<b3Int2>(ctx, q);
+	{ // compile the program once from the embedded source string, then create each kernel from it
+		cl_int errNum=0;
+		cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,rayCastKernelCL,&errNum,"",B3_RAYCAST_PATH);
+		b3Assert(errNum==CL_SUCCESS); // errors are only checked through b3Assert; NOTE(review): if b3Assert compiles to nothing in some builds, a failed compile would go unnoticed here - confirm
+		m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastKernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastPairsKernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "findRayRigidPairIndexRanges",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		clReleaseProgram(prog); // drops this function's reference to the program once the kernels exist
+	}
+	clReleaseKernel(m_data->m_raytraceKernel); // NOTE(review): from here on the statements release everything acquired above; presumably this is the destructor body, whose signature line is not visible in this view - confirm against the upstream file
+	clReleaseKernel(m_data->m_raytracePairsKernel);
+	clReleaseKernel(m_data->m_findRayRigidPairIndexRanges);
+	delete m_data->m_plbvh;
+	delete m_data->m_radixSorter;
+	delete m_data->m_fill;
+	delete m_data->m_gpuRays;
+	delete m_data->m_gpuHitResults;
+	delete m_data->m_firstRayRigidPairIndexPerRay;
+	delete m_data->m_numRayRigidPairsPerRay;
+	delete m_data->m_gpuNumRayRigidPairs;
+	delete m_data->m_gpuRayRigidPairs;
+	delete m_data; // the container itself goes last, after every member it points to
+bool sphere_intersect(const b3Vector3& spherePos,  b3Scalar radius, const b3Vector3& rayFrom, const b3Vector3& rayTo, float& hitFraction) // Tests the segment rayFrom->rayTo against a sphere; on a hit closer than the incoming hitFraction, stores the new fraction and returns true.
+    b3Vector3 rs = rayFrom - spherePos; // ray start relative to the sphere centre
+	b3Vector3 rayDir = rayTo-rayFrom; // not normalised, so t below is a fraction of the segment length
+	float A = b3Dot(rayDir,rayDir); // coefficients of A*t^2 + 2*B*t + C = 0 for points rs + t*rayDir on the sphere surface
+    float B = b3Dot(rs, rayDir);
+    float C = b3Dot(rs, rs) - (radius * radius);
+	float D = B * B - A*C; // quarter of the usual discriminant, because B already omits the factor 2
+    if (D > 0.0) // strictly positive: tangent rays and zero-length rays (A == 0 and B == 0 give D == 0) are rejected before the division
+    {
+        float t = (-B - sqrt(D))/A; // nearer of the two roots; it is negative when the ray starts inside the sphere, which then reports no hit
+        if ( (t >= 0.0f) && (t < hitFraction) ) // accept only hits ahead of the start and closer than the current best
+        {
+			hitFraction = t;
+            return true;
+		}
+	}
+	return false;
+bool rayConvex(const b3Vector3& rayFromLocal, const b3Vector3& rayToLocal, const b3ConvexPolyhedronData& poly, // Clips the segment (given in the shape's local frame) against every face plane of poly; on success writes the entry fraction and the entry face normal.
+	const b3AlignedObjectArray<b3GpuFace>& faces,  float& hitFraction, b3Vector3& hitNormal)
+	float exitFraction = hitFraction; // the incoming hitFraction is the upper bound of the clipped interval
+	float enterFraction = -0.1f; // negative start value means no entering plane has been found yet
+	b3Vector3 curHitNormal=b3MakeVector3(0,0,0);
+	for (int i=0;i<poly.m_numFaces;i++)
+	{
+		const b3GpuFace& face = faces[poly.m_faceOffset+i]; // the faces of this polyhedron are a contiguous range starting at m_faceOffset
+		float fromPlaneDist = b3Dot(rayFromLocal,face.m_plane)+face.m_plane.w; // signed distance of each end point to the plane; presumably b3Dot uses only x,y,z and w is the plane constant (confirm)
+		float toPlaneDist = b3Dot(rayToLocal,face.m_plane)+face.m_plane.w;
+		if (fromPlaneDist<0.f)
+		{
+			if (toPlaneDist >= 0.f) // start behind the plane, end in front: the segment leaves through this face
+			{
+				float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
+				if (exitFraction>fraction)
+				{
+					exitFraction = fraction; // keep the earliest exit
+				}
+			} 			
+		} else
+		{
+			if (toPlaneDist<0.f) // start in front of the plane, end behind: the segment enters through this face
+			{
+				float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
+				if (enterFraction <= fraction)
+				{
+					enterFraction = fraction; // keep the latest entry and remember its face normal
+					curHitNormal = face.m_plane;
+					curHitNormal.w = 0.f; // drop the plane constant so only the direction remains
+				}
+			} else
+			{
+				return false; // both end points are on the front side of this plane, so the segment misses the shape
+			}
+		}
+		if (exitFraction <= enterFraction) // the clipped interval became empty
+			return false;
+	}
+	if (enterFraction < 0.f) // no entering plane was found (the start lies behind every plane), which is reported as no hit
+		return false;
+	hitFraction = enterFraction;
+	hitNormal = curHitNormal;
+	return true;
+void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays,	b3AlignedObjectArray<b3RayHit>& hitResults, // CPU reference path: tests every ray against every body and updates hitResults[r] when something closer than its current m_hitFraction is hit.
+		int numBodies,const struct b3RigidBodyData* bodies, int numCollidables,const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData) // numCollidables is not referenced in the visible body
+//	return castRays(rays,hitResults,numBodies,bodies,numCollidables,collidables);
+	B3_PROFILE("castRaysHost");
+	for (int r=0;r<rays.size();r++) // hitResults is indexed with the same r, so it must hold at least rays.size() entries
+	{
+		b3Vector3 rayFrom = rays[r].m_from;
+		b3Vector3 rayTo = rays[r].m_to;
+		float hitFraction = hitResults[r].m_hitFraction; // the caller-provided value is the initial upper bound for accepted hits
+		int hitBodyIndex= -1;
+		b3Vector3 hitNormal; // only assigned on a hit and only read below when hitBodyIndex >= 0
+		for (int b=0;b<numBodies;b++)
+		{
+			const b3Vector3& pos = bodies[b].m_pos;
+			const b3Quaternion& orn = bodies[b].m_quat; // not referenced below; the code reads bodies[b].m_quat directly
+			switch (collidables[bodies[b].m_collidableIdx].m_shapeType)
+			{
+			case SHAPE_SPHERE:
+				{
+					b3Scalar radius = collidables[bodies[b].m_collidableIdx].m_radius;
+					if (sphere_intersect(pos,  radius, rayFrom, rayTo,hitFraction)) // hitFraction is lowered in place on a hit
+					{
+						hitBodyIndex = b;
+						b3Vector3 hitPoint;
+						hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to,hitFraction);
+						hitNormal = (hitPoint-bodies[b].m_pos).normalize(); // direction from the sphere centre to the hit point
+					}
+				}
+				{ // NOTE(review): neither a break nor a case label is visible between the sphere block above and this block; as shown, the sphere case would continue into this convex-polyhedron test - confirm against the upstream file
+					b3Transform convexWorldTransform;
+					convexWorldTransform.setIdentity();
+					convexWorldTransform.setOrigin(bodies[b].m_pos);
+					convexWorldTransform.setRotation(bodies[b].m_quat);
+					b3Transform convexWorld2Local = convexWorldTransform.inverse();
+					b3Vector3 rayFromLocal = convexWorld2Local(rayFrom); // bring the ray into the body's local frame, where the face planes are tested
+					b3Vector3 rayToLocal = convexWorld2Local(rayTo);
+					int shapeIndex = collidables[bodies[b].m_collidableIdx].m_shapeIndex;
+					const b3ConvexPolyhedronData& poly = narrowphaseData->m_convexPolyhedra[shapeIndex];
+					if (rayConvex(rayFromLocal, rayToLocal,poly,narrowphaseData->m_convexFaces, hitFraction, hitNormal)) // NOTE(review): hitNormal comes from local-space face planes and is not rotated back to world space in the visible code
+					{
+						hitBodyIndex = b;
+					}
+					break;
+				}
+			default:
+				{
+					static bool once=true; // function-local static: the warning is printed once per process
+					if (once)
+					{
+						once=false;
+						b3Warning("Raytest: unsupported shape type\n");
+					}
+				}
+			}
+		}
+		if (hitBodyIndex>=0) // results of rays that hit nothing are left untouched
+		{
+			hitResults[r].m_hitFraction = hitFraction;
+			hitResults[r].m_hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to,hitFraction);
+			hitResults[r].m_hitNormal = hitNormal;
+			hitResults[r].m_hitBody = hitBodyIndex;
+		}
+	}
+///todo: add some acceleration structure (AABBs, tree etc)
+void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays,	b3AlignedObjectArray<b3RayHit>& hitResults, // GPU path: uploads rays and current hit results, runs the ray-cast kernels and copies the results back into hitResults.
+		int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, // bodies, numCollidables and collidables are not referenced in the visible body; the kernels read the narrowphaseData GPU buffers instead
+		const struct b3GpuNarrowPhaseInternalData* narrowphaseData,	class b3GpuBroadphaseInterface* broadphase)
+	//castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData);
+	B3_PROFILE("castRaysGPU");
+	{
+		B3_PROFILE("raycast copyFromHost");
+		m_data->m_gpuRays->copyFromHost(rays);
+		m_data->m_gpuHitResults->copyFromHost(hitResults); // the incoming results are uploaded as well, so their m_hitFraction values reach the kernels
+	}
+	int numRays = hitResults.size(); // taken from hitResults here but from rays in the brute-force block below; the two arrays are presumably the same length (confirm)
+	{
+		m_data->m_firstRayRigidPairIndexPerRay->resize(numRays);
+		m_data->m_numRayRigidPairsPerRay->resize(numRays);
+		m_data->m_gpuNumRayRigidPairs->resize(1);
+		m_data->m_gpuRayRigidPairs->resize(numRays * 16); // fixed capacity of 16 pairs per ray on average; the pair count read back below is clamped to this size
+	}
+	//run kernel
+	const bool USE_BRUTE_FORCE_RAYCAST = false;
+	{ // NOTE(review): the else further down has no visible if; presumably this block is guarded by if (USE_BRUTE_FORCE_RAYCAST) - confirm against the upstream file
+		B3_PROFILE("raycast launch1D");
+		b3LauncherCL launcher(m_data->m_q,m_data->m_raytraceKernel,"m_raytraceKernel");
+		int numRays = rays.size(); // shadows the outer numRays
+		launcher.setConst(numRays); // arguments are set in the order given here, which presumably has to match the rayCastKernel signature (not visible in this chunk)
+		launcher.setBuffer(m_data->m_gpuRays->getBufferCL());
+		launcher.setBuffer(m_data->m_gpuHitResults->getBufferCL());
+		launcher.setConst(numBodies);
+		launcher.setBuffer(narrowphaseData->m_bodyBufferGPU->getBufferCL());
+		launcher.setBuffer(narrowphaseData->m_collidablesGPU->getBufferCL());
+		launcher.setBuffer(narrowphaseData->m_convexFacesGPU->getBufferCL());
+		launcher.setBuffer(narrowphaseData->m_convexPolyhedraGPU->getBufferCL());
+		launcher.launch1D(numRays); // one work-item per ray
+		clFinish(m_data->m_q);
+	}
+	else
+	{
+		m_data->m_plbvh->build( broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU() ); // the BVH is rebuilt from the broadphase AABBs on every call
+		m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs);
+		int numRayRigidPairs = -1;
+		m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1); // read the pair count back from the GPU
+		if( numRayRigidPairs > m_data->m_gpuRayRigidPairs->size() ) // more pairs reported than the buffer holds: clamp and write the clamped count back to the GPU
+		{
+			numRayRigidPairs = m_data->m_gpuRayRigidPairs->size();
+			m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1);
+		}
+		m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs);	//Radix sort needs b3OpenCLArray::size() to be correct
+		//Sort ray-rigid pairs by ray index
+		{
+			B3_PROFILE("sort ray-rigid pairs");
+			m_data->m_radixSorter->execute( *reinterpret_cast< b3OpenCLArray<b3SortData>* >(m_data->m_gpuRayRigidPairs) ); // treats the b3Int2 array as b3SortData; this relies on both types having the same layout, with x in the key position (confirm)
+		}
+		//detect start,count of each ray pair
+		{
+			B3_PROFILE("detect ray-rigid pair index ranges");
+			{
+				B3_PROFILE("reset ray-rigid pair index ranges");
+				m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays);	//atomic_min used to find first index
+				m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays); // per-ray pair counts start at zero
+				clFinish(m_data->m_q);
+			}
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ),
+				b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
+				b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() )
+			};
+			b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(numRayRigidPairs);
+			launcher.launch1D(numRayRigidPairs); // one work-item per ray-rigid pair
+			clFinish(m_data->m_q);
+		}
+		{
+			B3_PROFILE("ray-rigid intersection");
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( m_data->m_gpuRays->getBufferCL() ),
+				b3BufferInfoCL( m_data->m_gpuHitResults->getBufferCL() ),
+				b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
+				b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ),
+				b3BufferInfoCL( narrowphaseData->m_bodyBufferGPU->getBufferCL() ),
+				b3BufferInfoCL( narrowphaseData->m_collidablesGPU->getBufferCL() ),
+				b3BufferInfoCL( narrowphaseData->m_convexFacesGPU->getBufferCL() ),
+				b3BufferInfoCL( narrowphaseData->m_convexPolyhedraGPU->getBufferCL() ),
+				b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() )
+			};
+			b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(numRays);
+			launcher.launch1D(numRays); // one work-item per ray
+			clFinish(m_data->m_q);
+		}
+	}
+	//copy results
+	{
+		B3_PROFILE("raycast copyToHost");
+		m_data->m_gpuHitResults->copyToHost(hitResults); // overwrites the caller's array with the GPU results
+	}
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.h b/src/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.h
new file mode 100644
index 00000000..3a5cf44b
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.h
@@ -0,0 +1,32 @@
+#ifndef B3_GPU_RAYCAST_H
+#define B3_GPU_RAYCAST_H
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
+class b3GpuRaycast // ray-cast front end built on an OpenCL context/device/queue; implementation state is hidden behind m_data
+	struct b3GpuRaycastInternalData* m_data; // opaque implementation data (the struct is only forward-declared in this header)
+	b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue  q); // ctx/device/q: the OpenCL objects this instance works with
+	virtual ~b3GpuRaycast();
+	void castRaysHost(const b3AlignedObjectArray<b3RayInfo>& raysIn,	b3AlignedObjectArray<b3RayHit>& hitResults, // raysIn is read-only input, hitResults is the output array; NOTE(review): presumably the CPU reference path -- confirm in the .cpp
+		int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
+		const struct b3GpuNarrowPhaseInternalData* narrowphaseData);
+	void castRays(const b3AlignedObjectArray<b3RayInfo>& rays,	b3AlignedObjectArray<b3RayHit>& hitResults, // same inputs as castRaysHost plus a broadphase; NOTE(review): presumably the OpenCL path -- confirm in the .cpp
+		int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
+		const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase);
+#endif //B3_GPU_RAYCAST_H
diff --git a/src/bullet/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl b/src/bullet/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl
new file mode 100644
index 00000000..e72d9687
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl
@@ -0,0 +1,439 @@
+// Shape-type tags compared against Collidable::m_shapeType by the kernels in this file.
+// SHAPE_CONVEX_HULL is used by both ray-cast kernels and therefore has to be defined here;
+// the value 3 is the one the stringified copy of this file (rayCastKernels.h) carries.
+#define SHAPE_CONVEX_HULL 3
+#define SHAPE_PLANE 4
+#define SHAPE_SPHERE 7
+typedef struct // one ray segment as read by the ray-cast kernels
+	float4 m_from; // segment start (corresponds to hit fraction 0 in setInterpolate3)
+	float4 m_to; // segment end (corresponds to hit fraction 1)
+} b3RayInfo;
+typedef struct // per-ray result record written by the ray-cast kernels
+	float m_hitFraction; // reset to 1.f by the kernels, then overwritten with the hit fraction when a body is hit
+	int	m_hitResult0; // receives the index of the hit body
+	int	m_hitResult1; // not accessed by the kernels in this file
+	int	m_hitResult2; // read as input: a body whose index equals this value is skipped
+	float4	m_hitPoint; // receives the point on the ray at the hit fraction
+	float4	m_hitNormal; // receives the normalized hit normal
+} b3RayHit;
+typedef struct // rigid-body record as read by the ray-cast kernels
+	float4 m_pos; // used as the body position (and as the sphere centre for sphere shapes)
+	float4 m_quat; // used as the body orientation quaternion
+	float4 m_linVel; // not read by the kernels in this file
+	float4 m_angVel; // not read by the kernels in this file
+	unsigned int m_collidableIdx; // index into the collidables array
+	float m_invMass; // not read by the kernels in this file
+	float m_restituitionCoeff; // not read by the kernels in this file
+	float m_frictionCoeff; // not read by the kernels in this file
+} Body;
+typedef struct Collidable // shape description referenced by Body::m_collidableIdx
+	union { // the two members share storage; neither is read by the kernels in this file
+		int m_numChildShapes;
+		int m_bvhIndex;
+	};
+	float m_radius; // used as the sphere radius when m_shapeType == SHAPE_SPHERE
+	int m_shapeType; // compared against the SHAPE_* macros
+	int m_shapeIndex; // index into the convexShapes array for convex hulls
+} Collidable;
+typedef struct  // convex-hull record; the kernels in this file only read m_faceOffset and m_numFaces
+	float4		m_localCenter;
+	float4		m_extents;
+	float4		mC;
+	float4		mE;
+	float			m_radius;
+	int	m_faceOffset; // index of this hull's first face in the shared faces array
+	int m_numFaces; // number of consecutive faces belonging to this hull
+	int	m_numVertices;
+	int m_vertexOffset;
+	int	m_uniqueEdgesOffset;
+	int	m_numUniqueEdges;
+	int m_unused;
+} ConvexPolyhedronCL;
+typedef struct // one face of a convex hull
+	float4 m_plane; // plane coefficients: rayConvex evaluates dot(point.xyz, m_plane.xyz) + m_plane.w and reports m_plane.xyz as the hit normal
+	int m_indexOffset; // not read by the kernels in this file
+	int m_numIndices; // not read by the kernels in this file
+} b3GpuFace;
+//	Quaternion
+typedef float4 Quaternion; // xyz holds the vector part and w the scalar part (see qtMul and qtInvert)
+	Quaternion qtMul(Quaternion a, Quaternion b); // prototypes: qtRotate calls qtMul and qtInvert before their definitions appear
+	Quaternion qtNormalize(Quaternion in);
+	Quaternion qtInvert(Quaternion q);
+	float dot3F4(float4 a, float4 b) // dot product of the xyz parts only; the w components of both inputs are ignored
+	float4 a1 = (float4)(a.xyz,0.f); // copy with w forced to zero so the 4-wide dot() drops the w term
+	float4 b1 = (float4)(b.xyz,0.f);
+	return dot(a1, b1);
+	Quaternion qtMul(Quaternion a, Quaternion b) // quaternion product a*b
+	Quaternion ans;
+	ans = cross( a, b ); // vector part starts as a.xyz x b.xyz
+	ans += a.w*b+b.w*a; // add the scalar-times-vector terms; the w written here is replaced two lines below
+	//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w*b.w - dot3F4(a, b); // scalar part: product of scalars minus dot of the vector parts
+	return ans;
+	Quaternion qtNormalize(Quaternion in) // returns in scaled to unit length
+	return fast_normalize(in); // uses the fast_normalize built-in; the explicit divide-by-length variant is kept below as a comment
+	//	in /= length( in );
+	//	return in;
+	float4 qtRotate(Quaternion q, float4 vec) // rotates vec by q, computed as q * (vec.xyz, 0) * qtInvert(q)
+	Quaternion qInv = qtInvert( q ); // qtInvert returns the conjugate, which is the true inverse only for a unit-length q
+	float4 vcpy = vec;
+	vcpy.w = 0.f; // treat the vector as a quaternion with zero scalar part
+	float4 out = qtMul(q,vcpy);
+	out = qtMul(out,qInv);
+	return out;
+	Quaternion qtInvert(Quaternion q) // returns the conjugate of q: vector part negated, scalar part kept
+	return (Quaternion)(-q.xyz, q.w);
+	float4 qtInvRotate(const Quaternion q, float4 vec)
+	// Rotate vec by the inverse of q: invert first, then reuse qtRotate.
+	const Quaternion inverseRotation = qtInvert( q );
+	return qtRotate( inverseRotation, vec );
+void	trInverse(float4 translationIn, Quaternion orientationIn, // inverts the transform given by (translationIn, orientationIn)
+	float4* translationOut, Quaternion* orientationOut) // both results are written through these out pointers
+	*orientationOut = qtInvert(orientationIn);
+	*translationOut = qtRotate(*orientationOut, -translationIn); // inverse translation: the negated input translation rotated by the inverse orientation
+bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset, // clips the segment rayFromLocal->rayToLocal against the numFaces planes starting at faces[faceOffset]
+	__global const b3GpuFace* faces, float* hitFraction, float4* hitNormal) // on a hit both outputs are overwritten and true is returned; on a miss they are left untouched
+	rayFromLocal.w = 0.f; // clear w so the 4-wide dot() below only sums the xyz terms
+	rayToLocal.w = 0.f;
+	bool result = true;
+	float exitFraction = hitFraction[0]; // the incoming fraction is the upper bound, so only hits closer than the current best are accepted
+	float enterFraction = -0.3f; // negative start value: if no entering plane is found, the check after the loop rejects the ray
+	float4 curHitNormal = (float4)(0,0,0,0);
+	for (int i=0;i<numFaces && result;i++) // stops early as soon as a miss has been established
+	{
+		b3GpuFace face = faces[faceOffset+i];
+		float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w; // plane equation evaluated at the segment start
+		float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w; // plane equation evaluated at the segment end
+		if (fromPlaneDist<0.f)
+		{
+			if (toPlaneDist >= 0.f) // start on the negative side, end on the non-negative side: the segment leaves through this plane
+			{
+				float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
+				if (exitFraction>fraction)
+				{
+					exitFraction = fraction; // keep the earliest exit
+				}
+			} 			
+		} else
+		{
+			if (toPlaneDist<0.f) // start on the non-negative side, end on the negative side: the segment enters through this plane
+			{
+				float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
+				if (enterFraction <= fraction)
+				{
+					enterFraction = fraction; // keep the latest entry and remember that plane's normal
+					curHitNormal = face.m_plane;
+					curHitNormal.w = 0.f;
+				}
+			} else
+			{
+				result = false; // both end points on the non-negative side of this plane: no intersection
+			}
+		}
+		if (exitFraction <= enterFraction) // the [enter, exit] interval is empty
+			result = false;
+	}
+	if (enterFraction < 0.f) // no entering plane was found along the segment
+	{
+		result = false;
+	}
+	if (result)
+	{	
+		hitFraction[0] = enterFraction;
+		hitNormal[0] = curHitNormal;
+	}
+	return result;
+bool sphere_intersect(float4 spherePos,  float radius, float4 rayFrom, float4 rayTo, float* hitFraction)
+	// Ray/sphere test. Returns true and lowers *hitFraction only when the first
+	// intersection lies at a fraction t with 0 <= t < *hitFraction; otherwise
+	// *hitFraction is left untouched.
+	float4 centreToStart = rayFrom - spherePos;
+	centreToStart.w = 0.f;
+	float4 direction = rayTo-rayFrom;
+	direction.w = 0.f;
+	// Coefficients of the quadratic in t (with the factor 2 folded out of the linear term).
+	float quadA = dot(direction,direction);
+	float quadB = dot(centreToStart, direction);
+	float quadC = dot(centreToStart, centreToStart) - (radius * radius);
+	float discriminant = quadB * quadB - quadA*quadC;
+	if (!(discriminant > 0.0f))
+		return false;
+	// Smaller root: the point where the ray first touches the sphere.
+	float t = (-quadB - sqrt(discriminant))/quadA;
+	if ( !(t >= 0.0f) || !(t < (*hitFraction)) )
+		return false;
+	*hitFraction = t;
+	return true;
+float4 setInterpolate3(float4 from, float4 to, float t)
+	// Linear blend (1-t)*from + t*to; the w component of the result is forced to zero.
+	float fromWeight = 1.0f - t;
+	float4 blended = fromWeight * from + t * to;
+	blended.w = 0.f;
+	return blended;
+// Brute-force ray cast: one work-item per ray, tested against all numBodies bodies.
+// Only convex-hull and sphere collidables are tested; other shape types are ignored.
+// The closest hit (smallest fraction, starting from an upper bound of 1) is written to
+// hitResults[i]. A body whose index equals hitResults[i].m_hitResult2 on entry is skipped.
+// When nothing is hit only m_hitFraction (= 1.f) is written.
+__kernel void rayCastKernel(  
+	int numRays, 
+	const __global b3RayInfo* rays, 
+	__global b3RayHit* hitResults, 
+	const int numBodies, 
+	__global Body* bodies,
+	__global Collidable* collidables,
+	__global const b3GpuFace* faces,
+	__global const ConvexPolyhedronCL* convexShapes	)
+	int i = get_global_id(0);
+	if (i>=numRays)
+		return;
+	hitResults[i].m_hitFraction = 1.f;
+	float4 rayFrom = rays[i].m_from;
+	float4 rayTo = rays[i].m_to;
+	float hitFraction = 1.f;
+	float4 hitPoint;
+	float4 hitNormal;
+	int hitBodyIndex= -1;
+	// Consecutive bodies often share a collidable; remember the last one fetched.
+	int cachedCollidableIndex = -1;
+	Collidable cachedCollidable;
+	for (int b=0;b<numBodies;b++)
+	{
+		if (hitResults[i].m_hitResult2==b)
+			continue;
+		Body body = bodies[b];
+		float4 pos = body.m_pos;
+		float4 orn = body.m_quat;
+		if (cachedCollidableIndex != body.m_collidableIdx)
+		{
+			cachedCollidableIndex = body.m_collidableIdx;
+			cachedCollidable = collidables[cachedCollidableIndex];
+		}
+		if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			float4 invPos = (float4)(0,0,0,0);
+			float4 invOrn = (float4)(0,0,0,0);
+			float4 rayFromLocal = (float4)(0,0,0,0);
+			float4 rayToLocal = (float4)(0,0,0,0);
+			// Bring the ray into the body's local frame, where the face planes are expressed.
+			invOrn = qtInvert(orn);
+			invPos = qtRotate(invOrn, -pos);
+			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;
+			rayToLocal = qtRotate( invOrn, rayTo) + invPos;
+			rayFromLocal.w = 0.f;
+			rayToLocal.w = 0.f;
+			int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;
+			int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;
+			if (numFaces)
+			{
+				if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))
+				{
+					hitBodyIndex = b;
+				}
+			}
+		}
+		if (cachedCollidable.m_shapeType == SHAPE_SPHERE)
+		{
+			float radius = cachedCollidable.m_radius;
+			if (sphere_intersect(pos,  radius, rayFrom, rayTo, &hitFraction))
+			{
+				hitBodyIndex = b;
+				// hitPoint is otherwise only assigned after the loop; derive it from the
+				// fraction sphere_intersect just wrote so the normal is not built from
+				// an uninitialised value (same order as rayCastPairsKernel).
+				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
+				hitNormal = (float4) (hitPoint-bodies[b].m_pos);
+			}
+		}
+	}
+	if (hitBodyIndex>=0)
+	{
+		hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);
+		hitResults[i].m_hitFraction = hitFraction;
+		hitResults[i].m_hitPoint = hitPoint;
+		hitResults[i].m_hitNormal = normalize(hitNormal);
+		hitResults[i].m_hitResult0 = hitBodyIndex;
+	}
+__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, // one work-item per (ray, rigid) pair; .x of each pair is read as the ray index
+											__global int* out_firstRayRigidPairIndexPerRay, // per ray: min-reduced to the smallest pair index seen for that ray
+											__global int* out_numRayRigidPairsPerRay, // per ray: incremented once for every pair of that ray
+											int numRayRigidPairs)
+	int rayRigidPairIndex = get_global_id(0);
+	if (rayRigidPairIndex >= numRayRigidPairs) return; // work-items past the pair count do nothing
+	int rayIndex = rayRigidPairs[rayRigidPairIndex].x;
+	atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex); // atomic because several work-items may share a ray; presumably the host pre-fills this buffer with a large value -- TODO confirm
+	atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]); // relies on the counter having been zeroed before launch
+__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, // one work-item per ray; each ray is tested only against the bodies listed in its pair range
+								__global b3RayHit* hitResults, 
+								__global int* firstRayRigidPairIndexPerRay, // per ray: index of its first entry in rayRigidPairs
+								__global int* numRayRigidPairsPerRay, // per ray: number of consecutive entries starting at that index
+								__global Body* bodies,
+								__global Collidable* collidables,
+								__global const b3GpuFace* faces,
+								__global const ConvexPolyhedronCL* convexShapes,
+								__global int2* rayRigidPairs, // .y of each pair is read as the body index
+								int numRays)
+	int i = get_global_id(0);
+	if (i >= numRays) return;
+	float4 rayFrom = rays[i].m_from;
+	float4 rayTo = rays[i].m_to;
+	hitResults[i].m_hitFraction = 1.f; // default result when nothing is hit
+	float hitFraction = 1.f; // running closest-hit fraction; the intersection helpers only accept smaller values
+	float4 hitPoint;
+	float4 hitNormal;
+	int hitBodyIndex = -1;
+	//
+	for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)
+	{
+		int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i]; // walks consecutive pair entries, so a ray's pairs must be stored contiguously
+		int b = rayRigidPairs[rayRigidPairIndex].y;
+		if (hitResults[i].m_hitResult2 == b) continue; // skip the body named by m_hitResult2 on entry
+		Body body = bodies[b];
+		Collidable rigidCollidable = collidables[body.m_collidableIdx];
+		float4 pos = body.m_pos;
+		float4 orn = body.m_quat;
+		if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			float4 invPos = (float4)(0,0,0,0);
+			float4 invOrn = (float4)(0,0,0,0);
+			float4 rayFromLocal = (float4)(0,0,0,0);
+			float4 rayToLocal = (float4)(0,0,0,0);
+			invOrn = qtInvert(orn); // inverse body transform, used to bring the ray into the body's local frame
+			invPos = qtRotate(invOrn, -pos);
+			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;
+			rayToLocal = qtRotate( invOrn, rayTo) + invPos;
+			rayFromLocal.w = 0.f;
+			rayToLocal.w = 0.f;
+			int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;
+			int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;
+			if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))
+			{
+				hitBodyIndex = b;
+				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
+			}
+		}
+		if (rigidCollidable.m_shapeType == SHAPE_SPHERE)
+		{
+			float radius = rigidCollidable.m_radius;
+			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))
+			{
+				hitBodyIndex = b;
+				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction); // computed first because the normal below depends on it
+				hitNormal = (float4) (hitPoint - bodies[b].m_pos); // unnormalized direction from the sphere centre to the hit point
+			}
+		}
+	}
+	if (hitBodyIndex >= 0) // publish the closest hit; rays without a hit keep only m_hitFraction = 1.f
+	{
+		hitResults[i].m_hitFraction = hitFraction;
+		hitResults[i].m_hitPoint = hitPoint;
+		hitResults[i].m_hitNormal = normalize(hitNormal);
+		hitResults[i].m_hitResult0 = hitBodyIndex;
+	}
diff --git a/src/bullet/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h b/src/bullet/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h
new file mode 100644
index 00000000..6257909a
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h
@@ -0,0 +1,381 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* rayCastKernelCL= \
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define SHAPE_PLANE 4\n"
+"#define SHAPE_SPHERE 7\n"
+"typedef struct\n"
+"	float4 m_from;\n"
+"	float4 m_to;\n"
+"} b3RayInfo;\n"
+"typedef struct\n"
+"	float m_hitFraction;\n"
+"	int	m_hitResult0;\n"
+"	int	m_hitResult1;\n"
+"	int	m_hitResult2;\n"
+"	float4	m_hitPoint;\n"
+"	float4	m_hitNormal;\n"
+"} b3RayHit;\n"
+"typedef struct\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	unsigned int m_collidableIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} Body;\n"
+"typedef struct Collidable\n"
+"	union {\n"
+"		int m_numChildShapes;\n"
+"		int m_bvhIndex;\n"
+"	};\n"
+"	float m_radius;\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"} Collidable;\n"
+"typedef struct  \n"
+"	float4		m_localCenter;\n"
+"	float4		m_extents;\n"
+"	float4		mC;\n"
+"	float4		mE;\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"} ConvexPolyhedronCL;\n"
+"typedef struct\n"
+"	float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"} b3GpuFace;\n"
+"//	Quaternion\n"
+"typedef float4 Quaternion;\n"
+"	Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"	Quaternion qtNormalize(Quaternion in);\n"
+"	Quaternion qtInvert(Quaternion q);\n"
+"	float dot3F4(float4 a, float4 b)\n"
+"	float4 a1 = (float4)(a.xyz,0.f);\n"
+"	float4 b1 = (float4)(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"	Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"	Quaternion ans;\n"
+"	ans = cross( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"	//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"	Quaternion qtNormalize(Quaternion in)\n"
+"	return fast_normalize(in);\n"
+"	//	in /= length( in );\n"
+"	//	return in;\n"
+"	float4 qtRotate(Quaternion q, float4 vec)\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(q,vcpy);\n"
+"	out = qtMul(out,qInv);\n"
+"	return out;\n"
+"	Quaternion qtInvert(Quaternion q)\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"	float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"void	trInverse(float4 translationIn, Quaternion orientationIn,\n"
+"	float4* translationOut, Quaternion* orientationOut)\n"
+"	*orientationOut = qtInvert(orientationIn);\n"
+"	*translationOut = qtRotate(*orientationOut, -translationIn);\n"
+"bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n"
+"	__global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n"
+"	rayFromLocal.w = 0.f;\n"
+"	rayToLocal.w = 0.f;\n"
+"	bool result = true;\n"
+"	float exitFraction = hitFraction[0];\n"
+"	float enterFraction = -0.3f;\n"
+"	float4 curHitNormal = (float4)(0,0,0,0);\n"
+"	for (int i=0;i<numFaces && result;i++)\n"
+"	{\n"
+"		b3GpuFace face = faces[faceOffset+i];\n"
+"		float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;\n"
+"		float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;\n"
+"		if (fromPlaneDist<0.f)\n"
+"		{\n"
+"			if (toPlaneDist >= 0.f)\n"
+"			{\n"
+"				float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
+"				if (exitFraction>fraction)\n"
+"				{\n"
+"					exitFraction = fraction;\n"
+"				}\n"
+"			} 			\n"
+"		} else\n"
+"		{\n"
+"			if (toPlaneDist<0.f)\n"
+"			{\n"
+"				float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
+"				if (enterFraction <= fraction)\n"
+"				{\n"
+"					enterFraction = fraction;\n"
+"					curHitNormal = face.m_plane;\n"
+"					curHitNormal.w = 0.f;\n"
+"				}\n"
+"			} else\n"
+"			{\n"
+"				result = false;\n"
+"			}\n"
+"		}\n"
+"		if (exitFraction <= enterFraction)\n"
+"			result = false;\n"
+"	}\n"
+"	if (enterFraction < 0.f)\n"
+"	{\n"
+"		result = false;\n"
+"	}\n"
+"	if (result)\n"
+"	{	\n"
+"		hitFraction[0] = enterFraction;\n"
+"		hitNormal[0] = curHitNormal;\n"
+"	}\n"
+"	return result;\n"
+"bool sphere_intersect(float4 spherePos,  float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n"
+"	float4 rs = rayFrom - spherePos;\n"
+"	rs.w = 0.f;\n"
+"	float4 rayDir = rayTo-rayFrom;\n"
+"	rayDir.w = 0.f;\n"
+"	float A = dot(rayDir,rayDir);\n"
+"	float B = dot(rs, rayDir);\n"
+"	float C = dot(rs, rs) - (radius * radius);\n"
+"	float D = B * B - A*C;\n"
+"	if (D > 0.0f)\n"
+"	{\n"
+"		float t = (-B - sqrt(D))/A;\n"
+"		if ( (t >= 0.0f) && (t < (*hitFraction)) )\n"
+"		{\n"
+"			*hitFraction = t;\n"
+"			return true;\n"
+"		}\n"
+"	}\n"
+"	return false;\n"
+"float4 setInterpolate3(float4 from, float4 to, float t)\n"
+"	float s = 1.0f - t;\n"
+"	float4 result;\n"
+"	result = s * from + t * to;\n"
+"	result.w = 0.f;	\n"
+"	return result;	\n"
+"__kernel void rayCastKernel(  \n"
+"	int numRays, \n"
+"	const __global b3RayInfo* rays, \n"
+"	__global b3RayHit* hitResults, \n"
+"	const int numBodies, \n"
+"	__global Body* bodies,\n"
+"	__global Collidable* collidables,\n"
+"	__global const b3GpuFace* faces,\n"
+"	__global const ConvexPolyhedronCL* convexShapes	)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numRays)\n"
+"		return;\n"
+"	hitResults[i].m_hitFraction = 1.f;\n"
+"	float4 rayFrom = rays[i].m_from;\n"
+"	float4 rayTo = rays[i].m_to;\n"
+"	float hitFraction = 1.f;\n"
+"	float4 hitPoint;\n"
+"	float4 hitNormal;\n"
+"	int hitBodyIndex= -1;\n"
+"	int cachedCollidableIndex = -1;\n"
+"	Collidable cachedCollidable;\n"
+"	for (int b=0;b<numBodies;b++)\n"
+"	{\n"
+"		if (hitResults[i].m_hitResult2==b)\n"
+"			continue;\n"
+"		Body body = bodies[b];\n"
+"		float4 pos = body.m_pos;\n"
+"		float4 orn = body.m_quat;\n"
+"		if (cachedCollidableIndex != body.m_collidableIdx)\n"
+"		{\n"
+"			cachedCollidableIndex = body.m_collidableIdx;\n"
+"			cachedCollidable = collidables[cachedCollidableIndex];\n"
+"		}\n"
+"		if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
+"		{\n"
+"			float4 invPos = (float4)(0,0,0,0);\n"
+"			float4 invOrn = (float4)(0,0,0,0);\n"
+"			float4 rayFromLocal = (float4)(0,0,0,0);\n"
+"			float4 rayToLocal = (float4)(0,0,0,0);\n"
+"			invOrn = qtInvert(orn);\n"
+"			invPos = qtRotate(invOrn, -pos);\n"
+"			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
+"			rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
+"			rayFromLocal.w = 0.f;\n"
+"			rayToLocal.w = 0.f;\n"
+"			int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;\n"
+"			int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;\n"
+"			if (numFaces)\n"
+"			{\n"
+"				if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
+"				{\n"
+"					hitBodyIndex = b;\n"
+"					\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		if (cachedCollidable.m_shapeType == SHAPE_SPHERE)\n"
+"		{\n"
+"			float radius = cachedCollidable.m_radius;\n"
+"		\n"
+"			if (sphere_intersect(pos,  radius, rayFrom, rayTo, &hitFraction))\n"
+"			{\n"
+"				hitBodyIndex = b;\n"
+"				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n" // hitPoint must be assigned before the normal is derived from it; this text is generated, so the kernel source needs the same line before stringify is re-run
+"				hitNormal = (float4) (hitPoint-bodies[b].m_pos);\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if (hitBodyIndex>=0)\n"
+"	{\n"
+"		hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n"
+"		hitResults[i].m_hitFraction = hitFraction;\n"
+"		hitResults[i].m_hitPoint = hitPoint;\n"
+"		hitResults[i].m_hitNormal = normalize(hitNormal);\n"
+"		hitResults[i].m_hitResult0 = hitBodyIndex;\n"
+"	}\n"
+"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n"
+"											__global int* out_firstRayRigidPairIndexPerRay,\n"
+"											__global int* out_numRayRigidPairsPerRay,\n"
+"											int numRayRigidPairs)\n"
+"	int rayRigidPairIndex = get_global_id(0);\n"
+"	if (rayRigidPairIndex >= numRayRigidPairs) return;\n"
+"	\n"
+"	int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n"
+"	\n"
+"	atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n"
+"	atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n"
+"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n"
+"								__global b3RayHit* hitResults, \n"
+"								__global int* firstRayRigidPairIndexPerRay,\n"
+"								__global int* numRayRigidPairsPerRay,\n"
+"									\n"
+"								__global Body* bodies,\n"
+"								__global Collidable* collidables,\n"
+"								__global const b3GpuFace* faces,\n"
+"								__global const ConvexPolyhedronCL* convexShapes,\n"
+"								\n"
+"								__global int2* rayRigidPairs,\n"
+"								int numRays)\n"
+"	int i = get_global_id(0);\n"
+"	if (i >= numRays) return;\n"
+"	\n"
+"	float4 rayFrom = rays[i].m_from;\n"
+"	float4 rayTo = rays[i].m_to;\n"
+"		\n"
+"	hitResults[i].m_hitFraction = 1.f;\n"
+"		\n"
+"	float hitFraction = 1.f;\n"
+"	float4 hitPoint;\n"
+"	float4 hitNormal;\n"
+"	int hitBodyIndex = -1;\n"
+"		\n"
+"	//\n"
+"	for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n"
+"	{\n"
+"		int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n"
+"		int b = rayRigidPairs[rayRigidPairIndex].y;\n"
+"		\n"
+"		if (hitResults[i].m_hitResult2 == b) continue;\n"
+"		\n"
+"		Body body = bodies[b];\n"
+"		Collidable rigidCollidable = collidables[body.m_collidableIdx];\n"
+"		\n"
+"		float4 pos = body.m_pos;\n"
+"		float4 orn = body.m_quat;\n"
+"		\n"
+"		if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
+"		{\n"
+"			float4 invPos = (float4)(0,0,0,0);\n"
+"			float4 invOrn = (float4)(0,0,0,0);\n"
+"			float4 rayFromLocal = (float4)(0,0,0,0);\n"
+"			float4 rayToLocal = (float4)(0,0,0,0);\n"
+"			invOrn = qtInvert(orn);\n"
+"			invPos = qtRotate(invOrn, -pos);\n"
+"			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
+"			rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
+"			rayFromLocal.w = 0.f;\n"
+"			rayToLocal.w = 0.f;\n"
+"			int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n"
+"			int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n"
+"			\n"
+"			if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
+"			{\n"
+"				hitBodyIndex = b;\n"
+"				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n"
+"		{\n"
+"			float radius = rigidCollidable.m_radius;\n"
+"		\n"
+"			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
+"			{\n"
+"				hitBodyIndex = b;\n"
+"				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
+"				hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if (hitBodyIndex >= 0)\n"
+"	{\n"
+"		hitResults[i].m_hitFraction = hitFraction;\n"
+"		hitResults[i].m_hitPoint = hitPoint;\n"
+"		hitResults[i].m_hitNormal = normalize(hitNormal);\n"
+"		hitResults[i].m_hitResult0 = hitBodyIndex;\n"
+"	}\n"
+"	\n"
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h
new file mode 100644
index 00000000..c7478f54
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h
@@ -0,0 +1,18 @@
+#ifndef B3_CONSTRAINT4_h
+#define B3_CONSTRAINT4_h
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
+B3_ATTRIBUTE_ALIGNED16(struct) b3GpuConstraint4 : public b3ContactConstraint4 // adds friction-coefficient accessors on top of b3ContactConstraint4
+	inline	void setFrictionCoeff(float value) { m_linear[3] = value; } // the coefficient is stored in element 3 of m_linear
+	inline	float getFrictionCoeff() const { return m_linear[3]; } // reads the coefficient back from m_linear[3]
+#endif //B3_CONSTRAINT4_h
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp
new file mode 100644
index 00000000..2189fd90
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp
@@ -0,0 +1,137 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "b3GpuGenericConstraint.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include <new>
+#include "Bullet3Common/b3Transform.h"
+void b3GpuGenericConstraint::getInfo1 (unsigned int* info,const b3RigidBodyData* bodies) // writes 3 to *info for the handled constraint type; bodies is not read here
+	switch (m_constraintType)
+	{
+		{ // NOTE(review): no case label is visible in front of this block in this view -- confirm which m_constraintType value it belongs to
+			*info = 3; // presumably the row count: getInfo2Point2Point fills three jacobian rows -- TODO confirm
+			break;
+		};
+		default:
+		{
+			b3Assert(0); // any other constraint type is treated as a programming error
+		}
+	};
+// Fills the three jacobian rows and the right-hand side of a point-to-point constraint.
+//   constraint: supplies the two body indices (m_rbA, m_rbB) and the pivots in each body's frame
+//   info:       output rows; the row stride is info->rowskip. m_J2linearAxis and m_J2angularAxis
+//               are only written when non-null
+//   bodies:     array indexed by m_rbA / m_rbB; m_pos and m_quat are read
+void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo2* info,  const b3RigidBodyData* bodies)
+	b3Transform trA;
+	trA.setIdentity();
+	trA.setOrigin(bodies[constraint->m_rbA].m_pos);
+	trA.setRotation(bodies[constraint->m_rbA].m_quat);
+	b3Transform trB;
+	trB.setIdentity();
+	trB.setOrigin(bodies[constraint->m_rbB].m_pos);
+	trB.setRotation(bodies[constraint->m_rbB].m_quat);
+		// anchor points in global coordinates with respect to body PORs.
+    // set jacobian
+    info->m_J1linearAxis[0] = 1;
+	info->m_J1linearAxis[info->rowskip+1] = 1;
+	info->m_J1linearAxis[2*info->rowskip+2] = 1;
+	// Pivot of body A rotated into world orientation (origin not added).
+	b3Vector3 a1 = trA.getBasis()*constraint->getPivotInA();
+	{
+		b3Vector3* angular0 = (b3Vector3*)(info->m_J1angularAxis);
+		b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis+info->rowskip);
+		b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis+2*info->rowskip);
+		b3Vector3 a1neg = -a1;
+		a1neg.getSkewSymmetricMatrix(angular0,angular1,angular2);
+	}
+	if (info->m_J2linearAxis)
+	{
+		info->m_J2linearAxis[0] = -1;
+		info->m_J2linearAxis[info->rowskip+1] = -1;
+		info->m_J2linearAxis[2*info->rowskip+2] = -1;
+	}
+	// Pivot of body B rotated into world orientation; also needed for the error term below.
+	b3Vector3 a2 = trB.getBasis()*constraint->getPivotInB();
+	// The J2 pointers may be null, so guard the angular block the same way as the linear one.
+	if (info->m_J2angularAxis)
+	{
+	//	b3Vector3 a2n = -a2;
+		b3Vector3* angular0 = (b3Vector3*)(info->m_J2angularAxis);
+		b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis+info->rowskip);
+		b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis+2*info->rowskip);
+		a2.getSkewSymmetricMatrix(angular0,angular1,angular2);
+	}
+    // set right hand side
+//	b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;
+	b3Scalar currERP = info->erp;
+	b3Scalar k = info->fps * currERP;
+    int j;
+	for (j=0; j<3; j++)
+    {
+        // Per-axis world-space gap between the two anchor points, scaled by fps*erp.
+        info->m_constraintError[j*info->rowskip] = k * (a2[j] + trB.getOrigin()[j] - a1[j] - trA.getOrigin()[j]);
+		//printf("info->m_constraintError[%d]=%f\n",j,info->m_constraintError[j]);
+    }
+#if 0
+	if(m_flags & B3_P2P_FLAGS_CFM)
+	{
+		for (j=0; j<3; j++)
+		{
+			info->cfm[j*info->rowskip] = m_cfm;
+		}
+	}
+#if 0
+	b3Scalar impulseClamp = m_setting.m_impulseClamp;//
+	for (j=0; j<3; j++)
+    {
+		if (m_setting.m_impulseClamp > 0)
+		{
+			info->m_lowerLimit[j*info->rowskip] = -impulseClamp;
+			info->m_upperLimit[j*info->rowskip] = impulseClamp;
+		}
+	}
+	info->m_damping = m_setting.m_damping;
+void b3GpuGenericConstraint::getInfo2 (b3GpuConstraintInfo2* info,  const b3RigidBodyData* bodies) // dispatches on m_constraintType to the routine that fills info
+	switch (m_constraintType)
+	{
+		{ // NOTE(review): no case label is visible in front of this block in this view -- confirm which m_constraintType value it belongs to
+			getInfo2Point2Point(this,info,bodies); // point-to-point rows are produced by the free function in this file
+			break;
+		};
+		default:
+			{
+				b3Assert(0); // any other constraint type is treated as a programming error
+			}
+	};
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h
new file mode 100644
index 00000000..14b3ba7f
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h
@@ -0,0 +1,132 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "Bullet3Common/b3Quaternion.h"
+struct b3RigidBodyData;
+enum b3GpuGenericConstraintType
+struct b3GpuConstraintInfo2 // solver-side view of one constraint's rows: integrator parameters plus pointers the constraint writes its rows through
+	// integrator parameters: frames per second (1/stepsize), default error
+	// reduction parameter (0..1).
+	b3Scalar fps,erp;
+	// for the first and second body, pointers to two (linear and angular)
+	// n*3 jacobian sub matrices, stored by rows. these matrices will have
+	// been initialized to 0 on entry. if the second body is zero then the
+	// J2xx pointers may be 0.
+	b3Scalar *m_J1linearAxis,*m_J1angularAxis,*m_J2linearAxis,*m_J2angularAxis;
+	// elements to jump from one row to the next in J's
+	int rowskip;
+	// right hand sides of the equation J*v = c + cfm * lambda. cfm is the
+	// "constraint force mixing" vector. c is set to zero on entry, cfm is
+	// set to a constant (typically very small or zero) value on entry.
+	b3Scalar *m_constraintError,*cfm;
+	// lo and hi limits for variables (set to -/+ infinity on entry).
+	b3Scalar *m_lowerLimit,*m_upperLimit;
+	// findex vector for variables. see the LCP solver interface for a
+	// description of what this does. this is set to -1 on entry.
+	// note that the returned indexes are relative to the first index of
+	// the constraint.
+	int *findex;
+	// number of solver iterations
+	int m_numIterations;
+	//damping of the velocity
+	b3Scalar	m_damping;
+B3_ATTRIBUTE_ALIGNED16(struct) b3GpuGenericConstraint // 16-byte aligned constraint record linking two rigid bodies by index
+	int				m_constraintType; // selects the branch taken in getInfo1/getInfo2
+	int				m_rbA; // index of the first body in the bodies array
+	int				m_rbB; // index of the second body in the bodies array
+	float			m_breakingImpulseThreshold;
+	b3Vector3 m_pivotInA; // pivot expressed in body A's frame (it is rotated by A's basis before use)
+	b3Vector3 m_pivotInB; // pivot expressed in body B's frame (it is rotated by B's basis before use)
+	b3Quaternion m_relTargetAB; // not read by the code visible here
+	int	m_flags; // bit field; isEnabled() tests B3_CONSTRAINT_FLAG_ENABLED
+	int m_uid; // not read by the code visible here
+	int m_padding[2]; // presumably pads the struct to its aligned size -- TODO confirm
+	int	getRigidBodyA() const
+	{
+		return m_rbA;
+	}
+	int	getRigidBodyB() const
+	{
+		return m_rbB;
+	}
+	const b3Vector3& getPivotInA() const
+	{
+		return m_pivotInA;
+	}
+	const b3Vector3& getPivotInB() const
+	{
+		return m_pivotInB;
+	}
+	int isEnabled() const // non-zero when the enabled bit is set; returns the masked flag value, not a normalized 0/1
+	{
+		return m_flags & B3_CONSTRAINT_FLAG_ENABLED;
+	}
+	float getBreakingImpulseThreshold() const
+	{
+		return m_breakingImpulseThreshold;
+	}
+	///internal method used by the constraint solver, don't use them directly
+	void getInfo1 (unsigned int* info,const b3RigidBodyData* bodies);
+	///internal method used by the constraint solver, don't use them directly
+	void getInfo2 (b3GpuConstraintInfo2* info,  const b3RigidBodyData* bodies);
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp
new file mode 100644
index 00000000..b5b025ef
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp
@@ -0,0 +1,1382 @@
+#include "b3GpuJacobiContactSolver.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
+class b3Vector3;
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/RigidBody/kernels/solverUtils.h"
+#include "Bullet3Common/b3Logging.h"
+#include "b3GpuConstraint4.h"
+#include "Bullet3Common/shared/b3Int2.h"
+#include "Bullet3Common/shared/b3Int4.h"
+#define SOLVER_UTILS_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl"
+struct b3GpuJacobiSolverInternalData // Helper objects, device buffers and kernels allocated by the b3GpuJacobiContactSolver constructor.
+		//btRadixSort32CL*	m_sort32;
+		//btBoundSearchCL*	m_search;
+		b3PrefixScanCL*	m_scan; // prefix scan used to turn m_bodyCount into m_offsetSplitBodies
+		b3OpenCLArray<unsigned int>* m_bodyCount; // one counter per body, zero-filled then written by the count-bodies kernel
+		b3OpenCLArray<b3Int2>*		m_contactConstraintOffsets; // one (x,y) pair per contact: slot offsets for body A / body B
+		b3OpenCLArray<unsigned int>* m_offsetSplitBodies; // scan output: first split-slot index of each body
+		b3OpenCLArray<b3Vector3>*	m_deltaLinearVelocities; // one linear velocity delta per split slot
+		b3OpenCLArray<b3Vector3>*	m_deltaAngularVelocities; // one angular velocity delta per split slot
+		b3AlignedObjectArray<b3Vector3>	m_deltaLinearVelocitiesCPU; // host-side array; not referenced in the visible part of this file
+		b3AlignedObjectArray<b3Vector3>	m_deltaAngularVelocitiesCPU; // host-side array; not referenced in the visible part of this file
+		b3OpenCLArray<b3GpuConstraint4>* m_contactConstraints; // one solver constraint per contact
+		b3FillCL*	m_filler; // used to zero-fill m_bodyCount and m_contactConstraintOffsets
+		cl_kernel	m_countBodiesKernel; // "CountBodiesKernel"
+		cl_kernel	m_contactToConstraintSplitKernel; // "ContactToConstraintSplitKernel"
+		cl_kernel	m_clearVelocitiesKernel; // "ClearVelocitiesKernel"
+		cl_kernel	m_averageVelocitiesKernel; // "AverageVelocitiesKernel"
+		cl_kernel	m_updateBodyVelocitiesKernel; // "UpdateBodyVelocitiesKernel"
+		cl_kernel	m_solveContactKernel; // "SolveContactJacobiKernel"
+		cl_kernel	m_solveFrictionKernel; // "SolveFrictionJacobiKernel"
+// Creates the solver: allocates the internal helper objects / device buffers and
+// builds every kernel of the solverUtils program.
+//  ctx, device, queue: OpenCL handles; stored in the solver and handed to each helper created here.
+//  pairCapacity:       not used by this constructor.
+b3GpuJacobiContactSolver::b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
+	:m_context(ctx),
+	m_device(device),
+	m_queue(queue)
+	m_data = new b3GpuJacobiSolverInternalData;
+	m_data->m_scan = new b3PrefixScanCL(m_context,m_device,m_queue);
+	m_data->m_bodyCount = new b3OpenCLArray<unsigned int>(m_context,m_queue);
+	m_data->m_filler = new b3FillCL(m_context,m_device,m_queue);
+	m_data->m_contactConstraintOffsets = new b3OpenCLArray<b3Int2>(m_context,m_queue);
+	m_data->m_offsetSplitBodies = new b3OpenCLArray<unsigned int>(m_context,m_queue);
+	m_data->m_contactConstraints = new b3OpenCLArray<b3GpuConstraint4>(m_context,m_queue);
+	m_data->m_deltaLinearVelocities = new b3OpenCLArray<b3Vector3>(m_context,m_queue);
+	m_data->m_deltaAngularVelocities = new b3OpenCLArray<b3Vector3>(m_context,m_queue);
+	cl_int pErrNum;
+	const char* additionalMacros="";
+	const char* solverUtilsSource = solverUtilsCL;
+	{
+		// Build the program once, then create each kernel from it.
+		cl_program solverUtilsProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverUtilsSource, &pErrNum,additionalMacros, SOLVER_UTILS_KERNEL_PATH);
+		b3Assert(solverUtilsProg);
+		m_data->m_countBodiesKernel =  b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "CountBodiesKernel", &pErrNum, solverUtilsProg,additionalMacros );
+		b3Assert(m_data->m_countBodiesKernel);
+		m_data->m_contactToConstraintSplitKernel  = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "ContactToConstraintSplitKernel", &pErrNum, solverUtilsProg,additionalMacros );
+		b3Assert(m_data->m_contactToConstraintSplitKernel);
+		m_data->m_clearVelocitiesKernel  = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "ClearVelocitiesKernel", &pErrNum, solverUtilsProg,additionalMacros );
+		b3Assert(m_data->m_clearVelocitiesKernel);
+		m_data->m_averageVelocitiesKernel  = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "AverageVelocitiesKernel", &pErrNum, solverUtilsProg,additionalMacros );
+		b3Assert(m_data->m_averageVelocitiesKernel);
+		m_data->m_updateBodyVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "UpdateBodyVelocitiesKernel", &pErrNum, solverUtilsProg,additionalMacros );
+		b3Assert(m_data->m_updateBodyVelocitiesKernel);
+		m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "SolveContactJacobiKernel", &pErrNum, solverUtilsProg,additionalMacros );
+		b3Assert(m_data->m_solveContactKernel );
+		m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "SolveFrictionJacobiKernel", &pErrNum, solverUtilsProg,additionalMacros );
+		b3Assert(m_data->m_solveFrictionKernel);
+		// Drop our reference to the program: per the OpenCL specification the program object
+		// stays alive until the kernels created from it are released, so this only stops the
+		// program handle from leaking when the solver is destroyed.
+		if (solverUtilsProg)
+			clReleaseProgram(solverUtilsProg);
+	}
+	clReleaseKernel(m_data->m_solveContactKernel); // NOTE(review): no signature line precedes these statements in this chunk; they look like the destructor body - confirm
+	clReleaseKernel(m_data->m_solveFrictionKernel);
+	clReleaseKernel(m_data->m_countBodiesKernel);
+	clReleaseKernel(m_data->m_contactToConstraintSplitKernel);
+	clReleaseKernel(m_data->m_averageVelocitiesKernel);
+	clReleaseKernel(m_data->m_updateBodyVelocitiesKernel);
+	clReleaseKernel(m_data->m_clearVelocitiesKernel ); // last of the seven kernels created in the constructor
+	delete m_data->m_deltaLinearVelocities; // the objects below were allocated with new in the constructor
+	delete m_data->m_deltaAngularVelocities;
+	delete m_data->m_contactConstraints;
+	delete m_data->m_offsetSplitBodies;
+	delete m_data->m_contactConstraintOffsets;
+	delete m_data->m_bodyCount;
+	delete m_data->m_filler;
+	delete m_data->m_scan;
+	delete m_data; // finally free the container struct itself
+b3Vector3 make_float4(float v) // Returns a b3Vector3 built by b3MakeVector3 with x, y and z all set to v.
+	return b3MakeVector3 (v,v,v); // only three components are passed; the 4th is whatever b3MakeVector3 chooses
+b3Vector4 make_float4(float x,float y, float z, float w) // Four-component overload: forwards x, y, z, w to b3MakeVector4.
+	return b3MakeVector4 (x,y,z,w);
+	// Relative velocity along one constraint row: the sum of the four dot products
+	// (l0 . linVel0) + (a0 . angVel0) + (l1 . linVel1) + (a1 . angVel1).
+	static inline float calcRelVel(const b3Vector3& l0, const b3Vector3& l1, const b3Vector3& a0, const b3Vector3& a1,
+					 const b3Vector3& linVel0, const b3Vector3& angVel0, const b3Vector3& linVel1, const b3Vector3& angVel1)
+	{
+		const b3Scalar linearPart0 = b3Dot(l0, linVel0);
+		const b3Scalar angularPart0 = b3Dot(a0, angVel0);
+		const b3Scalar linearPart1 = b3Dot(l1, linVel1);
+		const b3Scalar angularPart1 = b3Dot(a1, angVel1);
+		// Summed left to right, exactly like a single chained expression.
+		return linearPart0 + angularPart0 + linearPart1 + angularPart1;
+	}
+	// Fills one constraint row for direction n:
+	//   linear   = n
+	//   angular0 = r0 x n
+	//   angular1 = -(r1 x n)
+	// r0 / r1 are the lever arms handed in by the caller for the first / second body.
+	static inline void setLinearAndAngular(const b3Vector3& n, const b3Vector3& r0, const b3Vector3& r1,
+							 b3Vector3& linear, b3Vector3& angular0, b3Vector3& angular1)
+	{
+		linear = n;
+		const b3Vector3 armCrossDir0 = b3Cross(r0, n);
+		angular0 = armCrossDir0;
+		const b3Vector3 armCrossDir1 = b3Cross(r1, n);
+		angular1 = -armCrossDir1;
+	}
+static __inline void solveContact(b3GpuConstraint4& cs,  // Runs one solver pass over the 4 contact rows of cs, accumulating velocity changes into dLinVel*/dAngVel*.
+	const b3Vector3& posA, const b3Vector3& linVelARO, const b3Vector3& angVelARO, float invMassA, const b3Matrix3x3& invInertiaA, // body A state, read only
+	const b3Vector3& posB, const b3Vector3& linVelBRO, const b3Vector3& angVelBRO, float invMassB, const b3Matrix3x3& invInertiaB,  // body B state, read only
+	float maxRambdaDt[4], float minRambdaDt[4], b3Vector3& dLinVelA, b3Vector3& dAngVelA, b3Vector3& dLinVelB, b3Vector3& dAngVelB) // per-row clamp limits and in/out velocity deltas
+	for(int ic=0; ic<4; ic++)
+	{
+		//	rows with a zero coefficient are skipped; the scaled correction below would be zero for them anyway
+		if( cs.m_jacCoeffInv[ic] == 0.f ) continue;
+		{
+			b3Vector3 angular0, angular1, linear;
+			b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA; // lever arm from body A position to the contact point
+			b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB; // lever arm from body B position to the contact point
+			setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, linear, angular0, angular1 );
+			float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1, // row velocity using the current (original + delta) velocities
+				linVelARO+dLinVelA, angVelARO+dAngVelA, linVelBRO+dLinVelB, angVelBRO+dAngVelB ) + cs.m_b[ic]; // plus the row's bias term
+			rambdaDt *= cs.m_jacCoeffInv[ic];
+			{
+				float prevSum = cs.m_appliedRambdaDt[ic]; // value accumulated by earlier passes
+				float updated = prevSum;
+				updated += rambdaDt;
+				updated = b3Max( updated, minRambdaDt[ic] ); // clamp the accumulated value, not the increment
+				updated = b3Min( updated, maxRambdaDt[ic] );
+				rambdaDt = updated - prevSum; // increment that is actually applied after clamping
+				cs.m_appliedRambdaDt[ic] = updated;
+			}
+			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
+			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
+			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
+			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+#ifdef _WIN32 // NOTE(review): the matching #endif is not visible in this chunk - confirm where the Windows-only section ends
+            b3Assert(_finite(linImp0.getX()));
+			b3Assert(_finite(linImp1.getX()));
+			if (invMassA) // bodies with zero inverse mass receive no velocity change
+			{
+				dLinVelA += linImp0;
+				dAngVelA += angImp0;
+			}
+			if (invMassB)
+			{
+				dLinVelB += linImp1;
+				dAngVelB += angImp1;
+			}
+		}
+	}
+void solveContact3(b3GpuConstraint4* cs, // Pointer-based variant of the contact pass: same per-row update, with fixed clamp limits [0, FLT_MAX].
+			b3Vector3* posAPtr, b3Vector3* linVelA, b3Vector3* angVelA, float invMassA, const b3Matrix3x3& invInertiaA, // body A state; only read here
+			b3Vector3* posBPtr, b3Vector3* linVelB, b3Vector3* angVelB, float invMassB, const b3Matrix3x3& invInertiaB, // body B state; only read here
+			b3Vector3* dLinVelA, b3Vector3* dAngVelA, b3Vector3* dLinVelB, b3Vector3* dAngVelB) // in/out velocity deltas
+	float minRambdaDt = 0; // accumulated value is never allowed below zero
+	float maxRambdaDt = FLT_MAX; // effectively no upper bound
+	for(int ic=0; ic<4; ic++)
+	{
+		if( cs->m_jacCoeffInv[ic] == 0.f ) continue; // skip rows with a zero coefficient
+		b3Vector3 angular0, angular1, linear;
+		b3Vector3 r0 = cs->m_worldPos[ic] - *posAPtr; // lever arm from body A position to the contact point
+		b3Vector3 r1 = cs->m_worldPos[ic] - *posBPtr; // lever arm from body B position to the contact point
+		setLinearAndAngular( cs->m_linear, r0, r1, linear, angular0, angular1 );
+		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1,  // row velocity using original + delta velocities
+			*linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic]; // plus the row's bias term
+		rambdaDt *= cs->m_jacCoeffInv[ic];
+		{
+			float prevSum = cs->m_appliedRambdaDt[ic]; // value accumulated by earlier passes
+			float updated = prevSum;
+			updated += rambdaDt;
+			updated = b3Max( updated, minRambdaDt );
+			updated = b3Min( updated, maxRambdaDt );
+			rambdaDt = updated - prevSum; // increment actually applied after clamping
+			cs->m_appliedRambdaDt[ic] = updated;
+		}
+		b3Vector3 linImp0 = invMassA*linear*rambdaDt;
+		b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
+		b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
+		b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+		if (invMassA) // bodies with zero inverse mass receive no velocity change
+		{
+			*dLinVelA += linImp0;
+			*dAngVelA += angImp0;
+		}
+		if (invMassB)
+		{
+			*dLinVelB += linImp1;
+			*dAngVelB += angImp1;
+		}
+	}
+// Runs one friction pass for constraint cs.
+//  posA/linVelARO/angVelARO/invMassA/invInertiaA: body A state (read only); same for body B.
+//  maxRambdaDt/minRambdaDt: clamp limits; entries 0 and 1 are used for the two tangent rows.
+//  dLinVelA/dAngVelA/dLinVelB/dAngVelB: in/out velocity deltas that receive the friction update.
+// The two tangent rows are solved at cs.m_center, then a small damping of the angular
+// velocity about the contact normal is applied under the condition at the end.
+static inline void solveFriction(b3GpuConstraint4& cs, 
+	const b3Vector3& posA, const b3Vector3& linVelARO, const b3Vector3& angVelARO, float invMassA, const b3Matrix3x3& invInertiaA,
+	const b3Vector3& posB, const b3Vector3& linVelBRO, const b3Vector3& angVelBRO, float invMassB, const b3Matrix3x3& invInertiaB, 
+	float maxRambdaDt[4], float minRambdaDt[4], b3Vector3& dLinVelA, b3Vector3& dAngVelA, b3Vector3& dLinVelB, b3Vector3& dAngVelB)
+	// Velocities seen by this pass: original body velocity plus the delta accumulated so far.
+	b3Vector3 linVelA = linVelARO+dLinVelA;
+	b3Vector3 linVelB = linVelBRO+dLinVelB;
+	b3Vector3 angVelA = angVelARO+dAngVelA;
+	b3Vector3 angVelB = angVelBRO+dAngVelB;
+	// Nothing to do when neither friction row has a coefficient.
+	// (The second test previously re-checked index 0 instead of index 1.)
+	if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[1] == 0 ) return;
+	const b3Vector3& center = (const b3Vector3&)cs.m_center;
+	b3Vector3 n = -(const b3Vector3&)cs.m_linear;
+	b3Vector3 tangent[2];
+#if 1		
+	b3PlaneSpace1 (n, tangent[0],tangent[1]);
+	b3Vector3 r = cs.m_worldPos[0]-center;
+	tangent[0] = cross3( n, r );
+	tangent[1] = cross3( tangent[0], n );
+	tangent[0] = normalize3( tangent[0] );
+	tangent[1] = normalize3( tangent[1] );
+	b3Vector3 angular0, angular1, linear;
+	b3Vector3 r0 = center - posA;
+	b3Vector3 r1 = center - posB;
+	for(int i=0; i<2; i++)
+	{
+		setLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 );
+		float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
+			linVelA, angVelA, linVelB, angVelB );
+		rambdaDt *= cs.m_fJacCoeffInv[i];
+			{
+				// Clamp the accumulated value, then apply only the resulting increment.
+				float prevSum = cs.m_fAppliedRambdaDt[i];
+				float updated = prevSum;
+				updated += rambdaDt;
+				updated = b3Max( updated, minRambdaDt[i] );
+				updated = b3Min( updated, maxRambdaDt[i] );
+				rambdaDt = updated - prevSum;
+				cs.m_fAppliedRambdaDt[i] = updated;
+			}
+		b3Vector3 linImp0 = invMassA*linear*rambdaDt;
+		b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
+		b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
+		b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+#ifdef _WIN32
+		b3Assert(_finite(linImp0.getX()));
+		b3Assert(_finite(linImp1.getX()));
+		if (invMassA)
+		{
+			dLinVelA += linImp0;
+			dAngVelA += angImp0;
+		}
+		if (invMassB)
+		{
+			dLinVelB += linImp1;
+			dAngVelB += angImp1;
+		}
+	}
+	{	//	angular damping for point constraint
+		b3Vector3 ab = ( posB - posA ).normalized();
+		b3Vector3 ac = ( center - posA ).normalized();
+		if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
+		{
+			// Remove 10% of each movable body's angular velocity about the normal.
+			float angNA = b3Dot( n, angVelA );
+			float angNB = b3Dot( n, angVelB );
+			if (invMassA)
+				dAngVelA -= (angNA*0.1f)*n;
+			if (invMassB)
+				dAngVelB -= (angNB*0.1f)*n;
+		}
+	}
+float calcJacCoeff(const b3Vector3& linear0, const b3Vector3& linear1, const b3Vector3& angular0, const b3Vector3& angular1, // Returns -1 / (weighted sum of the linear and angular terms of both bodies) for one constraint row.
+					float invMass0, const b3Matrix3x3* invInertia0, float invMass1, const b3Matrix3x3* invInertia1, float countA, float countB) // countA/countB scale the contribution of the first/second body
+	//	linear0,1 are normalized, so their squared length is 1 and the linear terms reduce to the inverse masses
+	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;
+	float jmj1 = b3Dot(mtMul3(angular0,*invInertia0), angular0);
+	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;
+	float jmj3 = b3Dot(mtMul3(angular1,*invInertia1), angular1);
+	return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB); // NOTE(review): no guard against a zero denominator - confirm callers never pass two immovable bodies
+//	return -1.f/((jmj0+jmj1)+(jmj2+jmj3));
+void setConstraint4( const b3Vector3& posA, const b3Vector3& linVelA, const b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, // Converts contact *src into solver constraint *dstC (up to 4 contact rows plus 2 friction rows).
+	const b3Vector3& posB, const b3Vector3& linVelB, const b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,  // body B state, read only
+	 b3Contact4* src, float dt, float positionDrift, float positionConstraintCoeff, float countA, float countB, // countA/countB are forwarded to calcJacCoeff
+	b3GpuConstraint4* dstC ) // output constraint
+	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit); // body id without its sign
+	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);
+	float dtInv = 1.f/dt;
+	for(int ic=0; ic<4; ic++)
+	{
+		dstC->m_appliedRambdaDt[ic] = 0.f;
+	}
+	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f; // friction rows stay disabled unless filled in below
+	dstC->m_linear = src->m_worldNormalOnB;
+	dstC->m_linear[3] = 0.7f ;//src->getFrictionCoeff() ); hard-coded value stored in the 4th component
+	for(int ic=0; ic<4; ic++)
+	{
+		b3Vector3 r0 = src->m_worldPosB[ic] - posA;
+		b3Vector3 r1 = src->m_worldPosB[ic] - posB;
+		if( ic >= src->m_worldNormalOnB[3] )//npoints
+		{
+			dstC->m_jacCoeffInv[ic] = 0.f; // unused row: a zero coefficient makes the solve functions skip it
+			continue;
+		}
+		float relVelN;
+		{
+			b3Vector3 linear, angular0, angular1;
+			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, linear, angular0, angular1);
+			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
+				invMassA, &invInertiaA, invMassB, &invInertiaB ,countA,countB);
+			relVelN = calcRelVel(linear, -linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB);
+			float e = 0.f;//src->getRestituitionCoeff();
+			if( relVelN*relVelN < 0.004f ) // has no effect while e is hard-coded to 0 above
+			{
+				e = 0.f;
+			}
+			dstC->m_b[ic] = e*relVelN;
+			//float penetration = src->m_worldPos[ic].w;
+			dstC->m_b[ic] += (src->m_worldPosB[ic][3] + positionDrift)*positionConstraintCoeff*dtInv; // bias from the 4th component of the contact point (the penetration, per the comment above)
+			dstC->m_appliedRambdaDt[ic] = 0.f;
+		}
+	}
+	if( src->m_worldNormalOnB[3] > 0 )//npoints
+	{	//	prepare friction
+		b3Vector3 center = make_float4(0.f);
+		for(int i=0; i<src->m_worldNormalOnB[3]; i++) 
+			center += src->m_worldPosB[i];
+		center /= (float)src->m_worldNormalOnB[3]; // average of the active contact points
+		b3Vector3 tangent[2];
+		b3PlaneSpace1(src->m_worldNormalOnB,tangent[0],tangent[1]);
+		b3Vector3 r[2];
+		r[0] = center - posA;
+		r[1] = center - posB;
+		for(int i=0; i<2; i++)
+		{
+			b3Vector3 linear, angular0, angular1;
+			setLinearAndAngular(tangent[i], r[0], r[1], linear, angular0, angular1);
+			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
+				invMassA, &invInertiaA, invMassB, &invInertiaB ,countA,countB);
+			dstC->m_fAppliedRambdaDt[i] = 0.f;
+		}
+		dstC->m_center = center;
+	}
+	for(int i=0; i<4; i++)
+	{
+		if( i<src->m_worldNormalOnB[3] ) // copy active contact points, zero the rest
+		{
+			dstC->m_worldPos[i] = src->m_worldPosB[i];
+		}
+		else
+		{
+			dstC->m_worldPos[i] = make_float4(0.f);
+		}
+	}
+void ContactToConstraintKernel(b3Contact4* gContact, b3RigidBodyData* gBodies, b3InertiaData* gShapes, b3GpuConstraint4* gConstraintOut, int nContacts, // Host-side per-contact routine: builds gConstraintOut[gIdx] from gContact[gIdx].
+float dt,
+float positionDrift,
+float positionConstraintCoeff, int gIdx, b3AlignedObjectArray<unsigned int>& bodyCount // NOTE(review): the closing parenthesis of this signature is not visible in this chunk
+	//int gIdx = 0;//GET_GLOBAL_IDX;
+	if( gIdx < nContacts ) // indices at or beyond nContacts do nothing
+	{
+		int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); // body ids without their sign
+		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
+		b3Vector3 posA = gBodies[aIdx].m_pos;
+		b3Vector3 linVelA = gBodies[aIdx].m_linVel;
+		b3Vector3 angVelA = gBodies[aIdx].m_angVel;
+		float invMassA = gBodies[aIdx].m_invMass;
+		b3Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertiaWorld;//.m_invInertia;
+		b3Vector3 posB = gBodies[bIdx].m_pos;
+		b3Vector3 linVelB = gBodies[bIdx].m_linVel;
+		b3Vector3 angVelB = gBodies[bIdx].m_angVel;
+		float invMassB = gBodies[bIdx].m_invMass;
+		b3Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertiaWorld;//m_invInertia;
+		b3GpuConstraint4 cs;
+		float countA = invMassA ? (float)(bodyCount[aIdx]) : 1; // bodies with zero inverse mass use a count of 1
+		float countB = invMassB ? (float)(bodyCount[bIdx]) : 1;
+    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
+			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB,
+			&cs );
+		cs.m_batchIdx = gContact[gIdx].m_batchIdx;
+		gConstraintOut[gIdx] = cs; // publish the finished constraint
+	}
+// Host (CPU) version of the Jacobi contact solver.
+//  bodies, inertias: arrays with numBodies entries; body velocities are updated in place at the end.
+//  manifoldPtr:      array with numManifolds contacts.
+//  solverInfo:       time step, position constants, iteration count and fixed body index.
+// Every non-fixed body gets one velocity-delta slot per contact that references it. After
+// each contact or friction pass the slots of a body are replaced by their average, and
+// finally the (averaged) delta of each body is added to its velocity.
+void b3GpuJacobiContactSolver::solveGroupHost(b3RigidBodyData* bodies,b3InertiaData* inertias,int numBodies,b3Contact4* manifoldPtr, int numManifolds,const b3JacobiSolverInfo& solverInfo)
+	B3_PROFILE("b3GpuJacobiContactSolver::solveGroup");
+	// Without bodies there is nothing to solve; this also keeps bodyCount[numBodies-1] below in range.
+	if (numBodies <= 0)
+		return;
+	b3AlignedObjectArray<unsigned int> bodyCount;
+	bodyCount.resize(numBodies);
+	for (int i=0;i<numBodies;i++)
+		bodyCount[i] = 0;
+	b3AlignedObjectArray<b3Int2> contactConstraintOffsets;
+	contactConstraintOffsets.resize(numManifolds);
+	// Count the contacts per non-fixed body and remember each contact's slot within its bodies.
+	for (int i=0;i<numManifolds;i++)
+	{
+		int pa = manifoldPtr[i].m_bodyAPtrAndSignBit;
+		int pb = manifoldPtr[i].m_bodyBPtrAndSignBit;
+		bool isFixedA = (pa <0) || (pa == solverInfo.m_fixedBodyIndex);
+		bool isFixedB = (pb <0) || (pb == solverInfo.m_fixedBodyIndex);
+		int bodyIndexA = manifoldPtr[i].getBodyA();
+		int bodyIndexB = manifoldPtr[i].getBodyB();
+		if (!isFixedA)
+		{
+			contactConstraintOffsets[i].x = bodyCount[bodyIndexA];
+			bodyCount[bodyIndexA]++;
+		}
+		if (!isFixedB)
+		{
+			contactConstraintOffsets[i].y = bodyCount[bodyIndexB];
+			bodyCount[bodyIndexB]++;
+		} 
+	}
+	b3AlignedObjectArray<unsigned int> offsetSplitBodies;
+	offsetSplitBodies.resize(numBodies);
+	unsigned int totalNumSplitBodies;
+	m_data->m_scan->executeHost(bodyCount,offsetSplitBodies,numBodies,&totalNumSplitBodies);
+	// The count of the last body is added on top of the total reported by the scan.
+	int numlastBody = bodyCount[numBodies-1];
+	totalNumSplitBodies += numlastBody;
+	printf("totalNumSplitBodies = %u\n",totalNumSplitBodies);
+	b3AlignedObjectArray<b3GpuConstraint4> contactConstraints;
+	contactConstraints.resize(numManifolds);
+	for (int i=0;i<numManifolds;i++)
+	{
+		ContactToConstraintKernel(&manifoldPtr[0],bodies,inertias,&contactConstraints[0],numManifolds,
+			solverInfo.m_deltaTime,
+			solverInfo.m_positionDrift,
+			solverInfo.m_positionConstraintCoeff,
+			i, bodyCount);
+	}
+	int maxIter = solverInfo.m_numIterations;
+	b3AlignedObjectArray<b3Vector3> deltaLinearVelocities;
+	b3AlignedObjectArray<b3Vector3> deltaAngularVelocities;
+	deltaLinearVelocities.resize(totalNumSplitBodies);
+	deltaAngularVelocities.resize(totalNumSplitBodies);
+	for (unsigned int i=0;i<totalNumSplitBodies;i++)
+	{
+		deltaLinearVelocities[i].setZero();
+		deltaAngularVelocities[i].setZero();
+	}
+	// Contact passes.
+	for (int iter = 0;iter<maxIter;iter++)
+	{
+		for(int i=0; i<numManifolds; i++)
+		{
+			int aIdx = (int)contactConstraints[i].m_bodyA;
+			int bIdx = (int)contactConstraints[i].m_bodyB;
+			b3RigidBodyData& bodyA = bodies[aIdx];
+			b3RigidBodyData& bodyB = bodies[bIdx];
+			// Bodies with zero inverse mass write into this scratch vector instead of a slot.
+			b3Vector3 zero = b3MakeVector3(0,0,0);
+			b3Vector3* dlvAPtr=&zero;
+			b3Vector3* davAPtr=&zero;
+			b3Vector3* dlvBPtr=&zero;
+			b3Vector3* davBPtr=&zero;
+			if (bodyA.m_invMass)
+			{
+				int bodyOffsetA = offsetSplitBodies[aIdx];
+				int constraintOffsetA = contactConstraintOffsets[i].x;
+				int splitIndexA = bodyOffsetA+constraintOffsetA;
+				dlvAPtr = &deltaLinearVelocities[splitIndexA];
+				davAPtr = &deltaAngularVelocities[splitIndexA];
+			}
+			if (bodyB.m_invMass)
+			{
+				int bodyOffsetB = offsetSplitBodies[bIdx];
+				int constraintOffsetB = contactConstraintOffsets[i].y;
+				int splitIndexB= bodyOffsetB+constraintOffsetB;
+				dlvBPtr =&deltaLinearVelocities[splitIndexB];
+				davBPtr = &deltaAngularVelocities[splitIndexB];
+			}
+			{
+				float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+				float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+				solveContact( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, inertias[aIdx].m_invInertiaWorld, 
+					(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertiaWorld,
+					maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr		);
+			}
+		}
+		// Replace the slots of every movable body by their average.
+		for (int i=0;i<numBodies;i++)
+		{
+			int count = bodyCount[i];
+			// Bodies without contacts have no slots (and would otherwise divide by zero).
+			if (bodies[i].m_invMass && count)
+			{
+				int bodyOffset = offsetSplitBodies[i];
+				float factor = 1.f/float(count);
+				b3Vector3 averageLinVel;
+				averageLinVel.setZero();
+				b3Vector3 averageAngVel;
+				averageAngVel.setZero();
+				for (int j=0;j<count;j++)
+				{
+					averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;
+					averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;
+				}
+				for (int j=0;j<count;j++)
+				{
+					deltaLinearVelocities[bodyOffset+j] = averageLinVel;
+					deltaAngularVelocities[bodyOffset+j] = averageAngVel;
+				}
+			}
+		}
+	}
+	// Friction passes.
+	for (int iter = 0;iter<maxIter;iter++)
+	{
+		//solve friction
+		for(int i=0; i<numManifolds; i++)
+		{
+			float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+			float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+			// Total value accumulated by the contact rows; it bounds the friction rows below.
+			float sum = 0;
+			for(int j=0; j<4; j++)
+			{
+				sum +=contactConstraints[i].m_appliedRambdaDt[j];
+			}
+			float frictionCoeff = contactConstraints[i].getFrictionCoeff();
+			int aIdx = (int)contactConstraints[i].m_bodyA;
+			int bIdx = (int)contactConstraints[i].m_bodyB;
+			b3RigidBodyData& bodyA = bodies[aIdx];
+			b3RigidBodyData& bodyB = bodies[bIdx];
+			b3Vector3 zero = b3MakeVector3(0,0,0);
+			b3Vector3* dlvAPtr=&zero;
+			b3Vector3* davAPtr=&zero;
+			b3Vector3* dlvBPtr=&zero;
+			b3Vector3* davBPtr=&zero;
+			if (bodyA.m_invMass)
+			{
+				int bodyOffsetA = offsetSplitBodies[aIdx];
+				int constraintOffsetA = contactConstraintOffsets[i].x;
+				int splitIndexA = bodyOffsetA+constraintOffsetA;
+				dlvAPtr = &deltaLinearVelocities[splitIndexA];
+				davAPtr = &deltaAngularVelocities[splitIndexA];
+			}
+			if (bodyB.m_invMass)
+			{
+				int bodyOffsetB = offsetSplitBodies[bIdx];
+				int constraintOffsetB = contactConstraintOffsets[i].y;
+				int splitIndexB= bodyOffsetB+constraintOffsetB;
+				dlvBPtr =&deltaLinearVelocities[splitIndexB];
+				davBPtr = &deltaAngularVelocities[splitIndexB];
+			}
+			for(int j=0; j<4; j++)
+			{
+				maxRambdaDt[j] = frictionCoeff*sum;
+				minRambdaDt[j] = -maxRambdaDt[j];
+			}
+			solveFriction( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,inertias[aIdx].m_invInertiaWorld, 
+				(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertiaWorld,
+				maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr);
+		}
+		// Replace the slots of every movable body by their average.
+		for (int i=0;i<numBodies;i++)
+		{
+			int count = bodyCount[i];
+			if (bodies[i].m_invMass && count)
+			{
+				int bodyOffset = offsetSplitBodies[i];
+				float factor = 1.f/float(count);
+				b3Vector3 averageLinVel;
+				averageLinVel.setZero();
+				b3Vector3 averageAngVel;
+				averageAngVel.setZero();
+				for (int j=0;j<count;j++)
+				{
+					averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;
+					averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;
+				}
+				for (int j=0;j<count;j++)
+				{
+					deltaLinearVelocities[bodyOffset+j] = averageLinVel;
+					deltaAngularVelocities[bodyOffset+j] = averageAngVel;
+				}
+			}
+		}
+	}
+	// Apply the averaged delta (all slots of a body hold the same value) to each movable body.
+	for (int i=0;i<numBodies;i++)
+	{
+		if (bodies[i].m_invMass)
+		{
+			int bodyOffset = offsetSplitBodies[i];
+			int count = bodyCount[i];
+			if (count)
+			{
+				bodies[i].m_linVel += deltaLinearVelocities[bodyOffset];
+				bodies[i].m_angVel += deltaAngularVelocities[bodyOffset];
+			}
+		}
+	}
+// GPU version of the Jacobi contact solver.
+//  numBodies, bodyBuf, inertiaBuf: number of rigid bodies and the device buffers holding them.
+//  numContacts, contactBuf:        number of contacts and the device buffer holding them.
+//  config:                         not read by this function.
+//  static0Index:                   stored as the solver's fixed body index.
+// Sequence: count contacts per body, prefix-scan the counts into per-body slot offsets,
+// convert contacts to constraints, clear the velocity-delta slots, then per iteration run
+// contact solve / average / friction solve / average, and finally update the body velocities.
+// All other solver settings come from a default-constructed b3JacobiSolverInfo.
+void b3GpuJacobiContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index)
+//void  b3GpuJacobiContactSolver::solveGroup(b3OpenCLArray<b3RigidBodyData>* bodies,b3OpenCLArray<b3InertiaData>* inertias,b3OpenCLArray<b3Contact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo)
+	// Nothing to solve for empty input; this also keeps m_bodyCount->at(numBodies-1) below
+	// in range and avoids enqueueing kernels that would have no work to do.
+	if (numBodies <= 0 || numContacts <= 0)
+		return;
+	b3JacobiSolverInfo solverInfo;
+	solverInfo.m_fixedBodyIndex = static0Index;
+	B3_PROFILE("b3GpuJacobiContactSolver::solveGroup");
+	//int numBodies = bodies->size();
+	int numManifolds = numContacts;//manifoldPtr->size();
+	{
+		B3_PROFILE("resize");
+		m_data->m_bodyCount->resize(numBodies);
+	}
+	unsigned int val=0;
+	b3Int2 val2;
+	val2.x=0;
+	val2.y=0;
+	 {
+		// Zero the per-body counters and the per-contact slot offsets.
+		B3_PROFILE("m_filler");
+		m_data->m_contactConstraintOffsets->resize(numManifolds);
+		m_data->m_filler->execute(*m_data->m_bodyCount,val,numBodies);
+		m_data->m_filler->execute(*m_data->m_contactConstraintOffsets,val2,numManifolds);
+	}
+	{
+		B3_PROFILE("m_countBodiesKernel");
+		b3LauncherCL launcher(this->m_queue,m_data->m_countBodiesKernel,"m_countBodiesKernel");
+		launcher.setBuffer(contactBuf);//manifoldPtr->getBufferCL());
+		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
+		launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
+		launcher.setConst(numManifolds);
+		launcher.setConst(solverInfo.m_fixedBodyIndex);
+		launcher.launch1D(numManifolds);
+	}
+	unsigned int totalNumSplitBodies=0;
+	{
+		B3_PROFILE("m_scan->execute");
+		m_data->m_offsetSplitBodies->resize(numBodies);
+		m_data->m_scan->execute(*m_data->m_bodyCount,*m_data->m_offsetSplitBodies,numBodies,&totalNumSplitBodies);
+		// The count of the last body is added on top of the total reported by the scan.
+		totalNumSplitBodies+=m_data->m_bodyCount->at(numBodies-1);
+	}
+	{
+		B3_PROFILE("m_data->m_contactConstraints->resize");
+		//int numContacts = manifoldPtr->size();
+		m_data->m_contactConstraints->resize(numContacts);
+	}
+	{
+		B3_PROFILE("contactToConstraintSplitKernel");
+		b3LauncherCL launcher( m_queue, m_data->m_contactToConstraintSplitKernel,"m_contactToConstraintSplitKernel");
+		launcher.setBuffer(contactBuf);
+		launcher.setBuffer(bodyBuf);
+		launcher.setBuffer(inertiaBuf);
+		launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
+		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
+        launcher.setConst(numContacts);
+		launcher.setConst(solverInfo.m_deltaTime);
+		launcher.setConst(solverInfo.m_positionDrift);
+		launcher.setConst(solverInfo.m_positionConstraintCoeff);
+		launcher.launch1D( numContacts, 64 );
+	}
+	{
+		B3_PROFILE("m_data->m_deltaLinearVelocities->resize");
+		m_data->m_deltaLinearVelocities->resize(totalNumSplitBodies);
+		m_data->m_deltaAngularVelocities->resize(totalNumSplitBodies);
+	}
+	{
+		B3_PROFILE("m_clearVelocitiesKernel");
+		b3LauncherCL launch(m_queue,m_data->m_clearVelocitiesKernel,"m_clearVelocitiesKernel");
+		launch.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+		launch.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+		launch.setConst(totalNumSplitBodies);
+		launch.launch1D(totalNumSplitBodies);
+		clFinish(m_queue);
+	}
+	int maxIter = solverInfo.m_numIterations;
+	for (int iter = 0;iter<maxIter;iter++)
+	{
+		{
+			B3_PROFILE("m_solveContactKernel");
+			b3LauncherCL launcher( m_queue, m_data->m_solveContactKernel,"m_solveContactKernel" );
+			launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
+			launcher.setBuffer(bodyBuf);
+			launcher.setBuffer(inertiaBuf);
+			launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
+			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+			launcher.setConst(solverInfo.m_deltaTime);
+			launcher.setConst(solverInfo.m_positionDrift);
+			launcher.setConst(solverInfo.m_positionConstraintCoeff);
+			launcher.setConst(solverInfo.m_fixedBodyIndex);
+			launcher.setConst(numManifolds);
+			launcher.launch1D(numManifolds);
+			clFinish(m_queue);
+		}
+		{
+			B3_PROFILE("average velocities");
+			b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel,"m_averageVelocitiesKernel");
+			launcher.setBuffer(bodyBuf);
+			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
+			launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+			launcher.setConst(numBodies);
+			launcher.launch1D(numBodies);
+			clFinish(m_queue);
+		}
+		{
+			B3_PROFILE("m_solveFrictionKernel");
+			b3LauncherCL launcher( m_queue, m_data->m_solveFrictionKernel,"m_solveFrictionKernel");
+			launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
+			launcher.setBuffer(bodyBuf);
+			launcher.setBuffer(inertiaBuf);
+			launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
+			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+			launcher.setConst(solverInfo.m_deltaTime);
+			launcher.setConst(solverInfo.m_positionDrift);
+			launcher.setConst(solverInfo.m_positionConstraintCoeff);
+			launcher.setConst(solverInfo.m_fixedBodyIndex);
+			launcher.setConst(numManifolds);
+			launcher.launch1D(numManifolds);
+			clFinish(m_queue);
+		}
+		{
+			B3_PROFILE("average velocities");
+			b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel,"m_averageVelocitiesKernel");
+			launcher.setBuffer(bodyBuf);
+			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
+			launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+			launcher.setConst(numBodies);
+			launcher.launch1D(numBodies);
+			clFinish(m_queue);
+		}
+	}
+	{
+		// Write the accumulated velocity deltas back into the body buffer.
+		B3_PROFILE("update body velocities");
+		b3LauncherCL launcher( m_queue, m_data->m_updateBodyVelocitiesKernel,"m_updateBodyVelocitiesKernel");
+		launcher.setBuffer(bodyBuf);
+		launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
+		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
+		launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+		launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+		launcher.setConst(numBodies);
+		launcher.launch1D(numBodies);
+		clFinish(m_queue);
+	}
+#if 0 // disabled: mixed variant that runs the CPU reference steps and the GPU kernels side by side (still written against the older bt* type names)
+void  b3GpuJacobiContactSolver::solveGroupMixed(b3OpenCLArray<b3RigidBodyData>* bodiesGPU,b3OpenCLArray<b3InertiaData>* inertiasGPU,b3OpenCLArray<b3Contact4>* manifoldPtrGPU,const btJacobiSolverInfo& solverInfo)
+	b3AlignedObjectArray<b3RigidBodyData> bodiesCPU; // host copies of the GPU inputs, used by the CPU path below
+	bodiesGPU->copyToHost(bodiesCPU);
+	b3AlignedObjectArray<b3InertiaData> inertiasCPU;
+	inertiasGPU->copyToHost(inertiasCPU);
+	b3AlignedObjectArray<b3Contact4> manifoldPtrCPU;
+	manifoldPtrGPU->copyToHost(manifoldPtrCPU);
+	int numBodiesCPU = bodiesGPU->size();
+	int numManifoldsCPU = manifoldPtrGPU->size();
+	B3_PROFILE("b3GpuJacobiContactSolver::solveGroupMixed");
+	b3AlignedObjectArray<unsigned int> bodyCount;
+	bodyCount.resize(numBodiesCPU);
+	for (int i=0;i<numBodiesCPU;i++)
+		bodyCount[i] = 0; // CPU: per-body contact counter starts at zero
+	b3AlignedObjectArray<b3Int2> contactConstraintOffsets;
+	contactConstraintOffsets.resize(numManifoldsCPU);
+	for (int i=0;i<numManifoldsCPU;i++)
+	{
+		int pa = manifoldPtrCPU[i].m_bodyAPtrAndSignBit;
+		int pb = manifoldPtrCPU[i].m_bodyBPtrAndSignBit;
+		bool isFixedA = (pa <0) || (pa == solverInfo.m_fixedBodyIndex); // negative packed value or the designated fixed index => body A is treated as fixed
+		bool isFixedB = (pb <0) || (pb == solverInfo.m_fixedBodyIndex);
+		int bodyIndexA = manifoldPtrCPU[i].getBodyA();
+		int bodyIndexB = manifoldPtrCPU[i].getBodyB();
+		if (!isFixedA)
+		{
+			contactConstraintOffsets[i].x = bodyCount[bodyIndexA]; // slot of this contact among body A's contacts so far
+			bodyCount[bodyIndexA]++;
+		}
+		if (!isFixedB)
+		{
+			contactConstraintOffsets[i].y = bodyCount[bodyIndexB];
+			bodyCount[bodyIndexB]++;
+		} 
+	}
+	b3AlignedObjectArray<unsigned int> offsetSplitBodies;
+	offsetSplitBodies.resize(numBodiesCPU);
+	unsigned int totalNumSplitBodiesCPU;
+	m_data->m_scan->executeHost(bodyCount,offsetSplitBodies,numBodiesCPU,&totalNumSplitBodiesCPU); // host-side scan of bodyCount into offsetSplitBodies; presumably exclusive, since the last body's count is added to the total just below
+	int numlastBody = bodyCount[numBodiesCPU-1]; // NOTE(review): indexes element -1 when numBodiesCPU is 0
+	totalNumSplitBodiesCPU += numlastBody;
+		int numBodies = bodiesGPU->size();
+	int numManifolds = manifoldPtrGPU->size();
+	m_data->m_bodyCount->resize(numBodies);
+	unsigned int val=0;
+	b3Int2 val2;
+	val2.x=0;
+	val2.y=0;
+	 {
+		B3_PROFILE("m_filler");
+		m_data->m_contactConstraintOffsets->resize(numManifolds);
+		m_data->m_filler->execute(*m_data->m_bodyCount,val,numBodies); // GPU: same zero-initialisation as the CPU loops above
+		m_data->m_filler->execute(*m_data->m_contactConstraintOffsets,val2,numManifolds);
+	}
+	{
+		B3_PROFILE("m_countBodiesKernel");
+		b3LauncherCL launcher(this->m_queue,m_data->m_countBodiesKernel);
+		launcher.setBuffer(manifoldPtrGPU->getBufferCL());
+		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
+		launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
+		launcher.setConst(numManifolds);
+		launcher.setConst(solverInfo.m_fixedBodyIndex);
+		launcher.launch1D(numManifolds);
+	}
+	unsigned int totalNumSplitBodies=0;
+	m_data->m_offsetSplitBodies->resize(numBodies);
+	m_data->m_scan->execute(*m_data->m_bodyCount,*m_data->m_offsetSplitBodies,numBodies,&totalNumSplitBodies);
+	totalNumSplitBodies+=m_data->m_bodyCount->at(numBodies-1);
+	if (totalNumSplitBodies != totalNumSplitBodiesCPU) // cross-check: GPU total against the CPU reference total
+	{
+		printf("error in totalNumSplitBodies!\n");
+	}
+	int numContacts = manifoldPtrGPU->size();
+	m_data->m_contactConstraints->resize(numContacts);
+	{
+		B3_PROFILE("contactToConstraintSplitKernel");
+		b3LauncherCL launcher( m_queue, m_data->m_contactToConstraintSplitKernel);
+		launcher.setBuffer(manifoldPtrGPU->getBufferCL());
+		launcher.setBuffer(bodiesGPU->getBufferCL());
+		launcher.setBuffer(inertiasGPU->getBufferCL());
+		launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
+		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
+        launcher.setConst(numContacts);
+		launcher.setConst(solverInfo.m_deltaTime);
+		launcher.setConst(solverInfo.m_positionDrift);
+		launcher.setConst(solverInfo.m_positionConstraintCoeff);
+		launcher.launch1D( numContacts, 64 );
+		clFinish(m_queue);
+	}
+	b3AlignedObjectArray<b3GpuConstraint4> contactConstraints;
+	contactConstraints.resize(numManifoldsCPU);
+	for (int i=0;i<numManifoldsCPU;i++)
+	{
+		ContactToConstraintKernel(&manifoldPtrCPU[0],&bodiesCPU[0],&inertiasCPU[0],&contactConstraints[0],numManifoldsCPU, // CPU counterpart of the kernel launch above, called once per manifold index i
+			solverInfo.m_deltaTime,
+			solverInfo.m_positionDrift,
+			solverInfo.m_positionConstraintCoeff,
+			i, bodyCount);
+	}
+	int maxIter = solverInfo.m_numIterations;
+	b3AlignedObjectArray<b3Vector3> deltaLinearVelocities;
+	b3AlignedObjectArray<b3Vector3> deltaAngularVelocities;
+	deltaLinearVelocities.resize(totalNumSplitBodiesCPU);
+	deltaAngularVelocities.resize(totalNumSplitBodiesCPU);
+	for (int i=0;i<totalNumSplitBodiesCPU;i++) // note: signed loop index compared against an unsigned total
+	{
+		deltaLinearVelocities[i].setZero();
+		deltaAngularVelocities[i].setZero();
+	}
+	m_data->m_deltaLinearVelocities->resize(totalNumSplitBodies);
+	m_data->m_deltaAngularVelocities->resize(totalNumSplitBodies);
+	{
+		B3_PROFILE("m_clearVelocitiesKernel");
+		b3LauncherCL launch(m_queue,m_data->m_clearVelocitiesKernel);
+		launch.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+		launch.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+		launch.setConst(totalNumSplitBodies);
+		launch.launch1D(totalNumSplitBodies);
+	}
+		///!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+	m_data->m_contactConstraints->copyToHost(contactConstraints); // from here the CPU-side arrays are overwritten with the GPU results before iterating
+	m_data->m_offsetSplitBodies->copyToHost(offsetSplitBodies);
+	m_data->m_contactConstraintOffsets->copyToHost(contactConstraintOffsets);
+	m_data->m_deltaLinearVelocities->copyToHost(deltaLinearVelocities);
+	m_data->m_deltaAngularVelocities->copyToHost(deltaAngularVelocities);
+	for (int iter = 0;iter<maxIter;iter++)
+	{
+				{
+			B3_PROFILE("m_solveContactKernel");
+			b3LauncherCL launcher( m_queue, m_data->m_solveContactKernel );
+			launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
+			launcher.setBuffer(bodiesGPU->getBufferCL());
+			launcher.setBuffer(inertiasGPU->getBufferCL());
+			launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
+			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+			launcher.setConst(solverInfo.m_deltaTime);
+			launcher.setConst(solverInfo.m_positionDrift);
+			launcher.setConst(solverInfo.m_positionConstraintCoeff);
+			launcher.setConst(solverInfo.m_fixedBodyIndex);
+			launcher.setConst(numManifolds);
+			launcher.launch1D(numManifolds);
+			clFinish(m_queue);
+		}
+		int i=0;
+		for( i=0; i<numManifoldsCPU; i++)
+		{
+			float frictionCoeff = contactConstraints[i].getFrictionCoeff();
+			int aIdx = (int)contactConstraints[i].m_bodyA;
+			int bIdx = (int)contactConstraints[i].m_bodyB;
+			b3RigidBodyData& bodyA = bodiesCPU[aIdx];
+			b3RigidBodyData& bodyB = bodiesCPU[bIdx];
+			b3Vector3 zero(0,0,0);
+			b3Vector3* dlvAPtr=&zero; // bodies with zero inverse mass keep pointing at this local zero vector
+			b3Vector3* davAPtr=&zero;
+			b3Vector3* dlvBPtr=&zero;
+			b3Vector3* davBPtr=&zero;
+			if (bodyA.m_invMass)
+			{
+				int bodyOffsetA = offsetSplitBodies[aIdx];
+				int constraintOffsetA = contactConstraintOffsets[i].x;
+				int splitIndexA = bodyOffsetA+constraintOffsetA; // body's base offset plus this contact's slot
+				dlvAPtr = &deltaLinearVelocities[splitIndexA];
+				davAPtr = &deltaAngularVelocities[splitIndexA];
+			}
+			if (bodyB.m_invMass)
+			{
+				int bodyOffsetB = offsetSplitBodies[bIdx];
+				int constraintOffsetB = contactConstraintOffsets[i].y;
+				int splitIndexB= bodyOffsetB+constraintOffsetB;
+				dlvBPtr =&deltaLinearVelocities[splitIndexB];
+				davBPtr = &deltaAngularVelocities[splitIndexB];
+			}
+			{
+				float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+				float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+				solveContact( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, inertiasCPU[aIdx].m_invInertiaWorld, 
+					(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertiasCPU[bIdx].m_invInertiaWorld,
+					maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr		);
+			}
+		}
+		{
+			B3_PROFILE("average velocities");
+			b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel);
+			launcher.setBuffer(bodiesGPU->getBufferCL());
+			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
+			launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+			launcher.setConst(numBodies);
+			launcher.launch1D(numBodies);
+			clFinish(m_queue);
+		}
+		//easy
+		for (int i=0;i<numBodiesCPU;i++)
+		{
+			if (bodiesCPU[i].m_invMass)
+			{
+				int bodyOffset = offsetSplitBodies[i];
+				int count = bodyCount[i];
+				float factor = 1.f/float(count); // NOTE(review): count is not checked for zero; factor is only used inside the loops below, which do not run in that case
+				b3Vector3 averageLinVel;
+				averageLinVel.setZero();
+				b3Vector3 averageAngVel;
+				averageAngVel.setZero();
+				for (int j=0;j<count;j++)
+				{
+					averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;
+					averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;
+				}
+				for (int j=0;j<count;j++)
+				{
+					deltaLinearVelocities[bodyOffset+j] = averageLinVel; // every split slot of the body receives the average
+					deltaAngularVelocities[bodyOffset+j] = averageAngVel;
+				}
+			}
+		}
+//	m_data->m_deltaAngularVelocities->copyFromHost(deltaAngularVelocities);
+	//m_data->m_deltaLinearVelocities->copyFromHost(deltaLinearVelocities);
+	m_data->m_deltaAngularVelocities->copyToHost(deltaAngularVelocities); // GPU -> host: replaces the host-side averages computed just above
+	m_data->m_deltaLinearVelocities->copyToHost(deltaLinearVelocities);
+#if 0 // nested disabled region that starts with the friction pass
+		{
+			B3_PROFILE("m_solveFrictionKernel");
+			b3LauncherCL launcher( m_queue, m_data->m_solveFrictionKernel);
+			launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
+			launcher.setBuffer(bodiesGPU->getBufferCL());
+			launcher.setBuffer(inertiasGPU->getBufferCL());
+			launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
+			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+			launcher.setConst(solverInfo.m_deltaTime);
+			launcher.setConst(solverInfo.m_positionDrift);
+			launcher.setConst(solverInfo.m_positionConstraintCoeff);
+			launcher.setConst(solverInfo.m_fixedBodyIndex);
+			launcher.setConst(numManifolds);
+			launcher.launch1D(numManifolds);
+			clFinish(m_queue);
+		}
+		//solve friction
+		for(int i=0; i<numManifoldsCPU; i++)
+		{
+			float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+			float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+			float sum = 0;
+			for(int j=0; j<4; j++)
+			{
+				sum +=contactConstraints[i].m_appliedRambdaDt[j];
+			}
+			float frictionCoeff = contactConstraints[i].getFrictionCoeff();
+			int aIdx = (int)contactConstraints[i].m_bodyA;
+			int bIdx = (int)contactConstraints[i].m_bodyB;
+			b3RigidBodyData& bodyA = bodiesCPU[aIdx];
+			b3RigidBodyData& bodyB = bodiesCPU[bIdx];
+			b3Vector3 zero(0,0,0);
+			b3Vector3* dlvAPtr=&zero;
+			b3Vector3* davAPtr=&zero;
+			b3Vector3* dlvBPtr=&zero;
+			b3Vector3* davBPtr=&zero;
+			if (bodyA.m_invMass)
+			{
+				int bodyOffsetA = offsetSplitBodies[aIdx];
+				int constraintOffsetA = contactConstraintOffsets[i].x;
+				int splitIndexA = bodyOffsetA+constraintOffsetA;
+				dlvAPtr = &deltaLinearVelocities[splitIndexA];
+				davAPtr = &deltaAngularVelocities[splitIndexA];
+			}
+			if (bodyB.m_invMass)
+			{
+				int bodyOffsetB = offsetSplitBodies[bIdx];
+				int constraintOffsetB = contactConstraintOffsets[i].y;
+				int splitIndexB= bodyOffsetB+constraintOffsetB;
+				dlvBPtr =&deltaLinearVelocities[splitIndexB];
+				davBPtr = &deltaAngularVelocities[splitIndexB];
+			}
+			for(int j=0; j<4; j++)
+			{
+				maxRambdaDt[j] = frictionCoeff*sum; // friction bounds: +/- frictionCoeff times the summed applied contact impulses
+				minRambdaDt[j] = -maxRambdaDt[j];
+			}
+			solveFriction( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,inertiasCPU[aIdx].m_invInertiaWorld, 
+				(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertiasCPU[bIdx].m_invInertiaWorld,
+				maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr);
+		}
+		{
+			B3_PROFILE("average velocities");
+			b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel);
+			launcher.setBuffer(bodiesGPU->getBufferCL());
+			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
+			launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+			launcher.setConst(numBodies);
+			launcher.launch1D(numBodies);
+			clFinish(m_queue);
+		}
+		//easy
+		for (int i=0;i<numBodiesCPU;i++)
+		{
+			if (bodiesCPU[i].m_invMass)
+			{
+				int bodyOffset = offsetSplitBodies[i];
+				int count = bodyCount[i];
+				float factor = 1.f/float(count);
+				b3Vector3 averageLinVel;
+				averageLinVel.setZero();
+				b3Vector3 averageAngVel;
+				averageAngVel.setZero();
+				for (int j=0;j<count;j++)
+				{
+					averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;
+					averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;
+				}
+				for (int j=0;j<count;j++)
+				{
+					deltaLinearVelocities[bodyOffset+j] = averageLinVel;
+					deltaAngularVelocities[bodyOffset+j] = averageAngVel;
+				}
+			}
+		}
+	}
+	{
+		B3_PROFILE("update body velocities");
+		b3LauncherCL launcher( m_queue, m_data->m_updateBodyVelocitiesKernel);
+		launcher.setBuffer(bodiesGPU->getBufferCL());
+		launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
+		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
+		launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
+		launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
+		launcher.setConst(numBodies);
+		launcher.launch1D(numBodies);
+		clFinish(m_queue);
+	}
+	//easy
+	for (int i=0;i<numBodiesCPU;i++)
+	{
+		if (bodiesCPU[i].m_invMass)
+		{
+			int bodyOffset = offsetSplitBodies[i];
+			int count = bodyCount[i];
+			if (count)
+			{
+				bodiesCPU[i].m_linVel += deltaLinearVelocities[bodyOffset]; // only the first split slot of the body is applied to the host copy
+				bodiesCPU[i].m_angVel += deltaAngularVelocities[bodyOffset];
+			}
+		}
+	}
+//	bodiesGPU->copyFromHost(bodiesCPU);
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h
new file mode 100644
index 00000000..b418f29e
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h
@@ -0,0 +1,62 @@
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+//#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+//struct b3InertiaData;
+class b3TypedConstraint;
+struct b3JacobiSolverInfo // parameter bundle passed by const reference to b3GpuJacobiContactSolver::solveGroupHost
+	int m_fixedBodyIndex; // presumably the index of the body treated as fixed/static — TODO confirm against the solver implementation
+	float m_deltaTime; // time step; defaults to 1/60
+	float m_positionDrift; // defaults to 0.005
+	float m_positionConstraintCoeff; // defaults to 0.99
+	int	m_numIterations; // defaults to 7
+	b3JacobiSolverInfo() // default constructor: fills in the defaults listed on the members above
+		:m_fixedBodyIndex(0),
+		m_deltaTime(1./60.f), // 1. is a double literal, so the division is done in double and then narrowed to float
+		m_positionDrift( 0.005f ), 
+		m_positionConstraintCoeff( 0.99f ),
+		m_numIterations(7)
+	{
+	}
+class b3GpuJacobiContactSolver // contact solver that is constructed from an OpenCL context, device and command queue
+	struct b3GpuJacobiSolverInternalData* m_data; // implementation state; the struct is only forward-declared here
+	cl_context m_context;
+	cl_device_id m_device;
+	cl_command_queue m_queue;
+	b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
+	virtual ~b3GpuJacobiContactSolver();
+	void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index); // variant taking raw cl_mem buffers for bodies, inertias and contacts
+	void solveGroupHost(b3RigidBodyData* bodies,b3InertiaData* inertias,int numBodies,struct b3Contact4* manifoldPtr, int numManifolds,const b3JacobiSolverInfo& solverInfo); // variant taking plain pointers plus element counts
+	//void  solveGroupHost(btRigidBodyCL* bodies,b3InertiaData* inertias,int numBodies,btContact4* manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btJacobiSolverInfo& solverInfo);
+	//b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
+	//void  solveGroup(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo);
+	//void  solveGroupMixed(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo);
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp
new file mode 100644
index 00000000..b512405d
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp
@@ -0,0 +1,1107 @@
+#include "b3GpuNarrowPhase.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
+#include <string.h>
+#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
+#include "Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h"
+#include "Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+#include "Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h"
+#include "b3GpuNarrowPhaseInternalData.h"
+#include "Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h"
+b3GpuNarrowPhase::b3GpuNarrowPhase(cl_context ctx, cl_device_id device, cl_command_queue queue, const b3Config& config) // Creates the internal data block and the CPU/GPU buffers, with capacities taken from config
+:m_data(0) ,m_planeBodyIndex(-1),m_static0Index(-1),
+	m_data = new b3GpuNarrowPhaseInternalData();
+	m_data->m_currentContactBuffer = 0; // redundant: the memset on the following line zeroes the whole struct
+	memset(m_data,0,sizeof(b3GpuNarrowPhaseInternalData)); // NOTE(review): byte-wise zeroing of an object just created with new; members such as m_collidablesCPU are used as container objects below, so this wipes whatever their constructors set up — confirm this is intended
+	m_data->m_config = config;
+	m_data->m_gpuSatCollision = new GpuSatCollision(ctx,device,queue);
+	m_data->m_triangleConvexPairs = new b3OpenCLArray<b3Int4>(m_context,m_queue, config.m_maxTriConvexPairCapacity); // uses the members m_context/m_queue, whereas most allocations below use the arguments ctx/queue — TODO confirm the members are initialised before this point
+	//m_data->m_convexPairsOutGPU = new b3OpenCLArray<b3Int2>(ctx,queue,config.m_maxBroadphasePairs,false);
+	//m_data->m_planePairs = new b3OpenCLArray<b3Int2>(ctx,queue,config.m_maxBroadphasePairs,false);
+	m_data->m_pBufContactOutCPU = new b3AlignedObjectArray<b3Contact4>();
+	m_data->m_pBufContactOutCPU->resize(config.m_maxBroadphasePairs);
+	m_data->m_bodyBufferCPU = new b3AlignedObjectArray<b3RigidBodyData>();
+	m_data->m_bodyBufferCPU->resize(config.m_maxConvexBodies);
+	m_data->m_inertiaBufferCPU = new b3AlignedObjectArray<b3InertiaData>();
+	m_data->m_inertiaBufferCPU->resize(config.m_maxConvexBodies);
+	m_data->m_pBufContactBuffersGPU[0] = new b3OpenCLArray<b3Contact4>(ctx,queue, config.m_maxContactCapacity,true); // two GPU contact buffers; presumably m_currentContactBuffer selects the active one — TODO confirm
+	m_data->m_pBufContactBuffersGPU[1] = new b3OpenCLArray<b3Contact4>(ctx,queue, config.m_maxContactCapacity,true);
+	m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx,queue,config.m_maxConvexBodies,false);
+	m_data->m_collidablesGPU = new b3OpenCLArray<b3Collidable>(ctx,queue,config.m_maxConvexShapes);
+	m_data->m_collidablesCPU.reserve(config.m_maxConvexShapes);
+	m_data->m_localShapeAABBCPU = new b3AlignedObjectArray<b3SapAabb>;
+	m_data->m_localShapeAABBGPU = new b3OpenCLArray<b3SapAabb>(ctx,queue,config.m_maxConvexShapes);
+	//m_data->m_solverDataGPU = adl::Solver<adl::TYPE_CL>::allocate(ctx,queue, config.m_maxBroadphasePairs,false);
+	m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx,queue, config.m_maxConvexBodies,false);
+	m_data->m_convexFacesGPU = new b3OpenCLArray<b3GpuFace>(ctx,queue,config.m_maxConvexShapes*config.m_maxFacesPerShape,false);
+	m_data->m_convexFaces.reserve(config.m_maxConvexShapes*config.m_maxFacesPerShape);
+	m_data->m_gpuChildShapes = new b3OpenCLArray<b3GpuChildShape>(ctx,queue,config.m_maxCompoundChildShapes,false);
+	m_data->m_convexPolyhedraGPU = new b3OpenCLArray<b3ConvexPolyhedronData>(ctx,queue,config.m_maxConvexShapes,false);
+	m_data->m_convexPolyhedra.reserve(config.m_maxConvexShapes);
+	m_data->m_uniqueEdgesGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexUniqueEdges,true);
+	m_data->m_uniqueEdges.reserve(config.m_maxConvexUniqueEdges);
+	m_data->m_convexVerticesGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexVertices,true);
+	m_data->m_convexVertices.reserve(config.m_maxConvexVertices);
+	m_data->m_convexIndicesGPU = new b3OpenCLArray<int>(ctx,queue,config.m_maxConvexIndices,true);
+    m_data->m_convexIndices.reserve(config.m_maxConvexIndices);
+	m_data->m_worldVertsB1GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace);
+    m_data->m_clippingFacesOutGPU = new  b3OpenCLArray<b3Int4>(ctx,queue,config.m_maxConvexBodies);
+    m_data->m_worldNormalsAGPU = new  b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies);
+	m_data->m_worldVertsA1GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace);
+    m_data->m_worldVertsB2GPU = new  b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace);
+	m_data->m_convexData = new b3AlignedObjectArray<b3ConvexUtility* >(); // array of raw b3ConvexUtility pointers
+	m_data->m_convexData->resize(config.m_maxConvexShapes);
+	m_data->m_convexPolyhedra.resize(config.m_maxConvexShapes); // sized to the maximum here; registerConvexHullShapeInternal later resizes it to m_numAcceleratedShapes+1
+	m_data->m_numAcceleratedShapes = 0;
+	m_data->m_numAcceleratedRigidBodies = 0;
+	m_data->m_subTreesGPU = new b3OpenCLArray<b3BvhSubtreeInfo>(this->m_context,this->m_queue);
+	m_data->m_treeNodesGPU = new b3OpenCLArray<b3QuantizedBvhNode>(this->m_context,this->m_queue);
+	m_data->m_bvhInfoGPU = new b3OpenCLArray<b3BvhInfo>(this->m_context,this->m_queue);
+	//m_data->m_contactCGPU = new b3OpenCLArray<Constraint4>(ctx,queue,config.m_maxBroadphasePairs,false);
+	//m_data->m_frictionCGPU = new b3OpenCLArray<adl::Solver<adl::TYPE_CL>::allocateFrictionConstraint( m_data->m_deviceCL, config.m_maxBroadphasePairs);
+	delete m_data->m_gpuSatCollision; // NOTE(review): teardown sequence, presumably the destructor body — releases the objects the constructor allocated with new
+	delete m_data->m_triangleConvexPairs;
+	//delete m_data->m_convexPairsOutGPU;
+	//delete m_data->m_planePairs;
+	delete m_data->m_pBufContactOutCPU;
+	delete m_data->m_bodyBufferCPU;
+	delete m_data->m_inertiaBufferCPU;
+	delete m_data->m_pBufContactBuffersGPU[0];
+	delete m_data->m_pBufContactBuffersGPU[1];
+	delete m_data->m_inertiaBufferGPU;
+	delete m_data->m_collidablesGPU;
+	delete m_data->m_localShapeAABBCPU;
+	delete m_data->m_localShapeAABBGPU;
+	delete m_data->m_bodyBufferGPU;
+	delete m_data->m_convexFacesGPU;
+	delete m_data->m_gpuChildShapes;
+	delete m_data->m_convexPolyhedraGPU;
+	delete m_data->m_uniqueEdgesGPU;
+	delete m_data->m_convexVerticesGPU;
+	delete m_data->m_convexIndicesGPU;
+	delete m_data->m_worldVertsB1GPU;
+    delete m_data->m_clippingFacesOutGPU;
+    delete m_data->m_worldNormalsAGPU;
+	delete m_data->m_worldVertsA1GPU;
+    delete m_data->m_worldVertsB2GPU;
+	delete m_data->m_bvhInfoGPU;
+	for (int i=0;i<m_data->m_bvhData.size();i++) // elements are deleted one by one before the arrays are cleared below
+	{
+		delete m_data->m_bvhData[i];
+	}
+	for (int i=0;i<m_data->m_meshInterfaces.size();i++)
+	{
+		delete m_data->m_meshInterfaces[i];
+	}
+	m_data->m_meshInterfaces.clear();
+	m_data->m_bvhData.clear();
+	delete m_data->m_treeNodesGPU;
+	delete m_data->m_subTreesGPU;
+    delete m_data->m_convexData; // deletes the array object only; the b3ConvexUtility pointers stored in it are not deleted here
+	delete m_data; // the data block itself goes last
+int	b3GpuNarrowPhase::allocateCollidable() // Appends one slot to m_collidablesCPU and returns its index; returns -1 once m_maxConvexShapes slots exist
+	int curSize = m_data->m_collidablesCPU.size();
+	if (curSize<m_data->m_config.m_maxConvexShapes)
+	{
+		m_data->m_collidablesCPU.expand(); // grow by one; the previous size is the index of the new slot
+		return curSize;
+	}
+	else
+	{
+		b3Error("allocateCollidable out-of-range %d\n",m_data->m_config.m_maxConvexShapes); // reports the configured limit, then falls through to the -1 return
+	}
+	return -1;
+int		b3GpuNarrowPhase::registerSphereShape(float radius) // Registers a sphere collidable and its local AABB; returns the collidable index, or the negative value from allocateCollidable on failure
+	int collidableIndex = allocateCollidable();
+	if (collidableIndex<0)
+		return collidableIndex;
+	b3Collidable& col = getCollidableCpu(collidableIndex);
+	col.m_shapeType = SHAPE_SPHERE;
+	col.m_shapeIndex = 0; // fixed at 0, so the m_shapeIndex>=0 test below always passes
+	col.m_radius = radius;
+	if (col.m_shapeIndex>=0)
+	{
+		b3SapAabb aabb;
+		b3Vector3 myAabbMin=b3MakeVector3(-radius,-radius,-radius); // local AABB is the cube [-radius, radius] on each axis
+		b3Vector3 myAabbMax=b3MakeVector3(radius,radius,radius);
+		aabb.m_min[0] = myAabbMin[0];//s_convexHeightField->m_aabb.m_min.x;
+		aabb.m_min[1] = myAabbMin[1];//s_convexHeightField->m_aabb.m_min.y;
+		aabb.m_min[2] = myAabbMin[2];//s_convexHeightField->m_aabb.m_min.z;
+		aabb.m_minIndices[3] = 0;
+		aabb.m_max[0] = myAabbMax[0];//s_convexHeightField->m_aabb.m_max.x;
+		aabb.m_max[1] = myAabbMax[1];//s_convexHeightField->m_aabb.m_max.y;
+		aabb.m_max[2] = myAabbMax[2];//s_convexHeightField->m_aabb.m_max.z;
+		aabb.m_signedMaxIndices[3] = 0;
+		m_data->m_localShapeAABBCPU->push_back(aabb);
+//		m_data->m_localShapeAABBGPU->push_back(aabb);
+		clFinish(m_queue); // NOTE(review): nothing is enqueued on m_queue in this function (the GPU push_back above is commented out) — confirm this sync is still needed
+	}
+	return collidableIndex;
+int b3GpuNarrowPhase::registerFace(const b3Vector3& faceNormal, float faceConstant) // Appends one face to m_convexFaces and returns the index of the new entry
+	int faceOffset = m_data->m_convexFaces.size(); // size before the append is the index of the new face
+	b3GpuFace& face = m_data->m_convexFaces.expand();
+	face.m_plane = b3MakeVector3(faceNormal.x,faceNormal.y,faceNormal.z,faceConstant); // plane stored as (normal.x, normal.y, normal.z, constant)
+	return faceOffset;
+int		b3GpuNarrowPhase::registerPlaneShape(const b3Vector3& planeNormal, float planeConstant) // Registers a plane collidable backed by one face entry; returns the collidable index, or the negative value from allocateCollidable on failure
+	int collidableIndex = allocateCollidable();
+	if (collidableIndex<0)
+		return collidableIndex;
+	b3Collidable& col = getCollidableCpu(collidableIndex);
+	col.m_shapeType = SHAPE_PLANE;
+	col.m_shapeIndex = registerFace(planeNormal,planeConstant); // shape index is the index of the face holding the plane equation
+	col.m_radius = planeConstant; // the plane constant is also stored in m_radius
+	if (col.m_shapeIndex>=0)
+	{
+		b3SapAabb aabb;
+		aabb.m_min[0] = -1e30f; // local AABB spans -1e30 .. 1e30 on every axis
+		aabb.m_min[1] = -1e30f;
+		aabb.m_min[2] = -1e30f;
+		aabb.m_minIndices[3] = 0;
+		aabb.m_max[0] = 1e30f;
+		aabb.m_max[1] = 1e30f;
+		aabb.m_max[2] = 1e30f;
+		aabb.m_signedMaxIndices[3] = 0;
+		m_data->m_localShapeAABBCPU->push_back(aabb);
+//		m_data->m_localShapeAABBGPU->push_back(aabb);
+		clFinish(m_queue); // NOTE(review): nothing is enqueued on m_queue in this function (the GPU push_back above is commented out) — confirm this sync is still needed
+	}
+	return collidableIndex;
+int b3GpuNarrowPhase::registerConvexHullShapeInternal(b3ConvexUtility* convexPtr,b3Collidable& col) // Copies one hull's edges, faces, indices and vertices into the shared CPU arrays, records the offsets, and returns the new shape index. NOTE(review): col is not used in the visible body
+	m_data->m_convexData->resize(m_data->m_numAcceleratedShapes+1);
+	m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes+1);
+	b3ConvexPolyhedronData& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1); // the entry just added by the resize
+	convex.mC = convexPtr->mC;
+	convex.mE = convexPtr->mE;
+	convex.m_extents= convexPtr->m_extents;
+	convex.m_localCenter = convexPtr->m_localCenter;
+	convex.m_radius = convexPtr->m_radius;
+	convex.m_numUniqueEdges = convexPtr->m_uniqueEdges.size();
+	int edgeOffset = m_data->m_uniqueEdges.size(); // this hull's edges are appended after the existing ones
+	convex.m_uniqueEdgesOffset = edgeOffset;
+	m_data->m_uniqueEdges.resize(edgeOffset+convex.m_numUniqueEdges);
+	//convex data here
+	int i;
+	for ( i=0;i<convexPtr->m_uniqueEdges.size();i++)
+	{
+		m_data->m_uniqueEdges[edgeOffset+i] = convexPtr->m_uniqueEdges[i];
+	}
+	int faceOffset = m_data->m_convexFaces.size();
+	convex.m_faceOffset = faceOffset;
+	convex.m_numFaces = convexPtr->m_faces.size();
+	m_data->m_convexFaces.resize(faceOffset+convex.m_numFaces);
+	for (i=0;i<convexPtr->m_faces.size();i++)
+	{
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane = b3MakeVector3(convexPtr->m_faces[i].m_plane[0],
+																			convexPtr->m_faces[i].m_plane[1],
+																			convexPtr->m_faces[i].m_plane[2],
+																			convexPtr->m_faces[i].m_plane[3]);
+		int indexOffset = m_data->m_convexIndices.size(); // each face records where its run of indices starts in the shared index array
+		int numIndices = convexPtr->m_faces[i].m_indices.size();
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_numIndices = numIndices;
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_indexOffset = indexOffset;
+		m_data->m_convexIndices.resize(indexOffset+numIndices);
+		for (int p=0;p<numIndices;p++)
+		{
+			m_data->m_convexIndices[indexOffset+p] = convexPtr->m_faces[i].m_indices[p];
+		}
+	}
+	convex.m_numVertices = convexPtr->m_vertices.size();
+	int vertexOffset = m_data->m_convexVertices.size();
+	convex.m_vertexOffset =vertexOffset;
+	m_data->m_convexVertices.resize(vertexOffset+convex.m_numVertices);
+	for (int i=0;i<convexPtr->m_vertices.size();i++) // this i shadows the function-scope i declared above
+	{
+		m_data->m_convexVertices[vertexOffset+i] = convexPtr->m_vertices[i];
+	}
+	(*m_data->m_convexData)[m_data->m_numAcceleratedShapes] = convexPtr; // stores the caller's raw pointer, not a copy of the object
+	return m_data->m_numAcceleratedShapes++; // post-increment: returns the index that was just filled
+int		b3GpuNarrowPhase::registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling) // Builds a temporary b3ConvexUtility from strided, per-axis scaled vertices and registers it; returns the collidable index from the b3ConvexUtility* overload
+	b3AlignedObjectArray<b3Vector3> verts;
+	unsigned char* vts = (unsigned char*) vertices; // byte pointer so the stride can be applied in bytes; the cast also drops const
+	for (int i=0;i<numVertices;i++)
+	{
+		float* vertex = (float*) &vts[i*strideInBytes];
+		verts.push_back(b3MakeVector3(vertex[0]*scaling[0],vertex[1]*scaling[1],vertex[2]*scaling[2])); // scaling[0..2] is read unconditionally, so scaling must point at three floats
+	}
+	b3ConvexUtility* utilPtr = new b3ConvexUtility();
+	bool merge = true;
+	if (numVertices) // with zero vertices the utility is registered without initializePolyhedralFeatures having run
+	{
+		utilPtr->initializePolyhedralFeatures(&verts[0],verts.size(),merge);
+	}
+	int collidableIndex = registerConvexHullShape(utilPtr);
+	delete utilPtr; // NOTE(review): registerConvexHullShapeInternal stored this pointer in m_convexData; after this delete that entry dangles — confirm it is never dereferenced
+	return collidableIndex;
+int		b3GpuNarrowPhase::registerConvexHullShape(b3ConvexUtility* utilPtr) // Registers a convex hull collidable from an existing utility object: computes its vertex centroid, copies its data, and records its local AABB. Returns the collidable index, or the negative value from allocateCollidable on failure
+	int collidableIndex = allocateCollidable();
+	if (collidableIndex<0)
+		return collidableIndex;
+	b3Collidable& col = getCollidableCpu(collidableIndex);
+	col.m_shapeType = SHAPE_CONVEX_HULL;
+	col.m_shapeIndex = -1;
+	{
+		b3Vector3 localCenter=b3MakeVector3(0,0,0);
+		for (int i=0;i<utilPtr->m_vertices.size();i++)
+			localCenter+=utilPtr->m_vertices[i];
+		if (utilPtr->m_vertices.size()>0) // an empty hull keeps a zero centre; dividing by a vertex count of zero would make it non-finite
+			localCenter*= (1.f/utilPtr->m_vertices.size());
+		utilPtr->m_localCenter = localCenter; // written back into the caller's object before its data is copied
+		col.m_shapeIndex = registerConvexHullShapeInternal(utilPtr,col);
+	}
+	if (col.m_shapeIndex>=0)
+	{
+		b3SapAabb aabb;
+		b3Vector3 myAabbMin=b3MakeVector3(1e30f,1e30f,1e30f); // start inverted so the first vertex sets both bounds
+		b3Vector3 myAabbMax=b3MakeVector3(-1e30f,-1e30f,-1e30f);
+		for (int i=0;i<utilPtr->m_vertices.size();i++)
+		{
+			myAabbMin.setMin(utilPtr->m_vertices[i]);
+			myAabbMax.setMax(utilPtr->m_vertices[i]);
+		}
+		aabb.m_min[0] = myAabbMin[0];
+		aabb.m_min[1] = myAabbMin[1];
+		aabb.m_min[2] = myAabbMin[2];
+		aabb.m_minIndices[3] = 0;
+		aabb.m_max[0] = myAabbMax[0];
+		aabb.m_max[1] = myAabbMax[1];
+		aabb.m_max[2] = myAabbMax[2];
+		aabb.m_signedMaxIndices[3] = 0;
+		m_data->m_localShapeAABBCPU->push_back(aabb);
+//		m_data->m_localShapeAABBGPU->push_back(aabb);
+	}
+	return collidableIndex;
+int		b3GpuNarrowPhase::registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes)
+	int collidableIndex = allocateCollidable();
+	if (collidableIndex<0)
+		return collidableIndex;
+	b3Collidable& col = getCollidableCpu(collidableIndex);
+	col.m_shapeIndex = m_data->m_cpuChildShapes.size();
+	col.m_compoundBvhIndex = m_data->m_bvhInfoCPU.size();
+	{
+		b3Assert(col.m_shapeIndex+childShapes->size()<m_data->m_config.m_maxCompoundChildShapes);
+		for (int i=0;i<childShapes->size();i++)
+		{
+			m_data->m_cpuChildShapes.push_back(childShapes->at(i));
+		}
+	}
+	col.m_numChildShapes = childShapes->size();
+	b3SapAabb aabbLocalSpace;
+	b3Vector3 myAabbMin=b3MakeVector3(1e30f,1e30f,1e30f);
+	b3Vector3 myAabbMax=b3MakeVector3(-1e30f,-1e30f,-1e30f);
+	b3AlignedObjectArray<b3Aabb> childLocalAabbs;
+	childLocalAabbs.resize(childShapes->size());
+	//compute local AABB of the compound of all children
+	for (int i=0;i<childShapes->size();i++)
+	{
+		int childColIndex = childShapes->at(i).m_shapeIndex;
+		b3Collidable& childCol = getCollidableCpu(childColIndex);
+		b3SapAabb aabbLoc =m_data->m_localShapeAABBCPU->at(childColIndex);
+		b3Vector3 childLocalAabbMin=b3MakeVector3(aabbLoc.m_min[0],aabbLoc.m_min[1],aabbLoc.m_min[2]);
+		b3Vector3 childLocalAabbMax=b3MakeVector3(aabbLoc.m_max[0],aabbLoc.m_max[1],aabbLoc.m_max[2]);
+		b3Vector3 aMin,aMax;
+		b3Scalar margin(0.f);
+		b3Transform childTr;
+		childTr.setIdentity();
+		childTr.setOrigin(childShapes->at(i).m_childPosition);
+		childTr.setRotation(b3Quaternion(childShapes->at(i).m_childOrientation));
+		b3TransformAabb(childLocalAabbMin,childLocalAabbMax,margin,childTr,aMin,aMax);
+		myAabbMin.setMin(aMin);
+		myAabbMax.setMax(aMax);		
+		childLocalAabbs[i].m_min[0] = aMin[0];
+		childLocalAabbs[i].m_min[1] = aMin[1];
+		childLocalAabbs[i].m_min[2] = aMin[2];
+		childLocalAabbs[i].m_min[3] = 0;
+		childLocalAabbs[i].m_max[0] = aMax[0];
+		childLocalAabbs[i].m_max[1] = aMax[1];
+		childLocalAabbs[i].m_max[2] = aMax[2];
+		childLocalAabbs[i].m_max[3] = 0;
+	}
+	aabbLocalSpace.m_min[0] = myAabbMin[0];//s_convexHeightField->m_aabb.m_min.x;
+	aabbLocalSpace.m_min[1]= myAabbMin[1];//s_convexHeightField->m_aabb.m_min.y;
+	aabbLocalSpace.m_min[2]= myAabbMin[2];//s_convexHeightField->m_aabb.m_min.z;
+	aabbLocalSpace.m_minIndices[3] = 0;
+	aabbLocalSpace.m_max[0] = myAabbMax[0];//s_convexHeightField->m_aabb.m_max.x;
+	aabbLocalSpace.m_max[1]= myAabbMax[1];//s_convexHeightField->m_aabb.m_max.y;
+	aabbLocalSpace.m_max[2]= myAabbMax[2];//s_convexHeightField->m_aabb.m_max.z;
+	aabbLocalSpace.m_signedMaxIndices[3] = 0;
+	m_data->m_localShapeAABBCPU->push_back(aabbLocalSpace);
+	b3QuantizedBvh* bvh = new b3QuantizedBvh;
+	bvh->setQuantizationValues(myAabbMin,myAabbMax);
+	QuantizedNodeArray&	nodes = bvh->getLeafNodeArray();
+	int numNodes = childShapes->size();
+	for (int i=0;i<numNodes;i++)
+	{
+		b3QuantizedBvhNode node;
+		b3Vector3 aabbMin,aabbMax;
+		aabbMin = (b3Vector3&) childLocalAabbs[i].m_min;
+		aabbMax = (b3Vector3&) childLocalAabbs[i].m_max;
+		bvh->quantize(&node.m_quantizedAabbMin[0],aabbMin,0);
+		bvh->quantize(&node.m_quantizedAabbMax[0],aabbMax,1);
+		int partId = 0;
+		node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | i;
+		nodes.push_back(node);
+	}
+	bvh->buildInternal();
+	int numSubTrees = bvh->getSubtreeInfoArray().size();
+	//void	setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin=b3Scalar(1.0));
+	//QuantizedNodeArray&	getLeafNodeArray() {			return	m_quantizedLeafNodes;	}
+	///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized
+	//void	buildInternal();
+	b3BvhInfo bvhInfo;
+	bvhInfo.m_aabbMin = bvh->m_bvhAabbMin;
+	bvhInfo.m_aabbMax = bvh->m_bvhAabbMax;
+	bvhInfo.m_quantization = bvh->m_bvhQuantization;
+	bvhInfo.m_numNodes = numNodes;
+	bvhInfo.m_numSubTrees = numSubTrees;
+	bvhInfo.m_nodeOffset = m_data->m_treeNodesCPU.size();
+	bvhInfo.m_subTreeOffset = m_data->m_subTreesCPU.size();
+	int numNewNodes = 		bvh->getQuantizedNodeArray().size();
+	for (int i=0;i<numNewNodes-1;i++)
+	{
+		if (bvh->getQuantizedNodeArray()[i].isLeafNode())
+		{
+			int orgIndex = bvh->getQuantizedNodeArray()[i].getTriangleIndex();
+			b3Vector3 nodeMinVec = bvh->unQuantize(bvh->getQuantizedNodeArray()[i].m_quantizedAabbMin);
+			b3Vector3 nodeMaxVec = bvh->unQuantize(bvh->getQuantizedNodeArray()[i].m_quantizedAabbMax);
+			for (int c=0;c<3;c++)
+			{
+				if (childLocalAabbs[orgIndex].m_min[c] < nodeMinVec[c])
+				{
+					printf("min org (%f) and new (%f) ? at i:%d,c:%d\n",childLocalAabbs[i].m_min[c],nodeMinVec[c],i,c);
+				}
+				if (childLocalAabbs[orgIndex].m_max[c] > nodeMaxVec[c])
+				{
+					printf("max org (%f) and new (%f) ? at i:%d,c:%d\n",childLocalAabbs[i].m_max[c],nodeMaxVec[c],i,c);
+				}
+			}
+		}
+	}
+	m_data->m_bvhInfoCPU.push_back(bvhInfo);
+	int numNewSubtrees = bvh->getSubtreeInfoArray().size();
+	m_data->m_subTreesCPU.reserve(m_data->m_subTreesCPU.size()+numNewSubtrees);
+	for (int i=0;i<numNewSubtrees;i++)
+	{
+		m_data->m_subTreesCPU.push_back(bvh->getSubtreeInfoArray()[i]);
+	}
+	int numNewTreeNodes = bvh->getQuantizedNodeArray().size();
+	for (int i=0;i<numNewTreeNodes;i++)
+	{
+		m_data->m_treeNodesCPU.push_back(bvh->getQuantizedNodeArray()[i]);
+	}
+//	m_data->m_localShapeAABBGPU->push_back(aabbWS);
+	clFinish(m_queue);
+	return collidableIndex;
+int		b3GpuNarrowPhase::registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling1) // Registers a concave triangle mesh: allocates a collidable, stores its scaled local AABB, builds a quantized BVH and appends the BVH data to the CPU-side arrays. Returns the collidable index (negative if allocation failed).
+	b3Vector3 scaling=b3MakeVector3(scaling1[0],scaling1[1],scaling1[2]);
+	int collidableIndex = allocateCollidable();
+	if (collidableIndex<0)
+		return collidableIndex; // allocation failed: pass the negative index straight back
+	b3Collidable& col = getCollidableCpu(collidableIndex);
+	col.m_shapeType = SHAPE_CONCAVE_TRIMESH;
+	col.m_shapeIndex = registerConcaveMeshShape(vertices,indices,col,scaling);
+	col.m_bvhIndex = m_data->m_bvhInfoCPU.size(); // slot the b3BvhInfo pushed further down will occupy
+	b3SapAabb aabb;
+	b3Vector3 myAabbMin=b3MakeVector3(1e30f,1e30f,1e30f); // start inverted so the first vertex sets both bounds
+	b3Vector3 myAabbMax=b3MakeVector3(-1e30f,-1e30f,-1e30f);
+	for (int i=0;i<vertices->size();i++)
+	{
+		b3Vector3 vtx(vertices->at(i)*scaling);
+		myAabbMin.setMin(vtx);
+		myAabbMax.setMax(vtx);
+	}
+	aabb.m_min[0] = myAabbMin[0];
+	aabb.m_min[1] = myAabbMin[1];
+	aabb.m_min[2] = myAabbMin[2];
+	aabb.m_minIndices[3] = 0;
+	aabb.m_max[0] = myAabbMax[0];
+	aabb.m_max[1]= myAabbMax[1];
+	aabb.m_max[2]= myAabbMax[2];
+	aabb.m_signedMaxIndices[3]= 0;
+	m_data->m_localShapeAABBCPU->push_back(aabb); // local-space AABB of the scaled mesh
+//	m_data->m_localShapeAABBGPU->push_back(aabb);
+	b3OptimizedBvh* bvh = new b3OptimizedBvh(); // stored in m_data->m_bvhData below
+	//void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax)
+	bool useQuantizedAabbCompression = true;
+	b3TriangleIndexVertexArray* meshInterface=new b3TriangleIndexVertexArray();
+	m_data->m_meshInterfaces.push_back(meshInterface);
+	b3IndexedMesh mesh;
+	mesh.m_numTriangles = indices->size()/3;
+	mesh.m_numVertices = vertices->size();
+	mesh.m_vertexBase = (const unsigned char *)&vertices->at(0).x; // NOTE(review): points at the caller's unscaled vertices while the AABB above is scaled, and indexes element 0 even for an empty array - confirm both are intended
+	mesh.m_vertexStride = sizeof(b3Vector3);
+	mesh.m_triangleIndexStride = 3 * sizeof(int);// or sizeof(int)
+	mesh.m_triangleIndexBase = (const unsigned char *)&indices->at(0);
+	meshInterface->addIndexedMesh(mesh);
+	bvh->build(meshInterface, useQuantizedAabbCompression, (b3Vector3&)aabb.m_min, (b3Vector3&)aabb.m_max);
+	m_data->m_bvhData.push_back(bvh);
+	int numNodes = bvh->getQuantizedNodeArray().size();
+	//b3OpenCLArray<b3QuantizedBvhNode>*	treeNodesGPU = new b3OpenCLArray<b3QuantizedBvhNode>(this->m_context,this->m_queue,numNodes);
+	int numSubTrees = bvh->getSubtreeInfoArray().size();
+	b3BvhInfo bvhInfo;
+	bvhInfo.m_aabbMin = bvh->m_bvhAabbMin;
+	bvhInfo.m_aabbMax = bvh->m_bvhAabbMax;
+	bvhInfo.m_quantization = bvh->m_bvhQuantization;
+	bvhInfo.m_numNodes = numNodes;
+	bvhInfo.m_numSubTrees = numSubTrees;
+	bvhInfo.m_nodeOffset = m_data->m_treeNodesCPU.size(); // offsets taken before this BVH's nodes/subtrees are appended
+	bvhInfo.m_subTreeOffset = m_data->m_subTreesCPU.size();
+	m_data->m_bvhInfoCPU.push_back(bvhInfo);
+	int numNewSubtrees = bvh->getSubtreeInfoArray().size();
+	m_data->m_subTreesCPU.reserve(m_data->m_subTreesCPU.size()+numNewSubtrees);
+	for (int i=0;i<numNewSubtrees;i++)
+	{
+		m_data->m_subTreesCPU.push_back(bvh->getSubtreeInfoArray()[i]);
+	}
+	int numNewTreeNodes = bvh->getQuantizedNodeArray().size();
+	for (int i=0;i<numNewTreeNodes;i++)
+	{
+		m_data->m_treeNodesCPU.push_back(bvh->getQuantizedNodeArray()[i]);
+	}
+	return collidableIndex;
+// Appends a triangle mesh to the shared CPU-side shape arrays (polyhedra, faces, indices, vertices)
+// as one b3ConvexPolyhedronData entry with one 3-index face per triangle.
+// vertices/indices: mesh data, three indices per triangle; scaling1: three floats applied per vertex.
+// col is not used by this function. Returns the shape index assigned to the mesh.
+int b3GpuNarrowPhase::registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,b3Collidable& col, const float* scaling1)
+	b3Vector3 scaling=b3MakeVector3(scaling1[0],scaling1[1],scaling1[2]);
+	m_data->m_convexData->resize(m_data->m_numAcceleratedShapes+1);
+	m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes+1);
+	b3ConvexPolyhedronData& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1);
+	convex.mC = b3MakeVector3(0,0,0);
+	convex.mE = b3MakeVector3(0,0,0);
+	convex.m_extents= b3MakeVector3(0,0,0);
+	convex.m_localCenter = b3MakeVector3(0,0,0);
+	convex.m_radius = 0.f;
+	convex.m_numUniqueEdges = 0;
+	int edgeOffset = m_data->m_uniqueEdges.size();
+	convex.m_uniqueEdgesOffset = edgeOffset;
+	int faceOffset = m_data->m_convexFaces.size();
+	convex.m_faceOffset = faceOffset;
+	convex.m_numFaces = indices->size()/3;
+	m_data->m_convexFaces.resize(faceOffset+convex.m_numFaces);
+	// Reserve room for the new indices on top of those already stored by earlier shapes;
+	// reserving only numFaces*3 has no effect once the array is larger than that.
+	m_data->m_convexIndices.reserve(m_data->m_convexIndices.size()+convex.m_numFaces*3);
+	for (int i=0;i<convex.m_numFaces;i++)
+	{
+		b3Vector3 vert0(vertices->at(indices->at(i*3))*scaling);
+		b3Vector3 vert1(vertices->at(indices->at(i*3+1))*scaling);
+		b3Vector3 vert2(vertices->at(indices->at(i*3+2))*scaling);
+		// Face plane: unit normal from the scaled triangle edges, constant chosen so that vert0 lies on the plane.
+		b3Vector3 normal = ((vert1-vert0).cross(vert2-vert0)).normalize();
+		b3Scalar c = -(normal.dot(vert0));
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane = b3MakeVector4(normal.x,normal.y,normal.z,c);
+		int indexOffset = m_data->m_convexIndices.size();
+		int numIndices = 3;
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_numIndices = numIndices;
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_indexOffset = indexOffset;
+		m_data->m_convexIndices.resize(indexOffset+numIndices);
+		for (int p=0;p<numIndices;p++)
+		{
+			int vi = indices->at(i*3+p);
+			m_data->m_convexIndices[indexOffset+p] = vi;//convexPtr->m_faces[i].m_indices[p];
+		}
+	}
+	convex.m_numVertices = vertices->size();
+	int vertexOffset = m_data->m_convexVertices.size();
+	convex.m_vertexOffset =vertexOffset;
+	m_data->m_convexVertices.resize(vertexOffset+convex.m_numVertices);
+	for (int i=0;i<vertices->size();i++)
+	{
+		m_data->m_convexVertices[vertexOffset+i] = vertices->at(i)*scaling;
+	}
+	// No b3ConvexUtility is associated with a concave mesh, so its m_convexData slot is null.
+	(*m_data->m_convexData)[m_data->m_numAcceleratedShapes] = 0;
+	return m_data->m_numAcceleratedShapes++;
+cl_mem	b3GpuNarrowPhase::getBodiesGpu() // Returns the OpenCL buffer handle of the GPU rigid-body array.
+	return (cl_mem)m_data->m_bodyBufferGPU->getBufferCL();
+const struct b3RigidBodyData* b3GpuNarrowPhase::getBodiesCpu() const // Returns a pointer to the first host-side rigid body.
+	return &m_data->m_bodyBufferCPU->at(0); // NOTE(review): unlike getCollidablesCpu there is no empty check before at(0) - confirm callers only use this once a body exists
+int	b3GpuNarrowPhase::getNumBodiesGpu() const // Returns the current element count of the GPU rigid-body array.
+	return m_data->m_bodyBufferGPU->size();
+cl_mem	b3GpuNarrowPhase::getBodyInertiasGpu() // Returns the OpenCL buffer handle of the GPU inertia array.
+	return (cl_mem)m_data->m_inertiaBufferGPU->getBufferCL();
+int	b3GpuNarrowPhase::getNumBodyInertiasGpu() const // Returns the current element count of the GPU inertia array.
+	return m_data->m_inertiaBufferGPU->size();
+b3Collidable& b3GpuNarrowPhase::getCollidableCpu(int collidableIndex) // Mutable access to one host-side collidable; the index is not range-checked here.
+	return m_data->m_collidablesCPU[collidableIndex];
+const b3Collidable& b3GpuNarrowPhase::getCollidableCpu(int collidableIndex) const // Read-only access to one host-side collidable; the index is not range-checked here.
+	return m_data->m_collidablesCPU[collidableIndex];
+cl_mem b3GpuNarrowPhase::getCollidablesGpu() // Returns the OpenCL buffer handle of the GPU collidable array.
+	return m_data->m_collidablesGPU->getBufferCL();
+const struct b3Collidable* b3GpuNarrowPhase::getCollidablesCpu() const
+	// Start of the host-side collidable array, or null while the array is still empty.
+	return m_data->m_collidablesCPU.size() ? &m_data->m_collidablesCPU[0] : 0;
+const struct b3SapAabb* b3GpuNarrowPhase::getLocalSpaceAabbsCpu() const
+	// No local-space AABB stored yet: report null instead of indexing element 0.
+	if (m_data->m_localShapeAABBCPU->size() == 0)
+		return 0;
+	return &m_data->m_localShapeAABBCPU->at(0);
+cl_mem	b3GpuNarrowPhase::getAabbLocalSpaceBufferGpu() // Returns the OpenCL buffer handle of the GPU local-space AABB array.
+	return m_data->m_localShapeAABBGPU->getBufferCL();
+int	b3GpuNarrowPhase::getNumCollidablesGpu() const // Returns the current element count of the GPU collidable array.
+	return m_data->m_collidablesGPU->size();
+int	b3GpuNarrowPhase::getNumContactsGpu() const // Returns the element count of the contact buffer currently selected by m_currentContactBuffer.
+	return m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer]->size();
+cl_mem b3GpuNarrowPhase::getContactsGpu() // Returns the OpenCL buffer handle of the contact buffer currently selected by m_currentContactBuffer.
+	return m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer]->getBufferCL();
+const b3Contact4* b3GpuNarrowPhase::getContactsCPU() const // Copies the current GPU contact buffer into m_pBufContactOutCPU and returns a pointer to its first element.
+	m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer]->copyToHost(*m_data->m_pBufContactOutCPU); // device-to-host copy on every call
+	return &m_data->m_pBufContactOutCPU->at(0); // NOTE(review): at(0) is taken even if no contacts were copied - confirm callers check getNumContactsGpu() first
+void b3GpuNarrowPhase::computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects) // Flips the active contact buffer, then hands the broadphase pairs and all shape/BVH buffers to GpuSatCollision::computeConvexConvexContactsGPUSAT.
+	cl_mem aabbsLocalSpace = m_data->m_localShapeAABBGPU->getBufferCL();
+	int nContactOut = 0;
+	//swap buffer
+	m_data->m_currentContactBuffer=1-m_data->m_currentContactBuffer; // toggles between buffer 0 and buffer 1
+	int curSize = m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer]->size(); // NOTE(review): curSize is not read anywhere below
+	int maxTriConvexPairCapacity = m_data->m_config.m_maxTriConvexPairCapacity;
+	int numTriConvexPairsOut=0;
+	b3OpenCLArray<b3Int4> broadphasePairsGPU(m_context,m_queue);
+	broadphasePairsGPU.setFromOpenCLBuffer(broadphasePairs,numBroadphasePairs); // presumably wraps the caller's cl_mem without copying - confirm
+	b3OpenCLArray<b3Aabb> clAabbArrayWorldSpace(this->m_context,this->m_queue);
+	clAabbArrayWorldSpace.setFromOpenCLBuffer(aabbsWorldSpace,numObjects);
+	b3OpenCLArray<b3Aabb> clAabbArrayLocalSpace(this->m_context,this->m_queue);
+	clAabbArrayLocalSpace.setFromOpenCLBuffer(aabbsLocalSpace,numObjects); // NOTE(review): sized with numObjects although local AABBs are indexed per collidable (see getLocalSpaceAabb) - confirm the count
+	m_data->m_gpuSatCollision->computeConvexConvexContactsGPUSAT(
+		&broadphasePairsGPU, numBroadphasePairs,
+		m_data->m_bodyBufferGPU,
+		m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer], // buffer selected by the flip above
+		nContactOut,
+		m_data->m_pBufContactBuffersGPU[1-m_data->m_currentContactBuffer], // the other buffer, i.e. the one that was current before the flip
+		m_data->m_config.m_maxContactCapacity,
+		m_data->m_config.m_compoundPairCapacity,
+		*m_data->m_convexPolyhedraGPU,
+		*m_data->m_convexVerticesGPU,
+		*m_data->m_uniqueEdgesGPU,
+		*m_data->m_convexFacesGPU,
+		*m_data->m_convexIndicesGPU,
+		*m_data->m_collidablesGPU,
+		*m_data->m_gpuChildShapes,
+		clAabbArrayWorldSpace,
+		clAabbArrayLocalSpace,
+		*m_data->m_worldVertsB1GPU,
+		*m_data->m_clippingFacesOutGPU,
+		*m_data->m_worldNormalsAGPU,
+		*m_data->m_worldVertsA1GPU,
+		*m_data->m_worldVertsB2GPU,
+		m_data->m_bvhData,
+		m_data->m_treeNodesGPU,
+		m_data->m_subTreesGPU,
+		m_data->m_bvhInfoGPU,
+		numObjects,
+		maxTriConvexPairCapacity,
+		*m_data->m_triangleConvexPairs,
+		numTriConvexPairsOut
+		);
+	/*b3AlignedObjectArray<b3Int4> broadphasePairsCPU;
+	broadphasePairsGPU.copyToHost(broadphasePairsCPU);
+	printf("checking pairs\n");
+	*/
+const b3SapAabb& b3GpuNarrowPhase::getLocalSpaceAabb(int collidableIndex) const // Returns the host-side local-space AABB stored at collidableIndex.
+	return m_data->m_localShapeAABBCPU->at(collidableIndex);
+int b3GpuNarrowPhase::registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation , const float* aabbMinPtr, const float* aabbMaxPtr,bool writeToGpu) // Adds a rigid body and its inertia to the CPU arrays (and to the GPU arrays when writeToGpu is set). Returns the new body index, or -1 once m_maxConvexBodies is reached.
+	b3Vector3 aabbMin=b3MakeVector3(aabbMinPtr[0],aabbMinPtr[1],aabbMinPtr[2]);
+	b3Vector3 aabbMax=b3MakeVector3(aabbMaxPtr[0],aabbMaxPtr[1],aabbMaxPtr[2]);
+	if (m_data->m_numAcceleratedRigidBodies >= (m_data->m_config.m_maxConvexBodies))
+	{
+		b3Error("registerRigidBody: exceeding the number of rigid bodies, %d > %d \n",m_data->m_numAcceleratedRigidBodies,m_data->m_config.m_maxConvexBodies);
+		return -1;
+	}
+	m_data->m_bodyBufferCPU->resize(m_data->m_numAcceleratedRigidBodies+1);
+	b3RigidBodyData& body = m_data->m_bodyBufferCPU->at(m_data->m_numAcceleratedRigidBodies);
+	float friction = 1.f;
+	float restitution = 0.f;
+	body.m_frictionCoeff = friction;
+	body.m_restituitionCoeff = restitution;
+	body.m_angVel = b3MakeVector3(0,0,0);
+	body.m_linVel=b3MakeVector3(0,0,0);//.setZero();
+	body.m_pos =b3MakeVector3(position[0],position[1],position[2]);
+	body.m_quat.setValue(orientation[0],orientation[1],orientation[2],orientation[3]);
+	body.m_collidableIdx = collidableIndex;
+	if (collidableIndex>=0)
+	{
+//		body.m_shapeType = m_data->m_collidablesCPU.at(collidableIndex).m_shapeType;
+	} else
+	{
+	//	body.m_shapeType = CollisionShape::SHAPE_PLANE;
+		m_planeBodyIndex = m_data->m_numAcceleratedRigidBodies; // a negative collidable index marks this body as the plane body
+	}
+	//body.m_shapeType = shapeType;
+	body.m_invMass = mass? 1.f/mass : 0.f; // zero mass gives zero inverse mass
+	if (writeToGpu)
+	{
+		m_data->m_bodyBufferGPU->copyFromHostPointer(&body,1,m_data->m_numAcceleratedRigidBodies);
+	}
+	b3InertiaData& shapeInfo = m_data->m_inertiaBufferCPU->at(m_data->m_numAcceleratedRigidBodies); // NOTE(review): m_inertiaBufferCPU is not resized here, presumably pre-sized elsewhere - confirm
+	if (mass==0.f)
+	{
+		if (m_data->m_numAcceleratedRigidBodies==0)
+			m_static0Index = 0; // body 0 has zero mass: remember it as the static body at index 0
+		shapeInfo.m_initInvInertia.setValue(0,0,0,0,0,0,0,0,0);
+		shapeInfo.m_invInertiaWorld.setValue(0,0,0,0,0,0,0,0,0);
+	} else
+	{
+		b3Assert(body.m_collidableIdx>=0);
+		//approximate using the aabb of the shape
+		//Aabb aabb = (*m_data->m_shapePointers)[shapeIndex]->m_aabb;
+		b3Vector3 halfExtents = (aabbMax-aabbMin);//*0.5f;//fake larger inertia makes demos more stable ;-)
+		b3Vector3 localInertia;
+		float lx=2.f*halfExtents[0]; // halfExtents holds the full AABB size (the *0.5f is commented out), so lx/ly/lz are twice the AABB size
+		float ly=2.f*halfExtents[1];
+		float lz=2.f*halfExtents[2];
+		localInertia.setValue( (mass/12.0f) * (ly*ly + lz*lz), // solid-box inertia formula with side lengths lx, ly, lz
+                                   (mass/12.0f) * (lx*lx + lz*lz),
+                                   (mass/12.0f) * (lx*lx + ly*ly));
+		b3Vector3 invLocalInertia;
+		invLocalInertia[0] = 1.f/localInertia[0]; // NOTE(review): divides by zero if a component is 0 (two zero-size AABB axes) - confirm inputs rule this out
+		invLocalInertia[1] = 1.f/localInertia[1];
+		invLocalInertia[2] = 1.f/localInertia[2];
+		invLocalInertia[3] = 0.f;
+		shapeInfo.m_initInvInertia.setValue(
+			invLocalInertia[0],		0,						0,
+			0,						invLocalInertia[1],		0,
+			0,						0,						invLocalInertia[2]);
+		b3Matrix3x3 m (body.m_quat);
+		shapeInfo.m_invInertiaWorld = m.scaled(invLocalInertia) * m.transpose(); // inverse inertia expressed using the body's orientation matrix
+	}
+	if (writeToGpu)
+		m_data->m_inertiaBufferGPU->copyFromHostPointer(&shapeInfo,1,m_data->m_numAcceleratedRigidBodies);
+	return m_data->m_numAcceleratedRigidBodies++; // post-increment: returns the index that was just filled
+int b3GpuNarrowPhase::getNumRigidBodies() const // Returns the number of rigid bodies registered so far (the m_numAcceleratedRigidBodies counter).
+	return m_data->m_numAcceleratedRigidBodies;
+void	b3GpuNarrowPhase::writeAllBodiesToGpu() // Uploads every CPU-side array (AABBs, shapes, BVH data, bodies, inertias, collidables) to its GPU counterpart.
+	if (m_data->m_localShapeAABBCPU->size())
+	{
+		m_data->m_localShapeAABBGPU->copyFromHost(*m_data->m_localShapeAABBCPU);
+	}
+	m_data->m_gpuChildShapes->copyFromHost(m_data->m_cpuChildShapes);
+	m_data->m_convexFacesGPU->copyFromHost(m_data->m_convexFaces);
+	m_data->m_convexPolyhedraGPU->copyFromHost(m_data->m_convexPolyhedra);
+	m_data->m_uniqueEdgesGPU->copyFromHost(m_data->m_uniqueEdges);
+	m_data->m_convexVerticesGPU->copyFromHost(m_data->m_convexVertices);
+	m_data->m_convexIndicesGPU->copyFromHost(m_data->m_convexIndices);
+	m_data->m_bvhInfoGPU->copyFromHost(m_data->m_bvhInfoCPU);
+	m_data->m_treeNodesGPU->copyFromHost(m_data->m_treeNodesCPU);
+	m_data->m_subTreesGPU->copyFromHost(m_data->m_subTreesCPU);
+	m_data->m_bodyBufferGPU->resize(m_data->m_numAcceleratedRigidBodies); // GPU body/inertia arrays are sized to the registered body count before the copy
+	m_data->m_inertiaBufferGPU->resize(m_data->m_numAcceleratedRigidBodies);
+	if (m_data->m_numAcceleratedRigidBodies) // skip the copies when no body is registered
+	{
+		m_data->m_bodyBufferGPU->copyFromHostPointer(&m_data->m_bodyBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies);
+		m_data->m_inertiaBufferGPU->copyFromHostPointer(&m_data->m_inertiaBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies);
+	}
+    if (m_data->m_collidablesCPU.size())
+	{
+		m_data->m_collidablesGPU->copyFromHost(m_data->m_collidablesCPU);
+	}
+void	b3GpuNarrowPhase::reset() // Clears the shape/body counters and empties the CPU-side shape, collidable, AABB and BVH arrays; GPU arrays are not touched here.
+	m_data->m_numAcceleratedShapes = 0;
+	m_data->m_numAcceleratedRigidBodies = 0;
+	this->m_static0Index = -1; // NOTE(review): m_planeBodyIndex is not reset here - confirm that is intended
+	m_data->m_uniqueEdges.resize(0);
+	m_data->m_convexVertices.resize(0);
+	m_data->m_convexPolyhedra.resize(0);
+	m_data->m_convexIndices.resize(0);
+	m_data->m_cpuChildShapes.resize(0);
+	m_data->m_convexFaces.resize(0);
+	m_data->m_collidablesCPU.resize(0);
+	m_data->m_localShapeAABBCPU->resize(0);
+	m_data->m_bvhData.resize(0); // NOTE(review): drops the b3OptimizedBvh pointers allocated in registerConcaveMesh without deleting them here, and m_meshInterfaces is left as is - confirm ownership
+	m_data->m_treeNodesCPU.resize(0);
+	m_data->m_subTreesCPU.resize(0);
+	m_data->m_bvhInfoCPU.resize(0);
+// Copies the first m_numAcceleratedRigidBodies entries of the GPU body array back into the host body array.
+void	b3GpuNarrowPhase::readbackAllBodiesToCpu()
+	// With no registered bodies there is nothing to read; skipping avoids indexing element 0 and a
+	// zero-element device read. writeAllBodiesToGpu applies the same guard to its uploads.
+	if (m_data->m_numAcceleratedRigidBodies)
+	{
+		m_data->m_bodyBufferGPU->copyToHostPointer(&m_data->m_bodyBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies);
+	}
+// Overwrites the host-side position (3 floats) and orientation quaternion (4 floats: x,y,z,w) of one body.
+// Only the CPU array is modified. An out-of-range bodyIndex is reported with a warning and ignored.
+void b3GpuNarrowPhase::setObjectTransformCpu(float* position, float* orientation , int bodyIndex)
+	if (bodyIndex>=0 && bodyIndex<m_data->m_bodyBufferCPU->size())
+	{
+		m_data->m_bodyBufferCPU->at(bodyIndex).m_pos=b3MakeVector3(position[0],position[1],position[2]);
+		m_data->m_bodyBufferCPU->at(bodyIndex).m_quat.setValue(orientation[0],orientation[1],orientation[2],orientation[3]);
+	}
+	else
+	{
+		// The message used to name setObjectVelocityCpu (copy-paste slip), which pointed at the wrong function.
+		b3Warning("setObjectTransformCpu out of range.\n");
+	}
+void b3GpuNarrowPhase::setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex)
+	// Reject indices outside the host-side body array before touching it.
+	if (bodyIndex<0 || bodyIndex>=m_data->m_bodyBufferCPU->size())
+	{
+		b3Warning("setObjectVelocityCpu out of range.\n");
+		return;
+	}
+	// Only the CPU copy of the body is updated: linear velocity first, then angular velocity.
+	b3RigidBodyData& target = m_data->m_bodyBufferCPU->at(bodyIndex);
+	target.m_linVel=b3MakeVector3(linVel[0],linVel[1],linVel[2]);
+	target.m_angVel=b3MakeVector3(angVel[0],angVel[1],angVel[2]);
+bool b3GpuNarrowPhase::getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const
+	// Out-of-range index: warn and report failure without writing to the outputs.
+	if (bodyIndex<0 || bodyIndex>=m_data->m_bodyBufferCPU->size())
+	{
+		b3Warning("getObjectTransformFromCpu out of range.\n");
+		return false;
+	}
+	const b3RigidBodyData& src = m_data->m_bodyBufferCPU->at(bodyIndex);
+	// position receives four floats (x, y, z and a trailing 1); orientation receives x, y, z, w.
+	position[0] = src.m_pos.x;
+	position[1] = src.m_pos.y;
+	position[2] = src.m_pos.z;
+	position[3] = 1.f;//or 1
+	orientation[0] = src.m_quat.x;
+	orientation[1] = src.m_quat.y;
+	orientation[2] = src.m_quat.z;
+	orientation[3] = src.m_quat.w;
+	return true;
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h
new file mode 100644
index 00000000..05ff3fd0
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h
@@ -0,0 +1,109 @@
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Vector3.h"
+class b3GpuNarrowPhase // Owns the CPU/GPU shape, collidable and rigid-body arrays (via m_data) and drives GPU contact generation.
+	struct b3GpuNarrowPhaseInternalData*	m_data; // all CPU/GPU arrays live behind this pointer
+	int m_acceleratedCompanionShapeIndex;
+	int m_planeBodyIndex; // set by registerRigidBody when a body is registered with a negative collidable index
+	int	m_static0Index; // set to 0 when body 0 is registered with zero mass; reset() sets it to -1
+	cl_context m_context;
+	cl_device_id m_device;
+	cl_command_queue m_queue;
+	int registerConvexHullShapeInternal(class b3ConvexUtility* convexPtr, b3Collidable& col);
+	int registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling); // returns the shape index assigned to the mesh
+	b3GpuNarrowPhase(cl_context vtx, cl_device_id dev, cl_command_queue q, const struct b3Config& config);
+	virtual ~b3GpuNarrowPhase(void);
+	int		registerSphereShape(float radius);
+	int		registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
+	int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
+	int registerFace(const b3Vector3& faceNormal, float faceConstant);
+	int	registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling); // returns the collidable index, negative if allocation failed
+	//do they need to be merged?
+	int	registerConvexHullShape(b3ConvexUtility* utilPtr);
+	int	registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
+	int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax,bool writeToGpu); // returns the new body index, or -1 when the body limit is reached
+	void setObjectTransform(const float* position, const float* orientation , int bodyIndex);
+	void	writeAllBodiesToGpu(); // uploads all CPU-side arrays to their GPU counterparts
+	void  reset(); // clears counters and CPU-side arrays
+	void	readbackAllBodiesToCpu(); // copies the GPU body array back to the host array
+	bool	getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const; // writes 4 floats to each output; returns false if bodyIndex is out of range
+	void setObjectTransformCpu(float* position, float* orientation , int bodyIndex); // host-side only
+	void setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex); // host-side only
+	virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects);
+	cl_mem	getBodiesGpu();
+	const struct b3RigidBodyData* getBodiesCpu() const;
+	//struct b3RigidBodyData* getBodiesCpu();
+	int	getNumBodiesGpu() const;
+	cl_mem	getBodyInertiasGpu();
+	int	getNumBodyInertiasGpu() const;
+	cl_mem	getCollidablesGpu();
+	const struct b3Collidable* getCollidablesCpu() const; // null when no collidable exists
+	int		getNumCollidablesGpu() const;
+	const struct b3SapAabb* getLocalSpaceAabbsCpu() const; // null when no local AABB exists
+	const struct b3Contact4* getContactsCPU() const; // copies the current GPU contact buffer to the host on every call
+	cl_mem	getContactsGpu();
+	int	getNumContactsGpu() const;
+	cl_mem	getAabbLocalSpaceBufferGpu();
+	int getNumRigidBodies() const;
+	int allocateCollidable();
+	int getStatic0Index() const
+	{
+		return m_static0Index;
+	}
+	b3Collidable& getCollidableCpu(int collidableIndex);
+	const b3Collidable& getCollidableCpu(int collidableIndex) const;
+	const b3GpuNarrowPhaseInternalData*	getInternalData() const // direct read-only access to the internal data block
+	{
+			return m_data;
+	}
+	b3GpuNarrowPhaseInternalData*	getInternalData() // direct mutable access to the internal data block
+	{
+			return m_data;
+	}
+	const struct b3SapAabb& getLocalSpaceAabb(int collidableIndex) const;
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h
new file mode 100644
index 00000000..8a7f1ea8
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h
@@ -0,0 +1,95 @@
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
+#include "Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h"
+#include "Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h"
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3Common/shared/b3Int2.h"
+class b3ConvexUtility;
+struct b3GpuNarrowPhaseInternalData // Plain data block holding the narrow phase's CPU arrays and their GPU (b3OpenCLArray) counterparts.
+	b3AlignedObjectArray<b3ConvexUtility*>* m_convexData; // one slot per shape index; registerConcaveMeshShape stores 0 for concave meshes
+	b3AlignedObjectArray<b3ConvexPolyhedronData> m_convexPolyhedra; // per-shape records holding offsets into the face/index/vertex/edge arrays below
+	b3AlignedObjectArray<b3Vector3> m_uniqueEdges;
+	b3AlignedObjectArray<b3Vector3> m_convexVertices;
+	b3AlignedObjectArray<int> m_convexIndices;
+	b3OpenCLArray<b3ConvexPolyhedronData>* m_convexPolyhedraGPU;
+	b3OpenCLArray<b3Vector3>* m_uniqueEdgesGPU;
+	b3OpenCLArray<b3Vector3>* m_convexVerticesGPU;
+	b3OpenCLArray<int>* m_convexIndicesGPU;
+    b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU;
+    b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU;
+    b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU;
+    b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU;
+    b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU;
+	b3AlignedObjectArray<b3GpuChildShape> m_cpuChildShapes;
+	b3OpenCLArray<b3GpuChildShape>*	m_gpuChildShapes;
+	b3AlignedObjectArray<b3GpuFace> m_convexFaces;
+	b3OpenCLArray<b3GpuFace>* m_convexFacesGPU;
+	struct GpuSatCollision*	m_gpuSatCollision; // performs the contact computation in b3GpuNarrowPhase::computeContacts
+	b3OpenCLArray<b3Int4>*			m_triangleConvexPairs;
+	b3OpenCLArray<b3Contact4>* m_pBufContactBuffersGPU[2]; // two contact buffers; m_currentContactBuffer selects the active one
+	int	m_currentContactBuffer; // toggled between 0 and 1 at the start of computeContacts
+	b3AlignedObjectArray<b3Contact4>* m_pBufContactOutCPU; // host copy filled by getContactsCPU
+	b3AlignedObjectArray<b3RigidBodyData>* m_bodyBufferCPU;
+	b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;
+	b3AlignedObjectArray<b3InertiaData>*	m_inertiaBufferCPU;
+	b3OpenCLArray<b3InertiaData>*	m_inertiaBufferGPU;
+	int m_numAcceleratedShapes; // next free shape index; incremented when a shape is registered
+	int m_numAcceleratedRigidBodies; // next free body index; incremented by registerRigidBody
+	b3AlignedObjectArray<b3Collidable>	m_collidablesCPU;
+	b3OpenCLArray<b3Collidable>*	m_collidablesGPU;
+	b3OpenCLArray<b3SapAabb>* m_localShapeAABBGPU;
+	b3AlignedObjectArray<b3SapAabb>* m_localShapeAABBCPU; // local-space AABBs, read back by collidable index in getLocalSpaceAabb
+	b3AlignedObjectArray<class b3OptimizedBvh*> m_bvhData; // BVHs allocated with new in registerConcaveMesh
+	b3AlignedObjectArray<class b3TriangleIndexVertexArray*> m_meshInterfaces; // mesh interfaces allocated with new in registerConcaveMesh
+	b3AlignedObjectArray<b3QuantizedBvhNode>	m_treeNodesCPU; // nodes of all BVHs back to back; b3BvhInfo::m_nodeOffset locates each BVH
+	b3AlignedObjectArray<b3BvhSubtreeInfo>	m_subTreesCPU; // subtree headers of all BVHs back to back; b3BvhInfo::m_subTreeOffset locates each BVH
+	b3AlignedObjectArray<b3BvhInfo>	m_bvhInfoCPU; // one record per registered BVH
+	b3OpenCLArray<b3BvhInfo>*			m_bvhInfoGPU;
+	b3OpenCLArray<b3QuantizedBvhNode>*	m_treeNodesGPU;
+	b3OpenCLArray<b3BvhSubtreeInfo>*	m_subTreesGPU;
+	b3Config	m_config; // capacity limits such as m_maxConvexBodies and m_maxContactCapacity are read from here
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp
new file mode 100644
index 00000000..4d14bc42
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp
@@ -0,0 +1,1158 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+bool useGpuInitSolverBodies = true; // true: solveGroupCacheFriendlySetup launches m_initSolverBodiesKernel; false: bodies are copied to the host and initSolverBody() runs per body
+bool useGpuInfo1 = true; // true: per-constraint row counts come from m_getInfo1Kernel; false: getInfo1() is called on the host for each enabled constraint
+bool useGpuInfo2= true; // true: constraint rows are filled by m_getInfo2Kernel; false: rows are built on the host and uploaded
+bool useGpuSolveJointConstraintRows=true; // true: each batch is solved by the solveJointConstraintRows kernel; false: resolveSingleConstraintRowGeneric2() runs on the host
+bool useGpuWriteBackVelocities = true; // NOTE(review): not referenced by the setup/iteration code in this part of the file; presumably selects the write-back-velocities kernel - confirm at its use site
+bool gpuBreakConstraints = true; // NOTE(review): not referenced by the setup/iteration code in this part of the file; presumably selects the break-violated-constraints kernel - confirm at its use site
+#include "b3GpuPgsConstraintSolver.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
+#include <new>
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include <string.h> //for memset
+#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
+#include "Bullet3OpenCL/RigidBody/kernels/jointSolver.h" //solveConstraintRowsCL
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#define B3_JOINT_SOLVER_PATH "src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl"
+struct b3GpuPgsJacobiSolverInternalData // Private state of b3GpuPgsConstraintSolver: OpenCL handles, kernels, device buffers and host-side mirrors
+	cl_context m_context; // stored from the constructor argument; not released by the solver's destructor
+	cl_device_id m_device; // stored from the constructor argument
+	cl_command_queue m_queue; // stored from the constructor argument; used for every kernel launch and clFinish
+	b3PrefixScanCL*	m_prefixScan; // new'ed in the constructor, deleted in the destructor; turns row counts into row offsets
+	cl_kernel m_solveJointConstraintRowsKernels; // "solveJointConstraintRows"
+	cl_kernel m_initSolverBodiesKernel; // "initSolverBodies"
+	cl_kernel m_getInfo1Kernel; // "getInfo1Kernel"
+	cl_kernel m_initBatchConstraintsKernel; // "initBatchConstraintsKernel"
+	cl_kernel m_getInfo2Kernel; // "getInfo2Kernel"
+	cl_kernel m_writeBackVelocitiesKernel; // "writeBackVelocitiesKernel"
+	cl_kernel m_breakViolatedConstraintsKernel; // "breakViolatedConstraintsKernel"
+	b3OpenCLArray<unsigned int>*	m_gpuConstraintRowOffsets; // output of the prefix scan over m_gpuConstraintInfo1
+	b3OpenCLArray<b3GpuSolverBody>*			m_gpuSolverBodies; // resized to numBodies in solveGroupCacheFriendlySetup
+	b3OpenCLArray<b3BatchConstraint>*		m_gpuBatchConstraints; // resized to numConstraints in solveGroupCacheFriendlySetup
+	b3OpenCLArray<b3GpuSolverConstraint>*		m_gpuConstraintRows; // resized to totalNumRows in solveGroupCacheFriendlySetup
+	b3OpenCLArray<unsigned int>*			m_gpuConstraintInfo1; // per-constraint row count (written by getInfo1Kernel or uploaded from the host)
+//	b3AlignedObjectArray<b3GpuSolverBody>		m_cpuSolverBodies;
+	b3AlignedObjectArray<b3BatchConstraint>		m_cpuBatchConstraints; // NOTE(review): not referenced in the visible code; the file-static batchConstraints array is used instead
+	b3AlignedObjectArray<b3GpuSolverConstraint>	m_cpuConstraintRows; // NOTE(review): not referenced in the visible code
+	b3AlignedObjectArray<unsigned int>			m_cpuConstraintInfo1; // host copy of m_gpuConstraintInfo1 for the CPU solve path
+	b3AlignedObjectArray<unsigned int>			m_cpuConstraintRowOffsets; // host copy of m_gpuConstraintRowOffsets for the CPU paths
+	b3AlignedObjectArray<b3RigidBodyData>			m_cpuBodies; // host copy of the rigid bodies (filled by gpuBodies->copyToHost)
+	b3AlignedObjectArray<b3InertiaData>			m_cpuInertias; // host copy of the inertia data (filled by gpuInertias->copyToHost)
+	b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints; // host copy of the constraints for the CPU paths
+	b3AlignedObjectArray<int>		m_batchSizes; // number of constraints per batch; an empty array means batches must be (re)created
+static b3Transform	getWorldTransform(b3RigidBodyData* rb) // Builds a b3Transform from the body's m_pos (origin) and m_quat (rotation); rb is dereferenced without a null check
+	b3Transform newTrans;
+	newTrans.setOrigin(rb->m_pos);
+	newTrans.setRotation(rb->m_quat);
+	return newTrans; // returned by value
+static const b3Matrix3x3&	getInvInertiaTensorWorld(b3InertiaData* inertia) // Accessor: returns a reference into *inertia, so the result is only valid while that object lives
+	return inertia->m_invInertiaWorld;
+static const b3Vector3&	getLinearVelocity(b3RigidBodyData* rb) // Accessor: returns a reference to rb->m_linVel (no copy); rb must be non-null
+	return rb->m_linVel;
+static const b3Vector3&	getAngularVelocity(b3RigidBodyData* rb) // Accessor: returns a reference to rb->m_angVel (no copy); rb must be non-null
+	return rb->m_angVel;
+b3Vector3 getVelocityInLocalPoint(b3RigidBodyData* rb, const b3Vector3& rel_pos) // Velocity of a point at offset rel_pos on the body: linear velocity + angular velocity x rel_pos. Not declared static, unlike the helpers above.
+	//we also calculate lin/ang velocity for kinematic objects
+	return getLinearVelocity(rb) + getAngularVelocity(rb).cross(rel_pos);
+b3GpuPgsConstraintSolver::b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id device, cl_command_queue queue,bool usePgs) // Stores the OpenCL handles, allocates the device arrays and builds all joint-solver kernels
+	m_usePgs = usePgs; // only read in the non-batched solve branch, where false triggers averageVelocities()
+	m_gpuData = new b3GpuPgsJacobiSolverInternalData(); // owned; deleted in the destructor
+	m_gpuData->m_context = ctx;
+	m_gpuData->m_device = device;
+	m_gpuData->m_queue = queue;
+	m_gpuData->m_prefixScan = new b3PrefixScanCL(ctx,device,queue);
+	m_gpuData->m_gpuConstraintRowOffsets = new b3OpenCLArray<unsigned int>(m_gpuData->m_context,m_gpuData->m_queue); // all device arrays start empty and are resized during setup
+	m_gpuData->m_gpuSolverBodies = new b3OpenCLArray<b3GpuSolverBody>(m_gpuData->m_context,m_gpuData->m_queue);
+	m_gpuData->m_gpuBatchConstraints = new b3OpenCLArray<b3BatchConstraint>(m_gpuData->m_context,m_gpuData->m_queue);
+	m_gpuData->m_gpuConstraintRows = new b3OpenCLArray<b3GpuSolverConstraint>(m_gpuData->m_context,m_gpuData->m_queue);
+	m_gpuData->m_gpuConstraintInfo1 = new b3OpenCLArray<unsigned int>(m_gpuData->m_context,m_gpuData->m_queue);
+	cl_int errNum=0;
+	{
+		cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,&errNum,"",B3_JOINT_SOLVER_PATH); // compiles the embedded kernel source string; B3_JOINT_SOLVER_PATH is passed as the accompanying path argument
+		//cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context,m_gpuData->m_device,0,&errNum,"",B3_JOINT_SOLVER_PATH,true);
+		b3Assert(errNum==CL_SUCCESS); // errors are only checked through b3Assert; there is no other failure handling here
+		m_gpuData->m_solveJointConstraintRowsKernels = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device,solveConstraintRowsCL, "solveJointConstraintRows",&errNum,prog); // every kernel below is created from the same prog
+		b3Assert(errNum==CL_SUCCESS);
+		m_gpuData->m_initSolverBodiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"initSolverBodies",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		m_gpuData->m_getInfo1Kernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"getInfo1Kernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		m_gpuData->m_initBatchConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"initBatchConstraintsKernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		m_gpuData->m_getInfo2Kernel= b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"getInfo2Kernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		m_gpuData->m_writeBackVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"writeBackVelocitiesKernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		m_gpuData->m_breakViolatedConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"breakViolatedConstraintsKernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		clReleaseProgram(prog); // the program handle is released once all kernels have been created
+	}
+b3GpuPgsConstraintSolver::~b3GpuPgsConstraintSolver () // Releases the seven kernels and frees everything the constructor allocated; the context, device and queue are left untouched
+	clReleaseKernel(m_gpuData->m_solveJointConstraintRowsKernels);
+	clReleaseKernel(m_gpuData->m_initSolverBodiesKernel);
+	clReleaseKernel(m_gpuData->m_getInfo1Kernel);
+	clReleaseKernel(m_gpuData->m_initBatchConstraintsKernel);
+	clReleaseKernel(m_gpuData->m_getInfo2Kernel);
+	clReleaseKernel(m_gpuData->m_writeBackVelocitiesKernel);
+	clReleaseKernel(m_gpuData->m_breakViolatedConstraintsKernel);
+	delete m_gpuData->m_prefixScan;
+	delete m_gpuData->m_gpuConstraintRowOffsets;
+	delete m_gpuData->m_gpuSolverBodies;
+	delete m_gpuData->m_gpuBatchConstraints;
+	delete m_gpuData->m_gpuConstraintRows;
+	delete m_gpuData->m_gpuConstraintInfo1;
+	delete m_gpuData; // must come last: every statement above dereferences m_gpuData
+struct b3BatchConstraint // Per-constraint record used for batching; copied between host and device as an array, so its layout presumably has to match the OpenCL-side struct - TODO confirm
+	int m_bodyAPtrAndSignBit; // body A index; the host setup path stores the negated index when the body's m_invMass is zero
+	int m_bodyBPtrAndSignBit; // body B index, same sign convention as body A
+	int m_originalConstraintIndex; // index into the constraint, info1 and row-offset arrays (records are reordered by batching)
+	int m_batchId; // batch number assigned by sortConstraintByBatch3
+static b3AlignedObjectArray<b3BatchConstraint> batchConstraints; // host-side batch records; NOTE(review): file-static, so it is shared by every b3GpuPgsConstraintSolver instance in the process
+void	b3GpuPgsConstraintSolver::recomputeBatches() // Invalidates the cached batching
+	m_gpuData->m_batchSizes.clear(); // an empty m_batchSizes makes the setup and iteration functions rebuild the batches
+b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) // Prepares solver bodies, per-constraint row counts/offsets and constraint rows for the iteration phase; each stage runs on the GPU or the host depending on the useGpu* flags. Always returns 0.
+	B3_PROFILE("GPU solveGroupCacheFriendlySetup");
+	batchConstraints.resize(numConstraints); // file-static host array, see its declaration
+	m_gpuData->m_gpuBatchConstraints->resize(numConstraints);
+	m_staticIdx = -1; // only changed (to 0) by the host info2 path when body index 0 has zero inverse mass
+	m_maxOverrideNumSolverIterations = 0;
+	/*	m_gpuData->m_gpuBodies->resize(numBodies);
+	m_gpuData->m_gpuBodies->copyFromHostPointer(bodies,numBodies);
+	b3OpenCLArray<b3InertiaData> gpuInertias(m_gpuData->m_context,m_gpuData->m_queue);
+	gpuInertias.resize(numBodies);
+	gpuInertias.copyFromHostPointer(inertias,numBodies);
+	*/
+	m_gpuData->m_gpuSolverBodies->resize(numBodies);
+	m_tmpSolverBodyPool.resize(numBodies);
+	{
+		if (useGpuInitSolverBodies)
+		{
+			B3_PROFILE("m_initSolverBodiesKernel");
+			b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_initSolverBodiesKernel,"m_initSolverBodiesKernel");
+			launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL());
+			launcher.setBuffer(gpuBodies->getBufferCL());
+			launcher.setConst(numBodies);
+			launcher.launch1D(numBodies);
+			clFinish(m_gpuData->m_queue);
+			//			m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
+		} else
+		{
+			gpuBodies->copyToHost(m_gpuData->m_cpuBodies); // the only place in this function where m_cpuBodies is refreshed
+			for (int i=0;i<numBodies;i++)
+			{
+				b3RigidBodyData& body = m_gpuData->m_cpuBodies[i];
+				b3GpuSolverBody& solverBody = m_tmpSolverBodyPool[i];
+				initSolverBody(i,&solverBody,&body);
+				solverBody.m_originalBodyIndex = i; // initSolverBody already stores i in m_originalBodyIndex; this repeats the assignment
+			}
+			m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool);
+		}
+	}
+	int totalBodies = 0; // not used in the visible code
+	int totalNumRows = 0;
+	//b3RigidBody* rb0=0,*rb1=0;
+	//if (1)
+	{
+		{
+			//			int i;
+			m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
+			//			b3OpenCLArray<b3GpuGenericConstraint> gpuConstraints(m_gpuData->m_context,m_gpuData->m_queue);
+			if (useGpuInfo1)
+			{
+				B3_PROFILE("info1 and init batchConstraint");
+				m_gpuData->m_gpuConstraintInfo1->resize(numConstraints);
+				if (1)
+				{
+					B3_PROFILE("getInfo1Kernel");
+					b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_getInfo1Kernel,"m_getInfo1Kernel");
+					launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL());
+					launcher.setBuffer(gpuConstraints->getBufferCL());
+					launcher.setConst(numConstraints);
+					launcher.launch1D(numConstraints);
+					clFinish(m_gpuData->m_queue);
+				}
+				if (m_gpuData->m_batchSizes.size()==0) // row offsets and batch records are only rebuilt while no batches exist (first run or after recomputeBatches())
+				{
+					B3_PROFILE("initBatchConstraintsKernel");
+					m_gpuData->m_gpuConstraintRowOffsets->resize(numConstraints);
+					unsigned int total=0;
+					m_gpuData->m_prefixScan->execute(*m_gpuData->m_gpuConstraintInfo1,*m_gpuData->m_gpuConstraintRowOffsets,numConstraints,&total); // scans the row counts into row offsets
+					unsigned int lastElem = m_gpuData->m_gpuConstraintInfo1->at(numConstraints-1); // NOTE(review): indexes element numConstraints-1; confirm callers never pass numConstraints == 0
+					totalNumRows = total+lastElem; // value reported by the scan plus the last constraint's row count
+					{
+						B3_PROFILE("init batch constraints");
+						b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_initBatchConstraintsKernel,"m_initBatchConstraintsKernel");
+						launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL());
+						launcher.setBuffer(gpuConstraints->getBufferCL());
+						launcher.setBuffer(gpuBodies->getBufferCL());
+						launcher.setConst(numConstraints);
+						launcher.launch1D(numConstraints);
+						clFinish(m_gpuData->m_queue);
+					}
+					//assume the batching happens on CPU, so copy the data
+					m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
+				}
+			} 
+			else
+			{
+				totalNumRows  = 0;
+				gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints);
+				//calculate the total number of constraint rows
+				for (int i=0;i<numConstraints;i++)
+				{
+					unsigned int& info1= m_tmpConstraintSizesPool[i];
+					//					unsigned int info1;
+					if (m_gpuData->m_cpuConstraints[i].isEnabled())
+					{
+						m_gpuData->m_cpuConstraints[i].getInfo1(&info1,&m_gpuData->m_cpuBodies[0]); // NOTE(review): m_cpuBodies is only refreshed above when useGpuInitSolverBodies is false - confirm it is populated for this flag combination
+					} else
+					{
+						info1 = 0; // disabled constraints contribute no rows
+					}
+					totalNumRows += info1;
+				}
+				m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints);
+				m_gpuData->m_gpuConstraintInfo1->copyFromHost(m_tmpConstraintSizesPool);
+			}
+			m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
+			m_gpuData->m_gpuConstraintRows->resize(totalNumRows); // NOTE(review): on the GPU info1 path totalNumRows is only computed while m_batchSizes is empty, so later calls resize to 0 here - confirm this is intended
+			//			b3GpuConstraintArray		verify;
+			if (useGpuInfo2)
+			{
+				{
+						B3_PROFILE("getInfo2Kernel");
+						b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_getInfo2Kernel,"m_getInfo2Kernel");
+						launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL());
+						launcher.setBuffer(gpuConstraints->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL());
+						launcher.setBuffer(gpuBodies->getBufferCL());
+						launcher.setBuffer(gpuInertias->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL());
+						launcher.setConst(infoGlobal.m_timeStep);
+						launcher.setConst(infoGlobal.m_erp);
+						launcher.setConst(infoGlobal.m_globalCfm);
+						launcher.setConst(infoGlobal.m_damping);
+						launcher.setConst(infoGlobal.m_numIterations);
+						launcher.setConst(numConstraints);
+						launcher.launch1D(numConstraints);
+						clFinish(m_gpuData->m_queue);
+						if (m_gpuData->m_batchSizes.size()==0)
+							m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); // refresh the host records before batching runs on the CPU
+						//m_gpuData->m_gpuConstraintRows->copyToHost(verify);
+						//m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool);
+					}
+			} 
+			else
+			{
+				gpuInertias->copyToHost(m_gpuData->m_cpuInertias);
+					///setup the b3SolverConstraints
+				for (int i=0;i<numConstraints;i++)
+				{
+					const int& info1 = m_tmpConstraintSizesPool[i]; // NOTE(review): only filled by the host info1 path above; confirm behaviour when useGpuInfo1 is true and useGpuInfo2 is false
+					if (info1)
+					{
+						int constraintIndex = batchConstraints[i].m_originalConstraintIndex;
+						int constraintRowOffset = m_gpuData->m_cpuConstraintRowOffsets[constraintIndex]; // NOTE(review): m_cpuConstraintRowOffsets is not filled anywhere in this function - confirm where it is populated
+						b3GpuSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[constraintRowOffset];
+						b3GpuGenericConstraint& constraint = m_gpuData->m_cpuConstraints[i];
+						b3RigidBodyData& rbA = m_gpuData->m_cpuBodies[ constraint.getRigidBodyA()];
+						//b3RigidBody& rbA = constraint.getRigidBodyA();
+		//				b3RigidBody& rbB = constraint.getRigidBodyB();
+						b3RigidBodyData& rbB = m_gpuData->m_cpuBodies[ constraint.getRigidBodyB()];
+						int solverBodyIdA = constraint.getRigidBodyA();//getOrInitSolverBody(constraint.getRigidBodyA(),bodies,inertias);
+						int solverBodyIdB = constraint.getRigidBodyB();//getOrInitSolverBody(constraint.getRigidBodyB(),bodies,inertias);
+						b3GpuSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];
+						b3GpuSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
+						if (rbA.m_invMass)
+						{
+							batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA;
+						} else
+						{
+							if (!solverBodyIdA)
+								m_staticIdx = 0; // index 0 cannot be negated, so a zero-inverse-mass body 0 is flagged through m_staticIdx instead
+							batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA; // negative index marks a body with zero inverse mass
+						}
+						if (rbB.m_invMass)
+						{
+							batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB;
+						} else
+						{
+							if (!solverBodyIdB)
+								m_staticIdx = 0;
+							batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB;
+						}
+						int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
+						if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)
+							m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
+						int j;
+						for ( j=0;j<info1;j++) // reset every row of this constraint before getInfo2 fills it in
+						{
+							memset(&currentConstraintRow[j],0,sizeof(b3GpuSolverConstraint));
+							currentConstraintRow[j].m_angularComponentA.setValue(0,0,0);
+							currentConstraintRow[j].m_angularComponentB.setValue(0,0,0);
+							currentConstraintRow[j].m_appliedImpulse = 0.f;
+							currentConstraintRow[j].m_appliedPushImpulse = 0.f;
+							currentConstraintRow[j].m_cfm = 0.f;
+							currentConstraintRow[j].m_contactNormal.setValue(0,0,0);
+							currentConstraintRow[j].m_friction = 0.f;
+							currentConstraintRow[j].m_frictionIndex = 0;
+							currentConstraintRow[j].m_jacDiagABInv = 0.f;
+							currentConstraintRow[j].m_lowerLimit = 0.f;
+							currentConstraintRow[j].m_upperLimit = 0.f;
+							currentConstraintRow[j].m_originalContactPoint = 0;
+							currentConstraintRow[j].m_overrideNumSolverIterations = 0;
+							currentConstraintRow[j].m_relpos1CrossNormal.setValue(0,0,0);
+							currentConstraintRow[j].m_relpos2CrossNormal.setValue(0,0,0);
+							currentConstraintRow[j].m_rhs = 0.f;
+							currentConstraintRow[j].m_rhsPenetration = 0.f;
+							currentConstraintRow[j].m_solverBodyIdA = 0;
+							currentConstraintRow[j].m_solverBodyIdB = 0;
+							currentConstraintRow[j].m_lowerLimit = -B3_INFINITY; // the assignments from here on overwrite the zero defaults set above
+							currentConstraintRow[j].m_upperLimit = B3_INFINITY;
+							currentConstraintRow[j].m_appliedImpulse = 0.f;
+							currentConstraintRow[j].m_appliedPushImpulse = 0.f;
+							currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
+							currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
+							currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
+						}
+						bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+						bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+						bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+						bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+						bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+						bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+						bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+						bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+						b3GpuConstraintInfo2 info2; // describes the first row of this constraint to getInfo2; rowskip is the stride to the next row
+						info2.fps = 1.f/infoGlobal.m_timeStep;
+						info2.erp = infoGlobal.m_erp;
+						info2.m_J1linearAxis = currentConstraintRow->m_contactNormal;
+						info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
+						info2.m_J2linearAxis = 0;
+						info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
+						info2.rowskip = sizeof(b3GpuSolverConstraint)/sizeof(b3Scalar);//check this
+						///the size of b3GpuSolverConstraint needs to be a multiple of b3Scalar
+						b3Assert(info2.rowskip*sizeof(b3Scalar)== sizeof(b3GpuSolverConstraint));
+						info2.m_constraintError = &currentConstraintRow->m_rhs;
+						currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
+						info2.m_damping = infoGlobal.m_damping;
+						info2.cfm = &currentConstraintRow->m_cfm;
+						info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
+						info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
+						info2.m_numIterations = infoGlobal.m_numIterations;
+						m_gpuData->m_cpuConstraints[i].getInfo2(&info2,&m_gpuData->m_cpuBodies[0]);
+						///finalize the constraint setup
+						for ( j=0;j<info1;j++)
+						{
+							b3GpuSolverConstraint& solverConstraint = currentConstraintRow[j];
+							if (solverConstraint.m_upperLimit>=m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) // clamp both limits to +/- the breaking impulse threshold
+							{
+								solverConstraint.m_upperLimit = m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold();
+							}
+							if (solverConstraint.m_lowerLimit<=-m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold())
+							{
+								solverConstraint.m_lowerLimit = -m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold();
+							}
+	//						solverConstraint.m_originalContactPoint = constraint;
+							b3Matrix3x3& invInertiaWorldA= m_gpuData->m_cpuInertias[constraint.getRigidBodyA()].m_invInertiaWorld;
+							{
+								//b3Vector3 angularFactorA(1,1,1);
+								const b3Vector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
+								solverConstraint.m_angularComponentA = invInertiaWorldA*ftorqueAxis1;//*angularFactorA;
+							}
+							b3Matrix3x3& invInertiaWorldB= m_gpuData->m_cpuInertias[constraint.getRigidBodyB()].m_invInertiaWorld;
+							{
+								const b3Vector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
+								solverConstraint.m_angularComponentB = invInertiaWorldB*ftorqueAxis2;//*constraint.getRigidBodyB().getAngularFactor();
+							}
+							{
+								//it is ok to use solverConstraint.m_contactNormal instead of -solverConstraint.m_contactNormal
+								//because it gets multiplied iMJlB
+								b3Vector3 iMJlA = solverConstraint.m_contactNormal*rbA.m_invMass;
+								b3Vector3 iMJaA = invInertiaWorldA*solverConstraint.m_relpos1CrossNormal;
+								b3Vector3 iMJlB = solverConstraint.m_contactNormal*rbB.m_invMass;//sign of normal?
+								b3Vector3 iMJaB = invInertiaWorldB*solverConstraint.m_relpos2CrossNormal;
+								b3Scalar sum = iMJlA.dot(solverConstraint.m_contactNormal);
+								sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
+								sum += iMJlB.dot(solverConstraint.m_contactNormal);
+								sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
+								b3Scalar fsum = b3Fabs(sum);
+								b3Assert(fsum > B3_EPSILON);
+								solverConstraint.m_jacDiagABInv = fsum>B3_EPSILON?b3Scalar(1.)/sum : 0.f; // falls back to 0 when the sum is too small, avoiding a division by (near) zero
+							}
+							///fix rhs
+							///todo: add force/torque accelerators
+							{
+								b3Scalar rel_vel;
+								b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.m_linVel) + solverConstraint.m_relpos1CrossNormal.dot(rbA.m_angVel);
+								b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.m_linVel) + solverConstraint.m_relpos2CrossNormal.dot(rbB.m_angVel);
+								rel_vel = vel1Dotn+vel2Dotn;
+								b3Scalar restitution = 0.f;
+								b3Scalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
+								b3Scalar	velocityError = restitution - rel_vel * info2.m_damping;
+								b3Scalar	penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
+								b3Scalar	velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
+								solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
+								solverConstraint.m_appliedImpulse = 0.f;
+							}
+						}
+					}
+				}
+				m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool); // upload everything the host path produced
+				m_gpuData->m_gpuConstraintInfo1->copyFromHost(m_tmpConstraintSizesPool);
+				if (m_gpuData->m_batchSizes.size()==0)
+					m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints);
+				else
+					m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); // batches already exist: the device copy overwrites the host records
+				m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool);
+			}//end useGpuInfo2
+		}
+		{
+			int i;
+			for (i=0;i<numManifolds;i++) // NOTE(review): numManifolds, manifoldPtr, bodies and inertias are not declared in this function as shown; this loop is presumably disabled in the full source - confirm
+			{
+				b3Contact4& manifold = manifoldPtr[i];
+				convertContact(bodies,inertias,&manifold,infoGlobal);
+			}
+		}
+	}
+//	b3ContactSolverInfo info = infoGlobal;
+	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size(); // the three counts below are computed but never used in the visible code
+	int numConstraintPool = m_tmpSolverContactConstraintPool.size();
+	int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
+	return 0.f;
+///a straight copy from GPU/OpenCL kernel, for debugging
+__inline void internalApplyImpulse( b3GpuSolverBody* body,  const b3Vector3& linearComponent, const b3Vector3& angularComponent,float impulseMagnitude) // Adds the impulse to the body's delta velocities, scaled by its linear/angular factors
+	body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor;
+	body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor);
+void resolveSingleConstraintRowGeneric2( b3GpuSolverBody* body1,  b3GpuSolverBody* body2,  b3GpuSolverConstraint* c) // Host-side solve of one constraint row: computes the impulse change, clamps the accumulated impulse to the row limits and applies the change to both bodies
+	float deltaImpulse = c->m_rhs-b3Scalar(c->m_appliedImpulse)*c->m_cfm;
+	float deltaVel1Dotn	=	b3Dot(c->m_contactNormal,body1->m_deltaLinearVelocity) 	+ b3Dot(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity);
+	float deltaVel2Dotn	=	-b3Dot(c->m_contactNormal,body2->m_deltaLinearVelocity) + b3Dot(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity); // the linear term is negated for body 2
+	deltaImpulse	-=	deltaVel1Dotn*c->m_jacDiagABInv;
+	deltaImpulse	-=	deltaVel2Dotn*c->m_jacDiagABInv;
+	float sum = b3Scalar(c->m_appliedImpulse) + deltaImpulse; // candidate accumulated impulse before clamping
+	if (sum < c->m_lowerLimit)
+	{
+		deltaImpulse = c->m_lowerLimit-b3Scalar(c->m_appliedImpulse); // apply only the part that reaches the lower limit
+		c->m_appliedImpulse = c->m_lowerLimit;
+	}
+	else if (sum > c->m_upperLimit) 
+	{
+		deltaImpulse = c->m_upperLimit-b3Scalar(c->m_appliedImpulse); // apply only the part that reaches the upper limit
+		c->m_appliedImpulse = c->m_upperLimit;
+	}
+	else
+	{
+		c->m_appliedImpulse = sum;
+	}
+	internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);
+	internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); // body 2 receives the linear impulse along the opposite normal
+void	b3GpuPgsConstraintSolver::initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb) // Host-side initialisation of one solver body from its rigid body: zeroes the accumulated deltas and copies mass and velocities
+	solverBody->m_deltaLinearVelocity.setValue(0.f,0.f,0.f);
+	solverBody->m_deltaAngularVelocity.setValue(0.f,0.f,0.f);
+	solverBody->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+	solverBody->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+	b3Assert(rb); // rb is dereferenced unconditionally below
+//	solverBody->m_worldTransform = getWorldTransform(rb);
+	solverBody->internalSetInvMass(b3MakeVector3(rb->m_invMass,rb->m_invMass,rb->m_invMass)); // the scalar inverse mass is replicated on all three axes
+	solverBody->m_originalBodyIndex = bodyIndex;
+	solverBody->m_angularFactor = b3MakeVector3(1,1,1); // factors of 1 leave impulses unscaled in internalApplyImpulse
+	solverBody->m_linearFactor = b3MakeVector3(1,1,1);
+	solverBody->m_linearVelocity = getLinearVelocity(rb);
+	solverBody->m_angularVelocity = getAngularVelocity(rb);
+void	b3GpuPgsConstraintSolver::averageVelocities() // No statements are visible for this definition here; it is called from the non-batched solve loop when m_usePgs is false
+b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1,int numConstraints,const b3ContactSolverInfo& infoGlobal) // Creates the constraint batches if none exist, then runs infoGlobal.m_numIterations solver iterations over them, on the GPU or the host depending on useGpuSolveJointConstraintRows. Always returns 0.
+	//only create the batches once.
+	//@todo: incrementally update batches when constraints are added/activated and/or removed/deactivated
+	B3_PROFILE("GpuSolveGroupCacheFriendlyIterations");
+	bool createBatches = m_gpuData->m_batchSizes.size()==0; // true on the first call and after recomputeBatches()
+	{
+		if (createBatches)
+		{
+			m_gpuData->m_batchSizes.resize(0);
+			{
+				m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
+				B3_PROFILE("batch joints");
+				b3Assert(batchConstraints.size()==numConstraints);
+				int simdWidth =numConstraints+1; // larger than the number of constraints, so a single pass can never accept simdWidth of them
+				int numBodies = m_tmpSolverBodyPool.size();
+				sortConstraintByBatch3( &batchConstraints[0], numConstraints, simdWidth , m_staticIdx,  numBodies); // reorders the records and assigns m_batchId; presumably also fills m_batchSizes - confirm in sortConstraintByBatch3
+				m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints);
+			}
+		} else
+		{
+			/*b3AlignedObjectArray<b3BatchConstraint> cpuCheckBatches;
+			m_gpuData->m_gpuBatchConstraints->copyToHost(cpuCheckBatches);
+			b3Assert(cpuCheckBatches.size()==batchConstraints.size());
+			printf(".\n");
+			*/
+			//>copyFromHost(batchConstraints);
+		}
+		int maxIterations = infoGlobal.m_numIterations;
+		bool useBatching = true; // hard-coded, so the non-batched else branch further down is never taken as written
+		if (useBatching )
+		{
+			if (!useGpuSolveJointConstraintRows)
+			{
+				B3_PROFILE("copy to host");
+				m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); // the host solve needs current copies of everything the kernels would read
+				m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
+				m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool);
+				m_gpuData->m_gpuConstraintInfo1->copyToHost(m_gpuData->m_cpuConstraintInfo1);
+				m_gpuData->m_gpuConstraintRowOffsets->copyToHost(m_gpuData->m_cpuConstraintRowOffsets);
+				gpuConstraints1->copyToHost(m_gpuData->m_cpuConstraints);
+			}
+			for ( int iteration = 0 ; iteration< maxIterations ; iteration++)
+			{
+				int batchOffset = 0; // index of the first batch record of the current batch
+				int constraintOffset=0; // only incremented below; the CPU path declares its own constraintOffset that shadows this one
+				int numBatches = m_gpuData->m_batchSizes.size();
+				for (int bb=0;bb<numBatches;bb++)
+				{
+					int numConstraintsInBatch = m_gpuData->m_batchSizes[bb];
+					if (useGpuSolveJointConstraintRows)
+					{
+						B3_PROFILE("solveJointConstraintRowsKernels");
+						/*
+						__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies,
+					  __global b3BatchConstraint* batchConstraints,
+					  	__global b3SolverConstraint* rows,
+						__global unsigned int* numConstraintRowsInfo1, 
+						__global unsigned int* rowOffsets,
+						__global b3GpuGenericConstraint* constraints,
+						int batchOffset,
+						int numConstraintsInBatch*/
+						b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_solveJointConstraintRowsKernels,"m_solveJointConstraintRowsKernels");
+						launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL());
+						launcher.setBuffer(gpuConstraints1->getBufferCL());//to detect disabled constraints
+						launcher.setConst(batchOffset);
+						launcher.setConst(numConstraintsInBatch);
+						launcher.launch1D(numConstraintsInBatch); // no clFinish per batch; the queue is finished once at the end of this function
+					} else//useGpu
+					{
+						for (int b=0;b<numConstraintsInBatch;b++)
+						{
+							const b3BatchConstraint& c = batchConstraints[batchOffset+b];
+							/*printf("-----------\n");
+							printf("bb=%d\n",bb);
+							printf("c.batchId = %d\n", c.m_batchId);
+							*/
+							b3Assert(c.m_batchId==bb);
+							b3GpuGenericConstraint* constraint = &m_gpuData->m_cpuConstraints[c.m_originalConstraintIndex];
+							if (constraint->m_flags&B3_CONSTRAINT_FLAG_ENABLED) // disabled constraints are skipped
+							{
+								int numConstraintRows = m_gpuData->m_cpuConstraintInfo1[c.m_originalConstraintIndex];
+								int constraintOffset = m_gpuData->m_cpuConstraintRowOffsets[c.m_originalConstraintIndex]; // shadows the outer constraintOffset
+								for (int jj=0;jj<numConstraintRows;jj++)
+								{
+	//							
+									b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[constraintOffset+jj]; // shadows the outer 'constraint' pointer; this one is a single row
+									//resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
+									resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA],&m_tmpSolverBodyPool[constraint.m_solverBodyIdB],&constraint);
+								}
+							}
+						}
+					}//useGpu
+					batchOffset+=numConstraintsInBatch;
+					constraintOffset+=numConstraintsInBatch;
+				}
+			}//for (int iteration...
+			if (!useGpuSolveJointConstraintRows)
+			{
+				{
+					B3_PROFILE("copy from host");
+					m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool); // push the host-side results back to the device buffers
+					m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints);
+					m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool);
+				}
+				//B3_PROFILE("copy to host");
+				//m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
+			}
+			//int sz = sizeof(b3GpuSolverBody);
+			//printf("cpu sizeof(b3GpuSolverBody)=%d\n",sz);
+		} else
+		{
+			for ( int iteration = 0 ; iteration< maxIterations ; iteration++) // non-batched host solve over every row in pool order
+			{			
+				int numJoints =			m_tmpSolverNonContactConstraintPool.size();
+				for (int j=0;j<numJoints;j++)
+				{
+					b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[j];
+					resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA],&m_tmpSolverBodyPool[constraint.m_solverBodyIdB],&constraint);
+				}
+				if (!m_usePgs)
+				{
+					averageVelocities();
+				}
+			}
+		}
+	}
+	clFinish(m_gpuData->m_queue); // wait for all work enqueued above before returning
+	return 0.f;
+static b3AlignedObjectArray<int> bodyUsed; // scratch bitfield for sortConstraintByBatch3: one bit per body, sized numBodies/32+1 ints; file-static, so shared by all solver instances
+static b3AlignedObjectArray<int> curUsed; // scratch list of the body indices marked in the current batch pass, sized 2*simdWidth; file-static as well
+inline int b3GpuPgsConstraintSolver::sortConstraintByBatch3( b3BatchConstraint* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies) // reorders cs in place into batches in which no non-static body appears twice, stamps m_batchId, returns the number of batches
+	int sz = sizeof(b3BatchConstraint); // never read in this function
+	B3_PROFILE("sortConstraintByBatch3");
+	static int maxSwaps = 0; // largest numSwaps seen over all calls (updated near the end)
+	int numSwaps = 0;
+	curUsed.resize(2*simdWidth); // up to two bodies are recorded per accepted constraint, and the list restarts after simdWidth of them
+	static int maxNumConstraints = 0; // largest numConstraints seen over all calls
+	if (maxNumConstraints<numConstraints)
+	{
+		maxNumConstraints = numConstraints;
+		//printf("maxNumConstraints  = %d\n",maxNumConstraints );
+	}
+	int numUsedArray = numBodies/32+1; // one flag bit per body, 32 bodies per int
+	bodyUsed.resize(numUsedArray);
+	for (int q=0;q<numUsedArray;q++)
+		bodyUsed[q]=0;
+	int curBodyUsed = 0; // number of valid entries in curUsed
+	int numIter = 0;
+#if defined(_DEBUG)
+	for(int i=0; i<numConstraints; i++)
+		cs[i].m_batchId = -1; // sentinel checked by the assert loop near the end
+	int numValidConstraints = 0; // cs[0..numValidConstraints) already belong to a batch
+	int unprocessedConstraintIndex = 0;
+	int batchIdx = 0;
+	{
+		B3_PROFILE("cpu batch innerloop");
+		while( numValidConstraints < numConstraints) // each pass builds one batch
+		{
+			numIter++;
+			int nCurrentBatch = 0;
+			//	clear the flag words touched while building the previous batch
+			for(int i=0; i<curBodyUsed; i++) 
+				bodyUsed[curUsed[i]/32] = 0;
+            curBodyUsed = 0;
+			for(int i=numValidConstraints; i<numConstraints; i++) // scan the constraints that have no batch yet
+			{
+				int idx = i;
+				b3Assert( idx < numConstraints );
+				//	check if it can go
+				int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
+				int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
+				int bodyA = abs(bodyAS);
+				int bodyB = abs(bodyBS);
+				bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; // static bodies are never flagged, so they never block a constraint
+				bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;
+				int aUnavailable = 0;
+				int bUnavailable = 0;
+				if (!aIsStatic)
+				{
+					aUnavailable = bodyUsed[ bodyA/32 ] & (1<<(bodyA&31));
+				}
+				if (!aUnavailable) // body B is only tested when body A is free
+				if (!bIsStatic)
+				{
+					bUnavailable = bodyUsed[ bodyB/32 ] & (1<<(bodyB&31));
+				}
+				if( aUnavailable==0 && bUnavailable==0 ) // ok
+				{
+					if (!aIsStatic)
+					{
+						bodyUsed[ bodyA/32 ] |= (1<<(bodyA&31));
+						curUsed[curBodyUsed++]=bodyA;
+					}
+					if (!bIsStatic)
+					{
+						bodyUsed[ bodyB/32 ] |= (1<<(bodyB&31));
+						curUsed[curBodyUsed++]=bodyB;
+					}
+					cs[idx].m_batchId = batchIdx;
+					if (i!=numValidConstraints)
+					{
+						b3Swap(cs[i],cs[numValidConstraints]); // move the accepted constraint to the end of the batched prefix
+						numSwaps++;
+					}
+					numValidConstraints++;
+					{
+						nCurrentBatch++;
+						if( nCurrentBatch == simdWidth ) // NOTE(review): resets the counter and flags without advancing batchIdx, so the size pushed below is only the remainder - confirm callers pass a simdWidth larger than any batch
+						{
+							nCurrentBatch = 0;
+							for(int i=0; i<curBodyUsed; i++) // this i shadows the outer loop index
+								bodyUsed[curUsed[i]/32] = 0;
+							curBodyUsed = 0;
+						}
+					}
+				}
+			}
+			m_gpuData->m_batchSizes.push_back(nCurrentBatch); // appended; this function never clears m_batchSizes itself
+			batchIdx ++;
+		}
+	}
+#if defined(_DEBUG)
+    //		debugPrintf( "nBatches: %d\n", batchIdx );
+	for(int i=0; i<numConstraints; i++)
+    {
+        b3Assert( cs[i].m_batchId != -1 );
+    }
+	if (maxSwaps<numSwaps)
+	{
+		maxSwaps = numSwaps;
+		//printf("maxSwaps = %d\n", maxSwaps);
+	}
+	return batchIdx; // number of batches created
+/// b3GpuPgsConstraintSolver::solveGroup runs the three solver stages in order: setup, iterations, finish.
+b3Scalar b3GpuPgsConstraintSolver::solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, 
+				int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints, const b3ContactSolverInfo& infoGlobal)
+	B3_PROFILE("solveJoints");
+	//you need to provide at least some bodies
+	solveGroupCacheFriendlySetup( gpuBodies, gpuInertias,numBodies,gpuConstraints, numConstraints,infoGlobal);
+	solveGroupCacheFriendlyIterations(gpuConstraints, numConstraints,infoGlobal);
+	solveGroupCacheFriendlyFinish(gpuBodies, gpuInertias,numBodies, gpuConstraints, numConstraints, infoGlobal);
+	return 0.f; // always zero; the return values of the three stages are ignored
+void	b3GpuPgsConstraintSolver::solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, 
+				int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints) // builds a fixed b3ContactSolverInfo and forwards everything to solveGroup
+	b3ContactSolverInfo infoGlobal;
+	infoGlobal.m_splitImpulse = false;
+	infoGlobal.m_timeStep = 1.f/60.f; // hard-coded step; not supplied by the caller
+	infoGlobal.m_numIterations = 4;//4; hard-coded iteration count
+	infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS;
+	solveGroup(gpuBodies,gpuInertias,numBodies,gpuConstraints,numConstraints,infoGlobal); // result discarded
+//b3AlignedObjectArray<b3RigidBodyData> testBodies;
+/// Last stage of a joint solve.
+///  1. Breaks violated constraints, either by launching m_breakViolatedConstraintsKernel or,
+///     on the host path, by clearing m_flags of every constraint that owns a row whose
+///     |m_appliedImpulse| reached the constraint's m_breakingImpulseThreshold.
+///  2. Writes the solver-body velocities back into gpuBodies, either with
+///     m_writeBackVelocitiesKernel or on the host.
+///  3. Resizes the temporary solver pools to 0.
+/// gpuInertias is not used here. Always returns 0.
+b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias,int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal)
+	B3_PROFILE("solveGroupCacheFriendlyFinish");
+//	int i,j;
+	{
+		if (gpuBreakConstraints)
+		{
+			B3_PROFILE("breakViolatedConstraintsKernel");
+			b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_breakViolatedConstraintsKernel,"m_breakViolatedConstraintsKernel");
+			launcher.setBuffer(gpuConstraints->getBufferCL());
+			launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL());
+			launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL());
+			launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL());
+			launcher.setConst(numConstraints);
+			launcher.launch1D(numConstraints);
+		} else
+		{
+			// Host path: mirror the GPU arrays, test every row, then upload the changed flags.
+			gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints);
+			m_gpuData->m_gpuBatchConstraints->copyToHost(m_gpuData->m_cpuBatchConstraints);
+			m_gpuData->m_gpuConstraintRows->copyToHost(m_gpuData->m_cpuConstraintRows);
+			m_gpuData->m_gpuConstraintInfo1->copyToHost(m_gpuData->m_cpuConstraintInfo1);
+			m_gpuData->m_gpuConstraintRowOffsets->copyToHost(m_gpuData->m_cpuConstraintRowOffsets);
+			for (int cid=0;cid<numConstraints;cid++)
+			{
+				// NOTE(review): this reads the file-level batchConstraints array, not the
+				// m_cpuBatchConstraints copy made just above - confirm the two always match.
+				int originalConstraintIndex = batchConstraints[cid].m_originalConstraintIndex;
+				int constraintRowOffset = m_gpuData->m_cpuConstraintRowOffsets[originalConstraintIndex];
+				int numRows = m_gpuData->m_cpuConstraintInfo1[originalConstraintIndex];
+				if (numRows)
+				{
+				//	printf("cid=%d, breakingThreshold =%f\n",cid,breakingThreshold);
+					for (int i=0;i<numRows;i++)
+					{
+						int rowIndex =constraintRowOffset+i;
+						int orgConstraintIndex = m_gpuData->m_cpuConstraintRows[rowIndex].m_originalConstraintIndex;
+						float breakingThreshold = m_gpuData->m_cpuConstraints[orgConstraintIndex].m_breakingImpulseThreshold;
+					//	printf("rows[%d].m_appliedImpulse=%f\n",rowIndex,rows[rowIndex].m_appliedImpulse);
+						// Compare the magnitude of the impulse: the absolute value must be taken
+						// before the comparison so that large negative impulses break the constraint too.
+						if (b3Fabs(m_gpuData->m_cpuConstraintRows[rowIndex].m_appliedImpulse) >= breakingThreshold)
+						{
+							m_gpuData->m_cpuConstraints[orgConstraintIndex].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED;
+						}
+					}
+				}
+			}
+			gpuConstraints->copyFromHost(m_gpuData->m_cpuConstraints);
+		}
+	}
+	{
+		if (useGpuWriteBackVelocities)
+		{
+			B3_PROFILE("GPU write back velocities and transforms");
+			b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_writeBackVelocitiesKernel,"m_writeBackVelocitiesKernel");
+			launcher.setBuffer(gpuBodies->getBufferCL());
+			launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL());
+			launcher.setConst(numBodies);
+			launcher.launch1D(numBodies);
+			clFinish(m_gpuData->m_queue);
+//			m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
+//			m_gpuData->m_gpuBodies->copyToHostPointer(bodies,numBodies);
+			//m_gpuData->m_gpuBodies->copyToHost(testBodies);
+		} 
+		else
+		{
+			B3_PROFILE("CPU write back velocities and transforms");
+			m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
+			gpuBodies->copyToHost(m_gpuData->m_cpuBodies);
+			for ( int i=0;i<m_tmpSolverBodyPool.size();i++)
+			{
+				int bodyIndex = m_tmpSolverBodyPool[i].m_originalBodyIndex;
+				//printf("bodyIndex=%d\n",bodyIndex);
+				b3Assert(i==bodyIndex);
+				b3RigidBodyData* body = &m_gpuData->m_cpuBodies[bodyIndex];
+				// Bodies with zero inverse mass are left untouched.
+				if (body->m_invMass)
+				{
+					if (infoGlobal.m_splitImpulse)
+						m_tmpSolverBodyPool[i].writebackVelocityAndTransform(infoGlobal.m_timeStep, infoGlobal.m_splitImpulseTurnErp);
+					else
+						m_tmpSolverBodyPool[i].writebackVelocity();
+					if (m_usePgs)
+					{
+						body->m_linVel = m_tmpSolverBodyPool[i].m_linearVelocity;
+						body->m_angVel = m_tmpSolverBodyPool[i].m_angularVelocity;
+					} else
+					{
+						// The host write-back only handles the PGS case.
+						b3Assert(0);
+					}
+	/*			
+					if (infoGlobal.m_splitImpulse)
+					{
+						body->m_pos = m_tmpSolverBodyPool[i].m_worldTransform.getOrigin();
+						b3Quaternion orn;
+						orn = m_tmpSolverBodyPool[i].m_worldTransform.getRotation();
+						body->m_quat = orn;
+					}
+					*/
+				}
+			}//for
+			gpuBodies->copyFromHost(m_gpuData->m_cpuBodies);
+		}
+	}
+	clFinish(m_gpuData->m_queue);
+	// Empty the per-solve scratch pools (capacity is kept by resizeNoInitialize).
+	m_tmpSolverContactConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverBodyPool.resizeNoInitialize(0);
+	return 0.f;
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h
new file mode 100644
index 00000000..ec0e3f73
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h
@@ -0,0 +1,78 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+struct b3Contact4;
+struct b3ContactPoint;
+class b3Dispatcher;
+#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
+#include "Bullet3Dynamics/ConstraintSolver/b3ContactSolverInfo.h"
+#include "b3GpuSolverBody.h"
+#include "b3GpuSolverConstraint.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+struct b3RigidBodyData;
+struct b3InertiaData;
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+#include "b3GpuGenericConstraint.h"
+class b3GpuPgsConstraintSolver // joint-constraint solver: solveGroup() runs the Setup, Iterations and Finish stages in that order
+	int m_staticIdx;
+	struct b3GpuPgsJacobiSolverInternalData* m_gpuData; // holds the OpenCL queue, kernels and the GPU/host mirror arrays used by the stages
+	protected:
+	b3AlignedObjectArray<b3GpuSolverBody>      m_tmpSolverBodyPool; // per-solve scratch; resized to 0 at the end of solveGroupCacheFriendlyFinish
+	b3GpuConstraintArray			m_tmpSolverContactConstraintPool; // this pool and the three below are likewise resized to 0 in Finish
+	b3GpuConstraintArray			m_tmpSolverNonContactConstraintPool; // the joint rows iterated by the host solve loop
+	b3GpuConstraintArray			m_tmpSolverContactFrictionConstraintPool;
+	b3GpuConstraintArray			m_tmpSolverContactRollingFrictionConstraintPool;
+	b3AlignedObjectArray<unsigned int> m_tmpConstraintSizesPool;
+	bool						m_usePgs; // when false, the host iteration loop calls averageVelocities() after each pass and the host write-back asserts
+	void						averageVelocities();
+	int							m_maxOverrideNumSolverIterations;
+	int							m_numSplitImpulseRecoveries;
+//	int	getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias);
+	void	initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb);
+	b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id device, cl_command_queue queue,bool usePgs);
+	virtual~b3GpuPgsConstraintSolver ();
+	virtual b3Scalar solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1,int numConstraints,const b3ContactSolverInfo& infoGlobal);
+	virtual b3Scalar solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
+	b3Scalar solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias,int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal); // breaks violated constraints, writes velocities back, empties the scratch pools
+	b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal); // always returns 0
+	void	solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, 
+				int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints); // calls solveGroup with a hard-coded b3ContactSolverInfo
+	int sortConstraintByBatch3( struct b3BatchConstraint* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies); // reorders cs in place and returns the number of batches
+	void	recomputeBatches();
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp
new file mode 100644
index 00000000..694e7c1b
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp
@@ -0,0 +1,1701 @@
+bool gUseLargeBatches = false; // this and the flags below are non-static, non-const globals: they have external linkage and can be changed at runtime
+bool gCpuBatchContacts = false;
+bool gCpuSolveConstraint = false;
+bool gCpuRadixSort=false;
+bool gCpuSetSortData = false;
+bool gCpuSortContactsDeterminism = false;
+bool gUseCpuCopyConstraints = false;
+bool gUseScanHost = false;
+bool gReorderContactsOnCpu = false;
+bool optionalSortContactsDeterminism = true; // the only flag in this group that defaults to true
+#include "b3GpuPgsContactSolver.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
+#include <string.h>
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
+#include "b3Solver.h"
+#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
+#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
+#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
+#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
+#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
+#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
+#include "kernels/solverSetup.h"
+#include "kernels/solverSetup2.h"
+#include "kernels/solveContact.h"
+#include "kernels/solveFriction.h"
+#include "kernels/batchingKernels.h"
+#include "kernels/batchingKernelsNew.h"
+struct	b3GpuBatchingPgsSolverInternalData // private state of b3GpuPgsContactSolver; the pointer and handle members have no initialisers here and are filled in by the solver's constructor
+	cl_context m_context;
+	cl_device_id m_device;
+	cl_command_queue m_queue;
+	int m_pairCapacity;
+	int m_nIterations; // set to 4 by the constructor
+	b3OpenCLArray<b3GpuConstraint4>* m_contactCGPU; // allocated with pairCapacity elements
+	b3OpenCLArray<unsigned int>* m_numConstraints; // resized to B3_SOLVER_N_CELLS by the constructor
+	b3OpenCLArray<unsigned int>* m_offsets; // resized to B3_SOLVER_N_CELLS by the constructor
+	b3Solver*		m_solverGPU;		// its m_numConstraints, m_offsets and m_batchSizes buffers are what solveContactConstraint passes to the kernels
+	cl_kernel m_batchingKernel; // "CreateBatches"
+	cl_kernel m_batchingKernelNew; // "CreateBatchesNew"
+	cl_kernel m_solveContactKernel; // "BatchSolveKernelContact"
+	cl_kernel m_solveSingleContactKernel; // "solveSingleContactKernel"
+	cl_kernel m_solveSingleFrictionKernel; // "solveSingleFrictionKernel"
+	cl_kernel m_solveFrictionKernel; // "BatchSolveKernelFriction"
+	cl_kernel m_contactToConstraintKernel; // "ContactToConstraintKernel"
+	cl_kernel m_setSortDataKernel; // "SetSortDataKernel"
+	cl_kernel m_reorderContactKernel; // "ReorderContactKernel"
+	cl_kernel m_copyConstraintKernel; // "CopyConstraintKernel"
+	cl_kernel	m_setDeterminismSortDataBodyAKernel; // "SetDeterminismSortDataBodyA"
+	cl_kernel	m_setDeterminismSortDataBodyBKernel; // "SetDeterminismSortDataBodyB"
+	cl_kernel	m_setDeterminismSortDataChildShapeAKernel; // "SetDeterminismSortDataChildShapeA"
+	cl_kernel	m_setDeterminismSortDataChildShapeBKernel; // "SetDeterminismSortDataChildShapeB"
+	class b3RadixSort32CL*	m_sort32;
+	class b3BoundSearchCL*	m_search;
+	class b3PrefixScanCL*	m_scan;
+	b3OpenCLArray<b3SortData>* m_sortDataBuffer; // allocated with pairCapacity rounded up to a multiple of 512
+	b3OpenCLArray<b3Contact4>* m_contactBuffer;
+	b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU; // solveContacts points this at the caller's body buffer via setFromOpenCLBuffer
+	b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
+	b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU;
+	b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy;
+	b3OpenCLArray<b3SortData>*	m_contactKeyValues;
+	b3AlignedObjectArray<unsigned int> m_idxBuffer;
+	b3AlignedObjectArray<b3SortData> m_sortData;
+	b3AlignedObjectArray<b3Contact4> m_old;
+	b3AlignedObjectArray<int>	m_batchSizes;
+	b3OpenCLArray<int>*	m_batchSizesGpu;
+/// Builds the contact solver for one OpenCL context/device/queue.
+/// Allocates the internal data block and its GPU arrays and helper objects, then compiles the six
+/// embedded kernel sources and creates the fourteen kernels stored in m_data.
+/// pairCapacity sizes m_contactCGPU directly and m_sortDataBuffer after rounding up to a
+/// multiple of 512. Each cl_program is only needed until its kernels exist, so it is
+/// released at the end of the scope that created it.
+b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx,cl_device_id device, cl_command_queue  q,int pairCapacity)
+	m_debugOutput=0;
+	m_data = new b3GpuBatchingPgsSolverInternalData;
+	m_data->m_context = ctx;
+	m_data->m_device = device;
+	m_data->m_queue = q;
+	m_data->m_pairCapacity = pairCapacity;
+	m_data->m_nIterations = 4;
+	m_data->m_batchSizesGpu = new b3OpenCLArray<int>(ctx,q);
+	m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx,q);
+	m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx,q);
+	m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx,q);
+	m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx,q);
+	m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx,q);
+	m_data->m_solverGPU = new b3Solver(ctx,device,q,512*1024);
+	m_data->m_sort32 = new b3RadixSort32CL(ctx,device,m_data->m_queue);
+	m_data->m_scan = new b3PrefixScanCL(ctx,device,m_data->m_queue,B3_SOLVER_N_CELLS);
+	m_data->m_search = new b3BoundSearchCL(ctx,device,m_data->m_queue,B3_SOLVER_N_CELLS);
+	const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 );
+	m_data->m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx,m_data->m_queue,sortSize);
+	m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx,m_data->m_queue);
+	m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,m_data->m_queue,B3_SOLVER_N_CELLS);
+	m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS);
+	m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx,q,pairCapacity);
+	m_data->m_offsets = new b3OpenCLArray<unsigned int>( ctx,m_data->m_queue,B3_SOLVER_N_CELLS);
+	m_data->m_offsets->resize(B3_SOLVER_N_CELLS);
+	const char* additionalMacros = "";
+	cl_int pErrNum;
+	const char* batchKernelSource = batchingKernelsCL;
+	const char* batchKernelNewSource = batchingKernelsNewCL;
+	const char* solverSetupSource = solverSetupCL;
+	const char* solverSetup2Source = solverSetup2CL;
+	const char* solveContactSource = solveContactCL;
+	const char* solveFrictionSource = solveFrictionCL;
+	{
+		cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
+		b3Assert(solveContactProg);
+		cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
+		b3Assert(solveFrictionProg);
+		cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
+		b3Assert(solverSetup2Prog);
+		cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
+		b3Assert(solverSetupProg);
+		m_data->m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros );
+		b3Assert(m_data->m_solveFrictionKernel);
+		m_data->m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros );
+		b3Assert(m_data->m_solveContactKernel);
+		m_data->m_solveSingleContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "solveSingleContactKernel", &pErrNum, solveContactProg,additionalMacros );
+		b3Assert(m_data->m_solveSingleContactKernel);
+		m_data->m_solveSingleFrictionKernel =b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "solveSingleFrictionKernel", &pErrNum, solveFrictionProg,additionalMacros );
+		b3Assert(m_data->m_solveSingleFrictionKernel);
+		m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros );
+		b3Assert(m_data->m_contactToConstraintKernel);
+		m_data->m_setSortDataKernel =  b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_data->m_setSortDataKernel);
+		m_data->m_setDeterminismSortDataBodyAKernel =  b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_data->m_setDeterminismSortDataBodyAKernel);
+		m_data->m_setDeterminismSortDataBodyBKernel =  b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_data->m_setDeterminismSortDataBodyBKernel);
+		m_data->m_setDeterminismSortDataChildShapeAKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeA", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_data->m_setDeterminismSortDataChildShapeAKernel);
+		m_data->m_setDeterminismSortDataChildShapeBKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeB", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_data->m_setDeterminismSortDataChildShapeBKernel);
+		m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_data->m_reorderContactKernel);
+		m_data->m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_data->m_copyConstraintKernel);
+		// A kernel object keeps its program alive, so the local handles can be dropped
+		// now. The null checks cover builds where b3Assert compiles to nothing.
+		if (solveContactProg)
+			clReleaseProgram(solveContactProg);
+		if (solveFrictionProg)
+			clReleaseProgram(solveFrictionProg);
+		if (solverSetup2Prog)
+			clReleaseProgram(solverSetup2Prog);
+		if (solverSetupProg)
+			clReleaseProgram(solverSetupProg);
+	}
+	{
+		cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH);
+		b3Assert(batchingProg);
+		m_data->m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros );
+		b3Assert(m_data->m_batchingKernel);
+		if (batchingProg)
+			clReleaseProgram(batchingProg);
+	}
+	{
+		cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH);
+		b3Assert(batchingNewProg);
+		m_data->m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros );
+		b3Assert(m_data->m_batchingKernelNew);
+		if (batchingNewProg)
+			clReleaseProgram(batchingNewProg);
+	}
+	delete m_data->m_batchSizesGpu; // teardown counterpart of the constructor: the 15 deletes below match its 15 new-allocated members
+	delete m_data->m_bodyBufferGPU;
+	delete m_data->m_inertiaBufferGPU;
+	delete m_data->m_pBufContactOutGPU;
+	delete m_data->m_pBufContactOutGPUCopy;
+	delete m_data->m_contactKeyValues;
+	delete m_data->m_contactCGPU;
+	delete m_data->m_numConstraints;
+	delete m_data->m_offsets;
+	delete m_data->m_sortDataBuffer;
+	delete m_data->m_contactBuffer;
+	delete m_data->m_sort32;
+	delete m_data->m_scan;
+	delete m_data->m_search;
+	delete m_data->m_solverGPU;
+	clReleaseKernel(m_data->m_batchingKernel); // the 14 releases below match the 14 kernels the constructor creates
+	clReleaseKernel(m_data->m_batchingKernelNew);
+	clReleaseKernel(m_data->m_solveSingleContactKernel);
+	clReleaseKernel(m_data->m_solveSingleFrictionKernel);
+	clReleaseKernel( m_data->m_solveContactKernel);
+	clReleaseKernel( m_data->m_solveFrictionKernel);
+	clReleaseKernel( m_data->m_contactToConstraintKernel);
+	clReleaseKernel( m_data->m_setSortDataKernel);
+	clReleaseKernel( m_data->m_reorderContactKernel);
+	clReleaseKernel( m_data->m_copyConstraintKernel);
+	clReleaseKernel(m_data->m_setDeterminismSortDataBodyAKernel);
+	clReleaseKernel(m_data->m_setDeterminismSortDataBodyBKernel);
+	clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeAKernel);
+	clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeBKernel);
+	delete m_data; // the holder itself goes last, after everything it points to
+// Solver configuration record.
+// The constructor gives every member a defined value: m_enableParallelSolve and
+// m_batchCellSize start as false / 0 so that reading them before the caller assigns
+// them is well defined. The initialiser list follows declaration order.
+struct b3ConstraintCfg
+	b3ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt), m_enableParallelSolve(false), m_batchCellSize(0.f), m_staticIdx(0) {}
+	float m_positionDrift;
+	float m_positionConstraintCoeff;
+	float m_dt;
+	bool m_enableParallelSolve;
+	float m_batchCellSize;
+	int m_staticIdx;
+void b3GpuPgsContactSolver::solveContactConstraintBatchSizes(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, 
+			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,int numIterations, const b3AlignedObjectArray<int>* batchSizes)//const b3OpenCLArray<int>* gpuBatchSizes)
+	B3_PROFILE("solveContactConstraintBatchSizes");
+	int numBatches = batchSizes->size()/B3_MAX_NUM_BATCHES;
+	for(int iter=0; iter<numIterations; iter++)
+	{
+		for (int cellId=0;cellId<numBatches;cellId++)
+		{
+			int offset = 0;
+			for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++)
+			{
+				int numInBatch = batchSizes->at(cellId*B3_MAX_NUM_BATCHES+ii);
+				if (!numInBatch)
+					break;
+				{
+					b3LauncherCL launcher( m_data->m_queue, m_data->m_solveSingleContactKernel,"m_solveSingleContactKernel" );
+					launcher.setBuffer(bodyBuf->getBufferCL() );
+					launcher.setBuffer(shapeBuf->getBufferCL() );
+					launcher.setBuffer(	constraint->getBufferCL() );
+					launcher.setConst(cellId);
+					launcher.setConst(offset);
+					launcher.setConst(numInBatch);
+					launcher.launch1D(numInBatch);
+					offset+=numInBatch;
+				}
+			}
+		}
+	}
+	for(int iter=0; iter<numIterations; iter++)
+	{
+		for (int cellId=0;cellId<numBatches;cellId++)
+		{
+			int offset = 0;
+			for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++)
+			{
+				int numInBatch = batchSizes->at(cellId*B3_MAX_NUM_BATCHES+ii);
+				if (!numInBatch)
+					break;
+				{
+					b3LauncherCL launcher( m_data->m_queue, m_data->m_solveSingleFrictionKernel,"m_solveSingleFrictionKernel" );
+					launcher.setBuffer(bodyBuf->getBufferCL() );
+					launcher.setBuffer(shapeBuf->getBufferCL() );
+					launcher.setBuffer(	constraint->getBufferCL() );
+					launcher.setConst(cellId);
+					launcher.setConst(offset);
+					launcher.setConst(numInBatch);
+					launcher.launch1D(numInBatch);
+					offset+=numInBatch;
+				}
+			}
+		}
+	}
+// Batched contact solve: for every iteration and every batch id in [0, B3_SOLVER_N_BATCHES)
+// launches m_solveContactKernel, then repeats the same loops with m_solveFrictionKernel.
+// The cell counts, offsets and batch sizes passed to the kernels come from m_data->m_solverGPU;
+// the additionalData and batchSizes parameters are not used, and n only seeds cdata.x,
+// which is overwritten before use.
+// NOTE(review): the DEBUG_ME sections and the "m_batchSolveKernel.bin" replay section refer to
+// names that are not declared in this function (data, m_context, m_queue); presumably they are
+// compiled out - confirm before enabling them.
+void b3GpuPgsContactSolver::solveContactConstraint(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, 
+			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,int numIterations, const b3AlignedObjectArray<int>* batchSizes)//,const b3OpenCLArray<int>* gpuBatchSizes)
+	//sort the contacts
+	b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 );
+	{
+		const int nn = B3_SOLVER_N_CELLS;
+		cdata.x = 0;
+		cdata.y = maxNumBatches;//250;
+		int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
+#ifdef DEBUG_ME
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+		{
+			B3_PROFILE("m_batchSolveKernel iterations");
+			for(int iter=0; iter<numIterations; iter++)
+			{
+				for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
+				{
+#ifdef DEBUG_ME
+					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+					gpuDebugInfo.write(debugInfo,numWorkItems);
+					cdata.z = ib;
+				b3LauncherCL launcher( m_data->m_queue, m_data->m_solveContactKernel,"m_solveContactKernel" );
+#if 1
+					b3BufferInfoCL bInfo[] = { 
+						b3BufferInfoCL( bodyBuf->getBufferCL() ), 
+						b3BufferInfoCL( shapeBuf->getBufferCL() ), 
+						b3BufferInfoCL( constraint->getBufferCL() ),
+						b3BufferInfoCL( m_data->m_solverGPU->m_numConstraints->getBufferCL() ), 
+						b3BufferInfoCL( m_data->m_solverGPU->m_offsets->getBufferCL() ) 
+#ifdef DEBUG_ME
+						,	b3BufferInfoCL(&gpuDebugInfo)
+						};
+                    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL());
+					//launcher.setConst(  cdata.x );
+                    launcher.setConst(  cdata.y );
+                    launcher.setConst(  cdata.z );
+					b3Int4 nSplit;
+					nSplit.x = B3_SOLVER_N_SPLIT_X;
+					nSplit.y = B3_SOLVER_N_SPLIT_Y;
+					nSplit.z = B3_SOLVER_N_SPLIT_Z;
+                    launcher.setConst(  nSplit );
+                    launcher.launch1D( numWorkItems, 64 );
+                    const char* fileName = "m_batchSolveKernel.bin";
+                    FILE* f = fopen(fileName,"rb");
+                    if (f)
+                    {
+                        int sizeInBytes=0;
+                        if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
+                        {
+                            printf("error, cannot get file size\n");
+                            exit(0);
+                        }
+                        unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
+                        fread(buf,sizeInBytes,1,f);
+                        int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
+                        int num = *(int*)&buf[serializedBytes];
+                        launcher.launch1D( num);
+                        //this clFinish is for testing on errors
+                        clFinish(m_queue);
+                    }
+#ifdef DEBUG_ME
+					clFinish(m_queue);
+					gpuDebugInfo.read(debugInfo,numWorkItems);
+					clFinish(m_queue);
+					for (int i=0;i<numWorkItems;i++)
+					{
+						// Two arguments need two conversions: print the work-item index and the value.
+						if (debugInfo[i].m_valInt2>0)
+						{
+							printf("debugInfo[%d].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
+						}
+						if (debugInfo[i].m_valInt3>0)
+						{
+							printf("debugInfo[%d].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
+						}
+					}
+#endif //DEBUG_ME
+				}
+			}
+			clFinish(m_data->m_queue);
+		}
+		cdata.x = 1;
+		bool applyFriction=true;
+		if (applyFriction)
+    	{
+			B3_PROFILE("m_batchSolveKernel iterations2");
+			for(int iter=0; iter<numIterations; iter++)
+			{
+				for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
+				{
+					cdata.z = ib;
+					b3BufferInfoCL bInfo[] = { 
+						b3BufferInfoCL( bodyBuf->getBufferCL() ), 
+						b3BufferInfoCL( shapeBuf->getBufferCL() ), 
+						b3BufferInfoCL( constraint->getBufferCL() ),
+						b3BufferInfoCL( m_data->m_solverGPU->m_numConstraints->getBufferCL() ), 
+						b3BufferInfoCL( m_data->m_solverGPU->m_offsets->getBufferCL() )
+#ifdef DEBUG_ME
+						,b3BufferInfoCL(&gpuDebugInfo)
+#endif //DEBUG_ME
+					};
+					b3LauncherCL launcher( m_data->m_queue, m_data->m_solveFrictionKernel,"m_solveFrictionKernel" );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL());
+					//launcher.setConst(  cdata.x );
+                    launcher.setConst(  cdata.y );
+                    launcher.setConst(  cdata.z );
+                    b3Int4 nSplit;
+					nSplit.x = B3_SOLVER_N_SPLIT_X;
+					nSplit.y = B3_SOLVER_N_SPLIT_Y;
+					nSplit.z = B3_SOLVER_N_SPLIT_Z;
+                    launcher.setConst(  nSplit );
+					launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 );
+				}
+			}
+			clFinish(m_data->m_queue);
+		}
+#ifdef DEBUG_ME
+		delete[] debugInfo;
+#endif //DEBUG_ME
+	}
+// Ordering predicate for sort records: true when a's key is strictly smaller than b's.
+static bool sortfnc(const b3SortData& a,const b3SortData& b)
+	const bool keyIsSmaller = a.m_key<b.m_key;
+	return keyIsSmaller;
+// Strict "less than" for contacts, lexicographic on
+// (m_bodyAPtrAndSignBit, m_bodyBPtrAndSignBit, m_childIndexA, m_childIndexB):
+// the first field that differs decides, and equal contacts compare false.
+static bool b3ContactCmp(const b3Contact4& p, const b3Contact4& q)
+	if (p.m_bodyAPtrAndSignBit!=q.m_bodyAPtrAndSignBit)
+		return p.m_bodyAPtrAndSignBit<q.m_bodyAPtrAndSignBit;
+	if (p.m_bodyBPtrAndSignBit!=q.m_bodyBPtrAndSignBit)
+		return p.m_bodyBPtrAndSignBit<q.m_bodyBPtrAndSignBit;
+	if (p.m_childIndexA!=q.m_childIndexA)
+		return p.m_childIndexA<q.m_childIndexA;
+	return p.m_childIndexB<q.m_childIndexB;
+static const int gridTable4x4[] = // sort-key lookup read by SetSortDataCPU at index aa + bb*4, where aa and bb are the low two bits of the two body indices
+    0,1,17,16,
+	1,2,18,19,
+	17,18,32,3,
+	16,19,3,34 // the table is symmetric, so swapping aa and bb gives the same key
+static const int gridTable8x8[] = // 8x8 counterpart, read by SetSortDataCPU at index aa + bb*8 (low three bits of the body indices)
+	  0,  2,  3, 16, 17, 18, 19,  1,
+	 66, 64, 80, 67, 82, 81, 65, 83,
+	131,144,128,130,147,129,145,146,
+	208,195,194,192,193,211,210,209,
+	 21, 22, 23,  5,  4,  6,  7, 20,
+	 86, 85, 69, 87, 70, 68, 84, 71,
+	151,133,149,150,135,148,132,134,
+	197,27,214,213,212,199,198,196 // NOTE(review): 27 breaks this row's pattern (the other entries are 196-199 and 212-214); 215 may have been intended - confirm
+#define USE_4x4_GRID 1 // non-zero selects gridTable4x4 in SetSortDataCPU; the 8x8 table is only read when this is 0
+void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* gSortDataOut, int nContacts,float scale,const b3Int4& nSplit,int staticIdx) // Host-side fill of gSortDataOut[0..nContacts): .x receives a sort key derived from the contact's two bodies, .y the contact's original index.
+	for (int gIdx=0;gIdx<nContacts;gIdx++)
+	{
+		if( gIdx < nContacts ) // always true here: the loop condition already guarantees it, so the else branch below cannot run
+		{
+			int aPtrAndSignBit  = gContact[gIdx].m_bodyAPtrAndSignBit;
+			int bPtrAndSignBit  = gContact[gIdx].m_bodyBPtrAndSignBit;
+			int aIdx = abs(aPtrAndSignBit );
+			int bIdx = abs(bPtrAndSignBit);
+			bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx); // a body counts as static when its value is negative or equals staticIdx
+			bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);
+			int idx = (aStatic)? bIdx: aIdx; // position-based key: use body B's position when A is static, otherwise A's
+			b3Vector3 p = gBodies[idx].m_pos;
+			int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1); // the & (nSplit-1) wrap is only a modulo if nSplit components are powers of two - TODO confirm
+			int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);
+			int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);
+			int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y); // NOTE(review): newIndex is declared again in the grid-table section below, in what appears here to be the same scope; presumably a preprocessor conditional separating the two variants is not visible in this chunk - confirm against the full source
+		#if USE_4x4_GRID
+			int aa = aIdx&3;
+			int bb = bIdx&3;
+			if (aStatic)
+				aa = bb; // a static body borrows the other body's value
+			if (bStatic)
+				bb = aa;
+			int gridIndex = aa + bb*4;
+			int newIndex = gridTable4x4[gridIndex];
+		#else//USE_4x4_GRID
+			int aa = aIdx&7;
+			int bb = bIdx&7;
+			if (aStatic)
+				aa = bb;
+			if (bStatic)
+				bb = aa;
+			int gridIndex = aa + bb*8;
+			int newIndex = gridTable8x8[gridIndex];
+		#endif//USE_4x4_GRID
+			gSortDataOut[gIdx].x = newIndex; // presumably .x/.y alias the m_key/m_value members that sortfnc reads - verify in b3SortData
+			gSortDataOut[gIdx].y = gIdx;
+		}
+		else
+		{
+			gSortDataOut[gIdx].x = 0xffffffff;
+		}
+	}
+void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index) // Wraps the given OpenCL body/inertia/contact buffers, optionally sorts the contacts for determinism, batches them, converts them to constraints and runs the contact solver. NOTE(review): `config` is not referenced anywhere in the visible body.
+	B3_PROFILE("solveContacts");
+	m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf,numBodies);
+	m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf,numBodies);
+	m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf,numContacts);
+	if (optionalSortContactsDeterminism) // flag defined outside this chunk
+	{
+		if (!gCpuSortContactsDeterminism)
+		{
+			B3_PROFILE("GPU Sort contact constraints (determinism)");
+			m_data->m_pBufContactOutGPUCopy->resize(numContacts);
+			m_data->m_contactKeyValues->resize(numContacts);
+			m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(),numContacts,0,0);
+			{
+				b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel,"m_setDeterminismSortDataChildShapeBKernel");
+				launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
+				launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
+				launcher.setConst(numContacts);
+				launcher.launch1D( numContacts, 64 );
+			}
+			m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); // four key-fill + sort passes follow, in the order childShapeB, childShapeA, bodyB, bodyA; presumably this relies on m_sort32 being a stable sort - TODO confirm
+			{
+				b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel,"m_setDeterminismSortDataChildShapeAKernel");
+				launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
+				launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
+				launcher.setConst(numContacts);
+				launcher.launch1D( numContacts, 64 );
+			}
+			m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
+			{
+				b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel,"m_setDeterminismSortDataBodyBKernel");
+				launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
+				launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
+				launcher.setConst(numContacts);
+				launcher.launch1D( numContacts, 64 );
+			}
+			m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
+			{
+				b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel,"m_setDeterminismSortDataBodyAKernel");
+				launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
+				launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
+				launcher.setConst(numContacts);
+				launcher.launch1D( numContacts, 64 );
+			}
+			m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
+			{
+				B3_PROFILE("gpu reorderContactKernel (determinism)");
+				b3Int4 cdata;
+				cdata.x = numContacts;
+				//b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
+				//	, b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
+				b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel,"m_reorderContactKernel");
+				launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
+				launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL());
+				launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
+				launcher.setConst( cdata );
+				launcher.launch1D( numContacts, 64 );
+            }
+		} else
+		{
+			B3_PROFILE("CPU Sort contact constraints (determinism)");
+			b3AlignedObjectArray<b3Contact4> cpuConstraints;
+			m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints);
+			bool sort = true;
+			if (sort)
+			{
+				cpuConstraints.quickSort(b3ContactCmp);
+				for (int i=0;i<cpuConstraints.size();i++)
+				{
+					cpuConstraints[i].m_batchIdx = i; // overwritten with the contact's position after sorting
+				}
+			}
+			m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints);
+			if (m_debugOutput==100) // dump is printed only on the call where the counter (incremented below on this path) equals 100
+			{
+				for (int i=0;i<cpuConstraints.size();i++)
+				{
+					printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n",i,cpuConstraints[i].m_bodyAPtrAndSignBit,cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx);
+				}
+			}
+			m_debugOutput++;
+		}
+	}
+	int nContactOut = m_data->m_pBufContactOutGPU->size();
+	bool useSolver = true;
+    if (useSolver)
+    {
+        float dt=1./60.; // time step is hard-coded here rather than taken from a parameter
+        b3ConstraintCfg csCfg( dt );
+        csCfg.m_enableParallelSolve = true;
+        csCfg.m_batchCellSize = 6;
+        csCfg.m_staticIdx = static0Index;
+        b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU; // shadows the cl_mem parameter of the same name from here on
+        void* additionalData = 0;//m_data->m_frictionCGPU;
+        const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU;
+        b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU;
+        int nContacts = nContactOut;
+		int maxNumBatches = 0;
+		if (!gUseLargeBatches)
+        {
+            if( m_data->m_solverGPU->m_contactBuffer2)
+            {
+                m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
+            }
+            if( m_data->m_solverGPU->m_contactBuffer2 == 0 ) // first use: create the secondary contact buffer
+            {
+				m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context,m_data->m_queue, nContacts );
+                m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
+            }
+            //clFinish(m_data->m_queue);
+			{
+				B3_PROFILE("batching");
+				//@todo: just reserve it, without copy of original contact (unless we use warmstarting)
+				const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf;
+				{
+					//b3OpenCLArray<b3RigidBodyData>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
+					//b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );
+					const int sortAlignment = 512; // todo. get this out of sort
+					if( csCfg.m_enableParallelSolve )
+					{
+						int sortSize = B3NEXTMULTIPLEOF( nContacts, sortAlignment );
+						b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
+						b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
+						if (!gCpuSetSortData)
+						{	//	2. set cell idx
+							B3_PROFILE("GPU set cell idx");
+							struct CB
+							{
+								int m_nContacts;
+								int m_staticIdx;
+								float m_scale;
+								b3Int4 m_nSplit;
+							};
+							b3Assert( sortSize%64 == 0 );
+							CB cdata;
+							cdata.m_nContacts = nContacts;
+							cdata.m_staticIdx = csCfg.m_staticIdx;
+							cdata.m_scale = 1.f/csCfg.m_batchCellSize;
+							cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X;
+							cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y;
+							cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z;
+							m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
+							b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL()), b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
+							b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel,"m_setSortDataKernel" );
+							launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+							launcher.setConst( cdata.m_nContacts );
+							launcher.setConst( cdata.m_scale );
+							launcher.setConst(cdata.m_nSplit);
+							launcher.setConst(cdata.m_staticIdx);
+							launcher.launch1D( sortSize, 64 );
+						} else
+						{
+							m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
+							b3AlignedObjectArray<b3SortData> sortDataCPU;
+							m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU);
+							b3AlignedObjectArray<b3Contact4> contactCPU;
+							m_data->m_pBufContactOutGPU->copyToHost(contactCPU);
+							b3AlignedObjectArray<b3RigidBodyData> bodiesCPU;
+							bodyBuf->copyToHost(bodiesCPU);
+							float scale = 1.f/csCfg.m_batchCellSize;
+							b3Int4 nSplit;
+							nSplit.x = B3_SOLVER_N_SPLIT_X;
+							nSplit.y = B3_SOLVER_N_SPLIT_Y;
+							nSplit.z = B3_SOLVER_N_SPLIT_Z;
+							SetSortDataCPU(&contactCPU[0],  &bodiesCPU[0], &sortDataCPU[0], nContacts,scale,nSplit,csCfg.m_staticIdx);
+							m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU);
+						}
+						if (!gCpuRadixSort)
+						{	//	3. sort by cell idx
+							B3_PROFILE("gpuRadixSort");
+							//int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
+							//int sortBit = 32;
+							//if( n <= 0xffff ) sortBit = 16;
+							//if( n <= 0xff ) sortBit = 8;
+							//adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
+							//adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
+							b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
+							this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut);
+						} else
+						{
+							b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
+							b3AlignedObjectArray<b3SortData> hostValues;
+							keyValuesInOut.copyToHost(hostValues);
+							hostValues.quickSort(sortfnc);
+							keyValuesInOut.copyFromHost(hostValues);
+						}
+						if (gUseScanHost)
+						{
+							//	4. find entries
+							B3_PROFILE("cpuBoundSearch");
+							b3AlignedObjectArray<unsigned int> countsHost;
+							countsNative->copyToHost(countsHost);
+							b3AlignedObjectArray<b3SortData> sortDataHost;
+							m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
+							//m_data->m_solverGPU->m_search->executeHost(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
+							m_data->m_solverGPU->m_search->executeHost(sortDataHost,nContacts,countsHost,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
+							countsNative->copyFromHost(countsHost);
+							//adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
+							//	B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );
+							//unsigned int sum;
+							//m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
+							b3AlignedObjectArray<unsigned int> offsetsHost;
+							offsetsHost.resize(offsetsNative->size());
+							m_data->m_solverGPU->m_scan->executeHost(countsHost,offsetsHost, B3_SOLVER_N_CELLS);//,&sum );
+							offsetsNative->copyFromHost(offsetsHost);
+							//printf("sum = %d\n",sum);
+						}  else
+						{
+							//	4. find entries
+							B3_PROFILE("gpuBoundSearch");
+							m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
+							m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
+						} 
+						if (nContacts)
+						{	//	5. sort constraints by cellIdx
+							if (gReorderContactsOnCpu)
+							{
+								B3_PROFILE("cpu m_reorderContactKernel");
+								b3AlignedObjectArray<b3SortData> sortDataHost;
+								m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
+								b3AlignedObjectArray<b3Contact4> inContacts;
+								b3AlignedObjectArray<b3Contact4> outContacts;
+								m_data->m_pBufContactOutGPU->copyToHost(inContacts);
+								outContacts.resize(inContacts.size());
+								for (int i=0;i<nContacts;i++)
+								{
+									int srcIdx = sortDataHost[i].y; // .y holds the contact's index before sorting
+									outContacts[i] = inContacts[srcIdx];
+								}
+								m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts);
+								/*								"void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
+								"{\n"
+								"	int nContacts = cb.x;\n"
+								"	int gIdx = GET_GLOBAL_IDX;\n"
+								"	if( gIdx < nContacts )\n"
+								"	{\n"
+								"		int srcIdx = sortData[gIdx].y;\n"
+								"		out[gIdx] = in[srcIdx];\n"
+								"	}\n"
+								"}\n"
+								*/
+							} else
+							{
+								B3_PROFILE("gpu m_reorderContactKernel");
+								b3Int4 cdata;
+								cdata.x = nContacts;
+								b3BufferInfoCL bInfo[] = { 
+									b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), 
+									b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
+									, b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
+									b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel,"m_reorderContactKernel");
+									launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+									launcher.setConst( cdata );
+									launcher.launch1D( nContacts, 64 );
+							}
+						}
+					}
+				}
+				//clFinish(m_data->m_queue);
+				//				{
+				//				b3AlignedObjectArray<unsigned int> histogram;
+				//				m_data->m_solverGPU->m_numConstraints->copyToHost(histogram);
+				//				printf(",,,\n");
+				//				}
+				if (nContacts)
+				{
+					if (gUseCpuCopyConstraints)
+					{
+						for (int i=0;i<nContacts;i++) // NOTE(review): the loop body never uses i, so the same whole-array copy is issued nContacts times; a single call looks sufficient - confirm
+						{
+							m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2);
+							//							m_data->m_solverGPU->m_contactBuffer2->getBufferCL(); 
+							//						m_data->m_pBufContactOutGPU->getBufferCL() 
+						}
+					} else
+					{
+						B3_PROFILE("gpu m_copyConstraintKernel");
+						b3Int4 cdata; cdata.x = nContacts;
+						b3BufferInfoCL bInfo[] = { 
+							b3BufferInfoCL(  m_data->m_solverGPU->m_contactBuffer2->getBufferCL() ), 
+							b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ) 
+						};
+						b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel,"m_copyConstraintKernel" );
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+						launcher.setConst(  cdata );
+						launcher.launch1D( nContacts, 64 );
+						//we use the clFinish for proper benchmark/profile
+						clFinish(m_data->m_queue);
+					}
+				}
+				bool compareGPU = false;
+				if (nContacts)
+				{
+					if (!gCpuBatchContacts)
+					{
+						B3_PROFILE("gpu batchContacts");
+						maxNumBatches = 250;//250;
+						m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx );
+						clFinish(m_data->m_queue);
+					} else
+					{
+						B3_PROFILE("cpu batchContacts");
+						static b3AlignedObjectArray<b3Contact4> cpuContacts;
+						b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
+						{
+							B3_PROFILE("copyToHost");
+							contactsIn->copyToHost(cpuContacts);
+						}
+						b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
+						b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
+						b3AlignedObjectArray<unsigned int> nNativeHost;
+						b3AlignedObjectArray<unsigned int> offsetsNativeHost;
+						{
+							B3_PROFILE("countsNative/offsetsNative copyToHost");
+							countsNative->copyToHost(nNativeHost);
+							offsetsNative->copyToHost(offsetsNativeHost);
+						}
+						int numNonzeroGrid=0;
+						if (gUseLargeBatches) // NOTE(review): this sits inside the `if (!gUseLargeBatches)` block above, so it can only be taken if the flag changes in between - looks unreachable
+						{
+							m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
+							int totalNumConstraints = cpuContacts.size();
+							int simdWidth =numBodies+1;//-1;//64;//-1;//32;
+							int numBatches = sortConstraintByBatch3( &cpuContacts[0], totalNumConstraints, totalNumConstraints+1,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[0]);	//	on GPU
+							maxNumBatches = b3Max(numBatches,maxNumBatches);
+							static int globalMaxBatch = 0;
+							if (maxNumBatches>globalMaxBatch )
+							{
+								globalMaxBatch  = maxNumBatches;
+								b3Printf("maxNumBatches = %d\n",maxNumBatches);
+							}
+						} else
+						{
+							m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS*B3_MAX_NUM_BATCHES);
+							B3_PROFILE("cpu batch grid");
+							for(int i=0; i<B3_SOLVER_N_CELLS; i++)
+							{
+								int n = (nNativeHost)[i];
+								int offset = (offsetsNativeHost)[i];
+								if( n )
+								{
+									numNonzeroGrid++;
+									int simdWidth =numBodies+1;//-1;//64;//-1;//32;
+									int numBatches = sortConstraintByBatch3( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[i*B3_MAX_NUM_BATCHES]);	//	on GPU
+									maxNumBatches = b3Max(numBatches,maxNumBatches);
+									static int globalMaxBatch = 0;
+									if (maxNumBatches>globalMaxBatch )
+									{
+										globalMaxBatch  = maxNumBatches;
+										b3Printf("maxNumBatches = %d\n",maxNumBatches);
+									}
+									//we use the clFinish for proper benchmark/profile
+								}
+							}
+							//clFinish(m_data->m_queue);
+						}
+						{
+							B3_PROFILE("m_contactBuffer->copyFromHost");
+							m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
+						}
+					} 
+				}
+			} 
+		} 
+			//printf("maxNumBatches = %d\n", maxNumBatches);
+		if (gUseLargeBatches)
+		{
+			if (nContacts)
+			{
+				B3_PROFILE("cpu batchContacts");
+				static b3AlignedObjectArray<b3Contact4> cpuContacts;
+//				b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
+				{
+					B3_PROFILE("copyToHost");
+					m_data->m_pBufContactOutGPU->copyToHost(cpuContacts);
+				}
+				b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
+				b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
+				int numNonzeroGrid=0;
+				{
+					m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
+					int totalNumConstraints = cpuContacts.size();
+					int simdWidth =numBodies+1;//-1;//64;//-1;//32;
+					int numBatches = sortConstraintByBatch3( &cpuContacts[0], totalNumConstraints, totalNumConstraints+1,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[0]);	//	on GPU
+					maxNumBatches = b3Max(numBatches,maxNumBatches);
+					static int globalMaxBatch = 0;
+					if (maxNumBatches>globalMaxBatch )
+					{
+						globalMaxBatch  = maxNumBatches;
+						b3Printf("maxNumBatches = %d\n",maxNumBatches);
+					}
+				}
+				{
+					B3_PROFILE("m_contactBuffer->copyFromHost");
+					m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts); // NOTE(review): within this function m_contactBuffer2 is only created/resized in the !gUseLargeBatches branch; presumably it is allocated elsewhere before this path runs - verify
+				}
+			} 
+		}
+		if (nContacts)
+		{
+			B3_PROFILE("gpu convertToConstraints");
+			m_data->m_solverGPU->convertToConstraints( bodyBuf, 
+				shapeBuf, m_data->m_solverGPU->m_contactBuffer2,
+				contactConstraintOut, 
+				additionalData, nContacts, 
+				(b3SolverBase::ConstraintCfg&) csCfg );
+			clFinish(m_data->m_queue);
+		}
+		if (1)
+		{
+			int numIter = 4;
+			m_data->m_solverGPU->m_nIterations = numIter;//10
+			if (!gCpuSolveConstraint)
+			{
+				B3_PROFILE("GPU solveContactConstraint");
+				/*m_data->m_solverGPU->solveContactConstraint(
+				m_data->m_bodyBufferGPU, 
+				m_data->m_inertiaBufferGPU,
+				m_data->m_contactCGPU,0,
+				nContactOut ,
+				maxNumBatches);
+				*/
+				//m_data->m_batchSizesGpu->copyFromHost(m_data->m_batchSizes);
+				if (gUseLargeBatches)
+				{
+					solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU, 
+						m_data->m_inertiaBufferGPU,
+						m_data->m_contactCGPU,0,
+						nContactOut ,
+						maxNumBatches,numIter,&m_data->m_batchSizes);
+				} else
+				{
+					solveContactConstraint(
+						m_data->m_bodyBufferGPU, 
+						m_data->m_inertiaBufferGPU,
+						m_data->m_contactCGPU,0,
+						nContactOut ,
+						maxNumBatches,numIter,&m_data->m_batchSizes);//m_data->m_batchSizesGpu);
+				}
+			}
+			else
+			{
+				B3_PROFILE("Host solveContactConstraint");
+				m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut ,maxNumBatches,&m_data->m_batchSizes);
+			}
+        }
+#if 0 // disabled read-back code; the matching #endif is not visible in this chunk
+        if (0)
+        {
+            B3_PROFILE("read body velocities back to CPU");
+            //read body updated linear/angular velocities back to CPU
+            m_data->m_bodyBufferGPU->read(
+                                                  m_data->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
+            adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL );
+        }
+    }
+void b3GpuPgsContactSolver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx ) // NOTE(review): no body statements are visible for this member in this chunk - presumably an empty stub; confirm against the full source
+b3AlignedObjectArray<unsigned int> idxBuffer; // file-scope scratch: constraint indices still waiting for a batch in sortConstraintByBatch
+b3AlignedObjectArray<b3SortData> sortData; // file-scope scratch: (batch index, constraint index) pairs sorted by sortConstraintByBatch
+b3AlignedObjectArray<b3Contact4> old; // file-scope scratch: copy of the constraints taken before the in-place reorder in sortConstraintByBatch
+inline int b3GpuPgsContactSolver::sortConstraintByBatch( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies) // Assigns a batch index to each of the n constraints in cs, then reorders cs by batch index; returns the number of passes (batchIdx). numBodies is not used in the visible body.
+	B3_PROFILE("sortConstraintByBatch");
+	int numIter = 0;
+	sortData.resize(n);
+	idxBuffer.resize(n);
+	old.resize(n);
+	unsigned int* idxSrc = &idxBuffer[0];
+	unsigned int* idxDst = &idxBuffer[0]; // same storage as idxSrc: rejected indices are compacted in place (the write position nIdxDst never passes the read position i)
+	int nIdxSrc, nIdxDst;
+	const int N_FLG = 256;
+	const int FLG_MASK = N_FLG-1;
+	unsigned int flg[N_FLG/32]; // 256-bit "body in use" set, indexed by body index & FLG_MASK, so bodies sharing the low 8 bits share a flag
+#if defined(_DEBUG)
+	for(int i=0; i<n; i++)
+		cs[i].getBatchIdx() = -1;
+	for(int i=0; i<n; i++) 
+		idxSrc[i] = i;
+	nIdxSrc = n;
+	int batchIdx = 0;
+	{
+		B3_PROFILE("cpu batch innerloop");
+		while( nIdxSrc ) // one pass per batch index, until every constraint has been placed
+		{
+			numIter++;
+			nIdxDst = 0;
+			int nCurrentBatch = 0;
+			//	clear flag
+			for(int i=0; i<N_FLG/32; i++) flg[i] = 0;
+			for(int i=0; i<nIdxSrc; i++)
+			{
+				int idx = idxSrc[i];
+				b3Assert( idx < n );
+				//	check if it can go
+				int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
+				int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
+				int bodyA = abs(bodyAS);
+				int bodyB = abs(bodyBS);
+				int aIdx = bodyA & FLG_MASK;
+				int bIdx = bodyB & FLG_MASK;
+				unsigned int aUnavailable = flg[ aIdx/32 ] & (1<<(aIdx&31));
+				unsigned int bUnavailable = flg[ bIdx/32 ] & (1<<(bIdx&31));
+				bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;
+				bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;
+                //use inv_mass!
+				aUnavailable = !aIsStatic? aUnavailable:0;//
+				bUnavailable = !bIsStatic? bUnavailable:0; // static bodies never block a constraint
+				if( aUnavailable==0 && bUnavailable==0 ) // ok
+				{
+					if (!aIsStatic)
+						flg[ aIdx/32 ] |= (1<<(aIdx&31));
+					if (!bIsStatic)
+						flg[ bIdx/32 ] |= (1<<(bIdx&31));
+					cs[idx].getBatchIdx() = batchIdx;
+					sortData[idx].m_key = batchIdx;
+					sortData[idx].m_value = idx;
+					{
+						nCurrentBatch++;
+						if( nCurrentBatch == simdWidth ) // after simdWidth accepted constraints the flags are reset while batchIdx stays the same
+						{
+							nCurrentBatch = 0;
+							for(int i=0; i<N_FLG/32; i++) flg[i] = 0;
+						}
+					}
+				}
+				else
+				{
+					idxDst[nIdxDst++] = idx; // keep for a later pass
+				}
+			}
+			b3Swap( idxSrc, idxDst );
+			b3Swap( nIdxSrc, nIdxDst );
+			batchIdx ++;
+		}
+	}
+	{
+		B3_PROFILE("quickSort");
+		sortData.quickSort(sortfnc);
+	}
+	{
+        B3_PROFILE("reorder");
+		//	reorder
+		memcpy( &old[0], cs, sizeof(b3Contact4)*n);
+		for(int i=0; i<n; i++)
+		{
+			int idx = sortData[i].m_value;
+			cs[i] = old[idx];
+		}
+	}
+#if defined(_DEBUG)
+    //		debugPrintf( "nBatches: %d\n", batchIdx );
+	for(int i=0; i<n; i++)
+    {
+        b3Assert( cs[i].getBatchIdx() != -1 );
+    }
+	return batchIdx;
+b3AlignedObjectArray<int> bodyUsed2; // file-scope scratch: list of body indices already taken in the current pass of sortConstraintByBatch2
+inline int b3GpuPgsContactSolver::sortConstraintByBatch2( b3Contact4* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies) // Variant of sortConstraintByBatch that tracks used bodies as a list (bodyUsed2) and moves accepted constraint indices to the front of idxSrc; returns the number of passes. numBodies is not used in the visible body.
+	B3_PROFILE("sortConstraintByBatch2");
+	bodyUsed2.resize(2*simdWidth); // each accepted constraint appends at most two bodies, and the list is reset after simdWidth acceptances
+	for (int q=0;q<2*simdWidth;q++)
+		bodyUsed2[q]=0;
+	int curBodyUsed = 0;
+	int numIter = 0;
+	m_data->m_sortData.resize(numConstraints);
+	m_data->m_idxBuffer.resize(numConstraints);
+	m_data->m_old.resize(numConstraints);
+	unsigned int* idxSrc = &m_data->m_idxBuffer[0];
+#if defined(_DEBUG)
+	for(int i=0; i<numConstraints; i++)
+		cs[i].getBatchIdx() = -1;
+	for(int i=0; i<numConstraints; i++) 
+		idxSrc[i] = i;
+	int numValidConstraints = 0; // idxSrc[0..numValidConstraints) holds the constraints already placed, in placement order
+	int unprocessedConstraintIndex = 0;
+	int batchIdx = 0;
+	{
+		B3_PROFILE("cpu batch innerloop");
+		while( numValidConstraints < numConstraints)
+		{
+			numIter++;
+			int nCurrentBatch = 0;
+			//	clear flag
+			for(int i=0; i<curBodyUsed; i++) 
+				bodyUsed2[i] = 0;
+            curBodyUsed = 0;
+			for(int i=numValidConstraints; i<numConstraints; i++)
+			{
+				int idx = idxSrc[i];
+				b3Assert( idx < numConstraints );
+				//	check if it can go
+				int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
+				int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
+				int bodyA = abs(bodyAS);
+				int bodyB = abs(bodyBS);
+				bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;
+				bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;
+				int aUnavailable = 0;
+				int bUnavailable = 0;
+				if (!aIsStatic)
+				{
+					for (int j=0;j<curBodyUsed;j++) // linear scan of the bodies taken so far in this pass
+					{
+						if (bodyA == bodyUsed2[j])
+						{
+							aUnavailable=1;
+							break;
+						}
+					}
+				}
+				if (!aUnavailable) // body B is only checked when body A did not already block the constraint
+				if (!bIsStatic)
+				{
+					for (int j=0;j<curBodyUsed;j++)
+					{
+						if (bodyB == bodyUsed2[j])
+						{
+							bUnavailable=1;
+							break;
+						}
+					}
+				}
+				if( aUnavailable==0 && bUnavailable==0 ) // ok
+				{
+					if (!aIsStatic)
+					{
+						bodyUsed2[curBodyUsed++] = bodyA;
+					}
+					if (!bIsStatic)
+					{
+						bodyUsed2[curBodyUsed++] = bodyB;
+					}
+					cs[idx].getBatchIdx() = batchIdx;
+					m_data->m_sortData[idx].m_key = batchIdx;
+					m_data->m_sortData[idx].m_value = idx;
+					if (i!=numValidConstraints)
+					{
+						b3Swap(idxSrc[i], idxSrc[numValidConstraints]); // move the accepted index into the placed prefix
+					}
+					numValidConstraints++;
+					{
+						nCurrentBatch++;
+						if( nCurrentBatch == simdWidth )
+						{
+							nCurrentBatch = 0;
+							for(int i=0; i<curBodyUsed; i++) 
+								bodyUsed2[i] = 0;
+							curBodyUsed = 0;
+						}
+					}
+				}
+			}
+			batchIdx ++;
+		}
+	}
+	{
+		B3_PROFILE("quickSort");
+		//m_data->m_sortData.quickSort(sortfnc);
+	}
+	{
+        B3_PROFILE("reorder");
+		//	reorder
+		memcpy( &m_data->m_old[0], cs, sizeof(b3Contact4)*numConstraints);
+		for(int i=0; i<numConstraints; i++) // the sort above is disabled; the final order comes from idxSrc, which already lists constraints in placement order
+		{
+			b3Assert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]);
+			int idx = m_data->m_sortData[idxSrc[i]].m_value;
+			cs[i] = m_data->m_old[idx];
+		}
+	}
+#if defined(_DEBUG)
+    //		debugPrintf( "nBatches: %d\n", batchIdx );
+	for(int i=0; i<numConstraints; i++)
+    {
+        b3Assert( cs[i].getBatchIdx() != -1 );
+    }
+	return batchIdx;
+b3AlignedObjectArray<int> bodyUsed; // file-scope scratch: one bit per body index, set while that body is taken in the current pass of sortConstraintByBatch3
+b3AlignedObjectArray<int> curUsed; // file-scope scratch: body indices whose bits are currently set in bodyUsed, so only those words need clearing
+inline int b3GpuPgsContactSolver::sortConstraintByBatch3( b3Contact4* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies, int* batchSizes) // Groups cs[0..numConstraints) into batches in place by swapping; writes the size of batch k to batchSizes[k], writes a 0 after the last batch, and returns the number of batches.
+	B3_PROFILE("sortConstraintByBatch3");
+	static int maxSwaps = 0;
+	int numSwaps = 0;
+	curUsed.resize(2*simdWidth); // each accepted constraint records at most two bodies, and the list is reset after simdWidth acceptances
+	static int maxNumConstraints = 0;
+	if (maxNumConstraints<numConstraints)
+	{
+		maxNumConstraints = numConstraints;
+		//printf("maxNumConstraints  = %d\n",maxNumConstraints );
+	}
+	int numUsedArray = numBodies/32+1; // number of 32-bit words needed for one bit per body index in [0, numBodies]
+	bodyUsed.resize(numUsedArray);
+	for (int q=0;q<numUsedArray;q++)
+		bodyUsed[q]=0;
+	int curBodyUsed = 0;
+	int numIter = 0;
+	m_data->m_sortData.resize(0);
+	m_data->m_idxBuffer.resize(0);
+	m_data->m_old.resize(0);
+#if defined(_DEBUG)
+	for(int i=0; i<numConstraints; i++)
+		cs[i].getBatchIdx() = -1;
+	int numValidConstraints = 0; // cs[0..numValidConstraints) are the constraints already placed
+	int unprocessedConstraintIndex = 0;
+	int batchIdx = 0;
+	{
+		B3_PROFILE("cpu batch innerloop");
+		while( numValidConstraints < numConstraints)
+		{
+			numIter++;
+			int nCurrentBatch = 0;
+			batchSizes[batchIdx] = 0; // NOTE(review): this write happens before the batchIdx>=B3_MAX_NUM_BATCHES check at the end of the pass; if batchSizes holds B3_MAX_NUM_BATCHES entries (as the callers in this file allocate), index B3_MAX_NUM_BATCHES would be out of bounds - verify
+			//	clear flag
+			for(int i=0; i<curBodyUsed; i++) 
+				bodyUsed[curUsed[i]/32] = 0; // zeroes the whole word containing the body's bit
+            curBodyUsed = 0;
+			for(int i=numValidConstraints; i<numConstraints; i++)
+			{
+				int idx = i;
+				b3Assert( idx < numConstraints );
+				//	check if it can go
+				int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
+				int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
+				int bodyA = abs(bodyAS);
+				int bodyB = abs(bodyBS);
+				bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;
+				bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;
+				int aUnavailable = 0;
+				int bUnavailable = 0;
+				if (!aIsStatic)
+				{
+					aUnavailable = bodyUsed[ bodyA/32 ] & (1<<(bodyA&31));
+				}
+				if (!aUnavailable) // body B is only checked when body A did not already block the constraint
+				if (!bIsStatic)
+				{
+					bUnavailable = bodyUsed[ bodyB/32 ] & (1<<(bodyB&31));
+				}
+				if( aUnavailable==0 && bUnavailable==0 ) // ok
+				{
+					if (!aIsStatic)
+					{
+						bodyUsed[ bodyA/32 ] |= (1<<(bodyA&31));
+						curUsed[curBodyUsed++]=bodyA;
+					}
+					if (!bIsStatic)
+					{
+						bodyUsed[ bodyB/32 ] |= (1<<(bodyB&31));
+						curUsed[curBodyUsed++]=bodyB;
+					}
+					cs[idx].getBatchIdx() = batchIdx;
+					if (i!=numValidConstraints)
+					{
+						b3Swap(cs[i],cs[numValidConstraints]); // move the accepted constraint into the placed prefix
+						numSwaps++;
+					}
+					numValidConstraints++;
+					{
+						nCurrentBatch++;
+						if( nCurrentBatch == simdWidth ) // after simdWidth acceptances the used-body state is reset while batchIdx stays the same
+						{
+							batchSizes[batchIdx] += simdWidth;
+							nCurrentBatch = 0;
+							for(int i=0; i<curBodyUsed; i++) 
+								bodyUsed[curUsed[i]/32] = 0;
+							curBodyUsed = 0;
+						}
+					}
+				}
+			}
+			if (batchIdx>=B3_MAX_NUM_BATCHES)
+			{
+				b3Error("batchIdx>=B3_MAX_NUM_BATCHES");
+				b3Assert(0);
+				break;
+			}
+			batchSizes[batchIdx] += nCurrentBatch; // add the remainder that did not fill a whole simdWidth group
+			batchIdx ++;
+		}
+	}
+#if defined(_DEBUG)
+    //		debugPrintf( "nBatches: %d\n", batchIdx );
+	for(int i=0; i<numConstraints; i++)
+    {
+        b3Assert( cs[i].getBatchIdx() != -1 );
+    }
+	batchSizes[batchIdx] =0; // terminating 0 after the last batch; NOTE(review): batchIdx can equal B3_MAX_NUM_BATCHES here, so this needs one spare slot in batchSizes - verify
+	if (maxSwaps<numSwaps)
+	{
+		maxSwaps = numSwaps;
+		//printf("maxSwaps = %d\n", maxSwaps);
+	}
+	return batchIdx;
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h
new file mode 100644
index 00000000..98e2a5b8
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h
@@ -0,0 +1,43 @@
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
+#include "b3GpuConstraint4.h"
+class b3GpuPgsContactSolver // Contact solver declaration; the pipeline instantiates it and calls solveContacts() when the Jacobi path is not selected.
+	int m_debugOutput;
+	struct b3GpuBatchingPgsSolverInternalData*		m_data; // opaque internal state, defined outside this header
+	void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx );
+	inline int sortConstraintByBatch( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
+	inline int sortConstraintByBatch2( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
+	inline int sortConstraintByBatch3( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies, int* batchSizes); // NOTE(review): presumably returns the batch count and fills batchSizes - confirm against the definition
+	void solveContactConstraintBatchSizes(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, 
+			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes);//const b3OpenCLArray<int>* gpuBatchSizes);
+		void solveContactConstraint(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, 
+			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes);//const b3OpenCLArray<int>* gpuBatchSizes);
+	b3GpuPgsContactSolver(cl_context ctx,cl_device_id device, cl_command_queue  q,int pairCapacity); // the pipeline passes config.m_maxBroadphasePairs as pairCapacity
+	virtual ~b3GpuPgsContactSolver();
+	void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index); // entry point used by b3GpuRigidBodyPipeline::stepSimulation; operates on raw cl_mem buffers
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
new file mode 100644
index 00000000..783e4430
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
@@ -0,0 +1,708 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "b3GpuRigidBodyPipeline.h"
+#include "b3GpuRigidBodyPipelineInternalData.h"
+#include "kernels/integrateKernel.h"
+#include "kernels/updateAabbsKernel.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "b3GpuNarrowPhase.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h"
+#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h"
+#define B3_RIGIDBODY_INTEGRATE_PATH "src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl" // passed as the last argument to compileCLProgramFromString for the integrate kernel
+#define B3_RIGIDBODY_UPDATEAABB_PATH "src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl" // passed as the last argument to compileCLProgramFromString for the AABB kernels
+bool useBullet2CpuSolver = true; // gates the joint-solving step in stepSimulation (GPU or host, depending on whether m_joints is empty)
+//choice of contact solver
+bool gUseJacobi = false; // true: contacts go to m_solver3 (Jacobi); false: contacts go to m_solver2 (PGS)
+bool gUseDbvt = false; // true: use m_broadphaseDbvt plus m_allAabbsGPU/CPU; false: use m_broadphaseSap
+bool gDumpContactStats = false; // true: stepSimulation prints contact and contact-point counts
+bool gCalcWorldSpaceAabbOnCpu = false; // true: setupGpuAabbsFull computes world AABBs on the host and uploads them
+bool gUseCalculateOverlappingPairsHost = false; // SAP path only: use calculateOverlappingPairsHost instead of calculateOverlappingPairs
+bool gIntegrateOnCpu = false; // true: integrate() reads the bodies back and integrates on the host
+bool gClearPairsOnGpu = true; // true: clear the pair 'z' field with a kernel; false: clear it on the host and upload
+#include "b3GpuJacobiContactSolver.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h"
+#include "b3GpuPgsContactSolver.h"
+#include "b3Solver.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
+#include "Bullet3OpenCL/Raycast/b3GpuRaycast.h"
+#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h"
+b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue  q,class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap , struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config) // Stores the CL handles and config, allocates the solvers/buffers/raycaster, and compiles the integrate and AABB kernels.
+	m_data = new b3GpuRigidBodyPipelineInternalData;
+	m_data->m_constraintUid=0; // next uid handed out by the create*Constraint methods
+	m_data->m_config = config;
+	m_data->m_context = ctx;
+	m_data->m_device = device;
+	m_data->m_queue = q;
+	m_data->m_solver = new b3PgsJacobiSolver(true);//new b3PgsJacobiSolver(true);
+	m_data->m_gpuSolver = new b3GpuPgsConstraintSolver(ctx,device,q,true);//new b3PgsJacobiSolver(true);
+	m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx,q,config.m_maxConvexBodies);
+	m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx,q,config.m_maxBroadphasePairs);
+	m_data->m_gpuConstraints = new b3OpenCLArray<b3GpuGenericConstraint>(ctx,q);
+	m_data->m_solver3 = new b3GpuJacobiContactSolver(ctx,device,q,config.m_maxBroadphasePairs);	
+	m_data->m_solver2 = new b3GpuPgsContactSolver(ctx,device,q,config.m_maxBroadphasePairs);
+	m_data->m_raycaster = new b3GpuRaycast(ctx,device,q);
+	m_data->m_broadphaseDbvt = broadphaseDbvt; // borrowed: never deleted by this class
+	m_data->m_broadphaseSap = broadphaseSap; // borrowed: never deleted by this class
+	m_data->m_narrowphase = narrowphase; // borrowed: never deleted by this class
+	m_data->m_gravity.setValue(0.f,-9.8f,0.f); // default gravity; overridable through setGravity()
+	cl_int errNum=0;
+	{
+		cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,integrateKernelCL,&errNum,"",B3_RIGIDBODY_INTEGRATE_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+		m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,integrateKernelCL, "integrateTransformsKernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		clReleaseProgram(prog); // the kernel object has been created; the local program handle is no longer needed
+	}
+	{
+		cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,updateAabbsKernelCL,&errNum,"",B3_RIGIDBODY_UPDATEAABB_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+		m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "initializeGpuAabbsFull",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "clearOverlappingPairsKernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		clReleaseProgram(prog);
+	}
+	if (m_data->m_integrateTransformsKernel) // NOTE(review): the statements from here on tear the pipeline down; the destructor signature line is not visible in this excerpt
+		clReleaseKernel(m_data->m_integrateTransformsKernel);
+	if (m_data->m_updateAabbsKernel)
+		clReleaseKernel(m_data->m_updateAabbsKernel);
+	if (m_data->m_clearOverlappingPairsKernel)
+		clReleaseKernel(m_data->m_clearOverlappingPairsKernel);
+	delete m_data->m_raycaster;
+	delete m_data->m_solver;
+	delete m_data->m_gpuSolver; // allocated with new alongside m_solver; without this the GPU constraint solver is leaked
+	delete m_data->m_allAabbsGPU;
+	delete m_data->m_gpuConstraints;
+	delete m_data->m_overlappingPairsGPU;
+	delete m_data->m_solver3;
+	delete m_data->m_solver2;
+	delete m_data; // last: every member pointer above is reached through m_data
+void	b3GpuRigidBodyPipeline::reset() // Empties the constraint and AABB arrays on both the GPU and the host; m_joints and the narrowphase bodies are not touched here.
+	m_data->m_gpuConstraints->resize(0);
+	m_data->m_cpuConstraints.resize(0);
+	m_data->m_allAabbsGPU->resize(0);
+	m_data->m_allAabbsCPU.resize(0);
+void	b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint) // Appends a host-side joint; only the pointer is stored. A non-empty m_joints makes stepSimulation solve joints on the host.
+	m_data->m_joints.push_back(constraint);
+void	b3GpuRigidBodyPipeline::removeConstraint(b3TypedConstraint* constraint) // Removes the pointer from m_joints; the constraint object itself is not deleted here.
+	m_data->m_joints.remove(constraint);
+void  b3GpuRigidBodyPipeline::removeConstraintByUid(int uid) // Removes the GPU constraint whose m_uid equals uid (if present) and uploads the remaining list back to the GPU.
+	m_data->m_gpuSolver->recomputeBatches(); // requested unconditionally, even when no constraint matches uid
+	//slow linear search
+	m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints); // the host list is overwritten with the current GPU contents before searching
+	//remove
+	for (int i=0;i<m_data->m_cpuConstraints.size();i++)
+	{
+		if (m_data->m_cpuConstraints[i].m_uid == uid)
+		{
+			//m_data->m_cpuConstraints.remove(m_data->m_cpuConstraints[i]);
+			m_data->m_cpuConstraints.swap(i,m_data->m_cpuConstraints.size()-1); // swap-with-last then pop: element order is not preserved
+			m_data->m_cpuConstraints.pop_back();
+			break; // only the first match is removed
+		}
+	}
+	if (m_data->m_cpuConstraints.size())
+	{
+		m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints);
+	} else
+	{
+		m_data->m_gpuConstraints->resize(0); // nothing left: shrink the GPU array instead of uploading an empty list
+	}
+int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold) // Creates a point-to-point constraint between two body indices and returns its uid; pivotInA/pivotInB must point to at least 3 floats.
+	m_data->m_gpuSolver->recomputeBatches();
+	b3GpuGenericConstraint c;
+	c.m_uid = m_data->m_constraintUid; // uids are handed out sequentially from a per-pipeline counter
+	m_data->m_constraintUid++;
+	c.m_rbA = bodyA;
+	c.m_rbB = bodyB;
+	c.m_pivotInA.setValue(pivotInA[0],pivotInA[1],pivotInA[2]);
+	c.m_pivotInB.setValue(pivotInB[0],pivotInB[1],pivotInB[2]);
+	c.m_breakingImpulseThreshold = breakingThreshold;
+	c.m_constraintType = B3_GPU_POINT2POINT_CONSTRAINT_TYPE;
+	m_data->m_cpuConstraints.push_back(c); // host list only; writeAllInstancesToGpu() copies m_cpuConstraints to the GPU
+	return c.m_uid;
+int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB,float breakingThreshold) // Creates a fixed constraint and returns its uid; the pivots must point to at least 3 floats and relTargetAB to at least 4.
+	m_data->m_gpuSolver->recomputeBatches();
+	b3GpuGenericConstraint c;
+	c.m_uid = m_data->m_constraintUid; // uids are handed out sequentially from a per-pipeline counter
+	m_data->m_constraintUid++;
+	c.m_rbA = bodyA;
+	c.m_rbB = bodyB;
+	c.m_pivotInA.setValue(pivotInA[0],pivotInA[1],pivotInA[2]);
+	c.m_pivotInB.setValue(pivotInB[0],pivotInB[1],pivotInB[2]);
+	c.m_relTargetAB.setValue(relTargetAB[0],relTargetAB[1],relTargetAB[2],relTargetAB[3]); // four components; NOTE(review): presumably a quaternion in x,y,z,w order - confirm
+	c.m_breakingImpulseThreshold = breakingThreshold;
+	c.m_constraintType = B3_GPU_FIXED_CONSTRAINT_TYPE;
+	m_data->m_cpuConstraints.push_back(c); // host list only; writeAllInstancesToGpu() copies m_cpuConstraints to the GPU
+	return c.m_uid;
+void	b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) // One simulation step: update world AABBs, find overlapping pairs, compute contacts, solve joints and contacts, then integrate by deltaTime.
+	//update worldspace AABBs from local AABB/worldtransform
+	{
+		B3_PROFILE("setupGpuAabbs");
+		setupGpuAabbsFull();
+	}
+	int numPairs =0;
+	//compute overlapping pairs
+	{
+		if (gUseDbvt)
+		{
+			{
+				B3_PROFILE("setAabb");
+				m_data->m_allAabbsGPU->copyToHost(m_data->m_allAabbsCPU); // the DBVT broadphase is fed from a host copy of the AABBs
+				for (int i=0;i<m_data->m_allAabbsCPU.size();i++)
+				{
+					b3Vector3 aabbMin=b3MakeVector3(m_data->m_allAabbsCPU[i].m_min[0],m_data->m_allAabbsCPU[i].m_min[1],m_data->m_allAabbsCPU[i].m_min[2]);
+					b3Vector3 aabbMax=b3MakeVector3(m_data->m_allAabbsCPU[i].m_max[0],m_data->m_allAabbsCPU[i].m_max[1],m_data->m_allAabbsCPU[i].m_max[2]);
+					m_data->m_broadphaseDbvt->setAabb(i,aabbMin,aabbMax,0);
+				}
+			}
+			{
+				B3_PROFILE("calculateOverlappingPairs");
+				m_data->m_broadphaseDbvt->calculateOverlappingPairs();
+			}
+			numPairs = m_data->m_broadphaseDbvt->getOverlappingPairCache()->getNumOverlappingPairs();
+		} else
+		{
+			if (gUseCalculateOverlappingPairsHost)
+			{
+				m_data->m_broadphaseSap->calculateOverlappingPairsHost(m_data->m_config.m_maxBroadphasePairs);
+			} else
+			{
+				m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs);
+			}
+			numPairs = m_data->m_broadphaseSap->getNumOverlap();
+		}
+	}
+	//compute contact points
+//	printf("numPairs=%d\n",numPairs);
+	int numContacts  = 0;
+	int numBodies = m_data->m_narrowphase->getNumRigidBodies();
+	if (numPairs)
+	{
+		cl_mem pairs =0;
+		cl_mem aabbsWS =0;
+		if (gUseDbvt)
+		{
+			B3_PROFILE("m_overlappingPairsGPU->copyFromHost");
+			m_data->m_overlappingPairsGPU->copyFromHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray());
+			pairs = m_data->m_overlappingPairsGPU->getBufferCL();
+			aabbsWS = m_data->m_allAabbsGPU->getBufferCL();
+		} else
+		{
+			pairs = m_data->m_broadphaseSap->getOverlappingPairBuffer();
+			aabbsWS = m_data->m_broadphaseSap->getAabbBufferWS();
+		}
+		m_data->m_overlappingPairsGPU->resize(numPairs);
+		//mark the contacts for each pair as 'unused'
+		if (numPairs)
+		{
+			b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context,m_data->m_queue);
+			gpuPairs.setFromOpenCLBuffer(pairs,numPairs); // wraps the existing pair buffer rather than allocating a new one
+			if (gClearPairsOnGpu)
+			{
+				//b3AlignedObjectArray<b3BroadphasePair> hostPairs;//just for debugging
+				//gpuPairs.copyToHost(hostPairs);
+				b3LauncherCL launcher(m_data->m_queue,m_data->m_clearOverlappingPairsKernel,"clearOverlappingPairsKernel");
+				launcher.setBuffer(pairs);
+				launcher.setConst(numPairs);
+				launcher.launch1D(numPairs);
+				//gpuPairs.copyToHost(hostPairs);
+			} else
+			{
+				b3AlignedObjectArray<b3BroadphasePair> hostPairs;
+				gpuPairs.copyToHost(hostPairs);
+				for (int i=0;i<hostPairs.size();i++)
+				{
+					hostPairs[i].z = 0xffffffff; // host-side equivalent of the clear kernel: flag the pair's contact slot as unused
+				}
+				gpuPairs.copyFromHost(hostPairs);
+			}
+		}
+		m_data->m_narrowphase->computeContacts(pairs,numPairs,aabbsWS,numBodies);
+		numContacts = m_data->m_narrowphase->getNumContactsGpu();
+		if (gUseDbvt)
+		{
+			///store the cached information (contact locations in the 'z' component)
+			B3_PROFILE("m_overlappingPairsGPU->copyToHost");
+			m_data->m_overlappingPairsGPU->copyToHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray());
+		}
+		if (gDumpContactStats && numContacts)
+		{
+			m_data->m_narrowphase->getContactsGpu();
+			printf("numContacts = %d\n", numContacts);
+			int totalPoints  = 0;
+			const b3Contact4* contacts = m_data->m_narrowphase->getContactsCPU();
+			for (int i=0;i<numContacts;i++)
+			{
+				totalPoints += contacts[i].getNPoints(); // index every contact so the total covers the whole array, not numContacts copies of element 0
+			}
+			printf("totalPoints=%d\n",totalPoints);
+		}
+	}
+	//convert contact points to contact constraints
+	//solve constraints
+	b3OpenCLArray<b3RigidBodyData> gpuBodies(m_data->m_context,m_data->m_queue,0,true);
+	gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(),m_data->m_narrowphase->getNumRigidBodies());
+	b3OpenCLArray<b3InertiaData> gpuInertias(m_data->m_context,m_data->m_queue,0,true);
+	gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(),m_data->m_narrowphase->getNumRigidBodies());
+	b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context,m_data->m_queue,0,true);
+	gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(),m_data->m_narrowphase->getNumContactsGpu());
+	int numJoints =  m_data->m_joints.size() ?  m_data->m_joints.size() : m_data->m_cpuConstraints.size(); // host joints take precedence; otherwise count the GPU generic constraints
+	if (useBullet2CpuSolver && numJoints)
+	{
+	//	b3AlignedObjectArray<b3Contact4> hostContacts;
+		//gpuContacts.copyToHost(hostContacts);
+		{
+			bool useGpu = m_data->m_joints.size()==0; // GPU joint solver only when no host-side b3TypedConstraint was added
+//			b3Contact4* contacts = numContacts? &hostContacts[0]: 0;
+			//m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,contacts,numJoints, joints);
+			if (useGpu)
+			{
+				m_data->m_gpuSolver->solveJoints(m_data->m_narrowphase->getNumRigidBodies(),&gpuBodies,&gpuInertias,numJoints, m_data->m_gpuConstraints);
+			} else
+			{
+				b3AlignedObjectArray<b3RigidBodyData> hostBodies;
+				gpuBodies.copyToHost(hostBodies);
+				b3AlignedObjectArray<b3InertiaData> hostInertias;
+				gpuInertias.copyToHost(hostInertias);
+				b3TypedConstraint** joints = numJoints? &m_data->m_joints[0] : 0;
+				m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(),&hostBodies[0],&hostInertias[0],0,0,numJoints, joints); // contacts are passed as 0,0: only joints are solved here
+				gpuBodies.copyFromHost(hostBodies); // write the solved body state back to the GPU
+			}
+		}
+	}
+	if (numContacts)
+	{
+		if (gUseJacobi)
+		{
+			bool useGpu = true;
+			if (useGpu)
+			{
+				bool forceHost = false;
+				if (forceHost)
+				{
+					b3AlignedObjectArray<b3RigidBodyData> hostBodies;
+					b3AlignedObjectArray<b3InertiaData> hostInertias;
+					b3AlignedObjectArray<b3Contact4> hostContacts;
+					{
+						B3_PROFILE("copyToHost");
+						gpuBodies.copyToHost(hostBodies);
+						gpuInertias.copyToHost(hostInertias);
+						gpuContacts.copyToHost(hostContacts);
+					}
+					{
+						b3JacobiSolverInfo solverInfo;
+						m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(),&hostContacts[0],hostContacts.size(),solverInfo);
+					}
+					{
+						B3_PROFILE("copyFromHost");
+						gpuBodies.copyFromHost(hostBodies);
+					}
+				} else
+				{
+					int static0Index = m_data->m_narrowphase->getStatic0Index();
+					b3JacobiSolverInfo solverInfo;
+					//m_data->m_solver3->solveContacts(    >solveGroup(&gpuBodies, &gpuInertias, &gpuContacts,solverInfo);
+					//m_data->m_solver3->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]);
+					m_data->m_solver3->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),m_data->m_config, static0Index);
+				}
+			} else
+			{
+				b3AlignedObjectArray<b3RigidBodyData> hostBodies;
+				gpuBodies.copyToHost(hostBodies);
+				b3AlignedObjectArray<b3InertiaData> hostInertias;
+				gpuInertias.copyToHost(hostInertias);
+				b3AlignedObjectArray<b3Contact4> hostContacts;
+				gpuContacts.copyToHost(hostContacts);
+				{
+					//m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]);
+				}
+				gpuBodies.copyFromHost(hostBodies);
+			}
+		} else
+		{
+			int static0Index = m_data->m_narrowphase->getStatic0Index();
+			m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),m_data->m_config, static0Index);
+			//m_data->m_solver4->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(), gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL());
+			/*m_data->m_solver3->solveContactConstraintHost(
+			(b3OpenCLArray<RigidBodyBase::Body>*)&gpuBodies,
+			(b3OpenCLArray<RigidBodyBase::Inertia>*)&gpuInertias,
+			(b3OpenCLArray<Constraint4>*) &gpuContacts,
+			0,numContacts,256);
+			*/
+		}
+	}
+	integrate(deltaTime);
+void	b3GpuRigidBodyPipeline::integrate(float timeStep) // Advances every rigid body by timeStep, on the host when gIntegrateOnCpu is set and with the integrate kernel otherwise.
+	//integrate
+	int numBodies = m_data->m_narrowphase->getNumRigidBodies();
+	float angularDamp = 0.99f; // fixed angular damping value handed to both the host and the kernel path
+	if (gIntegrateOnCpu)
+	{
+		if(numBodies)
+		{
+			b3GpuNarrowPhaseInternalData*	npData = m_data->m_narrowphase->getInternalData();
+			npData->m_bodyBufferGPU->copyToHost(*npData->m_bodyBufferCPU); // read back the current body state before integrating on the host
+			b3RigidBodyData_t* bodies = &npData->m_bodyBufferCPU->at(0);
+			for (int nodeID=0;nodeID<numBodies;nodeID++)
+			{
+				integrateSingleTransform( bodies,nodeID, timeStep, angularDamp, m_data->m_gravity);
+			}
+			npData->m_bodyBufferGPU->copyFromHost(*npData->m_bodyBufferCPU); // upload the integrated bodies
+		}
+	} else
+	{
+		b3LauncherCL launcher(m_data->m_queue,m_data->m_integrateTransformsKernel,"m_integrateTransformsKernel");
+		launcher.setBuffer(m_data->m_narrowphase->getBodiesGpu());
+		launcher.setConst(numBodies);
+		launcher.setConst(timeStep);
+		launcher.setConst(angularDamp);
+		launcher.setConst(m_data->m_gravity);
+		launcher.launch1D(numBodies); // NOTE(review): unlike the host path, this launch is not guarded against numBodies==0 - confirm launch1D tolerates it
+	}
+void	b3GpuRigidBodyPipeline::setupGpuAabbsFull() // Recomputes the world-space AABB of every rigid body, on the host (then uploaded) or with the updateAabbs kernel; does nothing when there are no bodies.
+	cl_int ciErrNum=0; // not read anywhere in the visible body
+	int numBodies = m_data->m_narrowphase->getNumRigidBodies();
+	if (!numBodies)
+		return;
+	if (gCalcWorldSpaceAabbOnCpu)
+	{
+		if (numBodies) // always true here because of the early return above
+		{
+			if (gUseDbvt)
+			{
+				m_data->m_allAabbsCPU.resize(numBodies);
+				m_data->m_narrowphase->readbackAllBodiesToCpu();
+				for (int i=0;i<numBodies;i++)
+				{
+					b3ComputeWorldAabb(  i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(),&m_data->m_allAabbsCPU[0]); // passes the array base plus index i
+				}
+				m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
+			} else
+			{
+				m_data->m_broadphaseSap->getAllAabbsCPU().resize(numBodies);
+				m_data->m_narrowphase->readbackAllBodiesToCpu();
+				for (int i=0;i<numBodies;i++)
+				{
+					b3ComputeWorldAabb(  i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(),&m_data->m_broadphaseSap->getAllAabbsCPU()[0]);
+				}
+				m_data->m_broadphaseSap->getAllAabbsGPU().copyFromHost(m_data->m_broadphaseSap->getAllAabbsCPU());
+				//m_data->m_broadphaseSap->writeAabbsToGpu();
+			}
+		}
+	} else
+	{
+		//__kernel void initializeGpuAabbsFull(  const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)
+		b3LauncherCL launcher(m_data->m_queue,m_data->m_updateAabbsKernel,"m_updateAabbsKernel");
+		launcher.setConst(numBodies); // arguments are set in the order of the kernel signature quoted above
+		cl_mem bodies = m_data->m_narrowphase->getBodiesGpu();
+		launcher.setBuffer(bodies);
+		cl_mem collidables = m_data->m_narrowphase->getCollidablesGpu();
+		launcher.setBuffer(collidables);
+		cl_mem localAabbs = m_data->m_narrowphase->getAabbLocalSpaceBufferGpu();
+		launcher.setBuffer(localAabbs);
+		cl_mem worldAabbs =0;
+		if (gUseDbvt)
+		{
+			worldAabbs = m_data->m_allAabbsGPU->getBufferCL(); // DBVT path: results land in the pipeline's own AABB buffer
+		} else
+		{
+			worldAabbs = m_data->m_broadphaseSap->getAabbBufferWS(); // SAP path: results land in the broadphase's world-space buffer
+		}
+		launcher.setBuffer(worldAabbs);
+		launcher.launch1D(numBodies);
+	}
+	/*
+	b3AlignedObjectArray<b3SapAabb> aabbs;
+	m_data->m_broadphaseSap->m_allAabbsGPU.copyToHost(aabbs);
+	printf("numAabbs = %d\n",  aabbs.size());
+	for (int i=0;i<aabbs.size();i++)
+	{
+		printf("aabb[%d].m_min=%f,%f,%f,%d\n",i,aabbs[i].m_minVec[0],aabbs[i].m_minVec[1],aabbs[i].m_minVec[2],aabbs[i].m_minIndices[3]);
+		printf("aabb[%d].m_max=%f,%f,%f,%d\n",i,aabbs[i].m_maxVec[0],aabbs[i].m_maxVec[1],aabbs[i].m_maxVec[2],aabbs[i].m_signedMaxIndices[3]);
+	};
+	*/
+cl_mem	b3GpuRigidBodyPipeline::getBodyBuffer() // Returns the narrowphase's GPU rigid-body buffer handle.
+	return m_data->m_narrowphase->getBodiesGpu();
+int	b3GpuRigidBodyPipeline::getNumBodies() const // Returns the rigid-body count reported by the narrowphase.
+	return m_data->m_narrowphase->getNumRigidBodies();
+void	b3GpuRigidBodyPipeline::setGravity(const float* grav) // Sets the gravity vector used by integrate(); grav must point to at least 3 floats.
+	m_data->m_gravity.setValue(grav[0],grav[1],grav[2]);
+void 		b3GpuRigidBodyPipeline::copyConstraintsToHost() // Overwrites the host constraint list with the current GPU constraint array.
+	m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints);
+void 		b3GpuRigidBodyPipeline::writeAllInstancesToGpu() // Uploads the host-side AABB and constraint lists to their GPU arrays.
+	m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
+	m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints);
+int		b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu) // Registers a rigid body with the narrowphase and the active broadphase; returns the body index, or -1 for a negative collidableIndex. position needs 3 floats, orientation 4.
+	b3Vector3 aabbMin=b3MakeVector3(0,0,0),aabbMax=b3MakeVector3(0,0,0);
+	if (collidableIndex>=0)
+	{
+		b3SapAabb localAabb = m_data->m_narrowphase->getLocalSpaceAabb(collidableIndex);
+		b3Vector3 localAabbMin=b3MakeVector3(localAabb.m_min[0],localAabb.m_min[1],localAabb.m_min[2]);
+		b3Vector3 localAabbMax=b3MakeVector3(localAabb.m_max[0],localAabb.m_max[1],localAabb.m_max[2]);
+		b3Scalar margin = 0.01f; // margin handed to b3TransformAabb together with the local AABB
+		b3Transform t;
+		t.setIdentity();
+		t.setOrigin(b3MakeVector3(position[0],position[1],position[2]));
+		t.setRotation(b3Quaternion(orientation[0],orientation[1],orientation[2],orientation[3]));
+		b3TransformAabb(localAabbMin,localAabbMax, margin,t,aabbMin,aabbMax); // fills aabbMin/aabbMax from the local AABB and the body transform
+	} else
+	{
+		b3Error("registerPhysicsInstance using invalid collidableIndex\n");
+		return -1;
+	}
+	bool writeToGpu = false; // the narrowphase is always asked not to upload here; NOTE(review): userIndex is not forwarded anywhere in this body - confirm that is intended
+	int bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex,mass,position,orientation,&aabbMin.getX(),&aabbMax.getX(),writeToGpu);
+	if (bodyIndex>=0)
+	{
+		if (gUseDbvt)
+		{
+			m_data->m_broadphaseDbvt->createProxy(aabbMin,aabbMax,bodyIndex,0,1,1);
+			b3SapAabb aabb;
+			for (int i=0;i<3;i++)
+			{
+				aabb.m_min[i] = aabbMin[i];
+				aabb.m_max[i] = aabbMax[i];
+			}
+			aabb.m_minIndices[3] = bodyIndex; // loop-invariant, so written once after the component copy
+			m_data->m_allAabbsCPU.push_back(aabb);
+			if (writeInstanceToGpu)
+			{
+				m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU); // writeInstanceToGpu is only honoured on this DBVT path
+			}
+		} else
+		{
+			if (mass)
+			{
+				m_data->m_broadphaseSap->createProxy(aabbMin,aabbMax,bodyIndex,1,1);//m_dispatcher);
+			} else
+			{
+				m_data->m_broadphaseSap->createLargeProxy(aabbMin,aabbMax,bodyIndex,1,1);//m_dispatcher);	
+			}
+		}
+	}
+	/*
+	if (mass>0.f)
+		m_numDynamicPhysicsInstances++;
+	m_numPhysicsInstances++;
+	*/
+	return bodyIndex;
+void	b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& rays,	b3AlignedObjectArray<b3RayHit>& hitResults) // Forwards the rays to the raycaster along with the narrowphase's host-side body and collidable arrays; results go to hitResults.
+	this->m_data->m_raycaster->castRays(rays,hitResults,
+		getNumBodies(),this->m_data->m_narrowphase->getBodiesCpu(),
+		m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(),
+		m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap); // always passes the SAP broadphase, regardless of gUseDbvt
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h
new file mode 100644
index 00000000..b4eac684
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h
@@ -0,0 +1,74 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
+class b3GpuRigidBodyPipeline // Rigid-body simulation pipeline: broadphase, narrowphase, constraint/contact solving and integration, driven by stepSimulation().
+	struct b3GpuRigidBodyPipelineInternalData*	m_data; // all state lives behind this pointer; allocated in the constructor
+	int allocateCollidable();
+	b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue  q , class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config); // narrowphase and both broadphases are stored as pointers and not deleted by the pipeline
+	virtual ~b3GpuRigidBodyPipeline();
+	void	stepSimulation(float deltaTime); // runs one full step and finishes with integrate(deltaTime)
+	void	integrate(float timeStep);
+	void	setupGpuAabbsFull(); // recomputes the world-space AABBs for all bodies
+	int		registerConvexPolyhedron(class b3ConvexUtility* convex);
+	//int		registerConvexPolyhedron(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
+	//int		registerSphereShape(float radius);
+	//int		registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
+	//int		registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
+	//int		registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
+	int		registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu); // returns the new body index, or -1 when collisionShapeIndex is negative
+	//if you passed "writeInstanceToGpu" false in the registerPhysicsInstance method (for performance) you need to call writeAllInstancesToGpu after all instances are registered
+	void	writeAllInstancesToGpu();
+	void	copyConstraintsToHost();
+	void	setGravity(const float* grav); // grav points to 3 floats
+	void reset();
+	int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold); // returns the constraint uid
+	int createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold); // returns the constraint uid
+	void removeConstraintByUid(int uid); // uid as returned by the create*Constraint methods
+	void	addConstraint(class b3TypedConstraint* constraint); // host-side joint; pointer is stored, not copied
+	void	removeConstraint(b3TypedConstraint* constraint);
+	void	castRays(const b3AlignedObjectArray<b3RayInfo>& rays,	b3AlignedObjectArray<b3RayHit>& hitResults);
+	cl_mem	getBodyBuffer();
+	int	getNumBodies() const;
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h
new file mode 100644
index 00000000..5ac92f97
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h
@@ -0,0 +1,73 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
+#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
+#include "Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h"
+struct b3GpuRigidBodyPipelineInternalData // State bundle held by b3GpuRigidBodyPipeline through its m_data pointer.
+	cl_context			m_context;
+	cl_device_id		m_device;
+	cl_command_queue	m_queue;
+	cl_kernel	m_integrateTransformsKernel; // launched by integrate() on the GPU path
+	cl_kernel	m_updateAabbsKernel; // launched by setupGpuAabbsFull() on the GPU path
+	cl_kernel	m_clearOverlappingPairsKernel; // launched by stepSimulation() when pairs are cleared on the GPU
+	class b3PgsJacobiSolver* m_solver; // host joint solver, used when m_joints is non-empty
+	class b3GpuPgsConstraintSolver* m_gpuSolver; // GPU joint solver, used when m_joints is empty
+	class b3GpuPgsContactSolver* m_solver2; // contact solver used when gUseJacobi is false
+	class b3GpuJacobiContactSolver* m_solver3; // contact solver used when gUseJacobi is true
+	class b3GpuRaycast* m_raycaster;
+	class b3GpuBroadphaseInterface* m_broadphaseSap; // supplied by the pipeline's creator
+	struct b3DynamicBvhBroadphase* m_broadphaseDbvt; // supplied by the pipeline's creator; used when gUseDbvt is true
+	b3OpenCLArray<b3SapAabb>*	m_allAabbsGPU;
+	b3AlignedObjectArray<b3SapAabb>	m_allAabbsCPU; // host mirror of m_allAabbsGPU
+	b3OpenCLArray<b3BroadphasePair>*		m_overlappingPairsGPU;
+	b3OpenCLArray<b3GpuGenericConstraint>* m_gpuConstraints;
+	b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints; // host mirror of m_gpuConstraints
+	b3AlignedObjectArray<b3TypedConstraint*> m_joints; // host-side joints added through addConstraint()
+	int	m_constraintUid; // next uid for create*Constraint
+	class b3GpuNarrowPhase*	m_narrowphase; // supplied by the pipeline's creator
+	b3Vector3	m_gravity;
+	b3Config	m_config;
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverBody.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverBody.h
new file mode 100644
index 00000000..f2a61801
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverBody.h
@@ -0,0 +1,228 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Matrix3x3.h"
+#include "Bullet3Common/b3AlignedAllocator.h"
+#include "Bullet3Common/b3TransformUtil.h"
+///Enable the SIMD code path only when B3_USE_SSE is defined (historically: until we get other contributions, only on Windows, when using Visual Studio 2008 or later, and not double precision)
+#ifdef B3_USE_SSE
+#define USE_SIMD 1
+#endif //
+///The b3GpuSolverBody is an internal data structure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
+///It is declared through B3_ATTRIBUTE_ALIGNED16 and ends with explicit padding ints.
+B3_ATTRIBUTE_ALIGNED16 (struct)	b3GpuSolverBody
+//	b3Transform		m_worldTransformUnused;
+	//velocity changes accumulated by applyImpulse/internalApplyImpulse; folded into the velocities by writebackVelocity
+	b3Vector3		m_deltaLinearVelocity;
+	b3Vector3		m_deltaAngularVelocity;
+	//per-axis factors multiplied into every applied impulse
+	b3Vector3		m_angularFactor;
+	b3Vector3		m_linearFactor;
+	b3Vector3		m_invMass;
+	//velocities accumulated by internalApplyPushImpulse
+	b3Vector3		m_pushVelocity;
+	b3Vector3		m_turnVelocity;
+	b3Vector3		m_linearVelocity;
+	b3Vector3		m_angularVelocity;
+	//the pointer and the index share the same storage; several methods test m_originalBody for non-zero
+	//NOTE(review): when only m_originalBodyIndex is written, an index of 0 makes those tests fail - confirm this is intended
+	union 
+	{
+		void*	m_originalBody;
+		int		m_originalBodyIndex;
+	};
+	//explicit padding
+	int padding[3];
+	/*
+	void	setWorldTransform(const b3Transform& worldTransform)
+	{
+		m_worldTransform = worldTransform;
+	}
+	const b3Transform& getWorldTransform() const
+	{
+		return m_worldTransform;
+	}
+	*/
+	///Writes the velocity at rel_pos (current velocities plus solver deltas) into velocity, or zero when m_originalBody is null
+	B3_FORCE_INLINE void	getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const
+	{
+		if (!m_originalBody)
+		{
+			velocity.setValue(0,0,0);
+			return;
+		}
+		const b3Vector3 totalLinear = m_linearVelocity+m_deltaLinearVelocity;
+		const b3Vector3 totalAngular = m_angularVelocity+m_deltaAngularVelocity;
+		velocity = totalLinear + totalAngular.cross(rel_pos);
+	}
+	///Writes the angular velocity including the solver delta into angVel, or zero when m_originalBody is null
+	B3_FORCE_INLINE void	getAngularVelocity(b3Vector3& angVel) const
+	{
+		if (!m_originalBody)
+		{
+			angVel.setValue(0,0,0);
+			return;
+		}
+		angVel = m_angularVelocity+m_deltaAngularVelocity;
+	}
+	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
+	//Adds the scaled impulse to the delta velocities; does nothing when m_originalBody is null
+	B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude)
+	{
+		if (!m_originalBody)
+			return;
+		m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor;
+		m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
+	}
+	///Adds the scaled impulse to the push/turn velocities; does nothing when m_originalBody is null
+	B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,b3Scalar impulseMagnitude)
+	{
+		if (!m_originalBody)
+			return;
+		m_pushVelocity += linearComponent*impulseMagnitude*m_linearFactor;
+		m_turnVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
+	}
+	///Read-only access to the accumulated linear velocity delta
+	const b3Vector3& getDeltaLinearVelocity() const
+	{
+		return m_deltaLinearVelocity;
+	}
+	///Read-only access to the accumulated angular velocity delta
+	const b3Vector3& getDeltaAngularVelocity() const
+	{
+		return m_deltaAngularVelocity;
+	}
+	///Read-only access to the push velocity
+	const b3Vector3& getPushVelocity() const 
+	{
+		return m_pushVelocity;
+	}
+	///Read-only access to the turn velocity
+	const b3Vector3& getTurnVelocity() const 
+	{
+		return m_turnVelocity;
+	}
+	////////////////////////////////////////////////
+	///some internal methods, don't use them
+	///Mutable access to the accumulated linear velocity delta
+	b3Vector3& internalGetDeltaLinearVelocity()
+	{
+		return m_deltaLinearVelocity;
+	}
+	///Mutable access to the accumulated angular velocity delta
+	b3Vector3& internalGetDeltaAngularVelocity()
+	{
+		return m_deltaAngularVelocity;
+	}
+	///Read-only access to the angular factor
+	const b3Vector3& internalGetAngularFactor() const
+	{
+		return m_angularFactor;
+	}
+	///Read-only access to the inverse mass vector
+	const b3Vector3& internalGetInvMass() const
+	{
+		return m_invMass;
+	}
+	///Replaces the inverse mass vector
+	void internalSetInvMass(const b3Vector3& invMass)
+	{
+		m_invMass = invMass;
+	}
+	///Mutable access to the push velocity
+	b3Vector3& internalGetPushVelocity()
+	{
+		return m_pushVelocity;
+	}
+	///Mutable access to the turn velocity
+	b3Vector3& internalGetTurnVelocity()
+	{
+		return m_turnVelocity;
+	}
+	///Same computation as getVelocityInLocalPointObsolete, without the m_originalBody check
+	B3_FORCE_INLINE void	internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const
+	{
+		const b3Vector3 totalLinear = m_linearVelocity+m_deltaLinearVelocity;
+		const b3Vector3 totalAngular = m_angularVelocity+m_deltaAngularVelocity;
+		velocity = totalLinear + totalAngular.cross(rel_pos);
+	}
+	///Same computation as getAngularVelocity, without the m_originalBody check
+	B3_FORCE_INLINE void	internalGetAngularVelocity(b3Vector3& angVel) const
+	{
+		const b3Vector3 totalAngular = m_angularVelocity+m_deltaAngularVelocity;
+		angVel = totalAngular;
+	}
+	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
+	//Unlike applyImpulse, the m_originalBody check is disabled here, so the deltas are always updated
+	B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude)
+	{
+		//if (m_originalBody)
+		m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor;
+		m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
+	}
+	///Folds the accumulated deltas into the linear and angular velocities (the m_originalBody check is disabled)
+	void	writebackVelocity()
+	{
+		//if (m_originalBody>=0)
+		m_linearVelocity +=m_deltaLinearVelocity;
+		m_angularVelocity += m_deltaAngularVelocity;
+		//m_originalBody->setCompanionId(-1);
+	}
+	///Folds the accumulated deltas into the linear and angular velocities when m_originalBody is non-null.
+	///timeStep and splitImpulseTurnErp are accepted for interface compatibility but are not used:
+	///the push/turn transform correction is disabled because this struct has no world transform member.
+	void	writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp)
+	{
+		(void) timeStep;
+		(void) splitImpulseTurnErp;
+		if (!m_originalBody)
+			return;
+		m_linearVelocity += m_deltaLinearVelocity;
+		m_angularVelocity += m_deltaAngularVelocity;
+		//The position/orientation correction based on push/turn recovery is disabled. The former code ran only when
+		//m_pushVelocity or m_turnVelocity had a non-zero component and was:
+		//	b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform);
+		//	m_worldTransform = newTransform;
+		//m_originalBody->setCompanionId(-1);
+	}
+#endif //B3_SOLVER_BODY_H
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h
new file mode 100644
index 00000000..60d235ba
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h
@@ -0,0 +1,82 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans http://github.com/erwincoumans/bullet3
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Matrix3x3.h"
+//#include "b3JacobianEntry.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+///1D constraint along a normal axis between bodyA and bodyB. It can be combined to solve contact and friction constraints.
+///The struct is declared through B3_ATTRIBUTE_ALIGNED16 and contains explicit padding members.
+B3_ATTRIBUTE_ALIGNED16 (struct)	b3GpuSolverConstraint
+	b3Vector3		m_relpos1CrossNormal;
+	b3Vector3		m_contactNormal;
+	b3Vector3		m_relpos2CrossNormal;
+	//b3Vector3		m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal
+	b3Vector3		m_angularComponentA;
+	b3Vector3		m_angularComponentB;
+	//declared mutable, so they can be updated through a const reference to the constraint
+	mutable b3Scalar	m_appliedPushImpulse;
+	mutable b3Scalar	m_appliedImpulse;
+	//explicit padding
+	int m_padding1;
+	int m_padding2;
+	b3Scalar	m_friction;
+	b3Scalar	m_jacDiagABInv;
+	b3Scalar		m_rhs;
+	b3Scalar		m_cfm;
+    b3Scalar		m_lowerLimit;
+	b3Scalar		m_upperLimit;
+	b3Scalar		m_rhsPenetration;
+	//a pointer, an index and a scalar share the same storage; only one of them is meaningful at a time
+    union
+	{
+		void*		m_originalContactPoint;
+		int		m_originalConstraintIndex;
+		b3Scalar	m_unusedPadding4;
+	};
+	int	m_overrideNumSolverIterations;
+    int			m_frictionIndex;
+	//indices of the two solver bodies this row acts on
+	int m_solverBodyIdA;
+	int m_solverBodyIdB;
+	//NOTE(review): the enumerator list is empty in this extract - confirm against the original header
+	enum		b3SolverConstraintType
+	{
+	};
+typedef b3AlignedObjectArray<b3GpuSolverConstraint>	b3GpuConstraintArray;
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp b/src/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp
new file mode 100644
index 00000000..c5bdf49c
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp
@@ -0,0 +1,1210 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+#include "b3Solver.h"
+///useNewBatchingKernel  is a rewritten kernel using just a single thread of the warp, for experiments
+bool useNewBatchingKernel = true;
+///gConvertConstraintOnCpu is not referenced in this part of the file.
+///NOTE(review): presumably selects a CPU path for the contact-to-constraint conversion - confirm at its use site
+bool gConvertConstraintOnCpu = false;
+///Kernel source path labels, passed as the last argument to b3OpenCLUtils::compileCLProgramFromString in the b3Solver constructor
+#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
+#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
+#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
+#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
+#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
+#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
+#include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"
+#include "kernels/solverSetup.h"
+#include "kernels/solverSetup2.h"
+#include "kernels/solveContact.h"
+#include "kernels/solveFriction.h"
+#include "kernels/batchingKernels.h"
+#include "kernels/batchingKernelsNew.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3Common/b3Vector3.h"
+///Debug record made of sixteen ints and four floats.
+///An array of these is allocated in solveContactConstraint when DEBUG_ME is defined.
+struct SolverDebugInfo
+	int m_valInt0;
+	int m_valInt1;
+	int m_valInt2;
+	int m_valInt3;
+	int m_valInt4;
+	int m_valInt5;
+	int m_valInt6;
+	int m_valInt7;
+	int m_valInt8;
+	int m_valInt9;
+	int m_valInt10;
+	int m_valInt11;
+	int	m_valInt12;
+	int	m_valInt13;
+	int	m_valInt14;
+	int	m_valInt15;
+	float m_val0;
+	float m_val1;
+	float m_val2;
+	float m_val3;
+///Holder for the nested ParallelSolveData type; no other members are visible in this extract
+class SolverDeviceInl
+	///Pair of device arrays: a constraint count array and an offset array
+	struct ParallelSolveData
+	{
+		b3OpenCLArray<unsigned int>* m_numConstraints;
+		b3OpenCLArray<unsigned int>* m_offsets;
+	};
+///Creates the sort/scan/search helpers, the device buffers and all OpenCL kernels used by the solver.
+///ctx, device and queue are stored and forwarded to every helper; pairCapacity, rounded up to a multiple of 512,
+///sizes the sort data buffer. Each program is compiled from its embedded source string, used to create its
+///kernels, and then released again.
+b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
+			:m_nIterations(4),
+			m_context(ctx),
+			m_device(device),
+			m_queue(queue),
+			m_batchSizes(ctx,queue)
+	m_sort32 = new b3RadixSort32CL(ctx,device,queue);
+	m_scan = new b3PrefixScanCL(ctx,device,queue,B3_SOLVER_N_CELLS);
+	m_search = new b3BoundSearchCL(ctx,device,queue,B3_SOLVER_N_CELLS);
+	const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 );
+	m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx,queue,sortSize);
+	m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx,queue);
+	m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,queue,B3_SOLVER_N_CELLS );
+	m_numConstraints->resize(B3_SOLVER_N_CELLS);
+	m_offsets = new b3OpenCLArray<unsigned int>( ctx,queue,B3_SOLVER_N_CELLS);
+	m_offsets->resize(B3_SOLVER_N_CELLS);
+	const char* additionalMacros = "";
+	cl_int pErrNum;
+	const char* batchKernelSource = batchingKernelsCL;
+	const char* batchKernelNewSource = batchingKernelsNewCL;
+	const char* solverSetupSource = solverSetupCL;
+	const char* solverSetup2Source = solverSetup2CL;
+	const char* solveContactSource = solveContactCL;
+	const char* solveFrictionSource = solveFrictionCL;
+	{
+		cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
+		b3Assert(solveContactProg);
+		cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
+		b3Assert(solveFrictionProg);
+		cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
+		b3Assert(solverSetup2Prog);
+		cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
+		b3Assert(solverSetupProg);
+		m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros );
+		b3Assert(m_solveFrictionKernel);
+		m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros );
+		b3Assert(m_solveContactKernel);
+		m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros );
+		b3Assert(m_contactToConstraintKernel);
+		m_setSortDataKernel =  b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_setSortDataKernel);
+		m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_reorderContactKernel);
+		m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_copyConstraintKernel);
+		//A kernel keeps its program alive, so the local program handles are released here instead of being leaked
+		clReleaseProgram(solveContactProg);
+		clReleaseProgram(solveFrictionProg);
+		clReleaseProgram(solverSetup2Prog);
+		clReleaseProgram(solverSetupProg);
+	}
+	{
+		cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH);
+		//cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
+		b3Assert(batchingProg);
+		m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros );
+		b3Assert(m_batchingKernel);
+		clReleaseProgram(batchingProg);
+	}
+	{
+		cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH);
+		b3Assert(batchingNewProg);
+		m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros );
+		//m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
+		b3Assert(m_batchingKernelNew);
+		clReleaseProgram(batchingNewProg);
+	}
+	//Teardown of everything the constructor created: the heap-allocated buffers and helpers first, then the OpenCL kernels.
+	//NOTE(review): the enclosing signature line is missing from this extract; presumably this is the b3Solver destructor body - confirm.
+	delete m_offsets;
+	delete m_numConstraints;
+	delete m_sortDataBuffer;
+	delete m_contactBuffer2;
+	delete m_sort32;
+	delete m_scan;
+	delete m_search;
+	clReleaseKernel(m_batchingKernel);
+	clReleaseKernel(m_batchingKernelNew);
+	clReleaseKernel( m_solveContactKernel);
+	clReleaseKernel( m_solveFrictionKernel);
+	clReleaseKernel( m_contactToConstraintKernel);
+	clReleaseKernel( m_setSortDataKernel);
+	clReleaseKernel( m_reorderContactKernel);
+	clReleaseKernel( m_copyConstraintKernel);
+///Applies one solver pass over the (up to) four contact points stored in cs.
+///For each point the relative velocity along cs.m_linear plus the bias cs.m_b is scaled by cs.m_jacCoeffInv,
+///the accumulated impulse cs.m_appliedRambdaDt is clamped to [minRambdaDt, maxRambdaDt], and the resulting
+///impulse change is applied to the velocities of both bodies.
+///With JACOBI == true the velocity changes are summed and applied once after the loop;
+///otherwise each point updates the velocities immediately, so later points see the earlier updates.
+template<bool JACOBI>
+void solveContact(b3GpuConstraint4& cs, 
+	const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
+	const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, 
+	float maxRambdaDt[4], float minRambdaDt[4])
+	//velocity changes accumulated for the JACOBI variant
+	b3Vector3 dLinVelA; dLinVelA.setZero();
+	b3Vector3 dAngVelA; dAngVelA.setZero();
+	b3Vector3 dLinVelB; dLinVelB.setZero();
+	b3Vector3 dAngVelB; dAngVelB.setZero();
+	for(int ic=0; ic<4; ic++)
+	{
+		//	not strictly necessary: a zero coefficient would scale the impulse change to 0 anyway
+		if( cs.m_jacCoeffInv[ic] == 0.f ) continue;
+		{
+			b3Vector3 angular0, angular1, linear;
+			//contact point relative to each body position
+			b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
+			b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
+			setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, &linear, &angular0, &angular1 );
+			float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic];
+			rambdaDt *= cs.m_jacCoeffInv[ic];
+			{
+				//clamp the accumulated impulse, then apply only the difference to the previous sum
+				float prevSum = cs.m_appliedRambdaDt[ic];
+				float updated = prevSum;
+				updated += rambdaDt;
+				updated = b3Max( updated, minRambdaDt[ic] );
+				updated = b3Min( updated, maxRambdaDt[ic] );
+				rambdaDt = updated - prevSum;
+				cs.m_appliedRambdaDt[ic] = updated;
+			}
+			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
+			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
+			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
+			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+#ifdef _WIN32
+            b3Assert(_finite(linImp0.getX()));
+			b3Assert(_finite(linImp1.getX()));
+			if( JACOBI )
+			{
+				dLinVelA += linImp0;
+				dAngVelA += angImp0;
+				dLinVelB += linImp1;
+				dAngVelB += angImp1;
+			}
+			else
+			{
+				linVelA += linImp0;
+				angVelA += angImp0;
+				linVelB += linImp1;
+				angVelB += angImp1;
+			}
+		}
+	}
+	if( JACOBI )
+	{
+		linVelA += dLinVelA;
+		angVelA += dAngVelA;
+		linVelB += dLinVelB;
+		angVelB += dAngVelB;
+	}
+	///Applies one friction pass for the constraint cs along two tangent directions through cs.m_center,
+	///followed by a damping of the angular velocity about the contact normal.
+	///For each tangent the relative velocity is scaled by cs.m_fJacCoeffInv[i], the accumulated impulse
+	///cs.m_fAppliedRambdaDt[i] is clamped to [minRambdaDt[i], maxRambdaDt[i]] and the impulse change is applied
+	///to the velocities of both bodies. Returns without any change when both friction coefficients are zero.
+	static
+	__inline
+	void solveFriction(b3GpuConstraint4& cs, 
+		const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
+		const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, 
+		float maxRambdaDt[4], float minRambdaDt[4])
+	{
+		//skip only when BOTH tangent rows are disabled (the second test used to repeat index 0)
+		if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[1] == 0 ) return;
+		const b3Vector3& center = (const b3Vector3&)cs.m_center;
+		b3Vector3 n = -(const b3Vector3&)cs.m_linear;
+		b3Vector3 tangent[2];
+#if 1		
+		b3PlaneSpace1 (n, tangent[0],tangent[1]);
+		b3Vector3 r = cs.m_worldPos[0]-center;
+		tangent[0] = cross3( n, r );
+		tangent[1] = cross3( tangent[0], n );
+		tangent[0] = normalize3( tangent[0] );
+		tangent[1] = normalize3( tangent[1] );
+		b3Vector3 angular0, angular1, linear;
+		//friction is applied at the constraint center, relative to each body position
+		b3Vector3 r0 = center - posA;
+		b3Vector3 r1 = center - posB;
+		for(int i=0; i<2; i++)
+		{
+			setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
+			float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB );
+			rambdaDt *= cs.m_fJacCoeffInv[i];
+				{
+					//clamp the accumulated impulse, then apply only the difference to the previous sum
+					float prevSum = cs.m_fAppliedRambdaDt[i];
+					float updated = prevSum;
+					updated += rambdaDt;
+					updated = b3Max( updated, minRambdaDt[i] );
+					updated = b3Min( updated, maxRambdaDt[i] );
+					rambdaDt = updated - prevSum;
+					cs.m_fAppliedRambdaDt[i] = updated;
+				}
+			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
+			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
+			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
+			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+#ifdef _WIN32
+			b3Assert(_finite(linImp0.getX()));
+			b3Assert(_finite(linImp1.getX()));
+			linVelA += linImp0;
+			angVelA += angImp0;
+			linVelB += linImp1;
+			angVelB += angImp1;
+		}
+		{	//	angular damping for point constraint
+			b3Vector3 ab = ( posB - posA ).normalized();
+			b3Vector3 ac = ( center - posA ).normalized();
+			//damp when the center lies almost on the line from A to B, or when either body has zero inverse mass
+			if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
+			{
+				float angNA = b3Dot( n, angVelA );
+				float angNB = b3Dot( n, angVelB );
+				angVelA -= (angNA*0.1f)*n;
+				angVelB -= (angNB*0.1f)*n;
+			}
+		}
+	}
+struct SolveTask// : public ThreadPool::Task
+	///Stores references to the body, inertia and constraint arrays plus the range [start, start+nConstraints)
+	///and the cell/batch bookkeeping used by run(). m_solveFriction starts as true; callers overwrite it before run().
+	///wgUsedBodies is accepted for interface compatibility but not stored.
+	///The initializers are listed in member declaration order, which is the order in which they execute.
+	SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies,  b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
+		int start, int nConstraints,int maxNumBatches,b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
+		: m_bodies( bodies ), m_shapes( shapes ), m_constraints( constraints ),
+		m_batchSizes(batchSizes),
+		m_cellIndex(cellIndex),
+		m_curWgidx(curWgidx),
+		m_start( start ), m_nConstraints( nConstraints ),
+		m_solveFriction( true ),m_maxNumBatches(maxNumBatches)
+	{
+		(void)wgUsedBodies;
+	}
+	///Always returns 0
+	unsigned short int getType(){ return 0; }
+	///Solves the constraints of cell m_cellIndex, batch by batch.
+	///Batch sizes are read from m_batchSizes at m_cellIndex*B3_MAX_NUM_BATCHES+batch and the walk stops at the
+	///first empty batch; the constraints of a batch are consecutive, starting at m_start.
+	///With m_solveFriction == false each constraint goes through solveContact<false> with limits [0, FLT_MAX];
+	///otherwise through solveFriction with limits of +/- 0.7 times the sum of its applied contact impulses.
+	///tIdx is unused. Does nothing when m_batchSizes is null.
+	void run(int tIdx)
+	{
+		(void)tIdx;
+		//callers on the non-batched path pass a null batch size array; return instead of dereferencing it
+		b3Assert(m_batchSizes);
+		if (!m_batchSizes)
+			return;
+		int offset = 0;
+		for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++)
+		{
+			int numInBatch = m_batchSizes->at(m_cellIndex*B3_MAX_NUM_BATCHES+ii);
+			if (!numInBatch)
+				break;
+			for (int jj=0;jj<numInBatch;jj++)
+			{
+				int i = m_start + offset+jj;
+				//every constraint in this run is expected to carry the current batch index
+				b3Assert(m_constraints[i].m_batchIdx==ii);
+				float frictionCoeff = m_constraints[i].getFrictionCoeff();
+				int aIdx = (int)m_constraints[i].m_bodyA;
+				int bIdx = (int)m_constraints[i].m_bodyB;
+				b3RigidBodyData& bodyA = m_bodies[aIdx];
+				b3RigidBodyData& bodyB = m_bodies[bIdx];
+				if( !m_solveFriction )
+				{
+					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+					solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, 
+							(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
+						maxRambdaDt, minRambdaDt );
+				}
+				else
+				{
+					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+					//the friction limit scales with the total contact impulse applied to this constraint
+					float sum = 0;
+					for(int j=0; j<4; j++)
+					{
+						sum +=m_constraints[i].m_appliedRambdaDt[j];
+					}
+					//NOTE(review): the per-constraint coefficient read above is overridden by a fixed 0.7 - confirm this is intended
+					frictionCoeff = 0.7f;
+					for(int j=0; j<4; j++)
+					{
+						maxRambdaDt[j] = frictionCoeff*sum;
+						minRambdaDt[j] = -maxRambdaDt[j];
+					}
+					solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, 
+						(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
+						maxRambdaDt, minRambdaDt );
+				}
+			}
+			offset+=numInBatch;
+		}
+		//A former variant (removed) looped bb over [0, m_maxNumBatches) and, for each bb, scanned all m_nConstraints
+		//constraints from m_start, solving those whose m_batchIdx equalled bb with the same contact/friction code as above.
+	}
+	//host-side arrays the task reads and updates in place (held by reference)
+	b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
+	b3AlignedObjectArray<b3InertiaData>& m_shapes;
+	b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
+	//batch sizes, indexed in run() by m_cellIndex*B3_MAX_NUM_BATCHES+batch
+	b3AlignedObjectArray<int>* m_batchSizes;
+	int m_cellIndex;
+	int m_curWgidx;
+	//index of the first constraint of this cell and the number of constraints in it
+	int m_start;
+	int m_nConstraints;
+	//selects the friction pass (true) or the contact pass (false) in run()
+	bool m_solveFriction;
+	int m_maxNumBatches;
+///Host (CPU) version of the contact solver.
+///Copies bodies, inertias, constraints and the per-cell counts/offsets to the host, runs m_nIterations contact
+///passes followed by m_nIterations friction passes through SolveTask, and copies the three arrays back.
+///Within a pass the cells are visited per cell batch (B3_SOLVER_N_BATCHES of them), and inside a cell batch per
+///work group; the cell index is derived from the work group index and the three low bits of the cell batch.
+///additionalData is not used in the visible code.
+void b3Solver::solveContactConstraintHost(  b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, 
+			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,b3AlignedObjectArray<int>* batchSizes)
+//disabled experiment that only enumerates cell indices; NOTE(review): the matching #endif is missing from this extract
+#if 0
+	{	
+		int nSplitX = B3_SOLVER_N_SPLIT_X;
+		int nSplitY = B3_SOLVER_N_SPLIT_Y;
+		int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
+		for (int z=0;z<4;z++)
+		{
+			for (int y=0;y<4;y++)
+			{
+				for (int x=0;x<4;x++)
+				{
+					int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY);
+				//	printf("newIndex=%d\n",newIndex);
+					int zIdx = newIndex/(nSplitX*nSplitY);
+					int remain = newIndex%(nSplitX*nSplitY);
+					int yIdx = remain/nSplitX;
+					int xIdx = remain%nSplitX;
+				//	printf("newIndex=%d\n",newIndex);
+				}
+			}
+		}
+		//for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
+		for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
+		{
+			for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+			{
+				int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
+				int remain= (wgIdx%((nSplitX*nSplitY)/4));
+				int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
+				int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
+				/*int zIdx = newIndex/(nSplitX*nSplitY);
+				int remain = newIndex%(nSplitX*nSplitY);
+				int yIdx = remain/nSplitX;
+				int xIdx = remain%nSplitX;
+				*/
+				int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
+			//	printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
+			}
+		}
+	}
+	//host copies of the device buffers; the solver works on these and writes them back at the end
+	b3AlignedObjectArray<b3RigidBodyData> bodyNative;
+	bodyBuf->copyToHost(bodyNative);
+	b3AlignedObjectArray<b3InertiaData> shapeNative;
+	shapeBuf->copyToHost(shapeNative);
+	b3AlignedObjectArray<b3GpuConstraint4> constraintNative;
+	constraint->copyToHost(constraintNative);
+	//per-cell constraint counts and start offsets
+	b3AlignedObjectArray<unsigned int> numConstraintsHost;
+	m_numConstraints->copyToHost(numConstraintsHost);
+	//printf("------------------------\n");
+	b3AlignedObjectArray<unsigned int> offsetsHost;
+	m_offsets->copyToHost(offsetsHost);
+	//call counter, only referenced by the commented-out debug output below
+	static int frame=0;
+	//always true here, so the non-batched branch below is never taken
+	bool useBatches=true;
+	if (useBatches)
+	{
+		//contact passes
+		for(int iter=0; iter<m_nIterations; iter++)
+		{
+			for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
+			{
+				int nSplitX = B3_SOLVER_N_SPLIT_X;
+				int nSplitY = B3_SOLVER_N_SPLIT_Y;
+				int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
+				//printf("cell Batch %d\n",cellBatch);
+				//passed to SolveTask as wgUsedBodies, which does not store it
+				b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS];
+				for (int i=0;i<B3_SOLVER_N_CELLS;i++)
+				{
+					usedBodies[i].resize(0);
+				}
+				//for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
+				for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+				{
+					//cell coordinates: twice the work group coordinate plus one bit of cellBatch per axis
+					int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
+					int remain= (wgIdx%((nSplitX*nSplitY)/4));
+					int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
+					int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
+					int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
+					if( numConstraintsHost[cellIdx] == 0 ) 
+						continue;
+					//printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
+					//printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]);
+					if (zIdx)
+					{
+					//printf("?\n");
+					}
+					if (iter==0)
+					{
+						//printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx);
+						//printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]);
+					}
+					const int start = offsetsHost[cellIdx];
+					int numConstraintsInCell = numConstraintsHost[cellIdx];
+					const int end = start + numConstraintsInCell;
+					SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell ,maxNumBatches,usedBodies,wgIdx,batchSizes,cellIdx);
+					task.m_solveFriction = false;
+					task.run(0);
+				}
+			}
+		}
+		//friction passes, same cell traversal as above
+		for(int iter=0; iter<m_nIterations; iter++)
+		{
+			for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
+			{
+				int nSplitX = B3_SOLVER_N_SPLIT_X;
+				int nSplitY = B3_SOLVER_N_SPLIT_Y;
+				int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
+				for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+				{
+					int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
+					int remain= (wgIdx%((nSplitX*nSplitY)/4));
+					int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
+					int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
+					int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
+					if( numConstraintsHost[cellIdx] == 0 ) 
+						continue;
+					//printf("yIdx=%d\n",yIdx);
+					const int start = offsetsHost[cellIdx];
+					int numConstraintsInCell = numConstraintsHost[cellIdx];
+					const int end = start + numConstraintsInCell;
+					SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell,maxNumBatches, 0,0,batchSizes,cellIdx);
+					task.m_solveFriction = true;
+					task.run(0);
+				}
+			}
+		}
+	} else
+	{
+		//non-batched path: one task over all n constraints
+		//NOTE(review): these tasks receive a null batch size array, which SolveTask::run reads - confirm before enabling this path
+		for(int iter=0; iter<m_nIterations; iter++)
+		{
+			SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0);
+			task.m_solveFriction = false;
+			task.run(0);
+		}
+		for(int iter=0; iter<m_nIterations; iter++)
+		{
+			SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0);
+			task.m_solveFriction = true;
+			task.run(0);
+		}
+	}
+	//write the solved state back to the device buffers
+	bodyBuf->copyFromHost(bodyNative);
+	shapeBuf->copyFromHost(shapeNative);
+	constraint->copyFromHost(constraintNative);
+	frame++;
+///Debug helper that checks one cell batch for bodies shared between cells.
+///Walks the cells of batch batchId in work group order, prints the visited cell indices, and prints "error?"
+///whenever a constraint of the current cell references a body already seen in a previously visited cell.
+///The cell index is derived from the work group index exactly as in b3Solver::solveContactConstraintHost.
+///bodyBuf and shapeBuf are not used.
+void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
+					const b3OpenCLArray<b3InertiaData>* shapeBuf,
+					b3OpenCLArray<b3GpuConstraint4>* constraint, 
+					b3OpenCLArray<unsigned int>* m_numConstraints,
+					b3OpenCLArray<unsigned int>* m_offsets,
+					int batchId
+					)
+//						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
+//						b3BufferInfoCL( m_offsets->getBufferCL() ) 
+	int cellBatch = batchId;
+	const int nn = B3_SOLVER_N_CELLS;
+	//per-cell constraint counts and start offsets
+	b3AlignedObjectArray<unsigned int> gN;
+	m_numConstraints->copyToHost(gN);
+	b3AlignedObjectArray<unsigned int> gOffsets;
+	m_offsets->copyToHost(gOffsets);
+	int nSplitX = B3_SOLVER_N_SPLIT_X;
+	int nSplitY = B3_SOLVER_N_SPLIT_Y;
+	b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
+	constraint->copyToHost(cpuConstraints);
+	printf("batch = %d\n", batchId);
+	int numWorkgroups = nn/B3_SOLVER_N_BATCHES;
+	//bodies referenced by the cells visited so far
+	b3AlignedObjectArray<int> usedBodies;
+	for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+	{
+		printf("wgIdx = %d           ", wgIdx);
+		//cell coordinates: twice the work group coordinate plus one bit of cellBatch per axis
+		int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
+		int remain = wgIdx%((nSplitX*nSplitY)/4);
+		int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
+		int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
+		int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
+		printf("cellIdx=%d\n",cellIdx);
+		if( gN[cellIdx] == 0 ) 
+			continue;
+		const int start = gOffsets[cellIdx];
+		const int end = start + gN[cellIdx];
+		//first pass: report bodies that an earlier cell already used
+		for (int c=start;c<end;c++)
+		{
+			b3GpuConstraint4& entry = cpuConstraints[c];
+			//printf("constraint (%d,%d)\n", entry.m_bodyA,entry.m_bodyB);
+			if (usedBodies.findLinearSearch(entry.m_bodyA)< usedBodies.size())
+			{
+				printf("error?\n");
+			}
+			if (usedBodies.findLinearSearch(entry.m_bodyB)< usedBodies.size())
+			{
+				printf("error?\n");
+			}
+		}
+		//second pass: record this cell's bodies only after the check, so sharing inside one cell is not reported
+		for (int c=start;c<end;c++)
+		{
+			b3GpuConstraint4& entry = cpuConstraints[c];
+			usedBodies.push_back(entry.m_bodyA);
+			usedBodies.push_back(entry.m_bodyB);
+		}
+	}
+static bool verify=false; // when true, solveContactConstraint calls checkConstraintBatch before each contact-kernel launch
+// Runs the GPU contact solver.
+// For each of m_nIterations iterations and each of the B3_SOLVER_N_BATCHES batches it
+// launches m_solveContactKernel; afterwards (applyFriction is hard-coded to true) it
+// repeats the same iteration/batch loops with m_solveFrictionKernel.
+// Kernel arguments: body, inertia, constraint, per-cell count and per-cell offset
+// buffers, then maxNumBatches, the batch index and the grid split (x,y,z).
+// additionalData is not used; n only seeds cdata.x, which is never sent to a kernel.
+void b3Solver::solveContactConstraint(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, 
+			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches)
+	b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 );
+	{
+		const int nn = B3_SOLVER_N_CELLS;
+		cdata.x = 0;
+		cdata.y = maxNumBatches;//250;
+		int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
+#ifdef DEBUG_ME
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+		{
+			B3_PROFILE("m_batchSolveKernel iterations");
+			for(int iter=0; iter<m_nIterations; iter++)
+			{
+				for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
+				{
+					if (verify)
+					{
+						checkConstraintBatch(bodyBuf,shapeBuf,constraint,m_numConstraints,m_offsets,ib);
+					}
+#ifdef DEBUG_ME
+					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+					gpuDebugInfo.write(debugInfo,numWorkItems);
+					cdata.z = ib;
+				b3LauncherCL launcher( m_queue, m_solveContactKernel ,"m_solveContactKernel");
+#if 1
+					b3BufferInfoCL bInfo[] = { 
+						b3BufferInfoCL( bodyBuf->getBufferCL() ), 
+						b3BufferInfoCL( shapeBuf->getBufferCL() ), 
+						b3BufferInfoCL( constraint->getBufferCL() ),
+						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
+						b3BufferInfoCL( m_offsets->getBufferCL() ) 
+#ifdef DEBUG_ME
+						,	b3BufferInfoCL(&gpuDebugInfo)
+						};
+                    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					//launcher.setConst(  cdata.x );
+                    launcher.setConst(  cdata.y );
+                    launcher.setConst(  cdata.z );
+                    b3Int4 nSplit;
+					nSplit.x = B3_SOLVER_N_SPLIT_X;
+					nSplit.y = B3_SOLVER_N_SPLIT_Y;
+					nSplit.z = B3_SOLVER_N_SPLIT_Z;
+                    launcher.setConst(  nSplit );
+                    launcher.launch1D( numWorkItems, 64 );
+                    // Replay path: load previously serialized kernel arguments from disk.
+                    const char* fileName = "m_batchSolveKernel.bin";
+                    FILE* f = fopen(fileName,"rb");
+                    if (f)
+                    {
+                        int sizeInBytes=0;
+                        if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
+                        {
+                            printf("error, cannot get file size\n");
+                            exit(0);
+                        }
+                        unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
+                        fread(buf,sizeInBytes,1,f);
+                        // The file content is fully in buf now; close the handle so it
+                        // is not leaked once per batch per iteration.
+                        fclose(f);
+                        int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
+                        int num = *(int*)&buf[serializedBytes];
+                        launcher.launch1D( num);
+                        //this clFinish is for testing on errors
+                        clFinish(m_queue);
+                        // Release the staging buffer only after the queue has drained,
+                        // so nothing enqueued above can still be reading from it.
+                        free(buf);
+                    }
+#ifdef DEBUG_ME
+					clFinish(m_queue);
+					gpuDebugInfo.read(debugInfo,numWorkItems);
+					clFinish(m_queue);
+					for (int i=0;i<numWorkItems;i++)
+					{
+						if (debugInfo[i].m_valInt2>0)
+						{
+							// Two arguments need two conversions: the old format had a
+							// single %d and therefore printed i instead of the value.
+							printf("debugInfo[%d].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
+						}
+						if (debugInfo[i].m_valInt3>0)
+						{
+							printf("debugInfo[%d].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
+						}
+					}
+#endif //DEBUG_ME
+				}
+			}
+			clFinish(m_queue);
+		}
+		cdata.x = 1;
+		bool applyFriction=true;
+		if (applyFriction)
+    	{
+			B3_PROFILE("m_batchSolveKernel iterations2");
+			for(int iter=0; iter<m_nIterations; iter++)
+			{
+				for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
+				{
+					cdata.z = ib;
+					b3BufferInfoCL bInfo[] = { 
+						b3BufferInfoCL( bodyBuf->getBufferCL() ), 
+						b3BufferInfoCL( shapeBuf->getBufferCL() ), 
+						b3BufferInfoCL( constraint->getBufferCL() ),
+						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
+						b3BufferInfoCL( m_offsets->getBufferCL() )
+#ifdef DEBUG_ME
+						,b3BufferInfoCL(&gpuDebugInfo)
+#endif //DEBUG_ME
+					};
+					b3LauncherCL launcher( m_queue, m_solveFrictionKernel,"m_solveFrictionKernel" );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					//launcher.setConst(  cdata.x );
+                    launcher.setConst(  cdata.y );
+                    launcher.setConst(  cdata.z );
+                    b3Int4 nSplit;
+					nSplit.x = B3_SOLVER_N_SPLIT_X;
+					nSplit.y = B3_SOLVER_N_SPLIT_Y;
+					nSplit.z = B3_SOLVER_N_SPLIT_Z;
+                    launcher.setConst(  nSplit );
+					// One work-group of 64 items per cell of the current batch.
+					launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 );
+				}
+			}
+			clFinish(m_queue);
+		}
+#ifdef DEBUG_ME
+		delete[] debugInfo;
+#endif //DEBUG_ME
+	}
+void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, // Converts nContacts contacts from contactsIn into solver constraints written to contactCOut, either on the host or with m_contactToConstraintKernel.
+	const b3OpenCLArray<b3InertiaData>* shapeBuf, 
+	b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, // additionalData is not referenced in this function
+	int nContacts, const ConstraintCfg& cfg ) // cfg supplies m_dt, m_positionDrift and m_positionConstraintCoeff
+	b3OpenCLArray<b3GpuConstraint4>* constraintNative =0; // never used below
+	contactCOut->resize(nContacts); // one output constraint per input contact
+	struct CB // scalar parameters shared by the CPU and GPU paths
+	{
+		int m_nContacts;
+		float m_dt;
+		float m_positionDrift;
+		float m_positionConstraintCoeff;
+	};
+	{
+		CB cdata;
+		cdata.m_nContacts = nContacts;
+		cdata.m_dt = cfg.m_dt;
+		cdata.m_positionDrift = cfg.m_positionDrift;
+		cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
+		if (gConvertConstraintOnCpu) // host path: download inputs, convert in a loop, upload the result
+		{
+			b3AlignedObjectArray<b3RigidBodyData> gBodies;
+		bodyBuf->copyToHost(gBodies);
+		b3AlignedObjectArray<b3Contact4> gContact;
+		contactsIn->copyToHost(gContact);
+		b3AlignedObjectArray<b3InertiaData> gShapes;
+		shapeBuf->copyToHost(gShapes);
+		b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
+		gConstraintOut.resize(nContacts);
+			B3_PROFILE("cpu contactToConstraintKernel");
+			for (int gIdx=0;gIdx<nContacts;gIdx++)
+			{
+				int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); // magnitude is used as the body index; presumably the sign carries a flag -- TODO confirm
+				int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
+				b3Float4 posA = gBodies[aIdx].m_pos;
+				b3Float4 linVelA = gBodies[aIdx].m_linVel;
+				b3Float4 angVelA = gBodies[aIdx].m_angVel;
+				float invMassA = gBodies[aIdx].m_invMass;
+				b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia; // inertia data is indexed with the same body index
+				b3Float4 posB = gBodies[bIdx].m_pos;
+				b3Float4 linVelB = gBodies[bIdx].m_linVel;
+				b3Float4 angVelB = gBodies[bIdx].m_angVel;
+				float invMassB = gBodies[bIdx].m_invMass;
+				b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;
+				b3ContactConstraint4_t cs;
+    			setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
+					&gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
+					&cs );
+				cs.m_batchIdx = gContact[gIdx].m_batchIdx; // carry the contact's batch index over to the constraint
+				gConstraintOut[gIdx] = (b3GpuConstraint4&)cs; // NOTE(review): reinterpreting cast; assumes both structs share a layout -- confirm
+			}
+			contactCOut->copyFromHost(gConstraintOut);
+		} else
+		{
+			B3_PROFILE("gpu m_contactToConstraintKernel");
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL()),
+				b3BufferInfoCL( contactCOut->getBufferCL() )};
+			b3LauncherCL launcher( m_queue, m_contactToConstraintKernel,"m_contactToConstraintKernel" );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			//launcher.setConst(  cdata );
+			launcher.setConst(cdata.m_nContacts); // the CB fields are passed as four separate scalar arguments
+			launcher.setConst(cdata.m_dt);
+			launcher.setConst(cdata.m_positionDrift);
+			launcher.setConst(cdata.m_positionConstraintCoeff);
+			launcher.launch1D( nContacts, 64 );	
+			clFinish(m_queue); // blocks until the kernel has finished
+		}
+	}
+void b3Solver::sortContacts(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, // Sorts contacts by cell index (steps 2-5 below); does nothing unless cfg.m_enableParallelSolve is set. NOTE(review): no sortContacts declaration is visible in b3Solver.h in this patch -- confirm this function is actually compiled.
+			b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData, // additionalData is not referenced in this function
+			int nContacts, const b3Solver::ConstraintCfg& cfg )
+	const int sortAlignment = 512; // todo. get this out of sort
+	if( cfg.m_enableParallelSolve )
+	{
+		int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment ); // NOTE(review): the macro visible in b3Solver.h is named B3NEXTMULTIPLEOF -- confirm NEXTMULTIPLEOF exists
+		b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
+		b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
+		{	//	2. set cell idx
+			struct CB
+			{
+				int m_nContacts;
+				int m_staticIdx;
+				float m_scale;
+				int m_nSplit;
+			};
+			b3Assert( sortSize%64 == 0 ); // the launch below uses a work-group size of 64
+			CB cdata;
+			cdata.m_nContacts = nContacts;
+			cdata.m_staticIdx = cfg.m_staticIdx;
+			cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent); // NOTE(review): m_averageExtent is not among the ConstraintCfg members visible in b3Solver.h -- confirm
+			cdata.m_nSplit = B3_SOLVER_N_SPLIT;
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
+			b3LauncherCL launcher( m_queue, m_setSortDataKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(  cdata );
+			launcher.launch1D( sortSize, 64 ); // launched over the padded size, not nContacts
+		}
+		{	//	3. sort by cell idx
+			int sortBit = 32; // unused: execute() below is not given a bit count
+			//if( n <= 0xffff ) sortBit = 16;
+			//if( n <= 0xff ) sortBit = 8;
+			m_sort32->execute(*m_sortDataBuffer,sortSize);
+		}
+		{	//	4. find entries
+			m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT); // per-cell counts into m_numConstraints
+			m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT ); // scan of the counts into m_offsets
+		}
+		{	//	5. sort constraints by cellIdx
+			//	todo. preallocate this
+//			b3Assert( contactsIn->getType() == TYPE_HOST );
+//			b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn );	//	copying contacts to this buffer
+			{
+				b3Int4 cdata; cdata.x = nContacts; // NOTE(review): y, z and w are left uninitialised but the whole struct is passed to setConst
+				b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
+				b3LauncherCL launcher( m_queue, m_reorderContactKernel );
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst(  cdata );
+				launcher.launch1D( nContacts, 64 );
+			}
+//			BufferUtils::unmap<true>( out, contactsIn, nContacts );
+		}
+	}
+void	b3Solver::batchContacts(  b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx ) // Launches the batching kernel: one 64-item work-group per solver cell; results go to m_contactBuffer2 and m_batchSizes.
+	int numWorkItems = 64*B3_SOLVER_N_CELLS;
+	{
+		B3_PROFILE("batch generation");
+		b3Int4 cdata; // only feeds the commented-out setConst( cdata ) below
+		cdata.x = nContacts;
+		cdata.y = 0;
+		cdata.z = staticIdx;
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems]; // NOTE(review): this and the next three lines look like debug-only code (the closing "#endif //BATCH_DEBUG" below has no visible opening #ifdef) -- confirm
+		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+		memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+		gpuDebugInfo.write(debugInfo,numWorkItems);
+		b3BufferInfoCL bInfo[] = { // not used by the launcher below, which binds its buffers one by one
+			b3BufferInfoCL( contacts->getBufferCL() ), 
+			b3BufferInfoCL(  m_contactBuffer2->getBufferCL()),
+			b3BufferInfoCL( nNative->getBufferCL() ), 
+			b3BufferInfoCL( offsetsNative->getBufferCL() ),
+			,	b3BufferInfoCL(&gpuDebugInfo) // NOTE(review): together with the trailing comma on the previous line this yields ",," -- would not compile if this line is enabled
+		};
+		{
+			m_batchSizes.resize(nNative->size()); // one batch-size slot per entry of nNative
+			B3_PROFILE("batchingKernel");
+			//b3LauncherCL launcher( m_queue, m_batchingKernel);
+			cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
+			b3LauncherCL launcher( m_queue, k,"*batchingKernel");
+			if (!useNewBatchingKernel ) // only the old kernel receives the input contact buffer as its first argument
+			{
+				launcher.setBuffer( contacts->getBufferCL() );
+			}
+			launcher.setBuffer( m_contactBuffer2->getBufferCL() );
+			launcher.setBuffer( nNative->getBufferCL());
+			launcher.setBuffer( offsetsNative->getBufferCL());
+			launcher.setBuffer(m_batchSizes.getBufferCL());
+			//launcher.setConst(  cdata );
+            launcher.setConst(staticIdx);
+			launcher.launch1D( numWorkItems, 64 );
+			//clFinish(m_queue);
+			//b3AlignedObjectArray<int> batchSizesCPU;
+			//m_batchSizes.copyToHost(batchSizesCPU);
+			//printf(".\n");
+		}
+	aaaa // NOTE(review): not valid C++; presumably a deliberate compile break inside a disabled debug section -- confirm
+		b3Contact4* hostContacts = new b3Contact4[nContacts]; // NOTE(review): no matching delete[] is visible in this function
+		m_contactBuffer->read(hostContacts,nContacts);
+		clFinish(m_queue);
+		gpuDebugInfo.read(debugInfo,numWorkItems);
+		clFinish(m_queue);
+		for (int i=0;i<numWorkItems;i++) // report any non-zero debug counter written by the kernel
+		{
+			if (debugInfo[i].m_valInt1>0)
+			{
+				printf("catch\n");
+			}
+			if (debugInfo[i].m_valInt2>0)
+			{
+				printf("catch22\n");
+			}
+			if (debugInfo[i].m_valInt3>0)
+			{
+				printf("catch666\n");
+			}
+			if (debugInfo[i].m_valInt4>0)
+			{
+				printf("catch777\n");
+			}
+		}
+		delete[] debugInfo;
+#endif //BATCH_DEBUG
+	}
+//	copy buffer to buffer
+	//b3Assert(m_contactBuffer->size()==nContacts);
+	//contacts->copyFromOpenCLArray( *m_contactBuffer);
+	//clFinish(m_queue);//needed?
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/b3Solver.h b/src/bullet/Bullet3OpenCL/RigidBody/b3Solver.h
new file mode 100644
index 00000000..b37f2f1b
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/b3Solver.h
@@ -0,0 +1,126 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+#ifndef __ADL_SOLVER_H
+#define __ADL_SOLVER_H
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "b3GpuConstraint4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#define B3NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment)) // rounds a non-negative num up to a multiple of alignment; both arguments are evaluated more than once
+	B3_SOLVER_N_SPLIT_X = 8,//16,//4, -- cell-grid split along x (passed to the solver kernels as nSplit.x)
+	B3_SOLVER_N_SPLIT_Y = 4,//16,//4, -- cell-grid split along y (nSplit.y)
+	B3_SOLVER_N_SPLIT_Z = 8,//, -- cell-grid split along z (nSplit.z)
+	B3_SOLVER_N_BATCHES = 8,//4,//8,//4, -- number of batches the solver loops over per iteration
+// Base class that only carries the configuration type shared by the solver entry points.
+class b3SolverBase
+	public:
+		// Parameters for contact-to-constraint conversion and batching.
+		struct ConstraintCfg
+		{
+			// Every member is initialised so that a default-constructed config never
+			// exposes indeterminate values (m_enableParallelSolve and m_batchCellSize
+			// used to be left uninitialised). Callers that need the parallel path or
+			// a specific cell size must still set those two members explicitly.
+			ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt), m_enableParallelSolve(false), m_batchCellSize(0.f), m_staticIdx(-1) {}
+			float m_positionDrift;				// forwarded to the constraint setup
+			float m_positionConstraintCoeff;	// forwarded to the constraint setup
+			float m_dt;							// time step, forwarded to the constraint setup
+			bool m_enableParallelSolve;			// gates the cell-sorting path
+			float m_batchCellSize;				// not read by the solver code in this patch section
+			int m_staticIdx;					// body index treated as static by the batching kernel; -1 by default
+		};
+class b3Solver : public b3SolverBase // GPU contact solver: owns the OpenCL kernels and helper buffers used for batching, constraint setup and solving
+	public:
+		cl_context m_context;
+		cl_device_id m_device;
+		cl_command_queue m_queue; // queue every kernel in this class is launched on
+		b3OpenCLArray<unsigned int>* m_numConstraints; // per-cell constraint counts, indexed by cell index
+		b3OpenCLArray<unsigned int>* m_offsets; // per-cell start offsets into the constraint array
+		b3OpenCLArray<int> m_batchSizes; // written by the batching kernel, resized to the number of count entries
+		int m_nIterations; // number of solver iterations per solve call
+		cl_kernel m_batchingKernel;
+		cl_kernel m_batchingKernelNew; // used instead of m_batchingKernel when useNewBatchingKernel is set
+		cl_kernel m_solveContactKernel;
+		cl_kernel m_solveFrictionKernel;
+		cl_kernel m_contactToConstraintKernel;
+		cl_kernel m_setSortDataKernel;
+		cl_kernel m_reorderContactKernel;
+		cl_kernel m_copyConstraintKernel;
+		class b3RadixSort32CL*	m_sort32;
+		class b3BoundSearchCL*	m_search;
+		class b3PrefixScanCL*	m_scan;
+		b3OpenCLArray<b3SortData>* m_sortDataBuffer;
+		b3OpenCLArray<b3Contact4>* m_contactBuffer2; // output buffer of the batching kernel
+		enum
+		{
+		};
+		b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
+		virtual ~b3Solver();
+		void solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* inertiaBuf, // GPU solve: contact kernel passes followed by friction kernel passes
+			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches);
+		void solveContactConstraintHost(  b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, // host variant: copies the buffers back with copyFromHost after solving
+			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, b3AlignedObjectArray<int>* batchSizes);
+		void convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, // builds one constraint per contact into contactCOut
+			const b3OpenCLArray<b3InertiaData>* shapeBuf, 
+			b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, 
+			int nContacts, const ConstraintCfg& cfg );
+		void	batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx ); // launches the batching kernel, one work-group per cell
+#endif //__ADL_SOLVER_H
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl b/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl
new file mode 100644
index 00000000..3b891b86
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl
@@ -0,0 +1,353 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#define counter32_t volatile __global int* // counter type alias: a plain global int pointer
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+#define GET_GROUP_IDX get_group_id(0) // all index helpers use dimension 0 only
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define AtomInc(x) atom_inc(&(x)) // takes an lvalue, not a pointer
+#define AtomInc1(x, out) out = atom_inc(&(x)) // out receives the value before the increment
+#define AppendInc(x, out) out = atomic_inc(x) // takes a pointer, unlike AtomInc1
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+#define make_float4 (float4) // make_float4(a,b,c,d) expands to the vector literal (float4)(a,b,c,d)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+#define max2 max
+#define min2 min
+#define WG_SIZE 64 // stride of the per-work-item loops in CreateBatches; the host launches it with a local size of 64
+typedef struct // ConstBuffer: not referenced by CreateBatches, which takes m_staticIdx as a kernel argument instead
+	int m_n;
+	int m_start;
+	int m_staticIdx;
+	int m_paddings[1];
+} ConstBuffer;
+typedef struct // ring entry used by CreateBatches (referred to there as Elem)
+	int m_a; // smaller of the two body values of the constraint
+	int m_b; // larger of the two body values of the constraint
+	u32 m_idx; // constraint index relative to the start of the cell
+#define STACK_SIZE (WG_SIZE*10) // capacity of the stack of constraints accepted into the current batch
+//#define STACK_SIZE (WG_SIZE)
+#define RING_SIZE 1024 // capacity of the ring of constraints still waiting for a batch
+#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd) // free ring slots; relies on a variable named ldsRingEnd being in scope
+#define RING_END ldsTmp // write cursor used while the ring is compacted; aliases the local variable ldsTmp
+u32 readBuf(__local u32* buff, int idx) // Tests bit idx of a bit set stored in buff; returns non-zero (the masked bit, not 0/1) when it is set.
+	idx = idx % (32*CHECK_SIZE); // wrap into the bit set's capacity, so different idx values can alias; CHECK_SIZE is defined outside this view
+	int bitIdx = idx%32;
+	int bufIdx = idx/32;
+	return buff[bufIdx] & (1<<bitIdx);
+void writeBuf(__local u32* buff, int idx) // Sets bit idx of the bit set stored in buff using an atomic OR.
+	idx = idx % (32*CHECK_SIZE); // same wrap-around as readBuf
+	int bitIdx = idx%32;
+	int bufIdx = idx/32;
+//	buff[bufIdx] |= (1<<bitIdx);
+	atom_or( &buff[bufIdx], (1<<bitIdx) ); // atomic replacement for the plain OR kept commented out above
+u32 tryWrite(__local u32* buff, int idx) // Atomically sets bit idx; returns 1 if the bit was clear before the call, 0 if it was already set.
+	idx = idx % (32*CHECK_SIZE); // same wrap-around as readBuf
+	int bitIdx = idx%32;
+	int bufIdx = idx/32;
+	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) ); // atom_or yields the word as it was before the OR
+	return ((ans >> bitIdx)&1) == 0;
+//	batching on the GPU
+__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut, // One work-group per cell: copies the cell's contacts to gConstraintsOut grouped into batches and stores a batch index in each m_batchIdx.
+		__global const u32* gN, __global const u32* gStart, __global int* batchSizes, // gN/gStart: per-cell contact count and start offset; batchSizes: one output value per cell
+		int m_staticIdx ) // body index that is treated as static (never claimed in the bit sets)
+	__local u32 ldsStackIdx[STACK_SIZE]; // indices of contacts accepted into the current batch
+	__local u32 ldsStackEnd;
+	__local Elem ldsRingElem[RING_SIZE]; // contacts still waiting to be assigned to a batch
+	__local u32 ldsRingEnd;
+	__local u32 ldsTmp; // accessed through the RING_END macro
+	__local u32 ldsCheckBuffer[CHECK_SIZE]; // bodies claimed during the current ring slice
+	__local u32 ldsFixedBuffer[CHECK_SIZE]; // bodies already used by the current batch
+	__local u32 ldsGEnd; // number of contacts pulled from global memory so far
+	__local u32 ldsDstEnd; // next write position in gConstraintsOut
+	int wgIdx = GET_GROUP_IDX;
+	int lIdx = GET_LOCAL_IDX;
+	const int m_n = gN[wgIdx];
+	const int m_start = gStart[wgIdx];
+	if( lIdx == 0 )
+	{
+		ldsRingEnd = 0;
+		ldsGEnd = 0;
+		ldsStackEnd = 0;
+		ldsDstEnd = m_start; // output for this cell starts at the cell's input offset
+	}
+//	while(1)
+//was 250
+	int ie=0; // current batch index
+	int maxBatch = 0; // private to each work item
+	for(ie=0; ie<50; ie++) // at most 50 regular batches
+	{
+		ldsFixedBuffer[lIdx] = 0; // each work item clears one word, i.e. the first WG_SIZE words
+		for(int giter=0; giter<4; giter++) // up to four fill/assign passes per batch
+		{
+			int ringCap = GET_RING_CAPACITY;
+			//	1. fill ring
+			if( ldsGEnd < m_n )
+			{
+				while( ringCap > WG_SIZE )
+				{
+					if( ldsGEnd >= m_n ) break;
+					if( lIdx < ringCap - WG_SIZE )
+					{
+						int srcIdx;
+						AtomInc1( ldsGEnd, srcIdx ); // claim the next unread contact of this cell
+						if( srcIdx < m_n )
+						{
+							int dstIdx;
+							AtomInc1( ldsRingEnd, dstIdx );
+							int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;
+							int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;
+							ldsRingElem[dstIdx].m_a = (a>b)? b:a; // store the pair ordered: m_a <= m_b
+							ldsRingElem[dstIdx].m_b = (a>b)? a:b;
+							ldsRingElem[dstIdx].m_idx = srcIdx;
+						}
+					}
+					ringCap = GET_RING_CAPACITY;
+				}
+			}
+			//	2. fill stack
+			__local Elem* dst = ldsRingElem; // rejected entries are written back into the same ring
+			if( lIdx == 0 ) RING_END = 0;
+			int srcIdx=lIdx;
+			int end = ldsRingEnd;
+			{
+				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE) // walk the ring in slices of WG_SIZE entries
+				{
+					Elem e;
+					if(srcIdx<end) e = ldsRingElem[srcIdx];
+					bool done = (srcIdx<end)?false:true;
+					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0; // NOTE(review): indexes with lIdx rather than the loop variable i -- only equivalent if CHECK_SIZE <= WG_SIZE; confirm
+					if( !done )
+					{
+						int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));
+						int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));
+						if( aUsed==0 && bUsed==0 ) // neither body is used by the current batch yet
+						{
+							int aAvailable=1;
+							int bAvailable=1;
+							int ea = abs(e.m_a);
+							int eb = abs(e.m_b);
+							bool aStatic = (e.m_a<0) ||(ea==m_staticIdx); // negative value or the static index: no claim needed
+							bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);
+							if (!aStatic)
+								aAvailable = tryWrite( ldsCheckBuffer, ea ); // fails if another work item claimed this body in this slice
+							if (!bStatic)
+								bAvailable = tryWrite( ldsCheckBuffer, eb );
+							//aAvailable = aStatic? 1: aAvailable;
+							//bAvailable = bStatic? 1: bAvailable;
+							bool success = (aAvailable && bAvailable);
+							if(success)
+							{
+								if (!aStatic)
+									writeBuf( ldsFixedBuffer, ea ); // mark the body as used for the rest of this batch
+								if (!bStatic)
+									writeBuf( ldsFixedBuffer, eb );
+							}
+							done = success;
+						}
+					}
+					//	put it aside
+					if(srcIdx<end)
+					{
+						if( done )
+						{
+							int dstIdx; AtomInc1( ldsStackEnd, dstIdx );
+							if( dstIdx < STACK_SIZE )
+								ldsStackIdx[dstIdx] = e.m_idx;
+							else{
+								done = false; // stack is full: undo the reservation and keep the entry in the ring
+								AtomAdd( ldsStackEnd, -1 );
+							}
+						}
+						if( !done )
+						{
+							int dstIdx; AtomInc1( RING_END, dstIdx );
+							dst[dstIdx] = e; // re-queue for a later pass or batch
+						}
+					}
+					//	if filled, flush
+					if( ldsStackEnd == STACK_SIZE )
+					{
+						for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)
+						{
+							int idx = m_start + ldsStackIdx[i];
+							int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+							gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+							gConstraintsOut[ dstIdx ].m_batchIdx = ie;
+						}
+						if( lIdx == 0 ) ldsStackEnd = 0;
+						//for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) 
+						ldsFixedBuffer[lIdx] = 0; // the used-body set is cleared although ie is unchanged
+					}
+				}
+			}
+			if( lIdx == 0 ) ldsRingEnd = RING_END; // the ring now holds only the rejected entries
+		}
+		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE) // flush what the four passes accepted as batch ie
+		{
+			int idx = m_start + ldsStackIdx[i];
+			int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+			gConstraintsOut[ dstIdx ].m_batchIdx = ie;
+		}
+		//	in case it couldn't consume any pair. Flush them
+		//	todo. Serial batch worth while?
+		if( ldsStackEnd == 0 )
+		{
+			for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)
+			{
+				int idx = m_start + ldsRingElem[i].m_idx;
+				int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+				gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+				int curBatch = 100+i; // each leftover contact gets its own batch index, starting at 100
+				if (maxBatch < curBatch)
+					maxBatch = curBatch;
+				gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;
+			}
+			if( lIdx == 0 ) ldsRingEnd = 0;
+		}
+		if( lIdx == 0 ) ldsStackEnd = 0;
+		//	termination
+		if( ldsGEnd == m_n && ldsRingEnd == 0 ) // every contact was read and none is left waiting
+			break;
+	}
+	if( lIdx == 0 )
+	{
+		if (maxBatch < ie)
+			maxBatch=ie;
+		batchSizes[wgIdx]=maxBatch; // NOTE(review): maxBatch is private, so this is work item 0's maximum only; other work items may have assigned larger 100+i indices -- confirm intended
+	}
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h
new file mode 100644
index 00000000..150eedc9
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h
@@ -0,0 +1,388 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* batchingKernelsCL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Takahiro Harada\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#define counter32_t volatile __global int*\n"
+"typedef unsigned int u32;\n"
+"typedef unsigned short u16;\n"
+"typedef unsigned char u8;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"#define WG_SIZE 64\n"
+"typedef struct \n"
+"	int m_n;\n"
+"	int m_start;\n"
+"	int m_staticIdx;\n"
+"	int m_paddings[1];\n"
+"} ConstBuffer;\n"
+"typedef struct \n"
+"	int m_a;\n"
+"	int m_b;\n"
+"	u32 m_idx;\n"
+"#define STACK_SIZE (WG_SIZE*10)\n"
+"//#define STACK_SIZE (WG_SIZE)\n"
+"#define RING_SIZE 1024\n"
+"#define RING_SIZE_MASK (RING_SIZE-1)\n"
+"#define CHECK_SIZE (WG_SIZE)\n"
+"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
+"#define RING_END ldsTmp\n"
+"u32 readBuf(__local u32* buff, int idx)\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	return buff[bufIdx] & (1<<bitIdx);\n"
+"void writeBuf(__local u32* buff, int idx)\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"//	buff[bufIdx] |= (1<<bitIdx);\n"
+"	atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
+"u32 tryWrite(__local u32* buff, int idx)\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
+"	return ((ans >> bitIdx)&1) == 0;\n"
+"//	batching on the GPU\n"
+"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n"
+"		__global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n"
+"		int m_staticIdx )\n"
+"	__local u32 ldsStackIdx[STACK_SIZE];\n"
+"	__local u32 ldsStackEnd;\n"
+"	__local Elem ldsRingElem[RING_SIZE];\n"
+"	__local u32 ldsRingEnd;\n"
+"	__local u32 ldsTmp;\n"
+"	__local u32 ldsCheckBuffer[CHECK_SIZE];\n"
+"	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
+"	__local u32 ldsGEnd;\n"
+"	__local u32 ldsDstEnd;\n"
+"	int wgIdx = GET_GROUP_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	\n"
+"	const int m_n = gN[wgIdx];\n"
+"	const int m_start = gStart[wgIdx];\n"
+"		\n"
+"	if( lIdx == 0 )\n"
+"	{\n"
+"		ldsRingEnd = 0;\n"
+"		ldsGEnd = 0;\n"
+"		ldsStackEnd = 0;\n"
+"		ldsDstEnd = m_start;\n"
+"	}\n"
+"	\n"
+"	\n"
+"	\n"
+"//	while(1)\n"
+"//was 250\n"
+"	int ie=0;\n"
+"	int maxBatch = 0;\n"
+"	for(ie=0; ie<50; ie++)\n"
+"	{\n"
+"		ldsFixedBuffer[lIdx] = 0;\n"
+"		for(int giter=0; giter<4; giter++)\n"
+"		{\n"
+"			int ringCap = GET_RING_CAPACITY;\n"
+"		\n"
+"			//	1. fill ring\n"
+"			if( ldsGEnd < m_n )\n"
+"			{\n"
+"				while( ringCap > WG_SIZE )\n"
+"				{\n"
+"					if( ldsGEnd >= m_n ) break;\n"
+"					if( lIdx < ringCap - WG_SIZE )\n"
+"					{\n"
+"						int srcIdx;\n"
+"						AtomInc1( ldsGEnd, srcIdx );\n"
+"						if( srcIdx < m_n )\n"
+"						{\n"
+"							int dstIdx;\n"
+"							AtomInc1( ldsRingEnd, dstIdx );\n"
+"							\n"
+"							int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n"
+"							int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n"
+"							ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
+"							ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
+"							ldsRingElem[dstIdx].m_idx = srcIdx;\n"
+"						}\n"
+"					}\n"
+"					ringCap = GET_RING_CAPACITY;\n"
+"				}\n"
+"			}\n"
+"	\n"
+"			//	2. fill stack\n"
+"			__local Elem* dst = ldsRingElem;\n"
+"			if( lIdx == 0 ) RING_END = 0;\n"
+"			int srcIdx=lIdx;\n"
+"			int end = ldsRingEnd;\n"
+"			{\n"
+"				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
+"				{\n"
+"					Elem e;\n"
+"					if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
+"					bool done = (srcIdx<end)?false:true;\n"
+"					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
+"					\n"
+"					if( !done )\n"
+"					{\n"
+"						int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n"
+"						int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n"
+"						if( aUsed==0 && bUsed==0 )\n"
+"						{\n"
+"							int aAvailable=1;\n"
+"							int bAvailable=1;\n"
+"							int ea = abs(e.m_a);\n"
+"							int eb = abs(e.m_b);\n"
+"							bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n"
+"							bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n"
+"							\n"
+"							if (!aStatic)\n"
+"								aAvailable = tryWrite( ldsCheckBuffer, ea );\n"
+"							if (!bStatic)\n"
+"								bAvailable = tryWrite( ldsCheckBuffer, eb );\n"
+"							\n"
+"							//aAvailable = aStatic? 1: aAvailable;\n"
+"							//bAvailable = bStatic? 1: bAvailable;\n"
+"							bool success = (aAvailable && bAvailable);\n"
+"							if(success)\n"
+"							{\n"
+"							\n"
+"								if (!aStatic)\n"
+"									writeBuf( ldsFixedBuffer, ea );\n"
+"								if (!bStatic)\n"
+"									writeBuf( ldsFixedBuffer, eb );\n"
+"							}\n"
+"							done = success;\n"
+"						}\n"
+"					}\n"
+"					//	put it aside\n"
+"					if(srcIdx<end)\n"
+"					{\n"
+"						if( done )\n"
+"						{\n"
+"							int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n"
+"							if( dstIdx < STACK_SIZE )\n"
+"								ldsStackIdx[dstIdx] = e.m_idx;\n"
+"							else{\n"
+"								done = false;\n"
+"								AtomAdd( ldsStackEnd, -1 );\n"
+"							}\n"
+"						}\n"
+"						if( !done )\n"
+"						{\n"
+"							int dstIdx; AtomInc1( RING_END, dstIdx );\n"
+"							dst[dstIdx] = e;\n"
+"						}\n"
+"					}\n"
+"					//	if filled, flush\n"
+"					if( ldsStackEnd == STACK_SIZE )\n"
+"					{\n"
+"						for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n"
+"						{\n"
+"							int idx = m_start + ldsStackIdx[i];\n"
+"							int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"							gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"							gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
+"						}\n"
+"						if( lIdx == 0 ) ldsStackEnd = 0;\n"
+"						//for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
+"						ldsFixedBuffer[lIdx] = 0;\n"
+"					}\n"
+"				}\n"
+"			}\n"
+"			if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
+"		}\n"
+"		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
+"		{\n"
+"			int idx = m_start + ldsStackIdx[i];\n"
+"			int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"			gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
+"		}\n"
+"		//	in case it couldn't consume any pair. Flush them\n"
+"		//	todo. Serial batch worth while?\n"
+"		if( ldsStackEnd == 0 )\n"
+"		{\n"
+"			for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n"
+"			{\n"
+"				int idx = m_start + ldsRingElem[i].m_idx;\n"
+"				int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"				gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"				int curBatch = 100+i;\n"
+"				if (maxBatch < curBatch)\n"
+"					maxBatch = curBatch;\n"
+"				\n"
+"				gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n"
+"				\n"
+"			}\n"
+"			if( lIdx == 0 ) ldsRingEnd = 0;\n"
+"		}\n"
+"		if( lIdx == 0 ) ldsStackEnd = 0;\n"
+"		//	termination\n"
+"		if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
+"			break;\n"
+"	}\n"
+"	if( lIdx == 0 )\n"
+"	{\n"
+"		if (maxBatch < ie)\n"
+"			maxBatch=ie;\n"
+"		batchSizes[wgIdx]=maxBatch;\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl b/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl
new file mode 100644
index 00000000..ba1b66d2
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl
@@ -0,0 +1,231 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#define counter32_t volatile __global int*
+#define SIMD_WIDTH 64	// CreateBatchesNew resets its body-usage flags each time it has accepted this many constraints
+typedef unsigned int u32;	// short fixed-purpose aliases used throughout the kernels
+typedef unsigned short u16;
+typedef unsigned char u8;
+#define GET_GROUP_IDX get_group_id(0)	// the GET_* macros query dimension 0 only
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define AtomInc(x) atom_inc(&(x))	// x is an lvalue; the macro takes its address
+#define AtomInc1(x, out) out = atom_inc(&(x))	// as AtomInc, and assigns atom_inc's result to out; expands to a bare assignment, so use it as a full statement
+#define AppendInc(x, out) out = atomic_inc(x)	// unlike AtomInc1, x is passed through unchanged, i.e. it must already be a pointer
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )	// forwards to select() with the arguments in the same order
+#define make_float4 (float4)	// make_*(...) expands to a vector literal, e.g. make_float4(a,b,c,d) -> (float4)(a,b,c,d)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+#define max2 max	// plain aliases for the built-in max / min
+#define min2 min
+#define WG_SIZE 64	// NOTE(review): presumably has to match the work-group size used when the kernels are enqueued - confirm on the host side
+typedef struct // NOTE(review): ConstBuffer is not referenced by the kernels visible in this file, which take these values as kernel arguments instead - possibly a leftover, confirm before removing
+	int m_n;	// same name as the per-group constraint count the kernels read from gN
+	int m_start;	// same name as the per-group start offset the kernels read from gStart
+	int m_staticIdx;	// same name as the static-body index passed to the kernels
+	int m_paddings[1];	// brings the struct to four ints; presumably for 16-byte alignment - confirm
+} ConstBuffer;
+typedef struct // NOTE(review): the closing line that names this type is not visible in this hunk (presumably Elem - confirm); the type is not used by the kernels visible in this file
+	int m_a;	// NOTE(review): presumably one body id of a constraint's body pair - confirm against the code that fills this struct
+	int m_b;	// NOTE(review): presumably the other body id of the pair - confirm
+	u32 m_idx;	// NOTE(review): presumably the constraint's index relative to its group's start - confirm
+//	Brute-force batching on the GPU: work-item 0 of each work-group gives every constraint in that group's range its own batch index (no conflict analysis is done).
+__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, 	__global const u32* gN, __global const u32* gStart, int m_staticIdx )	// m_staticIdx is not read by this kernel
+	int wgIdx = GET_GROUP_IDX;	// work-group index; selects this group's entry in gN and gStart
+	int lIdx = GET_LOCAL_IDX;
+	const int m_n = gN[wgIdx];	// number of constraints this work-group writes
+	const int m_start = gStart[wgIdx];	// index of the group's first constraint in gConstraints
+	if( lIdx == 0 )	// serial: only the first work-item of the group does any work
+	{
+		for (int i=0;i<m_n;i++)
+		{
+			int srcIdx = i+m_start;
+			int batchIndex = i;	// batch index == offset inside the group's range, so no two constraints of a group share a batch
+			gConstraints[ srcIdx ].m_batchIdx = batchIndex;	
+		}
+	}
+u32 readBuf(__local u32* buff, int idx)	// Tests bit idx of the bit set stored in buff. Returns the masked bit itself (non-zero if set, 0 if clear), not a normalised 0/1.
+	idx = idx % (32*CHECK_SIZE);	// wrap into the 32*CHECK_SIZE bits of the buffer; indices that differ by a multiple of that share one bit
+	int bitIdx = idx%32;	// bit position inside the 32-bit word
+	int bufIdx = idx/32;	// index of the word that holds the bit
+	return buff[bufIdx] & (1<<bitIdx);
+void writeBuf(__local u32* buff, int idx)	// Sets bit idx of the bit set stored in buff using a plain, non-atomic read-modify-write.
+	idx = idx % (32*CHECK_SIZE);	// wrap into the 32*CHECK_SIZE bits of the buffer; indices that differ by a multiple of that share one bit
+	int bitIdx = idx%32;	// bit position inside the 32-bit word
+	int bufIdx = idx/32;	// index of the word that holds the bit
+	buff[bufIdx] |= (1<<bitIdx);	// not atomic: concurrent writers to the same word could lose updates; the caller visible in this file (CreateBatchesNew) only calls this from work-item 0
+	//atom_or( &buff[bufIdx], (1<<bitIdx) );
+u32 tryWrite(__local u32* buff, int idx)	// Atomically sets bit idx of the bit set in buff. Returns 1 if the bit was clear before this call (the caller claimed it), 0 if it was already set. Not called by the kernels visible in this file.
+	idx = idx % (32*CHECK_SIZE);	// wrap into the 32*CHECK_SIZE bits of the buffer; indices that differ by a multiple of that share one bit
+	int bitIdx = idx%32;	// bit position inside the 32-bit word
+	int bufIdx = idx/32;	// index of the word that holds the bit
+	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );	// ans is treated as the word's value from before the OR
+	return ((ans >> bitIdx)&1) == 0;	// previous bit was 0 -> this call is the one that set it
+//	Greedy batching on the GPU: work-item 0 of each work-group reorders the group's constraints in place and assigns m_batchIdx so that accepted constraints form a prefix ordered by batch; the number of batches is written to batchSizes[group].
+__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )	// staticIdx: a body index that is treated as static in addition to negative ones
+	int wgIdx = GET_GROUP_IDX;	// work-group index; selects this group's entry in gN, gStart and batchSizes
+	int lIdx = GET_LOCAL_IDX;
+	const int numConstraints = gN[wgIdx];	// number of constraints this work-group processes
+	const int m_start = gStart[wgIdx];	// index of the group's first constraint in gConstraints
+	b3Contact4Data_t tmp;	// scratch copy used when swapping two constraints
+	__local u32 ldsFixedBuffer[CHECK_SIZE];	// bit set (accessed through readBuf/writeBuf) of the bodies already used since the last clear
+	if( lIdx == 0 )	// serial: only the first work-item of the group does any work
+	{
+		__global struct b3Contact4Data* cs = &gConstraints[m_start];	
+		int numValidConstraints = 0;	// constraints already assigned a batch; they occupy cs[0 .. numValidConstraints-1]
+		int batchIdx = 0;	// batch being filled by the current pass
+		while( numValidConstraints < numConstraints)	// one pass per batch; the first constraint tested after a clear is always accepted, so every pass makes progress
+		{
+			int nCurrentBatch = 0;	// constraints accepted since the flags were last cleared
+			//	clear flag
+			for(int i=0; i<CHECK_SIZE; i++) 
+				ldsFixedBuffer[i] = 0;		
+			for(int i=numValidConstraints; i<numConstraints; i++)	// scan every constraint that has no batch yet
+			{
+				int bodyAS = cs[i].m_bodyAPtrAndSignBit;	// signed body references
+				int bodyBS = cs[i].m_bodyBPtrAndSignBit;
+				int bodyA = abs(bodyAS);	// magnitudes are used as bit indices into ldsFixedBuffer
+				int bodyB = abs(bodyBS);
+				bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;	// negative reference or staticIdx: treated as static, which never blocks a constraint and is never flagged
+				bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;
+				int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);	// non-zero if the body's bit is already set; readBuf wraps the index, so different bodies can share a bit
+				int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);
+				if( aUnavailable==0 && bUnavailable==0 ) // ok: neither non-static body has been used since the last clear
+				{
+					if (!aIsStatic)
+					{
+						writeBuf( ldsFixedBuffer, bodyA );	// mark body A as used
+					}
+					if (!bIsStatic)
+					{
+						writeBuf( ldsFixedBuffer, bodyB );	// mark body B as used
+					}
+					cs[i].m_batchIdx = batchIdx;
+					if (i!=numValidConstraints)	// move the accepted constraint to the end of the assigned prefix
+					{
+						tmp = cs[i];
+						cs[i] = cs[numValidConstraints];	// the element swapped back to slot i was already skipped in this pass and is not re-tested until the next one
+						cs[numValidConstraints]  = tmp;
+					}
+					numValidConstraints++;
+					nCurrentBatch++;
+					if( nCurrentBatch == SIMD_WIDTH)	// after SIMD_WIDTH accepted constraints the flags are reset while batchIdx stays the same, so independence is only enforced within each run of SIMD_WIDTH constraints
+					{
+						nCurrentBatch = 0;
+						for(int i=0; i<CHECK_SIZE; i++) 
+							ldsFixedBuffer[i] = 0;	// this loop's i shadows the scan index only inside the loop
+					}
+				}
+			}//for
+			batchIdx ++;
+		}//while
+		batchSizes[wgIdx] = batchIdx;	// number of passes == number of batch indices used (0 when the group has no constraints)
+	}//if( lIdx == 0 )
+	//return batchIdx;
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h
new file mode 100644
index 00000000..1e5957ad
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h
@@ -0,0 +1,291 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* batchingKernelsNewCL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Erwin Coumans\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#define counter32_t volatile __global int*\n"
+"#define SIMD_WIDTH 64\n"
+"typedef unsigned int u32;\n"
+"typedef unsigned short u16;\n"
+"typedef unsigned char u8;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"#define WG_SIZE 64\n"
+"typedef struct \n"
+"	int m_n;\n"
+"	int m_start;\n"
+"	int m_staticIdx;\n"
+"	int m_paddings[1];\n"
+"} ConstBuffer;\n"
+"typedef struct \n"
+"	int m_a;\n"
+"	int m_b;\n"
+"	u32 m_idx;\n"
+"//	batching on the GPU\n"
+"__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, 	__global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
+"	int wgIdx = GET_GROUP_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	\n"
+"	const int m_n = gN[wgIdx];\n"
+"	const int m_start = gStart[wgIdx];\n"
+"		\n"
+"	if( lIdx == 0 )\n"
+"	{\n"
+"		for (int i=0;i<m_n;i++)\n"
+"		{\n"
+"			int srcIdx = i+m_start;\n"
+"			int batchIndex = i;\n"
+"			gConstraints[ srcIdx ].m_batchIdx = batchIndex;	\n"
+"		}\n"
+"	}\n"
+"#define CHECK_SIZE (WG_SIZE)\n"
+"u32 readBuf(__local u32* buff, int idx)\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	return buff[bufIdx] & (1<<bitIdx);\n"
+"void writeBuf(__local u32* buff, int idx)\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	buff[bufIdx] |= (1<<bitIdx);\n"
+"	//atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
+"u32 tryWrite(__local u32* buff, int idx)\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
+"	return ((ans >> bitIdx)&1) == 0;\n"
+"//	batching on the GPU\n"
+"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n"
+"	int wgIdx = GET_GROUP_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	const int numConstraints = gN[wgIdx];\n"
+"	const int m_start = gStart[wgIdx];\n"
+"	b3Contact4Data_t tmp;\n"
+"	\n"
+"	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
+"		\n"
+"	\n"
+"	\n"
+"	\n"
+"	\n"
+"	if( lIdx == 0 )\n"
+"	{\n"
+"	\n"
+"		\n"
+"		__global struct b3Contact4Data* cs = &gConstraints[m_start];	\n"
+"	\n"
+"		\n"
+"		int numValidConstraints = 0;\n"
+"		int batchIdx = 0;\n"
+"		while( numValidConstraints < numConstraints)\n"
+"		{\n"
+"			int nCurrentBatch = 0;\n"
+"			//	clear flag\n"
+"	\n"
+"			for(int i=0; i<CHECK_SIZE; i++) \n"
+"				ldsFixedBuffer[i] = 0;		\n"
+"			for(int i=numValidConstraints; i<numConstraints; i++)\n"
+"			{\n"
+"				int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n"
+"				int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n"
+"				int bodyA = abs(bodyAS);\n"
+"				int bodyB = abs(bodyBS);\n"
+"				bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n"
+"				bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n"
+"				int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n"
+"				int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);\n"
+"				\n"
+"				if( aUnavailable==0 && bUnavailable==0 ) // ok\n"
+"				{\n"
+"					if (!aIsStatic)\n"
+"					{\n"
+"						writeBuf( ldsFixedBuffer, bodyA );\n"
+"					}\n"
+"					if (!bIsStatic)\n"
+"					{\n"
+"						writeBuf( ldsFixedBuffer, bodyB );\n"
+"					}\n"
+"					cs[i].m_batchIdx = batchIdx;\n"
+"					if (i!=numValidConstraints)\n"
+"					{\n"
+"						tmp = cs[i];\n"
+"						cs[i] = cs[numValidConstraints];\n"
+"						cs[numValidConstraints]  = tmp;\n"
+"					}\n"
+"					numValidConstraints++;\n"
+"					\n"
+"					nCurrentBatch++;\n"
+"					if( nCurrentBatch == SIMD_WIDTH)\n"
+"					{\n"
+"						nCurrentBatch = 0;\n"
+"						for(int i=0; i<CHECK_SIZE; i++) \n"
+"							ldsFixedBuffer[i] = 0;\n"
+"						\n"
+"					}\n"
+"				}\n"
+"			}//for\n"
+"			batchIdx ++;\n"
+"		}//while\n"
+"		\n"
+"		batchSizes[wgIdx] = batchIdx;\n"
+"	}//if( lIdx == 0 )\n"
+"	\n"
+"	//return batchIdx;\n"
diff --git a/src/bullet/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl
similarity index 53%
rename from src/bullet/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.h
rename to src/bullet/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl
index c8ebdfd6..e22bc9bc 100644
--- a/src/bullet/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.h
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl
@@ -1,5 +1,5 @@
-Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
@@ -10,45 +10,23 @@ subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
-#include "../PlatformDefinitions.h"
-#include "LinearMath/btScalar.h"
-#include "LinearMath/btVector3.h"
-#include "LinearMath/btMatrix3x3.h"
-#include "LinearMath/btAlignedAllocator.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h"
-ATTRIBUTE_ALIGNED16(struct) SpuSampleTaskDesc
+__kernel void 
+  integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)
-	uint32_t						m_sampleCommand;
-	uint32_t						m_taskId;
-	uint64_t 	m_mainMemoryPtr;
-	int			m_sampleValue;
+	int nodeID = get_global_id(0);
-void	processSampleTask(void* userPtr, void* lsMemory);
-void*	createSampleLocalStoreMemory();
+	if( nodeID < numNodes)
+	{
+		integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration);
+	}
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
new file mode 100644
index 00000000..a5a43294
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
@@ -0,0 +1,433 @@
+// This file is autogenerated using stringify.bat (premake --stringify), found in the build folder of this project.
+static const char* integrateKernelCL= \
+"Copyright (c) 2013 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Erwin Coumans\n"
+"#ifndef B3_RIGIDBODY_DATA_H\n"
+"#define B3_RIGIDBODY_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
+"	b3Quat q;\n"
+"	q=in;\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"typedef struct\n"
+"	b3Float4 m_row[3];\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n"
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"b3Mat3x3 mtZero();\n"
+"b3Mat3x3 mtIdentity();\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"b3Mat3x3 mtZero()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"b3Mat3x3 mtIdentity()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
+"struct b3RigidBodyData\n"
+"	b3Float4				m_pos;\n"
+"	b3Quat					m_quat;\n"
+"	b3Float4				m_linVel;\n"
+"	b3Float4				m_angVel;\n"
+"	int 					m_collidableIdx;\n"
+"	float 				m_invMass;\n"
+"	float 				m_restituitionCoeff;\n"
+"	float 				m_frictionCoeff;\n"
+"typedef struct b3InertiaData b3InertiaData_t;\n"
+"struct b3InertiaData\n"
+"	b3Mat3x3 m_invInertiaWorld;\n"
+"	b3Mat3x3 m_initInvInertia;\n"
+"#endif //B3_RIGIDBODY_DATA_H\n"
+"	\n"
+"#ifndef B3_RIGIDBODY_DATA_H\n"
+"#endif //B3_RIGIDBODY_DATA_H\n"
+"	\n"
+"inline void integrateSingleTransform( __global b3RigidBodyData_t* bodies,int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n"
+"	\n"
+"	if (bodies[nodeID].m_invMass != 0.f)\n"
+"	{\n"
+"		float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
+"		//angular velocity\n"
+"		{\n"
+"			b3Float4 axis;\n"
+"			//add some hardcoded angular damping\n"
+"			bodies[nodeID].m_angVel.x *= angularDamping;\n"
+"			bodies[nodeID].m_angVel.y *= angularDamping;\n"
+"			bodies[nodeID].m_angVel.z *= angularDamping;\n"
+"			\n"
+"			b3Float4 angvel = bodies[nodeID].m_angVel;\n"
+"			float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n"
+"			\n"
+"			//limit the angular motion\n"
+"			if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
+"			{\n"
+"				fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
+"			}\n"
+"			if(fAngle < 0.001f)\n"
+"			{\n"
+"				// use Taylor's expansions of sync function\n"
+"				axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				// sync(fAngle) = sin(c*fAngle)/t\n"
+"				axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n"
+"			}\n"
+"			\n"
+"			b3Quat dorn;\n"
+"			dorn.x = axis.x;\n"
+"			dorn.y = axis.y;\n"
+"			dorn.z = axis.z;\n"
+"			dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n"
+"			b3Quat orn0 = bodies[nodeID].m_quat;\n"
+"			b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n"
+"			predictedOrn = b3QuatNormalized(predictedOrn);\n"
+"			bodies[nodeID].m_quat=predictedOrn;\n"
+"		}\n"
+"		//linear velocity		\n"
+"		bodies[nodeID].m_pos +=  bodies[nodeID].m_linVel * timeStep;\n"
+"		\n"
+"		//apply gravity\n"
+"		bodies[nodeID].m_linVel += gravityAcceleration * timeStep;\n"
+"		\n"
+"	}\n"
+"	\n"
+"inline void b3IntegrateTransform( __global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n"
+"	float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
+"	\n"
+"	if( (body->m_invMass != 0.f))\n"
+"	{\n"
+"		//angular velocity\n"
+"		{\n"
+"			b3Float4 axis;\n"
+"			//add some hardcoded angular damping\n"
+"			body->m_angVel.x *= angularDamping;\n"
+"			body->m_angVel.y *= angularDamping;\n"
+"			body->m_angVel.z *= angularDamping;\n"
+"			\n"
+"			b3Float4 angvel = body->m_angVel;\n"
+"			float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n"
+"			//limit the angular motion\n"
+"			if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
+"			{\n"
+"				fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
+"			}\n"
+"			if(fAngle < 0.001f)\n"
+"			{\n"
+"				// use Taylor's expansions of sync function\n"
+"				axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				// sync(fAngle) = sin(c*fAngle)/t\n"
+"				axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n"
+"			}\n"
+"			b3Quat dorn;\n"
+"			dorn.x = axis.x;\n"
+"			dorn.y = axis.y;\n"
+"			dorn.z = axis.z;\n"
+"			dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n"
+"			b3Quat orn0 = body->m_quat;\n"
+"			b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n"
+"			predictedOrn = b3QuatNormalized(predictedOrn);\n"
+"			body->m_quat=predictedOrn;\n"
+"		}\n"
+"		//apply gravity\n"
+"		body->m_linVel += gravityAcceleration * timeStep;\n"
+"		//linear velocity		\n"
+"		body->m_pos +=  body->m_linVel * timeStep;\n"
+"		\n"
+"	}\n"
+"	\n"
+"__kernel void \n"
+"  integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
+"	int nodeID = get_global_id(0);\n"
+"	\n"
+"	if( nodeID < numNodes)\n"
+"	{\n"
+"		integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration);\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl b/src/bullet/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl
new file mode 100644
index 00000000..7f5dabe2
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl
@@ -0,0 +1,877 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#define MOTIONCLAMP 100000 //unused, for debugging/safety in case constraint solver fails
+#define B3_INFINITY 1e30f // large finite stand-in for infinity; getInfo2Kernel uses -/+ this value as the default row limits
+#define mymake_float4 (float4) // expands to a float4 cast, so mymake_float4(x,y,z,w) reads as a vector literal
+__inline float dot3F4(float4 a, float4 b) // dot product of the xyz parts of a and b; the w components are ignored
+	float4 a1 = mymake_float4(a.xyz,0.f); // copy with w forced to 0 so it cannot contribute to dot()
+	float4 b1 = mymake_float4(b.xyz,0.f);
+	return dot(a1, b1);
+typedef float4 Quaternion; // quaternion stored in a float4; qtMul/qtInvert treat xyz as the vector part and w as the scalar part
+typedef struct // 3x3 matrix held as three float4 rows; mtMul1/mtMul3 only read the xyz of each row
+	float4 m_row[3];
+float4 mtMul1(Matrix3x3 a, float4 b); // matrix times column vector
+float4 mtMul3(float4 a, Matrix3x3 b); // row vector times matrix
+float4 mtMul1(Matrix3x3 a, float4 b) // returns a*b: each output component is one row of a dotted with b (xyz only)
+	float4 ans;
+	ans.x = dot3F4( a.m_row[0], b );
+	ans.y = dot3F4( a.m_row[1], b );
+	ans.z = dot3F4( a.m_row[2], b );
+	ans.w = 0.f; // w carries no data here; zero it explicitly
+	return ans;
+float4 mtMul3(float4 a, Matrix3x3 b) // returns a*b with a as a row vector: each output component is a dotted with one column of b (xyz only)
+	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); // gather column 0 of b
+	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); // gather column 1 of b
+	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); // gather column 2 of b
+	float4 ans;
+	ans.x = dot3F4( a, colx );
+	ans.y = dot3F4( a, coly );
+	ans.z = dot3F4( a, colz );
+	ans.w = 0.f; // zero w as mtMul1 does, so no uninitialised component is returned
+	return ans;
+typedef struct // per-body pair of inverse inertia matrices
+	Matrix3x3 m_invInertiaWorld; // presumably the inverse inertia tensor in world space — confirm against the host-side struct
+	Matrix3x3 m_initInvInertia; // presumably the initial (body-local) inverse inertia tensor — confirm
+} BodyInertia;
+typedef struct // rotation basis plus origin; the closing typedef name is not visible in this hunk
+	Matrix3x3 m_basis;//orientation
+	float4	m_origin;//transform
+typedef struct // per-body scratch state used by the joint solver kernels in this file
+//	b3Transform		m_worldTransformUnused;
+	float4		m_deltaLinearVelocity; // accumulated linear velocity change; added to m_linVel by writeBackVelocitiesKernel
+	float4		m_deltaAngularVelocity; // accumulated angular velocity change; added to m_angVel by writeBackVelocitiesKernel
+	float4		m_angularFactor; // per-axis scale applied to angular impulses (initSolverBodies sets 1,1,1,0)
+	float4		m_linearFactor; // per-axis scale applied to linear impulses (initSolverBodies sets 1,1,1,0)
+	float4		m_invMass; // inverse mass replicated into x,y,z with w = 0 (see initSolverBodies)
+	float4		m_pushVelocity;
+	float4		m_turnVelocity;
+	float4		m_linearVelocity; // copy of the rigid body's m_linVel taken in initSolverBodies
+	float4		m_angularVelocity; // copy of the rigid body's m_angVel taken in initSolverBodies
+	union 
+	{
+		void*	m_originalBody;
+		int		m_originalBodyIndex; // initSolverBodies stores the work-item (body) index here
+	};
+	int padding[3];
+} b3GpuSolverBody;
+typedef struct // rigid body state read and updated by the solver kernels
+	float4 m_pos; // position
+	Quaternion m_quat; // orientation
+	float4 m_linVel; // linear velocity
+	float4 m_angVel; // angular velocity
+	unsigned int m_shapeIdx;
+	float m_invMass; // inverse mass; a value of 0 makes writeBackVelocitiesKernel leave the body's velocities untouched
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} b3RigidBodyCL;
+typedef struct // one solver row; getInfo2Kernel treats consecutive rows as a strided table (rowskip = sizeof(row)/sizeof(float))
+	float4		m_relpos1CrossNormal; // angular Jacobian entry for body A (target of m_J1angularAxisFloat4)
+	float4		m_contactNormal; // linear Jacobian entry for body A (target of m_J1linearAxisFloat4); applied negated to body B
+	float4		m_relpos2CrossNormal; // angular Jacobian entry for body B (target of m_J2angularAxisFloat4)
+	//float4		m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal
+	float4		m_angularComponentA; // angular impulse direction applied to body A in resolveSingleConstraintRowGeneric
+	float4		m_angularComponentB; // angular impulse direction applied to body B in resolveSingleConstraintRowGeneric
+	float	m_appliedPushImpulse;
+	float	m_appliedImpulse; // running total impulse for this row, kept within [m_lowerLimit, m_upperLimit]
+	int	m_padding1;
+	int	m_padding2;
+	float	m_friction;
+	float	m_jacDiagABInv; // factor that scales a velocity error into an impulse
+	float		m_rhs; // target term of the row (target of info2.m_constraintError)
+	float		m_cfm; // multiplies the already-applied impulse when forming the next correction
+    float		m_lowerLimit;
+	float		m_upperLimit;
+	float		m_rhsPenetration;
+	int			m_originalConstraint; // index of the owning b3GpuGenericConstraint
+	int	m_overrideNumSolverIterations;
+    int			m_frictionIndex;
+	int m_solverBodyIdA; // index of body A in the solver body array
+	int m_solverBodyIdB; // index of body B in the solver body array
+} b3SolverConstraint;
+typedef struct // per-constraint record consumed by solveJointConstraintRows
+	int m_bodyAPtrAndSignBit; // body A index; stored negated when that body has zero inverse mass (see initBatchConstraintsKernel)
+	int m_bodyBPtrAndSignBit; // body B index, same sign convention
+	int m_originalConstraintIndex; // index into the b3GpuGenericConstraint array
+	int m_batchId; // set to -1 by initBatchConstraintsKernel
+} b3BatchConstraint;
+typedef struct // description of one joint between two rigid bodies
+	int				m_constraintType; // selects the row count written by getInfo1Kernel
+	int				m_rbA; // index of body A in the rigid body array
+	int				m_rbB; // index of body B in the rigid body array
+	float			m_breakingImpulseThreshold; // breakViolatedConstraintsKernel clears m_flags once a row's |applied impulse| reaches this
+	float4 m_pivotInA; // pivot that getInfo2Point2Point rotates by body A's orientation
+	float4 m_pivotInB; // pivot that getInfo2Point2Point rotates by body B's orientation
+	Quaternion m_relTargetAB; // reference relative orientation used by getInfo2FixedOrientation
+	int	m_flags; // tested against B3_CONSTRAINT_FLAG_ENABLED in solveJointConstraintRows
+	int m_padding[3];
+} b3GpuGenericConstraint;
+/*b3Transform	getWorldTransform(b3RigidBodyCL* rb)
+	b3Transform newTrans;
+	newTrans.setOrigin(rb->m_pos);
+	newTrans.setRotation(rb->m_quat);
+	return newTrans;
+float4 cross3(float4 a, float4 b) // thin wrapper over the built-in cross()
+	return cross(a,b);
+float4 fastNormalize4(float4 v) // normalises the xyz part of v; the incoming w is discarded
+	v = mymake_float4(v.xyz,0.f); // drop w so it does not affect the length
+	return fast_normalize(v);
+Quaternion qtMul(Quaternion a, Quaternion b); // forward declarations for the quaternion helpers defined below
+Quaternion qtNormalize(Quaternion in);
+float4 qtRotate(Quaternion q, float4 vec);
+Quaternion qtInvert(Quaternion q);
+Quaternion qtMul(Quaternion a, Quaternion b) // quaternion product a*b, with xyz as the vector part and w as the scalar part
+	Quaternion ans;
+	ans = cross3( a, b ); // vector part starts as a.xyz x b.xyz
+	ans += a.w*b+b.w*a; // add a.w*b.xyz + b.w*a.xyz (the w produced here is overwritten below)
+//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w*b.w - dot3F4(a, b); // scalar part
+	return ans;
+Quaternion qtNormalize(Quaternion in) // returns in scaled to unit length across all four components
+	return fast_normalize(in); // not fastNormalize4(): that helper zeroes w first, which would discard the quaternion's scalar part
+//	in /= length( in );
+//	return in;
+float4 qtRotate(Quaternion q, float4 vec) // rotates vec by q, computed as q * (vec.xyz, 0) * qtInvert(q)
+	Quaternion qInv = qtInvert( q );
+	float4 vcpy = vec;
+	vcpy.w = 0.f; // treat vec as a quaternion with zero scalar part
+	float4 out = qtMul(qtMul(q,vcpy),qInv);
+	return out;
+Quaternion qtInvert(Quaternion q) // conjugate of q (vector part negated); this is the inverse only when q has unit length
+	return (Quaternion)(-q.xyz, q.w);
+__inline void internalApplyImpulse(__global b3GpuSolverBody* body,  float4 linearComponent, float4 angularComponent,float impulseMagnitude) // accumulates an impulse into the body's delta velocities
+	body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor; // linear part, scaled per axis by m_linearFactor
+	body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor); // angular part, scaled per axis by m_angularFactor
+void resolveSingleConstraintRowGeneric(__global b3GpuSolverBody* body1, __global b3GpuSolverBody* body2, __global b3SolverConstraint* c) // one solver update of row c: compute an impulse correction, clamp the accumulated impulse to the row limits, apply it to both bodies
+	float deltaImpulse = c->m_rhs-c->m_appliedImpulse*c->m_cfm; // start from the row target, reduced by the cfm-weighted impulse applied so far
+	float deltaVel1Dotn	=	dot3F4(c->m_contactNormal,body1->m_deltaLinearVelocity) 	+ dot3F4(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity);
+	float deltaVel2Dotn	=	-dot3F4(c->m_contactNormal,body2->m_deltaLinearVelocity) + dot3F4(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity); // body 2 sees the negated linear normal
+	deltaImpulse	-=	deltaVel1Dotn*c->m_jacDiagABInv; // subtract what body 1's accumulated delta velocity already contributes
+	deltaImpulse	-=	deltaVel2Dotn*c->m_jacDiagABInv; // likewise for body 2
+	float sum = c->m_appliedImpulse + deltaImpulse;
+	if (sum < c->m_lowerLimit)
+	{
+		deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse; // only apply as much as reaches the lower limit
+		c->m_appliedImpulse = c->m_lowerLimit;
+	}
+	else if (sum > c->m_upperLimit) 
+	{
+		deltaImpulse = c->m_upperLimit-c->m_appliedImpulse; // only apply as much as reaches the upper limit
+		c->m_appliedImpulse = c->m_upperLimit;
+	}
+	else
+	{
+		c->m_appliedImpulse = sum;
+	}
+	internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);
+	internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); // equal and opposite linear direction for body 2
+__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies, // one work-item solves all rows of one constraint of the current batch
+					  __global b3BatchConstraint* batchConstraints,
+					  	__global b3SolverConstraint* rows,
+						__global unsigned int* numConstraintRowsInfo1, 
+						__global unsigned int* rowOffsets,
+						__global b3GpuGenericConstraint* constraints,
+						int batchOffset,
+						int numConstraintsInBatch
+                      )
+	int b = get_global_id(0);
+	if (b>=numConstraintsInBatch) // surplus work-items have no constraint to process
+		return;
+	__global b3BatchConstraint* c = &batchConstraints[b+batchOffset]; // batchOffset is where this batch starts in batchConstraints
+	int originalConstraintIndex = c->m_originalConstraintIndex;
+	if (constraints[originalConstraintIndex].m_flags&B3_CONSTRAINT_FLAG_ENABLED) // constraints without the enabled flag are skipped
+	{
+		int numConstraintRows = numConstraintRowsInfo1[originalConstraintIndex];
+		int rowOffset = rowOffsets[originalConstraintIndex]; // first row of this constraint in rows[]
+		for (int jj=0;jj<numConstraintRows;jj++)
+		{
+			__global b3SolverConstraint* constraint = &rows[rowOffset+jj];
+			resolveSingleConstraintRowGeneric(&solverBodies[constraint->m_solverBodyIdA],&solverBodies[constraint->m_solverBodyIdB],constraint);
+		}
+	}
+__kernel void initSolverBodies(__global b3GpuSolverBody* solverBodies,__global b3RigidBodyCL* bodiesCL, int numBodies) // one work-item resets solverBodies[i] from bodiesCL[i]
+	int i = get_global_id(0);
+	if (i>=numBodies) // surplus work-items have nothing to initialise
+		return;
+	__global b3GpuSolverBody* solverBody = &solverBodies[i];
+	__global b3RigidBodyCL* bodyCL = &bodiesCL[i];
+	solverBody->m_deltaLinearVelocity = (float4)(0.f,0.f,0.f,0.f);
+	solverBody->m_deltaAngularVelocity  = (float4)(0.f,0.f,0.f,0.f);
+	solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);
+	solverBody->m_turnVelocity = (float4)(0.f,0.f,0.f,0.f); // previously a second write to m_pushVelocity, which left m_turnVelocity uninitialised
+	solverBody->m_invMass = (float4)(bodyCL->m_invMass,bodyCL->m_invMass,bodyCL->m_invMass,0.f); // replicate the scalar inverse mass into x,y,z
+	solverBody->m_originalBodyIndex = i;
+	solverBody->m_angularFactor = (float4)(1,1,1,0);
+	solverBody->m_linearFactor = (float4) (1,1,1,0);
+	solverBody->m_linearVelocity = bodyCL->m_linVel;
+	solverBody->m_angularVelocity = bodyCL->m_angVel;
+__kernel void breakViolatedConstraintsKernel(__global b3GpuGenericConstraint* constraints, __global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, __global b3SolverConstraint* rows, int numConstraints) // one work-item per constraint: clears the constraint's flags when any row's applied impulse reaches the breaking threshold
+	int cid = get_global_id(0);
+	if (cid>=numConstraints)
+		return;
+	int numRows = numConstraintRows[cid];
+	if (numRows)
+	{
+		for (int i=0;i<numRows;i++)
+		{
+			int rowIndex = rowOffsets[cid]+i;
+			float breakingThreshold = constraints[cid].m_breakingImpulseThreshold;
+			if (fabs(rows[rowIndex].m_appliedImpulse) >= breakingThreshold)
+			{
+				constraints[cid].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED;
+			}
+		}
+	}
+__kernel void getInfo1Kernel(__global unsigned int* infos, __global b3GpuGenericConstraint* constraints, int numConstraints) // one work-item per constraint: writes the constraint's solver row count into infos[i]
+	int i = get_global_id(0);
+	if (i>=numConstraints)
+		return;
+	__global b3GpuGenericConstraint* constraint = &constraints[i];
+	switch (constraint->m_constraintType) // NOTE(review): the case labels are not visible in this hunk
+	{
+		{
+			infos[i] = 3;
+			break;
+		}
+		{
+			infos[i] = 6;
+			break;
+		}
+		default: // other constraint types leave infos[i] unchanged
+		{
+		}
+	}
+__kernel void initBatchConstraintsKernel(__global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, 
+										__global b3BatchConstraint* batchConstraints, 
+										__global b3GpuGenericConstraint* constraints,
+										__global b3RigidBodyCL* bodies,
+										int numConstraints)
+	int i = get_global_id(0); // one work-item per constraint
+	if (i>=numConstraints)
+		return;
+	int rbA = constraints[i].m_rbA;
+	int rbB = constraints[i].m_rbB;
+	batchConstraints[i].m_bodyAPtrAndSignBit = bodies[rbA].m_invMass != 0.f ? rbA : -rbA; // zero inverse mass is encoded by negating the index (index 0 cannot be told apart this way)
+	batchConstraints[i].m_bodyBPtrAndSignBit = bodies[rbB].m_invMass != 0.f ? rbB : -rbB;
+	batchConstraints[i].m_batchId = -1; // no batch assigned yet
+	batchConstraints[i].m_originalConstraintIndex = i;
+typedef struct // scratch description of where a constraint's rows live; filled in by getInfo2Kernel and consumed by the getInfo2* helpers
+	// integrator parameters: frames per second (1/stepsize), default error
+	// reduction parameter (0..1).
+	float fps,erp;
+	// for the first and second body, pointers to two (linear and angular)
+	// n*3 jacobian sub matrices, stored by rows. these matrices will have
+	// been initialized to 0 on entry. if the second body is zero then the
+	// J2xx pointers may be 0.
+	union 
+	{
+		__global float4* m_J1linearAxisFloat4; // same address viewed as float4 rows ...
+		__global float* m_J1linearAxis; // ... or as individual floats indexed with rowskip
+	};
+	union
+	{
+		__global float4* m_J1angularAxisFloat4;
+		__global float* m_J1angularAxis;
+	};
+	union
+	{
+	__global float4* m_J2linearAxisFloat4;
+	__global float* m_J2linearAxis;
+	};
+	union
+	{
+		__global float4* m_J2angularAxisFloat4;
+		__global float* m_J2angularAxis;
+	};
+	// elements to jump from one row to the next in J's
+	int rowskip;
+	// right hand sides of the equation J*v = c + cfm * lambda. cfm is the
+	// "constraint force mixing" vector. c is set to zero on entry, cfm is
+	// set to a constant value (typically very small or zero) value on entry.
+	__global float* m_constraintError;
+	__global float* cfm;
+	// lo and hi limits for variables (set to -/+ infinity on entry).
+	__global float* m_lowerLimit;
+	__global float* m_upperLimit;
+	// findex vector for variables. see the LCP solver interface for a
+	// description of what this does. this is set to -1 on entry.
+	// note that the returned indexes are relative to the first index of
+	// the constraint.
+	__global int *findex;
+	// number of solver iterations
+	int m_numIterations;
+	//damping of the velocity
+	float	m_damping;
+} b3GpuConstraintInfo2;
+void	getSkewSymmetricMatrix(float4 vecIn, __global float4* v0,__global float4* v1,__global float4* v2) // writes the three rows of the skew-symmetric matrix M for which M*u equals vecIn x u
+	*v0 = (float4)(0.		,-vecIn.z		,vecIn.y,0.f);
+	*v1 = (float4)(vecIn.z	,0.			,-vecIn.x,0.f);
+	*v2 = (float4)(-vecIn.y	,vecIn.x	,0.f,0.f);
+void getInfo2Point2Point(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies) // fills three rows (Jacobian entries and error terms) that pull the two bodies' pivots together
+	float4 posA = bodies[constraint->m_rbA].m_pos;
+	Quaternion rotA = bodies[constraint->m_rbA].m_quat;
+	float4 posB = bodies[constraint->m_rbB].m_pos;
+	Quaternion rotB = bodies[constraint->m_rbB].m_quat;
+		// anchor points in global coordinates with respect to body PORs.
+    // set jacobian
+    info->m_J1linearAxis[0] = 1; // the three writes put 1 on the diagonal of the 3-row linear block for body A
+	info->m_J1linearAxis[info->rowskip+1] = 1;
+	info->m_J1linearAxis[2*info->rowskip+2] = 1;
+	float4 a1 = qtRotate(rotA,constraint->m_pivotInA); // pivot A rotated by body A's orientation
+	{
+		__global float4* angular0 = (__global float4*)(info->m_J1angularAxis);
+		__global float4* angular1 = (__global float4*)(info->m_J1angularAxis+info->rowskip);
+		__global float4* angular2 = (__global float4*)(info->m_J1angularAxis+2*info->rowskip);
+		float4 a1neg = -a1;
+		getSkewSymmetricMatrix(a1neg,angular0,angular1,angular2); // angular block for body A uses -a1
+	}
+	if (info->m_J2linearAxis) // optional; getInfo2Kernel passes 0 for this pointer
+	{
+		info->m_J2linearAxis[0] = -1;
+		info->m_J2linearAxis[info->rowskip+1] = -1;
+		info->m_J2linearAxis[2*info->rowskip+2] = -1;
+	}
+	float4 a2 = qtRotate(rotB,constraint->m_pivotInB); // pivot B rotated by body B's orientation
+	{
+	//	float4 a2n = -a2;
+		__global float4* angular0 = (__global float4*)(info->m_J2angularAxis);
+		__global float4* angular1 = (__global float4*)(info->m_J2angularAxis+info->rowskip);
+		__global float4* angular2 = (__global float4*)(info->m_J2angularAxis+2*info->rowskip);
+		getSkewSymmetricMatrix(a2,angular0,angular1,angular2); // angular block for body B uses +a2
+	}
+    // set right hand side
+//	float currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;
+	float currERP = info->erp;
+	float k = info->fps * currERP; // scale applied to the positional error
+    int j;
+	float4 result = a2 + posB - a1 - posA; // offset between the two rotated-and-translated pivots
+	float* resultPtr = &result; // NOTE(review): float4 address used as float* without a cast — confirm this compiles cleanly on all target OpenCL compilers
+	for (j=0; j<3; j++)
+    {
+        info->m_constraintError[j*info->rowskip] = k * (resultPtr[j]); // one error term per row, x/y/z in turn
+    }
+Quaternion nearest( Quaternion first, Quaternion qd) // returns qd or -qd, whichever is closer to first in squared 4D distance
+	Quaternion diff,sum;
+	diff = first- qd;
+	sum = first + qd; // equals first - (-qd), i.e. the difference to the negated candidate
+	if( dot(diff,diff) < dot(sum,sum) )
+		return qd;
+	return (-qd);
+float b3Acos(float x) // acos with the argument first clamped to [-1, 1]
+	if (x<-1)	
+		x=-1; 
+	if (x>1)	
+		x=1;
+	return acos(x); 
+float getAngle(Quaternion orn) // returns 2*acos(orn.w)
+	if (orn.w>=1.f)
+		orn.w=1.f; // b3Acos clamps again, so this upper clamp is redundant but harmless
+	float s = 2.f * b3Acos(orn.w);
+	return s;
+void calculateDiffAxisAngleQuaternion( Quaternion orn0,Quaternion orn1a,float4* axis,float* angle) // outputs the axis and angle of the rotation taking orn0 to orn1a
+	Quaternion orn1 = nearest(orn0,orn1a); // pick the sign of orn1a that is closer to orn0
+	Quaternion dorn = qtMul(orn1,qtInvert(orn0)); // relative rotation orn1 * conj(orn0)
+	*angle = getAngle(dorn);
+	*axis = (float4)(dorn.x,dorn.y,dorn.z,0.f);
+	//check for axis length
+	float len = dot3F4(*axis,*axis); // squared length of the axis
+		*axis = (float4)(1,0,0,0); // fallback axis; NOTE(review): the condition guarding this line is not visible in this hunk
+	else
+		*axis /= sqrt(len);
+void getInfo2FixedOrientation(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies, int start_row) // fills three angular rows, starting at start_row, that drive the bodies' relative orientation towards m_relTargetAB
+	Quaternion worldOrnA = bodies[constraint->m_rbA].m_quat;
+	Quaternion worldOrnB = bodies[constraint->m_rbB].m_quat;
+	int s = info->rowskip;
+	int start_index = start_row * s; // float offset of the first row written here
+	// 3 rows to make body rotations equal
+	info->m_J1angularAxis[start_index] = 1;
+	info->m_J1angularAxis[start_index + s + 1] = 1;
+	info->m_J1angularAxis[start_index + s*2+2] = 1;
+	if ( info->m_J2angularAxis)
+	{
+		info->m_J2angularAxis[start_index] = -1;
+		info->m_J2angularAxis[start_index + s+1] = -1;
+		info->m_J2angularAxis[start_index + s*2+2] = -1;
+	}
+	float currERP = info->erp;
+	float k = info->fps * currERP;
+	float4 diff;
+	float angle;
+	float4 qrelCur = qtMul(worldOrnA,qtInvert(worldOrnB)); // current relative orientation A * conj(B)
+	calculateDiffAxisAngleQuaternion(constraint->m_relTargetAB,qrelCur,&diff,&angle);
+	diff*=-angle; // axis scaled by the negated angle
+	float* resultPtr = &diff; // NOTE(review): float4 address used as float* without a cast — confirm
+	for (int j=0; j<3; j++)
+    {
+        info->m_constraintError[(3+j)*info->rowskip] = k * resultPtr[j]; // NOTE(review): rows are addressed with a fixed offset of 3 rather than start_row — confirm callers always pass start_row == 3
+    }
+__kernel void writeBackVelocitiesKernel(__global b3RigidBodyCL* bodies,__global b3GpuSolverBody* solverBodies,int numBodies) // one work-item per body: adds the solver's accumulated delta velocities to the rigid body
+	int i = get_global_id(0);
+	if (i>=numBodies)
+		return;
+	if (bodies[i].m_invMass) // bodies with zero inverse mass keep their velocities
+	{
+//		if (length(solverBodies[i].m_deltaLinearVelocity)<MOTIONCLAMP)
+		{
+			bodies[i].m_linVel += solverBodies[i].m_deltaLinearVelocity;
+		}
+//		if (length(solverBodies[i].m_deltaAngularVelocity)<MOTIONCLAMP)
+		{
+			bodies[i].m_angVel += solverBodies[i].m_deltaAngularVelocity;
+		} 
+	}
+__kernel void getInfo2Kernel(__global b3SolverConstraint* solverConstraintRows, 
+							__global unsigned int* infos, 
+							__global unsigned int* constraintRowOffsets, 
+							__global b3GpuGenericConstraint* constraints, 
+							__global b3BatchConstraint* batchConstraints, 
+							__global b3RigidBodyCL* bodies,
+							__global BodyInertia* inertias,
+							__global b3GpuSolverBody* solverBodies,
+							float timeStep,
+							float globalErp,
+							float globalCfm,
+							float globalDamping,
+							int globalNumIterations,
+							int numConstraints)
+	int i = get_global_id(0);
+	if (i>=numConstraints)
+		return;
+	//for now, always initialize the batch info
+	int info1 = infos[i];
+	__global b3SolverConstraint* currentConstraintRow = &solverConstraintRows[constraintRowOffsets[i]];
+	__global b3GpuGenericConstraint* constraint = &constraints[i];
+	__global b3RigidBodyCL* rbA = &bodies[ constraint->m_rbA];
+	__global b3RigidBodyCL* rbB = &bodies[ constraint->m_rbB];
+	int solverBodyIdA = constraint->m_rbA;
+	int solverBodyIdB = constraint->m_rbB;
+	__global b3GpuSolverBody* bodyAPtr = &solverBodies[solverBodyIdA];
+	__global b3GpuSolverBody* bodyBPtr = &solverBodies[solverBodyIdB];
+	if (rbA->m_invMass)
+	{
+		batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA;
+	} else
+	{
+//			if (!solverBodyIdA)
+//				m_staticIdx = 0;
+		batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA;
+	}
+	if (rbB->m_invMass)
+	{
+		batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB;
+	} else
+	{
+//			if (!solverBodyIdB)
+//				m_staticIdx = 0;
+		batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB;
+	}
+	if (info1)
+	{
+		int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
+//		if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)
+	//		m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
+		int j;
+		for ( j=0;j<info1;j++)
+		{
+//			memset(&currentConstraintRow[j],0,sizeof(b3SolverConstraint));
+			currentConstraintRow[j].m_angularComponentA = (float4)(0,0,0,0);
+			currentConstraintRow[j].m_angularComponentB = (float4)(0,0,0,0);
+			currentConstraintRow[j].m_appliedImpulse = 0.f;
+			currentConstraintRow[j].m_appliedPushImpulse = 0.f;
+			currentConstraintRow[j].m_cfm = 0.f;
+			currentConstraintRow[j].m_contactNormal = (float4)(0,0,0,0);
+			currentConstraintRow[j].m_friction = 0.f;
+			currentConstraintRow[j].m_frictionIndex = 0;
+			currentConstraintRow[j].m_jacDiagABInv = 0.f;
+			currentConstraintRow[j].m_lowerLimit = 0.f;
+			currentConstraintRow[j].m_upperLimit = 0.f;
+			currentConstraintRow[j].m_originalConstraint = i;
+			currentConstraintRow[j].m_overrideNumSolverIterations = 0;
+			currentConstraintRow[j].m_relpos1CrossNormal = (float4)(0,0,0,0);
+			currentConstraintRow[j].m_relpos2CrossNormal = (float4)(0,0,0,0);
+			currentConstraintRow[j].m_rhs = 0.f;
+			currentConstraintRow[j].m_rhsPenetration = 0.f;
+			currentConstraintRow[j].m_solverBodyIdA = 0;
+			currentConstraintRow[j].m_solverBodyIdB = 0;
+			currentConstraintRow[j].m_lowerLimit = -B3_INFINITY;
+			currentConstraintRow[j].m_upperLimit = B3_INFINITY;
+			currentConstraintRow[j].m_appliedImpulse = 0.f;
+			currentConstraintRow[j].m_appliedPushImpulse = 0.f;
+			currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
+			currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
+			currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;		
+		}
+		bodyAPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);
+		bodyAPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);
+		bodyAPtr->m_pushVelocity = (float4)(0,0,0,0);
+		bodyAPtr->m_turnVelocity = (float4)(0,0,0,0);
+		bodyBPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);
+		bodyBPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);
+		bodyBPtr->m_pushVelocity = (float4)(0,0,0,0);
+		bodyBPtr->m_turnVelocity  = (float4)(0,0,0,0);
+		int rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this
+		b3GpuConstraintInfo2 info2;
+		info2.fps = 1.f/timeStep;
+		info2.erp = globalErp;
+		info2.m_J1linearAxisFloat4 = &currentConstraintRow->m_contactNormal;
+		info2.m_J1angularAxisFloat4 = &currentConstraintRow->m_relpos1CrossNormal;
+		info2.m_J2linearAxisFloat4 = 0;
+		info2.m_J2angularAxisFloat4 = &currentConstraintRow->m_relpos2CrossNormal;
+		info2.rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this
+		///the size of b3SolverConstraint needs be a multiple of float
+//		b3Assert(info2.rowskip*sizeof(float)== sizeof(b3SolverConstraint));
+		info2.m_constraintError = &currentConstraintRow->m_rhs;
+		currentConstraintRow->m_cfm = globalCfm;
+		info2.m_damping = globalDamping;
+		info2.cfm = &currentConstraintRow->m_cfm;
+		info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
+		info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
+		info2.m_numIterations = globalNumIterations;
+		switch (constraint->m_constraintType)
+		{
+			{
+				getInfo2Point2Point(constraint,&info2,bodies);
+				break;
+			}
+			{
+				getInfo2Point2Point(constraint,&info2,bodies);
+				getInfo2FixedOrientation(constraint,&info2,bodies,3);
+				break;
+			}
+			default:
+			{
+			}
+		}
+		///finalize the constraint setup
+		for ( j=0;j<info1;j++)
+		{
+			__global b3SolverConstraint* solverConstraint = &currentConstraintRow[j];
+			if (solverConstraint->m_upperLimit>=constraint->m_breakingImpulseThreshold)
+			{
+				solverConstraint->m_upperLimit = constraint->m_breakingImpulseThreshold;
+			}
+			if (solverConstraint->m_lowerLimit<=-constraint->m_breakingImpulseThreshold)
+			{
+				solverConstraint->m_lowerLimit = -constraint->m_breakingImpulseThreshold;
+			}
+//						solverConstraint->m_originalContactPoint = constraint;
+			Matrix3x3 invInertiaWorldA= inertias[constraint->m_rbA].m_invInertiaWorld;
+			{
+				//float4 angularFactorA(1,1,1);
+				float4 ftorqueAxis1 = solverConstraint->m_relpos1CrossNormal;
+				solverConstraint->m_angularComponentA = mtMul1(invInertiaWorldA,ftorqueAxis1);//*angularFactorA;
+			}
+			Matrix3x3 invInertiaWorldB= inertias[constraint->m_rbB].m_invInertiaWorld;
+			{
+				float4 ftorqueAxis2 = solverConstraint->m_relpos2CrossNormal;
+				solverConstraint->m_angularComponentB = mtMul1(invInertiaWorldB,ftorqueAxis2);//*constraint->m_rbB.getAngularFactor();
+			}
+			{
+				//it is ok to use solverConstraint->m_contactNormal instead of -solverConstraint->m_contactNormal
+				//because it gets multiplied iMJlB
+				float4 iMJlA = solverConstraint->m_contactNormal*rbA->m_invMass;
+				float4 iMJaA = mtMul3(solverConstraint->m_relpos1CrossNormal,invInertiaWorldA);
+				float4 iMJlB = solverConstraint->m_contactNormal*rbB->m_invMass;//sign of normal?
+				float4 iMJaB = mtMul3(solverConstraint->m_relpos2CrossNormal,invInertiaWorldB);
+				float sum = dot3F4(iMJlA,solverConstraint->m_contactNormal);
+				sum += dot3F4(iMJaA,solverConstraint->m_relpos1CrossNormal);
+				sum += dot3F4(iMJlB,solverConstraint->m_contactNormal);
+				sum += dot3F4(iMJaB,solverConstraint->m_relpos2CrossNormal);
+				float fsum = fabs(sum);
+				if (fsum>FLT_EPSILON)
+				{
+					solverConstraint->m_jacDiagABInv = 1.f/sum;
+				} else
+				{
+					solverConstraint->m_jacDiagABInv = 0.f;
+				}
+			}
+			///fix rhs
+			///todo: add force/torque accelerators
+			{
+				float rel_vel;
+				float vel1Dotn = dot3F4(solverConstraint->m_contactNormal,rbA->m_linVel) + dot3F4(solverConstraint->m_relpos1CrossNormal,rbA->m_angVel);
+				float vel2Dotn = -dot3F4(solverConstraint->m_contactNormal,rbB->m_linVel) + dot3F4(solverConstraint->m_relpos2CrossNormal,rbB->m_angVel);
+				rel_vel = vel1Dotn+vel2Dotn;
+				float restitution = 0.f;
+				float positionalError = solverConstraint->m_rhs;//already filled in by getConstraintInfo2
+				float	velocityError = restitution - rel_vel * info2.m_damping;
+				float	penetrationImpulse = positionalError*solverConstraint->m_jacDiagABInv;
+				float	velocityImpulse = velocityError *solverConstraint->m_jacDiagABInv;
+				solverConstraint->m_rhs = penetrationImpulse+velocityImpulse;
+				solverConstraint->m_appliedImpulse = 0.f;
+			}
+		}
+	}
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/jointSolver.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/jointSolver.h
new file mode 100644
index 00000000..d48ecf6e
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/jointSolver.h
@@ -0,0 +1,721 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* solveConstraintRowsCL= \
+"Copyright (c) 2013 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Erwin Coumans\n"
+"#define MOTIONCLAMP 100000 //unused, for debugging/safety in case constraint solver fails\n"
+"#define B3_INFINITY 1e30f\n"
+"#define mymake_float4 (float4)\n"
+"__inline float dot3F4(float4 a, float4 b)\n" // 3-component dot product: both operands get w forced to 0 before dot()
+"	float4 a1 = mymake_float4(a.xyz,0.f);\n"
+"	float4 b1 = mymake_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"typedef float4 Quaternion;\n"
+"typedef struct\n"
+"	float4 m_row[3];\n"
+"float4 mtMul1(Matrix3x3 a, float4 b);\n"
+"float4 mtMul3(float4 a, Matrix3x3 b);\n"
+"float4 mtMul1(Matrix3x3 a, float4 b)\n" // matrix * vector: lane i of the result is dot3F4(a.m_row[i], b)
+"	float4 ans;\n"
+"	ans.x = dot3F4( a.m_row[0], b );\n"
+"	ans.y = dot3F4( a.m_row[1], b );\n"
+"	ans.z = dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n" // w is explicitly cleared
+"	return ans;\n"
+"float4 mtMul3(float4 a, Matrix3x3 b)\n" // vector * matrix: a is dotted with each column of b
+"	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" // gather column 0 from the three rows
+"	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	float4 ans;\n"
+"	ans.x = dot3F4( a, colx );\n"
+"	ans.y = dot3F4( a, coly );\n"
+"	ans.z = dot3F4( a, colz );\n"
+"	return ans;\n" // NOTE(review): unlike mtMul1, ans.w is never assigned here
+"typedef struct\n"
+"	Matrix3x3 m_invInertiaWorld;\n"
+"	Matrix3x3 m_initInvInertia;\n"
+"} BodyInertia;\n"
+"typedef struct\n"
+"	Matrix3x3 m_basis;//orientation\n"
+"	float4	m_origin;//transform\n"
+"typedef struct\n"
+"//	b3Transform		m_worldTransformUnused;\n"
+"	float4		m_deltaLinearVelocity;\n"
+"	float4		m_deltaAngularVelocity;\n"
+"	float4		m_angularFactor;\n"
+"	float4		m_linearFactor;\n"
+"	float4		m_invMass;\n"
+"	float4		m_pushVelocity;\n"
+"	float4		m_turnVelocity;\n"
+"	float4		m_linearVelocity;\n"
+"	float4		m_angularVelocity;\n"
+"	union \n"
+"	{\n"
+"		void*	m_originalBody;\n"
+"		int		m_originalBodyIndex;\n"
+"	};\n"
+"	int padding[3];\n"
+"} b3GpuSolverBody;\n"
+"typedef struct\n"
+"	float4 m_pos;\n"
+"	Quaternion m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	unsigned int m_shapeIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} b3RigidBodyCL;\n"
+"typedef struct\n"
+"	float4		m_relpos1CrossNormal;\n"
+"	float4		m_contactNormal;\n"
+"	float4		m_relpos2CrossNormal;\n"
+"	//float4		m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal\n"
+"	float4		m_angularComponentA;\n"
+"	float4		m_angularComponentB;\n"
+"	\n"
+"	float	m_appliedPushImpulse;\n"
+"	float	m_appliedImpulse;\n"
+"	int	m_padding1;\n"
+"	int	m_padding2;\n"
+"	float	m_friction;\n"
+"	float	m_jacDiagABInv;\n"
+"	float		m_rhs;\n"
+"	float		m_cfm;\n"
+"	\n"
+"    float		m_lowerLimit;\n"
+"	float		m_upperLimit;\n"
+"	float		m_rhsPenetration;\n"
+"	int			m_originalConstraint;\n"
+"	int	m_overrideNumSolverIterations;\n"
+"    int			m_frictionIndex;\n"
+"	int m_solverBodyIdA;\n"
+"	int m_solverBodyIdB;\n"
+"} b3SolverConstraint;\n"
+"typedef struct \n"
+"	int m_bodyAPtrAndSignBit;\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int m_originalConstraintIndex;\n"
+"	int m_batchId;\n"
+"} b3BatchConstraint;\n"
+"typedef struct \n"
+"	int				m_constraintType;\n"
+"	int				m_rbA;\n"
+"	int				m_rbB;\n"
+"	float			m_breakingImpulseThreshold;\n"
+"	float4 m_pivotInA;\n"
+"	float4 m_pivotInB;\n"
+"	Quaternion m_relTargetAB;\n"
+"	int	m_flags;\n"
+"	int m_padding[3];\n"
+"} b3GpuGenericConstraint;\n"
+"/*b3Transform	getWorldTransform(b3RigidBodyCL* rb)\n"
+"	b3Transform newTrans;\n"
+"	newTrans.setOrigin(rb->m_pos);\n"
+"	newTrans.setRotation(rb->m_quat);\n"
+"	return newTrans;\n"
+"float4 cross3(float4 a, float4 b)\n"
+"	return cross(a,b);\n"
+"float4 fastNormalize4(float4 v)\n"
+"	v = mymake_float4(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n" // quaternion product a*b; xyz is the vector part, w the scalar part
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n" // also touches ans.w, which is overwritten two lines below
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"Quaternion qtNormalize(Quaternion in)\n" // returns `in` scaled to unit length over all four components
+"	return fast_normalize(in);\n" // fix: fastNormalize4() zeroes w before normalising, which dropped the scalar part of the quaternion
+"//	in /= length( in );\n" // exact (non-fast) alternative, kept for reference
+"//	return in;\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n" // rotates vec by q, computed as q * (vec.xyz,0) * qtInvert(q)
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n" // treat the vector as a quaternion with zero scalar part
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"Quaternion qtInvert(Quaternion q)\n" // returns the conjugate (-xyz, w); no division by the squared length is performed
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"__inline void internalApplyImpulse(__global b3GpuSolverBody* body,  float4 linearComponent, float4 angularComponent,float impulseMagnitude)\n" // accumulates an impulse into the body's delta velocities
+"	body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor;\n" // component-wise scale by the linear factor
+"	body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor);\n" // component-wise scale by the angular factor
+"void resolveSingleConstraintRowGeneric(__global b3GpuSolverBody* body1, __global b3GpuSolverBody* body2, __global b3SolverConstraint* c)\n" // one solver step for row c: computes a delta impulse, clamps the accumulated impulse, applies it to both bodies
+"	float deltaImpulse = c->m_rhs-c->m_appliedImpulse*c->m_cfm;\n"
+"	float deltaVel1Dotn	=	dot3F4(c->m_contactNormal,body1->m_deltaLinearVelocity) 	+ dot3F4(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity);\n"
+"	float deltaVel2Dotn	=	-dot3F4(c->m_contactNormal,body2->m_deltaLinearVelocity) + dot3F4(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity);\n" // the linear term for body2 uses the negated normal
+"	deltaImpulse	-=	deltaVel1Dotn*c->m_jacDiagABInv;\n"
+"	deltaImpulse	-=	deltaVel2Dotn*c->m_jacDiagABInv;\n"
+"	float sum = c->m_appliedImpulse + deltaImpulse;\n" // candidate accumulated impulse before clamping
+"	if (sum < c->m_lowerLimit)\n"
+"	{\n"
+"		deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse;\n" // shrink the delta so the total lands exactly on the lower limit
+"		c->m_appliedImpulse = c->m_lowerLimit;\n"
+"	}\n"
+"	else if (sum > c->m_upperLimit) \n"
+"	{\n"
+"		deltaImpulse = c->m_upperLimit-c->m_appliedImpulse;\n" // same for the upper limit
+"		c->m_appliedImpulse = c->m_upperLimit;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		c->m_appliedImpulse = sum;\n"
+"	}\n"
+"	internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n"
+"	internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n" // body2 receives the impulse along the negated normal
+"__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies,\n" // each work-item solves all rows of one constraint of the current batch
+"					  __global b3BatchConstraint* batchConstraints,\n"
+"					  	__global b3SolverConstraint* rows,\n"
+"						__global unsigned int* numConstraintRowsInfo1, \n"
+"						__global unsigned int* rowOffsets,\n"
+"						__global b3GpuGenericConstraint* constraints,\n"
+"						int batchOffset,\n"
+"						int numConstraintsInBatch\n"
+"                      )\n"
+"	int b = get_global_id(0);\n"
+"	if (b>=numConstraintsInBatch)\n" // surplus work-items beyond the batch size do nothing
+"		return;\n"
+"	__global b3BatchConstraint* c = &batchConstraints[b+batchOffset];\n" // batch entries are addressed relative to batchOffset
+"	int originalConstraintIndex = c->m_originalConstraintIndex;\n"
+"	if (constraints[originalConstraintIndex].m_flags&B3_CONSTRAINT_FLAG_ENABLED)\n" // constraints without the enabled flag are skipped; B3_CONSTRAINT_FLAG_ENABLED is not defined in the visible part of this string
+"	{\n"
+"		int numConstraintRows = numConstraintRowsInfo1[originalConstraintIndex];\n"
+"		int rowOffset = rowOffsets[originalConstraintIndex];\n"
+"		for (int jj=0;jj<numConstraintRows;jj++)\n"
+"		{\n"
+"			__global b3SolverConstraint* constraint = &rows[rowOffset+jj];\n"
+"			resolveSingleConstraintRowGeneric(&solverBodies[constraint->m_solverBodyIdA],&solverBodies[constraint->m_solverBodyIdB],constraint);\n"
+"		}\n"
+"	}\n"
+"__kernel void initSolverBodies(__global b3GpuSolverBody* solverBodies,__global b3RigidBodyCL* bodiesCL, int numBodies)\n" // each work-item resets solverBodies[i] from bodiesCL[i]
+"	int i = get_global_id(0);\n"
+"	if (i>=numBodies)\n" // surplus work-items do nothing
+"		return;\n"
+"	__global b3GpuSolverBody* solverBody = &solverBodies[i];\n"
+"	__global b3RigidBodyCL* bodyCL = &bodiesCL[i];\n"
+"	solverBody->m_deltaLinearVelocity = (float4)(0.f,0.f,0.f,0.f);\n"
+"	solverBody->m_deltaAngularVelocity  = (float4)(0.f,0.f,0.f,0.f);\n"
+"	solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n"
+"	solverBody->m_turnVelocity = (float4)(0.f,0.f,0.f,0.f);\n" // fix: this line used to zero m_pushVelocity a second time, leaving m_turnVelocity uninitialised; mirror the change in the .cl source this header is generated from
+"	solverBody->m_invMass = (float4)(bodyCL->m_invMass,bodyCL->m_invMass,bodyCL->m_invMass,0.f);\n" // scalar inverse mass broadcast to xyz
+"	solverBody->m_originalBodyIndex = i;\n"
+"	solverBody->m_angularFactor = (float4)(1,1,1,0);\n"
+"	solverBody->m_linearFactor = (float4) (1,1,1,0);\n"
+"	solverBody->m_linearVelocity = bodyCL->m_linVel;\n"
+"	solverBody->m_angularVelocity = bodyCL->m_angVel;\n"
+"__kernel void breakViolatedConstraintsKernel(__global b3GpuGenericConstraint* constraints, __global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, __global b3SolverConstraint* rows, int numConstraints)\n" // each work-item checks one constraint and clears its flags when any row reached the breaking threshold
+"	int cid = get_global_id(0);\n"
+"	if (cid>=numConstraints)\n"
+"		return;\n"
+"	int numRows = numConstraintRows[cid];\n"
+"	if (numRows)\n"
+"	{\n"
+"		for (int i=0;i<numRows;i++)\n"
+"		{\n"
+"			int rowIndex = rowOffsets[cid]+i;\n"
+"			float breakingThreshold = constraints[cid].m_breakingImpulseThreshold;\n" // loop-invariant; re-read on every iteration
+"			if (fabs(rows[rowIndex].m_appliedImpulse) >= breakingThreshold)\n"
+"			{\n"
+"				constraints[cid].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED;\n" // clears every flag bit, not only the enabled bit named in the commented-out alternative
+"			}\n"
+"		}\n"
+"	}\n"
+"__kernel void getInfo1Kernel(__global unsigned int* infos, __global b3GpuGenericConstraint* constraints, int numConstraints)\n" // each work-item stores a per-constraint count in infos[i], chosen by constraint type
+"	int i = get_global_id(0);\n"
+"	if (i>=numConstraints)\n"
+"		return;\n"
+"	__global b3GpuGenericConstraint* constraint = &constraints[i];\n"
+"	switch (constraint->m_constraintType)\n" // NOTE(review): the case labels are not visible in this excerpt
+"	{\n"
+"		{\n"
+"			infos[i] = 3;\n"
+"			break;\n"
+"		}\n"
+"		{\n"
+"			infos[i] = 6;\n"
+"			break;\n"
+"		}\n"
+"		default:\n" // writes nothing, so infos[i] keeps whatever value the buffer already held
+"		{\n"
+"		}\n"
+"	}\n"
+"__kernel void initBatchConstraintsKernel(__global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, \n" // each work-item fills batchConstraints[i]; numConstraintRows and rowOffsets are not read in the visible body
+"										__global b3BatchConstraint* batchConstraints, \n"
+"										__global b3GpuGenericConstraint* constraints,\n"
+"										__global b3RigidBodyCL* bodies,\n"
+"										int numConstraints)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numConstraints)\n"
+"		return;\n"
+"	int rbA = constraints[i].m_rbA;\n"
+"	int rbB = constraints[i].m_rbB;\n"
+"	batchConstraints[i].m_bodyAPtrAndSignBit = bodies[rbA].m_invMass != 0.f ? rbA : -rbA;\n" // a zero inverse mass is encoded as a negated index; NOTE(review): body index 0 cannot carry the sign
+"	batchConstraints[i].m_bodyBPtrAndSignBit = bodies[rbB].m_invMass != 0.f ? rbB : -rbB;\n"
+"	batchConstraints[i].m_batchId = -1;\n" // -1 = no batch assigned yet (presumably filled in by a later batching pass)
+"	batchConstraints[i].m_originalConstraintIndex = i;\n"
+"typedef struct\n"
+"	// integrator parameters: frames per second (1/stepsize), default error\n"
+"	// reduction parameter (0..1).\n"
+"	float fps,erp;\n"
+"	// for the first and second body, pointers to two (linear and angular)\n"
+"	// n*3 jacobian sub matrices, stored by rows. these matrices will have\n"
+"	// been initialized to 0 on entry. if the second body is zero then the\n"
+"	// J2xx pointers may be 0.\n"
+"	union \n"
+"	{\n"
+"		__global float4* m_J1linearAxisFloat4;\n"
+"		__global float* m_J1linearAxis;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		__global float4* m_J1angularAxisFloat4;\n"
+"		__global float* m_J1angularAxis;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"	__global float4* m_J2linearAxisFloat4;\n"
+"	__global float* m_J2linearAxis;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		__global float4* m_J2angularAxisFloat4;\n"
+"		__global float* m_J2angularAxis;\n"
+"	};\n"
+"	// elements to jump from one row to the next in J's\n"
+"	int rowskip;\n"
+"	// right hand sides of the equation J*v = c + cfm * lambda. cfm is the\n"
+"	// \"constraint force mixing\" vector. c is set to zero on entry, cfm is\n"
+"	// set to a constant value (typically very small or zero) value on entry.\n"
+"	__global float* m_constraintError;\n"
+"	__global float* cfm;\n"
+"	// lo and hi limits for variables (set to -/+ infinity on entry).\n"
+"	__global float* m_lowerLimit;\n"
+"	__global float* m_upperLimit;\n"
+"	// findex vector for variables. see the LCP solver interface for a\n"
+"	// description of what this does. this is set to -1 on entry.\n"
+"	// note that the returned indexes are relative to the first index of\n"
+"	// the constraint.\n"
+"	__global int *findex;\n"
+"	// number of solver iterations\n"
+"	int m_numIterations;\n"
+"	//damping of the velocity\n"
+"	float	m_damping;\n"
+"} b3GpuConstraintInfo2;\n"
+"void	getSkewSymmetricMatrix(float4 vecIn, __global float4* v0,__global float4* v1,__global float4* v2)\n" // writes the three rows of the cross-product (skew-symmetric) matrix of vecIn; w of every row is 0
+"	*v0 = (float4)(0.		,-vecIn.z		,vecIn.y,0.f);\n"
+"	*v1 = (float4)(vecIn.z	,0.			,-vecIn.x,0.f);\n"
+"	*v2 = (float4)(-vecIn.y	,vecIn.x	,0.f,0.f);\n"
+"void getInfo2Point2Point(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies)\n" // fills three Jacobian rows and their error terms for a point-to-point joint
+"	float4 posA = bodies[constraint->m_rbA].m_pos;\n"
+"	Quaternion rotA = bodies[constraint->m_rbA].m_quat;\n"
+"	float4 posB = bodies[constraint->m_rbB].m_pos;\n"
+"	Quaternion rotB = bodies[constraint->m_rbB].m_quat;\n"
+"		// anchor points in global coordinates with respect to body PORs.\n"
+"   \n"
+"    // set jacobian\n"
+"    info->m_J1linearAxis[0] = 1;\n" // rows are rowskip floats apart, so these three writes form an identity block
+"	info->m_J1linearAxis[info->rowskip+1] = 1;\n"
+"	info->m_J1linearAxis[2*info->rowskip+2] = 1;\n"
+"	float4 a1 = qtRotate(rotA,constraint->m_pivotInA);\n" // pivot A rotated by body A's orientation
+"	{\n"
+"		__global float4* angular0 = (__global float4*)(info->m_J1angularAxis);\n"
+"		__global float4* angular1 = (__global float4*)(info->m_J1angularAxis+info->rowskip);\n"
+"		__global float4* angular2 = (__global float4*)(info->m_J1angularAxis+2*info->rowskip);\n"
+"		float4 a1neg = -a1;\n"
+"		getSkewSymmetricMatrix(a1neg,angular0,angular1,angular2);\n"
+"	}\n"
+"	if (info->m_J2linearAxis)\n" // optional block: skipped when the caller passes a null J2 linear pointer
+"	{\n"
+"		info->m_J2linearAxis[0] = -1;\n"
+"		info->m_J2linearAxis[info->rowskip+1] = -1;\n"
+"		info->m_J2linearAxis[2*info->rowskip+2] = -1;\n"
+"	}\n"
+"	\n"
+"	float4 a2 = qtRotate(rotB,constraint->m_pivotInB);\n" // pivot B rotated by body B's orientation
+"   \n"
+"	{\n"
+"	//	float4 a2n = -a2;\n"
+"		__global float4* angular0 = (__global float4*)(info->m_J2angularAxis);\n"
+"		__global float4* angular1 = (__global float4*)(info->m_J2angularAxis+info->rowskip);\n"
+"		__global float4* angular2 = (__global float4*)(info->m_J2angularAxis+2*info->rowskip);\n"
+"		getSkewSymmetricMatrix(a2,angular0,angular1,angular2);\n"
+"	}\n"
+"    \n"
+"    // set right hand side\n"
+"//	float currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;\n"
+"	float currERP = info->erp;\n"
+"	float k = info->fps * currERP;\n" // error gain = fps * erp
+"    int j;\n"
+"	float4 result = a2 + posB - a1 - posA;\n" // difference between the two pivots after rotation and translation
+"	float* resultPtr = &result;\n" // NOTE(review): address of a float4 assigned to float* without a cast; used to index x,y,z below
+"	for (j=0; j<3; j++)\n"
+"    {\n"
+"        info->m_constraintError[j*info->rowskip] = k * (resultPtr[j]);\n"
+"    }\n"
+"Quaternion nearest( Quaternion first, Quaternion qd)\n" // returns qd or -qd, whichever is closer to `first` by squared 4-component distance
+"	Quaternion diff,sum;\n"
+"	diff = first- qd;\n"
+"	sum = first + qd;\n"
+"	\n"
+"	if( dot(diff,diff) < dot(sum,sum) )\n" // |first-qd|^2 versus |first-(-qd)|^2
+"		return qd;\n"
+"	return (-qd);\n"
+"float b3Acos(float x) \n" // acos with the argument clamped to [-1, 1] first
+"{ \n"
+"	if (x<-1)	\n"
+"		x=-1; \n"
+"	if (x>1)	\n"
+"		x=1;\n"
+"	return acos(x); \n"
+"float getAngle(Quaternion orn)\n" // returns 2*acos(orn.w)
+"	if (orn.w>=1.f)\n" // upper clamp only; b3Acos below clamps both ends again
+"		orn.w=1.f;\n"
+"	float s = 2.f * b3Acos(orn.w);\n"
+"	return s;\n"
+"void calculateDiffAxisAngleQuaternion( Quaternion orn0,Quaternion orn1a,float4* axis,float* angle)\n" // outputs axis and angle of the relative rotation orn1 * qtInvert(orn0)
+"	Quaternion orn1 = nearest(orn0,orn1a);\n" // choose the sign of orn1a that is closer to orn0
+"	\n"
+"	Quaternion dorn = qtMul(orn1,qtInvert(orn0));\n"
+"	*angle = getAngle(dorn);\n"
+"	*axis = (float4)(dorn.x,dorn.y,dorn.z,0.f);\n"
+"	\n"
+"	//check for axis length\n"
+"	float len = dot3F4(*axis,*axis);\n" // squared length of the unnormalised axis
+"	if (len < FLT_EPSILON*FLT_EPSILON)\n"
+"		*axis = (float4)(1,0,0,0);\n" // degenerate axis: fall back to +X
+"	else\n"
+"		*axis /= sqrt(len);\n"
+"void getInfo2FixedOrientation(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies, int start_row)\n" // fills three angular rows, starting at start_row, that drive the relative orientation towards m_relTargetAB
+"	Quaternion worldOrnA = bodies[constraint->m_rbA].m_quat;\n"
+"	Quaternion worldOrnB = bodies[constraint->m_rbB].m_quat;\n"
+"	int s = info->rowskip;\n"
+"	int start_index = start_row * s;\n" // float offset of the first row written by this function
+"	// 3 rows to make body rotations equal\n"
+"	info->m_J1angularAxis[start_index] = 1;\n"
+"	info->m_J1angularAxis[start_index + s + 1] = 1;\n"
+"	info->m_J1angularAxis[start_index + s*2+2] = 1;\n"
+"	if ( info->m_J2angularAxis)\n"
+"	{\n"
+"		info->m_J2angularAxis[start_index] = -1;\n"
+"		info->m_J2angularAxis[start_index + s+1] = -1;\n"
+"		info->m_J2angularAxis[start_index + s*2+2] = -1;\n"
+"	}\n"
+"	\n"
+"	float currERP = info->erp;\n"
+"	float k = info->fps * currERP;\n" // error gain = fps * erp
+"	float4 diff;\n"
+"	float angle;\n"
+"	float4 qrelCur = qtMul(worldOrnA,qtInvert(worldOrnB));\n" // current relative orientation of A with respect to B
+"	\n"
+"	calculateDiffAxisAngleQuaternion(constraint->m_relTargetAB,qrelCur,&diff,&angle);\n"
+"	diff*=-angle;\n" // axis scaled by the negated angle
+"		\n"
+"	float* resultPtr = &diff;\n" // NOTE(review): address of a float4 assigned to float* without a cast; used to index x,y,z below
+"	\n"
+"	for (int j=0; j<3; j++)\n"
+"    {\n"
+"        info->m_constraintError[(start_row+j)*info->rowskip] = k * resultPtr[j];\n" // fix: row base was hard-coded to 3 while the Jacobian rows above honour start_row; same result for start_row == 3
+"    }\n"
+"	\n"
+"__kernel void writeBackVelocitiesKernel(__global b3RigidBodyCL* bodies,__global b3GpuSolverBody* solverBodies,int numBodies)\n" // each work-item adds the solver's accumulated delta velocities of body i back onto bodies[i]
+"	int i = get_global_id(0);\n"
+"	if (i>=numBodies)\n"
+"		return;\n"
+"	if (bodies[i].m_invMass)\n" // bodies with zero inverse mass are left untouched
+"	{\n"
+"//		if (length(solverBodies[i].m_deltaLinearVelocity)<MOTIONCLAMP)\n"
+"		{\n"
+"			bodies[i].m_linVel += solverBodies[i].m_deltaLinearVelocity;\n"
+"		}\n"
+"//		if (length(solverBodies[i].m_deltaAngularVelocity)<MOTIONCLAMP)\n"
+"		{\n"
+"			bodies[i].m_angVel += solverBodies[i].m_deltaAngularVelocity;\n"
+"		} \n"
+"	}\n"
+"__kernel void getInfo2Kernel(__global b3SolverConstraint* solverConstraintRows, \n" // each work-item builds the solver rows of constraint i: reset, type-specific Jacobian fill, then per-row finalisation
+"							__global unsigned int* infos, \n"
+"							__global unsigned int* constraintRowOffsets, \n"
+"							__global b3GpuGenericConstraint* constraints, \n"
+"							__global b3BatchConstraint* batchConstraints, \n"
+"							__global b3RigidBodyCL* bodies,\n"
+"							__global BodyInertia* inertias,\n"
+"							__global b3GpuSolverBody* solverBodies,\n"
+"							float timeStep,\n"
+"							float globalErp,\n"
+"							float globalCfm,\n"
+"							float globalDamping,\n"
+"							int globalNumIterations,\n"
+"							int numConstraints)\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numConstraints)\n"
+"		return;\n"
+"		\n"
+"	//for now, always initialize the batch info\n"
+"	int info1 = infos[i];\n" // number of rows this constraint owns
+"			\n"
+"	__global b3SolverConstraint* currentConstraintRow = &solverConstraintRows[constraintRowOffsets[i]];\n" // first row of this constraint's block
+"	__global b3GpuGenericConstraint* constraint = &constraints[i];\n"
+"	__global b3RigidBodyCL* rbA = &bodies[ constraint->m_rbA];\n"
+"	__global b3RigidBodyCL* rbB = &bodies[ constraint->m_rbB];\n"
+"	int solverBodyIdA = constraint->m_rbA;\n" // solver bodies are addressed with the same index as the rigid bodies
+"	int solverBodyIdB = constraint->m_rbB;\n"
+"	__global b3GpuSolverBody* bodyAPtr = &solverBodies[solverBodyIdA];\n"
+"	__global b3GpuSolverBody* bodyBPtr = &solverBodies[solverBodyIdB];\n"
+"	if (rbA->m_invMass)\n"
+"	{\n"
+"		batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA;\n"
+"	} else\n"
+"	{\n"
+"//			if (!solverBodyIdA)\n"
+"//				m_staticIdx = 0;\n"
+"		batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA;\n" // zero inverse mass is encoded as a negated index; NOTE(review): index 0 cannot carry the sign
+"	}\n"
+"	if (rbB->m_invMass)\n"
+"	{\n"
+"		batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB;\n"
+"	} else\n"
+"	{\n"
+"//			if (!solverBodyIdB)\n"
+"//				m_staticIdx = 0;\n"
+"		batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB;\n"
+"	}\n"
+"	if (info1)\n" // nothing else happens for constraints that own no rows
+"	{\n"
+"		int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;\n"
+"//		if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)\n"
+"	//		m_maxOverrideNumSolverIterations = overrideNumSolverIterations;\n"
+"		int j;\n"
+"		for ( j=0;j<info1;j++)\n" // field-by-field reset of every row (stands in for the commented-out memset)
+"		{\n"
+"//			memset(&currentConstraintRow[j],0,sizeof(b3SolverConstraint));\n"
+"			currentConstraintRow[j].m_angularComponentA = (float4)(0,0,0,0);\n"
+"			currentConstraintRow[j].m_angularComponentB = (float4)(0,0,0,0);\n"
+"			currentConstraintRow[j].m_appliedImpulse = 0.f;\n"
+"			currentConstraintRow[j].m_appliedPushImpulse = 0.f;\n"
+"			currentConstraintRow[j].m_cfm = 0.f;\n"
+"			currentConstraintRow[j].m_contactNormal = (float4)(0,0,0,0);\n"
+"			currentConstraintRow[j].m_friction = 0.f;\n"
+"			currentConstraintRow[j].m_frictionIndex = 0;\n"
+"			currentConstraintRow[j].m_jacDiagABInv = 0.f;\n"
+"			currentConstraintRow[j].m_lowerLimit = 0.f;\n" // overwritten with -B3_INFINITY a few lines below
+"			currentConstraintRow[j].m_upperLimit = 0.f;\n" // overwritten with B3_INFINITY a few lines below
+"			currentConstraintRow[j].m_originalConstraint = i;\n"
+"			currentConstraintRow[j].m_overrideNumSolverIterations = 0;\n"
+"			currentConstraintRow[j].m_relpos1CrossNormal = (float4)(0,0,0,0);\n"
+"			currentConstraintRow[j].m_relpos2CrossNormal = (float4)(0,0,0,0);\n"
+"			currentConstraintRow[j].m_rhs = 0.f;\n"
+"			currentConstraintRow[j].m_rhsPenetration = 0.f;\n"
+"			currentConstraintRow[j].m_solverBodyIdA = 0;\n"
+"			currentConstraintRow[j].m_solverBodyIdB = 0;\n"
+"							\n"
+"			currentConstraintRow[j].m_lowerLimit = -B3_INFINITY;\n" // default: unbounded impulse limits
+"			currentConstraintRow[j].m_upperLimit = B3_INFINITY;\n"
+"			currentConstraintRow[j].m_appliedImpulse = 0.f;\n"
+"			currentConstraintRow[j].m_appliedPushImpulse = 0.f;\n"
+"			currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;\n"
+"			currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;\n"
+"			currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;		\n"
+"		}\n"
+"		bodyAPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n" // zero the accumulators of both attached bodies; NOTE(review): other work-items may write the same body when constraints share it — confirm this is benign
+"		bodyAPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n"
+"		bodyAPtr->m_pushVelocity = (float4)(0,0,0,0);\n"
+"		bodyAPtr->m_turnVelocity = (float4)(0,0,0,0);\n"
+"		bodyBPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n"
+"		bodyBPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n"
+"		bodyBPtr->m_pushVelocity = (float4)(0,0,0,0);\n"
+"		bodyBPtr->m_turnVelocity  = (float4)(0,0,0,0);\n"
+"		int rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n" // this local is not read afterwards; info2.rowskip is assigned separately below
+"		\n"
+"		b3GpuConstraintInfo2 info2;\n" // pointers below alias fields of the first row; later rows are reached by adding rowskip floats
+"		info2.fps = 1.f/timeStep;\n"
+"		info2.erp = globalErp;\n"
+"		info2.m_J1linearAxisFloat4 = &currentConstraintRow->m_contactNormal;\n"
+"		info2.m_J1angularAxisFloat4 = &currentConstraintRow->m_relpos1CrossNormal;\n"
+"		info2.m_J2linearAxisFloat4 = 0;\n" // null: getInfo2Point2Point skips its J2 linear block
+"		info2.m_J2angularAxisFloat4 = &currentConstraintRow->m_relpos2CrossNormal;\n"
+"		info2.rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n" // row stride expressed in floats
+"		///the size of b3SolverConstraint needs be a multiple of float\n"
+"//		b3Assert(info2.rowskip*sizeof(float)== sizeof(b3SolverConstraint));\n"
+"		info2.m_constraintError = &currentConstraintRow->m_rhs;\n"
+"		currentConstraintRow->m_cfm = globalCfm;\n" // only the first row receives globalCfm here; the reset loop left the others at 0
+"		info2.m_damping = globalDamping;\n"
+"		info2.cfm = &currentConstraintRow->m_cfm;\n"
+"		info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;\n"
+"		info2.m_upperLimit = &currentConstraintRow->m_upperLimit;\n"
+"		info2.m_numIterations = globalNumIterations;\n"
+"		switch (constraint->m_constraintType)\n" // NOTE(review): the case labels are not visible in this excerpt
+"		{\n"
+"			{\n"
+"				getInfo2Point2Point(constraint,&info2,bodies);\n"
+"				break;\n"
+"			}\n"
+"			{\n"
+"				getInfo2Point2Point(constraint,&info2,bodies);\n"
+"				getInfo2FixedOrientation(constraint,&info2,bodies,3);\n" // orientation rows start at row 3, after the three point-to-point rows
+"				break;\n"
+"			}\n"
+"			default:\n"
+"			{\n"
+"			}\n"
+"		}\n"
+"		///finalize the constraint setup\n"
+"		for ( j=0;j<info1;j++)\n"
+"		{\n"
+"			__global b3SolverConstraint* solverConstraint = &currentConstraintRow[j];\n"
+"			if (solverConstraint->m_upperLimit>=constraint->m_breakingImpulseThreshold)\n" // clamp both limits to +/- the breaking threshold
+"			{\n"
+"				solverConstraint->m_upperLimit = constraint->m_breakingImpulseThreshold;\n"
+"			}\n"
+"			if (solverConstraint->m_lowerLimit<=-constraint->m_breakingImpulseThreshold)\n"
+"			{\n"
+"				solverConstraint->m_lowerLimit = -constraint->m_breakingImpulseThreshold;\n"
+"			}\n"
+"//						solverConstraint->m_originalContactPoint = constraint;\n"
+"							\n"
+"			Matrix3x3 invInertiaWorldA= inertias[constraint->m_rbA].m_invInertiaWorld;\n" // loop-invariant load, repeated for every row
+"			{\n"
+"				//float4 angularFactorA(1,1,1);\n"
+"				float4 ftorqueAxis1 = solverConstraint->m_relpos1CrossNormal;\n"
+"				solverConstraint->m_angularComponentA = mtMul1(invInertiaWorldA,ftorqueAxis1);//*angularFactorA;\n"
+"			}\n"
+"						\n"
+"			Matrix3x3 invInertiaWorldB= inertias[constraint->m_rbB].m_invInertiaWorld;\n"
+"			{\n"
+"				float4 ftorqueAxis2 = solverConstraint->m_relpos2CrossNormal;\n"
+"				solverConstraint->m_angularComponentB = mtMul1(invInertiaWorldB,ftorqueAxis2);//*constraint->m_rbB.getAngularFactor();\n"
+"			}\n"
+"			{\n"
+"				//it is ok to use solverConstraint->m_contactNormal instead of -solverConstraint->m_contactNormal\n"
+"				//because it gets multiplied iMJlB\n"
+"				float4 iMJlA = solverConstraint->m_contactNormal*rbA->m_invMass;\n"
+"				float4 iMJaA = mtMul3(solverConstraint->m_relpos1CrossNormal,invInertiaWorldA);\n"
+"				float4 iMJlB = solverConstraint->m_contactNormal*rbB->m_invMass;//sign of normal?\n"
+"				float4 iMJaB = mtMul3(solverConstraint->m_relpos2CrossNormal,invInertiaWorldB);\n"
+"				float sum = dot3F4(iMJlA,solverConstraint->m_contactNormal);\n" // sum of four terms: linear and angular contributions of both bodies
+"				sum += dot3F4(iMJaA,solverConstraint->m_relpos1CrossNormal);\n"
+"				sum += dot3F4(iMJlB,solverConstraint->m_contactNormal);\n"
+"				sum += dot3F4(iMJaB,solverConstraint->m_relpos2CrossNormal);\n"
+"				float fsum = fabs(sum);\n"
+"				if (fsum>FLT_EPSILON)\n" // avoid dividing by a near-zero sum
+"				{\n"
+"					solverConstraint->m_jacDiagABInv = 1.f/sum;\n"
+"				} else\n"
+"				{\n"
+"					solverConstraint->m_jacDiagABInv = 0.f;\n"
+"				}\n"
+"			}\n"
+"			///fix rhs\n"
+"			///todo: add force/torque accelerators\n"
+"			{\n"
+"				float rel_vel;\n"
+"				float vel1Dotn = dot3F4(solverConstraint->m_contactNormal,rbA->m_linVel) + dot3F4(solverConstraint->m_relpos1CrossNormal,rbA->m_angVel);\n"
+"				float vel2Dotn = -dot3F4(solverConstraint->m_contactNormal,rbB->m_linVel) + dot3F4(solverConstraint->m_relpos2CrossNormal,rbB->m_angVel);\n"
+"				rel_vel = vel1Dotn+vel2Dotn;\n"
+"				float restitution = 0.f;\n"
+"				float positionalError = solverConstraint->m_rhs;//already filled in by getConstraintInfo2\n"
+"				float	velocityError = restitution - rel_vel * info2.m_damping;\n" // restitution is 0, so this is -rel_vel * damping
+"				float	penetrationImpulse = positionalError*solverConstraint->m_jacDiagABInv;\n"
+"				float	velocityImpulse = velocityError *solverConstraint->m_jacDiagABInv;\n"
+"				solverConstraint->m_rhs = penetrationImpulse+velocityImpulse;\n" // m_rhs changes meaning here: from the error term to the target impulse
+"				solverConstraint->m_appliedImpulse = 0.f;\n"
+"			}\n"
+"		}\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveContact.cl b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveContact.cl
new file mode 100644
index 00000000..5c4d62e4
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveContact.cl
@@ -0,0 +1,501 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+//#pragma OPENCL EXTENSION cl_amd_printf : enable
+// Enable the 32-bit integer atomics (base and extended) for local and global memory.
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+// Optional atomic-counter extension.
+// NOTE(review): no #else/#endif belonging to this #ifdef is visible in this hunk - confirm how far the conditional extends.
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#define counter32_t volatile global int*
+// Short aliases for the unsigned integer types.
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+// Shorthands for the work-item / work-group queries along dimension 0.
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+// Atomic wrappers. All of them take an lvalue and pass its address on, except AppendInc,
+// which hands x to atomic_inc unchanged (so x must already be a pointer).
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+// Forwards to select() with the argument order unchanged.
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+// Expands to the cast prefix only, so mymake_float4(a,b,c,d) becomes the vector literal (float4)(a,b,c,d).
+#define mymake_float4 (float4)
+//#define make_float2 (float2)
+//#define make_uint4 (uint4)
+//#define make_int4 (int4)
+//#define make_uint2 (uint2)
+//#define make_int2 (int2)
+// Aliases for the max/min built-ins.
+#define max2 max
+#define min2 min
+//	Vector
+// Thin wrapper over the fast_normalize built-in; all four lanes of v take part.
+float4 fastNormalize4(float4 v)
+	return fast_normalize(v);
+// Forwards both float4 operands to the cross built-in.
+float4 cross3(float4 a, float4 b)
+	return cross(a,b);
+// Three-component dot product: both operands are copied with w forced to 0,
+// so whatever the callers keep in the w lanes never contributes.
+float dot3F4(float4 a, float4 b)
+	float4 a1 = mymake_float4(a.xyz,0.f);
+	float4 b1 = mymake_float4(b.xyz,0.f);
+	return dot(a1, b1);
+// Normalises the xyz part of a: w is replaced by 0 before the vector is passed to fastNormalize4.
+float4 normalize3(const float4 a)
+	float4 n = mymake_float4(a.x, a.y, a.z, 0.f);
+	return fastNormalize4( n );
+// Alternative based on an explicit length, left disabled:
+//	float length = sqrtf(dot3F4(a, a));
+//	return 1.f/length * a;
+//	Matrix3x3
+// Matrix stored as three float4 rows; mtMul1 and mtMul3 read only the xyz lanes of each row.
+// NOTE(review): the closing "} Matrix3x3;" line is not visible in this hunk - the type name is inferred from its users.
+typedef struct
+	float4 m_row[3];
+// Prototypes for the two matrix/vector products.
+float4 mtMul1(Matrix3x3 a, float4 b);
+float4 mtMul3(float4 a, Matrix3x3 b);
+// Matrix times column vector: lane i of the result is the 3-component dot product
+// of row i of a with b. The w lane of the result is set to 0.
+float4 mtMul1(Matrix3x3 a, float4 b)
+	float4 ans;
+	ans.x = dot3F4( a.m_row[0], b );
+	ans.y = dot3F4( a.m_row[1], b );
+	ans.z = dot3F4( a.m_row[2], b );
+	ans.w = 0.f;
+	return ans;
+// Row vector times matrix: lane i of the result is the 3-component dot product
+// of a with column i of b (the columns are gathered from the three stored rows).
+// The w lane of the result is set to 0, matching mtMul1.
+float4 mtMul3(float4 a, Matrix3x3 b)
+	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
+	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
+	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
+	float4 ans;
+	ans.x = dot3F4( a, colx );
+	ans.y = dot3F4( a, coly );
+	ans.z = dot3F4( a, colz );
+	// w was previously left unassigned, so an indeterminate value was returned in that lane
+	ans.w = 0.f;
+	return ans;
+//	Quaternion
+// Quaternions are stored as a plain float4.
+typedef float4 Quaternion;
+// Work-group size constant. Its only visible use is the commented-out ldsBatchIdx array in
+// BatchSolveKernelContact, which strides by the literal 64 instead.
+#define WG_SIZE 64
+// Per-body state used by the contact solver.
+// NOTE(review): presumably mirrors a host-side struct - confirm before changing the field order.
+typedef struct
+	// position; solveContact subtracts it from each contact point to get the lever arm
+	float4 m_pos;
+	// not referenced by the code visible in this file
+	Quaternion m_quat;
+	// linear and angular velocity; loaded and written back by solveContactConstraint
+	float4 m_linVel;
+	float4 m_angVel;
+	// not referenced by the code visible in this file
+	u32 m_shapeIdx;
+	// inverse mass; solveContactConstraint writes zero velocities to a body whose value is 0
+	float m_invMass;
+	// not referenced by the code visible in this file
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} Body;
+// Inverse inertia data; solveContactConstraint indexes this array with the body index.
+typedef struct
+	// matrix handed to solveContact
+	Matrix3x3 m_invInertia;
+	// not referenced by the code visible in this file
+	Matrix3x3 m_initInvInertia;
+} Shape;
+// One contact constraint between two bodies, holding up to four contact points.
+// NOTE(review): presumably mirrors a host-side struct - confirm before changing the field order.
+typedef struct
+	// xyz: direction used for the linear part of every contact row in solveContact;
+	// w: read as a friction coefficient by a commented-out line in solveContactConstraint
+	float4 m_linear;
+	// contact points; slot ic is paired with m_jacCoeffInv[ic], m_b[ic] and m_appliedRambdaDt[ic]
+	float4 m_worldPos[4];
+	// not referenced by the code visible in this file
+	float4 m_center;	
+	// per-point scale applied to the computed impulse; a value of exactly 0 makes solveContact skip the slot
+	float m_jacCoeffInv[4];
+	// per-point bias added to the relative velocity
+	float m_b[4];
+	// per-point running impulse total, clamped to [0, FLT_MAX] by solveContact
+	float m_appliedRambdaDt[4];
+	// not referenced by the code visible in this file
+	float m_fJacCoeffInv[2];	
+	float m_fAppliedRambdaDt[2];	
+	// indices of the two bodies in the body array
+	u32 m_bodyA;
+	u32 m_bodyB;
+	// batch number; BatchSolveKernelContact only solves entries whose value equals the current batch
+	int m_batchIdx;
+	u32 m_paddings[1];
+} Constraint4;
+// Four-int parameter block.
+// NOTE(review): not referenced by any code visible in this file - presumably mirrors a host-side struct; confirm before changing the layout.
+typedef struct
+	int m_nConstraints;
+	int m_start;
+	int m_batchIdx;
+	int m_nSplit;
+//	int m_paddings[1];
+} ConstBuffer;
+// Four-int parameter block for a batched solve.
+// NOTE(review): not referenced by any code visible in this file - presumably mirrors a host-side struct; confirm before changing the layout.
+typedef struct
+	int m_solveFriction;
+	int m_maxBatch;	//	long batch really kills the performance
+	int m_batchIdx;
+	int m_nSplit;
+//	int m_paddings[1];
+} ConstBufferBatchSolve;
+// Builds the three direction vectors of one constraint row from a direction n
+// and the lever arms r0 (body 0) and r1 (body 1):
+//   *linear   = (-n.xyz, 0)
+//   *angular0 = -(r0 x n)
+//   *angular1 =   r1 x n
+void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);
+void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)
+	*linear = mymake_float4(-n.xyz,0.f);
+	*angular0 = -cross3(r0, n);
+	*angular1 = cross3(r1, n);
+// Projects the velocities of two bodies onto a constraint row and returns the sum
+//   l0.linVel0 + a0.angVel0 + l1.linVel1 + a1.angVel1
+// where every dot product uses the xyz lanes only.
+float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );
+float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )
+	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);
+// Returns -1 / (invMass0 + angular0.(angular0*invInertia0) + invMass1 + angular1.(angular1*invInertia1)).
+// linear0 and linear1 are accepted but not read: the linear terms are taken to be the inverse masses.
+// There is no guard against a zero denominator.
+float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
+				   float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);
+float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
+					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)
+	//	linear0,1 are assumed to be normalized, so their squared length is dropped from the linear terms
+	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;
+	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);
+	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;
+	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);
+	return -1.f/(jmj0+jmj1+jmj2+jmj3);
+// Runs one solver pass over the four contact slots of a single constraint.
+//   cs                        - constraint in global memory; m_appliedRambdaDt[] is updated in place
+//   posA, posB                - body positions, used to form the lever arm to each contact point
+//   linVelA .. angVelB        - body velocities, updated in place with the applied impulses
+//   invMassA/B, invInertiaA/B - inverse mass and inverse inertia matrix of each body
+// Slots whose m_jacCoeffInv is exactly 0 are skipped.
+void solveContact(__global Constraint4* cs,
+				  float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,
+				  float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);
+void solveContact(__global Constraint4* cs,
+			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,
+			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)
+	// bounds for the accumulated impulse of every slot
+	float minRambdaDt = 0;
+	float maxRambdaDt = FLT_MAX;
+	for(int ic=0; ic<4; ic++)
+	{
+		if( cs->m_jacCoeffInv[ic] == 0.f ) continue;
+		float4 angular0, angular1, linear;
+		// lever arms from each body position to the contact point
+		float4 r0 = cs->m_worldPos[ic] - posA;
+		float4 r1 = cs->m_worldPos[ic] - posB;
+		// called with -m_linear, so 'linear' comes back as (m_linear.xyz, 0)
+		setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );
+		// relative velocity along the row plus the slot's bias, scaled by the slot's coefficient
+		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, 
+			*linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];
+		rambdaDt *= cs->m_jacCoeffInv[ic];
+		{
+			// add to the running total, clamp the total, and keep only the part that was actually applied
+			float prevSum = cs->m_appliedRambdaDt[ic];
+			float updated = prevSum;
+			updated += rambdaDt;
+			updated = max2( updated, minRambdaDt );
+			updated = min2( updated, maxRambdaDt );
+			rambdaDt = updated - prevSum;
+			cs->m_appliedRambdaDt[ic] = updated;
+		}
+		// turn the impulse into velocity changes; body B receives the opposite linear direction
+		float4 linImp0 = invMassA*linear*rambdaDt;
+		float4 linImp1 = invMassB*(-linear)*rambdaDt;
+		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
+		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
+		*linVelA += linImp0;
+		*angVelA += angImp0;
+		*linVelB += linImp1;
+		*angVelB += angImp1;
+	}
+// Builds two vectors p and q from n, with q = n x p as stated by the comments below.
+// Only the x, y and z lanes of *p and *q are written; their w lanes are left untouched.
+// NOTE(review): presumably expects n to be unit length - confirm against the callers.
+void btPlaneSpace1 (const float4* n, float4* p, float4* q);
+ void btPlaneSpace1 (const float4* n, float4* p, float4* q)
+  // 0.70710678f is about 1/sqrt(2): pick the plane from the size of the z component of n
+  if (fabs(n[0].z) > 0.70710678f) {
+    // choose p in y-z plane
+    float a = n[0].y*n[0].y + n[0].z*n[0].z;
+    float k = 1.f/sqrt(a);
+    p[0].x = 0;
+	p[0].y = -n[0].z*k;
+	p[0].z = n[0].y*k;
+    // set q = n x p
+    q[0].x = a*k;
+	q[0].y = -n[0].x*p[0].z;
+	q[0].z = n[0].x*p[0].y;
+  }
+  else {
+    // choose p in x-y plane
+    float a = n[0].x*n[0].x + n[0].y*n[0].y;
+    float k = 1.f/sqrt(a);
+    p[0].x = -n[0].y*k;
+	p[0].y = n[0].x*k;
+	p[0].z = 0;
+    // set q = n x p
+    q[0].x = -n[0].z*p[0].y;
+	q[0].y = n[0].z*p[0].x;
+	q[0].z = a*k;
+  }
+// Solves one contact constraint: loads the state of its two bodies, runs solveContact
+// on private copies of the velocities, then writes the velocities back to gBodies.
+//   gBodies - body array, indexed by the constraint's m_bodyA / m_bodyB
+//   gShapes - inverse inertia array; NOTE(review): indexed with the body index, not Body::m_shapeIdx - confirm this is intended
+//   ldsCs   - the constraint to solve; only element 0 is used
+void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);
+void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)
+	//float frictionCoeff = ldsCs[0].m_linear.w;
+	int aIdx = ldsCs[0].m_bodyA;
+	int bIdx = ldsCs[0].m_bodyB;
+	float4 posA = gBodies[aIdx].m_pos;
+	float4 linVelA = gBodies[aIdx].m_linVel;
+	float4 angVelA = gBodies[aIdx].m_angVel;
+	float invMassA = gBodies[aIdx].m_invMass;
+	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;
+	float4 posB = gBodies[bIdx].m_pos;
+	float4 linVelB = gBodies[bIdx].m_linVel;
+	float4 angVelB = gBodies[bIdx].m_angVel;
+	float invMassB = gBodies[bIdx].m_invMass;
+	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;
+	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,
+			posB, &linVelB, &angVelB, invMassB, invInertiaB );
+  // a body with zero inverse mass gets zero velocities written instead of the solved ones
+  if (gBodies[aIdx].m_invMass)
+  {
+		gBodies[aIdx].m_linVel = linVelA;
+		gBodies[aIdx].m_angVel = angVelA;
+	} else
+	{
+		gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);
+		gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);
+	}
+	if (gBodies[bIdx].m_invMass)
+  {
+		gBodies[bIdx].m_linVel = linVelB;
+		gBodies[bIdx].m_angVel = angVelB;
+	} else
+	{
+		gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);
+		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);
+	}
+// Four ints and four floats of scratch output.
+// NOTE(review): the only visible references are the commented-out debugInfo[...] lines in
+// BatchSolveKernelContact - presumably a debugging aid; confirm whether it is still needed.
+typedef struct 
+	int m_valInt0;
+	int m_valInt1;
+	int m_valInt2;
+	int m_valInt3;
+	float m_val0;
+	float m_val1;
+	float m_val2;
+	float m_val3;
+} SolverDebugInfo;
+// Solves the contact constraints of one grid cell, one batch after another, with one work-group per cell.
+//   gBodies, gShapes - forwarded to solveContactConstraint
+//   gConstraints     - constraint array; the cell owns the range [gOffsets[cell], gOffsets[cell] + gN[cell])
+//   gN, gOffsets     - per-cell constraint count and first constraint index
+//   batchSizes       - per-cell number of batches to step through
+//   maxBatch1        - not read
+//   cellBatch        - bits 0, 1 and 2 are added to the x, y and z cell index respectively
+//   nSplit           - grid resolution; only .x and .y are read
+// The work-group id is decoded so that every axis index has the form 2*k + (bit of cellBatch).
+// Each work-item walks the cell's range with a stride of 64 and stops at the first entry
+// that does not belong to the current batch.
+// NOTE(review): presumably the range is sorted by m_batchIdx and the work-group size is 64 - confirm on the host side.
+void BatchSolveKernelContact(__global Body* gBodies,
+                      __global Shape* gShapes,
+                      __global Constraint4* gConstraints,
+                      __global int* gN,
+                      __global int* gOffsets,
+                      __global	int* batchSizes,
+                       int maxBatch1,
+                       int cellBatch,
+                       int4 nSplit
+                      )
+	//__local int ldsBatchIdx[WG_SIZE+1];
+	__local int ldsCurBatch;
+	__local int ldsNextBatch;
+	__local int ldsStart;
+	int lIdx = GET_LOCAL_IDX;
+	int wgIdx = GET_GROUP_IDX;
+//	int gIdx = GET_GLOBAL_IDX;
+//	debugInfo[gIdx].m_valInt0 = gIdx;
+	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;
+	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);
+	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));
+	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);
+	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);
+	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);
+	//int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);
+	//int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);
+	//int cellIdx = xIdx+yIdx*nSplit;
+	// cellIdx depends only on the work-group id and the kernel arguments, so the whole
+	// work-group takes this early exit together and no work-item is left waiting at a barrier
+	if( gN[cellIdx] == 0 ) 
+		return;
+	int maxBatch = batchSizes[cellIdx];
+	const int start = gOffsets[cellIdx];
+	const int end = start + gN[cellIdx];
+	if( lIdx == 0 )
+	{
+		ldsCurBatch = 0;
+		ldsNextBatch = 0;
+		ldsStart = start;
+	}
+	// make work-item 0's initialisation of the local variables visible before anyone reads them
+	barrier(CLK_LOCAL_MEM_FENCE);
+	int idx=ldsStart+lIdx;
+	while (ldsCurBatch < maxBatch)
+	{
+		for(; idx<end; )
+		{
+			if (gConstraints[idx].m_batchIdx == ldsCurBatch)
+			{
+					solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );
+				 idx+=64;
+			} else
+			{
+				break;
+			}
+		}
+		// every work-item must be done with the current batch (its reads of ldsCurBatch and
+		// its writes to gBodies) before the batch counter moves on
+		barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+		if( lIdx == 0 )
+		{
+			ldsCurBatch++;
+		}
+		// publish the new batch number before the loop condition is evaluated again
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+// Solves one batch of contact constraints with one work-item per constraint.
+//   gBodies, gShapes      - forwarded to solveContactConstraint
+//   gConstraints          - constraint array
+//   cellIdx               - not read
+//   batchOffset           - index of the first constraint of the batch
+//   numConstraintsInBatch - number of constraints to solve; work-items past this count do nothing
+__kernel void solveSingleContactKernel(__global Body* gBodies,
+                      __global Shape* gShapes,
+                      __global Constraint4* gConstraints,
+                       int cellIdx,
+                       int batchOffset,
+                       int numConstraintsInBatch
+                      )
+	int index = get_global_id(0);
+	if (index < numConstraintsInBatch)
+	{
+		int idx=batchOffset+index;
+		solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );
+	}    
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveContact.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveContact.h
new file mode 100644
index 00000000..15a04999
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveContact.h
@@ -0,0 +1,393 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* solveContactCL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Takahiro Harada\n"
+"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#define counter32_t volatile global int*\n"
+"typedef unsigned int u32;\n"
+"typedef unsigned short u16;\n"
+"typedef unsigned char u8;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define mymake_float4 (float4)\n"
+"//#define make_float2 (float2)\n"
+"//#define make_uint4 (uint4)\n"
+"//#define make_int4 (int4)\n"
+"//#define make_uint2 (uint2)\n"
+"//#define make_int2 (int2)\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"//	Vector\n"
+"float4 fastNormalize4(float4 v)\n"
+"	return fast_normalize(v);\n"
+"float4 cross3(float4 a, float4 b)\n"
+"	return cross(a,b);\n"
+"float dot3F4(float4 a, float4 b)\n"
+"	float4 a1 = mymake_float4(a.xyz,0.f);\n"
+"	float4 b1 = mymake_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"float4 normalize3(const float4 a)\n"
+"	float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"//	float length = sqrtf(dot3F4(a, a));\n"
+"//	return 1.f/length * a;\n"
+"//	Matrix3x3\n"
+"typedef struct\n"
+"	float4 m_row[3];\n"
+"float4 mtMul1(Matrix3x3 a, float4 b);\n"
+"float4 mtMul3(float4 a, Matrix3x3 b);\n"
+"float4 mtMul1(Matrix3x3 a, float4 b)\n"
+"	float4 ans;\n"
+"	ans.x = dot3F4( a.m_row[0], b );\n"
+"	ans.y = dot3F4( a.m_row[1], b );\n"
+"	ans.z = dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"float4 mtMul3(float4 a, Matrix3x3 b)\n"
+"	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	float4 ans;\n"
+"	ans.x = dot3F4( a, colx );\n"
+"	ans.y = dot3F4( a, coly );\n"
+"	ans.z = dot3F4( a, colz );\n"
+"	return ans;\n"
+"//	Quaternion\n"
+"typedef float4 Quaternion;\n"
+"#define WG_SIZE 64\n"
+"typedef struct\n"
+"	float4 m_pos;\n"
+"	Quaternion m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_shapeIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} Body;\n"
+"typedef struct\n"
+"	Matrix3x3 m_invInertia;\n"
+"	Matrix3x3 m_initInvInertia;\n"
+"} Shape;\n"
+"typedef struct\n"
+"	float4 m_linear;\n"
+"	float4 m_worldPos[4];\n"
+"	float4 m_center;	\n"
+"	float m_jacCoeffInv[4];\n"
+"	float m_b[4];\n"
+"	float m_appliedRambdaDt[4];\n"
+"	float m_fJacCoeffInv[2];	\n"
+"	float m_fAppliedRambdaDt[2];	\n"
+"	u32 m_bodyA;\n"
+"	u32 m_bodyB;\n"
+"	int m_batchIdx;\n"
+"	u32 m_paddings[1];\n"
+"} Constraint4;\n"
+"typedef struct\n"
+"	int m_nConstraints;\n"
+"	int m_start;\n"
+"	int m_batchIdx;\n"
+"	int m_nSplit;\n"
+"//	int m_paddings[1];\n"
+"} ConstBuffer;\n"
+"typedef struct\n"
+"	int m_solveFriction;\n"
+"	int m_maxBatch;	//	long batch really kills the performance\n"
+"	int m_batchIdx;\n"
+"	int m_nSplit;\n"
+"//	int m_paddings[1];\n"
+"} ConstBufferBatchSolve;\n"
+"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
+"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
+"	*linear = mymake_float4(-n.xyz,0.f);\n"
+"	*angular0 = -cross3(r0, n);\n"
+"	*angular1 = cross3(r1, n);\n"
+"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
+"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
+"	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
+"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
+"				   float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
+"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
+"					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
+"	//	linear0,1 are normlized\n"
+"	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
+"	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
+"	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
+"	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
+"	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
+"void solveContact(__global Constraint4* cs,\n"
+"				  float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
+"				  float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n"
+"void solveContact(__global Constraint4* cs,\n"
+"			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
+"			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n"
+"	float minRambdaDt = 0;\n"
+"	float maxRambdaDt = FLT_MAX;\n"
+"	for(int ic=0; ic<4; ic++)\n"
+"	{\n"
+"		if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
+"		float4 angular0, angular1, linear;\n"
+"		float4 r0 = cs->m_worldPos[ic] - posA;\n"
+"		float4 r1 = cs->m_worldPos[ic] - posB;\n"
+"		setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
+"		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
+"			*linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n"
+"		rambdaDt *= cs->m_jacCoeffInv[ic];\n"
+"		{\n"
+"			float prevSum = cs->m_appliedRambdaDt[ic];\n"
+"			float updated = prevSum;\n"
+"			updated += rambdaDt;\n"
+"			updated = max2( updated, minRambdaDt );\n"
+"			updated = min2( updated, maxRambdaDt );\n"
+"			rambdaDt = updated - prevSum;\n"
+"			cs->m_appliedRambdaDt[ic] = updated;\n"
+"		}\n"
+"		float4 linImp0 = invMassA*linear*rambdaDt;\n"
+"		float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
+"		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
+"		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
+"		*linVelA += linImp0;\n"
+"		*angVelA += angImp0;\n"
+"		*linVelB += linImp1;\n"
+"		*angVelB += angImp1;\n"
+"	}\n"
+"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
+" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
+"  if (fabs(n[0].z) > 0.70710678f) {\n"
+"    // choose p in y-z plane\n"
+"    float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
+"    float k = 1.f/sqrt(a);\n"
+"    p[0].x = 0;\n"
+"	p[0].y = -n[0].z*k;\n"
+"	p[0].z = n[0].y*k;\n"
+"    // set q = n x p\n"
+"    q[0].x = a*k;\n"
+"	q[0].y = -n[0].x*p[0].z;\n"
+"	q[0].z = n[0].x*p[0].y;\n"
+"  }\n"
+"  else {\n"
+"    // choose p in x-y plane\n"
+"    float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
+"    float k = 1.f/sqrt(a);\n"
+"    p[0].x = -n[0].y*k;\n"
+"	p[0].y = n[0].x*k;\n"
+"	p[0].z = 0;\n"
+"    // set q = n x p\n"
+"    q[0].x = -n[0].z*p[0].y;\n"
+"	q[0].y = n[0].z*p[0].x;\n"
+"	q[0].z = a*k;\n"
+"  }\n"
+"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
+"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
+"	//float frictionCoeff = ldsCs[0].m_linear.w;\n"
+"	int aIdx = ldsCs[0].m_bodyA;\n"
+"	int bIdx = ldsCs[0].m_bodyB;\n"
+"	float4 posA = gBodies[aIdx].m_pos;\n"
+"	float4 linVelA = gBodies[aIdx].m_linVel;\n"
+"	float4 angVelA = gBodies[aIdx].m_angVel;\n"
+"	float invMassA = gBodies[aIdx].m_invMass;\n"
+"	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
+"	float4 posB = gBodies[bIdx].m_pos;\n"
+"	float4 linVelB = gBodies[bIdx].m_linVel;\n"
+"	float4 angVelB = gBodies[bIdx].m_angVel;\n"
+"	float invMassB = gBodies[bIdx].m_invMass;\n"
+"	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
+"	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
+"			posB, &linVelB, &angVelB, invMassB, invInertiaB );\n"
+"  if (gBodies[aIdx].m_invMass)\n"
+"  {\n"
+"		gBodies[aIdx].m_linVel = linVelA;\n"
+"		gBodies[aIdx].m_angVel = angVelA;\n"
+"	} else\n"
+"	{\n"
+"		gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
+"		gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
+"	\n"
+"	}\n"
+"	if (gBodies[bIdx].m_invMass)\n"
+"  {\n"
+"		gBodies[bIdx].m_linVel = linVelB;\n"
+"		gBodies[bIdx].m_angVel = angVelB;\n"
+"	} else\n"
+"	{\n"
+"		gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
+"		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
+"	\n"
+"	}\n"
+"typedef struct \n"
+"	int m_valInt0;\n"
+"	int m_valInt1;\n"
+"	int m_valInt2;\n"
+"	int m_valInt3;\n"
+"	float m_val0;\n"
+"	float m_val1;\n"
+"	float m_val2;\n"
+"	float m_val3;\n"
+"} SolverDebugInfo;\n"
+"void BatchSolveKernelContact(__global Body* gBodies,\n"
+"                      __global Shape* gShapes,\n"
+"                      __global Constraint4* gConstraints,\n"
+"                      __global int* gN,\n"
+"                      __global int* gOffsets,\n"
+"                      __global	int* batchSizes,\n"
+"                       int maxBatch1,\n"
+"                       int cellBatch,\n"
+"                       int4 nSplit\n"
+"                      )\n"
+"	//__local int ldsBatchIdx[WG_SIZE+1];\n"
+"	__local int ldsCurBatch;\n"
+"	__local int ldsNextBatch;\n"
+"	__local int ldsStart;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	int wgIdx = GET_GROUP_IDX;\n"
+"//	int gIdx = GET_GLOBAL_IDX;\n"
+"//	debugInfo[gIdx].m_valInt0 = gIdx;\n"
+"	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
+"	\n"
+"	\n"
+"	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
+"	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
+"	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
+"	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
+"	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
+"	//int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n"
+"	//int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n"
+"	//int cellIdx = xIdx+yIdx*nSplit;\n"
+"	\n"
+"	if( gN[cellIdx] == 0 ) \n"
+"		return;\n"
+"	int maxBatch = batchSizes[cellIdx];\n"
+"	\n"
+"	\n"
+"	const int start = gOffsets[cellIdx];\n"
+"	const int end = start + gN[cellIdx];\n"
+"	\n"
+"	\n"
+"	\n"
+"	if( lIdx == 0 )\n"
+"	{\n"
+"		ldsCurBatch = 0;\n"
+"		ldsNextBatch = 0;\n"
+"		ldsStart = start;\n"
+"	}\n"
+"	int idx=ldsStart+lIdx;\n"
+"	while (ldsCurBatch < maxBatch)\n"
+"	{\n"
+"		for(; idx<end; )\n"
+"		{\n"
+"			if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
+"			{\n"
+"					solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
+"				 idx+=64;\n"
+"			} else\n"
+"			{\n"
+"				break;\n"
+"			}\n"
+"		}\n"
+"	\n"
+"		if( lIdx == 0 )\n"
+"		{\n"
+"			ldsCurBatch++;\n"
+"		}\n"
+"	}\n"
+"	\n"
+"    \n"
+"__kernel void solveSingleContactKernel(__global Body* gBodies,\n"
+"                      __global Shape* gShapes,\n"
+"                      __global Constraint4* gConstraints,\n"
+"                       int cellIdx,\n"
+"                       int batchOffset,\n"
+"                       int numConstraintsInBatch\n"
+"                      )\n"
+"	int index = get_global_id(0);\n"
+"	if (index < numConstraintsInBatch)\n"
+"	{\n"
+"		int idx=batchOffset+index;\n"
+"		solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
+"	}    \n"
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl
new file mode 100644
index 00000000..1d70fbba
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl
@@ -0,0 +1,527 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+//#pragma OPENCL EXTENSION cl_amd_printf : enable
+// Enable the 32-bit integer atomics (base and extended) for local and global memory.
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+// Optional atomic-counter extension.
+// NOTE(review): no #else/#endif belonging to this #ifdef is visible in this hunk - confirm how far the conditional extends.
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#define counter32_t volatile global int*
+// Short aliases for the unsigned integer types.
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+// Shorthands for the work-item / work-group queries along dimension 0.
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+// Atomic wrappers. All of them take an lvalue and pass its address on, except AppendInc,
+// which hands x to atomic_inc unchanged (so x must already be a pointer).
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+// Forwards to select() with the argument order unchanged.
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+// Expands to the cast prefix only, so mymake_float4(a,b,c,d) becomes the vector literal (float4)(a,b,c,d).
+#define mymake_float4 (float4)
+//#define make_float2 (float2)
+//#define make_uint4 (uint4)
+//#define make_int4 (int4)
+//#define make_uint2 (uint2)
+//#define make_int2 (int2)
+// Aliases for the max/min built-ins.
+#define max2 max
+#define min2 min
+//	Vector
+// Thin wrapper over the fast_normalize built-in; all four lanes of v take part.
+float4 fastNormalize4(float4 v)
+	return fast_normalize(v);
+// Forwards both float4 operands to the cross built-in.
+float4 cross3(float4 a, float4 b)
+	return cross(a,b);
+// Three-component dot product: both operands are copied with w forced to 0,
+// so whatever the callers keep in the w lanes never contributes.
+float dot3F4(float4 a, float4 b)
+	float4 a1 = mymake_float4(a.xyz,0.f);
+	float4 b1 = mymake_float4(b.xyz,0.f);
+	return dot(a1, b1);
+// Normalises the xyz part of a: w is replaced by 0 before the vector is passed to fastNormalize4.
+float4 normalize3(const float4 a)
+	float4 n = mymake_float4(a.x, a.y, a.z, 0.f);
+	return fastNormalize4( n );
+// Alternative based on an explicit length, left disabled:
+//	float length = sqrtf(dot3F4(a, a));
+//	return 1.f/length * a;
+//	Matrix3x3
+// Matrix stored as three float4 rows; mtMul1 and mtMul3 read only the xyz lanes of each row.
+// NOTE(review): the closing "} Matrix3x3;" line is not visible in this hunk - the type name is inferred from its users.
+typedef struct
+	float4 m_row[3];
+// Prototypes for the two matrix/vector products.
+float4 mtMul1(Matrix3x3 a, float4 b);
+float4 mtMul3(float4 a, Matrix3x3 b);
+// Matrix times column vector: lane i of the result is the 3-component dot product
+// of row i of a with b. The w lane of the result is set to 0.
+float4 mtMul1(Matrix3x3 a, float4 b)
+	float4 ans;
+	ans.x = dot3F4( a.m_row[0], b );
+	ans.y = dot3F4( a.m_row[1], b );
+	ans.z = dot3F4( a.m_row[2], b );
+	ans.w = 0.f;
+	return ans;
+// Row vector times matrix: lane i of the result is the 3-component dot product
+// of a with column i of b (the columns are gathered from the three stored rows).
+// The w lane of the result is set to 0, matching mtMul1.
+float4 mtMul3(float4 a, Matrix3x3 b)
+	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
+	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
+	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
+	float4 ans;
+	ans.x = dot3F4( a, colx );
+	ans.y = dot3F4( a, coly );
+	ans.z = dot3F4( a, colz );
+	// w was previously left unassigned, so an indeterminate value was returned in that lane
+	ans.w = 0.f;
+	return ans;
+//	Quaternion
+// Quaternions are stored as a plain float4.
+typedef float4 Quaternion;
+// Work-group size constant; not referenced by the code visible in this hunk.
+#define WG_SIZE 64
+// Per-body state used by the friction solver.
+// NOTE(review): presumably mirrors a host-side struct - confirm before changing the field order.
+typedef struct
+	// position; solveFrictionConstraint subtracts it from the constraint's m_center to get the lever arm
+	float4 m_pos;
+	// not referenced by the code visible in this hunk
+	Quaternion m_quat;
+	// linear and angular velocity; loaded by solveFrictionConstraint
+	float4 m_linVel;
+	float4 m_angVel;
+	// not referenced by the code visible in this hunk
+	u32 m_shapeIdx;
+	// inverse mass; loaded by solveFrictionConstraint
+	float m_invMass;
+	// not referenced by the code visible in this hunk
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} Body;
+// Inverse inertia data; solveFrictionConstraint indexes this array with the body index.
+typedef struct
+	// matrix loaded by solveFrictionConstraint
+	Matrix3x3 m_invInertia;
+	// not referenced by the code visible in this hunk
+	Matrix3x3 m_initInvInertia;
+} Shape;
+// One contact constraint between two bodies, as seen by the friction solver.
+// NOTE(review): presumably mirrors a host-side struct - confirm before changing the field order.
+typedef struct
+	// xyz: negated by solveFrictionConstraint and passed to btPlaneSpace1 to build the two tangent directions;
+	// w: read as the friction coefficient (and then overwritten with 0.7f in the visible code)
+	float4 m_linear;
+	// not referenced by the code visible in this hunk
+	float4 m_worldPos[4];
+	// point used for the lever arms of both friction rows
+	float4 m_center;	
+	// not referenced by the code visible in this hunk
+	float m_jacCoeffInv[4];
+	float m_b[4];
+	// the four entries are summed and scaled by the friction coefficient to bound the friction impulse
+	float m_appliedRambdaDt[4];
+	// per-tangent scale applied to the relative velocity
+	float m_fJacCoeffInv[2];	
+	// per-tangent running impulse total
+	float m_fAppliedRambdaDt[2];	
+	// indices of the two bodies in the body array
+	u32 m_bodyA;
+	u32 m_bodyB;
+	// not referenced by the code visible in this hunk
+	int m_batchIdx;
+	u32 m_paddings[1];
+} Constraint4;
+// Four-int parameter block.
+// NOTE(review): not referenced by any code visible in this hunk - presumably mirrors a host-side struct; confirm before changing the layout.
+typedef struct
+	int m_nConstraints;
+	int m_start;
+	int m_batchIdx;
+	int m_nSplit;
+//	int m_paddings[1];
+} ConstBuffer;
+// Four-int parameter block for a batched solve.
+// NOTE(review): not referenced by any code visible in this hunk - presumably mirrors a host-side struct; confirm before changing the layout.
+typedef struct
+	int m_solveFriction;
+	int m_maxBatch;	//	long batch really kills the performance
+	int m_batchIdx;
+	int m_nSplit;
+//	int m_paddings[1];
+} ConstBufferBatchSolve;
+// Builds the three direction vectors of one constraint row from a direction n
+// and the lever arms r0 (body 0) and r1 (body 1):
+//   *linear   = (-n.xyz, 0)
+//   *angular0 = -(r0 x n)
+//   *angular1 =   r1 x n
+void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);
+void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)
+	*linear = mymake_float4(-n.xyz,0.f);
+	*angular0 = -cross3(r0, n);
+	*angular1 = cross3(r1, n);
+// Projects the velocities of two bodies onto a constraint row and returns the sum
+//   l0.linVel0 + a0.angVel0 + l1.linVel1 + a1.angVel1
+// where every dot product uses the xyz lanes only.
+float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );
+float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )
+	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);
+// Returns -1 / (invMass0 + angular0.(angular0*invInertia0) + invMass1 + angular1.(angular1*invInertia1)).
+// linear0 and linear1 are accepted but not read: the linear terms are taken to be the inverse masses.
+// There is no guard against a zero denominator.
+float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
+				   float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);
+float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
+					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)
+	//	linear0,1 are assumed to be normalized, so their squared length is dropped from the linear terms
+	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;
+	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);
+	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;
+	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);
+	return -1.f/(jmj0+jmj1+jmj2+jmj3);
+// Builds two vectors p and q from n, with q = n x p as stated by the comments below.
+// Only the x, y and z lanes of *p and *q are written; their w lanes are left untouched.
+// NOTE(review): presumably expects n to be unit length - confirm against the callers.
+void btPlaneSpace1 (const float4* n, float4* p, float4* q);
+ void btPlaneSpace1 (const float4* n, float4* p, float4* q)
+  // 0.70710678f is about 1/sqrt(2): pick the plane from the size of the z component of n
+  if (fabs(n[0].z) > 0.70710678f) {
+    // choose p in y-z plane
+    float a = n[0].y*n[0].y + n[0].z*n[0].z;
+    float k = 1.f/sqrt(a);
+    p[0].x = 0;
+	p[0].y = -n[0].z*k;
+	p[0].z = n[0].y*k;
+    // set q = n x p
+    q[0].x = a*k;
+	q[0].y = -n[0].x*p[0].z;
+	q[0].z = n[0].x*p[0].y;
+  }
+  else {
+    // choose p in x-y plane
+    float a = n[0].x*n[0].x + n[0].y*n[0].y;
+    float k = 1.f/sqrt(a);
+    p[0].x = -n[0].y*k;
+	p[0].y = n[0].x*k;
+	p[0].z = 0;
+    // set q = n x p
+    q[0].x = -n[0].z*p[0].y;
+	q[0].y = n[0].z*p[0].x;
+	q[0].z = a*k;
+  }
+void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);
+void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)	// applies the two tangential impulses of one constraint to its body pair and writes the velocities back
+	float frictionCoeff = ldsCs[0].m_linear.w;	// value read here is overwritten with 0.7f below
+	int aIdx = ldsCs[0].m_bodyA;
+	int bIdx = ldsCs[0].m_bodyB;
+	float4 posA = gBodies[aIdx].m_pos;	// work on local copies; globals are updated at the end
+	float4 linVelA = gBodies[aIdx].m_linVel;
+	float4 angVelA = gBodies[aIdx].m_angVel;
+	float invMassA = gBodies[aIdx].m_invMass;
+	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;	// NOTE(review): gShapes is indexed by the body index here — confirm that is intended
+	float4 posB = gBodies[bIdx].m_pos;
+	float4 linVelB = gBodies[bIdx].m_linVel;
+	float4 angVelB = gBodies[bIdx].m_angVel;
+	float invMassB = gBodies[bIdx].m_invMass;
+	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;
+	{
+		float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};	// only entries 0 and 1 are read by the tangent loop
+		float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+		float sum = 0;
+		for(int j=0; j<4; j++)
+		{
+			sum +=ldsCs[0].m_appliedRambdaDt[j];	// total of the four stored m_appliedRambdaDt values
+		}
+		frictionCoeff = 0.7f;	// hard-coded coefficient replaces the per-constraint value
+		for(int j=0; j<4; j++)
+		{
+			maxRambdaDt[j] = frictionCoeff*sum;	// symmetric clamp range: +/- frictionCoeff*sum
+			minRambdaDt[j] = -maxRambdaDt[j];
+		}
+//		solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,
+//			posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );
+		{
+			__global Constraint4* cs = ldsCs;
+			if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[1] == 0 ) return;	// bail out only when BOTH tangent coefficients are zero (the second test previously repeated index 0)
+			const float4 center = cs->m_center;
+			float4 n = -cs->m_linear;
+			float4 tangent[2];
+			btPlaneSpace1(&n,&tangent[0],&tangent[1]);	// two tangent directions derived from n
+			float4 angular0, angular1, linear;
+			float4 r0 = center - posA;	// offsets from each body position to m_center
+			float4 r1 = center - posB;
+			for(int i=0; i<2; i++)
+			{
+				setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
+				float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
+											linVelA, angVelA, linVelB, angVelB );
+				rambdaDt *= cs->m_fJacCoeffInv[i];	// a zero coefficient makes this tangent's raw impulse zero
+				{
+					float prevSum = cs->m_fAppliedRambdaDt[i];
+					float updated = prevSum;
+					updated += rambdaDt;
+					updated = max2( updated, minRambdaDt[i] );	// clamp the accumulated value, then apply only the delta
+					updated = min2( updated, maxRambdaDt[i] );
+					rambdaDt = updated - prevSum;
+					cs->m_fAppliedRambdaDt[i] = updated;
+				}
+				float4 linImp0 = invMassA*linear*rambdaDt;
+				float4 linImp1 = invMassB*(-linear)*rambdaDt;
+				float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
+				float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
+				linVelA += linImp0;
+				angVelA += angImp0;
+				linVelB += linImp1;
+				angVelB += angImp1;
+			}
+			{	//	angular damping for point constraint
+				float4 ab = normalize3( posB - posA );
+				float4 ac = normalize3( center - posA );
+				if( dot3F4( ab, ac ) > 0.95f  || (invMassA == 0.f || invMassB == 0.f))
+				{
+					float angNA = dot3F4( n, angVelA );
+					float angNB = dot3F4( n, angVelB );
+					angVelA -= (angNA*0.1f)*n;	// remove 10% of the angular velocity component along n
+					angVelB -= (angNB*0.1f)*n;
+				}
+			}
+		}
+	}
+	if (gBodies[aIdx].m_invMass)	// bodies with zero inverse mass get their velocities reset to zero instead
+	{
+		gBodies[aIdx].m_linVel = linVelA;
+		gBodies[aIdx].m_angVel = angVelA;
+	} else
+	{
+		gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);
+		gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);
+	}
+	if (gBodies[bIdx].m_invMass)
+	{
+		gBodies[bIdx].m_linVel = linVelB;
+		gBodies[bIdx].m_angVel = angVelB;
+	} else
+	{
+		gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);
+		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);
+	}
+typedef struct 	// record of four int and four float slots; presumably scratch output for debugging — no live use is visible here
+	int m_valInt0;
+	int m_valInt1;
+	int m_valInt2;
+	int m_valInt3;
+	float m_val0;
+	float m_val1;
+	float m_val2;
+	float m_val3;
+} SolverDebugInfo;
+void BatchSolveKernelFriction(__global Body* gBodies,	// each work-group handles one grid cell and walks that cell's constraints batch by batch
+                      __global Shape* gShapes,
+                      __global Constraint4* gConstraints,
+                      __global int* gN,	// per-cell constraint count
+                      __global int* gOffsets,	// per-cell start index into gConstraints
+                      __global int* batchSizes,	// per-cell number of batches
+                       int maxBatch1,	// not read in the lines visible here
+                       int cellBatch,	// bits 0/1/2 select the odd/even cell in x/y/z
+                       int4 nSplit
+                      )
+	//__local int ldsBatchIdx[WG_SIZE+1];
+	__local int ldsCurBatch;
+	__local int ldsNextBatch;	// only initialised below; never read in the visible lines
+	__local int ldsStart;
+	int lIdx = GET_LOCAL_IDX;
+	int wgIdx = GET_GROUP_IDX;
+//	int gIdx = GET_GLOBAL_IDX;
+//	debugInfo[gIdx].m_valInt0 = gIdx;
+	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;
+	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);	// decode the group index into every-other-cell coordinates, offset by the cellBatch bits
+	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));
+	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);
+	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);
+	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);
+	if( gN[cellIdx] == 0 ) 
+		return;
+	int maxBatch = batchSizes[cellIdx];
+	const int start = gOffsets[cellIdx];
+	const int end = start + gN[cellIdx];
+	if( lIdx == 0 )
+	{
+		ldsCurBatch = 0;
+		ldsNextBatch = 0;
+		ldsStart = start;
+	}
+	int idx=ldsStart+lIdx;	// NOTE(review): no barrier is visible between the lIdx==0 initialisation and this read — confirm synchronisation in the full file
+	while (ldsCurBatch < maxBatch)
+	{
+		for(; idx<end; )
+		{
+			if (gConstraints[idx].m_batchIdx == ldsCurBatch)
+			{
+					solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );
+				 idx+=64;	// stride is a literal 64 — presumably the work-group size (WG_SIZE); confirm
+			} else
+			{
+				break;	// next constraint belongs to a later batch; wait for ldsCurBatch to advance
+			}
+		}
+		if( lIdx == 0 )
+		{
+			ldsCurBatch++;	// NOTE(review): no barrier is visible around this shared update — confirm
+		}
+	}
+__kernel void solveSingleFrictionKernel(__global Body* gBodies,	// one work-item per constraint of a single batch
+                      __global Shape* gShapes,
+                      __global Constraint4* gConstraints,
+                       int cellIdx,	// not read in this kernel
+                       int batchOffset,	// index of the batch's first constraint in gConstraints
+                       int numConstraintsInBatch
+                      )
+	int index = get_global_id(0);
+	if (index < numConstraintsInBatch)	// surplus work-items past the end of the batch do nothing
+	{
+		int idx=batchOffset+index;
+		solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );
+	}    
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveFriction.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveFriction.h
new file mode 100644
index 00000000..eb58674f
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solveFriction.h
@@ -0,0 +1,421 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* solveFrictionCL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Takahiro Harada\n"
+"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#define counter32_t volatile global int*\n"
+"typedef unsigned int u32;\n"
+"typedef unsigned short u16;\n"
+"typedef unsigned char u8;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define mymake_float4 (float4)\n"
+"//#define make_float2 (float2)\n"
+"//#define make_uint4 (uint4)\n"
+"//#define make_int4 (int4)\n"
+"//#define make_uint2 (uint2)\n"
+"//#define make_int2 (int2)\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"//	Vector\n"
+"float4 fastNormalize4(float4 v)\n"
+"	return fast_normalize(v);\n"
+"float4 cross3(float4 a, float4 b)\n"
+"	return cross(a,b);\n"
+"float dot3F4(float4 a, float4 b)\n"
+"	float4 a1 = mymake_float4(a.xyz,0.f);\n"
+"	float4 b1 = mymake_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"float4 normalize3(const float4 a)\n"
+"	float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"//	float length = sqrtf(dot3F4(a, a));\n"
+"//	return 1.f/length * a;\n"
+"//	Matrix3x3\n"
+"typedef struct\n"
+"	float4 m_row[3];\n"
+"float4 mtMul1(Matrix3x3 a, float4 b);\n"
+"float4 mtMul3(float4 a, Matrix3x3 b);\n"
+"float4 mtMul1(Matrix3x3 a, float4 b)\n"
+"	float4 ans;\n"
+"	ans.x = dot3F4( a.m_row[0], b );\n"
+"	ans.y = dot3F4( a.m_row[1], b );\n"
+"	ans.z = dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"float4 mtMul3(float4 a, Matrix3x3 b)\n"
+"	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	float4 ans;\n"
+"	ans.x = dot3F4( a, colx );\n"
+"	ans.y = dot3F4( a, coly );\n"
+"	ans.z = dot3F4( a, colz );\n"
+"	return ans;\n"
+"//	Quaternion\n"
+"typedef float4 Quaternion;\n"
+"#define WG_SIZE 64\n"
+"typedef struct\n"
+"	float4 m_pos;\n"
+"	Quaternion m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_shapeIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} Body;\n"
+"typedef struct\n"
+"	Matrix3x3 m_invInertia;\n"
+"	Matrix3x3 m_initInvInertia;\n"
+"} Shape;\n"
+"typedef struct\n"
+"	float4 m_linear;\n"
+"	float4 m_worldPos[4];\n"
+"	float4 m_center;	\n"
+"	float m_jacCoeffInv[4];\n"
+"	float m_b[4];\n"
+"	float m_appliedRambdaDt[4];\n"
+"	float m_fJacCoeffInv[2];	\n"
+"	float m_fAppliedRambdaDt[2];	\n"
+"	u32 m_bodyA;\n"
+"	u32 m_bodyB;\n"
+"	int m_batchIdx;\n"
+"	u32 m_paddings[1];\n"
+"} Constraint4;\n"
+"typedef struct\n"
+"	int m_nConstraints;\n"
+"	int m_start;\n"
+"	int m_batchIdx;\n"
+"	int m_nSplit;\n"
+"//	int m_paddings[1];\n"
+"} ConstBuffer;\n"
+"typedef struct\n"
+"	int m_solveFriction;\n"
+"	int m_maxBatch;	//	long batch really kills the performance\n"
+"	int m_batchIdx;\n"
+"	int m_nSplit;\n"
+"//	int m_paddings[1];\n"
+"} ConstBufferBatchSolve;\n"
+"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
+"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
+"	*linear = mymake_float4(-n.xyz,0.f);\n"
+"	*angular0 = -cross3(r0, n);\n"
+"	*angular1 = cross3(r1, n);\n"
+"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
+"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
+"	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
+"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
+"				   float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
+"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
+"					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
+"	//	linear0,1 are normlized\n"
+"	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
+"	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
+"	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
+"	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
+"	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
+"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
+" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
+"  if (fabs(n[0].z) > 0.70710678f) {\n"
+"    // choose p in y-z plane\n"
+"    float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
+"    float k = 1.f/sqrt(a);\n"
+"    p[0].x = 0;\n"
+"	p[0].y = -n[0].z*k;\n"
+"	p[0].z = n[0].y*k;\n"
+"    // set q = n x p\n"
+"    q[0].x = a*k;\n"
+"	q[0].y = -n[0].x*p[0].z;\n"
+"	q[0].z = n[0].x*p[0].y;\n"
+"  }\n"
+"  else {\n"
+"    // choose p in x-y plane\n"
+"    float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
+"    float k = 1.f/sqrt(a);\n"
+"    p[0].x = -n[0].y*k;\n"
+"	p[0].y = n[0].x*k;\n"
+"	p[0].z = 0;\n"
+"    // set q = n x p\n"
+"    q[0].x = -n[0].z*p[0].y;\n"
+"	q[0].y = n[0].z*p[0].x;\n"
+"	q[0].z = a*k;\n"
+"  }\n"
+"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
+"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" // embedded kernel source: applies the two tangential impulses of one constraint
+"	float frictionCoeff = ldsCs[0].m_linear.w;\n"
+"	int aIdx = ldsCs[0].m_bodyA;\n"
+"	int bIdx = ldsCs[0].m_bodyB;\n"
+"	float4 posA = gBodies[aIdx].m_pos;\n"
+"	float4 linVelA = gBodies[aIdx].m_linVel;\n"
+"	float4 angVelA = gBodies[aIdx].m_angVel;\n"
+"	float invMassA = gBodies[aIdx].m_invMass;\n"
+"	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
+"	float4 posB = gBodies[bIdx].m_pos;\n"
+"	float4 linVelB = gBodies[bIdx].m_linVel;\n"
+"	float4 angVelB = gBodies[bIdx].m_angVel;\n"
+"	float invMassB = gBodies[bIdx].m_invMass;\n"
+"	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
+"	\n"
+"	{\n"
+"		float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
+"		float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
+"		float sum = 0;\n"
+"		for(int j=0; j<4; j++)\n"
+"		{\n"
+"			sum +=ldsCs[0].m_appliedRambdaDt[j];\n"
+"		}\n"
+"		frictionCoeff = 0.7f;\n"
+"		for(int j=0; j<4; j++)\n"
+"		{\n"
+"			maxRambdaDt[j] = frictionCoeff*sum;\n"
+"			minRambdaDt[j] = -maxRambdaDt[j];\n"
+"		}\n"
+"		\n"
+"//		solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
+"//			posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
+"		\n"
+"		\n"
+"		{\n"
+"			\n"
+"			__global Constraint4* cs = ldsCs;\n"
+"			\n"
+"			if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[1] == 0 ) return;\n" // fix: test both tangent coefficients (second operand previously repeated index 0)
+"			const float4 center = cs->m_center;\n"
+"			\n"
+"			float4 n = -cs->m_linear;\n"
+"			\n"
+"			float4 tangent[2];\n"
+"			btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
+"			float4 angular0, angular1, linear;\n"
+"			float4 r0 = center - posA;\n"
+"			float4 r1 = center - posB;\n"
+"			for(int i=0; i<2; i++)\n"
+"			{\n"
+"				setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n"
+"				float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n"
+"											linVelA, angVelA, linVelB, angVelB );\n"
+"				rambdaDt *= cs->m_fJacCoeffInv[i];\n"
+"				\n"
+"				{\n"
+"					float prevSum = cs->m_fAppliedRambdaDt[i];\n"
+"					float updated = prevSum;\n"
+"					updated += rambdaDt;\n"
+"					updated = max2( updated, minRambdaDt[i] );\n"
+"					updated = min2( updated, maxRambdaDt[i] );\n"
+"					rambdaDt = updated - prevSum;\n"
+"					cs->m_fAppliedRambdaDt[i] = updated;\n"
+"				}\n"
+"				\n"
+"				float4 linImp0 = invMassA*linear*rambdaDt;\n"
+"				float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
+"				float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
+"				float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
+"				\n"
+"				linVelA += linImp0;\n"
+"				angVelA += angImp0;\n"
+"				linVelB += linImp1;\n"
+"				angVelB += angImp1;\n"
+"			}\n"
+"			{	//	angular damping for point constraint\n"
+"				float4 ab = normalize3( posB - posA );\n"
+"				float4 ac = normalize3( center - posA );\n"
+"				if( dot3F4( ab, ac ) > 0.95f  || (invMassA == 0.f || invMassB == 0.f))\n"
+"				{\n"
+"					float angNA = dot3F4( n, angVelA );\n"
+"					float angNB = dot3F4( n, angVelB );\n"
+"					\n"
+"					angVelA -= (angNA*0.1f)*n;\n"
+"					angVelB -= (angNB*0.1f)*n;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		\n"
+"	}\n"
+"	if (gBodies[aIdx].m_invMass)\n"
+"	{\n"
+"		gBodies[aIdx].m_linVel = linVelA;\n"
+"		gBodies[aIdx].m_angVel = angVelA;\n"
+"	} else\n"
+"	{\n"
+"		gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
+"		gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
+"	}\n"
+"	if (gBodies[bIdx].m_invMass)\n"
+"	{\n"
+"		gBodies[bIdx].m_linVel = linVelB;\n"
+"		gBodies[bIdx].m_angVel = angVelB;\n"
+"	} else\n"
+"	{\n"
+"		gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
+"		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
+"	}\n"
+" \n"
+"typedef struct \n"
+"	int m_valInt0;\n"
+"	int m_valInt1;\n"
+"	int m_valInt2;\n"
+"	int m_valInt3;\n"
+"	float m_val0;\n"
+"	float m_val1;\n"
+"	float m_val2;\n"
+"	float m_val3;\n"
+"} SolverDebugInfo;\n"
+"void BatchSolveKernelFriction(__global Body* gBodies,\n"
+"                      __global Shape* gShapes,\n"
+"                      __global Constraint4* gConstraints,\n"
+"                      __global int* gN,\n"
+"                      __global int* gOffsets,\n"
+"                      __global int* batchSizes,\n"
+"                       int maxBatch1,\n"
+"                       int cellBatch,\n"
+"                       int4 nSplit\n"
+"                      )\n"
+"	//__local int ldsBatchIdx[WG_SIZE+1];\n"
+"	__local int ldsCurBatch;\n"
+"	__local int ldsNextBatch;\n"
+"	__local int ldsStart;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	int wgIdx = GET_GROUP_IDX;\n"
+"//	int gIdx = GET_GLOBAL_IDX;\n"
+"//	debugInfo[gIdx].m_valInt0 = gIdx;\n"
+"	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
+"	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
+"	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
+"	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
+"	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
+"	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
+"	\n"
+"	if( gN[cellIdx] == 0 ) \n"
+"		return;\n"
+"	int maxBatch = batchSizes[cellIdx];\n"
+"	const int start = gOffsets[cellIdx];\n"
+"	const int end = start + gN[cellIdx];\n"
+"	\n"
+"	if( lIdx == 0 )\n"
+"	{\n"
+"		ldsCurBatch = 0;\n"
+"		ldsNextBatch = 0;\n"
+"		ldsStart = start;\n"
+"	}\n"
+"	int idx=ldsStart+lIdx;\n"
+"	while (ldsCurBatch < maxBatch)\n"
+"	{\n"
+"		for(; idx<end; )\n"
+"		{\n"
+"			if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
+"			{\n"
+"					solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
+"				 idx+=64;\n"
+"			} else\n"
+"			{\n"
+"				break;\n"
+"			}\n"
+"		}\n"
+"		if( lIdx == 0 )\n"
+"		{\n"
+"			ldsCurBatch++;\n"
+"		}\n"
+"	}\n"
+"	\n"
+"    \n"
+"__kernel void solveSingleFrictionKernel(__global Body* gBodies,\n"
+"                      __global Shape* gShapes,\n"
+"                      __global Constraint4* gConstraints,\n"
+"                       int cellIdx,\n"
+"                       int batchOffset,\n"
+"                       int numConstraintsInBatch\n"
+"                      )\n"
+"	int index = get_global_id(0);\n"
+"	if (index < numConstraintsInBatch)\n"
+"	{\n"
+"		\n"
+"		int idx=batchOffset+index;\n"
+"	\n"
+"		solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
+"	}    \n"
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl
new file mode 100644
index 00000000..8e2de7b5
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl
@@ -0,0 +1,277 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+#include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#define counter32_t volatile global int*
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+#define make_float4 (float4)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+#define max2 max
+#define min2 min
+//	Vector
+float fastDiv(float numerator, float denominator)	// scalar division routed through the native_divide built-in; the plain '/' form is kept commented out below
+	return native_divide(numerator, denominator);	
+//	return numerator/denominator;	
+float4 fastDiv4(float4 numerator, float4 denominator)	// float4 counterpart of fastDiv, also via native_divide
+	return native_divide(numerator, denominator);	
+float fastSqrtf(float f2)	// square root via native_sqrt; the exact sqrt call is kept commented out below
+	return native_sqrt(f2);
+//	return sqrt(f2);
+float fastRSqrt(float f2)	// reciprocal square root via native_rsqrt
+	return native_rsqrt(f2);
+float fastLength4(float4 v)	// length of the full float4 (w included) via fast_length
+	return fast_length(v);
+float4 fastNormalize4(float4 v)	// normalises the full float4 via fast_normalize; callers zero w first when they want a 3D result
+	return fast_normalize(v);
+float sqrtf(float a)	// file-local sqrtf helper backed by native_sqrt (used by length3 and normalize4)
+//	return sqrt(a);
+	return native_sqrt(a);
+float4 cross3(float4 a, float4 b)	// thin wrapper over the built-in cross() on float4 operands
+	return cross(a,b);
+float dot3F4(float4 a, float4 b)	// 3-component dot product of two float4 values
+	float4 a1 = make_float4(a.xyz,0.f);	// zero w on both operands so it cannot contribute
+	float4 b1 = make_float4(b.xyz,0.f);
+	return dot(a1, b1);
+float length3(const float4 a)	// length of the xyz part: sqrtf of dot3F4(a,a)
+	return sqrtf(dot3F4(a,a));
+float dot4(const float4 a, const float4 b)	// full 4-component dot product (w included)
+	return dot( a, b );
+//	for height
+float dot3w1(const float4 point, const float4 eqn)	// evaluates eqn.xyz . point.xyz + eqn.w, i.e. treats point as having w = 1
+	return dot3F4(point,eqn) + eqn.w;
+float4 normalize3(const float4 a)	// normalises the xyz part of a; the result's input w is forced to 0 first
+	float4 n = make_float4(a.x, a.y, a.z, 0.f);
+	return fastNormalize4( n );
+//	float length = sqrtf(dot3F4(a, a));
+//	return 1.f/length * a;
+float4 normalize4(const float4 a)	// scales a by the reciprocal of its 4-component length
+	float length = sqrtf(dot4(a, a));
+	return 1.f/length * a;	// NOTE(review): no guard for a zero-length input
+float4 createEquation(const float4 a, const float4 b, const float4 c)	// builds a plane equation (normal in xyz, offset in w) from three points
+	float4 eqn;
+	float4 ab = b-a;
+	float4 ac = c-a;
+	eqn = normalize3( cross3(ab, ac) );	// normal = normalised (b-a) x (c-a)
+	eqn.w = -dot3F4(eqn,a);	// offset chosen so that dot3w1(a, eqn) evaluates to 0
+	return eqn;
+#define WG_SIZE 64
+typedef struct	// four-int parameter record; not referenced by the code visible in this chunk
+	int m_nConstraints;
+	int m_start;
+	int m_batchIdx;
+	int m_nSplit;
+//	int m_paddings[1];
+} ConstBuffer;
+typedef struct	// four-int parameter record; not referenced by the code visible in this chunk
+	int m_solveFriction;
+	int m_maxBatch;	//	long batch really kills the performance
+	int m_batchIdx;
+	int m_nSplit;
+//	int m_paddings[1];
+} ConstBufferBatchSolve;
+typedef struct 	// record of four int and four float slots; presumably debug scratch output — no use is visible in this chunk
+	int m_valInt0;
+	int m_valInt1;
+	int m_valInt2;
+	int m_valInt3;
+	float m_val0;
+	float m_val1;
+	float m_val2;
+	float m_val3;
+} SolverDebugInfo;
+typedef struct	// fields share names with ContactToConstraintKernel's scalar arguments; the struct itself is not referenced in the visible code
+	int m_nContacts;
+	float m_dt;
+	float m_positionDrift;
+	float m_positionConstraintCoeff;
+} ConstBufferCTC;
+void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global b3RigidBodyData_t* gBodies, __global b3InertiaData_t* gShapes, __global b3ContactConstraint4_t* gConstraintOut, 
+int nContacts,	// number of entries in gContact; one work-item converts one contact
+float dt,	// dt, positionDrift and positionConstraintCoeff are forwarded unchanged to setConstraint4
+float positionDrift,
+float positionConstraintCoeff
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < nContacts )	// work-items beyond the contact count do nothing
+	{
+		int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);	// strip the sign bit to recover the body index
+		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
+		float4 posA = gBodies[aIdx].m_pos;
+		float4 linVelA = gBodies[aIdx].m_linVel;
+		float4 angVelA = gBodies[aIdx].m_angVel;
+		float invMassA = gBodies[aIdx].m_invMass;
+		b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;	// reads m_initInvInertia, not m_invInertia; gShapes is indexed by body index
+		float4 posB = gBodies[bIdx].m_pos;
+		float4 linVelB = gBodies[bIdx].m_linVel;
+		float4 angVelB = gBodies[bIdx].m_angVel;
+		float invMassB = gBodies[bIdx].m_invMass;
+		b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;
+		b3ContactConstraint4_t cs;
+    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,	// presumably declared in the included b3ConvertConstraint4.h — confirm
+			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,
+			&cs );
+		cs.m_batchIdx = gContact[gIdx].m_batchIdx;	// carry the contact's batch index over to the constraint
+		gConstraintOut[gIdx] = cs;	// output slot matches the input contact index
+	}
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
new file mode 100644
index 00000000..eb1834ee
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
@@ -0,0 +1,703 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* solverSetupCL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Takahiro Harada\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef struct b3ContactConstraint4 b3ContactConstraint4_t;\n"
+"struct b3ContactConstraint4\n"
+"	b3Float4 m_linear;//normal?\n"
+"	b3Float4 m_worldPos[4];\n"
+"	b3Float4 m_center;	//	friction\n"
+"	float m_jacCoeffInv[4];\n"
+"	float m_b[4];\n"
+"	float m_appliedRambdaDt[4];\n"
+"	float m_fJacCoeffInv[2];	//	friction\n"
+"	float m_fAppliedRambdaDt[2];	//	friction\n"
+"	unsigned int m_bodyA;\n"
+"	unsigned int m_bodyB;\n"
+"	int			m_batchIdx;\n"
+"	unsigned int m_paddings;\n"
+"//inline	void setFrictionCoeff(float value) { m_linear[3] = value; }\n"
+"inline	float b3GetFrictionCoeff(b3ContactConstraint4_t* constraint) \n"
+"	return constraint->m_linear.w; \n"
+"#endif //B3_CONTACT_CONSTRAINT5_H\n"
+"#ifndef B3_RIGIDBODY_DATA_H\n"
+"#define B3_RIGIDBODY_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
+"	b3Quat q;\n"
+"	q=in;\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" // Computes q * (vec.xyz, 0) * b3QuatInvert(q); presumably q is expected to be unit length.
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n" // treat vec as a pure-vector quaternion
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" // Returns (-q.xyz, q.w); same body as b3QuatInvert.
+"	return (b3Quat)(-q.xyz, q.w);\n" // no division by the squared length is performed
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" // Returns (-q.xyz, q.w); same body as b3QuatInverse.
+"	return (b3Quat)(-q.xyz, q.w);\n" // no division by the squared length is performed
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" // Applies b3QuatRotate with b3QuatInvert(q) in place of q.
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n" // Rotates point by orientation, then adds translation.
+"	return b3QuatRotate( orientation, point ) + (translation);\n" // all four lanes of translation are added, w included
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"typedef struct\n"
+"	b3Float4 m_row[3];\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" // Expands quat into a 3x3 matrix stored row by row; assumes quat is unit length (no normalization here) - TODO confirm.
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" // squared x, y and z components
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n" // the w lane of every row is written as 0
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" // Returns a matrix whose rows are the lane-wise fabs of matIn's rows.
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"b3Mat3x3 mtZero();\n"
+"b3Mat3x3 mtIdentity();\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"b3Mat3x3 mtZero()\n" // Returns a matrix with every lane of all three rows set to 0.
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"b3Mat3x3 mtIdentity()\n" // Returns the 3x3 identity; the w lane of each row is 0.
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" // Returns the transpose of the 3x3 part of m; output w lanes are 0.
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" // row i of the result is column i of m
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" // Returns the 3x3 matrix product a*b; output w lanes are 0.
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n" // rows of transB are the columns of b
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n" // a is a by-value copy, so the caller's matrix is not modified
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" // element (i,j) = row i of a dotted with column j of b
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" // Matrix times column vector: lane i is row i of a dotted with b.xyz; w is 0.
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" // Row vector times matrix: lane j is a.xyz dotted with column j of b; w is 0.
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" // gather the three columns of b
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	ans.w = 0.f;\n" // keep the fourth lane defined, as mtMul1 does, instead of returning an indeterminate value
+"	return ans;\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
+"struct b3RigidBodyData\n" // Per-body state record; ContactToConstraintKernel reads m_pos, m_linVel, m_angVel and m_invMass from it.
+"	b3Float4				m_pos;\n"
+"	b3Quat					m_quat;\n"
+"	b3Float4				m_linVel;\n"
+"	b3Float4				m_angVel;\n"
+"	int 					m_collidableIdx;\n"
+"	float 				m_invMass;\n"
+"	float 				m_restituitionCoeff;\n" // spelling is part of the field name; not read by the visible kernel
+"	float 				m_frictionCoeff;\n" // not read by the visible kernel either (friction is hard-coded in setConstraint4)
+"typedef struct b3InertiaData b3InertiaData_t;\n"
+"struct b3InertiaData\n" // Two inverse-inertia matrices per body.
+"	b3Mat3x3 m_invInertiaWorld;\n" // presumably the world-space tensor; not read in this chunk
+"	b3Mat3x3 m_initInvInertia;\n" // the one ContactToConstraintKernel passes to setConstraint4
+"#endif //B3_RIGIDBODY_DATA_H\n"
+"	\n"
+"void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q);\n"
+" void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q)\n" // Writes two vectors p and q perpendicular to n (q = n x p); only the x, y, z lanes of *p and *q are written.
+"  if (b3Fabs(n.z) > 0.70710678f) {\n" // threshold is 1/sqrt(2); presumably n is unit length - TODO confirm
+"    // choose p in y-z plane\n"
+"    float a = n.y*n.y + n.z*n.z;\n"
+"    float k = 1.f/sqrt(a);\n"
+"    p[0].x = 0;\n"
+"	p[0].y = -n.z*k;\n"
+"	p[0].z = n.y*k;\n"
+"    // set q = n x p\n"
+"    q[0].x = a*k;\n"
+"	q[0].y = -n.x*p[0].z;\n"
+"	q[0].z = n.x*p[0].y;\n"
+"  }\n"
+"  else {\n"
+"    // choose p in x-y plane\n"
+"    float a = n.x*n.x + n.y*n.y;\n"
+"    float k = 1.f/sqrt(a);\n" // no guard against a == 0 (n.x and n.y both zero with |n.z| below the threshold)
+"    p[0].x = -n.y*k;\n"
+"	p[0].y = n.x*k;\n"
+"	p[0].z = 0;\n"
+"    // set q = n x p\n"
+"    q[0].x = -n.z*p[0].y;\n"
+"	q[0].y = n.z*p[0].x;\n"
+"	q[0].z = a*k;\n"
+"  }\n"
+" \n"
+"void setLinearAndAngular( b3Float4ConstArg n, b3Float4ConstArg r0, b3Float4ConstArg r1, b3Float4* linear, b3Float4* angular0, b3Float4* angular1)\n" // Builds the direction terms for n: linear = n.xyz, angular0 = r0 x n, angular1 = -(r1 x n).
+"	*linear = b3MakeFloat4(n.x,n.y,n.z,0.f);\n" // w of n is dropped
+"	*angular0 = b3Cross3(r0, n);\n"
+"	*angular1 = -b3Cross3(r1, n);\n"
+"float calcRelVel( b3Float4ConstArg l0, b3Float4ConstArg l1, b3Float4ConstArg a0, b3Float4ConstArg a1, b3Float4ConstArg linVel0,\n" // Sum of four xyz dot products: l0.linVel0 + a0.angVel0 + l1.linVel1 + a1.angVel1.
+"	b3Float4ConstArg angVel0, b3Float4ConstArg linVel1, b3Float4ConstArg angVel1 )\n"
+"	return b3Dot3F4(l0, linVel0) + b3Dot3F4(a0, angVel0) + b3Dot3F4(l1, linVel1) + b3Dot3F4(a1, angVel1);\n"
+"float calcJacCoeff(b3Float4ConstArg linear0, b3Float4ConstArg linear1, b3Float4ConstArg angular0, b3Float4ConstArg angular1,\n" // Returns -1 / (invMass0 + angular0.I0.angular0 + invMass1 + angular1.I1.angular1); linear0/linear1 are not read.
+"					float invMass0, const b3Mat3x3* invInertia0, float invMass1, const b3Mat3x3* invInertia1)\n"
+"	//	linear0,1 are normlized\n"
+"	float jmj0 = invMass0;//b3Dot3F4(linear0, linear0)*invMass0;\n"
+"	float jmj1 = b3Dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
+"	float jmj2 = invMass1;//b3Dot3F4(linear1, linear1)*invMass1;\n"
+"	float jmj3 = b3Dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
+"	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" // NOTE(review): no guard for a zero denominator (e.g. both inverse masses and inertia terms zero)
+"void setConstraint4( b3Float4ConstArg posA, b3Float4ConstArg linVelA, b3Float4ConstArg angVelA, float invMassA, b3Mat3x3ConstArg invInertiaA,\n" // Fills *dstC from contact *src: up to four normal rows plus two friction rows around the averaged contact point.
+"	b3Float4ConstArg posB, b3Float4ConstArg linVelB, b3Float4ConstArg angVelB, float invMassB, b3Mat3x3ConstArg invInertiaB, \n"
+"	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,\n"
+"	b3ContactConstraint4_t* dstC )\n"
+"	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" // body index with the sign removed
+"	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n"
+"	float dtInv = 1.f/dt;\n"
+"	for(int ic=0; ic<4; ic++)\n"
+"	{\n"
+"		dstC->m_appliedRambdaDt[ic] = 0.f;\n"
+"	}\n"
+"	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n"
+"	dstC->m_linear = src->m_worldNormalOnB;\n"
+"	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" // w lane carries the friction coefficient, hard-coded to 0.7 here
+"	for(int ic=0; ic<4; ic++)\n"
+"	{\n"
+"		b3Float4 r0 = src->m_worldPosB[ic] - posA;\n" // contact point relative to each body position
+"		b3Float4 r1 = src->m_worldPosB[ic] - posB;\n"
+"		if( ic >= src->m_worldNormalOnB.w )//npoints\n" // the w lane of the normal is used as the contact-point count
+"		{\n"
+"			dstC->m_jacCoeffInv[ic] = 0.f;\n" // unused slot: m_b[ic] is not written on this path
+"			continue;\n"
+"		}\n"
+"		float relVelN;\n"
+"		{\n"
+"			b3Float4 linear, angular0, angular1;\n"
+"			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n"
+"			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
+"				invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
+"			relVelN = calcRelVel(linear, -linear, angular0, angular1,\n"
+"				linVelA, angVelA, linVelB, angVelB);\n"
+"			float e = 0.f;//src->getRestituitionCoeff();\n" // e is always 0 as written, so the e*relVelN term below contributes nothing
+"			if( relVelN*relVelN < 0.004f ) e = 0.f;\n"
+"			dstC->m_b[ic] = e*relVelN;\n"
+"			//float penetration = src->m_worldPosB[ic].w;\n"
+"			dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" // w lane of the point is read as penetration, per the commented-out line above
+"			dstC->m_appliedRambdaDt[ic] = 0.f;\n"
+"		}\n"
+"	}\n"
+"	if( src->m_worldNormalOnB.w > 0 )//npoints\n" // NOTE(review): with zero points, m_center and m_fAppliedRambdaDt are left unwritten by this function
+"	{	//	prepare friction\n"
+"		b3Float4 center = b3MakeFloat4(0.f,0.f,0.f,0.f);\n"
+"		for(int i=0; i<src->m_worldNormalOnB.w; i++) \n"
+"			center += src->m_worldPosB[i];\n" // all four lanes are summed, so center.w averages the w lanes too
+"		center /= (float)src->m_worldNormalOnB.w;\n"
+"		b3Float4 tangent[2];\n"
+"		b3PlaneSpace1(src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n" // two directions perpendicular to the normal
+"		\n"
+"		b3Float4 r[2];\n"
+"		r[0] = center - posA;\n"
+"		r[1] = center - posB;\n"
+"		for(int i=0; i<2; i++)\n"
+"		{\n"
+"			b3Float4 linear, angular0, angular1;\n"
+"			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n"
+"			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
+"				invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
+"			dstC->m_fAppliedRambdaDt[i] = 0.f;\n"
+"		}\n"
+"		dstC->m_center = center;\n"
+"	}\n"
+"	for(int i=0; i<4; i++)\n" // copy the used contact points and zero the unused slots
+"	{\n"
+"		if( i<src->m_worldNormalOnB.w )\n"
+"		{\n"
+"			dstC->m_worldPos[i] = src->m_worldPosB[i];\n"
+"		}\n"
+"		else\n"
+"		{\n"
+"			dstC->m_worldPos[i] = b3MakeFloat4(0.f,0.f,0.f,0.f);\n"
+"		}\n"
+"	}\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#define counter32_t volatile global int*\n"
+"typedef unsigned int u32;\n"
+"typedef unsigned short u16;\n"
+"typedef unsigned char u8;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"//	Vector\n"
+"float fastDiv(float numerator, float denominator)\n" // Divides through the native_divide built-in instead of the / operator.
+"	return native_divide(numerator, denominator);	\n"
+"//	return numerator/denominator;	\n"
+"float4 fastDiv4(float4 numerator, float4 denominator)\n" // Lane-wise native_divide of two float4 values.
+"	return native_divide(numerator, denominator);	\n"
+"float fastSqrtf(float f2)\n" // Square root through the native_sqrt built-in.
+"	return native_sqrt(f2);\n"
+"//	return sqrt(f2);\n"
+"float fastRSqrt(float f2)\n" // Reciprocal square root through the native_rsqrt built-in.
+"	return native_rsqrt(f2);\n"
+"float fastLength4(float4 v)\n" // Four-component length through fast_length (w included).
+"	return fast_length(v);\n"
+"float4 fastNormalize4(float4 v)\n" // Four-component normalization through fast_normalize (w included).
+"	return fast_normalize(v);\n"
+"float sqrtf(float a)\n" // Provides the name sqrtf in kernel code, implemented with native_sqrt.
+"//	return sqrt(a);\n"
+"	return native_sqrt(a);\n"
+"float4 cross3(float4 a, float4 b)\n" // Thin wrapper over the built-in cross for float4 arguments.
+"	return cross(a,b);\n"
+"float dot3F4(float4 a, float4 b)\n" // Dot product over x, y, z only.
+"	float4 a1 = make_float4(a.xyz,0.f);\n" // zero the w lanes so they drop out of dot()
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"float length3(const float4 a)\n" // Length of the xyz part of a.
+"	return sqrtf(dot3F4(a,a));\n"
+"float dot4(const float4 a, const float4 b)\n" // Four-component dot product (w included).
+"	return dot( a, b );\n"
+"//	for height\n"
+"float dot3w1(const float4 point, const float4 eqn)\n" // Evaluates eqn.xyz . point.xyz + eqn.w, i.e. a plane equation at point (point.w is ignored).
+"	return dot3F4(point,eqn) + eqn.w;\n"
+"float4 normalize3(const float4 a)\n" // Normalizes the xyz part of a; the input w is replaced by 0 first.
+"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"//	float length = sqrtf(dot3F4(a, a));\n"
+"//	return 1.f/length * a;\n"
+"float4 normalize4(const float4 a)\n" // Scales a by the reciprocal of its four-component length.
+"	float length = sqrtf(dot4(a, a));\n"
+"	return 1.f/length * a;\n" // no guard for a zero-length input
+"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" // Plane through a, b, c: xyz = normalized (b-a) x (c-a), w = -dot(xyz, a).
+"	float4 eqn;\n"
+"	float4 ab = b-a;\n"
+"	float4 ac = c-a;\n"
+"	eqn = normalize3( cross3(ab, ac) );\n"
+"	eqn.w = -dot3F4(eqn,a);\n" // chosen so that dot3w1(a, eqn) evaluates to 0
+"	return eqn;\n"
+"#define WG_SIZE 64\n"
+"typedef struct\n"
+"	int m_nConstraints;\n"
+"	int m_start;\n"
+"	int m_batchIdx;\n"
+"	int m_nSplit;\n"
+"//	int m_paddings[1];\n"
+"} ConstBuffer;\n" // Four-int parameter record; it is not referenced by the code visible in this chunk.
+"typedef struct\n"
+"	int m_solveFriction;\n"
+"	int m_maxBatch;	//	long batch really kills the performance\n"
+"	int m_batchIdx;\n"
+"	int m_nSplit;\n"
+"//	int m_paddings[1];\n"
+"} ConstBufferBatchSolve;\n" // Four-int parameter record; it is not referenced by the code visible in this chunk.
+" \n"
+"typedef struct \n"
+"	int m_valInt0;\n"
+"	int m_valInt1;\n"
+"	int m_valInt2;\n"
+"	int m_valInt3;\n"
+"	float m_val0;\n"
+"	float m_val1;\n"
+"	float m_val2;\n"
+"	float m_val3;\n"
+"} SolverDebugInfo;\n" // Generic record of four int and four float slots; not referenced by the code visible in this chunk.
+"typedef struct\n"
+"	int m_nContacts;\n"
+"	float m_dt;\n"
+"	float m_positionDrift;\n"
+"	float m_positionConstraintCoeff;\n"
+"} ConstBufferCTC;\n" // Fields mirror the scalar arguments of ContactToConstraintKernel, which takes them as separate parameters rather than through this struct.
+"void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global b3RigidBodyData_t* gBodies, __global b3InertiaData_t* gShapes, __global b3ContactConstraint4_t* gConstraintOut, \n" // For each global index below nContacts, converts gContact[gIdx] into gConstraintOut[gIdx].
+"int nContacts,\n"
+"float dt,\n"
+"float positionDrift,\n"
+"float positionConstraintCoeff\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	\n"
+"	if( gIdx < nContacts )\n" // indices at or past nContacts do nothing
+"	{\n"
+"		int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" // body indices with the sign removed
+"		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n"
+"		float4 posA = gBodies[aIdx].m_pos;\n"
+"		float4 linVelA = gBodies[aIdx].m_linVel;\n"
+"		float4 angVelA = gBodies[aIdx].m_angVel;\n"
+"		float invMassA = gBodies[aIdx].m_invMass;\n"
+"		b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;\n" // NOTE(review): uses m_initInvInertia, not m_invInertiaWorld - confirm this is the intended tensor
+"		float4 posB = gBodies[bIdx].m_pos;\n"
+"		float4 linVelB = gBodies[bIdx].m_linVel;\n"
+"		float4 angVelB = gBodies[bIdx].m_angVel;\n"
+"		float invMassB = gBodies[bIdx].m_invMass;\n"
+"		b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;\n"
+"		b3ContactConstraint4_t cs;\n" // built locally, then copied out as a whole; fields setConstraint4 skips stay unwritten
+"    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n"
+"			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,\n"
+"			&cs );\n"
+"		\n"
+"		cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n"
+"		gConstraintOut[gIdx] = cs;\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl
new file mode 100644
index 00000000..3dc48d43
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl
@@ -0,0 +1,613 @@
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Takahiro Harada
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#define counter32_t volatile global int*
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+#define make_float4 (float4)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+#define max2 max
+#define min2 min
+//	Vector
+float fastDiv(float numerator, float denominator)	// Divides through the native_divide built-in instead of the / operator.
+	return native_divide(numerator, denominator);	
+//	return numerator/denominator;	
+float4 fastDiv4(float4 numerator, float4 denominator)	// Lane-wise native_divide of two float4 values.
+	return native_divide(numerator, denominator);	
+float fastSqrtf(float f2)	// Square root through the native_sqrt built-in.
+	return native_sqrt(f2);
+//	return sqrt(f2);
+float fastRSqrt(float f2)	// Reciprocal square root through the native_rsqrt built-in.
+	return native_rsqrt(f2);
+float fastLength4(float4 v)	// Four-component length through fast_length (w included).
+	return fast_length(v);
+float4 fastNormalize4(float4 v)	// Four-component normalization through fast_normalize (w included).
+	return fast_normalize(v);
+float sqrtf(float a)	// Provides the name sqrtf in kernel code, implemented with native_sqrt.
+//	return sqrt(a);
+	return native_sqrt(a);
+float4 cross3(float4 a, float4 b)	// Thin wrapper over the built-in cross for float4 arguments.
+	return cross(a,b);
+float dot3F4(float4 a, float4 b)	// Dot product over x, y, z only.
+	float4 a1 = make_float4(a.xyz,0.f);	// zero the w lanes so they drop out of dot()
+	float4 b1 = make_float4(b.xyz,0.f);
+	return dot(a1, b1);
+float length3(const float4 a)	// Length of the xyz part of a.
+	return sqrtf(dot3F4(a,a));
+float dot4(const float4 a, const float4 b)	// Four-component dot product (w included).
+	return dot( a, b );
+//	for height
+float dot3w1(const float4 point, const float4 eqn)	// Evaluates eqn.xyz . point.xyz + eqn.w, i.e. a plane equation at point (point.w is ignored).
+	return dot3F4(point,eqn) + eqn.w;
+float4 normalize3(const float4 a)	// Normalizes the xyz part of a; the input w is replaced by 0 first.
+	float4 n = make_float4(a.x, a.y, a.z, 0.f);
+	return fastNormalize4( n );
+//	float length = sqrtf(dot3F4(a, a));
+//	return 1.f/length * a;
+float4 normalize4(const float4 a)	// Scales a by the reciprocal of its four-component length.
+	float length = sqrtf(dot4(a, a));
+	return 1.f/length * a;	// no guard for a zero-length input
+float4 createEquation(const float4 a, const float4 b, const float4 c)	// Plane through a, b, c: xyz = normalized (b-a) x (c-a), w = -dot(xyz, a).
+	float4 eqn;
+	float4 ab = b-a;
+	float4 ac = c-a;
+	eqn = normalize3( cross3(ab, ac) );
+	eqn.w = -dot3F4(eqn,a);	// chosen so that dot3w1(a, eqn) evaluates to 0
+	return eqn;
+//	Matrix3x3
+typedef struct
+	float4 m_row[3];
+Matrix3x3 mtZero();
+Matrix3x3 mtIdentity();
+Matrix3x3 mtTranspose(Matrix3x3 m);
+Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
+float4 mtMul1(Matrix3x3 a, float4 b);
+float4 mtMul3(float4 a, Matrix3x3 b);
+Matrix3x3 mtZero()	// Returns a matrix with every lane of all three rows set to 0.
+	Matrix3x3 m;
+	m.m_row[0] = (float4)(0.f);
+	m.m_row[1] = (float4)(0.f);
+	m.m_row[2] = (float4)(0.f);
+	return m;
+Matrix3x3 mtIdentity()	// Returns the 3x3 identity; the w lane of each row is 0.
+	Matrix3x3 m;
+	m.m_row[0] = (float4)(1,0,0,0);
+	m.m_row[1] = (float4)(0,1,0,0);
+	m.m_row[2] = (float4)(0,0,1,0);
+	return m;
+Matrix3x3 mtTranspose(Matrix3x3 m)	// Returns the transpose of the 3x3 part of m; output w lanes are 0.
+	Matrix3x3 out;
+	out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);	// row i of the result is column i of m
+	out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
+	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
+	return out;
+Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)	// Returns the 3x3 matrix product a*b; output w lanes are 0.
+	Matrix3x3 transB;
+	transB = mtTranspose( b );	// rows of transB are the columns of b
+	Matrix3x3 ans;
+	//	why this doesn't run when 0ing in the for{}
+	a.m_row[0].w = 0.f;	// a is a by-value copy, so the caller's matrix is not modified
+	a.m_row[1].w = 0.f;
+	a.m_row[2].w = 0.f;
+	for(int i=0; i<3; i++)
+	{
+//	a.m_row[i].w = 0.f;
+		ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);	// element (i,j) = row i of a dotted with column j of b
+		ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
+		ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
+		ans.m_row[i].w = 0.f;
+	}
+	return ans;
+float4 mtMul1(Matrix3x3 a, float4 b)	// Matrix times column vector: lane i is row i of a dotted with b.xyz; w is 0.
+	float4 ans;
+	ans.x = dot3F4( a.m_row[0], b );
+	ans.y = dot3F4( a.m_row[1], b );
+	ans.z = dot3F4( a.m_row[2], b );
+	ans.w = 0.f;
+	return ans;
+float4 mtMul3(float4 a, Matrix3x3 b)	// Row vector times matrix: lane j is a.xyz dotted with column j of b; w is 0.
+	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);	// gather the three columns of b
+	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
+	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
+	float4 ans;
+	ans.x = dot3F4( a, colx );
+	ans.y = dot3F4( a, coly );
+	ans.z = dot3F4( a, colz );
+	ans.w = 0.f;	// keep the fourth lane defined, as mtMul1 does, instead of returning an indeterminate value
+	return ans;
+//	Quaternion
+typedef float4 Quaternion;
+Quaternion qtMul(Quaternion a, Quaternion b);
+Quaternion qtNormalize(Quaternion in);
+float4 qtRotate(Quaternion q, float4 vec);
+Quaternion qtInvert(Quaternion q);
+Quaternion qtMul(Quaternion a, Quaternion b)	// Quaternion product a*b, with xyz as the vector part and w as the scalar part.
+	Quaternion ans;
+	ans = cross3( a, b );
+	ans += a.w*b+b.w*a;	// this also touches ans.w, which is overwritten below
+//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w*b.w - dot3F4(a, b);	// scalar part: product of the w lanes minus the xyz dot product
+	return ans;
+Quaternion qtNormalize(Quaternion in)	// Normalizes all four components of in via fastNormalize4.
+	return fastNormalize4(in);
+//	in /= length( in );
+//	return in;
+float4 qtRotate(Quaternion q, float4 vec)	// Computes q * (vec.xyz, 0) * qtInvert(q); presumably q is expected to be unit length.
+	Quaternion qInv = qtInvert( q );
+	float4 vcpy = vec;
+	vcpy.w = 0.f;	// treat vec as a pure-vector quaternion
+	float4 out = qtMul(qtMul(q,vcpy),qInv);
+	return out;
+Quaternion qtInvert(Quaternion q)	// Returns (-q.xyz, q.w); no division by the squared length is performed.
+	return (Quaternion)(-q.xyz, q.w);
+float4 qtInvRotate(const Quaternion q, float4 vec)	// Applies qtRotate with qtInvert(q) in place of q.
+	return qtRotate( qtInvert( q ), vec );
+#define WG_SIZE 64
+typedef struct
+	float4 m_pos;	// the only field the kernels visible in this file read (SetSortDataKernel)
+	Quaternion m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+	u32 m_shapeIdx;
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} Body;	// Per-body state record indexed by body index.
+typedef struct
+	Matrix3x3 m_invInertia;
+	Matrix3x3 m_initInvInertia;
+} Shape;	// Two inverse-inertia matrices; not referenced by the kernels visible in this file.
+typedef struct
+	float4 m_linear;
+	float4 m_worldPos[4];	// four float4 slots
+	float4 m_center;	
+	float m_jacCoeffInv[4];
+	float m_b[4];
+	float m_appliedRambdaDt[4];
+	float m_fJacCoeffInv[2];	
+	float m_fAppliedRambdaDt[2];	
+	u32 m_bodyA;
+	u32 m_bodyB;
+	int m_batchIdx;
+	u32 m_paddings[1];
+} Constraint4;	// Constraint record with four per-point slots and two 'f'-prefixed slots; not referenced by the kernels visible in this file.
+typedef struct
+	int m_nConstraints;
+	int m_start;
+	int m_batchIdx;
+	int m_nSplit;
+//	int m_paddings[1];
+} ConstBuffer;	// Four-int parameter record; not referenced by the kernels visible in this file.
+typedef struct
+	int m_solveFriction;
+	int m_maxBatch;	//	long batch really kills the performance
+	int m_batchIdx;
+	int m_nSplit;
+//	int m_paddings[1];
+} ConstBufferBatchSolve;	// Four-int parameter record; not referenced by the kernels visible in this file.
+typedef struct 
+	int m_valInt0;
+	int m_valInt1;
+	int m_valInt2;
+	int m_valInt3;
+	float m_val0;
+	float m_val1;
+	float m_val2;
+	float m_val3;
+} SolverDebugInfo;	// Generic record of four int and four float slots; not referenced by the kernels visible in this file.
+//	others
+void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )	// Gathers contacts: out[i] = in[sortData[i].y] for every global index i below cb.x.
+	int nContacts = cb.x;	// only the x lane of cb is used
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < nContacts )
+	{
+		int srcIdx = sortData[gIdx].y;	// .y holds the source contact index
+		out[gIdx] = in[srcIdx];
+	}
+__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
+void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)	// Initializes sortDataOut[i] = (m_childIndexB of contact i, i) for i below nContacts.
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < nContacts )
+	{
+		int2 sd;
+		sd.x = contactsIn[gIdx].m_childIndexB;	// sort key
+		sd.y = gIdx;	// contact index carried along with the key
+		sortDataOut[gIdx] = sd;
+	}
+__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
+void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)	// Replaces each entry's key with m_childIndexA of the contact that entry's .y refers to; .y is kept.
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < nContacts )
+	{
+		int2 sdIn;
+		sdIn = sortDataInOut[gIdx];
+		int2 sdOut;
+		sdOut.x = contactsIn[sdIn.y].m_childIndexA;	// new key, looked up through the stored contact index
+		sdOut.y = sdIn.y;
+		sortDataInOut[gIdx] = sdOut;
+	}
+__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
+void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)	// Replaces each entry's key with m_bodyAPtrAndSignBit of the contact that entry's .y refers to; .y is kept.
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < nContacts )
+	{
+		int2 sdIn;
+		sdIn = sortDataInOut[gIdx];
+		int2 sdOut;
+		sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit;	// key keeps the sign; no abs() is applied here
+		sdOut.y = sdIn.y;
+		sortDataInOut[gIdx] = sdOut;
+	}
+void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)	// Replaces each entry's key with m_bodyBPtrAndSignBit of the contact that entry's .y refers to; .y is kept.
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < nContacts )
+	{
+		int2 sdIn;
+		sdIn = sortDataInOut[gIdx];
+		int2 sdOut;
+		sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit;	// key keeps the sign; no abs() is applied here
+		sdOut.y = sdIn.y;
+		sortDataInOut[gIdx] = sdOut;
+	}
+typedef struct
+	int m_nContacts;
+	int m_staticIdx;
+	float m_scale;
+	int m_nSplit;
+} ConstBufferSSD;	// Field names echo SetSortDataKernel's scalar arguments, but that kernel takes them as separate parameters (and nSplit as int4).
+__constant const int gridTable4x4[] = 	// 4x4 lookup read by SetSortDataKernel as gridTable4x4[aa + bb*4]; the entries are symmetric in (aa, bb).
+    0,1,17,16,
+	1,2,18,19,
+	17,18,32,3,
+	16,19,3,34
+__constant const int gridTable8x8[] = 	// 8x8 lookup read by SetSortDataKernel as gridTable8x8[aa + bb*8] when USE_4x4_GRID is 0.
+	  0,  2,  3, 16, 17, 18, 19,  1,
+	 66, 64, 80, 67, 82, 81, 65, 83,
+	131,144,128,130,147,129,145,146,
+	208,195,194,192,193,211,210,209,
+	 21, 22, 23,  5,  4,  6,  7, 20,
+	 86, 85, 69, 87, 70, 68, 84, 71,
+	151,133,149,150,135,148,132,134,
+	197,27,214,213,212,199,198,196	// NOTE(review): every other row holds base..base+3 and base+16..base+19; here 27 sits where 215 would complete the set - confirm whether this is a typo
+#define USE_4x4_GRID 1
+void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, 	// Writes a (sort key, contact index) pair for each contact; indices at or past nContacts get key 0xffffffff.
+int nContacts,float scale,int4 nSplit,int staticIdx)
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < nContacts )
+	{
+		int aPtrAndSignBit  = gContact[gIdx].m_bodyAPtrAndSignBit;
+		int bPtrAndSignBit  = gContact[gIdx].m_bodyBPtrAndSignBit;
+		int aIdx = abs(aPtrAndSignBit );
+		int bIdx = abs(bPtrAndSignBit);
+		bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);	// a body counts as static if its value is negative or equals staticIdx
+		bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);
+		int idx = (aStatic)? bIdx: aIdx;	// position is taken from B when A is static, otherwise from A
+		float4 p = gBodies[idx].m_pos;
+		int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1);	// masking with nSplit-1 presumably assumes power-of-two splits - TODO confirm
+		int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);
+		int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);
+		int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y);	// NOTE(review): newIndex is declared again in the grid branch below; as shown that is a redefinition unless a guard not visible in this chunk compiles one of them out
+	#if USE_4x4_GRID
+		int aa = aIdx&3;	// low two bits of each body index select the table cell
+		int bb = bIdx&3;
+		if (aStatic)
+			aa = bb;
+		if (bStatic)
+			bb = aa;
+		int gridIndex = aa + bb*4;
+		int newIndex = gridTable4x4[gridIndex];
+	#else//USE_4x4_GRID
+		int aa = aIdx&7;	// low three bits of each body index select the table cell
+		int bb = bIdx&7;
+		if (aStatic)
+			aa = bb;
+		if (bStatic)
+			bb = aa;
+		int gridIndex = aa + bb*8;
+		int newIndex = gridTable8x8[gridIndex];
+	#endif//USE_4x4_GRID
+		gSortDataOut[gIdx].x = newIndex;
+		gSortDataOut[gIdx].y = gIdx;
+	}
+	else
+	{
+		gSortDataOut[gIdx].x = 0xffffffff;	// NOTE(review): written for every index past nContacts, so the buffer must cover the whole launch; .y is left unwritten here
+	}
+void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )	// Copies gIn[i] to gOut[i] for every global index i below cb.x.
+	int gIdx = GET_GLOBAL_IDX;
+	if( gIdx < cb.x )	// only the x lane of cb is used
+	{
+		gOut[gIdx] = gIn[gIdx];
+	}
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h
new file mode 100644
index 00000000..1b5819f6
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h
@@ -0,0 +1,601 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* solverSetup2CL= \
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Takahiro Harada\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#define counter32_t volatile global int*\n"
+"typedef unsigned int u32;\n"
+"typedef unsigned short u16;\n"
+"typedef unsigned char u8;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"//	Vector\n"
+"float fastDiv(float numerator, float denominator)\n"
+"	return native_divide(numerator, denominator);	\n"
+"//	return numerator/denominator;	\n"
+"float4 fastDiv4(float4 numerator, float4 denominator)\n"
+"	return native_divide(numerator, denominator);	\n"
+"float fastSqrtf(float f2)\n"
+"	return native_sqrt(f2);\n"
+"//	return sqrt(f2);\n"
+"float fastRSqrt(float f2)\n"
+"	return native_rsqrt(f2);\n"
+"float fastLength4(float4 v)\n"
+"	return fast_length(v);\n"
+"float4 fastNormalize4(float4 v)\n"
+"	return fast_normalize(v);\n"
+"float sqrtf(float a)\n"
+"//	return sqrt(a);\n"
+"	return native_sqrt(a);\n"
+"float4 cross3(float4 a, float4 b)\n"
+"	return cross(a,b);\n"
+"float dot3F4(float4 a, float4 b)\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"float length3(const float4 a)\n"
+"	return sqrtf(dot3F4(a,a));\n"
+"float dot4(const float4 a, const float4 b)\n"
+"	return dot( a, b );\n"
+"//	for height\n"
+"float dot3w1(const float4 point, const float4 eqn)\n"
+"	return dot3F4(point,eqn) + eqn.w;\n"
+"float4 normalize3(const float4 a)\n"
+"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"//	float length = sqrtf(dot3F4(a, a));\n"
+"//	return 1.f/length * a;\n"
+"float4 normalize4(const float4 a)\n"
+"	float length = sqrtf(dot4(a, a));\n"
+"	return 1.f/length * a;\n"
+"float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
+"	float4 eqn;\n"
+"	float4 ab = b-a;\n"
+"	float4 ac = c-a;\n"
+"	eqn = normalize3( cross3(ab, ac) );\n"
+"	eqn.w = -dot3F4(eqn,a);\n"
+"	return eqn;\n"
+"//	Matrix3x3\n"
+"typedef struct\n"
+"	float4 m_row[3];\n"
+"Matrix3x3 mtZero();\n"
+"Matrix3x3 mtIdentity();\n"
+"Matrix3x3 mtTranspose(Matrix3x3 m);\n"
+"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
+"float4 mtMul1(Matrix3x3 a, float4 b);\n"
+"float4 mtMul3(float4 a, Matrix3x3 b);\n"
+"Matrix3x3 mtZero()\n"
+"	Matrix3x3 m;\n"
+"	m.m_row[0] = (float4)(0.f);\n"
+"	m.m_row[1] = (float4)(0.f);\n"
+"	m.m_row[2] = (float4)(0.f);\n"
+"	return m;\n"
+"Matrix3x3 mtIdentity()\n"
+"	Matrix3x3 m;\n"
+"	m.m_row[0] = (float4)(1,0,0,0);\n"
+"	m.m_row[1] = (float4)(0,1,0,0);\n"
+"	m.m_row[2] = (float4)(0,0,1,0);\n"
+"	return m;\n"
+"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
+"	Matrix3x3 out;\n"
+"	out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
+"	Matrix3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	Matrix3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"float4 mtMul1(Matrix3x3 a, float4 b)\n"
+"	float4 ans;\n"
+"	ans.x = dot3F4( a.m_row[0], b );\n"
+"	ans.y = dot3F4( a.m_row[1], b );\n"
+"	ans.z = dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"float4 mtMul3(float4 a, Matrix3x3 b)\n"
+"	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	float4 ans;\n"
+"	ans.x = dot3F4( a, colx );\n"
+"	ans.y = dot3F4( a, coly );\n"
+"	ans.z = dot3F4( a, colz );\n"
+"	return ans;\n"
+"//	Quaternion\n"
+"typedef float4 Quaternion;\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"	return fastNormalize4(in);\n"
+"//	in /= length( in );\n"
+"//	return in;\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"#define WG_SIZE 64\n"
+"typedef struct\n"
+"	float4 m_pos;\n"
+"	Quaternion m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_shapeIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} Body;\n"
+"typedef struct\n"
+"	Matrix3x3 m_invInertia;\n"
+"	Matrix3x3 m_initInvInertia;\n"
+"} Shape;\n"
+"typedef struct\n"
+"	float4 m_linear;\n"
+"	float4 m_worldPos[4];\n"
+"	float4 m_center;	\n"
+"	float m_jacCoeffInv[4];\n"
+"	float m_b[4];\n"
+"	float m_appliedRambdaDt[4];\n"
+"	float m_fJacCoeffInv[2];	\n"
+"	float m_fAppliedRambdaDt[2];	\n"
+"	u32 m_bodyA;\n"
+"	u32 m_bodyB;\n"
+"	int m_batchIdx;\n"
+"	u32 m_paddings[1];\n"
+"} Constraint4;\n"
+"typedef struct\n"
+"	int m_nConstraints;\n"
+"	int m_start;\n"
+"	int m_batchIdx;\n"
+"	int m_nSplit;\n"
+"//	int m_paddings[1];\n"
+"} ConstBuffer;\n"
+"typedef struct\n"
+"	int m_solveFriction;\n"
+"	int m_maxBatch;	//	long batch really kills the performance\n"
+"	int m_batchIdx;\n"
+"	int m_nSplit;\n"
+"//	int m_paddings[1];\n"
+"} ConstBufferBatchSolve;\n"
+" \n"
+"typedef struct \n"
+"	int m_valInt0;\n"
+"	int m_valInt1;\n"
+"	int m_valInt2;\n"
+"	int m_valInt3;\n"
+"	float m_val0;\n"
+"	float m_val1;\n"
+"	float m_val2;\n"
+"	float m_val3;\n"
+"} SolverDebugInfo;\n"
+"//	others\n"
+"void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
+"	int nContacts = cb.x;\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < nContacts )\n"
+"	{\n"
+"		int srcIdx = sortData[gIdx].y;\n"
+"		out[gIdx] = in[srcIdx];\n"
+"	}\n"
+"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < nContacts )\n"
+"	{\n"
+"		int2 sd;\n"
+"		sd.x = contactsIn[gIdx].m_childIndexB;\n"
+"		sd.y = gIdx;\n"
+"		sortDataOut[gIdx] = sd;\n"
+"	}\n"
+"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < nContacts )\n"
+"	{\n"
+"		int2 sdIn;\n"
+"		sdIn = sortDataInOut[gIdx];\n"
+"		int2 sdOut;\n"
+"		sdOut.x = contactsIn[sdIn.y].m_childIndexA;\n"
+"		sdOut.y = sdIn.y;\n"
+"		sortDataInOut[gIdx] = sdOut;\n"
+"	}\n"
+"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < nContacts )\n"
+"	{\n"
+"		int2 sdIn;\n"
+"		sdIn = sortDataInOut[gIdx];\n"
+"		int2 sdOut;\n"
+"		sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit;\n"
+"		sdOut.y = sdIn.y;\n"
+"		sortDataInOut[gIdx] = sdOut;\n"
+"	}\n"
+"void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < nContacts )\n"
+"	{\n"
+"		int2 sdIn;\n"
+"		sdIn = sortDataInOut[gIdx];\n"
+"		int2 sdOut;\n"
+"		sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit;\n"
+"		sdOut.y = sdIn.y;\n"
+"		sortDataInOut[gIdx] = sdOut;\n"
+"	}\n"
+"typedef struct\n"
+"	int m_nContacts;\n"
+"	int m_staticIdx;\n"
+"	float m_scale;\n"
+"	int m_nSplit;\n"
+"} ConstBufferSSD;\n"
+"__constant const int gridTable4x4[] = \n"
+"    0,1,17,16,\n"
+"	1,2,18,19,\n"
+"	17,18,32,3,\n"
+"	16,19,3,34\n"
+"__constant const int gridTable8x8[] = \n"
+"	  0,  2,  3, 16, 17, 18, 19,  1,\n"
+"	 66, 64, 80, 67, 82, 81, 65, 83,\n"
+"	131,144,128,130,147,129,145,146,\n"
+"	208,195,194,192,193,211,210,209,\n"
+"	 21, 22, 23,  5,  4,  6,  7, 20,\n"
+"	 86, 85, 69, 87, 70, 68, 84, 71,\n"
+"	151,133,149,150,135,148,132,134,\n"
+"	197,27,214,213,212,199,198,196\n"
+"	\n"
+"#define USE_4x4_GRID 1\n"
+"void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n"
+"int nContacts,float scale,int4 nSplit,int staticIdx)\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	\n"
+"	if( gIdx < nContacts )\n"
+"	{\n"
+"		int aPtrAndSignBit  = gContact[gIdx].m_bodyAPtrAndSignBit;\n"
+"		int bPtrAndSignBit  = gContact[gIdx].m_bodyBPtrAndSignBit;\n"
+"		int aIdx = abs(aPtrAndSignBit );\n"
+"		int bIdx = abs(bPtrAndSignBit);\n"
+"		bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);\n"
+"		bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);\n"
+"		int idx = (aStatic)? bIdx: aIdx;\n"
+"		float4 p = gBodies[idx].m_pos;\n"
+"		int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1);\n"
+"		int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);\n"
+"		int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);\n"
+"		int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y);\n"
+"		\n"
+"	#if USE_4x4_GRID\n"
+"		int aa = aIdx&3;\n"
+"		int bb = bIdx&3;\n"
+"		if (aStatic)\n"
+"			aa = bb;\n"
+"		if (bStatic)\n"
+"			bb = aa;\n"
+"		int gridIndex = aa + bb*4;\n"
+"		int newIndex = gridTable4x4[gridIndex];\n"
+"	#else//USE_4x4_GRID\n"
+"		int aa = aIdx&7;\n"
+"		int bb = bIdx&7;\n"
+"		if (aStatic)\n"
+"			aa = bb;\n"
+"		if (bStatic)\n"
+"			bb = aa;\n"
+"		int gridIndex = aa + bb*8;\n"
+"		int newIndex = gridTable8x8[gridIndex];\n"
+"	#endif//USE_4x4_GRID\n"
+"		gSortDataOut[gIdx].x = newIndex;\n"
+"		gSortDataOut[gIdx].y = gIdx;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		gSortDataOut[gIdx].x = 0xffffffff;\n"
+"	}\n"
+"void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	if( gIdx < cb.x )\n"
+"	{\n"
+"		gOut[gIdx] = gIn[gIdx];\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl
new file mode 100644
index 00000000..a21a08c3
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl
@@ -0,0 +1,968 @@
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//Originally written by Erwin Coumans
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#define counter32_t volatile global int*
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+#define make_float4 (float4)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+#define max2 max
+#define min2 min
+//	Vector
+float fastDiv(float numerator, float denominator) // scalar division through the native_divide built-in
+	return native_divide(numerator, denominator);	
+//	return numerator/denominator;	
+float4 fastDiv4(float4 numerator, float4 denominator) // component-wise float4 division through native_divide
+	return native_divide(numerator, denominator);	
+float fastSqrtf(float f2) // square root of f2 through native_sqrt
+	return native_sqrt(f2);
+//	return sqrt(f2);
+float fastRSqrt(float f2) // reciprocal square root of f2 through native_rsqrt
+	return native_rsqrt(f2);
+float fastLength4(float4 v) // length of all four components of v through fast_length
+	return fast_length(v);
+float4 fastNormalize4(float4 v) // normalizes all four components of v through fast_normalize
+	return fast_normalize(v);
+float sqrtf(float a) // file-local sqrtf defined in terms of native_sqrt (the sqrt variant is commented out)
+//	return sqrt(a);
+	return native_sqrt(a);
+float4 cross3(float4 a1, float4 b1) // cross product of the xyz parts of a1 and b1
+	float4 	a=make_float4(a1.xyz,0.f); // w of each operand is forced to 0 before calling cross
+	float4 	b=make_float4(b1.xyz,0.f);
+	//float4 	a=a1;
+	//float4 	b=b1;
+	return cross(a,b);
+float dot3F4(float4 a, float4 b) // dot product over xyz only
+	float4 a1 = make_float4(a.xyz,0.f); // w is zeroed so it does not contribute to dot()
+	float4 b1 = make_float4(b.xyz,0.f);
+	return dot(a1, b1);
+float length3(const float4 a) // length of the xyz part of a (w ignored via dot3F4)
+	return sqrtf(dot3F4(a,a));
+float dot4(const float4 a, const float4 b) // full four-component dot product
+	return dot( a, b );
+//	for height
+float dot3w1(const float4 point, const float4 eqn) // xyz dot product of point and eqn plus eqn.w; presumably eqn is a plane equation as built by createEquation - TODO confirm
+	return dot3F4(point,eqn) + eqn.w;
+float4 normalize3(const float4 a) // normalizes the xyz part of a; the input w is discarded
+	float4 n = make_float4(a.x, a.y, a.z, 0.f);
+	return fastNormalize4( n );
+//	float length = sqrtf(dot3F4(a, a));
+//	return 1.f/length * a;
+float4 normalize4(const float4 a) // scales a by the reciprocal of its four-component length
+	float length = sqrtf(dot4(a, a)); // NOTE(review): no guard for a zero-length input, which makes the division below 1/0
+	return 1.f/length * a;
+float4 createEquation(const float4 a, const float4 b, const float4 c) // builds a float4 from three points: xyz = normalized (b-a) x (c-a), w = -dot(xyz, a)
+	float4 eqn;
+	float4 ab = b-a;
+	float4 ac = c-a;
+	eqn = normalize3( cross3(ab, ac) );
+	eqn.w = -dot3F4(eqn,a); // with this w, dot3w1(a, eqn) evaluates to 0
+	return eqn;
+//	Matrix3x3
+typedef struct
+	float4 m_row[3]; // three float4 rows; mtZero, mtIdentity, mtTranspose and mtMul all produce rows with w = 0
+Matrix3x3 mtZero();
+Matrix3x3 mtIdentity();
+Matrix3x3 mtTranspose(Matrix3x3 m);
+Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
+float4 mtMul1(Matrix3x3 a, float4 b);
+float4 mtMul3(float4 a, Matrix3x3 b);
+Matrix3x3 mtZero() // returns a matrix with every component of every row set to 0
+	Matrix3x3 m;
+	m.m_row[0] = (float4)(0.f);
+	m.m_row[1] = (float4)(0.f);
+	m.m_row[2] = (float4)(0.f);
+	return m;
+Matrix3x3 mtIdentity() // returns the 3x3 identity, with w = 0 in each row
+	Matrix3x3 m;
+	m.m_row[0] = (float4)(1,0,0,0);
+	m.m_row[1] = (float4)(0,1,0,0);
+	m.m_row[2] = (float4)(0,0,1,0);
+	return m;
+Matrix3x3 mtTranspose(Matrix3x3 m) // returns the transpose of the 3x3 part of m; w of each output row is 0
+	Matrix3x3 out;
+	out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f); // output row i is input column i
+	out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
+	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
+	return out;
+Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b) // matrix product a*b of the 3x3 parts; w of each result row is 0
+	Matrix3x3 transB;
+	transB = mtTranspose( b ); // rows of transB are the columns of b, so each entry below is a row-by-column dot product
+	Matrix3x3 ans;
+	//	why this doesn't run when 0ing in the for{}
+	a.m_row[0].w = 0.f; // a is passed by value, so this only changes the local copy
+	a.m_row[1].w = 0.f;
+	a.m_row[2].w = 0.f;
+	for(int i=0; i<3; i++)
+	{
+//	a.m_row[i].w = 0.f;
+		ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
+		ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
+		ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
+		ans.m_row[i].w = 0.f;
+	}
+	return ans;
+float4 mtMul1(Matrix3x3 a, float4 b) // matrix times column vector: component i is row i of a dotted with b.xyz
+	float4 ans;
+	ans.x = dot3F4( a.m_row[0], b );
+	ans.y = dot3F4( a.m_row[1], b );
+	ans.z = dot3F4( a.m_row[2], b );
+	ans.w = 0.f;
+	return ans;
+float4 mtMul3(float4 a, Matrix3x3 b) // row vector times matrix: component i is a.xyz dotted with column i of b
+	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
+	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
+	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
+	float4 ans; // NOTE(review): ans.w is never assigned, unlike mtMul1 which sets it to 0; callers must not read the returned w
+	ans.x = dot3F4( a, colx );
+	ans.y = dot3F4( a, coly );
+	ans.z = dot3F4( a, colz );
+	return ans;
+//	Quaternion
+typedef float4 Quaternion;
+Quaternion qtMul(Quaternion a, Quaternion b);
+Quaternion qtNormalize(Quaternion in);
+float4 qtRotate(Quaternion q, float4 vec);
+Quaternion qtInvert(Quaternion q);
+Quaternion qtMul(Quaternion a, Quaternion b) // quaternion product with the vector part in xyz and the scalar part in w
+	Quaternion ans;
+	ans = cross3( a, b ); // xyz = a.xyz x b.xyz, w = 0 at this point
+	ans += a.w*b+b.w*a; // adds a.w*b.xyz + b.w*a.xyz; the w produced here is overwritten below
+//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w*b.w - dot3F4(a, b);
+	return ans;
+Quaternion qtNormalize(Quaternion in) // normalizes all four components of in through fastNormalize4
+	return fastNormalize4(in);
+//	in /= length( in );
+//	return in;
+float4 qtRotate(Quaternion q, float4 vec) // computes q * (vec.xyz, 0) * qtInvert(q)
+	Quaternion qInv = qtInvert( q );
+	float4 vcpy = vec;
+	vcpy.w = 0.f; // vec is treated as a quaternion with zero scalar part
+	float4 out = qtMul(qtMul(q,vcpy),qInv);
+	return out;
+Quaternion qtInvert(Quaternion q) // negates xyz and keeps w (the conjugate); this equals the inverse only when q has unit length
+	return (Quaternion)(-q.xyz, q.w);
+float4 qtInvRotate(const Quaternion q, float4 vec) // applies qtRotate using qtInvert(q) instead of q
+	return qtRotate( qtInvert( q ), vec );
+#define WG_SIZE 64
+typedef struct
+	float4 m_pos; // read by the solver as the point contact offsets are measured from
+	Quaternion m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+	u32 m_shapeIdx; // not read by the code visible in this excerpt
+	float m_invMass; // functions in this file test it for non-zero before reading or writing a body's velocity deltas
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} Body;
+typedef struct
+	Matrix3x3 m_invInertia; // the inverse inertia the solver reads, looked up by body index
+	Matrix3x3 m_initInvInertia; // not read in this excerpt; presumably the inverse inertia before orientation is applied - TODO confirm
+} Shape;
+typedef struct
+	float4 m_linear; // contact direction passed to setLinearAndAngular; a commented-out line suggests w once carried a friction coefficient - TODO confirm
+	float4 m_worldPos[4]; // up to four contact points; offsets to the bodies are computed as m_worldPos[i] - body position
+	float4 m_center;	// point used for the two friction rows
+	float m_jacCoeffInv[4]; // per-point factor applied to the relative velocity; a value of 0 makes solveContact skip that point
+	float m_b[4]; // per-point bias added to the relative velocity
+	float m_appliedRambdaDt[4]; // accumulated, clamped per-point impulse; updated in place by solveContact
+	float m_fJacCoeffInv[2];	// friction counterparts of m_jacCoeffInv, one per tangent
+	float m_fAppliedRambdaDt[2];	// friction counterparts of m_appliedRambdaDt
+	u32 m_bodyA; // index into the body array
+	u32 m_bodyB;
+	int m_batchIdx;
+	u32 m_paddings;
+} Constraint4;
+__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex) // one work-item per manifold: counts contacts per non-fixed body and records each contact's slot
+	int i = GET_GLOBAL_IDX;
+	if( i < numContactManifolds)
+	{
+		int pa = manifoldPtr[i].m_bodyAPtrAndSignBit;
+		bool isFixedA = (pa <0) || (pa == fixedBodyIndex); // fixed when the stored value is negative or equals fixedBodyIndex
+		int bodyIndexA = abs(pa);
+		if (!isFixedA)
+		{
+			 AtomInc1(bodyCount[bodyIndexA],contactConstraintOffsets[i].x); // atomically bumps the body's counter and stores atom_inc's return value in .x
+		}
+		int pb = manifoldPtr[i].m_bodyBPtrAndSignBit;
+		bool isFixedB = (pb <0) || (pb == fixedBodyIndex);
+		int bodyIndexB = abs(pb);
+		if (!isFixedB)
+		{
+			AtomInc1(bodyCount[bodyIndexB],contactConstraintOffsets[i].y); // same for body B, stored in .y; the offsets are left untouched for fixed bodies
+		} 
+	}
+__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies) // zeroes the first numSplitBodies entries of both arrays
+	int i = GET_GLOBAL_IDX;
+	if( i < numSplitBodies)
+	{
+		linearVelocities[i] = make_float4(0);
+		angularVelocities[i] = make_float4(0);
+	}
+__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount, // one work-item per body: replaces each of the body's split velocity deltas with their mean
+__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)
+	int i = GET_GLOBAL_IDX;
+	if (i<numBodies)
+	{
+		if (gBodies[i].m_invMass) // bodies with zero inverse mass are skipped
+		{
+			int bodyOffset = offsetSplitBodies[i]; // first split slot belonging to body i
+			int count = bodyCount[i]; // number of split slots belonging to body i
+			float factor = 1.f/((float)count); // evaluated even when count is 0, but then neither loop below runs so the value is unused
+			float4 averageLinVel = make_float4(0.f);
+			float4 averageAngVel = make_float4(0.f);
+			for (int j=0;j<count;j++)
+			{
+				averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;
+				averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;
+			}
+			for (int j=0;j<count;j++)
+			{
+				deltaLinearVelocities[bodyOffset+j] = averageLinVel; // every slot of the body receives the same averaged delta
+				deltaAngularVelocities[bodyOffset+j] = averageAngVel;
+			}
+		}//bodies[i].m_invMass
+	}//i<numBodies
+void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1) // writes the three outputs from direction n and offsets r0, r1
+	*linear = make_float4(n.xyz,0.f); // n with w cleared
+	*angular0 = cross3(r0, n);
+	*angular1 = -cross3(r1, n); // note the sign flip for the second body
+float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ) // sum of four xyz dot products pairing each linear/angular direction with the matching velocity
+	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);
+float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, // returns -1 / (weighted sum of the linear and angular terms of both bodies)
+					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB)
+	//	linear0,1 are normalized
+	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;
+	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0); // only xyz of the mtMul3 result is used
+	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;
+	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);
+	return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB); // linear0 and linear1 are unused in live code; NOTE(review): no guard for a zero denominator - presumably callers never pass two immovable bodies, confirm
+void btPlaneSpace1 (float4 n, float4* p, float4* q);
+ void btPlaneSpace1 (float4 n, float4* p, float4* q) // writes xyz of p[0] and q[0] from n; the .w of both outputs is never written
+  if (fabs(n.z) > 0.70710678f) { // threshold is approximately 1/sqrt(2); this branch keeps a away from 0
+    // choose p in y-z plane
+    float a = n.y*n.y + n.z*n.z;
+    float k = 1.f/sqrt(a);
+    p[0].x = 0;
+	p[0].y = -n.z*k;
+	p[0].z = n.y*k;
+    // set q = n x p
+    q[0].x = a*k;
+	q[0].y = -n.x*p[0].z;
+	q[0].z = n.x*p[0].y;
+  }
+  else {
+    // choose p in x-y plane
+    float a = n.x*n.x + n.y*n.y;
+    float k = 1.f/sqrt(a);
+    p[0].x = -n.y*k;
+	p[0].y = n.x*k;
+	p[0].z = 0;
+    // set q = n x p
+    q[0].x = -n.z*p[0].y;
+	q[0].y = n.z*p[0].x;
+	q[0].z = a*k;
+  }
+void solveContact(__global Constraint4* cs, // one pass over the four contact points of cs, accumulating into the d*Vel outputs
+			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,
+			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB,
+			float4* dLinVelA, float4* dAngVelA, float4* dLinVelB, float4* dAngVelB)
+	float minRambdaDt = 0; // the accumulated per-point value is clamped to [0, FLT_MAX]
+	float maxRambdaDt = FLT_MAX;
+	for(int ic=0; ic<4; ic++)
+	{
+		if( cs->m_jacCoeffInv[ic] == 0.f ) continue; // a zero coefficient marks a point to skip
+		float4 angular0, angular1, linear;
+		float4 r0 = cs->m_worldPos[ic] - posA;
+		float4 r1 = cs->m_worldPos[ic] - posB;
+		setLinearAndAngular( cs->m_linear, r0, r1, &linear, &angular0, &angular1 );
+		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, // velocities passed are base velocity plus the delta accumulated so far
+			*linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic];
+		rambdaDt *= cs->m_jacCoeffInv[ic];
+		{
+			float prevSum = cs->m_appliedRambdaDt[ic];
+			float updated = prevSum;
+			updated += rambdaDt;
+			updated = max2( updated, minRambdaDt );
+			updated = min2( updated, maxRambdaDt );
+			rambdaDt = updated - prevSum; // only the part that survives clamping is applied this pass
+			cs->m_appliedRambdaDt[ic] = updated; // written back to global memory
+		}
+		float4 linImp0 = invMassA*linear*rambdaDt;
+		float4 linImp1 = invMassB*(-linear)*rambdaDt;
+		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
+		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
+		if (invMassA) // deltas are only accumulated for bodies with non-zero inverse mass
+		{
+			*dLinVelA += linImp0;
+			*dAngVelA += angImp0;
+		}
+		if (invMassB)
+		{
+			*dLinVelB += linImp1;
+			*dAngVelB += angImp1;
+		}
+	}
+//	solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);
+void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, // loads both bodies of ldsCs[0], runs solveContact, stores the updated split deltas
+__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,
+__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)
+	//float frictionCoeff = ldsCs[0].m_linear.w;
+	int aIdx = ldsCs[0].m_bodyA;
+	int bIdx = ldsCs[0].m_bodyB;
+	float4 posA = gBodies[aIdx].m_pos;
+	float4 linVelA = gBodies[aIdx].m_linVel;
+	float4 angVelA = gBodies[aIdx].m_angVel;
+	float invMassA = gBodies[aIdx].m_invMass;
+	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; // gShapes is indexed with the body index here, not with Body.m_shapeIdx
+	float4 posB = gBodies[bIdx].m_pos;
+	float4 linVelB = gBodies[bIdx].m_linVel;
+	float4 angVelB = gBodies[bIdx].m_angVel;
+	float invMassB = gBodies[bIdx].m_invMass;
+	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;
+	float4 dLinVelA = make_float4(0,0,0,0); // deltas stay zero for a body whose inverse mass is zero
+	float4 dAngVelA = make_float4(0,0,0,0);
+	float4 dLinVelB = make_float4(0,0,0,0);
+	float4 dAngVelB = make_float4(0,0,0,0);
+	int bodyOffsetA = offsetSplitBodies[aIdx];
+	int constraintOffsetA = contactConstraintOffsets[0].x; // the caller passes a pointer to this constraint's entry, hence index 0
+	int splitIndexA = bodyOffsetA+constraintOffsetA; // slot of this (body, constraint) pair in the split delta arrays
+	if (invMassA)
+	{
+		dLinVelA = deltaLinearVelocities[splitIndexA];
+		dAngVelA = deltaAngularVelocities[splitIndexA];
+	}
+	int bodyOffsetB = offsetSplitBodies[bIdx];
+	int constraintOffsetB = contactConstraintOffsets[0].y;
+	int splitIndexB= bodyOffsetB+constraintOffsetB;
+	if (invMassB)
+	{
+		dLinVelB = deltaLinearVelocities[splitIndexB];
+		dAngVelB = deltaAngularVelocities[splitIndexB];
+	}
+	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,
+			posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB);
+	if (invMassA) // only the split deltas are written back; gBodies itself is not modified here
+	{
+		deltaLinearVelocities[splitIndexA] = dLinVelA;
+		deltaAngularVelocities[splitIndexA] = dAngVelA;
+	} 
+	if (invMassB)
+	{
+		deltaLinearVelocities[splitIndexB] = dLinVelB;
+		deltaAngularVelocities[splitIndexB] = dAngVelB;
+	}
+__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes , // one work-item per manifold: solves constraint i against the split velocity deltas
+__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,
+float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds // deltaTime, positionDrift, positionConstraintCoeff and fixedBodyIndex are not used in the visible body
+	int i = GET_GLOBAL_IDX;
+	if (i<numManifolds)
+	{
+		solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities); // passes pointers to element i, so the callee indexes them with 0
+	}
+void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, // Friction pass for one constraint: loads both bodies' delta velocities from their split slots, applies two clamped tangential impulses, writes the deltas back.
+							__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies, // only contactConstraintOffsets[0] is read: .x is body A's slot offset, .y is body B's
+							__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities) // read and written at splitIndexA / splitIndexB only
+	float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w; (hard-coded; the per-constraint value is not read)
+	int aIdx = ldsCs[0].m_bodyA;
+	int bIdx = ldsCs[0].m_bodyB;
+	float4 posA = gBodies[aIdx].m_pos;
+	float4 linVelA = gBodies[aIdx].m_linVel;
+	float4 angVelA = gBodies[aIdx].m_angVel;
+	float invMassA = gBodies[aIdx].m_invMass;
+	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; // gShapes is indexed by body index here, not by m_shapeIdx
+	float4 posB = gBodies[bIdx].m_pos;
+	float4 linVelB = gBodies[bIdx].m_linVel;
+	float4 angVelB = gBodies[bIdx].m_angVel;
+	float invMassB = gBodies[bIdx].m_invMass;
+	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;
+	float4 dLinVelA = make_float4(0,0,0,0); // deltas stay zero for a body whose inverse mass is zero
+	float4 dAngVelA = make_float4(0,0,0,0);
+	float4 dLinVelB = make_float4(0,0,0,0);
+	float4 dAngVelB = make_float4(0,0,0,0);
+	int bodyOffsetA = offsetSplitBodies[aIdx]; // first split slot belonging to body A
+	int constraintOffsetA = contactConstraintOffsets[0].x; // this constraint's slot within body A's range
+	int splitIndexA = bodyOffsetA+constraintOffsetA;
+	if (invMassA) // load the accumulated deltas only for bodies with non-zero inverse mass
+	{
+		dLinVelA = deltaLinearVelocities[splitIndexA];
+		dAngVelA = deltaAngularVelocities[splitIndexA];
+	}
+	int bodyOffsetB = offsetSplitBodies[bIdx]; // same slot lookup for body B
+	int constraintOffsetB = contactConstraintOffsets[0].y;
+	int splitIndexB= bodyOffsetB+constraintOffsetB;
+	if (invMassB)
+	{
+		dLinVelB = deltaLinearVelocities[splitIndexB];
+		dAngVelB = deltaAngularVelocities[splitIndexB];
+	}
+	{
+		float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; // only entries 0 and 1 are read by the tangent loop below
+		float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+		float sum = 0; // total of the applied normal impulses stored on the constraint
+		for(int j=0; j<4; j++)
+		{
+			sum +=ldsCs[0].m_appliedRambdaDt[j];
+		}
+		frictionCoeff = 0.7f; // repeats the initial value
+		for(int j=0; j<4; j++)
+		{
+			maxRambdaDt[j] = frictionCoeff*sum; // friction impulse is limited to +/- frictionCoeff * sum
+			minRambdaDt[j] = -maxRambdaDt[j];
+		}
+//		solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,
+//			posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );
+		{
+			__global Constraint4* cs = ldsCs;
+			if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[1] == 0 ) return; // nothing to solve when neither tangent direction has a coefficient; deltas are unchanged so no write-back is needed
+			const float4 center = cs->m_center; // point at which the friction impulses are applied
+			float4 n = -cs->m_linear; // negated constraint normal
+			float4 tangent[2];
+			btPlaneSpace1(n,&tangent[0],&tangent[1]); // fills the two tangent directions from n
+			float4 angular0, angular1, linear;
+			float4 r0 = center - posA; // lever arms from each body position to the center
+			float4 r1 = center - posB;
+			for(int i=0; i<2; i++) // one impulse per tangent direction
+			{
+				setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
+				float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, // relative velocity along the tangent, including the pending deltas
+											linVelA+dLinVelA, angVelA+dAngVelA, linVelB+dLinVelB, angVelB+dAngVelB );
+				rambdaDt *= cs->m_fJacCoeffInv[i];
+				{
+					float prevSum = cs->m_fAppliedRambdaDt[i]; // impulse accumulated so far for this tangent
+					float updated = prevSum;
+					updated += rambdaDt;
+					updated = max2( updated, minRambdaDt[i] ); // clamp the accumulated value, not the increment
+					updated = min2( updated, maxRambdaDt[i] );
+					rambdaDt = updated - prevSum; // apply only the part that survived the clamp
+					cs->m_fAppliedRambdaDt[i] = updated;
+				}
+				float4 linImp0 = invMassA*linear*rambdaDt;
+				float4 linImp1 = invMassB*(-linear)*rambdaDt;
+				float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
+				float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
+				dLinVelA += linImp0; // accumulated unconditionally; the write-back below is guarded by invMass
+				dAngVelA += angImp0;
+				dLinVelB += linImp1;
+				dAngVelB += angImp1;
+			}
+			{	//	angular damping for point constraint
+				float4 ab = normalize3( posB - posA );
+				float4 ac = normalize3( center - posA );
+				if( dot3F4( ab, ac ) > 0.95f  || (invMassA == 0.f || invMassB == 0.f)) // center nearly on the A-to-B line, or either body has zero inverse mass
+				{
+					float angNA = dot3F4( n, angVelA ); // angular velocity component about n
+					float angNB = dot3F4( n, angVelB );
+					dAngVelA -= (angNA*0.1f)*n; // take off 10% of that component
+					dAngVelB -= (angNB*0.1f)*n;
+				}
+			}
+		}
+	}
+	if (invMassA) // store the updated deltas back into the same split slots
+	{
+		deltaLinearVelocities[splitIndexA] = dLinVelA;
+		deltaAngularVelocities[splitIndexA] = dAngVelA;
+	} 
+	if (invMassB)
+	{
+		deltaLinearVelocities[splitIndexB] = dLinVelB;
+		deltaAngularVelocities[splitIndexB] = dAngVelB;
+	}
+__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes , // Kernel entry: each work-item i < numManifolds runs solveFrictionConstraint on gConstraints[i].
+										__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies, // forwarded to the helper; entry i of contactConstraintOffsets belongs to constraint i
+										__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, // delta-velocity buffers updated by the helper
+										float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds // deltaTime, positionDrift, positionConstraintCoeff and fixedBodyIndex are not read in this body
+	int i = GET_GLOBAL_IDX; // index of the constraint handled by this work-item
+	if (i<numManifolds) // work-items past the last manifold do nothing
+	{
+		solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);
+	}
+__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount, // Kernel entry: adds a body's delta velocities onto its linear and angular velocity, one body per work-item.
+									__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies) // delta buffers are only read here
+	int i = GET_GLOBAL_IDX; // body index handled by this work-item
+	if (i<numBodies)
+	{
+		if (gBodies[i].m_invMass) // bodies with zero inverse mass are left untouched
+		{
+			int bodyOffset = offsetSplitBodies[i]; // first split slot of body i
+			int count = bodyCount[i];
+			if (count) // skip bodies with a zero count
+			{
+				gBodies[i].m_linVel += deltaLinearVelocities[bodyOffset]; // only the first slot is read; NOTE(review): presumably all of the body's slots already hold the same averaged value - confirm
+				gBodies[i].m_angVel += deltaAngularVelocities[bodyOffset];
+			}
+		}
+	}
+void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA, // Fills *dstC from contact *src: per-point normal coefficients and bias terms, two friction coefficients about the contact center, and copies of the contact points.
+	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, 
+	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB, // countA/countB are passed straight through to calcJacCoeff
+	Constraint4* dstC ) // output; m_batchIdx is not written here
+	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit); // body indices are stored without the sign
+	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);
+	float dtInv = 1.f/dt;
+	for(int ic=0; ic<4; ic++)
+	{
+		dstC->m_appliedRambdaDt[ic] = 0.f; // start with no accumulated normal impulse
+	}
+	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f; // friction coefficients stay zero unless the contact has points (set below)
+	dstC->m_linear = src->m_worldNormalOnB;
+	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );
+	for(int ic=0; ic<4; ic++) // normal constraint data for each of the four contact slots
+	{
+		float4 r0 = src->m_worldPosB[ic] - posA; // lever arms from each body position to the contact point
+		float4 r1 = src->m_worldPosB[ic] - posB;
+		if( ic >= src->m_worldNormalOnB.w )//npoints
+		{
+			dstC->m_jacCoeffInv[ic] = 0.f; // unused slots get a zero coefficient
+			continue;
+		}
+		float relVelN;
+		{
+			float4 linear, angular0, angular1;
+			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);
+			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
+				invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB);
+			relVelN = calcRelVel(linear, -linear, angular0, angular1, // relative velocity along the contact normal
+				linVelA, angVelA, linVelB, angVelB);
+			float e = 0.f;//src->getRestituitionCoeff();
+			if( relVelN*relVelN < 0.004f ) e = 0.f; // has no effect while e is fixed at 0 above
+			dstC->m_b[ic] = e*relVelN;
+			//float penetration = src->m_worldPosB[ic].w;
+			dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv; // bias from the point's w component plus positionDrift, scaled by positionConstraintCoeff/dt
+			dstC->m_appliedRambdaDt[ic] = 0.f;
+		}
+	}
+	if( src->m_worldNormalOnB.w > 0 )//npoints
+	{	//	prepare friction
+		float4 center = make_float4(0.f);
+		for(int i=0; i<src->m_worldNormalOnB.w; i++) 
+			center += src->m_worldPosB[i];
+		center /= (float)src->m_worldNormalOnB.w; // mean of the contact points
+		float4 tangent[2];
+		btPlaneSpace1(-src->m_worldNormalOnB,&tangent[0],&tangent[1]); // two tangent directions derived from the negated normal
+		float4 r[2];
+		r[0] = center - posA;
+		r[1] = center - posB;
+		for(int i=0; i<2; i++) // one friction coefficient per tangent direction
+		{
+			float4 linear, angular0, angular1;
+			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);
+			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
+				invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB);
+			dstC->m_fAppliedRambdaDt[i] = 0.f; // no accumulated friction impulse yet
+		}
+		dstC->m_center = center;
+	}
+	for(int i=0; i<4; i++) // copy the used contact points and zero the remaining slots
+	{
+		if( i<src->m_worldNormalOnB.w )
+		{
+			dstC->m_worldPos[i] = src->m_worldPosB[i];
+		}
+		else
+		{
+			dstC->m_worldPos[i] = make_float4(0.f);
+		}
+	}
+void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, // Builds gConstraintOut[gIdx] from gContact[gIdx], one contact per work-item.
+__global const unsigned int* bodyCount, // per-body count, read for bodies with non-zero inverse mass
+int nContacts,
+float dt,
+float positionDrift,
+float positionConstraintCoeff
+	int gIdx = GET_GLOBAL_IDX; // contact index handled by this work-item
+	if( gIdx < nContacts )
+	{
+		int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); // body indices with the sign dropped
+		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
+		float4 posA = gBodies[aIdx].m_pos;
+		float4 linVelA = gBodies[aIdx].m_linVel;
+		float4 angVelA = gBodies[aIdx].m_angVel;
+		float invMassA = gBodies[aIdx].m_invMass;
+		Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; // gShapes is indexed by body index here
+		float4 posB = gBodies[bIdx].m_pos;
+		float4 linVelB = gBodies[bIdx].m_linVel;
+		float4 angVelB = gBodies[bIdx].m_angVel;
+		float invMassB = gBodies[bIdx].m_invMass;
+		Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;
+		Constraint4 cs; // built in a local copy, then stored once at the end
+		float countA = invMassA != 0.f ? (float)bodyCount[aIdx] : 1; // a body with zero inverse mass uses a count of 1
+		float countB = invMassB != 0.f ? (float)bodyCount[bIdx] : 1;
+    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
+			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB,
+			&cs  );
+		cs.m_batchIdx = gContact[gIdx].m_batchIdx; // the batch index is carried over from the contact
+		gConstraintOut[gIdx] = cs;
+	}
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverUtils.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverUtils.h
new file mode 100644
index 00000000..c0173ad9
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/solverUtils.h
@@ -0,0 +1,909 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* solverUtilsCL= \
+"Copyright (c) 2013 Advanced Micro Devices, Inc.  \n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"//Originally written by Erwin Coumans\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#define counter32_t volatile global int*\n"
+"typedef unsigned int u32;\n"
+"typedef unsigned short u16;\n"
+"typedef unsigned char u8;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"//	Vector\n"
+"float fastDiv(float numerator, float denominator)\n"
+"	return native_divide(numerator, denominator);	\n"
+"//	return numerator/denominator;	\n"
+"float4 fastDiv4(float4 numerator, float4 denominator)\n"
+"	return native_divide(numerator, denominator);	\n"
+"float fastSqrtf(float f2)\n"
+"	return native_sqrt(f2);\n"
+"//	return sqrt(f2);\n"
+"float fastRSqrt(float f2)\n"
+"	return native_rsqrt(f2);\n"
+"float fastLength4(float4 v)\n"
+"	return fast_length(v);\n"
+"float4 fastNormalize4(float4 v)\n"
+"	return fast_normalize(v);\n"
+"float sqrtf(float a)\n"
+"//	return sqrt(a);\n"
+"	return native_sqrt(a);\n"
+"float4 cross3(float4 a1, float4 b1)\n"
+"	float4 	a=make_float4(a1.xyz,0.f);\n"
+"	float4 	b=make_float4(b1.xyz,0.f);\n"
+"	//float4 	a=a1;\n"
+"	//float4 	b=b1;\n"
+"	return cross(a,b);\n"
+"float dot3F4(float4 a, float4 b)\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"float length3(const float4 a)\n"
+"	return sqrtf(dot3F4(a,a));\n"
+"float dot4(const float4 a, const float4 b)\n"
+"	return dot( a, b );\n"
+"//	for height\n"
+"float dot3w1(const float4 point, const float4 eqn)\n"
+"	return dot3F4(point,eqn) + eqn.w;\n"
+"float4 normalize3(const float4 a)\n"
+"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"//	float length = sqrtf(dot3F4(a, a));\n"
+"//	return 1.f/length * a;\n"
+"float4 normalize4(const float4 a)\n"
+"	float length = sqrtf(dot4(a, a));\n"
+"	return 1.f/length * a;\n"
+"float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
+"	float4 eqn;\n"
+"	float4 ab = b-a;\n"
+"	float4 ac = c-a;\n"
+"	eqn = normalize3( cross3(ab, ac) );\n"
+"	eqn.w = -dot3F4(eqn,a);\n"
+"	return eqn;\n"
+"//	Matrix3x3\n"
+"typedef struct\n"
+"	float4 m_row[3];\n"
+"Matrix3x3 mtZero();\n"
+"Matrix3x3 mtIdentity();\n"
+"Matrix3x3 mtTranspose(Matrix3x3 m);\n"
+"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
+"float4 mtMul1(Matrix3x3 a, float4 b);\n"
+"float4 mtMul3(float4 a, Matrix3x3 b);\n"
+"Matrix3x3 mtZero()\n"
+"	Matrix3x3 m;\n"
+"	m.m_row[0] = (float4)(0.f);\n"
+"	m.m_row[1] = (float4)(0.f);\n"
+"	m.m_row[2] = (float4)(0.f);\n"
+"	return m;\n"
+"Matrix3x3 mtIdentity()\n"
+"	Matrix3x3 m;\n"
+"	m.m_row[0] = (float4)(1,0,0,0);\n"
+"	m.m_row[1] = (float4)(0,1,0,0);\n"
+"	m.m_row[2] = (float4)(0,0,1,0);\n"
+"	return m;\n"
+"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
+"	Matrix3x3 out;\n"
+"	out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
+"	Matrix3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	Matrix3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"float4 mtMul1(Matrix3x3 a, float4 b)\n"
+"	float4 ans;\n"
+"	ans.x = dot3F4( a.m_row[0], b );\n"
+"	ans.y = dot3F4( a.m_row[1], b );\n"
+"	ans.z = dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"float4 mtMul3(float4 a, Matrix3x3 b)\n"
+"	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	float4 ans;\n"
+"	ans.x = dot3F4( a, colx );\n"
+"	ans.y = dot3F4( a, coly );\n"
+"	ans.z = dot3F4( a, colz );\n"
+"	return ans;\n"
+"//	Quaternion\n"
+"typedef float4 Quaternion;\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"	return fastNormalize4(in);\n"
+"//	in /= length( in );\n"
+"//	return in;\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"#define WG_SIZE 64\n"
+"typedef struct\n"
+"	float4 m_pos;\n"
+"	Quaternion m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_shapeIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} Body;\n"
+"typedef struct\n"
+"	Matrix3x3 m_invInertia;\n"
+"	Matrix3x3 m_initInvInertia;\n"
+"} Shape;\n"
+"typedef struct\n"
+"	float4 m_linear;\n"
+"	float4 m_worldPos[4];\n"
+"	float4 m_center;	\n"
+"	float m_jacCoeffInv[4];\n"
+"	float m_b[4];\n"
+"	float m_appliedRambdaDt[4];\n"
+"	float m_fJacCoeffInv[2];	\n"
+"	float m_fAppliedRambdaDt[2];	\n"
+"	u32 m_bodyA;\n"
+"	u32 m_bodyB;\n"
+"	int m_batchIdx;\n"
+"	u32 m_paddings;\n"
+"} Constraint4;\n"
+"__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)\n"
+"	int i = GET_GLOBAL_IDX;\n"
+"	\n"
+"	if( i < numContactManifolds)\n"
+"	{\n"
+"		int pa = manifoldPtr[i].m_bodyAPtrAndSignBit;\n"
+"		bool isFixedA = (pa <0) || (pa == fixedBodyIndex);\n"
+"		int bodyIndexA = abs(pa);\n"
+"		if (!isFixedA)\n"
+"		{\n"
+"			 AtomInc1(bodyCount[bodyIndexA],contactConstraintOffsets[i].x);\n"
+"		}\n"
+"		barrier(CLK_GLOBAL_MEM_FENCE);\n"
+"		int pb = manifoldPtr[i].m_bodyBPtrAndSignBit;\n"
+"		bool isFixedB = (pb <0) || (pb == fixedBodyIndex);\n"
+"		int bodyIndexB = abs(pb);\n"
+"		if (!isFixedB)\n"
+"		{\n"
+"			AtomInc1(bodyCount[bodyIndexB],contactConstraintOffsets[i].y);\n"
+"		} \n"
+"	}\n"
+"__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies)\n"
+"	int i = GET_GLOBAL_IDX;\n"
+"	\n"
+"	if( i < numSplitBodies)\n"
+"	{\n"
+"		linearVelocities[i] = make_float4(0);\n"
+"		angularVelocities[i] = make_float4(0);\n"
+"	}\n"
+"__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n"
+"__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n"
+"	int i = GET_GLOBAL_IDX;\n"
+"	if (i<numBodies)\n"
+"	{\n"
+"		if (gBodies[i].m_invMass)\n"
+"		{\n"
+"			int bodyOffset = offsetSplitBodies[i];\n"
+"			int count = bodyCount[i];\n"
+"			float factor = 1.f/((float)count);\n"
+"			float4 averageLinVel = make_float4(0.f);\n"
+"			float4 averageAngVel = make_float4(0.f);\n"
+"			\n"
+"			for (int j=0;j<count;j++)\n"
+"			{\n"
+"				averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;\n"
+"				averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;\n"
+"			}\n"
+"			\n"
+"			for (int j=0;j<count;j++)\n"
+"			{\n"
+"				deltaLinearVelocities[bodyOffset+j] = averageLinVel;\n"
+"				deltaAngularVelocities[bodyOffset+j] = averageAngVel;\n"
+"			}\n"
+"			\n"
+"		}//bodies[i].m_invMass\n"
+"	}//i<numBodies\n"
+"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
+"	*linear = make_float4(n.xyz,0.f);\n"
+"	*angular0 = cross3(r0, n);\n"
+"	*angular1 = -cross3(r1, n);\n"
+"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
+"	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
+"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
+"					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB)\n"
+"	//	linear0,1 are normlized\n"
+"	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
+"	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
+"	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
+"	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
+"	return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB);\n"
+"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
+" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
+"  if (fabs(n.z) > 0.70710678f) {\n"
+"    // choose p in y-z plane\n"
+"    float a = n.y*n.y + n.z*n.z;\n"
+"    float k = 1.f/sqrt(a);\n"
+"    p[0].x = 0;\n"
+"	p[0].y = -n.z*k;\n"
+"	p[0].z = n.y*k;\n"
+"    // set q = n x p\n"
+"    q[0].x = a*k;\n"
+"	q[0].y = -n.x*p[0].z;\n"
+"	q[0].z = n.x*p[0].y;\n"
+"  }\n"
+"  else {\n"
+"    // choose p in x-y plane\n"
+"    float a = n.x*n.x + n.y*n.y;\n"
+"    float k = 1.f/sqrt(a);\n"
+"    p[0].x = -n.y*k;\n"
+"	p[0].y = n.x*k;\n"
+"	p[0].z = 0;\n"
+"    // set q = n x p\n"
+"    q[0].x = -n.z*p[0].y;\n"
+"	q[0].y = n.z*p[0].x;\n"
+"	q[0].z = a*k;\n"
+"  }\n"
+"void solveContact(__global Constraint4* cs,\n"
+"			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
+"			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB,\n"
+"			float4* dLinVelA, float4* dAngVelA, float4* dLinVelB, float4* dAngVelB)\n"
+"	float minRambdaDt = 0;\n"
+"	float maxRambdaDt = FLT_MAX;\n"
+"	for(int ic=0; ic<4; ic++)\n"
+"	{\n"
+"		if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
+"		float4 angular0, angular1, linear;\n"
+"		float4 r0 = cs->m_worldPos[ic] - posA;\n"
+"		float4 r1 = cs->m_worldPos[ic] - posB;\n"
+"		setLinearAndAngular( cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
+"	\n"
+"		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
+"			*linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic];\n"
+"		rambdaDt *= cs->m_jacCoeffInv[ic];\n"
+"		\n"
+"		{\n"
+"			float prevSum = cs->m_appliedRambdaDt[ic];\n"
+"			float updated = prevSum;\n"
+"			updated += rambdaDt;\n"
+"			updated = max2( updated, minRambdaDt );\n"
+"			updated = min2( updated, maxRambdaDt );\n"
+"			rambdaDt = updated - prevSum;\n"
+"			cs->m_appliedRambdaDt[ic] = updated;\n"
+"		}\n"
+"			\n"
+"		float4 linImp0 = invMassA*linear*rambdaDt;\n"
+"		float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
+"		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
+"		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
+"		\n"
+"		if (invMassA)\n"
+"		{\n"
+"			*dLinVelA += linImp0;\n"
+"			*dAngVelA += angImp0;\n"
+"		}\n"
+"		if (invMassB)\n"
+"		{\n"
+"			*dLinVelB += linImp1;\n"
+"			*dAngVelB += angImp1;\n"
+"		}\n"
+"	}\n"
+"//	solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
+"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, \n"
+"__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
+"__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n"
+"	//float frictionCoeff = ldsCs[0].m_linear.w;\n"
+"	int aIdx = ldsCs[0].m_bodyA;\n"
+"	int bIdx = ldsCs[0].m_bodyB;\n"
+"	float4 posA = gBodies[aIdx].m_pos;\n"
+"	float4 linVelA = gBodies[aIdx].m_linVel;\n"
+"	float4 angVelA = gBodies[aIdx].m_angVel;\n"
+"	float invMassA = gBodies[aIdx].m_invMass;\n"
+"	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
+"	float4 posB = gBodies[bIdx].m_pos;\n"
+"	float4 linVelB = gBodies[bIdx].m_linVel;\n"
+"	float4 angVelB = gBodies[bIdx].m_angVel;\n"
+"	float invMassB = gBodies[bIdx].m_invMass;\n"
+"	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
+"			\n"
+"	float4 dLinVelA = make_float4(0,0,0,0);\n"
+"	float4 dAngVelA = make_float4(0,0,0,0);\n"
+"	float4 dLinVelB = make_float4(0,0,0,0);\n"
+"	float4 dAngVelB = make_float4(0,0,0,0);\n"
+"			\n"
+"	int bodyOffsetA = offsetSplitBodies[aIdx];\n"
+"	int constraintOffsetA = contactConstraintOffsets[0].x;\n"
+"	int splitIndexA = bodyOffsetA+constraintOffsetA;\n"
+"	\n"
+"	if (invMassA)\n"
+"	{\n"
+"		dLinVelA = deltaLinearVelocities[splitIndexA];\n"
+"		dAngVelA = deltaAngularVelocities[splitIndexA];\n"
+"	}\n"
+"	int bodyOffsetB = offsetSplitBodies[bIdx];\n"
+"	int constraintOffsetB = contactConstraintOffsets[0].y;\n"
+"	int splitIndexB= bodyOffsetB+constraintOffsetB;\n"
+"	if (invMassB)\n"
+"	{\n"
+"		dLinVelB = deltaLinearVelocities[splitIndexB];\n"
+"		dAngVelB = deltaAngularVelocities[splitIndexB];\n"
+"	}\n"
+"	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
+"			posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB);\n"
+"	if (invMassA)\n"
+"	{\n"
+"		deltaLinearVelocities[splitIndexA] = dLinVelA;\n"
+"		deltaAngularVelocities[splitIndexA] = dAngVelA;\n"
+"	} \n"
+"	if (invMassB)\n"
+"	{\n"
+"		deltaLinearVelocities[splitIndexB] = dLinVelB;\n"
+"		deltaAngularVelocities[splitIndexB] = dAngVelB;\n"
+"	}\n"
+"__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n"
+"__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n"
+"float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n"
+"	int i = GET_GLOBAL_IDX;\n"
+"	if (i<numManifolds)\n"
+"	{\n"
+"		solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
+"	}\n"
+"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs,\n"
+"							__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
+"							__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n"
+"	float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w;\n"
+"	int aIdx = ldsCs[0].m_bodyA;\n"
+"	int bIdx = ldsCs[0].m_bodyB;\n"
+"	float4 posA = gBodies[aIdx].m_pos;\n"
+"	float4 linVelA = gBodies[aIdx].m_linVel;\n"
+"	float4 angVelA = gBodies[aIdx].m_angVel;\n"
+"	float invMassA = gBodies[aIdx].m_invMass;\n"
+"	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
+"	float4 posB = gBodies[bIdx].m_pos;\n"
+"	float4 linVelB = gBodies[bIdx].m_linVel;\n"
+"	float4 angVelB = gBodies[bIdx].m_angVel;\n"
+"	float invMassB = gBodies[bIdx].m_invMass;\n"
+"	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
+"	\n"
+"	float4 dLinVelA = make_float4(0,0,0,0);\n"
+"	float4 dAngVelA = make_float4(0,0,0,0);\n"
+"	float4 dLinVelB = make_float4(0,0,0,0);\n"
+"	float4 dAngVelB = make_float4(0,0,0,0);\n"
+"			\n"
+"	int bodyOffsetA = offsetSplitBodies[aIdx];\n"
+"	int constraintOffsetA = contactConstraintOffsets[0].x;\n"
+"	int splitIndexA = bodyOffsetA+constraintOffsetA;\n"
+"	\n"
+"	if (invMassA)\n"
+"	{\n"
+"		dLinVelA = deltaLinearVelocities[splitIndexA];\n"
+"		dAngVelA = deltaAngularVelocities[splitIndexA];\n"
+"	}\n"
+"	int bodyOffsetB = offsetSplitBodies[bIdx];\n"
+"	int constraintOffsetB = contactConstraintOffsets[0].y;\n"
+"	int splitIndexB= bodyOffsetB+constraintOffsetB;\n"
+"	if (invMassB)\n"
+"	{\n"
+"		dLinVelB = deltaLinearVelocities[splitIndexB];\n"
+"		dAngVelB = deltaAngularVelocities[splitIndexB];\n"
+"	}\n"
+"	{\n"
+"		float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
+"		float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
+"		float sum = 0;\n"
+"		for(int j=0; j<4; j++)\n"
+"		{\n"
+"			sum +=ldsCs[0].m_appliedRambdaDt[j];\n"
+"		}\n"
+"		frictionCoeff = 0.7f;\n"
+"		for(int j=0; j<4; j++)\n"
+"		{\n"
+"			maxRambdaDt[j] = frictionCoeff*sum;\n"
+"			minRambdaDt[j] = -maxRambdaDt[j];\n"
+"		}\n"
+"		\n"
+"//		solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
+"//			posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
+"		\n"
+"		\n"
+"		{\n"
+"			\n"
+"			__global Constraint4* cs = ldsCs;\n"
+"			\n"
+"			if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n"
+"			const float4 center = cs->m_center;\n"
+"			\n"
+"			float4 n = -cs->m_linear;\n"
+"			\n"
+"			float4 tangent[2];\n"
+"			btPlaneSpace1(n,&tangent[0],&tangent[1]);\n"
+"			float4 angular0, angular1, linear;\n"
+"			float4 r0 = center - posA;\n"
+"			float4 r1 = center - posB;\n"
+"			for(int i=0; i<2; i++)\n"
+"			{\n"
+"				setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n"
+"				float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n"
+"											linVelA+dLinVelA, angVelA+dAngVelA, linVelB+dLinVelB, angVelB+dAngVelB );\n"
+"				rambdaDt *= cs->m_fJacCoeffInv[i];\n"
+"				\n"
+"				{\n"
+"					float prevSum = cs->m_fAppliedRambdaDt[i];\n"
+"					float updated = prevSum;\n"
+"					updated += rambdaDt;\n"
+"					updated = max2( updated, minRambdaDt[i] );\n"
+"					updated = min2( updated, maxRambdaDt[i] );\n"
+"					rambdaDt = updated - prevSum;\n"
+"					cs->m_fAppliedRambdaDt[i] = updated;\n"
+"				}\n"
+"				\n"
+"				float4 linImp0 = invMassA*linear*rambdaDt;\n"
+"				float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
+"				float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
+"				float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
+"				\n"
+"				dLinVelA += linImp0;\n"
+"				dAngVelA += angImp0;\n"
+"				dLinVelB += linImp1;\n"
+"				dAngVelB += angImp1;\n"
+"			}\n"
+"			{	//	angular damping for point constraint\n"
+"				float4 ab = normalize3( posB - posA );\n"
+"				float4 ac = normalize3( center - posA );\n"
+"				if( dot3F4( ab, ac ) > 0.95f  || (invMassA == 0.f || invMassB == 0.f))\n"
+"				{\n"
+"					float angNA = dot3F4( n, angVelA );\n"
+"					float angNB = dot3F4( n, angVelB );\n"
+"					\n"
+"					dAngVelA -= (angNA*0.1f)*n;\n"
+"					dAngVelB -= (angNB*0.1f)*n;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		\n"
+"	}\n"
+"	if (invMassA)\n"
+"	{\n"
+"		deltaLinearVelocities[splitIndexA] = dLinVelA;\n"
+"		deltaAngularVelocities[splitIndexA] = dAngVelA;\n"
+"	} \n"
+"	if (invMassB)\n"
+"	{\n"
+"		deltaLinearVelocities[splitIndexB] = dLinVelB;\n"
+"		deltaAngularVelocities[splitIndexB] = dAngVelB;\n"
+"	}\n"
+" \n"
+"__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n" // Kernel entry: work-item i runs solveFrictionConstraint on gConstraints[i].
+"										__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
+"										__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n" // delta-velocity buffers are forwarded unchanged to solveFrictionConstraint
+"										float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n" // deltaTime, positionDrift, positionConstraintCoeff and fixedBodyIndex are not referenced in the visible body
+"	int i = GET_GLOBAL_IDX;\n" // GET_GLOBAL_IDX is a macro defined outside this view - presumably the work-item's global id
+"	if (i<numManifolds)\n" // work-items with an index at or past numManifolds do nothing
+"	{\n"
+"		solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
+"	}\n"
+"__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n" // Kernel entry: adds the stored delta velocities to body i's m_linVel / m_angVel.
+"									__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n"
+"	int i = GET_GLOBAL_IDX;\n"
+"	if (i<numBodies)\n"
+"	{\n"
+"		if (gBodies[i].m_invMass)\n" // bodies with m_invMass == 0 are never modified
+"		{\n"
+"			int bodyOffset = offsetSplitBodies[i];\n" // index of this body's entry in the delta arrays
+"			int count = bodyCount[i];\n"
+"			if (count)\n" // bodies with a zero count are skipped
+"			{\n"
+"				gBodies[i].m_linVel += deltaLinearVelocities[bodyOffset];\n" // only the entry at bodyOffset is read
+"				gBodies[i].m_angVel += deltaAngularVelocities[bodyOffset];\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n" // Fills *dstC from contact manifold *src: up to four normal rows plus two friction rows about the contact centroid.
+"	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n"
+"	__global const struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,\n" // src is only read here; const-qualified so a pointer into a const contact array can be passed without dropping the qualifier
+"	Constraint4* dstC )\n"
+"	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" // abs(): the field name suggests a sign bit is packed with the index - TODO confirm
+"	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n"
+"	float dtInv = 1.f/dt;\n"
+"	for(int ic=0; ic<4; ic++)\n"
+"	{\n"
+"		dstC->m_appliedRambdaDt[ic] = 0.f;\n"
+"	}\n"
+"	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n" // stays 0 when the manifold has no points (friction block below is skipped)
+"	dstC->m_linear = src->m_worldNormalOnB;\n"
+"	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" // .w is overwritten with a fixed 0.7f; the per-contact lookup is commented out
+"	for(int ic=0; ic<4; ic++)\n"
+"	{\n"
+"		float4 r0 = src->m_worldPosB[ic] - posA;\n"
+"		float4 r1 = src->m_worldPosB[ic] - posB;\n"
+"		if( ic >= src->m_worldNormalOnB.w )//npoints\n" // m_worldNormalOnB.w is used as the contact point count
+"		{\n"
+"			dstC->m_jacCoeffInv[ic] = 0.f;\n" // unused slots get a zero coefficient
+"			continue;\n"
+"		}\n"
+"		float relVelN;\n"
+"		{\n"
+"			float4 linear, angular0, angular1;\n"
+"			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n"
+"			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
+"				invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB);\n"
+"			relVelN = calcRelVel(linear, -linear, angular0, angular1,\n"
+"				linVelA, angVelA, linVelB, angVelB);\n"
+"			float e = 0.f;//src->getRestituitionCoeff();\n" // e is always 0 here, so the e*relVelN term below contributes nothing
+"			if( relVelN*relVelN < 0.004f ) e = 0.f;\n"
+"			dstC->m_b[ic] = e*relVelN;\n"
+"			//float penetration = src->m_worldPosB[ic].w;\n"
+"			dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" // bias: (m_worldPosB[ic].w + positionDrift) * positionConstraintCoeff / dt
+"			dstC->m_appliedRambdaDt[ic] = 0.f;\n"
+"		}\n"
+"	}\n"
+"	if( src->m_worldNormalOnB.w > 0 )//npoints\n"
+"	{	//	prepare friction\n"
+"		float4 center = make_float4(0.f);\n"
+"		for(int i=0; i<src->m_worldNormalOnB.w; i++) \n"
+"			center += src->m_worldPosB[i];\n"
+"		center /= (float)src->m_worldNormalOnB.w;\n" // average of the contact points (all four components, including .w)
+"		float4 tangent[2];\n"
+"		btPlaneSpace1(-src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n" // two tangent directions derived from the negated normal
+"		\n"
+"		float4 r[2];\n"
+"		r[0] = center - posA;\n"
+"		r[1] = center - posB;\n"
+"		for(int i=0; i<2; i++)\n"
+"		{\n"
+"			float4 linear, angular0, angular1;\n"
+"			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n"
+"			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
+"				invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB);\n"
+"			dstC->m_fAppliedRambdaDt[i] = 0.f;\n"
+"		}\n"
+"		dstC->m_center = center;\n"
+"	}\n"
+"	for(int i=0; i<4; i++)\n" // copy the used contact points and zero-fill the remaining slots
+"	{\n"
+"		if( i<src->m_worldNormalOnB.w )\n"
+"		{\n"
+"			dstC->m_worldPos[i] = src->m_worldPosB[i];\n"
+"		}\n"
+"		else\n"
+"		{\n"
+"			dstC->m_worldPos[i] = make_float4(0.f);\n"
+"		}\n"
+"	}\n"
+"void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, \n" // Work-item gIdx converts gContact[gIdx] into gConstraintOut[gIdx] via setConstraint4.
+"__global const unsigned int* bodyCount,\n"
+"int nContacts,\n"
+"float dt,\n"
+"float positionDrift,\n"
+"float positionConstraintCoeff\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	\n"
+"	if( gIdx < nContacts )\n"
+"	{\n"
+"		int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" // body indices are taken as the absolute value of the stored fields
+"		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n"
+"		float4 posA = gBodies[aIdx].m_pos;\n"
+"		float4 linVelA = gBodies[aIdx].m_linVel;\n"
+"		float4 angVelA = gBodies[aIdx].m_angVel;\n"
+"		float invMassA = gBodies[aIdx].m_invMass;\n"
+"		Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" // gShapes is indexed with the body index here
+"		float4 posB = gBodies[bIdx].m_pos;\n"
+"		float4 linVelB = gBodies[bIdx].m_linVel;\n"
+"		float4 angVelB = gBodies[bIdx].m_angVel;\n"
+"		float invMassB = gBodies[bIdx].m_invMass;\n"
+"		Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
+"		Constraint4 cs;\n"
+"		float countA = invMassA != 0.f ? (float)bodyCount[aIdx] : 1;\n" // bodies with m_invMass == 0 use a count of 1 instead of bodyCount
+"		float countB = invMassB != 0.f ? (float)bodyCount[bIdx] : 1;\n"
+"    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n"
+"			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB,\n"
+"			&cs  );\n" // built in a private Constraint4, then copied out below
+"		\n"
+"		cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n" // batch index is carried over from the contact
+"		gConstraintOut[gIdx] = cs;\n"
+"	}\n"
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl b/src/bullet/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl
new file mode 100644
index 00000000..ba8ba735
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl
@@ -0,0 +1,22 @@
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h"
+__kernel void initializeGpuAabbsFull(  const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB) // Kernel entry: work-item nodeID calls b3ComputeWorldAabb for body nodeID, writing into pAABB.
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes ) // work-items at or past numNodes do nothing
+	{
+		b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);
+	}
+__kernel void clearOverlappingPairsKernel(  __global int4* pairs, int numPairs) // Kernel entry: work-item pairId resets the z component of pairs[pairId].
+	int pairId = get_global_id(0);
+	if( pairId< numPairs ) // work-items at or past numPairs do nothing
+	{
+		pairs[pairId].z = 0xffffffff; // all bits set; x, y and w are left untouched
+	}
\ No newline at end of file
diff --git a/src/bullet/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h b/src/bullet/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
new file mode 100644
index 00000000..d70e7401
--- /dev/null
+++ b/src/bullet/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
@@ -0,0 +1,483 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* updateAabbsKernelCL= \
+"#ifndef B3_UPDATE_AABBS_H\n"
+"#define B3_UPDATE_AABBS_H\n"
+"#ifndef B3_AABB_H\n"
+"#define B3_AABB_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"struct MyTest\n"
+"	int bla;\n"
+"#ifdef __cplusplus\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" // Returns the index of the vecArray entry whose b3Dot3F4 with vec is largest; writes that dot product to *dotOut.
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n" // -1 means no entry has been selected yet
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n" // strict '>' keeps the first of several equal maxima
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n" // a '#define b3Assert(a)' with an empty body appears earlier in this string
+"    if (ptIndex<0)\n" // nothing selected (e.g. vecLen <= 0): return index 0, with *dotOut left at -B3_INFINITY
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" // Quaternion product of a and b, with xyz as the vector part and w as the scalar part.
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n" // vector part starts as a.xyz x b.xyz
+"	ans += a.w*b+b.w*a;\n" // adds a.w*b + b.w*a; the w produced here is overwritten two lines below
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n" // scalar part: a.w*b.w minus the 3-component dot product
+"	return ans;\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" // Returns in scaled to unit length, or (0,0,0,1) when its length is not greater than zero.
+"	b3Quat q;\n"
+"	q=in;\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n" // length over all four components
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n" // fallback: xyz = 0, w = 1
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" // Computes q * (vec.xyz, 0) * b3QuatInvert(q).
+"	b3Quat qInv = b3QuatInvert( q );\n" // b3QuatInvert negates xyz and keeps w; assumes q is unit length for this to act as a rotation - TODO confirm at call sites
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n" // the input's w is ignored
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"typedef struct\n"
+"	b3Float4 m_row[3];\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" // Builds the three rows of a 3x3 matrix from quat's components; quat is not normalised here.
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" // squared x, y, z reused by the diagonal terms
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n" // the w of every row is set to 0
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"b3Mat3x3 mtZero();\n"
+"b3Mat3x3 mtIdentity();\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"b3Mat3x3 mtZero()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"b3Mat3x3 mtIdentity()\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" // 3x3 matrix product a*b: entry (i,k) is the 3-component dot of row i of a with column k of b.
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n" // rows of transB are the columns of b
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n" // a is a by-value copy, so the caller's matrix is not modified
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" // Row-vector times matrix: component k of the result is b3Dot3F4(a, column k of b); w is 0.
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" // gather the columns of b
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	ans.w = 0.f;\n" // give w a defined value so the returned vector has no unset component (mtMul1 does the same)
+"	return ans;\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3Aabb b3Aabb_t;\n"
+"struct b3Aabb\n"
+"	union\n"
+"	{\n"
+"		float m_min[4];\n"
+"		b3Float4 m_minVec;\n"
+"		int m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float	m_max[4];\n"
+"		b3Float4 m_maxVec;\n"
+"		int m_signedMaxIndices[4];\n"
+"	};\n"
+"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" // Computes an axis-aligned box enclosing the local box after rotation by orn and translation by pos; results go to *aabbMinOut / *aabbMaxOut.
+"						b3Float4ConstArg pos,\n"
+"						b3QuatConstArg orn,\n"
+"						b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n"
+"		b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n"
+"		localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" // margin widens x, y and z; w is left alone
+"		b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n"
+"		b3Mat3x3 m;\n"
+"		m = b3QuatGetRotationMatrix(orn);\n"
+"		b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" // component-wise absolute value of the rotation rows
+"		b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n"
+"		\n"
+"		b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" // half-extent along each output axis = dot(half extents, |row|)
+"										 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n"
+"										 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n"
+"										 0.f);\n"
+"		*aabbMinOut = center-extent;\n"
+"		*aabbMaxOut = center+extent;\n"
+"/// conservative test for overlap between two aabbs\n"
+"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" // Returns false as soon as the boxes are separated on x, z or y; w is never inspected.
+"								b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n"
+"	bool overlap = true;\n"
+"	overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" // strict comparisons: boxes that only touch still count as overlapping
+"	overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n"
+"	overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n"
+"	return overlap;\n"
+"#endif //B3_AABB_H\n"
+"#ifndef B3_COLLIDABLE_H\n"
+"#define B3_COLLIDABLE_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"enum b3ShapeTypes\n"
+"	SHAPE_PLANE=4,\n"
+"typedef struct b3Collidable b3Collidable_t;\n"
+"struct b3Collidable\n"
+"	union {\n"
+"		int m_numChildShapes;\n"
+"		int m_bvhIndex;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float m_radius;\n"
+"		int	m_compoundBvhIndex;\n"
+"	};\n"
+"	int m_shapeType;\n"
+"	union\n"
+"	{\n"
+"		int m_shapeIndex;\n"
+"		float m_height;\n"
+"	};\n"
+"typedef struct b3GpuChildShape b3GpuChildShape_t;\n"
+"struct b3GpuChildShape\n"
+"	b3Float4	m_childPosition;\n"
+"	b3Quat		m_childOrientation;\n"
+"	union\n"
+"	{\n"
+"		int			m_shapeIndex;//used for SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
+"		int			m_capsuleAxis;\n"
+"	};\n"
+"	union \n"
+"	{\n"
+"		float		m_radius;//used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES\n"
+"		int			m_numChildShapes;//used for compound shape\n"
+"	};\n"
+"	union \n"
+"	{\n"
+"		float		m_height;//used for childshape of SHAPE_COMPOUND_OF_CAPSULES\n"
+"		int	m_collidableShapeIndex;\n"
+"	};\n"
+"	int			m_shapeType;\n"
+"struct b3CompoundOverlappingPair\n"
+"	int m_bodyIndexA;\n"
+"	int m_bodyIndexB;\n"
+"//	int	m_pairType;\n"
+"	int m_childShapeIndexA;\n"
+"	int m_childShapeIndexB;\n"
+"#endif //B3_COLLIDABLE_H\n"
+"#ifndef B3_RIGIDBODY_DATA_H\n"
+"#define B3_RIGIDBODY_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#ifdef __cplusplus\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
+"struct b3RigidBodyData\n"
+"	b3Float4				m_pos;\n"
+"	b3Quat					m_quat;\n"
+"	b3Float4				m_linVel;\n"
+"	b3Float4				m_angVel;\n"
+"	int 					m_collidableIdx;\n"
+"	float 				m_invMass;\n"
+"	float 				m_restituitionCoeff;\n"
+"	float 				m_frictionCoeff;\n"
+"typedef struct b3InertiaData b3InertiaData_t;\n"
+"struct b3InertiaData\n"
+"	b3Mat3x3 m_invInertiaWorld;\n"
+"	b3Mat3x3 m_initInvInertia;\n"
+"#endif //B3_RIGIDBODY_DATA_H\n"
+"	\n"
+"void b3ComputeWorldAabb(  int bodyId, __global const b3RigidBodyData_t* bodies, __global const  b3Collidable_t* collidables, __global const  b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)\n" // Writes worldAabbs[bodyId]: the body's local shape AABB transformed by its pose, tagged with the body id and a zero/non-zero-mass flag.
+"	__global const b3RigidBodyData_t* body = &bodies[bodyId];\n" // body already points at element bodyId
+"	b3Float4 position = body->m_pos;\n"
+"	b3Quat	orientation = body->m_quat;\n"
+"	\n"
+"	int collidableIndex = body->m_collidableIdx;\n"
+"	int shapeIndex = collidables[collidableIndex].m_shapeIndex;\n"
+"		\n"
+"	if (shapeIndex>=0)\n" // bodies whose collidable has a negative shape index leave worldAabbs[bodyId] untouched
+"	{\n"
+"				\n"
+"		b3Aabb_t localAabb = localShapeAABB[collidableIndex];\n" // local AABBs are indexed by collidable, not by shape index
+"		b3Aabb_t worldAabb;\n"
+"		\n"
+"		b3Float4 aabbAMinOut,aabbAMaxOut;	\n"
+"		float margin = 0.f;\n"
+"		b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&aabbAMinOut,&aabbAMaxOut);\n"
+"		\n"
+"		worldAabb.m_minVec =aabbAMinOut;\n"
+"		worldAabb.m_minIndices[3] = bodyId;\n" // overlays the w slot of the min vector (same union) with the body id
+"		worldAabb.m_maxVec = aabbAMaxOut;\n"
+"		worldAabb.m_signedMaxIndices[3] = body->m_invMass==0.f? 0 : 1;\n" // read this body's own mass: body[bodyId] would index bodies[2*bodyId]; overlays the w slot of the max vector with 0 (zero inverse mass) or 1
+"		worldAabbs[bodyId] = worldAabb;\n"
+"	}\n"
+"#endif //B3_UPDATE_AABBS_H\n"
+"__kernel void initializeGpuAabbsFull(  const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB)\n" // Kernel entry: work-item nodeID calls b3ComputeWorldAabb for body nodeID, writing into pAABB.
+"	int nodeID = get_global_id(0);\n"
+"	if( nodeID < numNodes )\n" // work-items at or past numNodes do nothing
+"	{\n"
+"		b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);\n"
+"	}\n"
+"__kernel void clearOverlappingPairsKernel(  __global int4* pairs, int numPairs)\n" // Kernel entry: work-item pairId resets the z component of pairs[pairId].
+"	int pairId = get_global_id(0);\n"
+"	if( pairId< numPairs )\n" // work-items at or past numPairs do nothing
+"	{\n"
+"		pairs[pairId].z = 0xffffffff;\n" // all bits set; x, y and w are left untouched
+"	}\n"
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/autogenerated/bullet2.h b/src/bullet/Bullet3Serialize/Bullet2FileLoader/autogenerated/bullet2.h
new file mode 100644
index 00000000..a6b57b1a
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/autogenerated/bullet2.h
@@ -0,0 +1,1053 @@
+/* Copyright (C) 2011 Erwin Coumans & Charlie C
+* This software is provided 'as-is', without any express or implied
+* warranty.  In no event will the authors be held liable for any damages
+* arising from the use of this software.
+* Permission is granted to anyone to use this software for any purpose,
+* including commercial applications, and to alter it and redistribute it
+* freely, subject to the following restrictions:
+* 1. The origin of this software must not be misrepresented; you must not
+*    claim that you wrote the original software. If you use this software
+*    in a product, an acknowledgment in the product documentation would be
+*    appreciated but is not required.
+* 2. Altered source versions must be plainly marked as such, and must not be
+*    misrepresented as being the original software.
+* 3. This notice may not be removed or altered from any source distribution.
+// Auto generated from Bullet/Extras/HeaderGenerator/bulletGenerate.py
+#ifndef __BULLET2_H__
+#define __BULLET2_H__
+namespace Bullet3SerializeBullet2 {
+// put an empty struct in the case
+typedef struct bInvalidHandle {
+	int unused;
+    class PointerArray;
+    class b3PhysicsSystem;
+    class ListBase;
+    class b3Vector3FloatData;
+    class b3Vector3DoubleData;
+    class b3Matrix3x3FloatData;
+    class b3Matrix3x3DoubleData;
+    class b3TransformFloatData;
+    class b3TransformDoubleData;
+    class b3BvhSubtreeInfoData;
+    class b3OptimizedBvhNodeFloatData;
+    class b3OptimizedBvhNodeDoubleData;
+    class b3QuantizedBvhNodeData;
+    class b3QuantizedBvhFloatData;
+    class b3QuantizedBvhDoubleData;
+    class b3CollisionShapeData;
+    class b3StaticPlaneShapeData;
+    class b3ConvexInternalShapeData;
+    class b3PositionAndRadius;
+    class b3MultiSphereShapeData;
+    class b3IntIndexData;
+    class b3ShortIntIndexData;
+    class b3ShortIntIndexTripletData;
+    class b3CharIndexTripletData;
+    class b3MeshPartData;
+    class b3StridingMeshInterfaceData;
+    class b3TriangleMeshShapeData;
+    class b3ScaledTriangleMeshShapeData;
+    class b3CompoundShapeChildData;
+    class b3CompoundShapeData;
+    class b3CylinderShapeData;
+    class b3CapsuleShapeData;
+    class b3TriangleInfoData;
+    class b3TriangleInfoMapData;
+    class b3GImpactMeshShapeData;
+    class b3ConvexHullShapeData;
+    class b3CollisionObjectDoubleData;
+    class b3CollisionObjectFloatData;
+    class b3DynamicsWorldDoubleData;
+    class b3DynamicsWorldFloatData;
+    class b3RigidBodyFloatData;
+    class b3RigidBodyDoubleData;
+    class b3ConstraintInfo1;
+    class b3TypedConstraintData;
+    class b3Point2PointConstraintFloatData;
+    class b3Point2PointConstraintDoubleData;
+    class b3HingeConstraintDoubleData;
+    class b3HingeConstraintFloatData;
+    class b3ConeTwistConstraintData;
+    class b3Generic6DofConstraintData;
+    class b3Generic6DofSpringConstraintData;
+    class b3SliderConstraintData;
+    class b3ContactSolverInfoDoubleData;
+    class b3ContactSolverInfoFloatData;
+    class SoftBodyMaterialData;
+    class SoftBodyNodeData;
+    class SoftBodyLinkData;
+    class SoftBodyFaceData;
+    class SoftBodyTetraData;
+    class SoftRigidAnchorData;
+    class SoftBodyConfigData;
+    class SoftBodyPoseData;
+    class SoftBodyClusterData;
+    class b3SoftBodyJointData;
+    class b3SoftBodyFloatData;
+// -------------------------------------------------- //
+    class PointerArray
+    {
+    public:
+        int m_size;
+        int m_capacity;
+        void *m_data;
+    };
+// -------------------------------------------------- //
+    class b3PhysicsSystem
+    {
+    public:
+        PointerArray m_collisionShapes;
+        PointerArray m_collisionObjects;
+        PointerArray m_constraints;
+    };
+// -------------------------------------------------- //
+    class ListBase
+    {
+    public:
+        void *first;
+        void *last;
+    };
+// -------------------------------------------------- //
+    class b3Vector3FloatData
+    {
+    public:
+        float m_floats[4];
+    };
+// -------------------------------------------------- //
+    class b3Vector3DoubleData
+    {
+    public:
+        double m_floats[4];
+    };
+// -------------------------------------------------- //
+    class b3Matrix3x3FloatData
+    {
+    public:
+        b3Vector3FloatData m_el[3];
+    };
+// -------------------------------------------------- //
+    class b3Matrix3x3DoubleData
+    {
+    public:
+        b3Vector3DoubleData m_el[3];
+    };
+// -------------------------------------------------- //
+    class b3TransformFloatData
+    {
+    public:
+        b3Matrix3x3FloatData m_basis;
+        b3Vector3FloatData m_origin;
+    };
+// -------------------------------------------------- //
+    class b3TransformDoubleData
+    {
+    public:
+        b3Matrix3x3DoubleData m_basis;
+        b3Vector3DoubleData m_origin;
+    };
+// -------------------------------------------------- //
+    class b3BvhSubtreeInfoData
+    {
+    public:
+        int m_rootNodeIndex;
+        int m_subtreeSize;
+        short m_quantizedAabbMin[3];
+        short m_quantizedAabbMax[3];
+    };
+// -------------------------------------------------- //
+    class b3OptimizedBvhNodeFloatData
+    {
+    public:
+        b3Vector3FloatData m_aabbMinOrg;
+        b3Vector3FloatData m_aabbMaxOrg;
+        int m_escapeIndex;
+        int m_subPart;
+        int m_triangleIndex;
+        char m_pad[4];
+    };
+// -------------------------------------------------- //
+    class b3OptimizedBvhNodeDoubleData
+    {
+    public:
+        b3Vector3DoubleData m_aabbMinOrg;
+        b3Vector3DoubleData m_aabbMaxOrg;
+        int m_escapeIndex;
+        int m_subPart;
+        int m_triangleIndex;
+        char m_pad[4];
+    };
+// -------------------------------------------------- //
+    class b3QuantizedBvhNodeData
+    {
+    public:
+        short m_quantizedAabbMin[3];
+        short m_quantizedAabbMax[3];
+        int m_escapeIndexOrTriangleIndex;
+    };
+// -------------------------------------------------- //
+    class b3QuantizedBvhFloatData // Single-precision BVH record: float bounds/quantization vectors, counters, and pointers to the node and subtree arrays.
+    {
+    public:
+        b3Vector3FloatData m_bvhAabbMin;
+        b3Vector3FloatData m_bvhAabbMax;
+        b3Vector3FloatData m_bvhQuantization;
+        int m_curNodeIndex;
+        int m_useQuantization;
+        int m_numContiguousLeafNodes;
+        int m_numQuantizedContiguousNodes;
+        b3OptimizedBvhNodeFloatData *m_contiguousNodesPtr;
+        b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr;
+        b3BvhSubtreeInfoData *m_subTreeInfoPtr; // placed before m_traversalMode here; b3QuantizedBvhDoubleData puts it last
+        int m_traversalMode;
+        int m_numSubtreeHeaders;
+    };
+// -------------------------------------------------- //
+    class b3QuantizedBvhDoubleData // Double-precision BVH record: double bounds/quantization vectors, counters, and pointers to the node and subtree arrays.
+    {
+    public:
+        b3Vector3DoubleData m_bvhAabbMin;
+        b3Vector3DoubleData m_bvhAabbMax;
+        b3Vector3DoubleData m_bvhQuantization;
+        int m_curNodeIndex;
+        int m_useQuantization;
+        int m_numContiguousLeafNodes;
+        int m_numQuantizedContiguousNodes;
+        b3OptimizedBvhNodeDoubleData *m_contiguousNodesPtr;
+        b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr; // same quantized node type as in b3QuantizedBvhFloatData
+        int m_traversalMode;
+        int m_numSubtreeHeaders;
+        b3BvhSubtreeInfoData *m_subTreeInfoPtr; // last member here; b3QuantizedBvhFloatData places it before m_traversalMode
+    };
+// -------------------------------------------------- //
+    class b3CollisionShapeData
+    {
+    public:
+        char *m_name;
+        int m_shapeType;
+        char m_padding[4];
+    };
+// -------------------------------------------------- //
+    class b3StaticPlaneShapeData
+    {
+    public:
+        b3CollisionShapeData m_collisionShapeData;
+        b3Vector3FloatData m_localScaling;
+        b3Vector3FloatData m_planeNormal;
+        float m_planeConstant;
+        char m_pad[4];
+    };
+// -------------------------------------------------- //
+    class b3ConvexInternalShapeData
+    {
+    public:
+        b3CollisionShapeData m_collisionShapeData;
+        b3Vector3FloatData m_localScaling;
+        b3Vector3FloatData m_implicitShapeDimensions;
+        float m_collisionMargin;
+        int m_padding;
+    };
+// -------------------------------------------------- //
+    class b3PositionAndRadius
+    {
+    public:
+        b3Vector3FloatData m_pos;
+        float m_radius;
+    };
+// -------------------------------------------------- //
+    class b3MultiSphereShapeData
+    {
+    public:
+        b3ConvexInternalShapeData m_convexInternalShapeData;
+        b3PositionAndRadius *m_localPositionArrayPtr;
+        int m_localPositionArraySize;
+        char m_padding[4];
+    };
+// -------------------------------------------------- //
+    class b3IntIndexData
+    {
+    public:
+        int m_value;
+    };
+// -------------------------------------------------- //
+    class b3ShortIntIndexData
+    {
+    public:
+        short m_value;
+        char m_pad[2];
+    };
+// -------------------------------------------------- //
+    class b3ShortIntIndexTripletData
+    {
+    public:
+        short m_values[3];
+        char m_pad[2];
+    };
+// -------------------------------------------------- //
+    class b3CharIndexTripletData
+    {
+    public:
+        char m_values[3];
+        char m_pad;
+    };
+// -------------------------------------------------- //
+    class b3MeshPartData
+    {
+    public:
+        b3Vector3FloatData *m_vertices3f;
+        b3Vector3DoubleData *m_vertices3d;
+        b3IntIndexData *m_indices32;
+        b3ShortIntIndexTripletData *m_3indices16;
+        b3CharIndexTripletData *m_3indices8;
+        b3ShortIntIndexData *m_indices16;
+        int m_numTriangles;
+        int m_numVertices;
+    };
+// -------------------------------------------------- //
+    class b3StridingMeshInterfaceData
+    {
+    public:
+        b3MeshPartData *m_meshPartsPtr;
+        b3Vector3FloatData m_scaling;
+        int m_numMeshParts;
+        char m_padding[4];
+    };
+// -------------------------------------------------- //
+    class b3TriangleMeshShapeData
+    {
+    public:
+        b3CollisionShapeData m_collisionShapeData;
+        b3StridingMeshInterfaceData m_meshInterface;
+        b3QuantizedBvhFloatData *m_quantizedFloatBvh;
+        b3QuantizedBvhDoubleData *m_quantizedDoubleBvh;
+        b3TriangleInfoMapData *m_triangleInfoMap;
+        float m_collisionMargin;
+        char m_pad3[4];
+    };
+// -------------------------------------------------- //
+    class b3ScaledTriangleMeshShapeData
+    {
+    public:
+        b3TriangleMeshShapeData m_trimeshShapeData;
+        b3Vector3FloatData m_localScaling;
+    };
+// -------------------------------------------------- //
+    class b3CompoundShapeChildData // One child entry: a float transform, a pointer to the child's shape record, a type code and a margin.
+    {
+    public:
+        b3TransformFloatData m_transform;
+        b3CollisionShapeData *m_childShape;
+        int m_childShapeType;
+        float m_childMargin;
+    };
+// -------------------------------------------------- //
+    class b3CompoundShapeData // Collision-shape record plus a pointer to b3CompoundShapeChildData entries and their count.
+    {
+    public:
+        b3CollisionShapeData m_collisionShapeData;
+        b3CompoundShapeChildData *m_childShapePtr;
+        int m_numChildShapes; // presumably the number of entries behind m_childShapePtr - confirm
+        float m_collisionMargin;
+    };
+// -------------------------------------------------- //
+    class b3CylinderShapeData // Convex-internal-shape record plus an integer up-axis field; same member list as b3CapsuleShapeData.
+    {
+    public:
+        b3ConvexInternalShapeData m_convexInternalShapeData;
+        int m_upAxis;
+        char m_padding[4]; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class b3CapsuleShapeData // Convex-internal-shape record plus an integer up-axis field; same member list as b3CylinderShapeData.
+    {
+    public:
+        b3ConvexInternalShapeData m_convexInternalShapeData;
+        int m_upAxis;
+        char m_padding[4]; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class b3TriangleInfoData // Per-triangle record: an int flags word and three float edge-angle fields.
+    {
+    public:
+        int m_flags;
+        float m_edgeV0V1Angle;
+        float m_edgeV1V2Angle;
+        float m_edgeV2V0Angle;
+    };
+// -------------------------------------------------- //
+    class b3TriangleInfoMapData // Four array pointers (hash table, next, values, keys), five float thresholds and four int sizes.
+    {
+    public:
+        int *m_hashTablePtr;
+        int *m_nextPtr;
+        b3TriangleInfoData *m_valueArrayPtr; // values are b3TriangleInfoData records
+        int *m_keyArrayPtr;
+        float m_convexEpsilon;
+        float m_planarEpsilon;
+        float m_equalVertexThreshold;
+        float m_edgeDistanceThreshold;
+        float m_zeroAreaThreshold;
+        int m_nextSize; // presumably the length of m_nextPtr - confirm
+        int m_hashTableSize; // presumably the length of m_hashTablePtr - confirm
+        int m_numValues; // presumably the length of m_valueArrayPtr - confirm
+        int m_numKeys; // presumably the length of m_keyArrayPtr - confirm
+        char m_padding[4]; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class b3GImpactMeshShapeData // Collision-shape and mesh-interface records plus scaling, margin and an integer sub-type.
+    {
+    public:
+        b3CollisionShapeData m_collisionShapeData;
+        b3StridingMeshInterfaceData m_meshInterface;
+        b3Vector3FloatData m_localScaling;
+        float m_collisionMargin;
+        int m_gimpactSubType;
+    };
+// -------------------------------------------------- //
+    class b3ConvexHullShapeData // Convex-internal-shape record plus float and double point-array pointers and a point count.
+    {
+    public:
+        b3ConvexInternalShapeData m_convexInternalShapeData;
+        b3Vector3FloatData *m_unscaledPointsFloatPtr;
+        b3Vector3DoubleData *m_unscaledPointsDoublePtr;
+        int m_numUnscaledPoints; // presumably the element count of whichever point array is used - confirm
+        char m_padding3[4]; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class b3CollisionObjectDoubleData // Collision-object record using double-based members; b3CollisionObjectFloatData has the same member names with float types.
+    {
+    public:
+        void *m_broadphaseHandle; // untyped pointer
+        void *m_collisionShape; // untyped pointer
+        b3CollisionShapeData *m_rootCollisionShape;
+        char *m_name;
+        b3TransformDoubleData m_worldTransform;
+        b3TransformDoubleData m_interpolationWorldTransform;
+        b3Vector3DoubleData m_interpolationLinearVelocity;
+        b3Vector3DoubleData m_interpolationAngularVelocity;
+        b3Vector3DoubleData m_anisotropicFriction;
+        double m_contactProcessingThreshold;
+        double m_deactivationTime;
+        double m_friction;
+        double m_rollingFriction;
+        double m_restitution;
+        double m_hitFraction;
+        double m_ccdSweptSphereRadius;
+        double m_ccdMotionThreshold;
+        int m_hasAnisotropicFriction;
+        int m_collisionFlags;
+        int m_islandTag1;
+        int m_companionId;
+        int m_activationState1;
+        int m_internalType;
+        int m_checkCollideWith;
+        char m_padding[4]; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class b3CollisionObjectFloatData // Collision-object record using float-based members; b3CollisionObjectDoubleData has the same member names with double types.
+    {
+    public:
+        void *m_broadphaseHandle; // untyped pointer
+        void *m_collisionShape; // untyped pointer
+        b3CollisionShapeData *m_rootCollisionShape;
+        char *m_name;
+        b3TransformFloatData m_worldTransform;
+        b3TransformFloatData m_interpolationWorldTransform;
+        b3Vector3FloatData m_interpolationLinearVelocity;
+        b3Vector3FloatData m_interpolationAngularVelocity;
+        b3Vector3FloatData m_anisotropicFriction;
+        float m_contactProcessingThreshold;
+        float m_deactivationTime;
+        float m_friction;
+        float m_rollingFriction;
+        float m_restitution;
+        float m_hitFraction;
+        float m_ccdSweptSphereRadius;
+        float m_ccdMotionThreshold;
+        int m_hasAnisotropicFriction;
+        int m_collisionFlags;
+        int m_islandTag1;
+        int m_companionId;
+        int m_activationState1;
+        int m_internalType;
+        int m_checkCollideWith;
+        char m_padding[4]; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class b3RigidBodyFloatData // Rigid-body record: an embedded b3CollisionObjectFloatData followed by float-based dynamics members.
+    {
+    public:
+        b3CollisionObjectFloatData m_collisionObjectData; // first member, so the record starts with the collision-object fields
+        b3Matrix3x3FloatData m_invInertiaTensorWorld;
+        b3Vector3FloatData m_linearVelocity;
+        b3Vector3FloatData m_angularVelocity;
+        b3Vector3FloatData m_angularFactor;
+        b3Vector3FloatData m_linearFactor;
+        b3Vector3FloatData m_gravity;
+        b3Vector3FloatData m_gravity_acceleration;
+        b3Vector3FloatData m_invInertiaLocal;
+        b3Vector3FloatData m_totalForce;
+        b3Vector3FloatData m_totalTorque;
+        float m_inverseMass;
+        float m_linearDamping;
+        float m_angularDamping;
+        float m_additionalDampingFactor;
+        float m_additionalLinearDampingThresholdSqr;
+        float m_additionalAngularDampingThresholdSqr;
+        float m_additionalAngularDampingFactor;
+        float m_linearSleepingThreshold;
+        float m_angularSleepingThreshold;
+        int m_additionalDamping; // last member; unlike b3RigidBodyDoubleData there is no trailing m_padding field here
+    };
+// -------------------------------------------------- //
+    class b3RigidBodyDoubleData // Rigid-body record: an embedded b3CollisionObjectDoubleData followed by double-based dynamics members.
+    {
+    public:
+        b3CollisionObjectDoubleData m_collisionObjectData; // first member, so the record starts with the collision-object fields
+        b3Matrix3x3DoubleData m_invInertiaTensorWorld;
+        b3Vector3DoubleData m_linearVelocity;
+        b3Vector3DoubleData m_angularVelocity;
+        b3Vector3DoubleData m_angularFactor;
+        b3Vector3DoubleData m_linearFactor;
+        b3Vector3DoubleData m_gravity;
+        b3Vector3DoubleData m_gravity_acceleration;
+        b3Vector3DoubleData m_invInertiaLocal;
+        b3Vector3DoubleData m_totalForce;
+        b3Vector3DoubleData m_totalTorque;
+        double m_inverseMass;
+        double m_linearDamping;
+        double m_angularDamping;
+        double m_additionalDampingFactor;
+        double m_additionalLinearDampingThresholdSqr;
+        double m_additionalAngularDampingThresholdSqr;
+        double m_additionalAngularDampingFactor;
+        double m_linearSleepingThreshold;
+        double m_angularSleepingThreshold;
+        int m_additionalDamping;
+        char m_padding[4]; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class b3ConstraintInfo1 // Two-int record: a constraint-row count and a second value named nub.
+    {
+    public:
+        int m_numConstraintRows;
+        int nub; // NOTE(review): meaning not visible here - confirm against the solver code
+    };
+// -------------------------------------------------- //
+    class b3TypedConstraintData // Common constraint header embedded as the first member of the concrete constraint records below.
+    {
+    public:
+        bInvalidHandle *m_rbA; // handle pointers for the two linked bodies (typed as bInvalidHandle)
+        bInvalidHandle *m_rbB;
+        char *m_name;
+        int m_objectType;
+        int m_userConstraintType;
+        int m_userConstraintId;
+        int m_needsFeedback;
+        float m_appliedImpulse;
+        float m_dbgDrawSize;
+        int m_disableCollisionsBetweenLinkedBodies;
+        int m_overrideNumSolverIterations;
+        float m_breakingImpulseThreshold;
+        int m_isEnabled;
+    };
+// -------------------------------------------------- //
+    class b3Point2PointConstraintFloatData // Constraint header plus two float pivot vectors (one named for A, one for B).
+    {
+    public:
+        b3TypedConstraintData m_typeConstraintData;
+        b3Vector3FloatData m_pivotInA;
+        b3Vector3FloatData m_pivotInB;
+    };
+// -------------------------------------------------- //
+    class b3Point2PointConstraintDoubleData // Constraint header plus two double pivot vectors (one named for A, one for B).
+    {
+    public:
+        b3TypedConstraintData m_typeConstraintData;
+        b3Vector3DoubleData m_pivotInA;
+        b3Vector3DoubleData m_pivotInB;
+    };
+// -------------------------------------------------- //
+    class b3HingeConstraintDoubleData // Hinge record with double-based frames; note the scalar parameters below are still float.
+    {
+    public:
+        b3TypedConstraintData m_typeConstraintData;
+        b3TransformDoubleData m_rbAFrame;
+        b3TransformDoubleData m_rbBFrame;
+        int m_useReferenceFrameA;
+        int m_angularOnly;
+        int m_enableAngularMotor;
+        float m_motorTargetVelocity; // float here as well as in b3HingeConstraintFloatData
+        float m_maxMotorImpulse;
+        float m_lowerLimit;
+        float m_upperLimit;
+        float m_limitSoftness;
+        float m_biasFactor;
+        float m_relaxationFactor;
+    };
+// -------------------------------------------------- //
+    class b3HingeConstraintFloatData // Hinge record with float-based frames; otherwise the same member list as b3HingeConstraintDoubleData.
+    {
+    public:
+        b3TypedConstraintData m_typeConstraintData;
+        b3TransformFloatData m_rbAFrame;
+        b3TransformFloatData m_rbBFrame;
+        int m_useReferenceFrameA;
+        int m_angularOnly;
+        int m_enableAngularMotor;
+        float m_motorTargetVelocity;
+        float m_maxMotorImpulse;
+        float m_lowerLimit;
+        float m_upperLimit;
+        float m_limitSoftness;
+        float m_biasFactor;
+        float m_relaxationFactor;
+    };
+// -------------------------------------------------- //
+    class b3ConeTwistConstraintData // Constraint header, two float frames and seven float parameters.
+    {
+    public:
+        b3TypedConstraintData m_typeConstraintData;
+        b3TransformFloatData m_rbAFrame;
+        b3TransformFloatData m_rbBFrame;
+        float m_swingSpan1;
+        float m_swingSpan2;
+        float m_twistSpan;
+        float m_limitSoftness;
+        float m_biasFactor;
+        float m_relaxationFactor;
+        float m_damping;
+        char m_pad[4]; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class b3Generic6DofConstraintData // Constraint header, two float frames, four float limit vectors and two int flags.
+    {
+    public:
+        b3TypedConstraintData m_typeConstraintData;
+        b3TransformFloatData m_rbAFrame;
+        b3TransformFloatData m_rbBFrame;
+        b3Vector3FloatData m_linearUpperLimit; // the upper limit is declared before the lower limit for both pairs
+        b3Vector3FloatData m_linearLowerLimit;
+        b3Vector3FloatData m_angularUpperLimit;
+        b3Vector3FloatData m_angularLowerLimit;
+        int m_useLinearReferenceFrameA;
+        int m_useOffsetForConstraintFrame;
+    };
+// -------------------------------------------------- //
+    class b3Generic6DofSpringConstraintData // A b3Generic6DofConstraintData record followed by four six-element spring arrays.
+    {
+    public:
+        b3Generic6DofConstraintData m_6dofData;
+        int m_springEnabled[6]; // presumably one entry per degree of freedom, like the arrays below - confirm
+        float m_equilibriumPoint[6];
+        float m_springStiffness[6];
+        float m_springDamping[6];
+    };
+// -------------------------------------------------- //
+    class b3SliderConstraintData // Constraint header, two float frames, four scalar float limits and two int flags.
+    {
+    public:
+        b3TypedConstraintData m_typeConstraintData;
+        b3TransformFloatData m_rbAFrame;
+        b3TransformFloatData m_rbBFrame;
+        float m_linearUpperLimit; // scalar limits here, whereas b3Generic6DofConstraintData uses vectors
+        float m_linearLowerLimit;
+        float m_angularUpperLimit;
+        float m_angularLowerLimit;
+        int m_useLinearReferenceFrameA;
+        int m_useOffsetForConstraintFrame;
+    };
+// -------------------------------------------------- //
+    class b3ContactSolverInfoDoubleData // Solver settings record: sixteen double parameters followed by five int settings.
+    {
+    public:
+        double m_tau;
+        double m_damping;
+        double m_friction;
+        double m_timeStep;
+        double m_restitution;
+        double m_maxErrorReduction;
+        double m_sor;
+        double m_erp;
+        double m_erp2;
+        double m_globalCfm;
+        double m_splitImpulsePenetrationThreshold;
+        double m_splitImpulseTurnErp;
+        double m_linearSlop;
+        double m_warmstartingFactor;
+        double m_maxGyroscopicForce;
+        double m_singleAxisRollingFrictionThreshold;
+        int m_numIterations;
+        int m_solverMode;
+        int m_restingContactRestitutionThreshold; // stored as int, not as a floating-point value
+        int m_minimumSolverBatchSize;
+        int m_splitImpulse;
+        char m_padding[4]; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class b3ContactSolverInfoFloatData // Solver settings record: sixteen float parameters followed by five int settings.
+    {
+    public:
+        float m_tau;
+        float m_damping;
+        float m_friction;
+        float m_timeStep;
+        float m_restitution;
+        float m_maxErrorReduction;
+        float m_sor;
+        float m_erp;
+        float m_erp2;
+        float m_globalCfm;
+        float m_splitImpulsePenetrationThreshold;
+        float m_splitImpulseTurnErp;
+        float m_linearSlop;
+        float m_warmstartingFactor;
+        float m_maxGyroscopicForce;
+        float m_singleAxisRollingFrictionThreshold;
+        int m_numIterations;
+        int m_solverMode;
+        int m_restingContactRestitutionThreshold; // stored as int, not as a floating-point value
+        int m_minimumSolverBatchSize;
+        int m_splitImpulse;
+        char m_padding[4]; // presumably padding (name-based) - confirm
+    };
+	// -------------------------------------------------- //
+    class b3DynamicsWorldDoubleData // World-level record: double solver settings plus a double gravity vector.
+    {
+    public:
+        b3ContactSolverInfoDoubleData m_solverInfo;
+        b3Vector3DoubleData m_gravity;
+    };
+// -------------------------------------------------- //
+    class b3DynamicsWorldFloatData // World-level record: float solver settings plus a float gravity vector.
+    {
+    public:
+        b3ContactSolverInfoFloatData m_solverInfo;
+        b3Vector3FloatData m_gravity;
+    };
+// -------------------------------------------------- //
+    class SoftBodyMaterialData // Soft-body material record: three float stiffness values and an int flags word.
+    {
+    public:
+        float m_linearStiffness;
+        float m_angularStiffness;
+        float m_volumeStiffness;
+        int m_flags;
+    };
+// -------------------------------------------------- //
+    class SoftBodyNodeData // Soft-body node record: material pointer, five float vectors, two float scalars and two ints.
+    {
+    public:
+        SoftBodyMaterialData *m_material;
+        b3Vector3FloatData m_position;
+        b3Vector3FloatData m_previousPosition;
+        b3Vector3FloatData m_velocity;
+        b3Vector3FloatData m_accumulatedForce;
+        b3Vector3FloatData m_normal;
+        float m_inverseMass;
+        float m_area;
+        int m_attach;
+        int m_pad; // presumably padding (name-based); an int here rather than a char array - confirm
+    };
+// -------------------------------------------------- //
+    class SoftBodyLinkData // Soft-body link record: material pointer, two node indices, a rest length and an int flag.
+    {
+    public:
+        SoftBodyMaterialData *m_material;
+        int m_nodeIndices[2]; // presumably indexes into the owning body's node array - confirm
+        float m_restLength;
+        int m_bbending;
+    };
+// -------------------------------------------------- //
+    class SoftBodyFaceData // Soft-body face record: normal vector, material pointer, three node indices and a rest area.
+    {
+    public:
+        b3Vector3FloatData m_normal;
+        SoftBodyMaterialData *m_material;
+        int m_nodeIndices[3]; // presumably indexes into the owning body's node array - confirm
+        float m_restArea;
+    };
+// -------------------------------------------------- //
+    class SoftBodyTetraData // Soft-body tetrahedron record: four float vectors, material pointer, four node indices and three float scalars.
+    {
+    public:
+        b3Vector3FloatData m_c0[4];
+        SoftBodyMaterialData *m_material;
+        int m_nodeIndices[4]; // presumably indexes into the owning body's node array - confirm
+        float m_restVolume;
+        float m_c1;
+        float m_c2;
+        int m_pad; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class SoftRigidAnchorData // Anchor record: a float matrix, two float vectors, a body handle pointer, a node index and a float.
+    {
+    public:
+        b3Matrix3x3FloatData m_c0;
+        b3Vector3FloatData m_c1;
+        b3Vector3FloatData m_localFrame;
+        bInvalidHandle *m_rigidBody; // handle pointer typed as bInvalidHandle
+        int m_nodeIndex;
+        float m_c2;
+    };
+// -------------------------------------------------- //
+    class SoftBodyConfigData // Soft-body configuration record: an int model selector, twenty float parameters and five int settings.
+    {
+    public:
+        int m_aeroModel;
+        float m_baumgarte;
+        float m_damping;
+        float m_drag;
+        float m_lift;
+        float m_pressure;
+        float m_volume;
+        float m_dynamicFriction;
+        float m_poseMatch;
+        float m_rigidContactHardness;
+        float m_kineticContactHardness;
+        float m_softContactHardness;
+        float m_anchorHardness;
+        float m_softRigidClusterHardness;
+        float m_softKineticClusterHardness;
+        float m_softSoftClusterHardness;
+        float m_softRigidClusterImpulseSplit;
+        float m_softKineticClusterImpulseSplit;
+        float m_softSoftClusterImpulseSplit;
+        float m_maxVolume;
+        float m_timeScale;
+        int m_velocityIterations;
+        int m_positionIterations;
+        int m_driftIterations;
+        int m_clusterIterations;
+        int m_collisionFlags;
+    };
+// -------------------------------------------------- //
+    class SoftBodyPoseData // Soft-body pose record: three float matrices, a float vector, two array pointers with counts, and flags.
+    {
+    public:
+        b3Matrix3x3FloatData m_rot;
+        b3Matrix3x3FloatData m_scale;
+        b3Matrix3x3FloatData m_aqq;
+        b3Vector3FloatData m_com;
+        b3Vector3FloatData *m_positions;
+        float *m_weights;
+        int m_numPositions; // presumably the length of m_positions - confirm
+        int m_numWeigts; // spelling (sic); presumably the length of m_weights - confirm before any rename, other code may use this name
+        int m_bvolume;
+        int m_bframe;
+        float m_restVolume;
+        int m_pad; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class SoftBodyClusterData // Soft-body cluster record: transform, matrices and vectors, three array pointers with counts, then scalar settings.
+    {
+    public:
+        b3TransformFloatData m_framexform;
+        b3Matrix3x3FloatData m_locii;
+        b3Matrix3x3FloatData m_invwi;
+        b3Vector3FloatData m_com;
+        b3Vector3FloatData m_vimpulses[2];
+        b3Vector3FloatData m_dimpulses[2];
+        b3Vector3FloatData m_lv;
+        b3Vector3FloatData m_av;
+        b3Vector3FloatData *m_framerefs;
+        int *m_nodeIndices;
+        float *m_masses;
+        int m_numFrameRefs; // presumably the length of m_framerefs - confirm
+        int m_numNodes; // presumably the length of m_nodeIndices - confirm
+        int m_numMasses; // presumably the length of m_masses - confirm
+        float m_idmass;
+        float m_imass;
+        int m_nvimpulses;
+        int m_ndimpulses;
+        float m_ndamping;
+        float m_ldamping;
+        float m_adamping;
+        float m_matching;
+        float m_maxSelfCollisionImpulse;
+        float m_selfCollisionImpulseFactor;
+        int m_containsAnchor;
+        int m_collide;
+        int m_clusterIndex;
+    };
+// -------------------------------------------------- //
+    class b3SoftBodyJointData // Soft-body joint record: two untyped body pointers, reference vectors, solver scalars and type codes.
+    {
+    public:
+        void *m_bodyA; // untyped; m_bodyAtype below presumably tells how to interpret it - confirm
+        void *m_bodyB; // untyped; m_bodyBtype below presumably tells how to interpret it - confirm
+        b3Vector3FloatData m_refs[2];
+        float m_cfm;
+        float m_erp;
+        float m_split;
+        int m_delete;
+        b3Vector3FloatData m_relPosition[2];
+        int m_bodyAtype;
+        int m_bodyBtype;
+        int m_jointType;
+        int m_pad; // presumably padding (name-based) - confirm
+    };
+// -------------------------------------------------- //
+    class b3SoftBodyFloatData // Top-level soft-body record: collision-object fields, nine array pointers, eight counts and the configuration.
+    {
+    public:
+        b3CollisionObjectFloatData m_collisionObjectData; // first member, so the record starts with the collision-object fields
+        SoftBodyPoseData *m_pose;
+        SoftBodyMaterialData **m_materials; // pointer to pointers, unlike the single-level arrays below
+        SoftBodyNodeData *m_nodes;
+        SoftBodyLinkData *m_links;
+        SoftBodyFaceData *m_faces;
+        SoftBodyTetraData *m_tetrahedra;
+        SoftRigidAnchorData *m_anchors;
+        SoftBodyClusterData *m_clusters;
+        b3SoftBodyJointData *m_joints;
+        int m_numMaterials; // the m_num* fields presumably give the lengths of the matching arrays above - confirm
+        int m_numNodes;
+        int m_numLinks;
+        int m_numFaces;
+        int m_numTetrahedra;
+        int m_numAnchors;
+        int m_numClusters;
+        int m_numJoints;
+        SoftBodyConfigData m_config;
+    };
\ No newline at end of file
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3BulletFile.cpp b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3BulletFile.cpp
new file mode 100644
index 00000000..c3ceb838
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3BulletFile.cpp
@@ -0,0 +1,423 @@
+Copyright (c) 2006-2010 Erwin Coumans  http://gamekit.googlecode.com
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3BulletFile.h"
+#include "b3Defines.h"
+#include "b3DNA.h"
+#if !defined( __CELLOS_LV2__) && !defined(__MWERKS__)
+#include <memory.h>
+#include <string.h>
+// 32 && 64 bit versions
+#ifdef _WIN64
+extern char b3s_bulletDNAstr64[];
+extern int b3s_bulletDNAlen64;
+extern char b3s_bulletDNAstr[];
+extern int b3s_bulletDNAlen;
+#endif //_WIN64
+extern char b3s_bulletDNAstr64[];
+extern int b3s_bulletDNAlen64;
+extern char b3s_bulletDNAstr[];
+extern int b3s_bulletDNAlen;
+using namespace bParse;
+:bFile("", "BULLET ") // default construction: base bFile receives an empty first argument and the "BULLET " tag string
+	mMemoryDNA = new bDNA(); // this memory gets released in the bFile::~bFile destructor; @todo: not consistent with the rule "who allocates it, has to deallocate it"
+	m_DnaCopy = 0;
+#ifdef _WIN64
+		m_DnaCopy = (char*)b3AlignedAlloc(b3s_bulletDNAlen64,16); // private copy of the 64-bit DNA blob (second argument 16, presumably the alignment)
+		memcpy(m_DnaCopy,b3s_bulletDNAstr64,b3s_bulletDNAlen64);
+		mMemoryDNA->init(m_DnaCopy,b3s_bulletDNAlen64); // the in-memory DNA is initialised from the copy, not from the global array
+		m_DnaCopy = (char*)b3AlignedAlloc(b3s_bulletDNAlen,16); // same steps for the other (non-64) DNA blob
+		memcpy(m_DnaCopy,b3s_bulletDNAstr,b3s_bulletDNAlen);
+		mMemoryDNA->init(m_DnaCopy,b3s_bulletDNAlen);
+	if (VOID_IS_8) // NOTE(review): VOID_IS_8 presumably means 8-byte pointers - confirm in b3Defines.h
+	{
+		m_DnaCopy = (char*) b3AlignedAlloc(b3s_bulletDNAlen64,16);
+		memcpy(m_DnaCopy,b3s_bulletDNAstr64,b3s_bulletDNAlen64);
+		mMemoryDNA->init(m_DnaCopy,b3s_bulletDNAlen64);
+	}
+	else
+	{
+		m_DnaCopy =(char*) b3AlignedAlloc(b3s_bulletDNAlen,16);
+		memcpy(m_DnaCopy,b3s_bulletDNAstr,b3s_bulletDNAlen);
+		mMemoryDNA->init(m_DnaCopy,b3s_bulletDNAlen);
+	}
+b3BulletFile::b3BulletFile(const char* fileName) // Constructs from a file name, which is forwarded to the bFile base class.
+:bFile(fileName, "BULLET ") // same "BULLET " tag string as the other constructors
+	m_DnaCopy = 0; // no DNA copy yet; parse() allocates one
+b3BulletFile::b3BulletFile(char *memoryBuffer, int len) // Constructs from a memory buffer and its length, both forwarded to the bFile base class.
+:bFile(memoryBuffer,len, "BULLET ") // same "BULLET " tag string as the other constructors
+	m_DnaCopy = 0; // no DNA copy yet; parse() allocates one
+	if (m_DnaCopy) // cleanup: release the DNA copy, if one was made
+		b3AlignedFree(m_DnaCopy); // matches the b3AlignedAlloc calls that create m_DnaCopy
+	while (m_dataBlocks.size()) // release every registered data block, last one first
+	{
+		char* dataBlock = m_dataBlocks[m_dataBlocks.size()-1];
+		delete[] dataBlock; // blocks passed to addDataBlock() must therefore come from new[]
+		m_dataBlocks.pop_back();
+	}
+// ----------------------------------------------------- //
+void b3BulletFile::parseData() // Walks the chunks in mFileBuffer, resolves each one via readStruct and files the result into the per-type arrays.
+//	printf ("Building datablocks");
+//	printf ("Chunk size = %d",CHUNK_HEADER_LEN);
+//	printf ("File chunk size = %d",ChunkUtils::getOffset(mFlags));
+	const bool brokenDNA = (mFlags&FD_BROKEN_DNA)!=0; // when set, B3_QUANTIZED_BVH_CODE chunks are skipped below
+	//const bool swap = (mFlags&FD_ENDIAN_SWAP)!=0;
+	mDataStart = 12; // chunk data is read starting 12 bytes into mFileBuffer
+	char *dataPtr = mFileBuffer+mDataStart;
+	bChunkInd dataChunk;
+	dataChunk.code = 0;
+	//dataPtr += ChunkUtils::getNextBlock(&dataChunk, dataPtr, mFlags);
+	int seek = getNextBlock(&dataChunk, dataPtr, mFlags); // fills dataChunk; seek is later added to dataPtr to reach the next chunk
+	if (mFlags &FD_ENDIAN_SWAP) 
+		swapLen(dataPtr);
+	//dataPtr += ChunkUtils::getOffset(mFlags);
+	char *dataPtrHead = 0;
+	while (dataChunk.code != B3_DNA1) // stop once the DNA chunk is reached
+	{
+		if (!brokenDNA || (dataChunk.code != B3_QUANTIZED_BVH_CODE) )
+		{
+			// one behind
+			if (dataChunk.code == B3_SDNA) break;
+			//if (dataChunk.code == DNA1) break;
+			// same as (BHEAD+DATA dependency)
+			dataPtrHead = dataPtr+ChunkUtils::getOffset(mFlags); // skip the chunk header to reach the payload
+			if (dataChunk.dna_nr>=0)
+			{
+				char *id = readStruct(dataPtrHead, dataChunk);
+				// lookup maps
+				if (id)
+				{
+					m_chunkPtrPtrMap.insert(dataChunk.oldPtr, dataChunk);
+					mLibPointers.insert(dataChunk.oldPtr, (bStructHandle*)id); // map the pointer stored in the file to the loaded struct
+					m_chunks.push_back(dataChunk);
+					// block it
+					//bListBasePtr *listID = mMain->getListBasePtr(dataChunk.code);
+					//if (listID)
+					//	listID->push_back((bStructHandle*)id);
+				}
+				// NOTE(review): the per-type pushes below are outside the if (id) guard, so a null id is also recorded - confirm intended
+				if (dataChunk.code == B3_SOFTBODY_CODE)
+				{
+					m_softBodies.push_back((bStructHandle*) id);
+				}
+				if (dataChunk.code == B3_RIGIDBODY_CODE)
+				{
+					m_rigidBodies.push_back((bStructHandle*) id);
+				}
+				if (dataChunk.code == B3_DYNAMICSWORLD_CODE)
+				{
+					m_dynamicsWorldInfo.push_back((bStructHandle*) id);
+				}
+				if (dataChunk.code == B3_CONSTRAINT_CODE)
+				{
+					m_constraints.push_back((bStructHandle*) id);
+				}
+				if (dataChunk.code == B3_QUANTIZED_BVH_CODE)
+				{
+					m_bvhs.push_back((bStructHandle*) id);
+				}
+				if (dataChunk.code == B3_TRIANLGE_INFO_MAP)
+				{
+					m_triangleInfoMaps.push_back((bStructHandle*) id);
+				}
+				if (dataChunk.code == B3_COLLISIONOBJECT_CODE)
+				{
+					m_collisionObjects.push_back((bStructHandle*) id);
+				}
+				if (dataChunk.code == B3_SHAPE_CODE)
+				{
+					m_collisionShapes.push_back((bStructHandle*) id);
+				}
+		//		if (dataChunk.code == GLOB)
+		//		{
+		//			m_glob = (bStructHandle*) id;
+		//		}
+			} else
+			{
+				//printf("unknown chunk\n");
+				mLibPointers.insert(dataChunk.oldPtr, (bStructHandle*)dataPtrHead); // negative dna_nr: register the raw payload pointer instead of a parsed struct
+			}
+		} else
+		{
+			printf("skipping B3_QUANTIZED_BVH_CODE due to broken DNA\n");
+		}
+		dataPtr += seek; // advance to the next chunk
+		seek =  getNextBlock(&dataChunk, dataPtr, mFlags);
+		if (mFlags &FD_ENDIAN_SWAP) 
+			swapLen(dataPtr);
+		if (seek < 0) // a negative result ends the walk; note this check runs after swapLen above
+			break;
+	}
+void	b3BulletFile::addDataBlock(char* dataBlock) // Registers dataBlock in m_dataBlocks; the destructor later releases each entry with delete[].
+	m_dataBlocks.push_back(dataBlock);
+void	b3BulletFile::writeDNA(FILE* fp) // Writes a B3_DNA1 chunk header followed by the built-in DNA blob to fp.
+	bChunkInd dataChunk;
+	dataChunk.code = B3_DNA1;
+	dataChunk.dna_nr = 0;
+	dataChunk.nr = 1;
+	if (VOID_IS_8) // NOTE(review): VOID_IS_8 presumably means 8-byte pointers - confirm in b3Defines.h
+	{
+#ifdef _WIN64
+		dataChunk.len = b3s_bulletDNAlen64;
+		dataChunk.oldPtr = b3s_bulletDNAstr64;
+		fwrite(&dataChunk,sizeof(bChunkInd),1,fp); // chunk header first ...
+		fwrite(b3s_bulletDNAstr64, b3s_bulletDNAlen64,1,fp); // ... then the 64-bit DNA bytes
+		b3Assert(0);
+	}
+	else
+	{
+#ifndef _WIN64
+		dataChunk.len = b3s_bulletDNAlen;
+		dataChunk.oldPtr = b3s_bulletDNAstr;
+		fwrite(&dataChunk,sizeof(bChunkInd),1,fp);
+		fwrite(b3s_bulletDNAstr, b3s_bulletDNAlen,1,fp);
+		b3Assert(0);
+	}
+	if (VOID_IS_8)
+	{
+		dataChunk.len = b3s_bulletDNAlen64;
+		dataChunk.oldPtr = b3s_bulletDNAstr64;
+		fwrite(&dataChunk,sizeof(bChunkInd),1,fp);
+		fwrite(b3s_bulletDNAstr64, b3s_bulletDNAlen64,1,fp);
+	}
+	else
+	{
+		dataChunk.len = b3s_bulletDNAlen;
+		dataChunk.oldPtr = b3s_bulletDNAstr;
+		fwrite(&dataChunk,sizeof(bChunkInd),1,fp);
+		fwrite(b3s_bulletDNAstr, b3s_bulletDNAlen,1,fp);
+	}
+void	b3BulletFile::parse(int verboseMode) // Refreshes m_DnaCopy from the built-in DNA blob, runs parseInternal, then marks the buffer as CPU-endian.
+	if (VOID_IS_8)
+	{
+#ifdef _WIN64
+		if (m_DnaCopy)
+			b3AlignedFree(m_DnaCopy); // m_DnaCopy comes from b3AlignedAlloc, so it must be released with b3AlignedFree (as the destructor does), not delete
+		m_DnaCopy = (char*)b3AlignedAlloc(b3s_bulletDNAlen64,16);
+		memcpy(m_DnaCopy,b3s_bulletDNAstr64,b3s_bulletDNAlen64);
+		parseInternal(verboseMode,(char*)b3s_bulletDNAstr64,b3s_bulletDNAlen64); // NOTE(review): passes the global array, while the other branches pass m_DnaCopy - confirm intended
+		b3Assert(0);
+	}
+	else
+	{
+#ifndef _WIN64
+		if (m_DnaCopy)
+			b3AlignedFree(m_DnaCopy); // release the previous copy with the matching aligned free
+		m_DnaCopy = (char*)b3AlignedAlloc(b3s_bulletDNAlen,16);
+		memcpy(m_DnaCopy,b3s_bulletDNAstr,b3s_bulletDNAlen);
+		parseInternal(verboseMode,m_DnaCopy,b3s_bulletDNAlen);
+		b3Assert(0);
+	}
+	if (VOID_IS_8)
+	{
+		if (m_DnaCopy)
+			b3AlignedFree(m_DnaCopy); // release the previous copy with the matching aligned free
+		m_DnaCopy = (char*)b3AlignedAlloc(b3s_bulletDNAlen64,16);
+		memcpy(m_DnaCopy,b3s_bulletDNAstr64,b3s_bulletDNAlen64);
+		parseInternal(verboseMode,m_DnaCopy,b3s_bulletDNAlen64);
+	}
+	else
+	{
+		if (m_DnaCopy)
+			b3AlignedFree(m_DnaCopy); // release the previous copy with the matching aligned free
+		m_DnaCopy = (char*)b3AlignedAlloc(b3s_bulletDNAlen,16);
+		memcpy(m_DnaCopy,b3s_bulletDNAstr,b3s_bulletDNAlen);
+		parseInternal(verboseMode,m_DnaCopy,b3s_bulletDNAlen);
+	}
+	//the parsing will convert to cpu endian
+	mFlags &=~FD_ENDIAN_SWAP;
+	int littleEndian= 1;
+	littleEndian= ((char*)&littleEndian)[0]; // first byte of the int 1 is nonzero only on a little-endian CPU
+	mFileBuffer[8] = littleEndian?'v':'V'; // NOTE(review): write() puts the endian marker in header[7] and the pointer-size marker in header[8] - confirm byte 8 is the intended target
+// experimental
+int		b3BulletFile::write(const char* fileName, bool fixupPointers) // Writes header, chunks and DNA to fileName; returns 1 on success, 0 if the file cannot be opened.
+	FILE *fp = fopen(fileName, "wb");
+	if (fp)
+	{
+		char header[B3_SIZEOFBLENDERHEADER] ;
+		memcpy(header, m_headerString, 7); // bytes 0..6: the header tag string
+		int endian= 1;
+		endian= ((char*)&endian)[0]; // first byte of the int 1 is nonzero only on a little-endian CPU
+		if (endian)
+		{
+			header[7] = '_'; // little-endian marker
+		} else
+		{
+			header[7] = '-'; // big-endian marker
+		}
+		if (VOID_IS_8)
+		{
+			header[8]='V'; // marker used when VOID_IS_8 is true
+		} else
+		{
+			header[8]='v';
+		}
+		header[9] = '2'; // bytes 9..11 spell "275"; presumably a format version - confirm
+		header[10] = '7';
+		header[11] = '5';
+		fwrite(header,B3_SIZEOFBLENDERHEADER,1,fp);
+		writeChunks(fp, fixupPointers);
+		writeDNA(fp); // the DNA chunk goes last, after all data chunks
+		fclose(fp);
+	} else
+	{
+		printf("Error: cannot open file %s for writing\n",fileName);
+		return 0;
+	}
+	return 1;
+void	b3BulletFile::addStruct(const	char* structType,void* data, int len, void* oldPtr, int code) // Registers an in-memory struct as a one-element chunk; structType is looked up in mMemoryDNA.
+	bParse::bChunkInd dataChunk;
+	dataChunk.code = code;
+	dataChunk.nr = 1;
+	dataChunk.len = len;
+	dataChunk.dna_nr = mMemoryDNA->getReverseType(structType); // DNA index for the named struct type
+	dataChunk.oldPtr = oldPtr;
+	///Perform structure size validation
+	short* structInfo= mMemoryDNA->getStruct(dataChunk.dna_nr); // NOTE(review): no check that the type lookup succeeded before indexing - confirm getReverseType's failure value
+	int elemBytes;
+	elemBytes= mMemoryDNA->getLength(structInfo[0]);
+//	int elemBytes = mMemoryDNA->getElementSize(structInfo[0],structInfo[1]);
+	assert(len==elemBytes); // caller-supplied len must equal the DNA's size for this type
+	mLibPointers.insert(dataChunk.oldPtr, (bStructHandle*)data);
+	m_chunks.push_back(dataChunk);
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3BulletFile.h b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3BulletFile.h
new file mode 100644
index 00000000..fb1b9b0d
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3BulletFile.h
@@ -0,0 +1,83 @@
+Copyright (c) 2006-2010 Charlie C & Erwin Coumans  http://gamekit.googlecode.com
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_BULLET_FILE_H
+#define B3_BULLET_FILE_H
+#include "b3File.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "b3Defines.h"
+#include "Bullet3Serialize/Bullet2FileLoader/b3Serializer.h"
+namespace bParse {
+	// ----------------------------------------------------- //
+	class b3BulletFile : public bFile // bFile specialisation that sorts parsed structs into per-type public arrays.
+	{
+	protected:
+		char*	m_DnaCopy; // copy of the built-in DNA blob; allocated with b3AlignedAlloc in the .cpp file
+	public:
+		b3AlignedObjectArray<bStructHandle*>	m_softBodies; // the arrays below are filled by parseData() according to each chunk's code
+		b3AlignedObjectArray<bStructHandle*>	m_rigidBodies;
+		b3AlignedObjectArray<bStructHandle*>	m_collisionObjects;
+		b3AlignedObjectArray<bStructHandle*>	m_collisionShapes;
+		b3AlignedObjectArray<bStructHandle*>	m_constraints;
+		b3AlignedObjectArray<bStructHandle*>	m_bvhs;
+		b3AlignedObjectArray<bStructHandle*>	m_triangleInfoMaps;
+		b3AlignedObjectArray<bStructHandle*>	m_dynamicsWorldInfo;
+		b3AlignedObjectArray<char*>				m_dataBlocks; // blocks added via addDataBlock(); freed with delete[] in the destructor
+		b3BulletFile();
+		b3BulletFile(const char* fileName);
+		b3BulletFile(char *memoryBuffer, int len);
+		virtual ~b3BulletFile();
+		virtual	void	addDataBlock(char* dataBlock);
+		// experimental
+		virtual int		write(const char* fileName, bool fixupPointers=false); // returns 1 on success, 0 if the file cannot be opened
+		virtual	void	parse(int verboseMode);
+		virtual	void parseData();
+		virtual	void	writeDNA(FILE* fp);
+		void	addStruct(const char* structType,void* data, int len, void* oldPtr, int code);
+	};
+#endif //B3_BULLET_FILE_H
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Chunk.cpp b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Chunk.cpp
new file mode 100644
index 00000000..c0e1bb70
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Chunk.cpp
@@ -0,0 +1,75 @@
+Copyright (c) 2006-2009 Charlie C & Erwin Coumans  http://gamekit.googlecode.com
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3Chunk.h"
+#include "b3Defines.h"
+#include "b3File.h"
+#if !defined( __CELLOS_LV2__) && !defined(__MWERKS__)
+#include <memory.h>
+#include <string.h>
+using namespace bParse;
+// ----------------------------------------------------- //
+// Returns sht with its two bytes exchanged (endianness conversion).
+short ChunkUtils::swapShort(short sht)
+	// Swap the by-value copy in place, mirroring swapInt's use of B3_SWITCH_INT;
+	// without this the function handed its argument back unchanged.
+	B3_SWITCH_SHORT(sht);
+	return sht;
+// ----------------------------------------------------- //
+int ChunkUtils::swapInt(int inte)	// returns inte with its byte order reversed
+	B3_SWITCH_INT(inte);	// macro from b3Defines.h: exchanges bytes 0<->3 and 1<->2 of the local copy
+	return inte;
+// ----------------------------------------------------- //
+// Returns lng with its eight bytes in reverse order (endianness conversion).
+b3Long64 ChunkUtils::swapLong64(b3Long64 lng)
+	// Swap the by-value copy in place, mirroring swapInt's use of B3_SWITCH_INT;
+	// without this the function handed its argument back unchanged.
+	B3_SWITCH_LONGINT(lng);
+	return lng;
+// ----------------------------------------------------- //
+int ChunkUtils::getOffset(int flags)	// byte size of a chunk header for a file parsed with the given flags
+	// if the file is saved in a
+	// different format, get the
+	// file's chunk size
+	int res = CHUNK_HEADER_LEN;	// default: sizeof(bChunkInd) on the running platform
+	if (VOID_IS_8)
+	{
+		if (flags &FD_BITS_VARIES)
+			res = sizeof(bChunkPtr4);	// 8-byte host pointers and FD_BITS_VARIES set: use the 4-byte-pointer header layout
+	}
+	else
+	{
+		if (flags &FD_BITS_VARIES)
+			res = sizeof(bChunkPtr8);	// 4-byte host pointers and FD_BITS_VARIES set: use the 8-byte-pointer header layout
+	}
+	return res;
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Chunk.h b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Chunk.h
new file mode 100644
index 00000000..03ecb6b4
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Chunk.h
@@ -0,0 +1,92 @@
+Copyright (c) 2006-2009 Charlie C & Erwin Coumans  http://gamekit.googlecode.com
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef __BCHUNK_H__
+#define __BCHUNK_H__
+#if defined (_WIN32) && ! defined (__MINGW32__)
+	#define b3Long64 __int64
+#elif defined (__MINGW32__)	
+	#include <stdint.h>
+	#define b3Long64 int64_t
+	#define b3Long64 long long
+namespace bParse {
+	// ----------------------------------------------------- //
+	class bChunkPtr4	// chunk header layout whose pointer slot is a single int; its size is used by ChunkUtils::getOffset
+	{
+	public:
+		bChunkPtr4(){}	// leaves every field uninitialised
+		int code;
+		int len;
+		union
+		{
+			int m_uniqueInt;	// occupies the slot where bChunkInd keeps oldPtr
+		};
+		int dna_nr;
+		int nr;
+	};
+	// ----------------------------------------------------- //
+	class bChunkPtr8	// chunk header layout whose pointer slot is 64 bits wide; its size is used by ChunkUtils::getOffset
+	{
+	public:
+		bChunkPtr8(){}	// leaves every field uninitialised
+		int code,  len;
+		union
+		{
+			b3Long64 oldPrev;	// the pointer slot viewed as one 64-bit integer
+			int	m_uniqueInts[2];	// the same storage viewed as two ints
+		};
+		int dna_nr, nr;
+	};
+	// ----------------------------------------------------- //
+	class bChunkInd	// chunk header using the host's native pointer width; sizeof(bChunkInd) defines CHUNK_HEADER_LEN
+	{
+	public:
+		bChunkInd(){}	// leaves every field uninitialised
+		int code, len;	// code is compared against 4-character tags such as "ARAY" in bFile::swap
+		void *oldPtr;
+		int dna_nr, nr;	// bFile::swap uses dna_nr as a struct index into the file DNA and nr as the number of structs to process
+	};
+	// ----------------------------------------------------- //
+	class ChunkUtils	// static-only helpers for chunk header sizing and byte swapping
+	{
+	public:
+		// file chunk offset
+		static int getOffset(int flags);	// header size in bytes for the given file flags
+		// endian utils
+		static short swapShort(short sht);
+		static int swapInt(int inte);
+		static b3Long64 swapLong64(b3Long64 lng);
+	};
+	const int CHUNK_HEADER_LEN = ((sizeof(bChunkInd)));
+	const bool VOID_IS_8 = ((sizeof(void*)==8));
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Common.h b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Common.h
new file mode 100644
index 00000000..2792d840
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Common.h
@@ -0,0 +1,39 @@
+Copyright (c) 2006-2009 Charlie C & Erwin Coumans  http://gamekit.googlecode.com
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef __BCOMMON_H__
+#define __BCOMMON_H__
+#include <assert.h>
+//#include "bLog.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3HashMap.h"
+namespace bParse {
+	class bMain;
+	class bFileData;
+	class bFile;
+	class bDNA;
+	// delete void* undefined
+	typedef struct bStructHandle {int unused;}bStructHandle;
+	typedef b3AlignedObjectArray<bStructHandle*>	bListBasePtr;
+	typedef b3HashMap<b3HashPtr, bStructHandle*> bPtrMap;
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3DNA.cpp b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3DNA.cpp
new file mode 100644
index 00000000..b78f7633
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3DNA.cpp
@@ -0,0 +1,628 @@
+Copyright (c) 2006-2009 Charlie C & Erwin Coumans  http://gamekit.googlecode.com
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include <assert.h>
+#include "b3DNA.h"
+#include "b3Chunk.h"
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+//this define will force traversal of structures, to check backward (and forward) compatibility
+using namespace bParse;
+// ----------------------------------------------------- //
+	:	mPtrLen(0)
+	// --
+// ----------------------------------------------------- //
+	// --
+// ----------------------------------------------------- //
+bool bDNA::lessThan(bDNA *file)	// true when this DNA holds fewer names than *file
+	return ( m_Names.size() < file->m_Names.size());
+// ----------------------------------------------------- //
+// Returns the name string stored at index ind of m_Names.
+char *bDNA::getName(int ind)
+	// Valid indices are [0, size); the former '<=' test let ind == size through to the array access.
+	assert(ind >= 0 && ind < (int)m_Names.size());
+	return m_Names[ind].m_name;
+// ----------------------------------------------------- //
+// Returns the type-name string stored at index ind of mTypes.
+char *bDNA::getType(int ind)
+	// Valid indices are [0, size); the former '<=' test let ind == size through to the array access.
+	assert(ind >= 0 && ind < (int)mTypes.size());
+	return mTypes[ind];
+// ----------------------------------------------------- //
+// Returns the pointer to the short-encoded struct description stored at index ind of mStructs.
+short *bDNA::getStruct(int ind)
+	// Valid indices are [0, size); the former '<=' test let ind == size through to the array access.
+	assert(ind >= 0 && ind < (int)mStructs.size());
+	return mStructs[ind];
+// ----------------------------------------------------- //
+// Returns the length recorded for type index ind in mTlens.
+short bDNA::getLength(int ind)
+	// Valid indices are [0, size); the former '<=' test let ind == size through to the array access.
+	assert(ind >= 0 && ind < (int)mTlens.size());
+	return mTlens[ind];
+// ----------------------------------------------------- //
+int bDNA::getReverseType(short type)	// struct index registered in mStructReverse for type number `type`, or -1
+	int* intPtr = mStructReverse.find(type);	// a null result is treated as "not registered"
+	if (intPtr)
+		return *intPtr;
+	return -1;
+// ----------------------------------------------------- //
+int bDNA::getReverseType(const char *type)	// struct index registered in mTypeLookup for the type name, or -1
+	b3HashString key(type);
+	int* valuePtr = mTypeLookup.find(key);	// a null result is treated as "not registered"
+	if (valuePtr)
+		return *valuePtr;
+	return -1;
+// ----------------------------------------------------- //
+int bDNA::getNumStructs()	// number of struct descriptions collected in mStructs
+	return (int)mStructs.size();
+// ----------------------------------------------------- //
+// True when the comparison flag of struct dna_nr is FDF_STRUCT_NEQU.
+bool bDNA::flagNotEqual(int dna_nr)
+	// Valid indices are [0, size); the former '<=' test let dna_nr == size through to the array access.
+	assert(dna_nr >= 0 && dna_nr < (int)mCMPFlags.size());
+	return mCMPFlags[dna_nr] == FDF_STRUCT_NEQU;
+// ----------------------------------------------------- //
+// True when the comparison flag of struct dna_nr is FDF_STRUCT_EQU.
+bool bDNA::flagEqual(int dna_nr)
+	// Valid indices are [0, size); the former '<=' test let dna_nr == size through to the array access.
+	assert(dna_nr >= 0 && dna_nr < (int)mCMPFlags.size());
+	int flag = mCMPFlags[dna_nr];
+	return  flag == FDF_STRUCT_EQU;
+// ----------------------------------------------------- //
+// True when the comparison flag of struct dna_nr is FDF_NONE.
+bool bDNA::flagNone(int dna_nr)
+	// Valid indices are [0, size); the former '<=' test let dna_nr == size through to the array access.
+	assert(dna_nr >= 0 && dna_nr < (int)mCMPFlags.size());
+	return mCMPFlags[dna_nr] == FDF_NONE;
+// ----------------------------------------------------- //
+int bDNA::getPointerSize()	// mPtrLen; bDNA::init derives it as half the recorded length of the "ListBase" type
+	return mPtrLen;
+// ----------------------------------------------------- //
+void bDNA::initRecurseCmpFlags(int iter)	// spreads FDF_STRUCT_NEQU from struct iter to structs that reference its type
+	// iter is FDF_STRUCT_NEQU
+	short *oldStrc = mStructs[iter];
+	short type = oldStrc[0];	// type number of the struct that is already flagged
+	for (int i=0; i<(int)mStructs.size(); i++)
+	{
+		if (i != iter && mCMPFlags[i] == FDF_STRUCT_EQU )	// only structs still flagged equal are candidates
+		{
+			short *curStruct = mStructs[i];
+			int eleLen = curStruct[1];	// element count; (type, name) short pairs follow the two-short header
+			curStruct+=2;
+			for (int j=0; j<eleLen; j++, curStruct+=2)
+			{
+				if (curStruct[0] == type)
+				{
+					//char *name = m_Names[curStruct[1]].m_name;
+					//if (name[0] != '*')
+					if (m_Names[curStruct[1]].m_isPointer)	// element of that type whose name is marked as a pointer
+					{
+						mCMPFlags[i] = FDF_STRUCT_NEQU;
+						initRecurseCmpFlags(i);	// the newly flagged struct may in turn affect others
+					}
+				}
+			}
+		}
+	}
+// ----------------------------------------------------- //
+void bDNA::initCmpFlags(bDNA *memDNA)
+    // compare the file to memory
+	// this ptr should be the file data
+	assert(!m_Names.size() == 0 && "SDNA empty!");
+	mCMPFlags.resize(mStructs.size(), FDF_NONE);
+	int i;
+	for ( i=0; i<(int)mStructs.size(); i++)
+	{
+		short *oldStruct = mStructs[i];
+		int oldLookup = getReverseType(oldStruct[0]);
+		if (oldLookup == -1)
+		{
+			mCMPFlags[i] = FDF_NONE;
+			continue;
+		}
+		//char* typeName = mTypes[oldStruct[0]];
+		char* typeName = mTypes[oldLookup];
+		int newLookup = memDNA->getReverseType(typeName);
+		if (newLookup == -1)
+		{
+			mCMPFlags[i] = FDF_NONE;
+			continue;
+		}
+		short *curStruct = memDNA->mStructs[newLookup];
+		// memory for file
+		if (oldLookup < memDNA->mStructs.size())
+		{
+			short *curStruct = memDNA->mStructs[oldLookup];
+			// rebuild...
+			mCMPFlags[i] = FDF_STRUCT_NEQU;
+			if (curStruct[1] == oldStruct[1])
+			{
+				// type len same ...
+				if (mTlens[oldStruct[0]] == memDNA->mTlens[curStruct[0]])
+				{
+					bool isSame = true;
+					int elementLength = oldStruct[1];
+					curStruct+=2;
+					oldStruct+=2;
+					for (int j=0; j<elementLength; j++, curStruct+=2, oldStruct+=2)
+					{
+						// type the same
+						//const char* typeFileDNA = mTypes[oldStruct[0]];
+						//const char* typeMemDNA = mTypes[curStruct[0]];
+						if (strcmp(mTypes[oldStruct[0]], memDNA->mTypes[curStruct[0]])!=0)
+						{
+							isSame=false;
+							break;
+						}
+						// name the same
+						if (strcmp(m_Names[oldStruct[1]].m_name, memDNA->m_Names[curStruct[1]].m_name)!=0)
+						{
+							isSame=false;
+							break;
+						}
+					}
+					// flag valid ==
+					if (isSame)
+						mCMPFlags[i] = FDF_STRUCT_EQU;
+				}
+			}
+		}
+	}
+	// recurse in
+	for ( i=0; i<(int)mStructs.size(); i++)
+	{
+		if (mCMPFlags[i] == FDF_STRUCT_NEQU)
+			initRecurseCmpFlags(i);
+	}
+static int name_is_array(char* name, int* dim1, int* dim2) {	// parses up to two "[N]" suffixes of name; returns 0 when no '[' exists or the first dimension fails to parse, else 1
+	int len = strlen(name);
+	/*fprintf(stderr,"[%s]",name);*/
+	/*if (len >= 1) {
+	if (name[len-1] != ']')
+	return 1;
+	}
+	return 0;*/
+	char *bp;
+	int num;
+	if (dim1) {
+		*dim1 = 1;	// both outputs default to 1
+	}
+	if (dim2) {
+		*dim2 = 1;
+	}
+	bp = strchr(name, '[');
+	if (!bp) {
+		return 0;
+	}
+	num = 0;
+	while (++bp < name+len-1) {	// the bound excludes the final character of name
+		const char c = *bp;
+		if (c == ']') {
+			break;
+		}
+		if (c <= '9' && c >= '0') {
+			num *= 10;
+			num += (c - '0');
+		} else {
+			printf("array parse error.\n");
+			return 0;
+		}
+	}
+	if (dim2) {
+		*dim2 = num;	// a single dimension is reported through dim2, leaving dim1 at 1
+	}
+	/* find second dim, if any. */
+	bp = strchr(bp, '[');
+	if (!bp) {
+		return 1; /* at least we got the first dim. */
+	}
+	num = 0;
+	while (++bp < name+len-1) {
+		const char c = *bp;
+		if (c == ']') {
+			break;
+		}
+		if (c <= '9' && c >= '0') {
+			num *= 10;
+			num += (c - '0');
+		} else {
+			printf("array2 parse error.\n");
+			return 1;
+		}
+	}
+	if (dim1) {
+		if (dim2) {
+			*dim1 = *dim2;	// with two dimensions: dim1 receives the first, dim2 the second
+			*dim2 = num;
+		} else {
+			*dim1 = num;
+		}
+	}
+	return 1;
+// ----------------------------------------------------- //
+// Parses an in-memory SDNA description: the name table, the TYPE table, the TLEN table and
+// the STRC table, in that order, then builds the reverse lookup maps.
+//   data - start of the description; the tables store pointers INTO this buffer (no copies),
+//          and when swap is set the integer/short fields are byte-swapped in place.
+//   len  - not consulted by this function (no bounds checking is performed against it).
+//   swap - true when the counts and shorts in data need their byte order reversed.
+void bDNA::init(char *data, int len, bool swap)
+	int *intPtr=0;short *shtPtr=0;
+	char *cp = 0;int dataLen =0;long nr=0;
+	intPtr = (int*)data;
+	/*
+		SDNA (4 bytes) (magic number)
+		NAME (4 bytes)
+		<nr> (4 bytes) amount of names (int)
+		<string>
+		<string>
+	*/
+	if (strncmp(data, "SDNA", 4)==0)
+	{
+		// skip ++ NAME
+		intPtr++; intPtr++;
+	}
+	// Parse names
+	if (swap) 
+	{
+		*intPtr = ChunkUtils::swapInt(*intPtr);
+	}
+	dataLen = *intPtr;
+	intPtr++;
+	cp = (char*)intPtr;
+	int i;
+	for ( i=0; i<dataLen; i++)
+	{
+		bNameInfo info;
+		info.m_name = cp;
+		// A name counts as a pointer when its first or second character is '*'.
+		// The second character is only inspected for non-empty names, so an empty
+		// string no longer causes a read past its terminator.
+		info.m_isPointer = (info.m_name[0] == '*') || (info.m_name[0] != 0 && info.m_name[1] == '*');
+		name_is_array(info.m_name,&info.m_dim0,&info.m_dim1);
+		m_Names.push_back(info);
+		while (*cp)cp++;
+		cp++;
+	}
+	cp = b3AlignPointer(cp,4);
+	/*
+		TYPE (4 bytes)
+		<nr> amount of types (int)
+		<string>
+		<string>
+	*/
+	intPtr = (int*)cp;
+	assert(strncmp(cp, "TYPE", 4)==0); intPtr++;
+	if (swap) 
+	{
+		*intPtr = ChunkUtils::swapInt(*intPtr);
+	}
+	dataLen = *intPtr;
+	intPtr++;
+	cp = (char*)intPtr;
+	for ( i=0; i<dataLen; i++)
+	{
+		mTypes.push_back(cp);
+		while (*cp)cp++;
+		cp++;
+	}
+	cp = b3AlignPointer(cp,4);
+	/*
+		TLEN (4 bytes)
+		<len> (short) the lengths of types
+		<len>
+	*/
+	// Parse type lens
+	intPtr = (int*)cp;
+	assert(strncmp(cp, "TLEN", 4)==0); intPtr++;
+	dataLen = (int)mTypes.size();
+	shtPtr = (short*)intPtr;
+	for ( i=0; i<dataLen; i++, shtPtr++)
+	{
+		if (swap)
+			shtPtr[0] = ChunkUtils::swapShort(shtPtr[0]);
+		mTlens.push_back(shtPtr[0]);
+	}
+	// An odd number of shorts is followed by one padding short.
+	if (dataLen & 1) shtPtr++;
+	/*
+		STRC (4 bytes)
+		<nr> amount of structs (int)
+		<typenr>
+		<nr_of_elems>
+		<typenr>
+		<namenr>
+		<typenr>
+		<namenr>
+	*/
+	intPtr = (int*)shtPtr;
+	cp = (char*)intPtr;
+	assert(strncmp(cp, "STRC", 4)==0); intPtr++;
+	if (swap) 
+	{
+		*intPtr = ChunkUtils::swapInt(*intPtr);
+	}
+	dataLen = *intPtr;
+	intPtr++;
+	shtPtr = (short*)intPtr;
+	for ( i=0; i<dataLen; i++)
+	{
+		mStructs.push_back (shtPtr);
+		if (swap)
+		{
+			shtPtr[0]= ChunkUtils::swapShort(shtPtr[0]);
+			shtPtr[1]= ChunkUtils::swapShort(shtPtr[1]);
+			int len = shtPtr[1];
+			shtPtr+= 2;
+			for (int a=0; a<len; a++, shtPtr+=2)
+			{
+				shtPtr[0]= ChunkUtils::swapShort(shtPtr[0]);
+				shtPtr[1]= ChunkUtils::swapShort(shtPtr[1]);
+			}
+		}
+		else
+			shtPtr+= (2*shtPtr[1])+2;
+	}
+	// build reverse lookups
+	for ( i=0; i<(int)mStructs.size(); i++)
+	{
+		short *strc = mStructs.at(i);
+		// The pointer size is taken as half the recorded length of the first "ListBase" struct.
+		if (!mPtrLen && strcmp(mTypes[strc[0]],"ListBase")==0)
+		{
+			mPtrLen = mTlens[strc[0]]/2;
+		}
+		mStructReverse.insert(strc[0], i);
+		mTypeLookup.insert(b3HashString(mTypes[strc[0]]),i);
+	}
+// ----------------------------------------------------- //
+int bDNA::getArraySize(char* string)	// product of the bracketed dimensions found in string; 1 when there are none
+	int ret = 1;
+	int len = strlen(string);
+	char* next = 0;	// start of the text following the most recent '['
+	for (int i=0; i<len; i++)
+	{
+		char c = string[i];
+		if (c == '[')
+			next = &string[i+1];
+		else if (c==']')
+			if (next)
+				ret *= atoi(next);	// atoi stops at the first non-digit, so only this dimension is read
+	}
+//	print (string << ' ' << ret);
+	return ret;
+// Prints every struct description to stdout: index, type name, field count, each field's
+// type/name with its byte size, and the summed size of the fields.
+// This routine only reports; it does not modify the comparison flags, so it may be called
+// before initCmpFlags() has sized mCMPFlags.
+void bDNA::dumpTypeDefinitions()
+	int i;
+	int numTypes = mTypes.size();
+	for (i=0;i<numTypes;i++)
+	{
+	}
+	for ( i=0; i<(int)mStructs.size(); i++)
+	{
+		int totalBytes=0;
+		short *oldStruct = mStructs[i];
+		int oldLookup = getReverseType(oldStruct[0]);
+		if (oldLookup == -1)
+		{
+			// Unknown type: nothing to print. (The former write to mCMPFlags[i] here would
+			// index an array that is still empty when the dump runs before initCmpFlags().)
+			continue;
+		}
+		short* newStruct = mStructs[oldLookup];
+		char* typeName = mTypes[newStruct[0]];
+		printf("%3d: %s ",i,typeName);
+		//char *name = mNames[oldStruct[1]];
+		int len = oldStruct[1];
+		printf(" (%d fields) ",len);
+		oldStruct+=2;
+		printf("{");
+		int j;
+		for (j=0; j<len; ++j,oldStruct+=2) {
+			const char* name = m_Names[oldStruct[1]].m_name;
+			printf("%s %s",	mTypes[oldStruct[0]],name);
+			int elemNumBytes= 0;
+			int arrayDimensions = getArraySizeNew(oldStruct[1]);
+			if (m_Names[oldStruct[1]].m_isPointer)
+			{
+				// Pointer fields are sized by the running platform, not by the type table.
+				elemNumBytes = VOID_IS_8 ? 8 : 4;
+			} else
+			{
+				elemNumBytes = getLength(oldStruct[0]);
+			}
+			printf(" /* %d bytes */",elemNumBytes*arrayDimensions);
+			if (j == len-1) {
+				printf(";}");
+			} else {
+				printf("; ");
+			}
+			totalBytes+=elemNumBytes*arrayDimensions;
+		}
+		printf("\ntotalBytes=%d\n\n",totalBytes);
+	}
+#if 0
+	/* dump out display of types and their sizes */
+	for (i=0; i<bf->types_count; ++i) {
+		/* if (!bf->types[i].is_struct)*/
+		{
+			printf("%3d: sizeof(%s%s)=%d",
+				i,
+				bf->types[i].is_struct ? "struct " : "atomic ",
+				bf->types[i].name, bf->types[i].size);
+			if (bf->types[i].is_struct) {
+				int j;
+				printf(", %d fields: { ", bf->types[i].fieldtypes_count);
+				for (j=0; j<bf->types[i].fieldtypes_count; ++j) {
+					printf("%s %s",
+						bf->types[bf->types[i].fieldtypes[j]].name,
+						bf->names[bf->types[i].fieldnames[j]]);
+					if (j == bf->types[i].fieldtypes_count-1) {
+						printf(";}");
+					} else {
+						printf("; ");
+					}
+				}
+			}
+			printf("\n\n");
+		}
+	}
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3DNA.h b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3DNA.h
new file mode 100644
index 00000000..6e60087c
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3DNA.h
@@ -0,0 +1,110 @@
+Copyright (c) 2006-2009 Charlie C & Erwin Coumans  http://gamekit.googlecode.com
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef __BDNA_H__
+#define __BDNA_H__
+#include "b3Common.h"
+namespace bParse {
+	struct	bNameInfo	// per-name record filled in by bDNA::init
+	{
+		char*	m_name;	// points at the name text inside the DNA buffer (bDNA::init stores the pointer, not a copy)
+		bool	m_isPointer;	// set by bDNA::init when a '*' appears as the first or second character of the name
+		int		m_dim0;	// array dimensions parsed from the name by name_is_array(); 1 when absent
+		int		m_dim1;
+	};
+	class bDNA
+	{
+	public:
+		bDNA();
+		~bDNA();
+		void init(char *data, int len, bool swap=false);
+		int getArraySize(char* str);
+		int getArraySizeNew(short name)
+		{
+			const bNameInfo& nameInfo = m_Names[name];
+			return nameInfo.m_dim0*nameInfo.m_dim1;
+		}
+		int getElementSize(short type, short name)
+		{
+			const bNameInfo& nameInfo = m_Names[name];
+			int size = nameInfo.m_isPointer ? mPtrLen*nameInfo.m_dim0*nameInfo.m_dim1 : mTlens[type]*nameInfo.m_dim0*nameInfo.m_dim1;
+			return size;
+		}
+		int	getNumNames() const
+		{
+			return m_Names.size();
+		}
+		char *getName(int ind);
+		char *getType(int ind);
+		short *getStruct(int ind);
+		short getLength(int ind);
+		int getReverseType(short type);
+		int getReverseType(const char *type);
+		int getNumStructs();
+		//
+		bool lessThan(bDNA* other);
+		void initCmpFlags(bDNA *memDNA);
+		bool flagNotEqual(int dna_nr);
+		bool flagEqual(int dna_nr);
+		bool flagNone(int dna_nr);
+		int getPointerSize();
+		void	dumpTypeDefinitions();
+	private:
+		enum FileDNAFlags
+		{
+			FDF_NONE=0,
+		};
+		void initRecurseCmpFlags(int i);
+		b3AlignedObjectArray<int>			mCMPFlags;
+		b3AlignedObjectArray<bNameInfo>			m_Names;
+		b3AlignedObjectArray<char*>			mTypes;
+		b3AlignedObjectArray<short*>			mStructs;
+		b3AlignedObjectArray<short>			mTlens;
+		b3HashMap<b3HashInt, int>			mStructReverse;
+		b3HashMap<b3HashString,int>	mTypeLookup;
+		int							mPtrLen;
+	};
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Defines.h b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Defines.h
new file mode 100644
index 00000000..8f28d3c4
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Defines.h
@@ -0,0 +1,136 @@
+/* Copyright (C) 2006-2009 Charlie C & Erwin Coumans http://gamekit.googlecode.com
+* This software is provided 'as-is', without any express or implied
+* warranty.  In no event will the authors be held liable for any damages
+* arising from the use of this software.
+* Permission is granted to anyone to use this software for any purpose,
+* including commercial applications, and to alter it and redistribute it
+* freely, subject to the following restrictions:
+* 1. The origin of this software must not be misrepresented; you must not
+*    claim that you wrote the original software. If you use this software
+*    in a product, an acknowledgment in the product documentation would be
+*    appreciated but is not required.
+* 2. Altered source versions must be plainly marked as such, and must not be
+*    misrepresented as being the original software.
+* 3. This notice may not be removed or altered from any source distribution.
+#ifndef __B_DEFINES_H__
+#define __B_DEFINES_H__
+// MISC defines, see BKE_global.h, BKE_utildefines.h
+// ------------------------------------------------------------
+#if defined(__sgi) || defined (__sparc) || defined (__sparc__) || defined (__PPC__) || defined (__ppc__) || defined (__BIG_ENDIAN__)
+#	define B3_MAKE_ID(a,b,c,d) ( (int)(a)<<24 | (int)(b)<<16 | (c)<<8 | (d) )
+#	define B3_MAKE_ID(a,b,c,d) ( (int)(d)<<24 | (int)(c)<<16 | (b)<<8 | (a) )
+// ------------------------------------------------------------
+#if defined(__sgi) || defined(__sparc) || defined(__sparc__) || defined (__PPC__) || defined (__ppc__) || defined (__BIG_ENDIAN__)
+#	define B3_MAKE_ID2(c, d) ( (c)<<8 | (d) )
+#	define B3_MAKE_ID2(c, d) ( (d)<<8 | (c) )
+// ------------------------------------------------------------
+#define B3_ID_SCE		B3_MAKE_ID2('S', 'C')
+#define B3_ID_LI		B3_MAKE_ID2('L', 'I')
+#define B3_ID_OB		B3_MAKE_ID2('O', 'B')
+#define B3_ID_ME		B3_MAKE_ID2('M', 'E')
+#define B3_ID_CU		B3_MAKE_ID2('C', 'U')
+#define B3_ID_MB		B3_MAKE_ID2('M', 'B')
+#define B3_ID_MA		B3_MAKE_ID2('M', 'A')
+#define B3_ID_TE		B3_MAKE_ID2('T', 'E')
+#define B3_ID_IM		B3_MAKE_ID2('I', 'M')
+#define B3_ID_IK		B3_MAKE_ID2('I', 'K')
+#define B3_ID_WV		B3_MAKE_ID2('W', 'V')
+#define B3_ID_LT		B3_MAKE_ID2('L', 'T')
+#define B3_ID_SE		B3_MAKE_ID2('S', 'E')
+#define B3_ID_LF		B3_MAKE_ID2('L', 'F')
+#define B3_ID_LA		B3_MAKE_ID2('L', 'A')
+#define B3_ID_CA		B3_MAKE_ID2('C', 'A')
+#define B3_ID_IP		B3_MAKE_ID2('I', 'P')
+#define B3_ID_KE		B3_MAKE_ID2('K', 'E')
+#define B3_ID_WO		B3_MAKE_ID2('W', 'O')
+#define B3_ID_SCR		B3_MAKE_ID2('S', 'R')
+#define B3_ID_VF		B3_MAKE_ID2('V', 'F')
+#define B3_ID_TXT		B3_MAKE_ID2('T', 'X')
+#define B3_ID_SO		B3_MAKE_ID2('S', 'O')
+#define B3_ID_SAMPLE	B3_MAKE_ID2('S', 'A')
+#define B3_ID_GR		B3_MAKE_ID2('G', 'R')
+#define B3_ID_ID		B3_MAKE_ID2('I', 'D')
+#define B3_ID_AR		B3_MAKE_ID2('A', 'R')
+#define B3_ID_AC		B3_MAKE_ID2('A', 'C')
+#define B3_ID_SCRIPT	B3_MAKE_ID2('P', 'Y')
+#define B3_ID_FLUIDSIM	B3_MAKE_ID2('F', 'S')
+#define B3_ID_NT		B3_MAKE_ID2('N', 'T')
+#define B3_ID_BR		B3_MAKE_ID2('B', 'R')
+#define B3_ID_SEQ		B3_MAKE_ID2('S', 'Q')
+#define B3_ID_CO		B3_MAKE_ID2('C', 'O')
+#define B3_ID_PO		B3_MAKE_ID2('A', 'C')
+#define B3_ID_NLA		B3_MAKE_ID2('N', 'L')
+#define B3_ID_VS		B3_MAKE_ID2('V', 'S')
+#define B3_ID_VN		B3_MAKE_ID2('V', 'N')
+// ------------------------------------------------------------
+#define B3_FORM B3_MAKE_ID('F','O','R','M')
+#define B3_DDG1 B3_MAKE_ID('3','D','G','1')
+#define B3_DDG2 B3_MAKE_ID('3','D','G','2')
+#define B3_DDG3 B3_MAKE_ID('3','D','G','3')
+#define B3_DDG4 B3_MAKE_ID('3','D','G','4')
+#define B3_GOUR B3_MAKE_ID('G','O','U','R')
+#define B3_BLEN B3_MAKE_ID('B','L','E','N')
+#define B3_DER_ B3_MAKE_ID('D','E','R','_')
+#define B3_V100 B3_MAKE_ID('V','1','0','0')
+#define B3_DATA B3_MAKE_ID('D','A','T','A')
+#define B3_GLOB B3_MAKE_ID('G','L','O','B')
+#define B3_IMAG B3_MAKE_ID('I','M','A','G')
+#define B3_TEST B3_MAKE_ID('T','E','S','T')
+#define B3_USER B3_MAKE_ID('U','S','E','R')
+// ------------------------------------------------------------
+#define B3_DNA1 B3_MAKE_ID('D','N','A','1')
+#define B3_REND B3_MAKE_ID('R','E','N','D')
+#define B3_ENDB B3_MAKE_ID('E','N','D','B')
+#define B3_NAME B3_MAKE_ID('N','A','M','E')
+#define B3_SDNA B3_MAKE_ID('S','D','N','A')
+#define B3_TYPE B3_MAKE_ID('T','Y','P','E')
+#define B3_TLEN B3_MAKE_ID('T','L','E','N')
+#define B3_STRC B3_MAKE_ID('S','T','R','C')
+// ------------------------------------------------------------
+#define B3_SWITCH_INT(a) { \
+    char s_i, *p_i; \
+    p_i= (char *)&(a); \
+    s_i=p_i[0]; p_i[0]=p_i[3]; p_i[3]=s_i; \
+    s_i=p_i[1]; p_i[1]=p_i[2]; p_i[2]=s_i; }
+// ------------------------------------------------------------
+#define B3_SWITCH_SHORT(a)	{ \
+    char s_i, *p_i; \
+	p_i= (char *)&(a); \
+	s_i=p_i[0]; p_i[0]=p_i[1]; p_i[1]=s_i; }
+// ------------------------------------------------------------
+#define B3_SWITCH_LONGINT(a) { \
+    char s_i, *p_i; \
+    p_i= (char *)&(a);  \
+    s_i=p_i[0]; p_i[0]=p_i[7]; p_i[7]=s_i; \
+    s_i=p_i[1]; p_i[1]=p_i[6]; p_i[6]=s_i; \
+    s_i=p_i[2]; p_i[2]=p_i[5]; p_i[5]=s_i; \
+    s_i=p_i[3]; p_i[3]=p_i[4]; p_i[4]=s_i; }
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3File.cpp b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3File.cpp
new file mode 100644
index 00000000..71d3b9e3
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3File.cpp
@@ -0,0 +1,1738 @@
+Copyright (c) 2006-2009 Charlie C & Erwin Coumans  http://gamekit.googlecode.com
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "b3File.h"
+#include "b3Common.h"
+#include "b3Chunk.h"
+#include "b3DNA.h"
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include "b3Defines.h"
+#include "Bullet3Serialize/Bullet2FileLoader/b3Serializer.h"
+#include "Bullet3Common/b3AlignedAllocator.h"
+#include "Bullet3Common/b3MinMax.h"
+#define MAX_ARRAY_LENGTH 512
+using namespace bParse;
+#define MAX_STRLEN 1024
+// Copies memName into buffer, writing a NUL in place of every '[' or ']', so the result read
+// as a C string ends at the first bracket. Returns buffer.
+// NOTE(review): buffer is assumed to hold at least MAX_STRLEN bytes - confirm at the call sites.
+const char* getCleanName(const char* memName, char* buffer)
+	int slen = strlen(memName);
+	assert(slen<MAX_STRLEN);
+	// Clamp to MAX_STRLEN-1 (the largest length the assert accepts) so the terminator written
+	// below stays inside MAX_STRLEN bytes even when assert() is compiled out.
+	slen=b3Min(slen,MAX_STRLEN-1);
+	for (int i=0;i<slen;i++)
+	{
+		if (memName[i]==']'||memName[i]=='[')
+		{
+			buffer[i] = 0;//'_';
+		} else
+		{
+			buffer[i] = memName[i];
+		}
+	}
+	buffer[slen]=0;
+	return buffer;
+// ----------------------------------------------------- //
+// Loads the whole file `filename` into a malloc'ed buffer owned by this object and parses its
+// header. headerString supplies the 7 characters copied into m_headerString.
+// If the file cannot be opened, is empty, or cannot be read completely, no buffer is kept
+// and mFileLen stays 0, so parseHeader() returns without marking the file as usable.
+bFile::bFile(const char *filename, const char headerString[7])
+	:	mOwnsBuffer(true),
+		mFileBuffer(0),
+		mFileLen(0),
+		mVersion(0),
+		mDataStart(0),
+		mFileDNA(0),
+		mMemoryDNA(0),
+		mFlags(FD_INVALID)
+	for (int i=0;i<7;i++)
+	{
+		m_headerString[i] = headerString[i];
+	}
+	FILE *fp = fopen(filename, "rb");
+	if (fp)
+	{
+		fseek(fp, 0L, SEEK_END);
+		// ftell() reports -1 on failure; only a positive length is worth allocating for.
+		long fileLen = ftell(fp);
+		fseek(fp, 0L, SEEK_SET);
+		if (fileLen > 0)
+			mFileBuffer = (char*)malloc(fileLen+1);
+		if (mFileBuffer)
+		{
+			if (fread(mFileBuffer, fileLen, 1, fp) == 1)
+			{
+				// The spare byte terminates the buffer so string scans cannot run past it.
+				mFileBuffer[fileLen] = 0;
+				mFileLen = (int)fileLen;
+			}
+			else
+			{
+				// Short read: do not hand partially initialised memory to the parser.
+				free(mFileBuffer);
+				mFileBuffer = 0;
+			}
+		}
+		fclose(fp);
+		//
+		parseHeader();
+	}
+// ----------------------------------------------------- //
+bFile::bFile( char *memoryBuffer, int len, const char headerString[7])	// wraps a caller-supplied buffer of len bytes and parses its header
+:	mOwnsBuffer(false),	// the buffer is not marked as owned by this object
+	mFileBuffer(0),
+		mFileLen(0),
+		mVersion(0),
+		mDataStart(0),
+		mFileDNA(0),
+		mMemoryDNA(0),
+		mFlags(FD_INVALID)
+	for (int i=0;i<7;i++)
+	{
+		m_headerString[i] = headerString[i];	// exactly 7 characters are copied
+	}
+	mFileBuffer = memoryBuffer;	// pointer is stored as-is; the data is not copied
+	mFileLen = len;
+	parseHeader();
+// ----------------------------------------------------- //
+	if (mOwnsBuffer && mFileBuffer)
+	{
+		free(mFileBuffer);
+		mFileBuffer = 0;
+	}
+	delete mMemoryDNA;
+	delete mFileDNA;
+// ----------------------------------------------------- //
+// Inspects the fixed-size header at the start of mFileBuffer: checks the first 6 characters
+// against m_headerString, reads the version number from offset 9, and derives the
+// pointer-size and endianness flags from header bytes 7 and 8. FD_OK is set only when the
+// magic matches; every early return leaves mFlags as it was.
+void bFile::parseHeader()
+	if (!mFileLen || !mFileBuffer)
+		return;
+	// A buffer shorter than the header cannot be valid; bail out before copying from it.
+	if (mFileLen < (int)B3_SIZEOFBLENDERHEADER)
+		return;
+	char *blenderBuf = mFileBuffer;
+	char header[B3_SIZEOFBLENDERHEADER+1] ;
+	memcpy(header, blenderBuf, B3_SIZEOFBLENDERHEADER);
+	// Terminate the copy so atoi() below cannot run into the uninitialised spare byte.
+	header[B3_SIZEOFBLENDERHEADER] = 0;
+	if (strncmp(header, m_headerString, 6)!=0)
+	{
+		// Magic mismatch. (A former memcpy into the local header here was a dead store that
+		// also read B3_SIZEOFBLENDERHEADER bytes out of m_headerString.)
+		return;
+	}
+	if (header[6] == 'd')
+	{
+	}
+	char *ver = header+9;
+	mVersion = atoi(ver);
+	if (mVersion <= 241)
+	{
+		//printf("Warning, %d not fully tested : <= 242\n", mVersion);
+	}
+	// Detect host endianness: the first byte of the int 1 is 1 only on little-endian hosts.
+	int littleEndian= 1;
+	littleEndian= ((char*)&littleEndian)[0];
+	// swap ptr sizes...
+	if (header[7]=='-')
+	{
+		mFlags |= FD_FILE_64;
+		if (!VOID_IS_8)
+			mFlags |= FD_BITS_VARIES;
+	}
+	else if (VOID_IS_8) mFlags |= FD_BITS_VARIES;
+	// swap endian...
+	if (header[8]=='V')
+	{
+		if (littleEndian ==1)
+			mFlags |= FD_ENDIAN_SWAP;
+	}
+	else
+		if (littleEndian==0)
+			mFlags |= FD_ENDIAN_SWAP;
+	mFlags |= FD_OK;
+// ----------------------------------------------------- //
+bool bFile::ok()	// true when the FD_OK bit is set in mFlags (set at the end of parseHeader)
+	return (mFlags &FD_OK)!=0;
+// ----------------------------------------------------- //
+void bFile::parseInternal(int verboseMode, char* memDna,int memDnaLength)  // Finds the SDNA description in mFileBuffer, builds the file and memory DNA tables, then parses the data and resolves pointers.
+	if ( (mFlags &FD_OK) ==0)
+		return;  // nothing is parsed unless FD_OK is set
+	char *blenderData = mFileBuffer;
+	bChunkInd dna;
+	dna.oldPtr = 0;
+	char *tempBuffer = blenderData;
+	for (int i=0; i<mFileLen; i++)  // NOTE(review): the strncmp calls below compare up to 8 bytes from tempBuffer and may read past mFileLen near the end of the buffer - confirm the buffer is padded
+	{
+		// looking for the data's starting position
+		// and the start of SDNA decls
+		if (!mDataStart && strncmp(tempBuffer, "REND", 4)==0)
+			mDataStart = i;
+		if (strncmp(tempBuffer, "DNA1", 4)==0)
+		{
+			// read the DNA1 block and extract SDNA
+			if (getNextBlock(&dna, tempBuffer, mFlags) > 0)
+			{
+				if (strncmp((tempBuffer + ChunkUtils::getOffset(mFlags)), "SDNANAME", 8) ==0)
+					dna.oldPtr = (tempBuffer + ChunkUtils::getOffset(mFlags));
+				else dna.oldPtr = 0;
+			}
+			else dna.oldPtr = 0;
+		}
+		// Some Bullet files are missing the DNA1 block
+		// In Blender it's DNA1 + ChunkUtils::getOffset() + SDNA + NAME
+		// In Bullet tests its SDNA + NAME
+		else if (strncmp(tempBuffer, "SDNANAME", 8) ==0)
+		{
+			dna.oldPtr = blenderData + i;
+			dna.len = mFileLen-i;  // everything from here to the end of the buffer is handed to the DNA parser
+			// Also no REND block, so exit now.
+			if (mVersion==276) break;
+		}
+        if (mDataStart && dna.oldPtr) break;  // stop scanning once both the data start and the DNA block are known
+		tempBuffer++;
+	}
+	if (!dna.oldPtr || !dna.len)
+	{
+		//printf("Failed to find DNA1+SDNA pair\n");
+		mFlags &= ~FD_OK;  // clear FD_OK so ok() reports failure
+		return;
+	}
+	mFileDNA = new bDNA();
+	///mFileDNA->init will convert part of DNA file endianness to current CPU endianness if necessary
+	mFileDNA->init((char*)dna.oldPtr, dna.len, (mFlags & FD_ENDIAN_SWAP)!=0);
+	if (mVersion==276)
+	{
+		int i;
+		for (i=0;i<mFileDNA->getNumNames();i++)
+		{
+			if (strcmp(mFileDNA->getName(i),"int")==0)
+			{
+				mFlags |= FD_BROKEN_DNA;  // a name that is literally "int" marks this version-276 DNA as broken
+			}
+		}
+		if ((mFlags&FD_BROKEN_DNA)!=0)
+		{
+			//printf("warning: fixing some broken DNA version\n");
+		}
+	}
+		mFileDNA->dumpTypeDefinitions();  // NOTE(review): the extra indentation suggests a guarding condition used to precede this call - confirm
+	mMemoryDNA = new bDNA();
+	int littleEndian= 1;
+	littleEndian= ((char*)&littleEndian)[0];  // first byte of the int 1: 1 on little-endian hosts, 0 on big-endian hosts
+	mMemoryDNA->init(memDna,memDnaLength,littleEndian==0);  // swap flag is true only on big-endian hosts
+	///@todo we need a better version check, add version/sub version info from FileGlobal into memory DNA/header files
+	if (mMemoryDNA->getNumNames() != mFileDNA->getNumNames())
+	{
+		//printf ("Warning, file DNA is different than built in, performance is reduced. Best to re-export file with a matching version/platform");
+	}
+	// as long as it kept up to date it will be ok!!
+	if (mMemoryDNA->lessThan(mFileDNA))
+	{
+		//printf ("Warning, file DNA is newer than built in.");
+	}
+	mFileDNA->initCmpFlags(mMemoryDNA);
+	parseData();
+	resolvePointers(verboseMode);
+	updateOldPointers();
+// ----------------------------------------------------- //
+void bFile::swap(char *head, bChunkInd& dataChunk, bool ignoreEndianFlag)  // Byte-swaps every struct instance in a chunk's payload, except "ARAY" chunks whose type is SoftBodyMaterialData.
+	char *data = head;
+	short *strc = mFileDNA->getStruct(dataChunk.dna_nr);
+	const char s[] = "SoftBodyMaterialData";
+	int szs = sizeof(s);  // sizeof includes the terminating NUL, so the strncmp below is an exact-match test
+	if (strncmp((char*)&dataChunk.code,"ARAY",4)==0)
+	{
+		short *oldStruct = mFileDNA->getStruct(dataChunk.dna_nr);
+		char *oldType = mFileDNA->getType(oldStruct[0]);
+		if (strncmp(oldType,s,szs)==0)
+		{
+			return;  // this payload is left untouched
+		}
+	}
+	int len = mFileDNA->getLength(strc[0]);  // stride used to step from one instance to the next
+	for (int i=0; i<dataChunk.nr; i++)
+	{
+		swapStruct(dataChunk.dna_nr, data,ignoreEndianFlag);
+		data+=len;
+	}
+void bFile::swapLen(char *dataPtr)  // Byte-swaps the len, dna_nr and nr fields of the chunk header at dataPtr in place, choosing the 4- or 8-byte-pointer header layout.
+	const bool VOID_IS_8 = ((sizeof(void*)==8));
+	if (VOID_IS_8)
+	{
+		if (mFlags &FD_BITS_VARIES)  // 64-bit host, FD_BITS_VARIES set: header is read with the 4-byte-pointer layout
+		{
+			bChunkPtr4*c = (bChunkPtr4*) dataPtr;
+			if ((c->code & 0xFFFF)==0)
+					c->code >>=16;  // low 16 bits are clear: move the upper half down
+			B3_SWITCH_INT(c->len);
+			B3_SWITCH_INT(c->dna_nr);
+			B3_SWITCH_INT(c->nr);
+		} else
+		{
+			bChunkPtr8* c = (bChunkPtr8*) dataPtr;
+			if ((c->code & 0xFFFF)==0)
+				c->code >>=16;
+			B3_SWITCH_INT(c->len);
+			B3_SWITCH_INT(c->dna_nr);
+			B3_SWITCH_INT(c->nr);
+		}
+	} else
+	{
+		if (mFlags &FD_BITS_VARIES)  // 32-bit host, FD_BITS_VARIES set: header is read with the 8-byte-pointer layout
+		{
+			bChunkPtr8*c = (bChunkPtr8*) dataPtr;
+			if ((c->code & 0xFFFF)==0)
+				c->code >>=16;
+			B3_SWITCH_INT(c->len);
+			B3_SWITCH_INT(c->dna_nr);
+			B3_SWITCH_INT(c->nr);
+		} else
+		{
+			bChunkPtr4* c = (bChunkPtr4*) dataPtr;
+			if ((c->code & 0xFFFF)==0)
+				c->code >>=16;
+			B3_SWITCH_INT(c->len);
+			B3_SWITCH_INT(c->dna_nr);
+			B3_SWITCH_INT(c->nr);
+		}
+	}
+void bFile::swapDNA(char* ptr)  // Walks the SDNA block that starts 20 bytes into ptr and byte-swaps its counts and short tables in place.
+	bool swap = ((mFlags & FD_ENDIAN_SWAP)!=0);  // when set, counts are swapped before being used as loop bounds
+	char* data = &ptr[20];  // NOTE(review): hard-coded 20-byte offset, presumably the chunk header size - confirm for files with 8-byte pointers
+//	void bDNA::init(char *data, int len, bool swap)
+	int *intPtr=0;short *shtPtr=0;
+	char *cp = 0;int dataLen =0;long nr=0;
+	intPtr = (int*)data;
+	/*
+		SDNA (4 bytes) (magic number)
+		NAME (4 bytes)
+		<nr> (4 bytes) amount of names (int)
+		<string>
+		<string>
+	*/
+	if (strncmp(data, "SDNA", 4)==0)
+	{
+		// skip ++ NAME
+		intPtr++; intPtr++;
+	}
+	// Parse names
+	if (swap)
+		dataLen = ChunkUtils::swapInt(*intPtr);
+	else
+		dataLen = *intPtr;
+	*intPtr = ChunkUtils::swapInt(*intPtr);  // the stored count itself is swapped unconditionally
+	intPtr++;
+	cp = (char*)intPtr;
+	int i;
+	for ( i=0; i<dataLen; i++)
+	{
+		while (*cp)cp++;  // step over one NUL-terminated name
+		cp++;
+	}
+	cp = b3AlignPointer(cp,4);
+	/*
+		TYPE (4 bytes)
+		<nr> amount of types (int)
+		<string>
+		<string>
+	*/
+	intPtr = (int*)cp;
+	assert(strncmp(cp, "TYPE", 4)==0); intPtr++;
+	if (swap)
+		dataLen = ChunkUtils::swapInt(*intPtr);
+	else
+		dataLen = *intPtr;
+	*intPtr = ChunkUtils::swapInt(*intPtr);
+	intPtr++;
+	cp = (char*)intPtr;
+	for ( i=0; i<dataLen; i++)
+	{
+		while (*cp)cp++;
+		cp++;
+	}
+	cp = b3AlignPointer(cp,4);
+	/*
+		TLEN (4 bytes)
+		<len> (short) the lengths of types
+		<len>
+	*/
+	// Parse type lens
+	intPtr = (int*)cp;
+	assert(strncmp(cp, "TLEN", 4)==0); intPtr++;
+	shtPtr = (short*)intPtr;
+	for ( i=0; i<dataLen; i++, shtPtr++)  // dataLen still holds the type count read in the TYPE section
+	{
+		//??????if (swap)
+			shtPtr[0] = ChunkUtils::swapShort(shtPtr[0]);
+	}
+	if (dataLen & 1)
+		shtPtr++;  // odd count: step over one extra short before the STRC tag
+	/*
+		STRC (4 bytes)
+		<nr> amount of structs (int)
+		<typenr>
+		<nr_of_elems>
+		<typenr>
+		<namenr>
+		<typenr>
+		<namenr>
+	*/
+	intPtr = (int*)shtPtr;
+	cp = (char*)intPtr;
+	assert(strncmp(cp, "STRC", 4)==0);
+	intPtr++;
+	if (swap)
+		dataLen = ChunkUtils::swapInt(*intPtr);
+	else
+		dataLen = *intPtr;
+	*intPtr = ChunkUtils::swapInt(*intPtr);
+	intPtr++;
+	shtPtr = (short*)intPtr;
+	for ( i=0; i<dataLen; i++)
+	{
+		//if (swap)
+		{
+			int len = shtPtr[1];  // NOTE(review): the element count is read in the buffer's pre-swap byte order, unlike the counts above which honour `swap` - confirm this is intended
+			shtPtr[0]= ChunkUtils::swapShort(shtPtr[0]);
+			shtPtr[1]= ChunkUtils::swapShort(shtPtr[1]);
+			shtPtr+= 2;
+			for (int a=0; a<len; a++, shtPtr+=2)  // each element is a (type index, name index) pair of shorts
+			{
+				shtPtr[0]= ChunkUtils::swapShort(shtPtr[0]);
+				shtPtr[1]= ChunkUtils::swapShort(shtPtr[1]);
+			}
+		}
+//		else
+//			shtPtr+= (2*shtPtr[1])+2;
+	}
+void bFile::writeFile(const char* fileName)
+	FILE* f = fopen(fileName,"wb");
+	fwrite(mFileBuffer,1,mFileLen,f);
+	fclose(f);
+void bFile::preSwap()  // Converts the chunks in mFileBuffer to the opposite byte order in place, flips the header's endian marker and toggles FD_ENDIAN_SWAP.
+	const bool brokenDNA = (mFlags&FD_BROKEN_DNA)!=0;  // not referenced again in this function
+	//byte 8 determines the endianness of the file, little (v) versus big (V)
+	int littleEndian= 1;
+	littleEndian= ((char*)&littleEndian)[0];  // not referenced again in this function
+	if (mFileBuffer[8]=='V')
+	{
+		mFileBuffer[8]='v';
+	}
+	else
+	{
+		mFileBuffer[8]='V';
+	}
+	mDataStart = 12;  // the first chunk header is taken to start at byte offset 12
+	char *dataPtr = mFileBuffer+mDataStart;
+	bChunkInd dataChunk;
+	dataChunk.code = 0;
+	bool ignoreEndianFlag = true;
+	//we always want to swap here
+	int seek = getNextBlock(&dataChunk, dataPtr, mFlags);
+	//dataPtr += ChunkUtils::getOffset(mFlags);
+	char *dataPtrHead = 0;
+	while (1)
+	{
+		// one behind
+		if (dataChunk.code == B3_SDNA || dataChunk.code==B3_DNA1 || dataChunk.code == B3_TYPE || dataChunk.code == B3_TLEN || dataChunk.code==B3_STRC)
+		{
+			swapDNA(dataPtr);  // DNA-related chunk reached: swap its tables and stop walking
+			break;
+		} else
+		{
+			//if (dataChunk.code == DNA1) break;
+			dataPtrHead = dataPtr+ChunkUtils::getOffset(mFlags);
+			swapLen(dataPtr);  // swap the header fields first, then the payload below
+			if (dataChunk.dna_nr>=0)
+			{
+				swap(dataPtrHead, dataChunk,ignoreEndianFlag);
+			} else
+			{
+				//printf("unknown chunk\n");
+			}
+		}
+		// next please!
+		dataPtr += seek;
+		seek =  getNextBlock(&dataChunk, dataPtr, mFlags);
+		if (seek < 0)  // getNextBlock returns -1 when the decoded length is negative
+			break;
+	}
+	if (mFlags & FD_ENDIAN_SWAP)
+	{
+		mFlags &= ~FD_ENDIAN_SWAP;
+	} else
+	{
+		mFlags |= FD_ENDIAN_SWAP;
+	}
+// ----------------------------------------------------- //
+char* bFile::readStruct(char *head, bChunkInd&  dataChunk)  // Returns a newly allocated, tracked copy of the chunk's data, rebuilt in the memory DNA layout when the file layout is flagged as different; may return 0.
+	bool ignoreEndianFlag = false;
+	if (mFlags & FD_ENDIAN_SWAP)
+		swap(head, dataChunk, ignoreEndianFlag);  // the source payload is byte-swapped in place first
+	if (!mFileDNA->flagEqual(dataChunk.dna_nr))
+	{
+		// Ouch! need to rebuild the struct
+		short *oldStruct,*curStruct;
+		char *oldType, *newType;
+		int oldLen, curLen, reverseOld;
+		oldStruct = mFileDNA->getStruct(dataChunk.dna_nr);
+		oldType = mFileDNA->getType(oldStruct[0]);
+		oldLen = mFileDNA->getLength(oldStruct[0]);
+		if ((mFlags&FD_BROKEN_DNA)!=0)
+		{
+			if ((strcmp(oldType,"b3QuantizedBvhNodeData")==0)&&oldLen==20)
+			{
+				return 0;  // the caller receives a null pointer for this case
+			}
+			if ((strcmp(oldType,"b3ShortIntIndexData")==0))
+			{
+				int allocLen = 2;  // payload is copied as dataChunk.nr two-byte values
+	    		char *dataAlloc = new char[(dataChunk.nr*allocLen)+1];
+				memset(dataAlloc, 0, (dataChunk.nr*allocLen)+1);
+				short* dest = (short*) dataAlloc;
+				const short* src = (short*) head;
+				for (int i=0;i<dataChunk.nr;i++)
+				{
+					dest[i] = src[i];
+					if (mFlags &FD_ENDIAN_SWAP)
+					{
+						B3_SWITCH_SHORT(dest[i]);
+					}
+				}
+				addDataBlock(dataAlloc);
+				return dataAlloc;
+			}
+		}
+		///don't try to convert Link block data, just memcpy it. Other data can be converted.
+		if (strcmp("Link",oldType)!=0)
+		{
+			reverseOld = mMemoryDNA->getReverseType(oldType);
+			if ((reverseOld!=-1))
+			{
+				// make sure it's here
+				//assert(reverseOld!= -1 && "getReverseType() returned -1, struct required!");
+				//
+				curStruct = mMemoryDNA->getStruct(reverseOld);
+				newType = mMemoryDNA->getType(curStruct[0]);
+				curLen = mMemoryDNA->getLength(curStruct[0]);
+				// make sure it's the same
+				assert((strcmp(oldType, newType)==0) && "internal error, struct mismatch!");
+				// numBlocks * length
+                int allocLen = (curLen);
+    			char *dataAlloc = new char[(dataChunk.nr*allocLen)+1];
+				memset(dataAlloc, 0, (dataChunk.nr*allocLen));  // NOTE(review): clears one byte fewer than was allocated on the previous line
+				// track allocated
+				addDataBlock(dataAlloc);
+				char *cur = dataAlloc;
+				char *old = head;
+				for (int block=0; block<dataChunk.nr; block++)
+				{
+					bool fixupPointers = true;
+					parseStruct(cur, old, dataChunk.dna_nr, reverseOld, fixupPointers);
+					mLibPointers.insert(old,(bStructHandle*)cur);  // record the old-address to new-address mapping for this instance
+					cur += curLen;
+					old += oldLen;
+				}
+				return dataAlloc;
+			}
+		} else
+		{
+			//printf("Link found\n");
+		}
+	} else
+	{
+		short *oldStruct;
+		char *oldType;
+		oldStruct = mFileDNA->getStruct(dataChunk.dna_nr);
+		oldType = mFileDNA->getType(oldStruct[0]);
+		printf("%s equal structure, just memcpy\n",oldType);  // NOTE(review): the #if matching the #endif on the next line is not visible here - confirm whether this printf is meant to be compiled in
+#endif //
+	}
+	char *dataAlloc = new char[(dataChunk.len)+1];  // fallback raw copy: reached for equal layouts, "Link" blocks, and types unknown to the memory DNA
+	memset(dataAlloc, 0, dataChunk.len+1);
+	// track allocated
+	addDataBlock(dataAlloc);
+	memcpy(dataAlloc, head, dataChunk.len);
+	return dataAlloc;
+// ----------------------------------------------------- //
+void bFile::parseStruct(char *strcPtr, char *dtPtr, int old_dna, int new_dna, bool fixupPointers)  // Fills the memory-layout struct at strcPtr member by member from the file-layout struct at dtPtr.
+	if (old_dna == -1) return;
+	if (new_dna == -1) return;
+	//disable this, because we need to fixup pointers/ListBase
+	if (0)//mFileDNA->flagEqual(old_dna))
+	{
+		short *strc = mFileDNA->getStruct(old_dna);
+		int len = mFileDNA->getLength(strc[0]);
+		memcpy(strcPtr, dtPtr, len);
+		return;
+	}
+	// Ok, now build the struct
+	char *memType, *memName, *cpc, *cpo;
+	short *fileStruct, *filePtrOld, *memoryStruct, *firstStruct;
+	int elementLength, size, revType, old_nr, new_nr, fpLen;
+	short firstStructType;
+	// File to memory lookup
+	memoryStruct = mMemoryDNA->getStruct(new_dna);
+	fileStruct = mFileDNA->getStruct(old_dna);
+	firstStruct = fileStruct;
+	filePtrOld = fileStruct;
+	firstStructType = mMemoryDNA->getStruct(0)[0];  // type indices at or above this value take the nested-struct path below
+	// Get number of elements
+	elementLength = memoryStruct[1];
+	memoryStruct+=2;
+	cpc = strcPtr; cpo = 0;
+	for (int ele=0; ele<elementLength; ele++, memoryStruct+=2)  // iterate the (type index, name index) pairs of the memory struct
+	{
+		memType = mMemoryDNA->getType(memoryStruct[0]);
+		memName = mMemoryDNA->getName(memoryStruct[1]);
+		size = mMemoryDNA->getElementSize(memoryStruct[0], memoryStruct[1]);
+		revType = mMemoryDNA->getReverseType(memoryStruct[0]);
+		if (revType != -1 && memoryStruct[0]>=firstStructType && memName[0] != '*')  // nested struct stored by value (name does not start with '*')
+		{
+			cpo = getFileElement(firstStruct, memName, memType, dtPtr, &filePtrOld);
+			if (cpo)
+			{
+				int arrayLen = mFileDNA->getArraySizeNew(filePtrOld[1]);
+				old_nr = mFileDNA->getReverseType(memType);
+				new_nr = revType;
+				fpLen = mFileDNA->getElementSize(filePtrOld[0], filePtrOld[1]);
+				if (arrayLen==1)
+				{
+					parseStruct(cpc, cpo, old_nr, new_nr,fixupPointers);
+				} else
+				{
+					char* tmpCpc = cpc;
+					char* tmpCpo = cpo;
+					for (int i=0;i<arrayLen;i++)
+					{
+						parseStruct(tmpCpc, tmpCpo, old_nr, new_nr,fixupPointers);
+						tmpCpc += size/arrayLen;  // per-element stride in the memory layout
+						tmpCpo += fpLen/arrayLen;  // per-element stride in the file layout
+					}
+				}
+				cpc+=size;
+				cpo+=fpLen;
+			}
+			else
+				cpc+=size;  // member absent from the file (or its type differs): skip it in the destination
+		}
+		else
+		{
+			getMatchingFileDNA(fileStruct, memName, memType, cpc, dtPtr,fixupPointers);  // every other member, including pointers, is copied or converted here
+			cpc+=size;
+		}
+	}
+// ----------------------------------------------------- //
+static void getElement(int arrayLen, const char *cur, const char *old, char *oldPtr, char *curData)  // Converts arrayLen scalars from the type named by `old` (read at oldPtr) to the type named by `cur` (written at curData).
+#define b3GetEle(value, current, type, cast, size, ptr)\
+	if (strcmp(current, type)==0)\
+	{\
+		value = (*(cast*)ptr);\
+		ptr += size;\
+	}
+#define b3SetEle(value, current, type, cast, size, ptr)\
+	if (strcmp(current, type)==0)\
+	{\
+		(*(cast*)ptr) = (cast)value;\
+		ptr += size;\
+	}
+	double value = 0.0;  // every value passes through a double between the read and the write
+	for (int i=0; i<arrayLen; i++)
+	{
+		b3GetEle(value, old, "char",   char,   sizeof(char),   oldPtr);  // only the Get whose type name equals `old` fires
+		b3SetEle(value, cur, "char",   char,   sizeof(char),   curData);  // only the Set whose type name equals `cur` fires
+		b3GetEle(value, old, "short",  short,  sizeof(short),  oldPtr);
+		b3SetEle(value, cur, "short",  short,  sizeof(short),  curData);
+		b3GetEle(value, old, "ushort",  unsigned short,  sizeof(unsigned short),  oldPtr);
+		b3SetEle(value, cur, "ushort",  unsigned short,  sizeof(unsigned short),  curData);
+		b3GetEle(value, old, "int",    int,    sizeof(int),    oldPtr);
+		b3SetEle(value, cur, "int",    int,    sizeof(int),    curData);
+		b3GetEle(value, old, "long",   int,    sizeof(int),    oldPtr);  // "long" is read and written with int width here
+		b3SetEle(value, cur, "long",   int,    sizeof(int),    curData);
+		b3GetEle(value, old, "float",  float,  sizeof(float),  oldPtr);
+		b3SetEle(value, cur, "float",  float,  sizeof(float),  curData);
+		b3GetEle(value, old, "double", double, sizeof(double), oldPtr);
+		b3SetEle(value, cur, "double", double, sizeof(double), curData);
+	}
+// ----------------------------------------------------- //
+void bFile::swapData(char *data, short type, int arraySize,bool ignoreEndianFlag)  // Byte-swaps arraySize elements at data in place: type codes 2-3 as 2-byte values, 4-7 as 4-byte values; other codes are left alone.
+	if (ignoreEndianFlag || (mFlags &FD_ENDIAN_SWAP))  // swap when forced by the caller or when FD_ENDIAN_SWAP is set
+	{
+		if (type == 2 || type == 3)
+		{
+			short *sp = (short*)data;
+			for (int i=0; i<arraySize; i++)
+			{
+				sp[0] = ChunkUtils::swapShort(sp[0]);
+				sp++;
+			}
+		}
+		if (type>3 && type <8)
+		{
+			char c;
+			char *cp = data;
+			for (int i=0; i<arraySize; i++)
+			{
+				c = cp[0];  // exchange bytes 0 and 3
+				cp[0] = cp[3];
+				cp[3] = c;
+				c = cp[1];  // exchange bytes 1 and 2
+				cp[1] = cp[2];
+				cp[2] = c;
+				cp+=4;
+			}
+		}
+	}
+void bFile::safeSwapPtr(char *dst, const char *src)  // Copies one pointer value from its file representation (src) to its memory representation (dst), converting between 4- and 8-byte widths.
+	int ptrFile = mFileDNA->getPointerSize();
+	int ptrMem = mMemoryDNA->getPointerSize();
+	if (!src && !dst)  // NOTE(review): returns only when both are null; a single null argument still reaches the copies below
+		return;
+	if (ptrFile == ptrMem)
+	{
+		memcpy(dst, src, ptrMem);
+	}
+	else if (ptrMem==4 && ptrFile==8)
+	{
+		b3PointerUid* oldPtr = (b3PointerUid*)src;
+		b3PointerUid* newPtr = (b3PointerUid*)dst;
+		if (oldPtr->m_uniqueIds[0] == oldPtr->m_uniqueIds[1])
+		{
+			//Bullet stores the 32bit unique ID in both upper and lower part of 64bit pointers
+			//so it can be used to distinguish between .blend and .bullet
+			newPtr->m_uniqueIds[0] = oldPtr->m_uniqueIds[0];
+		} else
+		{
+			//deal with pointers the Blender .blend style way, see
+			//readfile.c in the Blender source tree
+			b3Long64 longValue = *((b3Long64*)src);
+			//endian swap for 64bit pointer otherwise truncation will fail due to trailing zeros
+			if (mFlags & FD_ENDIAN_SWAP)
+				B3_SWITCH_LONGINT(longValue);
+			*((int*)dst) = (int)(longValue>>3);  // keep the low 32 bits of the value after shifting right by 3
+		}
+	}
+	else if (ptrMem==8 && ptrFile==4)
+	{
+		b3PointerUid* oldPtr = (b3PointerUid*)src;
+		b3PointerUid* newPtr = (b3PointerUid*)dst;
+		if (oldPtr->m_uniqueIds[0] == oldPtr->m_uniqueIds[1])
+		{
+			newPtr->m_uniqueIds[0] = oldPtr->m_uniqueIds[0];
+			newPtr->m_uniqueIds[1] = 0;
+		} else
+		{
+			*((b3Long64*)dst)= *((int*)src);  // widen the 32-bit value to 64 bits
+		}
+	}
+	else
+	{
+		printf ("%d %d\n", ptrFile,ptrMem);  // unsupported width combination: print both sizes, then assert
+		assert(0 && "Invalid pointer len");
+	}
+// ----------------------------------------------------- //
+void bFile::getMatchingFileDNA(short* dna_addr, const char* lookupName,  const char* lookupType, char *strcData, char *data, bool fixupPointers)  // Searches the file struct description at dna_addr for lookupName; when found, copies or converts that member from data into strcData.
+	// find the matching memory dna data
+	// to the file being loaded. Fill the
+	// memory with the file data...
+	int len = dna_addr[1];
+	dna_addr+=2;
+	for (int i=0; i<len; i++, dna_addr+=2)
+	{
+		const char* type = mFileDNA->getType(dna_addr[0]);
+		const char* name = mFileDNA->getName(dna_addr[1]);
+		int eleLen = mFileDNA->getElementSize(dna_addr[0], dna_addr[1]);
+		if ((mFlags&FD_BROKEN_DNA)!=0)
+		{
+			if ((strcmp(type,"short")==0)&&(strcmp(name,"int")==0))
+			{
+				eleLen = 0;  // with broken DNA, a short member named "int" is treated as occupying no bytes
+			}
+		}
+		if (strcmp(lookupName, name)==0)
+		{
+			//int arrayLenold = mFileDNA->getArraySize((char*)name.c_str());
+			int arrayLen = mFileDNA->getArraySizeNew(dna_addr[1]);
+			//assert(arrayLenold == arrayLen);
+			if (name[0] == '*')
+			{
+				// cast pointers
+				int ptrFile = mFileDNA->getPointerSize();
+				int ptrMem = mMemoryDNA->getPointerSize();
+				safeSwapPtr(strcData,data);  // the first (or only) pointer is always converted, even when fixupPointers is false
+				if (fixupPointers)
+				{
+					if (arrayLen > 1)
+					{
+						//void **sarray = (void**)strcData;
+						//void **darray = (void**)data;
+                        char *cpc, *cpo;
+						cpc = (char*)strcData;
+						cpo = (char*)data;
+						for (int a=0; a<arrayLen; a++)
+						{
+							safeSwapPtr(cpc, cpo);
+							m_pointerFixupArray.push_back(cpc);  // remember the destination slot for resolvePointersMismatch
+							cpc += ptrMem;
+							cpo += ptrFile;
+						}
+					}
+					else
+					{
+						if (name[1] == '*')
+							m_pointerPtrFixupArray.push_back(strcData);  // names starting with "**" go to the pointer-to-pointer list
+						else
+							m_pointerFixupArray.push_back(strcData);
+					}
+				}
+				else
+				{
+//					printf("skipped %s %s : %x\n",type.c_str(),name.c_str(),strcData);
+				}
+			}
+			else if (strcmp(type, lookupType)==0)
+				memcpy(strcData, data, eleLen);  // same type name in file and memory: raw copy
+			else
+				getElement(arrayLen, lookupType, type, data, strcData);  // differing type names: convert element by element
+			// --
+			return;
+		}
+		data+=eleLen;  // step past a member that did not match
+	}
+// ----------------------------------------------------- //
+char* bFile::getFileElement(short *firstStruct, char *lookupName, char *lookupType, char *data, short **foundPos)  // Returns the address within data of the member named lookupName when its type equals lookupType, else 0; on success *foundPos (if non-null) receives the member's descriptor.
+	short *old = firstStruct;//mFileDNA->getStruct(old_nr);
+	int elementLength = old[1];
+	old+=2;
+	for (int i=0; i<elementLength; i++, old+=2)
+	{
+		char* type = mFileDNA->getType(old[0]);
+		char* name = mFileDNA->getName(old[1]);
+		int len = mFileDNA->getElementSize(old[0], old[1]);
+		if (strcmp(lookupName, name)==0)
+		{
+			if (strcmp(type, lookupType)==0)
+			{
+				if (foundPos)
+					*foundPos = old;
+				return data;
+			}
+			return 0;  // name matched but the type differs
+		}
+		data+=len;  // advance past this member's bytes
+	}
+	return 0;  // no member with that name
+// ----------------------------------------------------- //
+void bFile::swapStruct(int dna_nr, char *data,bool ignoreEndianFlag)  // Recursively byte-swaps one struct instance at data, member by member, following the file DNA description.
+	if (dna_nr == -1) return;
+	short *strc = mFileDNA->getStruct(dna_nr);
+	//short *firstStrc = strc;
+	int elementLen= strc[1];
+	strc+=2;
+	short first = mFileDNA->getStruct(0)[0];  // type indices at or above this value take the nested-struct path below
+	char *buf = data;
+	for (int i=0; i<elementLen; i++, strc+=2)
+	{
+		char *type = mFileDNA->getType(strc[0]);
+		char *name = mFileDNA->getName(strc[1]);
+		int size = mFileDNA->getElementSize(strc[0], strc[1]);
+		if (strc[0] >= first && name[0]!='*')  // nested struct stored by value
+		{
+			int old_nr = mFileDNA->getReverseType(type);
+			int arrayLen = mFileDNA->getArraySizeNew(strc[1]);
+			if (arrayLen==1)
+			{
+				swapStruct(old_nr,buf,ignoreEndianFlag);
+			} else
+			{
+				char* tmpBuf = buf;
+				for (int i=0;i<arrayLen;i++)  // NOTE(review): this i shadows the outer loop index
+				{
+					swapStruct(old_nr,tmpBuf,ignoreEndianFlag);
+					tmpBuf+=size/arrayLen;
+				}
+			}
+		}
+		else
+		{
+			//int arrayLenOld = mFileDNA->getArraySize(name);
+			int arrayLen = mFileDNA->getArraySizeNew(strc[1]);
+			//assert(arrayLenOld == arrayLen);
+			swapData(buf, strc[0], arrayLen,ignoreEndianFlag);  // all other members are swapped by type code
+		}
+		buf+=size;
+	}
+void bFile::resolvePointersMismatch()  // Rewrites the pointer slots recorded in m_pointerFixupArray and m_pointerPtrFixupArray to the addresses returned by findLibPointer.
+//	printf("resolvePointersStructMismatch\n");
+	int i;
+	for (i=0;i<	m_pointerFixupArray.size();i++)
+	{
+		char* cur = m_pointerFixupArray.at(i);
+		void** ptrptr = (void**) cur;
+		void* ptr = *ptrptr;
+		ptr = findLibPointer(ptr);
+		if (ptr)
+		{
+			//printf("Fixup pointer!\n");
+			*(ptrptr) = ptr;
+		} else
+		{
+//			printf("pointer not found: %x\n",cur);
+		}
+	}
+	for (i=0; i<m_pointerPtrFixupArray.size(); i++)
+	{
+		char* cur= m_pointerPtrFixupArray.at(i);
+		void** ptrptr = (void**)cur;
+		bChunkInd *block = m_chunkPtrPtrMap.find(*ptrptr);
+		if (block)
+		{
+			int ptrMem = mMemoryDNA->getPointerSize();
+			int ptrFile = mFileDNA->getPointerSize();
+			int blockLen = block->len / ptrFile;  // number of file-width pointers held by the block
+			void *onptr = findLibPointer(*ptrptr);
+			if (onptr)
+			{
+				char *newPtr = new char[blockLen * ptrMem];  // replacement array sized for memory-width pointers
+				addDataBlock(newPtr);
+				memset(newPtr, 0, blockLen * ptrMem);
+				void **onarray = (void**)onptr;
+				char *oldPtr = (char*)onarray;
+				int p = 0;
+				while (blockLen-- > 0)
+				{
+					b3PointerUid dp = {0};
+					safeSwapPtr((char*)dp.m_uniqueIds, oldPtr);  // convert one stored pointer to memory width
+					void **tptr = (void**)(newPtr + p * ptrMem);
+					*tptr = findLibPointer(dp.m_ptr);  // entries with no mapping become 0
+					oldPtr += ptrFile;
+					++p;
+				}
+				*ptrptr = newPtr;
+			}
+		}
+	}
+///this loop only works fine if the Blender DNA structure of the file matches the headerfiles
+void bFile::resolvePointersChunk(const bChunkInd& dataChunk, int verboseMode)  // Runs resolvePointersStructRecursive on each of the chunk's nr struct instances.
+	bParse::bDNA* fileDna = mFileDNA ? mFileDNA : mMemoryDNA;
+	short int* oldStruct = fileDna->getStruct(dataChunk.dna_nr);
+	short oldLen = fileDna->getLength(oldStruct[0]);  // NOTE(review): held in a short here although readStruct keeps getLength's result in an int - confirm lengths always fit
+	//char* structType = fileDna->getType(oldStruct[0]);
+	char* cur	= (char*)findLibPointer(dataChunk.oldPtr);  // NOTE(review): findLibPointer can return 0 and cur is not checked before use
+	for (int block=0; block<dataChunk.nr; block++)
+	{
+		resolvePointersStructRecursive(cur,dataChunk.dna_nr, verboseMode,1);
+		cur += oldLen;
+	}
+int bFile::resolvePointersStructRecursive(char *strcPtr, int dna_nr, int verboseMode,int recursion)  // Replaces pointer members of the struct at strcPtr via findLibPointer, recursing into nested structs; prints XML when FD_VERBOSE_EXPORT_XML is set. Returns the summed element sizes.
+	bParse::bDNA* fileDna = mFileDNA ? mFileDNA : mMemoryDNA;
+	char* memType;
+	char* memName;
+	short	firstStructType = fileDna->getStruct(0)[0];
+	char* elemPtr= strcPtr;
+	short int* oldStruct = fileDna->getStruct(dna_nr);
+	int elementLength = oldStruct[1];
+	oldStruct+=2;
+	int totalSize = 0;
+	for (int ele=0; ele<elementLength; ele++, oldStruct+=2)
+	{
+		memType = fileDna->getType(oldStruct[0]);
+		memName = fileDna->getName(oldStruct[1]);
+		int arrayLen = fileDna->getArraySizeNew(oldStruct[1]);
+		if (memName[0] == '*')  // pointer member
+		{
+			if (arrayLen > 1)
+			{
+				void **array= (void**)elemPtr;
+				for (int a=0; a<arrayLen; a++)
+				{
+					if (verboseMode & FD_VERBOSE_EXPORT_XML)
+					{
+						for (int i=0;i<recursion;i++)
+						{
+							printf("  ");
+						}
+						//skip the *
+						printf("<%s type=\"pointer\"> ",&memName[1]);
+						printf("%p ", array[a]);
+						printf("</%s>\n",&memName[1]);
+					}
+					array[a] = findLibPointer(array[a]);  // unlike the single-pointer case below, an unmapped entry is overwritten with 0
+				}
+			}
+			else
+			{
+				void** ptrptr = (void**) elemPtr;
+				void* ptr = *ptrptr;
+				if (verboseMode & FD_VERBOSE_EXPORT_XML)
+				{
+					for (int i=0;i<recursion;i++)
+					{
+						printf("  ");
+					}
+					printf("<%s type=\"pointer\"> ",&memName[1]);
+					printf("%p ", ptr);
+					printf("</%s>\n",&memName[1]);
+				}
+				ptr = findLibPointer(ptr);
+				if (ptr)
+				{
+	//				printf("Fixup pointer at 0x%x from 0x%x to 0x%x!\n",ptrptr,*ptrptr,ptr);
+					*(ptrptr) = ptr;
+					if (memName[1] == '*' && ptrptr && *ptrptr)
+					{
+						// This	will only work if the given	**array	is continuous
+						void **array= (void**)*(ptrptr);
+						void *np= array[0];
+						int	n=0;
+						while (np)  // NOTE(review): the walk ends only when a lookup returns 0; nothing bounds n by the array's length
+						{
+							np= findLibPointer(array[n]);
+							if (np) array[n]= np;
+							n++;
+						}
+					}
+				} else
+				{
+	//				printf("Cannot fixup pointer at 0x%x from 0x%x to 0x%x!\n",ptrptr,*ptrptr,ptr);
+				}
+			}
+		} else
+		{
+			int revType = fileDna->getReverseType(oldStruct[0]);
+			if (oldStruct[0]>=firstStructType) //revType != -1 &&
+			{
+				char cleanName[MAX_STRLEN];
+				getCleanName(memName,cleanName);
+				int arrayLen = fileDna->getArraySizeNew(oldStruct[1]);
+				int byteOffset = 0;
+				if (verboseMode & FD_VERBOSE_EXPORT_XML)
+				{
+					for (int i=0;i<recursion;i++)
+					{
+						printf("  ");
+					}
+					if (arrayLen>1)
+					{
+						printf("<%s type=\"%s\" count=%d>\n",cleanName,memType, arrayLen);
+					} else
+					{
+						printf("<%s type=\"%s\">\n",cleanName,memType);
+					}
+				}
+				for (int i=0;i<arrayLen;i++)
+				{
+					byteOffset += resolvePointersStructRecursive(elemPtr+byteOffset,revType, verboseMode,recursion+1);  // each call returns the size it covered, which positions the next array element
+				}
+				if (verboseMode & FD_VERBOSE_EXPORT_XML)
+				{
+					for (int i=0;i<recursion;i++)
+					{
+						printf("  ");
+					}
+					printf("</%s>\n",cleanName);
+				}
+			} else
+			{
+				//export a simple type
+				if (verboseMode & FD_VERBOSE_EXPORT_XML)
+				{
+					if (arrayLen>MAX_ARRAY_LENGTH)
+					{
+						printf("too long\n");
+					} else
+					{
+						//printf("%s %s\n",memType,memName);
+						bool isIntegerType = (strcmp(memType,"char")==0) || (strcmp(memType,"int")==0) || (strcmp(memType,"short")==0);
+						if (isIntegerType)
+						{
+							const char* newtype="int";
+							int dbarray[MAX_ARRAY_LENGTH];
+							int* dbPtr = 0;
+							char* tmp = elemPtr;
+							dbPtr = &dbarray[0];
+							if (dbPtr)
+							{
+								char cleanName[MAX_STRLEN];
+								getCleanName(memName,cleanName);
+								int i;
+								getElement(arrayLen, newtype,memType, tmp, (char*)dbPtr);  // convert the values to int for printing
+								for (i=0;i<recursion;i++)
+									printf("  ");
+								if (arrayLen==1)
+									printf("<%s type=\"%s\">",cleanName,memType);
+								else
+									printf("<%s type=\"%s\" count=%d>",cleanName,memType,arrayLen);
+								for (i=0;i<arrayLen;i++)
+									printf(" %d ",dbPtr[i]);
+								printf("</%s>\n",cleanName);
+							}
+						} else
+						{
+							const char* newtype="double";
+							double dbarray[MAX_ARRAY_LENGTH];
+					 		double* dbPtr = 0;
+							char* tmp = elemPtr;
+							dbPtr = &dbarray[0];
+							if (dbPtr)
+							{
+								int i;
+								getElement(arrayLen, newtype,memType, tmp, (char*)dbPtr);  // convert the values to double for printing
+								for (i=0;i<recursion;i++)
+									printf("  ");
+								char cleanName[MAX_STRLEN];
+								getCleanName(memName,cleanName);
+								if (arrayLen==1)
+								{
+									printf("<%s type=\"%s\">",memName,memType);  // NOTE(review): the opening tag uses memName while the closing tag below uses cleanName - confirm they always agree
+								}
+								else
+								{
+									printf("<%s type=\"%s\" count=%d>",cleanName,memType,arrayLen);
+								}
+								for (i=0;i<arrayLen;i++)
+									printf(" %f ",dbPtr[i]);
+								printf("</%s>\n",cleanName);
+							}
+						}
+					}
+				}
+			}
+		}
+		int size = fileDna->getElementSize(oldStruct[0], oldStruct[1]);
+		totalSize += size;
+		elemPtr+=size;  // move on to the next member's bytes
+	}
+	return totalSize;
+///Resolve pointers replaces the original pointers in structures, and linked lists by the new in-memory structures
+void bFile::resolvePointers(int verboseMode)  // Runs resolvePointersMismatch, then resolvePointersChunk on every chunk whose struct is flagged equal (or on all chunks when there is no file DNA).
+	bParse::bDNA* fileDna = mFileDNA ? mFileDNA : mMemoryDNA;
+	//char *dataPtr = mFileBuffer+mDataStart;
+	if (1) //mFlags & (FD_BITS_VARIES | FD_VERSION_VARIES))
+	{
+		resolvePointersMismatch();  // currently unconditional; the original condition is commented out above
+	}
+	{
+		if (verboseMode & FD_VERBOSE_EXPORT_XML)
+		{
+			printf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
+			int numitems = m_chunks.size();
+			printf("<bullet_physics version=%d itemcount = %d>\n", b3GetVersion(), numitems);
+		}
+		for (int i=0;i<m_chunks.size();i++)
+		{
+			const bChunkInd& dataChunk = m_chunks.at(i);
+			if (!mFileDNA || fileDna->flagEqual(dataChunk.dna_nr))  // chunks whose struct is not flagged equal are skipped here
+			{
+				//dataChunk.len
+				short int* oldStruct = fileDna->getStruct(dataChunk.dna_nr);
+				char* oldType = fileDna->getType(oldStruct[0]);
+				if (verboseMode & FD_VERBOSE_EXPORT_XML)
+					printf(" <%s pointer=%p>\n",oldType,dataChunk.oldPtr);
+				resolvePointersChunk(dataChunk, verboseMode);
+				if (verboseMode & FD_VERBOSE_EXPORT_XML)
+					printf(" </%s>\n",oldType);
+			} else
+			{
+				//printf("skipping mStruct\n");
+			}
+		}
+			if (verboseMode & FD_VERBOSE_EXPORT_XML)
+			{
+				printf("</bullet_physics>\n");
+			}
+	}
+// ----------------------------------------------------- //
+void* bFile::findLibPointer(void *ptr)  // Looks ptr up in the map returned by getLibPointers(); returns the mapped handle, or 0 when there is no entry.
+	bStructHandle** ptrptr = getLibPointers().find(ptr);
+	if (ptrptr)
+		return *ptrptr;
+	return 0;
+void	bFile::updateOldPointers()  // Replaces each chunk's oldPtr with the result of findLibPointer on it.
+	int i;
+	for (i=0;i<m_chunks.size();i++)
+	{
+		bChunkInd& dataChunk = m_chunks[i];
+		dataChunk.oldPtr = findLibPointer(dataChunk.oldPtr);  // becomes 0 when the old address has no mapping
+	}
+void	bFile::dumpChunks(bParse::bDNA* dna)  // Prints one line per chunk: index, type name, four-character code, pointer, length and count.
+	int i;
+	for (i=0;i<m_chunks.size();i++)
+	{
+		bChunkInd& dataChunk = m_chunks[i];
+		char* codeptr = (char*)&dataChunk.code;
+		char codestr[5] = {codeptr[0],codeptr[1],codeptr[2],codeptr[3],0};  // the four bytes of code as a NUL-terminated string
+		short* newStruct = dna->getStruct(dataChunk.dna_nr);
+		char* typeName = dna->getType(newStruct[0]);
+		printf("%3d: %s  ",i,typeName);
+		printf("code=%s  ",codestr);
+		printf("ptr=%p  ",dataChunk.oldPtr);
+		printf("len=%d  ",dataChunk.len);
+		printf("nr=%d  ",dataChunk.nr);
+		if (dataChunk.nr!=1)
+		{
+			printf("not 1\n");
+		}
+		printf("\n");
+	}
+#if 0
+	IDFinderData ifd;
+	ifd.success = 0;
+	ifd.IDname = NULL;
+	ifd.just_print_it = 1;
+	for (i=0; i<bf->m_blocks.size(); ++i)
+	{
+		BlendBlock* bb = bf->m_blocks[i];
+		printf("tag='%s'\tptr=%p\ttype=%s\t[%4d]",		bb->tag, bb,bf->types[bb->type_index].name,bb->m_array_entries_.size());
+		block_ID_finder(bb, bf, &ifd);
+		printf("\n");
+	}
+void	bFile::writeChunks(FILE* fp, bool fixupPointers)  // Writes each chunk whose type is known to the memory DNA to fp as a bChunkInd header followed by its data.
+	bParse::bDNA* fileDna = mFileDNA ? mFileDNA : mMemoryDNA;
+	for (int i=0;i<m_chunks.size();i++)
+	{
+		bChunkInd& dataChunk = m_chunks.at(i);  // a reference: the dna_nr and len assignments below modify the stored chunk
+		// Ouch! need to rebuild the struct
+		short *oldStruct,*curStruct;
+		char *oldType, *newType;
+		int oldLen, curLen, reverseOld;
+		oldStruct = fileDna->getStruct(dataChunk.dna_nr);
+		oldType = fileDna->getType(oldStruct[0]);
+		oldLen = fileDna->getLength(oldStruct[0]);  // assigned but not used afterwards in this function
+		///don't try to convert Link block data, just memcpy it. Other data can be converted.
+		reverseOld = mMemoryDNA->getReverseType(oldType);
+		if ((reverseOld!=-1))
+		{
+			// make sure it's here
+			//assert(reverseOld!= -1 && "getReverseType() returned -1, struct required!");
+			//
+			curStruct = mMemoryDNA->getStruct(reverseOld);
+			newType = mMemoryDNA->getType(curStruct[0]);
+			// make sure it's the same
+			assert((strcmp(oldType, newType)==0) && "internal error, struct mismatch!");
+			curLen = mMemoryDNA->getLength(curStruct[0]);
+			dataChunk.dna_nr = reverseOld;  // the header is written with the memory DNA's struct index
+			if (strcmp("Link",oldType)!=0)
+			{
+				dataChunk.len = curLen * dataChunk.nr;  // length is recomputed from the memory-layout struct size
+			} else
+			{
+//				printf("keep length of link = %d\n",dataChunk.len);
+			}
+			//write the structure header
+			fwrite(&dataChunk,sizeof(bChunkInd),1,fp);
+			short int* curStruct1;
+			curStruct1 = mMemoryDNA->getStruct(dataChunk.dna_nr);
+			assert(curStruct1 == curStruct);
+			char* cur	= fixupPointers  ?  (char*)findLibPointer(dataChunk.oldPtr) : (char*)dataChunk.oldPtr;  // NOTE(review): findLibPointer can return 0 and cur is not checked before the fwrite below
+			//write the actual contents of the structure(s)
+			fwrite(cur,dataChunk.len,1,fp);
+		} else
+		{
+			printf("serious error, struct mismatch: don't write\n");
+		}
+	}
+// ----------------------------------------------------- //
+int bFile::getNextBlock(bChunkInd *dataChunk,  const char *dataPtr, const int flags)  // Decodes the chunk header at dataPtr into *dataChunk, converting pointer width and byte order as flagged; returns len plus ChunkUtils::getOffset(flags), or -1 when len is negative.
+	bool swap = false;
+	bool varies = false;
+	if (flags &FD_ENDIAN_SWAP)
+		swap = true;
+	if (flags &FD_BITS_VARIES)
+		varies = true;
+	if (VOID_IS_8)
+	{
+		if (varies)  // 64-bit host with FD_BITS_VARIES: read the 4-byte-pointer header and widen it
+		{
+			bChunkPtr4 head;
+			memcpy(&head, dataPtr, sizeof(bChunkPtr4));
+			bChunkPtr8 chunk;
+			chunk.code		= head.code;
+			chunk.len		= head.len;
+			chunk.m_uniqueInts[0] = head.m_uniqueInt;
+			chunk.m_uniqueInts[1] = 0;
+			chunk.dna_nr	= head.dna_nr;
+			chunk.nr		= head.nr;
+			if (swap)
+			{
+				if ((chunk.code & 0xFFFF)==0)
+					chunk.code >>=16;
+				B3_SWITCH_INT(chunk.len);
+				B3_SWITCH_INT(chunk.dna_nr);
+				B3_SWITCH_INT(chunk.nr);
+			}
+			memcpy(dataChunk, &chunk, sizeof(bChunkInd));
+		}
+		else
+		{
+			bChunkPtr8 c;
+			memcpy(&c, dataPtr, sizeof(bChunkPtr8));
+			if (swap)
+			{
+				if ((c.code & 0xFFFF)==0)
+					c.code >>=16;
+				B3_SWITCH_INT(c.len);
+				B3_SWITCH_INT(c.dna_nr);
+				B3_SWITCH_INT(c.nr);
+			}
+			memcpy(dataChunk, &c, sizeof(bChunkInd));
+		}
+	}
+	else
+	{
+		if (varies)  // 32-bit host with FD_BITS_VARIES: read the 8-byte-pointer header and narrow it
+		{
+			bChunkPtr8 head;
+			memcpy(&head, dataPtr, sizeof(bChunkPtr8));
+			bChunkPtr4 chunk;
+			chunk.code = head.code;
+			chunk.len = head.len;
+			if (head.m_uniqueInts[0]==head.m_uniqueInts[1])
+			{
+				chunk.m_uniqueInt = head.m_uniqueInts[0];  // both halves equal: keep the value as is
+			} else
+			{
+				b3Long64 oldPtr =0;
+				memcpy(&oldPtr, &head.m_uniqueInts[0], 8);
+				if (swap)
+					B3_SWITCH_LONGINT(oldPtr);
+				chunk.m_uniqueInt = (int)(oldPtr >> 3);  // same narrowing as safeSwapPtr: shift right by 3, keep the low 32 bits
+			}
+			chunk.dna_nr = head.dna_nr;
+			chunk.nr = head.nr;
+			if (swap)
+			{
+				if ((chunk.code & 0xFFFF)==0)
+					chunk.code >>=16;
+				B3_SWITCH_INT(chunk.len);
+				B3_SWITCH_INT(chunk.dna_nr);
+				B3_SWITCH_INT(chunk.nr);
+			}
+			memcpy(dataChunk, &chunk, sizeof(bChunkInd));
+		}
+		else
+		{
+			bChunkPtr4 c;
+			memcpy(&c, dataPtr, sizeof(bChunkPtr4));
+			if (swap)
+			{
+				if ((c.code & 0xFFFF)==0)
+					c.code >>=16;
+				B3_SWITCH_INT(c.len);
+				B3_SWITCH_INT(c.dna_nr);
+				B3_SWITCH_INT(c.nr);
+			}
+			memcpy(dataChunk, &c, sizeof(bChunkInd));
+		}
+	}
+	if (dataChunk->len < 0)
+		return -1;  // a negative length is reported to callers as -1
+#if 0
+	print ("----------");
+	print (dataChunk->code);
+	print (dataChunk->len);
+	print (dataChunk->old);
+	print (dataChunk->dna_nr);
+	print (dataChunk->nr);
+	return (dataChunk->len+ChunkUtils::getOffset(flags));  // NOTE(review): the #endif closing the debug block above is not visible here - confirm this return sits outside it
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3File.h b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3File.h
new file mode 100644
index 00000000..86105680
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3File.h
@@ -0,0 +1,165 @@
+Copyright (c) 2006-2009 Charlie C & Erwin Coumans  http://gamekit.googlecode.com
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef __BFILE_H__
+#define __BFILE_H__
+#include "b3Common.h"
+#include "b3Chunk.h"
+#include <stdio.h>
+namespace bParse {
+	// ----------------------------------------------------- //
+	///Status and layout flags of a parsed file. Every non-zero value is a distinct single bit,
+	///so several flags can be combined in one int (see bFile::getFlags).
+	///NOTE(review): the meaning of each flag is taken from its name only - confirm against the parser.
+	enum bFileFlags
+	{
+		FD_INVALID   =0,
+		FD_OK        =1,
+		FD_VOID_IS_8 =2,
+		FD_ENDIAN_SWAP      =4,
+		FD_FILE_64   =8,
+		FD_BITS_VARIES    =16,
+		FD_BROKEN_DNA = 128
+	};
+	enum bFileVerboseMode
+	{
+	};
+	// ----------------------------------------------------- //
+	///Abstract base class for readers/writers of chunk based files.
+	///A subclass has to supply parseData, addDataBlock, parse, write and writeDNA (all pure virtual).
+	///The object keeps the raw file buffer, two bDNA descriptions and several pointer lookup tables.
+	///NOTE(review): presumably mFileDNA describes the layout stored in the file and mMemoryDNA the
+	///layout of the running build - confirm in the implementation file.
+	class bFile
+	{
+	protected:
+		///header tag passed to the constructors (7 characters)
+		char				m_headerString[7];
+		bool				mOwnsBuffer;
+		char*				mFileBuffer;
+		int					mFileLen;
+		int					mVersion;
+		bPtrMap				mLibPointers;
+		int					mDataStart;
+		bDNA*				mFileDNA;
+		bDNA*				mMemoryDNA;
+		b3AlignedObjectArray<char*>	m_pointerFixupArray;
+		b3AlignedObjectArray<char*>	m_pointerPtrFixupArray;
+		b3AlignedObjectArray<bChunkInd>	m_chunks;
+        b3HashMap<b3HashPtr, bChunkInd> m_chunkPtrPtrMap;
+        // 
+		bPtrMap				mDataPointers;
+		///combination of bFileFlags values, returned by getFlags
+		int					mFlags;
+		// ////////////////////////////////////////////////////////////////////////////
+			// buffer offset util
+		///Decodes the chunk header found at dataPtr into dataChunk.
+		///Returns a negative value when the decoded chunk length is negative.
+		int getNextBlock(bChunkInd *dataChunk,  const char *dataPtr, const int flags);
+		void safeSwapPtr(char *dst, const char *src);
+		virtual	void parseHeader();
+		virtual	void parseData() = 0;
+		void resolvePointersMismatch();
+		void resolvePointersChunk(const bChunkInd& dataChunk, int verboseMode);
+		int resolvePointersStructRecursive(char *strcPtr, int old_dna, int verboseMode, int recursion);
+		//void swapPtr(char *dst, char *src);
+		void parseStruct(char *strcPtr, char *dtPtr, int old_dna, int new_dna, bool fixupPointers);
+		void getMatchingFileDNA(short* old, const char* lookupName, const char* lookupType, char *strcData, char *data, bool fixupPointers);
+		char* getFileElement(short *firstStruct, char *lookupName, char *lookupType, char *data, short **foundPos);
+		void swap(char *head, class bChunkInd& ch, bool ignoreEndianFlag);
+		void swapData(char *data, short type, int arraySize, bool ignoreEndianFlag);
+		void swapStruct(int dna_nr, char *data, bool ignoreEndianFlag);
+		void swapLen(char *dataPtr);
+		void swapDNA(char* ptr);
+		char* readStruct(char *head, class bChunkInd& chunk);
+		char *getAsString(int code);
+		void	parseInternal(int verboseMode, char* memDna,int memDnaLength);
+	public:
+		///Construct from a file on disk.
+		bFile(const char *filename, const char headerString[7]);
+		//todo: make memoryBuffer const char
+		//bFile( const char *memoryBuffer, int len);
+		///Construct from a buffer of len bytes that is already in memory.
+		bFile( char *memoryBuffer, int len, const char headerString[7]);
+		virtual ~bFile();
+		bDNA*				getFileDNA()
+		{
+			return mFileDNA;
+		}
+		virtual	void	addDataBlock(char* dataBlock) = 0;
+		///Returns mFlags, a combination of bFileFlags values.
+		int	getFlags() const
+		{
+			return mFlags;
+		}
+		bPtrMap&		getLibPointers()
+		{
+			return mLibPointers;
+		}
+		void* findLibPointer(void *ptr);
+		bool ok();
+		virtual	void parse(int verboseMode) = 0;
+		virtual	int	write(const char* fileName, bool fixupPointers=false) = 0;
+		virtual	void	writeChunks(FILE* fp, bool fixupPointers );
+		virtual	void	writeDNA(FILE* fp) = 0;
+		void	updateOldPointers();
+		void	resolvePointers(int verboseMode);
+		void	dumpChunks(bDNA* dna);
+		int		getVersion() const
+		{
+			return mVersion;
+		}
+		//pre-swap the endianness, so that data loaded on a target with different endianness doesn't need to be swapped
+		void preSwap();
+		void writeFile(const char* fileName);
+	};
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Serializer.cpp b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Serializer.cpp
new file mode 100644
index 00000000..c6a2a832
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Serializer.cpp
@@ -0,0 +1,908 @@
+char b3s_bulletDNAstr[]= {
+int b3s_bulletDNAlen= sizeof(b3s_bulletDNAstr);
+char b3s_bulletDNAstr64[]= {
+int b3s_bulletDNAlen64= sizeof(b3s_bulletDNAstr64);
diff --git a/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Serializer.h b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Serializer.h
new file mode 100644
index 00000000..1c1ce437
--- /dev/null
+++ b/src/bullet/Bullet3Serialize/Bullet2FileLoader/b3Serializer.h
@@ -0,0 +1,639 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#ifndef B3_SERIALIZER_H
+#define B3_SERIALIZER_H
+#include "Bullet3Common/b3Scalar.h" // has definitions like B3_FORCE_INLINE
+#include "Bullet3Common/b3StackAlloc.h"
+#include "Bullet3Common/b3HashMap.h"
+#if !defined( __CELLOS_LV2__) && !defined(__MWERKS__)
+#include <memory.h>
+#include <string.h>
+extern char b3s_bulletDNAstr[];
+extern int b3s_bulletDNAlen;
+extern char b3s_bulletDNAstr64[];
+extern int b3s_bulletDNAlen64;
+///Returns the number of characters in front of the terminating zero of str.
+///A null pointer is accepted and yields 0.
+B3_FORCE_INLINE	int b3StrLen(const char* str) 
+    if (!str) 
+		return(0);
+	int len = 0;
+	while (*str != 0)
+	{
+        str++;
+        len++;
+    }
+    return len;
+///Header stored in front of the payload of every serialized chunk
+///(b3DefaultSerializer::allocate places the payload directly behind it).
+class b3Chunk
+	///chunk identifier; 0 after allocate, set to the caller supplied code by finalizeChunk
+	int		m_chunkCode;
+	///payload size in bytes (element size * number of elements)
+	int		m_length;
+	///points at the payload after allocate; finalizeChunk replaces it with the unique pointer id
+	void	*m_oldPtr;
+	///result of getReverseType for the struct type name, -1 when the name is unknown
+	int		m_dna_nr;
+	///number of elements in the payload
+	int		m_number;
+enum	b3SerializationFlags
+///Abstract serializer interface: apart from the destructor every member is pure virtual.
+///b3DefaultSerializer in this header provides an implementation.
+class	b3Serializer
+	virtual ~b3Serializer() {}
+	virtual	const unsigned char*		getBufferPointer() const = 0;
+	virtual	int		getCurrentBufferSize() const = 0;
+	virtual	b3Chunk*	allocate(size_t size, int numElements) = 0;
+	virtual	void	finalizeChunk(b3Chunk* chunk, const char* structType, int chunkCode,void* oldPtr)= 0;
+	virtual	 void*	findPointer(void* oldPtr)  = 0;
+	virtual	void*	getUniquePointer(void*oldPtr) = 0;
+	virtual	void	startSerialization() = 0;
+	virtual	void	finishSerialization() = 0;
+	virtual	const char*	findNameForPointer(const void* ptr) const = 0;
+	virtual	void	registerNameForPointer(const void* ptr, const char* name) = 0;
+	virtual void	serializeName(const char* ptr) = 0;
+	virtual int		getSerializationFlags() const = 0;
+	virtual void	setSerializationFlags(int flags) = 0;
+///Size in bytes of the file header; writeHeader fills buffer[0] to buffer[11].
+#define B3_HEADER_LENGTH 12
+///B3_MAKE_ID packs four characters into one int chunk code. Two byte orders are provided:
+///the first definition puts 'a' into the most significant byte, the second into the least significant byte.
+#if defined(__sgi) || defined (__sparc) || defined (__sparc__) || defined (__PPC__) || defined (__ppc__) || defined (__BIG_ENDIAN__)
+#	define B3_MAKE_ID(a,b,c,d) ( (int)(a)<<24 | (int)(b)<<16 | (c)<<8 | (d) )
+#	define B3_MAKE_ID(a,b,c,d) ( (int)(d)<<24 | (int)(c)<<16 | (b)<<8 | (a) )
+///Four character chunk codes handed to finalizeChunk.
+#define B3_SOFTBODY_CODE		B3_MAKE_ID('S','B','D','Y')
+#define B3_RIGIDBODY_CODE		B3_MAKE_ID('R','B','D','Y')
+#define B3_CONSTRAINT_CODE		B3_MAKE_ID('C','O','N','S')
+#define B3_BOXSHAPE_CODE		B3_MAKE_ID('B','O','X','S')
+#define B3_QUANTIZED_BVH_CODE	B3_MAKE_ID('Q','B','V','H')
+#define B3_TRIANLGE_INFO_MAP	B3_MAKE_ID('T','M','A','P')
+#define B3_SHAPE_CODE			B3_MAKE_ID('S','H','A','P')
+#define B3_ARRAY_CODE			B3_MAKE_ID('A','R','A','Y')
+#define B3_SBMATERIAL_CODE		B3_MAKE_ID('S','B','M','T')
+#define B3_SBNODE_CODE			B3_MAKE_ID('S','B','N','D')
+#define B3_DYNAMICSWORLD_CODE	B3_MAKE_ID('D','W','L','D')
+#define B3_DNA_CODE				B3_MAKE_ID('D','N','A','1')
+///Overlays a pointer with two ints so that a generated id can be handed out as a pointer value:
+///getUniquePointer stores the same id in both m_uniqueIds entries and returns m_ptr.
+struct	b3PointerUid
+	union
+	{
+		void*	m_ptr;
+		int		m_uniqueIds[2];
+	};
+///The b3DefaultSerializer is the main Bullet serialization class.
+///The constructor takes an optional argument for backwards compatibility, it is recommended to leave this empty/zero.
+class b3DefaultSerializer	:	public b3Serializer
+	b3AlignedObjectArray<char*>			mTypes;
+	b3AlignedObjectArray<short*>			mStructs;
+	b3AlignedObjectArray<short>			mTlens;
+	b3HashMap<b3HashInt, int>			mStructReverse;
+	b3HashMap<b3HashString,int>	mTypeLookup;
+	b3HashMap<b3HashPtr,void*>	m_chunkP;
+	b3HashMap<b3HashPtr,const char*>	m_nameMap;
+	b3HashMap<b3HashPtr,b3PointerUid>	m_uniquePointers;
+	int	m_uniqueIdGenerator;
+	int					m_totalSize;
+	unsigned char*		m_buffer;
+	int					m_currentSize;
+	void*				m_dna;
+	int					m_dnaLength;
+	int					m_serializationFlags;
+	b3AlignedObjectArray<b3Chunk*>	m_chunkPtrs;
+	///Returns the value recorded in m_chunkP for oldPtr,
+	///or 0 when there is no entry or the recorded value is null.
+	virtual	void*	findPointer(void* oldPtr)
+	{
+		void** found = m_chunkP.find(oldPtr);
+		const bool hasValue = (found != 0) && (*found != 0);
+		return hasValue ? *found : 0;
+	}
+		///Copies the m_dnaLength bytes at m_dna into a newly allocated chunk
+		///and finalizes that chunk as the "DNA1" block.
+		void	writeDNA()
+		{
+			b3Chunk* const block = allocate(m_dnaLength,1);
+			void* const payload = block->m_oldPtr;
+			memcpy(payload,m_dna,m_dnaLength);
+			finalizeChunk(block,"DNA1",B3_DNA_CODE, m_dna);
+		}
+		///Maps a type name to the index stored for it in mTypeLookup.
+		///@return the stored index, or -1 when the name has no entry
+		int getReverseType(const char *type) const
+		{
+			const int* entry = mTypeLookup.find(b3HashString(type));
+			return entry ? *entry : -1;
+		}
+		///Takes a private, 16-byte aligned copy of a DNA description and indexes its sections.
+		///The sections are walked in the order names, TYPE, TLEN, STRC:
+		/// - names are only skipped over,
+		/// - mTypes receives a pointer to every type name inside the copy,
+		/// - mTlens receives the length of every type,
+		/// - mStructs receives a pointer to the short array of every struct.
+		///Finally mStructReverse (type index -> struct index) and mTypeLookup (type name -> struct index) are built.
+		///On a big-endian host the counts and shorts are byte-swapped in place inside the copy.
+		///Calling the function again while m_dna is set does nothing.
+		///@param bdnaOrg start of the DNA data to copy
+		///@param dnalen  size of that data in bytes
+		void initDNA(const char* bdnaOrg,int dnalen)
+		{
+			///was already initialized
+			if (m_dna)
+				return;
+			//runtime endianness probe: the first byte of the int 1 is 1 only on a little-endian host
+			int littleEndian= 1;
+			littleEndian= ((char*)&littleEndian)[0];
+			m_dna = b3AlignedAlloc(dnalen,16);
+			memcpy(m_dna,bdnaOrg,dnalen);
+			m_dnaLength = dnalen;
+			int *intPtr=0;
+			short *shtPtr=0;
+			char *cp = 0;int dataLen =0;
+			intPtr = (int*)m_dna;
+			/*
+				SDNA (4 bytes) (magic number)
+				NAME (4 bytes)
+				<nr> (4 bytes) amount of names (int)
+				<string>
+				<string>
+			*/
+			if (strncmp((const char*)m_dna, "SDNA", 4)==0)
+			{
+				// skip ++ NAME
+				intPtr++; intPtr++;
+			}
+			// Parse names
+			if (!littleEndian)
+				*intPtr = b3SwapEndian(*intPtr);
+			dataLen = *intPtr;
+			intPtr++;
+			cp = (char*)intPtr;
+			int i;
+			//the names themselves are not stored, the loop only steps over each zero terminated string
+			for ( i=0; i<dataLen; i++)
+			{
+				while (*cp)cp++;
+				cp++;
+			}
+			cp = b3AlignPointer(cp,4);
+			/*
+				TYPE (4 bytes)
+				<nr> amount of types (int)
+				<string>
+				<string>
+			*/
+			intPtr = (int*)cp;
+			b3Assert(strncmp(cp, "TYPE", 4)==0); intPtr++;
+			if (!littleEndian)
+				*intPtr =  b3SwapEndian(*intPtr);
+			dataLen = *intPtr;
+			intPtr++;
+			cp = (char*)intPtr;
+			for (i=0; i<dataLen; i++)
+			{
+				mTypes.push_back(cp);
+				while (*cp)cp++;
+				cp++;
+			}
+			cp = b3AlignPointer(cp,4);
+			/*
+				TLEN (4 bytes)
+				<len> (short) the lengths of types
+				<len>
+			*/
+			// Parse type lens
+			intPtr = (int*)cp;
+			b3Assert(strncmp(cp, "TLEN", 4)==0); intPtr++;
+			//there is one length per type, so the count comes from mTypes
+			dataLen = (int)mTypes.size();
+			shtPtr = (short*)intPtr;
+			for (i=0; i<dataLen; i++, shtPtr++)
+			{
+				if (!littleEndian)
+					shtPtr[0] = b3SwapEndian(shtPtr[0]);
+				mTlens.push_back(shtPtr[0]);
+			}
+			//an odd number of shorts is followed by one extra short before the next section
+			if (dataLen & 1) shtPtr++;
+			/*
+				STRC (4 bytes)
+				<nr> amount of structs (int)
+				<typenr>
+				<nr_of_elems>
+				<typenr>
+				<namenr>
+				<typenr>
+				<namenr>
+			*/
+			intPtr = (int*)shtPtr;
+			cp = (char*)intPtr;
+			b3Assert(strncmp(cp, "STRC", 4)==0); intPtr++;
+			if (!littleEndian)
+				*intPtr = b3SwapEndian(*intPtr);
+			dataLen = *intPtr ; 
+			intPtr++;
+			shtPtr = (short*)intPtr;
+			for (i=0; i<dataLen; i++)
+			{
+				mStructs.push_back (shtPtr);
+				if (!littleEndian)
+				{
+					//swap the struct header (type number, element count) and then every element pair
+					shtPtr[0]= b3SwapEndian(shtPtr[0]);
+					shtPtr[1]= b3SwapEndian(shtPtr[1]);
+					int len = shtPtr[1];
+					shtPtr+= 2;
+					for (int a=0; a<len; a++, shtPtr+=2)
+					{
+							shtPtr[0]= b3SwapEndian(shtPtr[0]);
+							shtPtr[1]= b3SwapEndian(shtPtr[1]);
+					}
+				} else
+				{
+					//no swapping needed: jump over the header and the shtPtr[1] element pairs
+					shtPtr+= (2*shtPtr[1])+2;
+				}
+			}
+			// build reverse lookups
+			for (i=0; i<(int)mStructs.size(); i++)
+			{
+				short *strc = mStructs.at(i);
+				mStructReverse.insert(strc[0], i);
+				mTypeLookup.insert(b3HashString(mTypes[strc[0]]),i);
+			}
+		}
+		///Creates a serializer and loads the built-in DNA description that matches the pointer size of this build.
+		///@param totalSize size in bytes of the buffer to pre-allocate. With 0 (the default) no buffer is
+		///pre-allocated; chunks are then allocated one by one and merged by finishSerialization.
+		b3DefaultSerializer(int totalSize=0)
+			:m_uniqueIdGenerator(0),
+			m_totalSize(totalSize),
+			m_buffer(0),
+			m_currentSize(0),
+			m_dna(0),
+			m_dnaLength(0),
+			m_serializationFlags(0)
+		{
+			if (m_totalSize)
+				m_buffer = (unsigned char*)b3AlignedAlloc(totalSize,16);
+			//pick the DNA description by pointer size; initDNA ignores repeated calls, so one selection is enough
+			const bool VOID_IS_8 = ((sizeof(void*)==8));
+			if (VOID_IS_8)
+			{
+				initDNA((const char*)b3s_bulletDNAstr64,b3s_bulletDNAlen64);
+			} else
+			{
+				initDNA((const char*)b3s_bulletDNAstr,b3s_bulletDNAlen);
+			}
+		}
+		///Releases the output buffer and the private DNA copy.
+		///Without a pre-allocated buffer (m_totalSize==0) every chunk is a separate allocation; chunks that
+		///finishSerialization has not merged and released yet are released here so they do not leak.
+		virtual ~b3DefaultSerializer()
+		{
+			if (!m_totalSize)
+			{
+				//finishSerialization clears m_chunkPtrs after freeing, so nothing is freed twice
+				for (int i=0;i<m_chunkPtrs.size();i++)
+					b3AlignedFree(m_chunkPtrs[i]);
+			}
+			if (m_buffer)
+				b3AlignedFree(m_buffer);
+			if (m_dna)
+				b3AlignedFree(m_dna);
+		}
+		///Writes the B3_HEADER_LENGTH (12) byte file header into buffer.
+		///Layout: 7 byte tag ("BULLETd" for double precision builds, "BULLETf" otherwise),
+		///1 byte pointer size marker ('-' for 8 byte pointers, '_' otherwise),
+		///1 byte endianness marker ('v' little-endian, 'V' big-endian) and the 3 digits "281".
+		///@param buffer destination, must provide at least 12 writable bytes
+		void	writeHeader(unsigned char* buffer) const
+		{
+			//only one tag may be written, otherwise the second copy silently replaces the first
+#ifdef B3_USE_DOUBLE_PRECISION
+			memcpy(buffer, "BULLETd", 7);
+#else
+			memcpy(buffer, "BULLETf", 7);
+#endif //B3_USE_DOUBLE_PRECISION
+			//runtime endianness probe: the first byte of the int 1 is 1 only on a little-endian host
+			int littleEndian= 1;
+			littleEndian= ((char*)&littleEndian)[0];
+			if (sizeof(void*)==8)
+			{
+				buffer[7] = '-';
+			} else
+			{
+				buffer[7] = '_';
+			}
+			if (littleEndian)
+			{
+				buffer[8]='v';
+			} else
+			{
+				buffer[8]='V';
+			}
+			buffer[9] = '2';
+			buffer[10] = '8';
+			buffer[11] = '1';
+		}
+		///Resets the unique id counter to 1 and, when a buffer was pre-allocated,
+		///reserves B3_HEADER_LENGTH bytes at the current position and writes the header into them.
+		virtual	void	startSerialization()
+		{
+			m_uniqueIdGenerator= 1;
+			if (!m_totalSize)
+				return;
+			unsigned char* const headerBytes = internalAlloc(B3_HEADER_LENGTH);
+			writeHeader(headerBytes);
+		}
+		///Appends the DNA block and completes the output.
+		///Without a pre-allocated buffer (m_totalSize==0) the header and every chunk are merged into one
+		///newly allocated contiguous buffer and the individual chunk allocations are released.
+		///Afterwards the DNA lookup tables, the pointer maps and the chunk list are cleared.
+		virtual	void	finishSerialization()
+		{
+			writeDNA();
+			//if we didn't pre-allocate a buffer, we need to create a contiguous buffer now
+			if (!m_totalSize)
+			{
+				if (m_buffer)
+					b3AlignedFree(m_buffer);
+				//in this mode the header has not been counted yet
+				m_currentSize += B3_HEADER_LENGTH;
+				m_buffer = (unsigned char*)b3AlignedAlloc(m_currentSize,16);
+				writeHeader(m_buffer);
+				unsigned char* currentPtr = m_buffer + B3_HEADER_LENGTH;
+				for (int i=0;i<	m_chunkPtrs.size();i++)
+				{
+					//chunk header and payload are adjacent, so one copy moves both
+					int curLength = sizeof(b3Chunk)+m_chunkPtrs[i]->m_length;
+					memcpy(currentPtr,m_chunkPtrs[i], curLength);
+					b3AlignedFree(m_chunkPtrs[i]);
+					currentPtr+=curLength;
+				}
+			}
+			mTypes.clear();
+			mStructs.clear();
+			mTlens.clear();
+			mStructReverse.clear();
+			mTypeLookup.clear();
+			m_chunkP.clear();
+			m_nameMap.clear();
+			m_uniquePointers.clear();
+			m_chunkPtrs.clear();
+		}
+		///Returns the id that stands in for oldPtr, generating and recording a new one on first use.
+		///A null oldPtr yields 0. A new id is the incremented m_uniqueIdGenerator stored in both
+		///halves of a b3PointerUid and returned through its pointer member.
+		virtual	void*	getUniquePointer(void*oldPtr)
+		{
+			if (oldPtr == 0)
+				return 0;
+			b3PointerUid* known = (b3PointerUid*)m_uniquePointers.find(oldPtr);
+			if (known != 0)
+				return known->m_ptr;
+			const int freshId = ++m_uniqueIdGenerator;
+			b3PointerUid fresh;
+			fresh.m_uniqueIds[0] = freshId;
+			fresh.m_uniqueIds[1] = freshId;
+			m_uniquePointers.insert(oldPtr,fresh);
+			return fresh.m_ptr;
+		}
+		///Returns m_buffer, which may be 0 when no buffer has been allocated.
+		virtual	const unsigned char*		getBufferPointer() const
+		{
+			return m_buffer;
+		}
+		///Returns m_currentSize, the number of bytes handed out by internalAlloc so far
+		///(finishSerialization adds the header length when it builds the contiguous buffer).
+		virtual	int					getCurrentBufferSize() const
+		{
+			return	m_currentSize;
+		}
+		///Completes a chunk obtained from allocate.
+		///Stores the chunk code and the DNA index of structType, and replaces the chunk's pointer with the
+		///unique id of oldPtr; the oldPtr -> id pair is recorded in m_chunkP.
+		///Unless B3_SERIALIZE_NO_DUPLICATE_ASSERT is set, asserts that oldPtr was not finalized before.
+		virtual	void	finalizeChunk(b3Chunk* chunk, const char* structType, int chunkCode,void* oldPtr)
+		{
+			const bool checkDuplicates = (m_serializationFlags&B3_SERIALIZE_NO_DUPLICATE_ASSERT)==0;
+			if (checkDuplicates)
+			{
+				b3Assert(!findPointer(oldPtr));
+			}
+			chunk->m_dna_nr = getReverseType(structType);
+			chunk->m_chunkCode = chunkCode;
+			void* const uid = getUniquePointer(oldPtr);
+			m_chunkP.insert(oldPtr,uid);
+			chunk->m_oldPtr = uid;
+		}
+		///Reserves size bytes for the serializer and returns their address.
+		///With a pre-allocated buffer (m_totalSize!=0) the bytes are taken from m_buffer at the current
+		///offset; otherwise a separate 16-byte aligned allocation is made.
+		///m_currentSize grows by size in both cases.
+		virtual unsigned char* internalAlloc(size_t size)
+		{
+			unsigned char* ptr = 0;
+			if (m_totalSize)
+			{
+				ptr = m_buffer+m_currentSize;
+				m_currentSize += int(size);
+				//filling the buffer exactly is valid, only running past its end is an error
+				b3Assert(m_currentSize<=m_totalSize);
+			} else
+			{
+				ptr = (unsigned char*)b3AlignedAlloc(size,16);
+				m_currentSize += int(size);
+			}
+			return ptr;
+		}
+		///Reserves one chunk: a b3Chunk header directly followed by numElements elements of size bytes.
+		///The header gets code 0, its length and element count, and m_oldPtr pointing at the payload.
+		///The chunk is appended to m_chunkPtrs. m_dna_nr is left for finalizeChunk to set.
+		virtual	b3Chunk*	allocate(size_t size, int numElements)
+		{
+			const int payloadBytes = int(size)*numElements;
+			unsigned char* raw = internalAlloc(payloadBytes+sizeof(b3Chunk));
+			b3Chunk* header = (b3Chunk*)raw;
+			header->m_chunkCode = 0;
+			header->m_oldPtr = raw + sizeof(b3Chunk);
+			header->m_length = payloadBytes;
+			header->m_number = numElements;
+			m_chunkPtrs.push_back(header);
+			return header;
+		}
+		///Returns the name registered for ptr in m_nameMap,
+		///or 0 when nothing is registered or the registered name is null.
+		virtual	const char*	findNameForPointer(const void* ptr) const
+		{
+			const char*const * slot = m_nameMap.find(ptr);
+			if (!slot || !*slot)
+				return 0;
+			return *slot;
+		}
+		///Records name for ptr in m_nameMap. Only the pointer to the name is stored, the characters are not copied.
+		virtual	void	registerNameForPointer(const void* ptr, const char* name)
+		{
+			m_nameMap.insert(ptr,name);
+		}
+		///Serializes a zero terminated name as a "char" array chunk (B3_ARRAY_CODE).
+		///Null or empty names and names that were serialized before are ignored.
+		///The stored length is the string length plus terminator, rounded up to a multiple of 4;
+		///the terminator and all padding bytes are written as zero.
+		virtual void	serializeName(const char* name)
+		{
+			if (name)
+			{
+				//don't serialize name twice
+				if (findPointer((void*)name))
+					return;
+				int len = b3StrLen(name);
+				if (len)
+				{
+					int newLen = len+1;
+					int padding = ((newLen+3)&~3)-newLen;
+					newLen += padding;
+					//serialize name string now
+					b3Chunk* chunk = allocate(sizeof(char),newLen);
+					char* destinationName = (char*)chunk->m_oldPtr;
+					for (int i=0;i<len;i++)
+					{
+						destinationName[i] = name[i];
+					}
+					//zero the terminator and the alignment padding so that no uninitialized bytes end up in the output
+					for (int i=len;i<newLen;i++)
+					{
+						destinationName[i] = 0;
+					}
+					finalizeChunk(chunk,"char",B3_ARRAY_CODE,(void*)name);
+				}
+			}
+		}
+		///Returns the flags last stored with setSerializationFlags (0 after construction).
+		virtual int		getSerializationFlags() const
+		{
+			return m_serializationFlags;
+		}
+		///Replaces the serialization flags. finalizeChunk tests them for B3_SERIALIZE_NO_DUPLICATE_ASSERT.
+		virtual void	setSerializationFlags(int flags)
+		{
+			m_serializationFlags = flags;
+		}
+#endif //B3_SERIALIZER_H
diff --git a/src/bullet/BulletCollision/BroadphaseCollision/btAxisSweep3.h b/src/bullet/BulletCollision/BroadphaseCollision/btAxisSweep3.h
index 4f4d94b3..cd6e1a89 100644
--- a/src/bullet/BulletCollision/BroadphaseCollision/btAxisSweep3.h
+++ b/src/bullet/BulletCollision/BroadphaseCollision/btAxisSweep3.h
@@ -615,7 +615,7 @@ void btAxisSweep3Internal<BP_FP_INT_TYPE>::removeHandle(BP_FP_INT_TYPE handle,bt
 template <typename BP_FP_INT_TYPE>
-void btAxisSweep3Internal<BP_FP_INT_TYPE>::resetPool(btDispatcher* dispatcher)
+void btAxisSweep3Internal<BP_FP_INT_TYPE>::resetPool(btDispatcher* /*dispatcher*/)
 	if (m_numHandles == 0)
diff --git a/src/bullet/BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h b/src/bullet/BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h
index 36eec971..40565623 100644
--- a/src/bullet/BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h
@@ -23,6 +23,7 @@ struct btBroadphaseProxy;
 class btDispatcher;
 class btManifoldResult;
 class btCollisionObject;
+struct btCollisionObjectWrapper;
 struct btDispatcherInfo;
 class	btPersistentManifold;
@@ -69,7 +70,7 @@ public:
 	virtual ~btCollisionAlgorithm() {};
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut) = 0;
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut) = 0;
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut) = 0;
diff --git a/src/bullet/BulletCollision/BroadphaseCollision/btDbvt.cpp b/src/bullet/BulletCollision/BroadphaseCollision/btDbvt.cpp
index 95443af5..2ca20cdd 100644
--- a/src/bullet/BulletCollision/BroadphaseCollision/btDbvt.cpp
+++ b/src/bullet/BulletCollision/BroadphaseCollision/btDbvt.cpp
@@ -38,8 +38,9 @@ static DBVT_INLINE btDbvtVolume	merge(	const btDbvtVolume& a,
 									  const btDbvtVolume& b)
-	ATTRIBUTE_ALIGNED16(char locals[sizeof(btDbvtAabbMm)]);
-	btDbvtVolume&	res=*(btDbvtVolume*)locals;
+	ATTRIBUTE_ALIGNED16( char locals[sizeof(btDbvtAabbMm)]);
+	btDbvtVolume* ptr = (btDbvtVolume*) locals;
+	btDbvtVolume&	res=*ptr;
 		btDbvtVolume	res;
@@ -250,7 +251,8 @@ static btDbvtVolume				bounds(	const tNodeArray& leaves)
 	ATTRIBUTE_ALIGNED16(char	locals[sizeof(btDbvtVolume)]);
-	btDbvtVolume&	volume=*(btDbvtVolume*)locals;
+	btDbvtVolume* ptr = (btDbvtVolume*) locals;
+	btDbvtVolume&	volume=*ptr;
 	btDbvtVolume volume=leaves[0]->volume;
diff --git a/src/bullet/BulletCollision/BroadphaseCollision/btDbvt.h b/src/bullet/BulletCollision/BroadphaseCollision/btDbvt.h
index f8abaf13..bee17e5c 100644
--- a/src/bullet/BulletCollision/BroadphaseCollision/btDbvt.h
+++ b/src/bullet/BulletCollision/BroadphaseCollision/btDbvt.h
@@ -57,7 +57,7 @@ subject to the following restrictions:
 // Specific methods implementation
 //SSE gives errors on a MSVC 7.1
-#if defined (BT_USE_SSE) && defined (_WIN32)
+#if defined (BT_USE_SSE) //&& defined (_WIN32)
@@ -92,9 +92,7 @@ subject to the following restrictions:
-#if !defined( __CELLOS_LV2__) && !defined(__MWERKS__) && !defined(__native_client__)
+#if !defined( __CELLOS_LV2__) && !defined(__MWERKS__)
 #include <memory.h>
 #include <string.h>
@@ -124,6 +122,7 @@ subject to the following restrictions:
 #error "DBVT_INT0_IMPL undefined"
 // Defaults volumes
@@ -162,6 +161,10 @@ struct	btDbvtAabbMm
 		btDbvtAabbMm& r);
 	DBVT_INLINE friend bool			NotEqual(	const btDbvtAabbMm& a,
 		const btDbvtAabbMm& b);
+    DBVT_INLINE btVector3&	tMins()	{ return(mi); }
+	DBVT_INLINE btVector3&	tMaxs()	{ return(mx); }
 	DBVT_INLINE void				AddSpan(const btVector3& d,btScalar& smi,btScalar& smx) const;
@@ -186,6 +189,9 @@ struct	btDbvtNode
+typedef btAlignedObjectArray<const btDbvtNode*> btNodeStack;
 ///The btDbvt class implements a fast dynamic bounding volume tree based on axis aligned bounding boxes (aabb tree).
 ///This btDbvt is used for soft body collision detection and for the btDbvtBroadphase. It has a fast insert, remove and update of nodes.
 ///Unlike the btQuantizedBvh, nodes can be dynamically moved around, which allows for change in topology of the underlying data structure.
@@ -322,7 +328,17 @@ struct	btDbvt
 		void		collideTV(	const btDbvtNode* root,
 		const btDbvtVolume& volume,
+		DBVT_IPOLICY) const;
+	void		collideTVNoStackAlloc(	const btDbvtNode* root,
+						  const btDbvtVolume& volume,
+						  btNodeStack& stack,
+						  DBVT_IPOLICY) const;
 	///rayTest is a re-entrant ray test, and can be called in parallel as long as the btAlignedAlloc is thread-safe (uses locking etc)
 	///rayTest is slower than rayTestInternal, because it builds a local stack, using memory allocations, and it recomputes signs/rayDirectionInverses each time
@@ -521,7 +537,11 @@ DBVT_INLINE bool		Intersect(	const btDbvtAabbMm& a,
 	const __m128	rt(_mm_or_ps(	_mm_cmplt_ps(_mm_load_ps(b.mx),_mm_load_ps(a.mi)),
+#if defined (_WIN32)
 	const __int32*	pu((const __int32*)&rt);
+    const int*	pu((const int*)&rt);
 	return(	(a.mi.x()<=b.mx.x())&&
@@ -570,7 +590,12 @@ DBVT_INLINE int			Select(	const btDbvtAabbMm& o,
 							   const btDbvtAabbMm& b)
+#if defined (_WIN32)
 	static ATTRIBUTE_ALIGNED16(const unsigned __int32)	mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
+    static ATTRIBUTE_ALIGNED16(const unsigned int)	mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x00000000 /*0x7fffffff*/};
 	///@todo: the intrinsic version is 11% slower
@@ -906,39 +931,72 @@ inline void		btDbvt::collideTT(	const btDbvtNode* root0,
 inline void		btDbvt::collideTV(	const btDbvtNode* root,
 								  const btDbvtVolume& vol,
-								  DBVT_IPOLICY)
+								  DBVT_IPOLICY) const
-		if(root)
-		{
-			ATTRIBUTE_ALIGNED16(btDbvtVolume)		volume(vol);
-			btAlignedObjectArray<const btDbvtNode*>	stack;
-			stack.resize(0);
-			stack.reserve(SIMPLE_STACKSIZE);
-			stack.push_back(root);
-			do	{
-				const btDbvtNode*	n=stack[stack.size()-1];
-				stack.pop_back();
-				if(Intersect(n->volume,volume))
+	if(root)
+	{
+		ATTRIBUTE_ALIGNED16(btDbvtVolume)		volume(vol);
+		btAlignedObjectArray<const btDbvtNode*>	stack;
+		stack.resize(0);
+		stack.reserve(SIMPLE_STACKSIZE);
+		stack.push_back(root);
+		do	{
+			const btDbvtNode*	n=stack[stack.size()-1];
+			stack.pop_back();
+			if(Intersect(n->volume,volume))
+			{
+				if(n->isinternal())
-					if(n->isinternal())
-					{
-						stack.push_back(n->childs[0]);
-						stack.push_back(n->childs[1]);
-					}
-					else
-					{
-						policy.Process(n);
-					}
+					stack.push_back(n->childs[0]);
+					stack.push_back(n->childs[1]);
-			} while(stack.size()>0);
-		}
+				else
+				{
+					policy.Process(n);
+				}
+			}
+		} while(stack.size()>0);
+	}
+///Same traversal as collideTV, but the node stack is supplied by the caller instead of being a local array.
+///The stack is emptied and reserve(SIMPLE_STACKSIZE) is called before the traversal starts.
+///policy.Process is called for every leaf whose volume intersects vol; nothing happens for a null root.
+inline void		btDbvt::collideTVNoStackAlloc(	const btDbvtNode* root,
+											 const btDbvtVolume& vol,
+											 btNodeStack& stack,
+											 DBVT_IPOLICY) const
+	if(root)
+	{
+		ATTRIBUTE_ALIGNED16(btDbvtVolume)		volume(vol);
+		stack.resize(0);
+		stack.reserve(SIMPLE_STACKSIZE);
+		stack.push_back(root);
+		do	{
+			//depth first: take the most recently pushed node
+			const btDbvtNode*	n=stack[stack.size()-1];
+			stack.pop_back();
+			if(Intersect(n->volume,volume))
+			{
+				if(n->isinternal())
+				{
+					stack.push_back(n->childs[0]);
+					stack.push_back(n->childs[1]);
+				}
+				else
+				{
+					policy.Process(n);
+				}
+			}
+		} while(stack.size()>0);
+	}
 inline void		btDbvt::rayTestInternal(	const btDbvtNode* root,
 								const btVector3& rayFrom,
@@ -1182,19 +1240,34 @@ inline void		btDbvt::collideOCL(	const btDbvtNode* root,
 							/* Insert 0	*/ 
+							//void * memmove ( void * destination, const void * source, size_t num );
-							memmove(&stack[j+1],&stack[j],sizeof(int)*(stack.size()-j-1));
+                     {
+                     int num_items_to_move = stack.size()-1-j;
+                     if(num_items_to_move > 0)
+                        memmove(&stack[j+1],&stack[j],sizeof(int)*num_items_to_move);
+                     }
-							for(int k=stack.size()-1;k>j;--k) stack[k]=stack[k-1];
+                     for(int k=stack.size()-1;k>j;--k) {
+								stack[k]=stack[k-1];
+                     }
 							/* Insert 1	*/ 
-							memmove(&stack[j+1],&stack[j],sizeof(int)*(stack.size()-j-1));
+                     {
+                     int num_items_to_move = stack.size()-1-j;
+                     if(num_items_to_move > 0)
+                        memmove(&stack[j+1],&stack[j],sizeof(int)*num_items_to_move);
+                     }
-							for(int k=stack.size()-1;k>j;--k) stack[k]=stack[k-1];
+                     for(int k=stack.size()-1;k>j;--k) {
+                        stack[k]=stack[k-1];
+                     }
diff --git a/src/bullet/BulletCollision/BroadphaseCollision/btDispatcher.h b/src/bullet/BulletCollision/BroadphaseCollision/btDispatcher.h
index a79cf940..89c307d1 100644
--- a/src/bullet/BulletCollision/BroadphaseCollision/btDispatcher.h
+++ b/src/bullet/BulletCollision/BroadphaseCollision/btDispatcher.h
@@ -22,10 +22,9 @@ struct btBroadphaseProxy;
 class btRigidBody;
 class	btCollisionObject;
 class btOverlappingPairCache;
+struct btCollisionObjectWrapper;
 class btPersistentManifold;
-class btStackAlloc;
 class btPoolAllocator;
 struct btDispatcherInfo
@@ -47,8 +46,7 @@ struct btDispatcherInfo
-		m_convexConservativeDistanceThreshold(0.0f),
-		m_stackAllocator(0)
+		m_convexConservativeDistanceThreshold(0.0f)
@@ -64,7 +62,6 @@ struct btDispatcherInfo
 	btScalar	m_allowedCcdPenetration;
 	bool		m_useConvexConservativeDistanceUtil;
 	btScalar	m_convexConservativeDistanceThreshold;
-	btStackAlloc*	m_stackAllocator;
 ///The btDispatcher interface class can be used in combination with broadphase to dispatch calculations for overlapping pairs.
@@ -76,17 +73,17 @@ class btDispatcher
 	virtual ~btDispatcher() ;
-	virtual btCollisionAlgorithm* findAlgorithm(btCollisionObject* body0,btCollisionObject* body1,btPersistentManifold* sharedManifold=0) = 0;
+	virtual btCollisionAlgorithm* findAlgorithm(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,btPersistentManifold* sharedManifold=0) = 0;
-	virtual btPersistentManifold*	getNewManifold(void* body0,void* body1)=0;
+	virtual btPersistentManifold*	getNewManifold(const btCollisionObject* b0,const btCollisionObject* b1)=0;
 	virtual void releaseManifold(btPersistentManifold* manifold)=0;
 	virtual void clearManifold(btPersistentManifold* manifold)=0;
-	virtual bool	needsCollision(btCollisionObject* body0,btCollisionObject* body1) = 0;
+	virtual bool	needsCollision(const btCollisionObject* body0,const btCollisionObject* body1) = 0;
-	virtual bool	needsResponse(btCollisionObject* body0,btCollisionObject* body1)=0;
+	virtual bool	needsResponse(const btCollisionObject* body0,const btCollisionObject* body1)=0;
 	virtual void	dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher)  =0;
diff --git a/src/bullet/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp b/src/bullet/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp
index 041bbe05..ad69fcbd 100644
--- a/src/bullet/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp
+++ b/src/bullet/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp
@@ -34,7 +34,6 @@ int gFindPairs =0;
-	m_blockedForChanges(false),
 	int initialAllocatedSize= 2;
@@ -53,7 +52,7 @@ btHashedOverlappingPairCache::~btHashedOverlappingPairCache()
 void	btHashedOverlappingPairCache::cleanOverlappingPair(btBroadphasePair& pair,btDispatcher* dispatcher)
-	if (pair.m_algorithm)
+	if (pair.m_algorithm && dispatcher)
diff --git a/src/bullet/BulletCollision/BroadphaseCollision/btOverlappingPairCache.h b/src/bullet/BulletCollision/BroadphaseCollision/btOverlappingPairCache.h
index 7a3806c1..14614270 100644
--- a/src/bullet/BulletCollision/BroadphaseCollision/btOverlappingPairCache.h
+++ b/src/bullet/BulletCollision/BroadphaseCollision/btOverlappingPairCache.h
@@ -94,7 +94,12 @@ class btHashedOverlappingPairCache : public btOverlappingPairCache
 	btBroadphasePairArray	m_overlappingPairArray;
 	btOverlapFilterCallback* m_overlapFilterCallback;
-	bool		m_blockedForChanges;
+	btAlignedObjectArray<int>	m_hashTable;
+	btAlignedObjectArray<int>	m_next;
+	btOverlappingPairCallback*	m_ghostPairCallback;
@@ -265,11 +270,6 @@ private:
 	virtual void	sortOverlappingPairs(btDispatcher* dispatcher);
-	btAlignedObjectArray<int>	m_hashTable;
-	btAlignedObjectArray<int>	m_next;
-	btOverlappingPairCallback*	m_ghostPairCallback;
diff --git a/src/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.cpp b/src/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.cpp
index c911435a..93de4999 100644
--- a/src/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.cpp
+++ b/src/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.cpp
@@ -96,7 +96,27 @@ void	btQuantizedBvh::setQuantizationValues(const btVector3& bvhAabbMin,const btV
 	m_bvhAabbMax = bvhAabbMax + clampValue;
 	btVector3 aabbSize = m_bvhAabbMax - m_bvhAabbMin;
 	m_bvhQuantization = btVector3(btScalar(65533.0),btScalar(65533.0),btScalar(65533.0)) / aabbSize;
 	m_useQuantization = true;
+	{
+		unsigned short vecIn[3];
+		btVector3 v;
+		{
+			quantize(vecIn,m_bvhAabbMin,false);
+			v = unQuantize(vecIn);
+			m_bvhAabbMin.setMin(v-clampValue);
+		}
+        aabbSize = m_bvhAabbMax - m_bvhAabbMin;
+        m_bvhQuantization = btVector3(btScalar(65533.0),btScalar(65533.0),btScalar(65533.0)) / aabbSize;
+		{
+			quantize(vecIn,m_bvhAabbMax,true);
+			v = unQuantize(vecIn);
+			m_bvhAabbMax.setMax(v+clampValue);
+		}
+		aabbSize = m_bvhAabbMax - m_bvhAabbMin;
+		m_bvhQuantization = btVector3(btScalar(65533.0),btScalar(65533.0),btScalar(65533.0)) / aabbSize;
+	}
diff --git a/src/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.h b/src/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.h
index 579cc9a5..78382da7 100644
--- a/src/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.h
+++ b/src/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.h
@@ -78,8 +78,10 @@ ATTRIBUTE_ALIGNED16	(struct) btQuantizedBvhNode
 	int	getTriangleIndex() const
+		unsigned int x=0;
+		unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
 		// Get only the lower bits where the triangle index is stored
-		return (m_escapeIndexOrTriangleIndex&~((~0)<<(31-MAX_NUM_PARTS_IN_BITS)));
+		return (m_escapeIndexOrTriangleIndex&~(y));
 	int	getPartId() const
diff --git a/src/bullet/BulletCollision/CollisionDispatch/SphereTriangleDetector.cpp b/src/bullet/BulletCollision/CollisionDispatch/SphereTriangleDetector.cpp
index 23a5c752..63401780 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/SphereTriangleDetector.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/SphereTriangleDetector.cpp
@@ -158,7 +158,6 @@ bool SphereTriangleDetector::collide(const btVector3& sphereCenter,btVector3 &po
 				depth = -(radius-distance);
 			} else
-				btScalar distance = 0.f;
 				resultNormal = normal;
 				point = contactPoint;
 				depth = -radius;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.cpp
index 7e5da6c5..57f14649 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.cpp
@@ -24,7 +24,7 @@ btActivatingCollisionAlgorithm::btActivatingCollisionAlgorithm (const btCollisio
-btActivatingCollisionAlgorithm::btActivatingCollisionAlgorithm (const btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* colObj0,btCollisionObject* colObj1)
+btActivatingCollisionAlgorithm::btActivatingCollisionAlgorithm (const btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* ,const btCollisionObjectWrapper* )
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h
index 25fe0889..489812b9 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h
@@ -28,7 +28,7 @@ public:
 	btActivatingCollisionAlgorithm (const btCollisionAlgorithmConstructionInfo& ci);
-	btActivatingCollisionAlgorithm (const btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* colObj0,btCollisionObject* colObj1);
+	btActivatingCollisionAlgorithm (const btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 	virtual ~btActivatingCollisionAlgorithm();
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
index 2182d0d7..2c362778 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
@@ -22,17 +22,18 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/CollisionDispatch/btBoxBoxDetector.h"
 #include "BulletCollision/CollisionShapes/btBox2dShape.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
-btBox2dBox2dCollisionAlgorithm::btBox2dBox2dCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* obj0,btCollisionObject* obj1)
-: btActivatingCollisionAlgorithm(ci,obj0,obj1),
+btBox2dBox2dCollisionAlgorithm::btBox2dBox2dCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* obj0Wrap,const btCollisionObjectWrapper* obj1Wrap)
+: btActivatingCollisionAlgorithm(ci,obj0Wrap,obj1Wrap),
-	if (!m_manifoldPtr && m_dispatcher->needsCollision(obj0,obj1))
+	if (!m_manifoldPtr && m_dispatcher->needsCollision(obj0Wrap->getCollisionObject(),obj1Wrap->getCollisionObject()))
-		m_manifoldPtr = m_dispatcher->getNewManifold(obj0,obj1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(obj0Wrap->getCollisionObject(),obj1Wrap->getCollisionObject());
 		m_ownManifold = true;
@@ -52,19 +53,18 @@ btBox2dBox2dCollisionAlgorithm::~btBox2dBox2dCollisionAlgorithm()
 void b2CollidePolygons(btManifoldResult* manifold,  const btBox2dShape* polyA, const btTransform& xfA, const btBox2dShape* polyB, const btTransform& xfB);
 //#include <stdio.h>
-void btBox2dBox2dCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btBox2dBox2dCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 	if (!m_manifoldPtr)
-	btCollisionObject*	col0 = body0;
-	btCollisionObject*	col1 = body1;
-	btBox2dShape* box0 = (btBox2dShape*)col0->getCollisionShape();
-	btBox2dShape* box1 = (btBox2dShape*)col1->getCollisionShape();
+	const btBox2dShape* box0 = (const btBox2dShape*)body0Wrap->getCollisionShape();
+	const btBox2dShape* box1 = (const btBox2dShape*)body1Wrap->getCollisionShape();
-	b2CollidePolygons(resultOut,box0,col0->getWorldTransform(),box1,col1->getWorldTransform());
+	b2CollidePolygons(resultOut,box0,body0Wrap->getWorldTransform(),box1,body1Wrap->getWorldTransform());
 	//  refreshContactPoints is only necessary when using persistent contact points. otherwise all points are newly added
 	if (m_ownManifold)
@@ -151,15 +151,8 @@ static btScalar EdgeSeparation(const btBox2dShape* poly1, const btTransform& xf1
 	int index = 0;
 	btScalar minDot = BT_LARGE_FLOAT;
-	for (int i = 0; i < count2; ++i)
-	{
-		btScalar dot = b2Dot(vertices2[i], normal1);
-		if (dot < minDot)
-		{
-			minDot = dot;
-			index = i;
-		}
-	}
+    if( count2 > 0 )
+        index = (int) normal1.minDot( vertices2, count2, minDot);
 	btVector3 v1 = b2Mul(xf1, vertices1[edge1]);
 	btVector3 v2 = b2Mul(xf2, vertices2[index]);
@@ -181,16 +174,9 @@ static btScalar FindMaxSeparation(int* edgeIndex,
 	// Find edge normal on poly1 that has the largest projection onto d.
 	int edge = 0;
-	btScalar maxDot = -BT_LARGE_FLOAT;
-	for (int i = 0; i < count1; ++i)
-	{
-		btScalar dot = b2Dot(normals1[i], dLocal1);
-		if (dot > maxDot)
-		{
-			maxDot = dot;
-			edge = i;
-		}
-	}
+    btScalar maxDot = -BT_LARGE_FLOAT; // keep the pre-patch initial value so maxDot is defined even when count1 <= 0 and the search is skipped
+    if( count1 > 0 )
+        edge = (int) dLocal1.maxDot( normals1, count1, maxDot);
 	// Get the separation for the edge normal.
 	btScalar s = EdgeSeparation(poly1, xf1, edge, poly2, xf2);
@@ -368,7 +354,7 @@ void b2CollidePolygons(btManifoldResult* manifold,
 	btVector3 v11 = vertices1[edge1];
 	btVector3 v12 = edge1 + 1 < count1 ? vertices1[edge1+1] : vertices1[0];
-	btVector3 dv = v12 - v11;
+	//btVector3 dv = v12 - v11;
 	btVector3 sideNormal = b2Mul(xf1.getBasis(), v12 - v11);
 	btVector3 frontNormal = btCrossS(sideNormal, 1.0f);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h
index 97c5be77..6ea6e89b 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h
@@ -33,11 +33,11 @@ public:
 	btBox2dBox2dCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci)
 		: btActivatingCollisionAlgorithm(ci) {}
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
-	btBox2dBox2dCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	btBox2dBox2dCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 	virtual ~btBox2dBox2dCollisionAlgorithm();
@@ -52,11 +52,11 @@ public:
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			int bbsize = sizeof(btBox2dBox2dCollisionAlgorithm);
 			void* ptr = ci.m_dispatcher1->allocateCollisionAlgorithm(bbsize);
-			return new(ptr) btBox2dBox2dCollisionAlgorithm(0,ci,body0,body1);
+			return new(ptr) btBox2dBox2dCollisionAlgorithm(0,ci,body0Wrap,body1Wrap);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.cpp
index 49628853..ac68968f 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.cpp
@@ -18,17 +18,17 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btBoxShape.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "btBoxBoxDetector.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
-btBoxBoxCollisionAlgorithm::btBoxBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* obj0,btCollisionObject* obj1)
-: btActivatingCollisionAlgorithm(ci,obj0,obj1),
+btBoxBoxCollisionAlgorithm::btBoxBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
-	if (!m_manifoldPtr && m_dispatcher->needsCollision(obj0,obj1))
+	if (!m_manifoldPtr && m_dispatcher->needsCollision(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject()))
-		m_manifoldPtr = m_dispatcher->getNewManifold(obj0,obj1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 		m_ownManifold = true;
@@ -42,15 +42,14 @@ btBoxBoxCollisionAlgorithm::~btBoxBoxCollisionAlgorithm()
-void btBoxBoxCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btBoxBoxCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 	if (!m_manifoldPtr)
-	btCollisionObject*	col0 = body0;
-	btCollisionObject*	col1 = body1;
-	btBoxShape* box0 = (btBoxShape*)col0->getCollisionShape();
-	btBoxShape* box1 = (btBoxShape*)col1->getCollisionShape();
+	const btBoxShape* box0 = (const btBoxShape*)body0Wrap->getCollisionShape(); // cast keeps const, matching the btBox2dShape casts in btBox2dBox2dCollisionAlgorithm
+	const btBoxShape* box1 = (const btBoxShape*)body1Wrap->getCollisionShape();
@@ -62,8 +61,8 @@ void btBoxBoxCollisionAlgorithm::processCollision (btCollisionObject* body0,btCo
 	btDiscreteCollisionDetectorInterface::ClosestPointInput input;
 	input.m_maximumDistanceSquared = BT_LARGE_FLOAT;
-	input.m_transformA = body0->getWorldTransform();
-	input.m_transformB = body1->getWorldTransform();
+	input.m_transformA = body0Wrap->getWorldTransform();
+	input.m_transformB = body1Wrap->getWorldTransform();
 	btBoxBoxDetector detector(box0,box1);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h
index f0bbae61..59808df5 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h
@@ -33,11 +33,11 @@ public:
 	btBoxBoxCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci)
 		: btActivatingCollisionAlgorithm(ci) {}
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
-	btBoxBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	btBoxBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 	virtual ~btBoxBoxCollisionAlgorithm();
@@ -52,11 +52,11 @@ public:
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			int bbsize = sizeof(btBoxBoxCollisionAlgorithm);
 			void* ptr = ci.m_dispatcher1->allocateCollisionAlgorithm(bbsize);
-			return new(ptr) btBoxBoxCollisionAlgorithm(0,ci,body0,body1);
+			return new(ptr) btBoxBoxCollisionAlgorithm(0,ci,body0Wrap,body1Wrap);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxDetector.cpp b/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxDetector.cpp
index a7c8cf14..7043bde3 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxDetector.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxDetector.cpp
@@ -24,7 +24,7 @@ subject to the following restrictions:
 #include <float.h>
 #include <string.h>
-btBoxBoxDetector::btBoxBoxDetector(btBoxShape* box1,btBoxShape* box2)
+btBoxBoxDetector::btBoxBoxDetector(const btBoxShape* box1,const btBoxShape* box2)
 : m_box1(box1),
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxDetector.h b/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxDetector.h
index 3c941f7d..39243777 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxDetector.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btBoxBoxDetector.h
@@ -28,12 +28,12 @@ class btBoxShape;
 /// re-distributed under the Zlib license with permission from Russell L. Smith
 struct btBoxBoxDetector : public btDiscreteCollisionDetectorInterface
-	btBoxShape* m_box1;
-	btBoxShape* m_box2;
+	const btBoxShape* m_box1;
+	const btBoxShape* m_box2;
-	btBoxBoxDetector(btBoxShape* box1,btBoxShape* box2);
+	btBoxBoxDetector(const btBoxShape* box1,const btBoxShape* box2);
 	virtual ~btBoxBoxDetector() {};
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionConfiguration.h b/src/bullet/BulletCollision/CollisionDispatch/btCollisionConfiguration.h
index f63e0923..66949849 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btCollisionConfiguration.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionConfiguration.h
@@ -18,7 +18,6 @@ subject to the following restrictions:
 struct btCollisionAlgorithmCreateFunc;
-class btStackAlloc;
 class btPoolAllocator;
 ///btCollisionConfiguration allows to configure Bullet collision detection
@@ -38,7 +37,6 @@ public:
 	virtual btPoolAllocator* getCollisionAlgorithmPool() = 0;
-	virtual btStackAlloc*	getStackAllocator() = 0;
 	virtual btCollisionAlgorithmCreateFunc* getCollisionAlgorithmCreateFunc(int proxyType0,int proxyType1) =0;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionCreateFunc.h b/src/bullet/BulletCollision/CollisionDispatch/btCollisionCreateFunc.h
index 1d7e7440..62ee66c4 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btCollisionCreateFunc.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionCreateFunc.h
@@ -19,7 +19,7 @@ subject to the following restrictions:
 #include "LinearMath/btAlignedObjectArray.h"
 class btCollisionAlgorithm;
 class btCollisionObject;
+struct btCollisionObjectWrapper;
 struct btCollisionAlgorithmConstructionInfo;
 ///Used by the btCollisionDispatcher to register and create instances for btCollisionAlgorithm
@@ -33,11 +33,11 @@ struct btCollisionAlgorithmCreateFunc
 	virtual ~btCollisionAlgorithmCreateFunc(){};
-	virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& , btCollisionObject* body0,btCollisionObject* body1)
+	virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& , const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
-		(void)body0;
-		(void)body1;
+		(void)body0Wrap;
+		(void)body1Wrap;
 		return 0;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionDispatcher.cpp b/src/bullet/BulletCollision/CollisionDispatch/btCollisionDispatcher.cpp
index 29674f3b..3b6913c0 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btCollisionDispatcher.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionDispatcher.cpp
@@ -25,6 +25,7 @@ subject to the following restrictions:
 #include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
 #include "LinearMath/btPoolAllocator.h"
 #include "BulletCollision/CollisionDispatch/btCollisionConfiguration.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 int gNumManifold = 0;
@@ -67,15 +68,13 @@ btCollisionDispatcher::~btCollisionDispatcher()
-btPersistentManifold*	btCollisionDispatcher::getNewManifold(void* b0,void* b1) 
+btPersistentManifold*	btCollisionDispatcher::getNewManifold(const btCollisionObject* body0,const btCollisionObject* body1) 
 	//btAssert(gNumManifold < 65535);
-	btCollisionObject* body0 = (btCollisionObject*)b0;
-	btCollisionObject* body1 = (btCollisionObject*)b1;
 	//optional relative contact breaking threshold, turned on by default (use setDispatcherFlags to switch off feature for improved performance)
@@ -85,7 +84,7 @@ btPersistentManifold*	btCollisionDispatcher::getNewManifold(void* b0,void* b1)
 	btScalar contactProcessingThreshold = btMin(body0->getContactProcessingThreshold(),body1->getContactProcessingThreshold());
-	void* mem = 0;
+ 	void* mem = 0;
 	if (m_persistentManifoldPoolAllocator->getFreeCount())
@@ -143,14 +142,14 @@ void btCollisionDispatcher::releaseManifold(btPersistentManifold* manifold)
-btCollisionAlgorithm* btCollisionDispatcher::findAlgorithm(btCollisionObject* body0,btCollisionObject* body1,btPersistentManifold* sharedManifold)
+btCollisionAlgorithm* btCollisionDispatcher::findAlgorithm(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,btPersistentManifold* sharedManifold)
 	btCollisionAlgorithmConstructionInfo ci;
 	ci.m_dispatcher1 = this;
 	ci.m_manifold = sharedManifold;
-	btCollisionAlgorithm* algo = m_doubleDispatch[body0->getCollisionShape()->getShapeType()][body1->getCollisionShape()->getShapeType()]->CreateCollisionAlgorithm(ci,body0,body1);
+	btCollisionAlgorithm* algo = m_doubleDispatch[body0Wrap->getCollisionShape()->getShapeType()][body1Wrap->getCollisionShape()->getShapeType()]->CreateCollisionAlgorithm(ci,body0Wrap,body1Wrap);
 	return algo;
@@ -158,7 +157,7 @@ btCollisionAlgorithm* btCollisionDispatcher::findAlgorithm(btCollisionObject* bo
-bool	btCollisionDispatcher::needsResponse(btCollisionObject* body0,btCollisionObject* body1)
+bool	btCollisionDispatcher::needsResponse(const btCollisionObject* body0,const btCollisionObject* body1)
 	//here you can do filtering
 	bool hasResponse = 
@@ -169,7 +168,7 @@ bool	btCollisionDispatcher::needsResponse(btCollisionObject* body0,btCollisionOb
 	return hasResponse;
-bool	btCollisionDispatcher::needsCollision(btCollisionObject* body0,btCollisionObject* body1)
+bool	btCollisionDispatcher::needsCollision(const btCollisionObject* body0,const btCollisionObject* body1)
@@ -190,7 +189,7 @@ bool	btCollisionDispatcher::needsCollision(btCollisionObject* body0,btCollisionO
 	if ((!body0->isActive()) && (!body1->isActive()))
 		needsCollision = false;
-	else if (!body0->checkCollideWith(body1))
+	else if ((!body0->checkCollideWith(body1)) || (!body1->checkCollideWith(body0)))
 		needsCollision = false;
 	return needsCollision ;
@@ -259,20 +258,25 @@ void btCollisionDispatcher::defaultNearCallback(btBroadphasePair& collisionPair,
 		if (dispatcher.needsCollision(colObj0,colObj1))
+			btCollisionObjectWrapper obj0Wrap(0,colObj0->getCollisionShape(),colObj0,colObj0->getWorldTransform(),-1,-1);
+			btCollisionObjectWrapper obj1Wrap(0,colObj1->getCollisionShape(),colObj1,colObj1->getWorldTransform(),-1,-1);
 			//dispatcher will keep algorithms persistent in the collision pair
 			if (!collisionPair.m_algorithm)
-				collisionPair.m_algorithm = dispatcher.findAlgorithm(colObj0,colObj1);
+				collisionPair.m_algorithm = dispatcher.findAlgorithm(&obj0Wrap,&obj1Wrap);
 			if (collisionPair.m_algorithm)
-				btManifoldResult contactPointResult(colObj0,colObj1);
+				btManifoldResult contactPointResult(&obj0Wrap,&obj1Wrap);
 				if (dispatchInfo.m_dispatchFunc == 		btDispatcherInfo::DISPATCH_DISCRETE)
 					//discrete collision detection query
-					collisionPair.m_algorithm->processCollision(colObj0,colObj1,dispatchInfo,&contactPointResult);
+					collisionPair.m_algorithm->processCollision(&obj0Wrap,&obj1Wrap,dispatchInfo,&contactPointResult);
 				} else
 					//continuous collision detection query, time of impact (toi)
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionDispatcher.h b/src/bullet/BulletCollision/CollisionDispatch/btCollisionDispatcher.h
index 5accad9a..92696ee5 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btCollisionDispatcher.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionDispatcher.h
@@ -108,19 +108,18 @@ public:
 	virtual ~btCollisionDispatcher();
-	virtual btPersistentManifold*	getNewManifold(void* b0,void* b1);
+	virtual btPersistentManifold*	getNewManifold(const btCollisionObject* b0,const btCollisionObject* b1);
 	virtual void releaseManifold(btPersistentManifold* manifold);
 	virtual void clearManifold(btPersistentManifold* manifold);
-	btCollisionAlgorithm* findAlgorithm(btCollisionObject* body0,btCollisionObject* body1,btPersistentManifold* sharedManifold = 0);
+	btCollisionAlgorithm* findAlgorithm(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,btPersistentManifold* sharedManifold = 0);
-	virtual bool	needsCollision(btCollisionObject* body0,btCollisionObject* body1);
+	virtual bool	needsCollision(const btCollisionObject* body0,const btCollisionObject* body1);
-	virtual bool	needsResponse(btCollisionObject* body0,btCollisionObject* body1);
+	virtual bool	needsResponse(const btCollisionObject* body0,const btCollisionObject* body1);
 	virtual void	dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher) ;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionObject.cpp b/src/bullet/BulletCollision/CollisionDispatch/btCollisionObject.cpp
index 580ea345..395df3a5 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btCollisionObject.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionObject.cpp
@@ -4,8 +4,8 @@ Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
@@ -32,12 +32,15 @@ btCollisionObject::btCollisionObject()
+		m_rollingFriction(0.0f),
+		m_userIndex(-1),
-		m_checkCollideWith(false)
+		m_checkCollideWith(false),
+		m_updateRevision(0)
@@ -46,18 +49,18 @@ btCollisionObject::~btCollisionObject()
-void btCollisionObject::setActivationState(int newState) 
+void btCollisionObject::setActivationState(int newState) const
 	if ( (m_activationState1 != DISABLE_DEACTIVATION) && (m_activationState1 != DISABLE_SIMULATION))
 		m_activationState1 = newState;
-void btCollisionObject::forceActivationState(int newState)
+void btCollisionObject::forceActivationState(int newState) const
 	m_activationState1 = newState;
-void btCollisionObject::activate(bool forceActivation)
+void btCollisionObject::activate(bool forceActivation) const
 	if (forceActivation || !(m_collisionFlags & (CF_STATIC_OBJECT|CF_KINEMATIC_OBJECT)))
@@ -85,9 +88,9 @@ const char* btCollisionObject::serialize(void* dataBuffer, btSerializer* seriali
 	dataOut->m_islandTag1 = m_islandTag1;
 	dataOut->m_companionId = m_companionId;
 	dataOut->m_activationState1 = m_activationState1;
-	dataOut->m_activationState1 = m_activationState1;
 	dataOut->m_deactivationTime = m_deactivationTime;
 	dataOut->m_friction = m_friction;
+	dataOut->m_rollingFriction = m_rollingFriction;
 	dataOut->m_restitution = m_restitution;
 	dataOut->m_internalType = m_internalType;
@@ -100,7 +103,6 @@ const char* btCollisionObject::serialize(void* dataBuffer, btSerializer* seriali
 	dataOut->m_hitFraction = m_hitFraction;
 	dataOut->m_ccdSweptSphereRadius = m_ccdSweptSphereRadius;
 	dataOut->m_ccdMotionThreshold = m_ccdMotionThreshold;
-	dataOut->m_ccdMotionThreshold = m_ccdMotionThreshold;
 	dataOut->m_checkCollideWith = m_checkCollideWith;
 	return btCollisionObjectDataName;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionObject.h b/src/bullet/BulletCollision/CollisionDispatch/btCollisionObject.h
index 3a11c967..c6840241 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btCollisionObject.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionObject.h
@@ -80,18 +80,22 @@ protected:
 	int				m_islandTag1;
 	int				m_companionId;
-	int				m_activationState1;
-	btScalar			m_deactivationTime;
+	mutable int				m_activationState1;
+	mutable btScalar			m_deactivationTime;
 	btScalar		m_friction;
 	btScalar		m_restitution;
+	btScalar		m_rollingFriction;
 	///m_internalType is reserved to distinguish Bullet's btCollisionObject, btRigidBody, btSoftBody, btGhostObject etc.
 	///do not assign your own m_internalType unless you write a new dynamics object class.
 	int				m_internalType;
 	///users can point to their objects, m_userPointer is not used by Bullet, see setUserPointer/getUserPointer
-	void*			m_userObjectPointer;
+    void*			m_userObjectPointer;
+    int	m_userIndex;
 	///time of impact calculation
 	btScalar		m_hitFraction; 
@@ -105,10 +109,11 @@ protected:
 	/// If some object should have elaborate collision filtering by sub-classes
 	int			m_checkCollideWith;
-	virtual bool	checkCollideWithOverride(btCollisionObject* /* co */)
-	{
-		return true;
-	}
+	btAlignedObjectArray<const btCollisionObject*> m_objectsWithoutCollisionCheck;
+	///internal update revision number. It will be increased when the object changes. This allows some subsystems to perform lazy evaluation.
+	int			m_updateRevision;
@@ -134,7 +139,15 @@ public:
+	};
+	enum AnisotropicFrictionFlags
+	{
 	SIMD_FORCE_INLINE bool mergesSimulationIslands() const
@@ -147,14 +160,15 @@ public:
 		return m_anisotropicFriction;
-	void	setAnisotropicFriction(const btVector3& anisotropicFriction)
+	void	setAnisotropicFriction(const btVector3& anisotropicFriction, int frictionMode = CF_ANISOTROPIC_FRICTION)
 		m_anisotropicFriction = anisotropicFriction;
-		m_hasAnisotropicFriction = (anisotropicFriction[0]!=1.f) || (anisotropicFriction[1]!=1.f) || (anisotropicFriction[2]!=1.f);
+		bool isNonUnity = (anisotropicFriction[0]!=1.f) || (anisotropicFriction[1]!=1.f) || (anisotropicFriction[2]!=1.f); // true when any component differs from 1
+		m_hasAnisotropicFriction = isNonUnity?frictionMode : 0; // record the friction mode only when some component is not 1; otherwise clear the flag
-	bool	hasAnisotropicFriction() const
+	bool	hasAnisotropicFriction(int frictionMode = CF_ANISOTROPIC_FRICTION) const
-		return m_hasAnisotropicFriction!=0;
+		return (m_hasAnisotropicFriction&frictionMode)!=0;
 	///the constraint solver can discard solving contacts, if the distance is above this threshold. 0 by default.
@@ -193,6 +207,7 @@ public:
 	virtual void	setCollisionShape(btCollisionShape* collisionShape)
+		m_updateRevision++;
 		m_collisionShape = collisionShape;
 		m_rootCollisionShape = collisionShape;
@@ -207,22 +222,36 @@ public:
 		return m_collisionShape;
-	SIMD_FORCE_INLINE const btCollisionShape*	getRootCollisionShape() const
+	void	setIgnoreCollisionCheck(const btCollisionObject* co, bool ignoreCollisionCheck)
-		return m_rootCollisionShape;
+		if (ignoreCollisionCheck)
+		{
+			//We don't check for duplicates. Is it ok to leave that up to the user of this API?
+			//int index = m_objectsWithoutCollisionCheck.findLinearSearch(co);
+			//if (index == m_objectsWithoutCollisionCheck.size())
+			//{
+			m_objectsWithoutCollisionCheck.push_back(co);
+			//}
+		}
+		else
+		{
+			m_objectsWithoutCollisionCheck.remove(co);
+		}
+		m_checkCollideWith = m_objectsWithoutCollisionCheck.size() > 0;
-	SIMD_FORCE_INLINE btCollisionShape*	getRootCollisionShape()
+	virtual bool	checkCollideWithOverride(const btCollisionObject*  co) const
-		return m_rootCollisionShape;
+		int index = m_objectsWithoutCollisionCheck.findLinearSearch(co);
+		if (index < m_objectsWithoutCollisionCheck.size())
+		{
+			return false;
+		}
+		return true;
-	///Avoid using this internal API call
-	///internalSetTemporaryCollisionShape is used to temporary replace the actual collision shape by a child collision shape.
-	void	internalSetTemporaryCollisionShape(btCollisionShape* collisionShape)
-	{
-		m_collisionShape = collisionShape;
-	}
 	///Avoid using this internal API call, the extension pointer is used by some Bullet extensions. 
 	///If you need to store your own user pointer, use 'setUserPointer/getUserPointer' instead.
@@ -239,7 +268,7 @@ public:
 	SIMD_FORCE_INLINE	int	getActivationState() const { return m_activationState1;}
-	void setActivationState(int newState);
+	void setActivationState(int newState) const;
 	void	setDeactivationTime(btScalar time)
@@ -250,9 +279,9 @@ public:
 		return m_deactivationTime;
-	void forceActivationState(int newState);
+	void forceActivationState(int newState) const;
-	void	activate(bool forceActivation = false);
+	void	activate(bool forceActivation = false) const;
 	SIMD_FORCE_INLINE bool isActive() const
@@ -261,6 +290,7 @@ public:
 	void	setRestitution(btScalar rest)
+		m_updateRevision++;
 		m_restitution = rest;
 	btScalar	getRestitution() const
@@ -269,6 +299,7 @@ public:
 	void	setFriction(btScalar frict)
+		m_updateRevision++;
 		m_friction = frict;
 	btScalar	getFriction() const
@@ -276,6 +307,17 @@ public:
 		return m_friction;
+	void	setRollingFriction(btScalar frict)
+	{
+		m_updateRevision++;
+		m_rollingFriction = frict;
+	}
+	btScalar	getRollingFriction() const
+	{
+		return m_rollingFriction;
+	}
 	///reserved for Bullet internal usage
 	int	getInternalType() const
@@ -294,6 +336,7 @@ public:
 	void	setWorldTransform(const btTransform& worldTrans)
+		m_updateRevision++;
 		m_worldTransform = worldTrans;
@@ -326,16 +369,19 @@ public:
 	void	setInterpolationWorldTransform(const btTransform&	trans)
+		m_updateRevision++;
 		m_interpolationWorldTransform = trans;
 	void	setInterpolationLinearVelocity(const btVector3& linvel)
+		m_updateRevision++;
 		m_interpolationLinearVelocity = linvel;
 	void	setInterpolationAngularVelocity(const btVector3& angvel)
+		m_updateRevision++;
 		m_interpolationAngularVelocity = angvel;
@@ -425,15 +471,30 @@ public:
 		return m_userObjectPointer;
+	int	getUserIndex() const
+	{
+		return m_userIndex;
+	}
 	///users can point to their objects, userPointer is not used by Bullet
 	void	setUserPointer(void* userPointer)
 		m_userObjectPointer = userPointer;
+	///users can store their own index for this object, see setUserIndex/getUserIndex
+	void	setUserIndex(int index)
+	{
+		m_userIndex = index;
+	}
+	int	getUpdateRevisionInternal() const
+	{
+		return m_updateRevision;
+	}
-	inline bool checkCollideWith(btCollisionObject* co)
+	inline bool checkCollideWith(const btCollisionObject* co) const
 		if (m_checkCollideWith)
 			return checkCollideWithOverride(co);
@@ -466,6 +527,7 @@ struct	btCollisionObjectDoubleData
 	double					m_contactProcessingThreshold;	
 	double					m_deactivationTime;
 	double					m_friction;
+	double					m_rollingFriction;
 	double					m_restitution;
 	double					m_hitFraction; 
 	double					m_ccdSweptSphereRadius;
@@ -498,6 +560,8 @@ struct	btCollisionObjectFloatData
 	float					m_contactProcessingThreshold;	
 	float					m_deactivationTime;
 	float					m_friction;
+	float					m_rollingFriction;
 	float					m_restitution;
 	float					m_hitFraction; 
 	float					m_ccdSweptSphereRadius;
@@ -510,6 +574,7 @@ struct	btCollisionObjectFloatData
 	int						m_activationState1;
 	int						m_internalType;
 	int						m_checkCollideWith;
+	char					m_padding[4];
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h b/src/bullet/BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h
new file mode 100644
index 00000000..952440b7
--- /dev/null
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h
@@ -0,0 +1,43 @@
+///btCollisionObjectWrapper is an internal data structure.
+///Most users can ignore this and use btCollisionObject and btCollisionShape instead
+class btCollisionShape;
+class btCollisionObject;
+class btTransform;
+#include "LinearMath/btScalar.h" // for SIMD_FORCE_INLINE definition
+	private: \
+		void* operator new(size_t size); \
+		void operator delete(void*);
+struct btCollisionObjectWrapper;
+struct btCollisionObjectWrapper
+	btCollisionObjectWrapper(const btCollisionObjectWrapper&); // not implemented. Not allowed.
+	btCollisionObjectWrapper* operator=(const btCollisionObjectWrapper&);
+	const btCollisionObjectWrapper* m_parent;
+	const btCollisionShape* m_shape;
+	const btCollisionObject* m_collisionObject;
+	const btTransform& m_worldTransform;
+	int		m_partId;
+	int		m_index;
+	btCollisionObjectWrapper(const btCollisionObjectWrapper* parent, const btCollisionShape* shape, const btCollisionObject* collisionObject, const btTransform& worldTransform, int partId, int index)
+	: m_parent(parent), m_shape(shape), m_collisionObject(collisionObject), m_worldTransform(worldTransform),
+	m_partId(partId), m_index(index)
+	{}
+	SIMD_FORCE_INLINE const btTransform& getWorldTransform() const { return m_worldTransform; }
+	SIMD_FORCE_INLINE const btCollisionObject* getCollisionObject() const { return m_collisionObject; }
+	SIMD_FORCE_INLINE const btCollisionShape* getCollisionShape() const { return m_shape; }
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorld.cpp b/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorld.cpp
index 66b93b88..fa4cac66 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorld.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorld.cpp
@@ -31,9 +31,9 @@ subject to the following restrictions:
 #include "BulletCollision/BroadphaseCollision/btDbvt.h"
 #include "LinearMath/btAabbUtil2.h"
 #include "LinearMath/btQuickprof.h"
-#include "LinearMath/btStackAlloc.h"
 #include "LinearMath/btSerializer.h"
 #include "BulletCollision/CollisionShapes/btConvexPolyhedron.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
@@ -72,8 +72,6 @@ m_broadphasePairCache(pairCache),
-	m_stackAlloc = collisionConfiguration->getStackAllocator();
-	m_dispatchInfo.m_stackAllocator = m_stackAlloc;
@@ -207,6 +205,11 @@ void	btCollisionWorld::updateAabbs()
+void	btCollisionWorld::computeOverlappingPairs()
+	BT_PROFILE("calculateOverlappingPairs");
+	m_broadphasePairCache->calculateOverlappingPairs(m_dispatcher1);
 void	btCollisionWorld::performDiscreteCollisionDetection()
@@ -216,11 +219,7 @@ void	btCollisionWorld::performDiscreteCollisionDetection()
-	{
-		BT_PROFILE("calculateOverlappingPairs");
-		m_broadphasePairCache->calculateOverlappingPairs(m_dispatcher1);
-	}
+	computeOverlappingPairs();
 	btDispatcher* dispatcher = getDispatcher();
@@ -260,16 +259,25 @@ void	btCollisionWorld::removeCollisionObject(btCollisionObject* collisionObject)
 void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTransform& rayToTrans,
 										btCollisionObject* collisionObject,
 										const btCollisionShape* collisionShape,
 										const btTransform& colObjWorldTransform,
 										RayResultCallback& resultCallback)
+	btCollisionObjectWrapper colObWrap(0,collisionShape,collisionObject,colObjWorldTransform,-1,-1);
+	btCollisionWorld::rayTestSingleInternal(rayFromTrans,rayToTrans,&colObWrap,resultCallback);
+void	btCollisionWorld::rayTestSingleInternal(const btTransform& rayFromTrans,const btTransform& rayToTrans,
+										const btCollisionObjectWrapper* collisionObjectWrap,
+										RayResultCallback& resultCallback)
 	btSphereShape pointShape(btScalar(0.0));
 	const btConvexShape* castShape = &pointShape;
+	const btCollisionShape* collisionShape = collisionObjectWrap->getCollisionShape();
+	const btTransform& colObjWorldTransform = collisionObjectWrap->getWorldTransform();
 	if (collisionShape->isConvex())
@@ -279,13 +287,20 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 		btConvexShape* convexShape = (btConvexShape*) collisionShape;
 		btVoronoiSimplexSolver	simplexSolver;
-		btSubsimplexConvexCast convexCaster(castShape,convexShape,&simplexSolver);
-		//btGjkConvexCast	convexCaster(castShape,convexShape,&simplexSolver);
+		btSubsimplexConvexCast subSimplexConvexCaster(castShape,convexShape,&simplexSolver);
+		btGjkConvexCast	gjkConvexCaster(castShape,convexShape,&simplexSolver);
 		//btContinuousConvexCollision convexCaster(castShape,convexShape,&simplexSolver,0);
+		btConvexCast* convexCasterPtr = 0;
+		//use the sub-simplex convex caster by default; the GJK caster is used only when kF_UseGjkConvexCastRaytest is set
+		if (resultCallback.m_flags & btTriangleRaycastCallback::kF_UseGjkConvexCastRaytest)
+			convexCasterPtr = &gjkConvexCaster;
+		else
+			convexCasterPtr = &subSimplexConvexCaster;
+		btConvexCast& convexCaster = *convexCasterPtr;
 		if (convexCaster.calcTimeOfImpact(rayFromTrans,rayToTrans,colObjWorldTransform,colObjWorldTransform,castResult))
@@ -294,6 +309,7 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 				if (castResult.m_fraction < resultCallback.m_closestHitFraction)
+					//todo: figure out what this is about. When is rayFromTrans.getBasis() not identity?
 					//rotate normal into worldspace
 					castResult.m_normal = rayFromTrans.getBasis() * castResult.m_normal;
@@ -302,7 +318,7 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 					btCollisionWorld::LocalRayResult localRayResult
-						collisionObject,
+						collisionObjectWrap->getCollisionObject(),
@@ -317,34 +333,26 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 	} else {
 		if (collisionShape->isConcave())
-			//			BT_PROFILE("rayTestConcave");
-			if (collisionShape->getShapeType()==TRIANGLE_MESH_SHAPE_PROXYTYPE)
-			{
-				///optimized version for btBvhTriangleMeshShape
-				btBvhTriangleMeshShape* triangleMesh = (btBvhTriangleMeshShape*)collisionShape;
-				btTransform worldTocollisionObject = colObjWorldTransform.inverse();
-				btVector3 rayFromLocal = worldTocollisionObject * rayFromTrans.getOrigin();
-				btVector3 rayToLocal = worldTocollisionObject * rayToTrans.getOrigin();
-				//ConvexCast::CastResult
+			//ConvexCast::CastResult
 				struct BridgeTriangleRaycastCallback : public btTriangleRaycastCallback
 					btCollisionWorld::RayResultCallback* m_resultCallback;
-					btCollisionObject*	m_collisionObject;
-					btTriangleMeshShape*	m_triangleMesh;
+					const btCollisionObject*	m_collisionObject;
+					const btConcaveShape*	m_triangleMesh;
 					btTransform m_colObjWorldTransform;
 					BridgeTriangleRaycastCallback( const btVector3& from,const btVector3& to,
-						btCollisionWorld::RayResultCallback* resultCallback, btCollisionObject* collisionObject,btTriangleMeshShape*	triangleMesh,const btTransform& colObjWorldTransform):
-					//@BP Mod
-					btTriangleRaycastCallback(from,to, resultCallback->m_flags),
-						m_resultCallback(resultCallback),
-						m_collisionObject(collisionObject),
-						m_triangleMesh(triangleMesh),
-						m_colObjWorldTransform(colObjWorldTransform)
-					{
-					}
+					btCollisionWorld::RayResultCallback* resultCallback, const btCollisionObject* collisionObject,const btConcaveShape*	triangleMesh,const btTransform& colObjWorldTransform):
+						//@BP Mod
+						btTriangleRaycastCallback(from,to, resultCallback->m_flags),
+							m_resultCallback(resultCallback),
+							m_collisionObject(collisionObject),
+							m_triangleMesh(triangleMesh),
+							m_colObjWorldTransform(colObjWorldTransform)
+						{
+						}
 					virtual btScalar reportHit(const btVector3& hitNormalLocal, btScalar hitFraction, int partId, int triangleIndex )
@@ -367,10 +375,21 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
-				BridgeTriangleRaycastCallback rcb(rayFromLocal,rayToLocal,&resultCallback,collisionObject,triangleMesh,colObjWorldTransform);
+			btTransform worldTocollisionObject = colObjWorldTransform.inverse();
+			btVector3 rayFromLocal = worldTocollisionObject * rayFromTrans.getOrigin();
+			btVector3 rayToLocal = worldTocollisionObject * rayToTrans.getOrigin();
+			//			BT_PROFILE("rayTestConcave");
+			if (collisionShape->getShapeType()==TRIANGLE_MESH_SHAPE_PROXYTYPE)
+			{
+				///optimized version for btBvhTriangleMeshShape
+				btBvhTriangleMeshShape* triangleMesh = (btBvhTriangleMeshShape*)collisionShape;
+				BridgeTriangleRaycastCallback rcb(rayFromLocal,rayToLocal,&resultCallback,collisionObjectWrap->getCollisionObject(),triangleMesh,colObjWorldTransform);
 				rcb.m_hitFraction = resultCallback.m_closestHitFraction;
-			} else
+			}
+			else
 				//generic (slower) case
 				btConcaveShape* concaveShape = (btConcaveShape*)collisionShape;
@@ -385,13 +404,13 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 				struct BridgeTriangleRaycastCallback : public btTriangleRaycastCallback
 					btCollisionWorld::RayResultCallback* m_resultCallback;
-					btCollisionObject*	m_collisionObject;
+					const btCollisionObject*	m_collisionObject;
 					btConcaveShape*	m_triangleMesh;
 					btTransform m_colObjWorldTransform;
 					BridgeTriangleRaycastCallback( const btVector3& from,const btVector3& to,
-						btCollisionWorld::RayResultCallback* resultCallback, btCollisionObject* collisionObject,btConcaveShape*	triangleMesh, const btTransform& colObjWorldTransform):
+						btCollisionWorld::RayResultCallback* resultCallback, const btCollisionObject* collisionObject,btConcaveShape*	triangleMesh, const btTransform& colObjWorldTransform):
 					//@BP Mod
 					btTriangleRaycastCallback(from,to, resultCallback->m_flags),
@@ -423,7 +442,7 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
-				BridgeTriangleRaycastCallback	rcb(rayFromLocal,rayToLocal,&resultCallback,collisionObject,concaveShape, colObjWorldTransform);
+				BridgeTriangleRaycastCallback	rcb(rayFromLocal,rayToLocal,&resultCallback,collisionObjectWrap->getCollisionObject(),concaveShape, colObjWorldTransform);
 				rcb.m_hitFraction = resultCallback.m_closestHitFraction;
 				btVector3 rayAabbMinLocal = rayFromLocal;
@@ -446,6 +465,7 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 						: m_userCallback(user), m_i(i)
 						m_closestHitFraction = m_userCallback->m_closestHitFraction;
+						m_flags = m_userCallback->m_flags;
 					virtual bool needsCollision(btBroadphaseProxy* p) const
@@ -468,14 +488,14 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 				struct RayTester : btDbvt::ICollide
-					btCollisionObject* m_collisionObject;
+					const btCollisionObject* m_collisionObject;
 					const btCompoundShape* m_compoundShape;
 					const btTransform& m_colObjWorldTransform;
 					const btTransform& m_rayFromTrans;
 					const btTransform& m_rayToTrans;
 					RayResultCallback& m_resultCallback;
-					RayTester(btCollisionObject* collisionObject,
+					RayTester(const btCollisionObject* collisionObject,
 							const btCompoundShape* compoundShape,
 							const btTransform& colObjWorldTransform,
 							const btTransform& rayFromTrans,
@@ -491,33 +511,30 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
-					void Process(int i)
+					void ProcessLeaf(int i)
 						const btCollisionShape* childCollisionShape = m_compoundShape->getChildShape(i);
 						const btTransform& childTrans = m_compoundShape->getChildTransform(i);
 						btTransform childWorldTrans = m_colObjWorldTransform * childTrans;
+						btCollisionObjectWrapper tmpOb(0,childCollisionShape,m_collisionObject,childWorldTrans,-1,i);
 						// replace collision shape so that callback can determine the triangle
-						btCollisionShape* saveCollisionShape = m_collisionObject->getCollisionShape();
-						m_collisionObject->internalSetTemporaryCollisionShape((btCollisionShape*)childCollisionShape);
 						LocalInfoAdder2 my_cb(i, &m_resultCallback);
-						rayTestSingle(
+						rayTestSingleInternal(
-							m_collisionObject,
-							childCollisionShape,
-							childWorldTrans,
+							&tmpOb,
-						// restore
-						m_collisionObject->internalSetTemporaryCollisionShape(saveCollisionShape);
 					void Process(const btDbvtNode* leaf)
-						Process(leaf->dataAsInt);
+						ProcessLeaf(leaf->dataAsInt);
@@ -526,7 +543,7 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 				RayTester rayCB(
-					collisionObject,
+					collisionObjectWrap->getCollisionObject(),
@@ -544,7 +561,7 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 					for (int i = 0, n = compoundShape->getNumChildShapes(); i < n; ++i)
-						rayCB.Process(i);
+						rayCB.ProcessLeaf(i);
@@ -558,6 +575,17 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 											const btTransform& colObjWorldTransform,
 											ConvexResultCallback& resultCallback, btScalar allowedPenetration)
+	btCollisionObjectWrapper tmpOb(0,collisionShape,collisionObject,colObjWorldTransform,-1,-1);
+	btCollisionWorld::objectQuerySingleInternal(castShape,convexFromTrans,convexToTrans,&tmpOb,resultCallback,allowedPenetration);
+void	btCollisionWorld::objectQuerySingleInternal(const btConvexShape* castShape,const btTransform& convexFromTrans,const btTransform& convexToTrans,
+											const btCollisionObjectWrapper* colObjWrap,
+											ConvexResultCallback& resultCallback, btScalar allowedPenetration)
+	const btCollisionShape* collisionShape = colObjWrap->getCollisionShape();
+	const btTransform& colObjWorldTransform = colObjWrap->getWorldTransform();
 	if (collisionShape->isConvex())
@@ -587,7 +615,7 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 					btCollisionWorld::LocalConvexResult localConvexResult
-						collisionObject,
+						colObjWrap->getCollisionObject(),
@@ -617,11 +645,11 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 				struct BridgeTriangleConvexcastCallback : public btTriangleConvexcastCallback
 					btCollisionWorld::ConvexResultCallback* m_resultCallback;
-					btCollisionObject*	m_collisionObject;
+					const btCollisionObject*	m_collisionObject;
 					btTriangleMeshShape*	m_triangleMesh;
 					BridgeTriangleConvexcastCallback(const btConvexShape* castShape, const btTransform& from,const btTransform& to,
-						btCollisionWorld::ConvexResultCallback* resultCallback, btCollisionObject* collisionObject,btTriangleMeshShape*	triangleMesh, const btTransform& triangleToWorld):
+						btCollisionWorld::ConvexResultCallback* resultCallback, const btCollisionObject* collisionObject,btTriangleMeshShape*	triangleMesh, const btTransform& triangleToWorld):
 					btTriangleConvexcastCallback(castShape, from,to, triangleToWorld, triangleMesh->getMargin()),
@@ -655,7 +683,7 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
-				BridgeTriangleConvexcastCallback tccb(castShape, convexFromTrans,convexToTrans,&resultCallback,collisionObject,triangleMesh, colObjWorldTransform);
+				BridgeTriangleConvexcastCallback tccb(castShape, convexFromTrans,convexToTrans,&resultCallback,colObjWrap->getCollisionObject(),triangleMesh, colObjWorldTransform);
 				tccb.m_hitFraction = resultCallback.m_closestHitFraction;
 				tccb.m_allowedPenetration = allowedPenetration;
 				btVector3 boxMinLocal, boxMaxLocal;
@@ -682,7 +710,7 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 								btCollisionWorld::LocalConvexResult localConvexResult
-									collisionObject,
+									colObjWrap->getCollisionObject(),
@@ -709,11 +737,11 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 					struct BridgeTriangleConvexcastCallback : public btTriangleConvexcastCallback
 						btCollisionWorld::ConvexResultCallback* m_resultCallback;
-						btCollisionObject*	m_collisionObject;
+						const btCollisionObject*	m_collisionObject;
 						btConcaveShape*	m_triangleMesh;
 						BridgeTriangleConvexcastCallback(const btConvexShape* castShape, const btTransform& from,const btTransform& to,
-							btCollisionWorld::ConvexResultCallback* resultCallback, btCollisionObject* collisionObject,btConcaveShape*	triangleMesh, const btTransform& triangleToWorld):
+							btCollisionWorld::ConvexResultCallback* resultCallback, const btCollisionObject* collisionObject,btConcaveShape*	triangleMesh, const btTransform& triangleToWorld):
 						btTriangleConvexcastCallback(castShape, from,to, triangleToWorld, triangleMesh->getMargin()),
@@ -737,7 +765,7 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
-								bool	normalInWorldSpace = false;
+								bool	normalInWorldSpace = true;
 								return m_resultCallback->addSingleResult(convexResult,normalInWorldSpace);
@@ -746,7 +774,7 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
-					BridgeTriangleConvexcastCallback tccb(castShape, convexFromTrans,convexToTrans,&resultCallback,collisionObject,concaveShape, colObjWorldTransform);
+					BridgeTriangleConvexcastCallback tccb(castShape, convexFromTrans,convexToTrans,&resultCallback,colObjWrap->getCollisionObject(),concaveShape, colObjWorldTransform);
 					tccb.m_hitFraction = resultCallback.m_closestHitFraction;
 					tccb.m_allowedPenetration = allowedPenetration;
 					btVector3 boxMinLocal, boxMaxLocal;
@@ -762,25 +790,50 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 		} else {
-			///@todo : use AABB tree or other BVH acceleration structure!
 			if (collisionShape->isCompound())
-				BT_PROFILE("convexSweepCompound");
-				const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(collisionShape);
-				int i=0;
-				for (i=0;i<compoundShape->getNumChildShapes();i++)
+				struct	btCompoundLeafCallback : btDbvt::ICollide
-					btTransform childTrans = compoundShape->getChildTransform(i);
-					const btCollisionShape* childCollisionShape = compoundShape->getChildShape(i);
-					btTransform childWorldTrans = colObjWorldTransform * childTrans;
-					// replace collision shape so that callback can determine the triangle
-					btCollisionShape* saveCollisionShape = collisionObject->getCollisionShape();
-					collisionObject->internalSetTemporaryCollisionShape((btCollisionShape*)childCollisionShape);
-                    struct	LocalInfoAdder : public ConvexResultCallback {
-                            ConvexResultCallback* m_userCallback;
+					btCompoundLeafCallback(
+										   const btCollisionObjectWrapper* colObjWrap,
+										   const btConvexShape* castShape,
+										   const btTransform& convexFromTrans,
+										   const btTransform& convexToTrans,
+										   btScalar allowedPenetration,
+										   const btCompoundShape* compoundShape,
+										   const btTransform& colObjWorldTransform,
+										   ConvexResultCallback& resultCallback)
+					: 
+					  m_colObjWrap(colObjWrap),
+						m_castShape(castShape),
+						m_convexFromTrans(convexFromTrans),
+						m_convexToTrans(convexToTrans),
+						m_allowedPenetration(allowedPenetration),
+						m_compoundShape(compoundShape),
+						m_colObjWorldTransform(colObjWorldTransform),
+						m_resultCallback(resultCallback) {
+					}
+				  const btCollisionObjectWrapper* m_colObjWrap;
+					const btConvexShape* m_castShape;
+					const btTransform& m_convexFromTrans;
+					const btTransform& m_convexToTrans;
+					btScalar m_allowedPenetration;
+					const btCompoundShape* m_compoundShape;
+					const btTransform& m_colObjWorldTransform;
+					ConvexResultCallback& m_resultCallback;
+				public:
+					void		ProcessChild(int index, const btTransform& childTrans, const btCollisionShape* childCollisionShape)
+					{
+						btTransform childWorldTrans = m_colObjWorldTransform * childTrans;
+						struct	LocalInfoAdder : public ConvexResultCallback {
+							ConvexResultCallback* m_userCallback;
 							int m_i;
-                            LocalInfoAdder (int i, ConvexResultCallback *user)
+							LocalInfoAdder(int i, ConvexResultCallback *user)
 								: m_userCallback(user), m_i(i)
 								m_closestHitFraction = m_userCallback->m_closestHitFraction;
@@ -789,30 +842,66 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 								return m_userCallback->needsCollision(p);
-                            virtual btScalar addSingleResult (btCollisionWorld::LocalConvexResult&	r,	bool b)
-                            {
-                                    btCollisionWorld::LocalShapeInfo	shapeInfo;
-                                    shapeInfo.m_shapePart = -1;
-                                    shapeInfo.m_triangleIndex = m_i;
-                                    if (r.m_localShapeInfo == NULL)
-                                        r.m_localShapeInfo = &shapeInfo;
-									const btScalar result = m_userCallback->addSingleResult(r, b);
-									m_closestHitFraction = m_userCallback->m_closestHitFraction;
-									return result;
-                            }
-                    };
+							virtual btScalar addSingleResult(btCollisionWorld::LocalConvexResult&	r, bool b)
+							{
+								btCollisionWorld::LocalShapeInfo	shapeInfo;
+								shapeInfo.m_shapePart = -1;
+								shapeInfo.m_triangleIndex = m_i;
+								if (r.m_localShapeInfo == NULL)
+									r.m_localShapeInfo = &shapeInfo;
+								const btScalar result = m_userCallback->addSingleResult(r, b);
+								m_closestHitFraction = m_userCallback->m_closestHitFraction;
+								return result;
-                    LocalInfoAdder my_cb(i, &resultCallback);
+							}
+						};
+						LocalInfoAdder my_cb(index, &m_resultCallback);
+						btCollisionObjectWrapper tmpObj(m_colObjWrap, childCollisionShape, m_colObjWrap->getCollisionObject(), childWorldTrans, -1, index);
+						objectQuerySingleInternal(m_castShape, m_convexFromTrans, m_convexToTrans, &tmpObj, my_cb, m_allowedPenetration);
+					}
-					objectQuerySingle(castShape, convexFromTrans,convexToTrans,
-						collisionObject,
-						childCollisionShape,
-						childWorldTrans,
-						my_cb, allowedPenetration);
-					// restore
-					collisionObject->internalSetTemporaryCollisionShape(saveCollisionShape);
+					void		Process(const btDbvtNode* leaf)
+					{
+						// Processing leaf node
+						int index = leaf->dataAsInt;
+						btTransform childTrans = m_compoundShape->getChildTransform(index);
+						const btCollisionShape* childCollisionShape = m_compoundShape->getChildShape(index);
+						ProcessChild(index, childTrans, childCollisionShape);
+					}
+				};
+				BT_PROFILE("convexSweepCompound");
+				const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(collisionShape);
+				btVector3 fromLocalAabbMin, fromLocalAabbMax;
+				btVector3 toLocalAabbMin, toLocalAabbMax;
+				castShape->getAabb(colObjWorldTransform.inverse() * convexFromTrans, fromLocalAabbMin, fromLocalAabbMax);
+				castShape->getAabb(colObjWorldTransform.inverse() * convexToTrans, toLocalAabbMin, toLocalAabbMax);
+				fromLocalAabbMin.setMin(toLocalAabbMin);
+				fromLocalAabbMax.setMax(toLocalAabbMax);
+				btCompoundLeafCallback callback(colObjWrap, castShape, convexFromTrans, convexToTrans,
+					  allowedPenetration, compoundShape, colObjWorldTransform, resultCallback);
+				const btDbvt* tree = compoundShape->getDynamicAabbTree();
+				if (tree) {
+					const ATTRIBUTE_ALIGNED16(btDbvtVolume)	bounds = btDbvtVolume::FromMM(fromLocalAabbMin, fromLocalAabbMax);
+					tree->collideTV(tree->m_root, bounds, callback);
+				} else {
+					int i;
+					for (i=0;i<compoundShape->getNumChildShapes();i++)
+					{
+						const btCollisionShape* childCollisionShape = compoundShape->getChildShape(i);
+						btTransform childTrans = compoundShape->getChildTransform(i);
+						callback.ProcessChild(i, childTrans, childCollisionShape);
+					}
@@ -993,13 +1082,13 @@ void	btCollisionWorld::convexSweepTest(const btConvexShape* castShape, const btT
 	/* Compute AABB that encompasses angular movement */
 		btVector3 linVel, angVel;
-		btTransformUtil::calculateVelocity (convexFromTrans, convexToTrans, 1.0, linVel, angVel);
+		btTransformUtil::calculateVelocity (convexFromTrans, convexToTrans, 1.0f, linVel, angVel);
 		btVector3 zeroLinVel;
 		btTransform R;
 		R.setIdentity ();
 		R.setRotation (convexFromTrans.getRotation());
-		castShape->calculateTemporalAabb (R, zeroLinVel, angVel, 1.0, castShapeAabbMin, castShapeAabbMax);
+		castShape->calculateTemporalAabb (R, zeroLinVel, angVel, 1.0f, castShapeAabbMin, castShapeAabbMax);
@@ -1044,26 +1133,26 @@ struct btBridgedManifoldResult : public btManifoldResult
 	btCollisionWorld::ContactResultCallback&	m_resultCallback;
-	btBridgedManifoldResult( btCollisionObject* obj0,btCollisionObject* obj1,btCollisionWorld::ContactResultCallback& resultCallback )
-		:btManifoldResult(obj0,obj1),
+	btBridgedManifoldResult( const btCollisionObjectWrapper* obj0Wrap,const btCollisionObjectWrapper* obj1Wrap,btCollisionWorld::ContactResultCallback& resultCallback )
+		:btManifoldResult(obj0Wrap,obj1Wrap),
 	virtual void addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth)
-		bool isSwapped = m_manifoldPtr->getBody0() != m_body0;
+		bool isSwapped = m_manifoldPtr->getBody0() != m_body0Wrap->getCollisionObject();
 		btVector3 pointA = pointInWorld + normalOnBInWorld * depth;
 		btVector3 localA;
 		btVector3 localB;
 		if (isSwapped)
-			localA = m_rootTransB.invXform(pointA );
-			localB = m_rootTransA.invXform(pointInWorld);
+			localA = m_body1Wrap->getCollisionObject()->getWorldTransform().invXform(pointA );
+			localB = m_body0Wrap->getCollisionObject()->getWorldTransform().invXform(pointInWorld);
 		} else
-			localA = m_rootTransA.invXform(pointA );
-			localB = m_rootTransB.invXform(pointInWorld);
+			localA = m_body0Wrap->getCollisionObject()->getWorldTransform().invXform(pointA );
+			localB = m_body1Wrap->getCollisionObject()->getWorldTransform().invXform(pointInWorld);
 		btManifoldPoint newPt(localA,localB,normalOnBInWorld,depth);
@@ -1086,9 +1175,9 @@ struct btBridgedManifoldResult : public btManifoldResult
 		//experimental feature info, for per-triangle material etc.
-		btCollisionObject* obj0 = isSwapped? m_body1 : m_body0;
-		btCollisionObject* obj1 = isSwapped? m_body0 : m_body1;
-		m_resultCallback.addSingleResult(newPt,obj0,newPt.m_partId0,newPt.m_index0,obj1,newPt.m_partId1,newPt.m_index1);
+		const btCollisionObjectWrapper* obj0Wrap = isSwapped? m_body1Wrap : m_body0Wrap;
+		const btCollisionObjectWrapper* obj1Wrap = isSwapped? m_body0Wrap : m_body1Wrap;
+		m_resultCallback.addSingleResult(newPt,obj0Wrap,newPt.m_partId0,newPt.m_index0,obj1Wrap,newPt.m_partId1,newPt.m_index1);
@@ -1120,12 +1209,16 @@ struct btSingleContactCallback : public btBroadphaseAabbCallback
 		//only perform raycast if filterMask matches
-			btCollisionAlgorithm* algorithm = m_world->getDispatcher()->findAlgorithm(m_collisionObject,collisionObject);
+			btCollisionObjectWrapper ob0(0,m_collisionObject->getCollisionShape(),m_collisionObject,m_collisionObject->getWorldTransform(),-1,-1);
+			btCollisionObjectWrapper ob1(0,collisionObject->getCollisionShape(),collisionObject,collisionObject->getWorldTransform(),-1,-1);
+			btCollisionAlgorithm* algorithm = m_world->getDispatcher()->findAlgorithm(&ob0,&ob1);
 			if (algorithm)
-				btBridgedManifoldResult contactPointResult(m_collisionObject,collisionObject, m_resultCallback);
+				btBridgedManifoldResult contactPointResult(&ob0,&ob1, m_resultCallback);
 				//discrete collision detection query
-				algorithm->processCollision(m_collisionObject,collisionObject, m_world->getDispatchInfo(),&contactPointResult);
+				algorithm->processCollision(&ob0,&ob1, m_world->getDispatchInfo(),&contactPointResult);
@@ -1152,12 +1245,15 @@ void	btCollisionWorld::contactTest( btCollisionObject* colObj, ContactResultCall
 ///it reports one or more contact points (including the one with deepest penetration)
 void	btCollisionWorld::contactPairTest(btCollisionObject* colObjA, btCollisionObject* colObjB, ContactResultCallback& resultCallback)
-	btCollisionAlgorithm* algorithm = getDispatcher()->findAlgorithm(colObjA,colObjB);
+	btCollisionObjectWrapper obA(0,colObjA->getCollisionShape(),colObjA,colObjA->getWorldTransform(),-1,-1);
+	btCollisionObjectWrapper obB(0,colObjB->getCollisionShape(),colObjB,colObjB->getWorldTransform(),-1,-1);
+	btCollisionAlgorithm* algorithm = getDispatcher()->findAlgorithm(&obA,&obB);
 	if (algorithm)
-		btBridgedManifoldResult contactPointResult(colObjA,colObjB, resultCallback);
+		btBridgedManifoldResult contactPointResult(&obA,&obB, resultCallback);
 		//discrete collision detection query
-		algorithm->processCollision(colObjA,colObjB, getDispatchInfo(),&contactPointResult);
+		algorithm->processCollision(&obA,&obB, getDispatchInfo(),&contactPointResult);
@@ -1216,7 +1312,10 @@ public:
 void btCollisionWorld::debugDrawObject(const btTransform& worldTransform, const btCollisionShape* shape, const btVector3& color)
 	// Draw a small simplex at the center of the object
-	getDebugDrawer()->drawTransform(worldTransform,1);
+	if (getDebugDrawer() && getDebugDrawer()->getDebugMode() & btIDebugDraw::DBG_DrawFrames)
+	{
+		getDebugDrawer()->drawTransform(worldTransform,1);
+	}
 	if (shape->getShapeType() == COMPOUND_SHAPE_PROXYTYPE)
@@ -1231,245 +1330,254 @@ void btCollisionWorld::debugDrawObject(const btTransform& worldTransform, const
 	} else
-		/// for polyhedral shapes
-		if (shape->isPolyhedral())
-		{
-			btPolyhedralConvexShape* polyshape = (btPolyhedralConvexShape*) shape;
-			int i;
-			if (polyshape->getConvexPolyhedron())
-			{
-				const btConvexPolyhedron* poly = polyshape->getConvexPolyhedron();
-				for (i=0;i<poly->m_faces.size();i++)
-				{
-					btVector3 centroid(0,0,0);
-					int numVerts = poly->m_faces[i].m_indices.size();
-					if (numVerts)
-					{
-						int lastV = poly->m_faces[i].m_indices[numVerts-1];
-						for (int v=0;v<poly->m_faces[i].m_indices.size();v++)
-						{
-							int curVert = poly->m_faces[i].m_indices[v];
-							centroid+=poly->m_vertices[curVert];
-							getDebugDrawer()->drawLine(worldTransform*poly->m_vertices[lastV],worldTransform*poly->m_vertices[curVert],color);
-							lastV = curVert;
-						}
-					}
-					centroid*= btScalar(1.f)/btScalar(numVerts);
-                    if (getDebugDrawer()->getDebugMode() & btIDebugDraw::DBG_DrawNormals)
+        switch (shape->getShapeType())
+        {
+        case BOX_SHAPE_PROXYTYPE:
+            {
+                const btBoxShape* boxShape = static_cast<const btBoxShape*>(shape);
+                btVector3 halfExtents = boxShape->getHalfExtentsWithMargin();
+                getDebugDrawer()->drawBox(-halfExtents,halfExtents,worldTransform,color);
+                break;
+            }
+            {
+                const btSphereShape* sphereShape = static_cast<const btSphereShape*>(shape);
+                btScalar radius = sphereShape->getMargin();//radius doesn't include the margin, so draw with margin
+                getDebugDrawer()->drawSphere(radius, worldTransform, color);
+                break;
+            }
+            {
+                const btMultiSphereShape* multiSphereShape = static_cast<const btMultiSphereShape*>(shape);
+                btTransform childTransform;
+                childTransform.setIdentity();
+                for (int i = multiSphereShape->getSphereCount()-1; i>=0;i--)
+                {
+                    childTransform.setOrigin(multiSphereShape->getSpherePosition(i));
+                    getDebugDrawer()->drawSphere(multiSphereShape->getSphereRadius(i), worldTransform*childTransform, color);
+                }
+                break;
+            }
+            {
+                const btCapsuleShape* capsuleShape = static_cast<const btCapsuleShape*>(shape);
+                btScalar radius = capsuleShape->getRadius();
+                btScalar halfHeight = capsuleShape->getHalfHeight();
+                int upAxis = capsuleShape->getUpAxis();
+                getDebugDrawer()->drawCapsule(radius, halfHeight, upAxis, worldTransform, color);
+                break;
+            }
+            {
+                const btConeShape* coneShape = static_cast<const btConeShape*>(shape);
+                btScalar radius = coneShape->getRadius();//+coneShape->getMargin();
+                btScalar height = coneShape->getHeight();//+coneShape->getMargin();
+                int upAxis= coneShape->getConeUpIndex();
+                getDebugDrawer()->drawCone(radius, height, upAxis, worldTransform, color);
+                break;
+            }
+            {
+                const btCylinderShape* cylinder = static_cast<const btCylinderShape*>(shape);
+                int upAxis = cylinder->getUpAxis();
+                btScalar radius = cylinder->getRadius();
+                btScalar halfHeight = cylinder->getHalfExtentsWithMargin()[upAxis];
+                getDebugDrawer()->drawCylinder(radius, halfHeight, upAxis, worldTransform, color);
+                break;
+            }
+            {
+                const btStaticPlaneShape* staticPlaneShape = static_cast<const btStaticPlaneShape*>(shape);
+                btScalar planeConst = staticPlaneShape->getPlaneConstant();
+                const btVector3& planeNormal = staticPlaneShape->getPlaneNormal();
+                getDebugDrawer()->drawPlane(planeNormal, planeConst,worldTransform, color);
+                break;
+            }
+        default:
+            {
+                /// for polyhedral shapes
+                if (shape->isPolyhedral())
+                {
+                    btPolyhedralConvexShape* polyshape = (btPolyhedralConvexShape*) shape;
+                    int i;
+                    if (polyshape->getConvexPolyhedron())
+                    {
+                        const btConvexPolyhedron* poly = polyshape->getConvexPolyhedron();
+                        for (i=0;i<poly->m_faces.size();i++)
+                        {
+                            btVector3 centroid(0,0,0);
+                            int numVerts = poly->m_faces[i].m_indices.size();
+                            if (numVerts)
+                            {
+                                int lastV = poly->m_faces[i].m_indices[numVerts-1];
+                                for (int v=0;v<poly->m_faces[i].m_indices.size();v++)
+                                {
+                                    int curVert = poly->m_faces[i].m_indices[v];
+                                    centroid+=poly->m_vertices[curVert];
+                                    getDebugDrawer()->drawLine(worldTransform*poly->m_vertices[lastV],worldTransform*poly->m_vertices[curVert],color);
+                                    lastV = curVert;
+                                }
+                            }
+                            centroid*= btScalar(1.f)/btScalar(numVerts);
+                            if (getDebugDrawer()->getDebugMode() & btIDebugDraw::DBG_DrawNormals)
+                            {
+                                btVector3 normalColor(1,1,0);
+                                btVector3 faceNormal(poly->m_faces[i].m_plane[0],poly->m_faces[i].m_plane[1],poly->m_faces[i].m_plane[2]);
+                                getDebugDrawer()->drawLine(worldTransform*centroid,worldTransform*(centroid+faceNormal),normalColor);
+                            }
+                        }
+                    } else
-					  btVector3 normalColor(1,1,0);
-					  btVector3 faceNormal(poly->m_faces[i].m_plane[0],poly->m_faces[i].m_plane[1],poly->m_faces[i].m_plane[2]);
-					  getDebugDrawer()->drawLine(worldTransform*centroid,worldTransform*(centroid+faceNormal),normalColor);
+                        for (i=0;i<polyshape->getNumEdges();i++)
+                        {
+                            btVector3 a,b;
+                            polyshape->getEdge(i,a,b);
+                            btVector3 wa = worldTransform * a;
+                            btVector3 wb = worldTransform * b;
+                            getDebugDrawer()->drawLine(wa,wb,color);
+                        }
-				}
+                }
+                if (shape->isConcave())
+                {
+                    btConcaveShape* concaveMesh = (btConcaveShape*) shape;
+                    ///@todo pass camera, for some culling? no -> we are not a graphics lib
+                    btVector3 aabbMax(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT));
+                    btVector3 aabbMin(btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT));
+                    DebugDrawcallback drawCallback(getDebugDrawer(),worldTransform,color);
+                    concaveMesh->processAllTriangles(&drawCallback,aabbMin,aabbMax);
+                }
+                if (shape->getShapeType() == CONVEX_TRIANGLEMESH_SHAPE_PROXYTYPE)
+                {
+                    btConvexTriangleMeshShape* convexMesh = (btConvexTriangleMeshShape*) shape;
+                    //todo: pass camera for some culling			
+                    btVector3 aabbMax(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT));
+                    btVector3 aabbMin(btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT));
+                    //DebugDrawcallback drawCallback;
+                    DebugDrawcallback drawCallback(getDebugDrawer(),worldTransform,color);
+                    convexMesh->getMeshInterface()->InternalProcessAllTriangles(&drawCallback,aabbMin,aabbMax);
+                }
+            }
+		}
+	}
-			} else
-			{
-				for (i=0;i<polyshape->getNumEdges();i++)
-				{
-					btVector3 a,b;
-					polyshape->getEdge(i,a,b);
-					btVector3 wa = worldTransform * a;
-					btVector3 wb = worldTransform * b;
-					getDebugDrawer()->drawLine(wa,wb,color);
-				}
-			}
+void	btCollisionWorld::debugDrawWorld()
+	if (getDebugDrawer())
+	{
+		btIDebugDraw::DefaultColors defaultColors = getDebugDrawer()->getDefaultColors();
-		}
-		else
+		if ( getDebugDrawer()->getDebugMode() & btIDebugDraw::DBG_DrawContactPoints)
-			switch (shape->getShapeType())
-			{
-				{
-					const btBoxShape* boxShape = static_cast<const btBoxShape*>(shape);
-					btVector3 halfExtents = boxShape->getHalfExtentsWithMargin();
-					getDebugDrawer()->drawBox(-halfExtents,halfExtents,worldTransform,color);
-					break;
-				}
-				{
-					const btSphereShape* sphereShape = static_cast<const btSphereShape*>(shape);
-					btScalar radius = sphereShape->getMargin();//radius doesn't include the margin, so draw with margin
-					getDebugDrawer()->drawSphere(radius, worldTransform, color);
-					break;
-				}
+			if (getDispatcher())
+			{
+				int numManifolds = getDispatcher()->getNumManifolds();
+				for (int i=0;i<numManifolds;i++)
-					const btMultiSphereShape* multiSphereShape = static_cast<const btMultiSphereShape*>(shape);
-					btTransform childTransform;
-					childTransform.setIdentity();
+					btPersistentManifold* contactManifold = getDispatcher()->getManifoldByIndexInternal(i);
+					//btCollisionObject* obA = static_cast<btCollisionObject*>(contactManifold->getBody0());
+					//btCollisionObject* obB = static_cast<btCollisionObject*>(contactManifold->getBody1());
-					for (int i = multiSphereShape->getSphereCount()-1; i>=0;i--)
+					int numContacts = contactManifold->getNumContacts();
+					for (int j=0;j<numContacts;j++)
-						childTransform.setOrigin(multiSphereShape->getSpherePosition(i));
-						getDebugDrawer()->drawSphere(multiSphereShape->getSphereRadius(i), worldTransform*childTransform, color);
+						btManifoldPoint& cp = contactManifold->getContactPoint(j);
+						getDebugDrawer()->drawContactPoint(cp.m_positionWorldOnB,cp.m_normalWorldOnB,cp.getDistance(),cp.getLifeTime(),defaultColors.m_contactPoint);
-					break;
-				}
-				{
-					const btCapsuleShape* capsuleShape = static_cast<const btCapsuleShape*>(shape);
-					btScalar radius = capsuleShape->getRadius();
-					btScalar halfHeight = capsuleShape->getHalfHeight();
-					int upAxis = capsuleShape->getUpAxis();
-					getDebugDrawer()->drawCapsule(radius, halfHeight, upAxis, worldTransform, color);
-					break;
-				}
-				{
-					const btConeShape* coneShape = static_cast<const btConeShape*>(shape);
-					btScalar radius = coneShape->getRadius();//+coneShape->getMargin();
-					btScalar height = coneShape->getHeight();//+coneShape->getMargin();
-					int upAxis= coneShape->getConeUpIndex();
-					getDebugDrawer()->drawCone(radius, height, upAxis, worldTransform, color);
-					break;
-				}
-				{
-					const btCylinderShape* cylinder = static_cast<const btCylinderShape*>(shape);
-					int upAxis = cylinder->getUpAxis();
-					btScalar radius = cylinder->getRadius();
-					btScalar halfHeight = cylinder->getHalfExtentsWithMargin()[upAxis];
-					getDebugDrawer()->drawCylinder(radius, halfHeight, upAxis, worldTransform, color);
-					break;
+			}
+		}
-				{
-					const btStaticPlaneShape* staticPlaneShape = static_cast<const btStaticPlaneShape*>(shape);
-					btScalar planeConst = staticPlaneShape->getPlaneConstant();
-					const btVector3& planeNormal = staticPlaneShape->getPlaneNormal();
-					getDebugDrawer()->drawPlane(planeNormal, planeConst,worldTransform, color);
-					break;
+		if ((getDebugDrawer()->getDebugMode() & (btIDebugDraw::DBG_DrawWireframe | btIDebugDraw::DBG_DrawAabb)))
+		{
+			int i;
-				}
-			default:
+			for (  i=0;i<m_collisionObjects.size();i++)
+			{
+				btCollisionObject* colObj = m_collisionObjects[i];
+				if ((colObj->getCollisionFlags() & btCollisionObject::CF_DISABLE_VISUALIZE_OBJECT)==0)
-					if (shape->isConcave())
+					if (getDebugDrawer() && (getDebugDrawer()->getDebugMode() & btIDebugDraw::DBG_DrawWireframe))
-						btConcaveShape* concaveMesh = (btConcaveShape*) shape;
-						///@todo pass camera, for some culling? no -> we are not a graphics lib
-						btVector3 aabbMax(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT));
-						btVector3 aabbMin(btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT));
+						btVector3 color(btScalar(0.4),btScalar(0.4),btScalar(0.4));
-						DebugDrawcallback drawCallback(getDebugDrawer(),worldTransform,color);
-						concaveMesh->processAllTriangles(&drawCallback,aabbMin,aabbMax);
+						switch(colObj->getActivationState())
+						{
+						case  ACTIVE_TAG:
+							color = defaultColors.m_activeObject; break;
+						case ISLAND_SLEEPING:
+							color =  defaultColors.m_deactivatedObject;break;
+							color = defaultColors.m_wantsDeactivationObject;break;
+							color = defaultColors.m_disabledDeactivationObject;break;
+							color = defaultColors.m_disabledSimulationObject;break;
+						default:
+							{
+								color = btVector3(btScalar(.3),btScalar(0.3),btScalar(0.3));
+							}
+						};
+						debugDrawObject(colObj->getWorldTransform(),colObj->getCollisionShape(),color);
-					if (shape->getShapeType() == CONVEX_TRIANGLEMESH_SHAPE_PROXYTYPE)
+					if (m_debugDrawer && (m_debugDrawer->getDebugMode() & btIDebugDraw::DBG_DrawAabb))
-						btConvexTriangleMeshShape* convexMesh = (btConvexTriangleMeshShape*) shape;
-						//todo: pass camera for some culling			
-						btVector3 aabbMax(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT));
-						btVector3 aabbMin(btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT));
-						//DebugDrawcallback drawCallback;
-						DebugDrawcallback drawCallback(getDebugDrawer(),worldTransform,color);
-						convexMesh->getMeshInterface()->InternalProcessAllTriangles(&drawCallback,aabbMin,aabbMax);
-					}
-				}
-			}
-		}
-	}
+						btVector3 minAabb,maxAabb;
+						btVector3 colorvec = defaultColors.m_aabb;
+						colObj->getCollisionShape()->getAabb(colObj->getWorldTransform(), minAabb,maxAabb);
+						btVector3 contactThreshold(gContactBreakingThreshold,gContactBreakingThreshold,gContactBreakingThreshold);
+						minAabb -= contactThreshold;
+						maxAabb += contactThreshold;
+						btVector3 minAabb2,maxAabb2;
-void	btCollisionWorld::debugDrawWorld()
-	if (getDebugDrawer() && getDebugDrawer()->getDebugMode() & btIDebugDraw::DBG_DrawContactPoints)
-	{
-		int numManifolds = getDispatcher()->getNumManifolds();
-		btVector3 color(1,0.65,0);
-		for (int i=0;i<numManifolds;i++)
-		{
-			btPersistentManifold* contactManifold = getDispatcher()->getManifoldByIndexInternal(i);
-			//btCollisionObject* obA = static_cast<btCollisionObject*>(contactManifold->getBody0());
-			//btCollisionObject* obB = static_cast<btCollisionObject*>(contactManifold->getBody1());
-			int numContacts = contactManifold->getNumContacts();
-			for (int j=0;j<numContacts;j++)
-			{
-				btManifoldPoint& cp = contactManifold->getContactPoint(j);
-				getDebugDrawer()->drawContactPoint(cp.m_positionWorldOnB,cp.m_normalWorldOnB,cp.getDistance(),cp.getLifeTime(),color);
-			}
-		}
-	}
-	if (getDebugDrawer() && (getDebugDrawer()->getDebugMode() & (btIDebugDraw::DBG_DrawWireframe | btIDebugDraw::DBG_DrawAabb)))
-	{
-		int i;
-		for (  i=0;i<m_collisionObjects.size();i++)
-		{
-			btCollisionObject* colObj = m_collisionObjects[i];
-			if ((colObj->getCollisionFlags() & btCollisionObject::CF_DISABLE_VISUALIZE_OBJECT)==0)
-			{
-				if (getDebugDrawer() && (getDebugDrawer()->getDebugMode() & btIDebugDraw::DBG_DrawWireframe))
-				{
-					btVector3 color(btScalar(1.),btScalar(1.),btScalar(1.));
-					switch(colObj->getActivationState())
-					{
-					case  ACTIVE_TAG:
-						color = btVector3(btScalar(1.),btScalar(1.),btScalar(1.)); break;
-						color =  btVector3(btScalar(0.),btScalar(1.),btScalar(0.));break;
-						color = btVector3(btScalar(0.),btScalar(1.),btScalar(1.));break;
-						color = btVector3(btScalar(1.),btScalar(0.),btScalar(0.));break;
-						color = btVector3(btScalar(1.),btScalar(1.),btScalar(0.));break;
-					default:
+						if(getDispatchInfo().m_useContinuous && colObj->getInternalType()==btCollisionObject::CO_RIGID_BODY && !colObj->isStaticOrKinematicObject())
-							color = btVector3(btScalar(1),btScalar(0.),btScalar(0.));
+							colObj->getCollisionShape()->getAabb(colObj->getInterpolationWorldTransform(),minAabb2,maxAabb2);
+							minAabb2 -= contactThreshold;
+							maxAabb2 += contactThreshold;
+							minAabb.setMin(minAabb2);
+							maxAabb.setMax(maxAabb2);
-					};
-					debugDrawObject(colObj->getWorldTransform(),colObj->getCollisionShape(),color);
-				}
-				if (m_debugDrawer && (m_debugDrawer->getDebugMode() & btIDebugDraw::DBG_DrawAabb))
-				{
-					btVector3 minAabb,maxAabb;
-					btVector3 colorvec(1,0,0);
-					colObj->getCollisionShape()->getAabb(colObj->getWorldTransform(), minAabb,maxAabb);
-					btVector3 contactThreshold(gContactBreakingThreshold,gContactBreakingThreshold,gContactBreakingThreshold);
-					minAabb -= contactThreshold;
-					maxAabb += contactThreshold;
-					btVector3 minAabb2,maxAabb2;
-					if(getDispatchInfo().m_useContinuous && colObj->getInternalType()==btCollisionObject::CO_RIGID_BODY && !colObj->isStaticOrKinematicObject())
-					{
-						colObj->getCollisionShape()->getAabb(colObj->getInterpolationWorldTransform(),minAabb2,maxAabb2);
-						minAabb2 -= contactThreshold;
-						maxAabb2 += contactThreshold;
-						minAabb.setMin(minAabb2);
-						maxAabb.setMax(maxAabb2);
+						m_debugDrawer->drawAabb(minAabb,maxAabb,colorvec);
-					m_debugDrawer->drawAabb(minAabb,maxAabb,colorvec);
@@ -1478,15 +1586,6 @@ void	btCollisionWorld::debugDrawWorld()
 void	btCollisionWorld::serializeCollisionObjects(btSerializer* serializer)
 	int i;
-	//serialize all collision objects
-	for (i=0;i<m_collisionObjects.size();i++)
-	{
-		btCollisionObject* colObj = m_collisionObjects[i];
-		if (colObj->getInternalType() == btCollisionObject::CO_COLLISION_OBJECT)
-		{
-			colObj->serializeSingleObject(serializer);
-		}
-	}
 	///keep track of shapes already serialized
 	btHashMap<btHashPtr,btCollisionShape*>	serializedShapes;
@@ -1503,6 +1602,15 @@ void	btCollisionWorld::serializeCollisionObjects(btSerializer* serializer)
+	//serialize all collision objects
+	for (i=0;i<m_collisionObjects.size();i++)
+	{
+		btCollisionObject* colObj = m_collisionObjects[i];
+		if ((colObj->getInternalType() == btCollisionObject::CO_COLLISION_OBJECT) || (colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK))
+		{
+			colObj->serializeSingleObject(serializer);
+		}
+	}
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorld.h b/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorld.h
index 0a92d2d6..be9eca61 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorld.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorld.h
@@ -1,6 +1,6 @@
 Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://bulletphysics.com/Bullet/
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
@@ -18,21 +18,28 @@ subject to the following restrictions:
  * @mainpage Bullet Documentation
  * @section intro_sec Introduction
- * Bullet Collision Detection & Physics SDK
- *
  * Bullet is a Collision Detection and Rigid Body Dynamics Library. The Library is Open Source and free for commercial use, under the ZLib license ( http://opensource.org/licenses/zlib-license.php ).
  * The main documentation is Bullet_User_Manual.pdf, included in the source code distribution.
  * There is the Physics Forum for feedback and general Collision Detection and Physics discussions.
- * Please visit http://www.bulletphysics.com
+ * Please visit http://www.bulletphysics.org
  * @section install_sec Installation
  * @subsection step1 Step 1: Download
- * You can download the Bullet Physics Library from the Google Code repository: http://code.google.com/p/bullet/downloads/list
+ * You can download the Bullet Physics Library from the github repository: https://github.com/bulletphysics/bullet3/releases 
  * @subsection step2 Step 2: Building
- * Bullet main build system for all platforms is cmake, you can download http://www.cmake.org
+ * Bullet has multiple build systems, including premake, cmake and autotools. Premake and cmake support all platforms.
+ * Premake is included in the Bullet/build folder for Windows, Mac OSX and Linux. 
+ * Under Windows you can click on Bullet/build/vs2010.bat to create Microsoft Visual Studio projects. 
+ * On Mac OSX and Linux you can open a terminal and generate Makefile, codeblocks or Xcode4 projects:
+ * cd Bullet/build
+ * ./premake4_osx gmake or ./premake4_linux gmake or ./premake4_linux64 gmake or (for Mac) ./premake4_osx xcode4
+ * cd Bullet/build/gmake
+ * make
+ * 
+ * An alternative to premake is cmake. You can download cmake from http://www.cmake.org
  * cmake can autogenerate projectfiles for Microsoft Visual Studio, Apple Xcode, KDevelop and Unix Makefiles.
  * The easiest is to run the CMake cmake-gui graphical user interface and choose the options and generate projectfiles.
  * You can also use cmake in the command-line. Here are some examples for various platforms:
@@ -65,7 +72,6 @@ subject to the following restrictions:
-class btStackAlloc;
 class btCollisionShape;
 class btConvexShape;
 class btBroadphaseInterface;
@@ -91,8 +97,6 @@ protected:
 	btDispatcherInfo	m_dispatchInfo;
-	btStackAlloc*	m_stackAlloc;
 	btBroadphaseInterface*	m_broadphasePairCache;
 	btIDebugDraw*	m_debugDrawer;
@@ -144,6 +148,11 @@ public:
 	void	updateSingleAabb(btCollisionObject* colObj);
 	virtual void	updateAabbs();
+	///the computeOverlappingPairs is usually already called by performDiscreteCollisionDetection (or stepSimulation)
+	///it can be useful to use if you perform ray tests without collision detection/simulation
+	virtual void	computeOverlappingPairs();
 	virtual void	setDebugDrawer(btIDebugDraw*	debugDrawer)
@@ -173,7 +182,7 @@ public:
 	struct	LocalRayResult
-		LocalRayResult(btCollisionObject*	collisionObject, 
+		LocalRayResult(const btCollisionObject*	collisionObject, 
 			LocalShapeInfo*	localShapeInfo,
 			const btVector3&		hitNormalLocal,
 			btScalar hitFraction)
@@ -184,7 +193,7 @@ public:
-		btCollisionObject*		m_collisionObject;
+		const btCollisionObject*		m_collisionObject;
 		LocalShapeInfo*			m_localShapeInfo;
 		btVector3				m_hitNormalLocal;
 		btScalar				m_hitFraction;
@@ -195,11 +204,11 @@ public:
 	struct	RayResultCallback
 		btScalar	m_closestHitFraction;
-		btCollisionObject*		m_collisionObject;
+		const btCollisionObject*		m_collisionObject;
 		short int	m_collisionFilterGroup;
 		short int	m_collisionFilterMask;
-      //@BP Mod - Custom flags, currently used to enable backface culling on tri-meshes, see btRaycastCallback
-      unsigned int m_flags;
+		//@BP Mod - Custom flags, currently used to enable backface culling on tri-meshes, see btRaycastCallback.h. Apply any of the EFlags defined there on m_flags here to invoke.
+		unsigned int m_flags;
 		virtual ~RayResultCallback()
@@ -214,8 +223,8 @@ public:
-         //@BP Mod
-         m_flags(0)
+			//@BP Mod
+			m_flags(0)
@@ -272,7 +281,7 @@ public:
-		btAlignedObjectArray<btCollisionObject*>		m_collisionObjects;
+		btAlignedObjectArray<const btCollisionObject*>		m_collisionObjects;
 		btVector3	m_rayFromWorld;//used to calculate hitPointWorld from hitFraction
 		btVector3	m_rayToWorld;
@@ -306,7 +315,7 @@ public:
 	struct LocalConvexResult
-		LocalConvexResult(btCollisionObject*	hitCollisionObject, 
+		LocalConvexResult(const btCollisionObject*	hitCollisionObject, 
 			LocalShapeInfo*	localShapeInfo,
 			const btVector3&		hitNormalLocal,
 			const btVector3&		hitPointLocal,
@@ -320,7 +329,7 @@ public:
-		btCollisionObject*		m_hitCollisionObject;
+		const btCollisionObject*		m_hitCollisionObject;
 		LocalShapeInfo*			m_localShapeInfo;
 		btVector3				m_hitNormalLocal;
 		btVector3				m_hitPointLocal;
@@ -376,7 +385,7 @@ public:
 		btVector3	m_hitNormalWorld;
 		btVector3	m_hitPointWorld;
-		btCollisionObject*	m_hitCollisionObject;
+		const btCollisionObject*	m_hitCollisionObject;
 		virtual	btScalar	addSingleResult(LocalConvexResult& convexResult,bool normalInWorldSpace)
@@ -421,7 +430,7 @@ public:
 			return collides;
-		virtual	btScalar	addSingleResult(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1) = 0;
+		virtual	btScalar	addSingleResult(btManifoldPoint& cp,	const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1) = 0;
@@ -457,6 +466,10 @@ public:
 					  const btTransform& colObjWorldTransform,
 					  RayResultCallback& resultCallback);
+	static void	rayTestSingleInternal(const btTransform& rayFromTrans,const btTransform& rayToTrans,
+					  const btCollisionObjectWrapper* collisionObjectWrap,
+					  RayResultCallback& resultCallback);
 	/// objectQuerySingle performs a collision detection query and calls the resultCallback. It is used internally by rayTest.
 	static void	objectQuerySingle(const btConvexShape* castShape, const btTransform& rayFromTrans,const btTransform& rayToTrans,
 					  btCollisionObject* collisionObject,
@@ -464,6 +477,10 @@ public:
 					  const btTransform& colObjWorldTransform,
 					  ConvexResultCallback& resultCallback, btScalar	allowedPenetration);
+	static void	objectQuerySingleInternal(const btConvexShape* castShape,const btTransform& convexFromTrans,const btTransform& convexToTrans,
+											const btCollisionObjectWrapper* colObjWrap,
+											ConvexResultCallback& resultCallback, btScalar allowedPenetration);
 	virtual void	addCollisionObject(btCollisionObject* collisionObject,short int collisionFilterGroup=btBroadphaseProxy::DefaultFilter,short int collisionFilterMask=btBroadphaseProxy::AllFilter);
 	btCollisionObjectArray& getCollisionObjectArray()
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorldImporter.cpp b/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorldImporter.cpp
new file mode 100644
index 00000000..36dd0435
--- /dev/null
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorldImporter.cpp
@@ -0,0 +1,1147 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2014 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "btCollisionWorldImporter.h"
+#include "btBulletCollisionCommon.h"
+#include "LinearMath/btSerializer.h" //for btBulletSerializedArrays definition
+#include "BulletCollision/Gimpact/btGImpactShape.h"
+btCollisionWorldImporter::btCollisionWorldImporter(btCollisionWorld* world)
+// Converts every serialized BVH, collision shape and collision object held in
+// `arrays` into live objects tracked by this importer.
+//
+// Order matters: BVHs are deserialized first and stored in m_bvhMap, shapes are
+// then built through convertCollisionShape() and stored in m_shapeMap, and
+// finally one collision object is created per serialized object whose shape
+// can be found in m_shapeMap. Objects without a known shape are reported with
+// printf and skipped. Always returns true.
+//
+// Both the double and the float object paths apply the serialized friction and
+// restitution, so a file behaves the same regardless of its precision.
+bool	btCollisionWorldImporter::convertAllObjects( btBulletSerializedArrays* arrays)
+	m_shapeMap.clear();
+	m_bodyMap.clear();
+	int i;
+	// Double-precision BVHs, keyed by their serialized data pointer.
+	for (i=0;i<arrays->m_bvhsDouble.size();i++)
+	{
+		btOptimizedBvh* bvh = createOptimizedBvh();
+		btQuantizedBvhDoubleData* bvhData = arrays->m_bvhsDouble[i];
+		bvh->deSerializeDouble(*bvhData);
+		m_bvhMap.insert(arrays->m_bvhsDouble[i],bvh);
+	}
+	// Single-precision BVHs, keyed the same way.
+	for (i=0;i<arrays->m_bvhsFloat.size();i++)
+	{
+		btOptimizedBvh* bvh = createOptimizedBvh();
+		btQuantizedBvhFloatData* bvhData = arrays->m_bvhsFloat[i];
+		bvh->deSerializeFloat(*bvhData);
+		m_bvhMap.insert(arrays->m_bvhsFloat[i],bvh);
+	}
+	// Collision shapes; named shapes are additionally registered by name.
+	for (i=0;i<arrays->m_colShapeData.size();i++)
+	{
+		btCollisionShapeData* shapeData = arrays->m_colShapeData[i];
+		btCollisionShape* shape = convertCollisionShape(shapeData);
+		if (shape)
+		{
+	//		printf("shapeMap.insert(%x,%x)\n",shapeData,shape);
+			m_shapeMap.insert(shapeData,shape);
+		}
+		if (shape&& shapeData->m_name)
+		{
+			char* newname = duplicateName(shapeData->m_name);
+			m_objectNameMap.insert(shape,newname);
+			m_nameShapeMap.insert(newname,shape);
+		}
+	}
+	// Collision objects stored in double precision.
+	for (i=0;i<arrays->m_collisionObjectDataDouble.size();i++)
+	{
+		btCollisionObjectDoubleData* colObjData = arrays->m_collisionObjectDataDouble[i];
+		btCollisionShape** shapePtr = m_shapeMap.find(colObjData->m_collisionShape);
+		if (shapePtr && *shapePtr)
+		{
+			btTransform startTransform;
+			// Zero the unused fourth component of the origin before deserializing.
+			colObjData->m_worldTransform.m_origin.m_floats[3] = 0.f;
+			startTransform.deSerializeDouble(colObjData->m_worldTransform);
+			btCollisionShape* shape = (btCollisionShape*)*shapePtr;
+			btCollisionObject* body = createCollisionObject(startTransform,shape,colObjData->m_name);
+			body->setFriction(btScalar(colObjData->m_friction));
+			body->setRestitution(btScalar(colObjData->m_restitution));
+			if (shape->getShapeType() == TRIANGLE_MESH_SHAPE_PROXYTYPE)
+			{
+				btBvhTriangleMeshShape* trimesh = (btBvhTriangleMeshShape*)shape;
+				// Meshes carrying a triangle info map need the custom material callback flag.
+				if (trimesh->getTriangleInfoMap())
+				{
+					body->setCollisionFlags(body->getCollisionFlags()  | btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK);
+				}
+			}
+			m_bodyMap.insert(colObjData,body);
+		} else
+		{
+			printf("error: no shape found\n");
+		}
+	}
+	// Collision objects stored in single precision.
+	for (i=0;i<arrays->m_collisionObjectDataFloat.size();i++)
+	{
+		btCollisionObjectFloatData* colObjData = arrays->m_collisionObjectDataFloat[i];
+		btCollisionShape** shapePtr = m_shapeMap.find(colObjData->m_collisionShape);
+		if (shapePtr && *shapePtr)
+		{
+			btTransform startTransform;
+			colObjData->m_worldTransform.m_origin.m_floats[3] = 0.f;
+			startTransform.deSerializeFloat(colObjData->m_worldTransform);
+			btCollisionShape* shape = (btCollisionShape*)*shapePtr;
+			btCollisionObject* body = createCollisionObject(startTransform,shape,colObjData->m_name);
+			// Previously omitted on this path: keep the material parameters
+			// in step with the double-precision branch above.
+			body->setFriction(btScalar(colObjData->m_friction));
+			body->setRestitution(btScalar(colObjData->m_restitution));
+			if (shape->getShapeType() == TRIANGLE_MESH_SHAPE_PROXYTYPE)
+			{
+				btBvhTriangleMeshShape* trimesh = (btBvhTriangleMeshShape*)shape;
+				if (trimesh->getTriangleInfoMap())
+				{
+					body->setCollisionFlags(body->getCollisionFlags()  | btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK);
+				}
+			}
+			m_bodyMap.insert(colObjData,body);
+		} else
+		{
+			printf("error: no shape found\n");
+		}
+	}
+	return true;
+// Releases everything this importer allocated: collision objects (after
+// removing them from the collision world, if one is set), shapes, BVHs,
+// triangle info maps, triangle index arrays, duplicated names, copied striding
+// mesh data and the aligned index/vertex buffers.
+//
+// The lookup maps are cleared at the end because their keys and values refer
+// to the objects and name strings freed here; leaving them populated would
+// let the by-name / by-pointer queries hand out dangling pointers.
+void btCollisionWorldImporter::deleteAllData()
+	int i;
+	for (i=0;i<m_allocatedCollisionObjects.size();i++)
+	{
+		if(m_collisionWorld)
+			m_collisionWorld->removeCollisionObject(m_allocatedCollisionObjects[i]);
+		delete m_allocatedCollisionObjects[i];
+	}
+	m_allocatedCollisionObjects.clear();
+	for (i=0;i<m_allocatedCollisionShapes.size();i++)
+	{
+		delete m_allocatedCollisionShapes[i];
+	}
+	m_allocatedCollisionShapes.clear();
+	for (i=0;i<m_allocatedBvhs.size();i++)
+	{
+		delete m_allocatedBvhs[i];
+	}
+	m_allocatedBvhs.clear();
+	for (i=0;i<m_allocatedTriangleInfoMaps.size();i++)
+	{
+		delete m_allocatedTriangleInfoMaps[i];
+	}
+	m_allocatedTriangleInfoMaps.clear();
+	for (i=0;i<m_allocatedTriangleIndexArrays.size();i++)
+	{
+		delete m_allocatedTriangleIndexArrays[i];
+	}
+	m_allocatedTriangleIndexArrays.clear();
+	for (i=0;i<m_allocatedNames.size();i++)
+	{
+		delete[] m_allocatedNames[i];
+	}
+	m_allocatedNames.clear();
+	for (i=0;i<m_allocatedbtStridingMeshInterfaceDatas.size();i++)
+	{
+		btStridingMeshInterfaceData* curData = m_allocatedbtStridingMeshInterfaceDatas[i];
+		for(int a = 0;a < curData->m_numMeshParts;a++)
+		{
+			btMeshPartData* curPart = &curData->m_meshPartsPtr[a];
+			// delete [] on a null pointer is a no-op, so no per-pointer checks are needed.
+			delete [] curPart->m_vertices3f;
+			delete [] curPart->m_vertices3d;
+			delete [] curPart->m_indices32;
+			delete [] curPart->m_3indices16;
+			delete [] curPart->m_indices16;
+			delete [] curPart->m_3indices8;
+		}
+		delete [] curData->m_meshPartsPtr;
+		delete curData;
+	}
+	m_allocatedbtStridingMeshInterfaceDatas.clear();
+	for (i=0;i<m_indexArrays.size();i++)
+	{
+		btAlignedFree(m_indexArrays[i]);
+	}
+	m_indexArrays.clear();
+	for (i=0;i<m_shortIndexArrays.size();i++)
+	{
+		btAlignedFree(m_shortIndexArrays[i]);
+	}
+	m_shortIndexArrays.clear();
+	for (i=0;i<m_charIndexArrays.size();i++)
+	{
+		btAlignedFree(m_charIndexArrays[i]);
+	}
+	m_charIndexArrays.clear();
+	for (i=0;i<m_floatVertexArrays.size();i++)
+	{
+		btAlignedFree(m_floatVertexArrays[i]);
+	}
+	m_floatVertexArrays.clear();
+	for (i=0;i<m_doubleVertexArrays.size();i++)
+	{
+		btAlignedFree(m_doubleVertexArrays[i]);
+	}
+	m_doubleVertexArrays.clear();
+	// Drop every lookup entry that referenced the memory released above.
+	m_objectNameMap.clear();
+	m_nameShapeMap.clear();
+	m_nameColObjMap.clear();
+	m_bvhMap.clear();
+	m_shapeMap.clear();
+	m_bodyMap.clear();
+btCollisionShape* btCollisionWorldImporter::convertCollisionShape(  btCollisionShapeData* shapeData  ) // Builds a live collision shape from serialized shape data; yields 0 when no shape could be created.
+	btCollisionShape* shape = 0;
+	switch (shapeData->m_shapeType) // NOTE(review): the case labels for the branches below are not visible in this chunk; each branch is identified by the data struct it casts to.
+		{
+		{ // branch handling btStaticPlaneShapeData
+			btStaticPlaneShapeData* planeData = (btStaticPlaneShapeData*)shapeData;
+			btVector3 planeNormal,localScaling;
+			planeNormal.deSerializeFloat(planeData->m_planeNormal);
+			localScaling.deSerializeFloat(planeData->m_localScaling);
+			shape = createPlaneShape(planeNormal,planeData->m_planeConstant);
+			shape->setLocalScaling(localScaling);
+			break;
+		}
+		{ // branch handling btScaledTriangleMeshShapeData
+			btScaledTriangleMeshShapeData* scaledMesh = (btScaledTriangleMeshShapeData*) shapeData;
+			btCollisionShapeData* colShapeData = (btCollisionShapeData*) &scaledMesh->m_trimeshShapeData;
+			colShapeData->m_shapeType = TRIANGLE_MESH_SHAPE_PROXYTYPE; // re-tags the embedded mesh data before the recursive conversion below
+			btCollisionShape* childShape = convertCollisionShape(colShapeData);
+			btBvhTriangleMeshShape* meshShape = (btBvhTriangleMeshShape*)childShape; // NOTE(review): childShape may be 0 (the trimesh branch can return 0) and is not checked here
+			btVector3 localScaling;
+			localScaling.deSerializeFloat(scaledMesh->m_localScaling);
+			shape = createScaledTrangleMeshShape(meshShape, localScaling);
+			break;
+		}
+		{ // branch handling btGImpactMeshShapeData
+			btGImpactMeshShapeData* gimpactData = (btGImpactMeshShapeData*) shapeData;
+			if (gimpactData->m_gimpactSubType == CONST_GIMPACT_TRIMESH_SHAPE)
+			{
+				btStridingMeshInterfaceData* interfaceData = createStridingMeshInterfaceData(&gimpactData->m_meshInterface); // importer-owned copy of the serialized mesh data
+				btTriangleIndexVertexArray* meshInterface = createMeshInterface(*interfaceData);
+				btGImpactMeshShape* gimpactShape = createGimpactShape(meshInterface);
+				btVector3 localScaling;
+				localScaling.deSerializeFloat(gimpactData->m_localScaling);
+				gimpactShape->setLocalScaling(localScaling);
+				gimpactShape->setMargin(btScalar(gimpactData->m_collisionMargin));
+				gimpactShape->updateBound();
+				shape = gimpactShape;
+			} else
+			{
+				printf("unsupported gimpact sub type\n"); // shape stays 0 for other sub types
+			}
+			break;
+		}
+	//The btCapsuleShape* API has issue passing the margin/scaling/halfextents unmodified through the API
+	//so deal with this
+		{ // branch handling btCapsuleShapeData
+			btCapsuleShapeData* capData = (btCapsuleShapeData*)shapeData;
+			switch (capData->m_upAxis)
+			{
+			case 0:
+				{
+					shape = createCapsuleShapeX(1,1); // placeholder dimensions; the real values are loaded by deSerializeFloat below
+					break;
+				}
+			case 1:
+				{
+					shape = createCapsuleShapeY(1,1);
+					break;
+				}
+			case 2:
+				{
+					shape = createCapsuleShapeZ(1,1);
+					break;
+				}
+			default:
+				{
+					printf("error: wrong up axis for btCapsuleShape\n");
+				}
+			};
+			if (shape)
+			{
+				btCapsuleShape* cap = (btCapsuleShape*) shape;
+				cap->deSerializeFloat(capData);
+			}
+			break;
+		}
+			{ // branch handling shapes stored as btConvexInternalShapeData (box, sphere, cylinder, cone, multi-sphere, convex hull)
+				btConvexInternalShapeData* bsd = (btConvexInternalShapeData*)shapeData;
+				btVector3 implicitShapeDimensions;
+				implicitShapeDimensions.deSerializeFloat(bsd->m_implicitShapeDimensions);
+				btVector3 localScaling;
+				localScaling.deSerializeFloat(bsd->m_localScaling);
+				btVector3 margin(bsd->m_collisionMargin,bsd->m_collisionMargin,bsd->m_collisionMargin);
+				switch (shapeData->m_shapeType) // NOTE(review): inner case labels are likewise not visible here
+				{
+						{ // box
+							btBoxShape* box= (btBoxShape*)createBoxShape(implicitShapeDimensions/localScaling+margin); // undoes the stored scaling and adds the margin back to get the constructor half extents
+							//box->initializePolyhedralFeatures();
+							shape = box;
+							break;
+						}
+						{ // sphere: radius is taken from the X component
+							shape = createSphereShape(implicitShapeDimensions.getX());
+							break;
+						}
+						{ // cylinder
+							btCylinderShapeData* cylData = (btCylinderShapeData*) shapeData;
+							btVector3 halfExtents = implicitShapeDimensions+margin;
+							switch (cylData->m_upAxis)
+							{
+							case 0:
+								{
+									shape = createCylinderShapeX(halfExtents.getY(),halfExtents.getX());
+									break;
+								}
+							case 1:
+								{
+									shape = createCylinderShapeY(halfExtents.getX(),halfExtents.getY());
+									break;
+								}
+							case 2:
+								{
+									shape = createCylinderShapeZ(halfExtents.getX(),halfExtents.getZ());
+									break;
+								}
+							default:
+								{
+									printf("unknown Cylinder up axis\n");
+								}
+							};
+							break;
+						}
+						{ // cone
+							btConeShapeData* conData = (btConeShapeData*) shapeData;
+							btVector3 halfExtents = implicitShapeDimensions;//+margin;
+							switch (conData->m_upIndex)
+							{
+							case 0:
+								{
+									shape = createConeShapeX(halfExtents.getY(),halfExtents.getX());
+									break;
+								}
+							case 1:
+								{
+									shape = createConeShapeY(halfExtents.getX(),halfExtents.getY());
+									break;
+								}
+							case 2:
+								{
+									shape = createConeShapeZ(halfExtents.getX(),halfExtents.getZ());
+									break;
+								}
+							default:
+								{
+									printf("unknown Cone up axis\n");
+								}
+							};
+							break;
+						}
+						{ // multi-sphere
+							btMultiSphereShapeData* mss = (btMultiSphereShapeData*)bsd;
+							int numSpheres = mss->m_localPositionArraySize;
+							btAlignedObjectArray<btVector3> tmpPos;
+							btAlignedObjectArray<btScalar> radii;
+							radii.resize(numSpheres);
+							tmpPos.resize(numSpheres);
+							int i;
+							for ( i=0;i<numSpheres;i++)
+							{
+								tmpPos[i].deSerializeFloat(mss->m_localPositionArrayPtr[i].m_pos);
+								radii[i] = mss->m_localPositionArrayPtr[i].m_radius;
+							}
+							shape = createMultiSphereShape(&tmpPos[0],&radii[0],numSpheres); // NOTE(review): indexes element 0 even when numSpheres is 0 — confirm that cannot occur
+							break;
+						}
+						{ // convex hull
+						//	int sz = sizeof(btConvexHullShapeData);
+						//	int sz2 = sizeof(btConvexInternalShapeData);
+						//	int sz3 = sizeof(btCollisionShapeData);
+							btConvexHullShapeData* convexData = (btConvexHullShapeData*)bsd;
+							int numPoints = convexData->m_numUnscaledPoints;
+							btAlignedObjectArray<btVector3> tmpPoints;
+							tmpPoints.resize(numPoints);
+							int i;
+							for ( i=0;i<numPoints;i++)
+							{
+							if (convexData->m_unscaledPointsDoublePtr) // NOTE(review): two alternative pairs of statements appear back to back here; presumably they belong to preprocessor branches whose directives are not visible — confirm
+								tmpPoints[i].deSerialize(convexData->m_unscaledPointsDoublePtr[i]);
+							if (convexData->m_unscaledPointsFloatPtr)
+								tmpPoints[i].deSerializeFloat(convexData->m_unscaledPointsFloatPtr[i]);
+							if (convexData->m_unscaledPointsFloatPtr)
+								tmpPoints[i].deSerialize(convexData->m_unscaledPointsFloatPtr[i]);
+							if (convexData->m_unscaledPointsDoublePtr)
+								tmpPoints[i].deSerializeDouble(convexData->m_unscaledPointsDoublePtr[i]);
+							}
+							btConvexHullShape* hullShape = createConvexHullShape();
+							for (i=0;i<numPoints;i++)
+							{
+								hullShape->addPoint(tmpPoints[i]);
+							}
+							hullShape->setMargin(bsd->m_collisionMargin);
+							//hullShape->initializePolyhedralFeatures();
+							shape = hullShape;
+							break;
+						}
+					default:
+						{
+							printf("error: cannot create shape type (%d)\n",shapeData->m_shapeType);
+						}
+				}
+				if (shape) // common post-processing for every shape built in the inner switch
+				{
+					shape->setMargin(bsd->m_collisionMargin);
+					btVector3 localScaling;
+					localScaling.deSerializeFloat(bsd->m_localScaling);
+					shape->setLocalScaling(localScaling);
+				}
+				break;
+			}
+		{ // branch handling btTriangleMeshShapeData
+			btTriangleMeshShapeData* trimesh = (btTriangleMeshShapeData*)shapeData;
+			btStridingMeshInterfaceData* interfaceData = createStridingMeshInterfaceData(&trimesh->m_meshInterface);
+			btTriangleIndexVertexArray* meshInterface = createMeshInterface(*interfaceData);
+			if (!meshInterface->getNumSubParts()) // a mesh without any usable part produces no shape
+			{
+				return 0;
+			}
+			btVector3 scaling; scaling.deSerializeFloat(trimesh->m_meshInterface.m_scaling);
+			meshInterface->setScaling(scaling);
+			btOptimizedBvh* bvh = 0;
+#if 1 // NOTE(review): the matching #endif is not visible in this chunk
+			if (trimesh->m_quantizedFloatBvh)
+			{
+				btOptimizedBvh** bvhPtr = m_bvhMap.find(trimesh->m_quantizedFloatBvh); // reuse a BVH already registered in m_bvhMap when there is one
+				if (bvhPtr && *bvhPtr)
+				{
+					bvh = *bvhPtr;
+				} else
+				{
+					bvh = createOptimizedBvh();
+					bvh->deSerializeFloat(*trimesh->m_quantizedFloatBvh);
+				}
+			}
+			if (trimesh->m_quantizedDoubleBvh) // when both pointers are set, the double BVH chosen here replaces the float one
+			{
+				btOptimizedBvh** bvhPtr = m_bvhMap.find(trimesh->m_quantizedDoubleBvh);
+				if (bvhPtr && *bvhPtr)
+				{
+					bvh = *bvhPtr;
+				} else
+				{
+					bvh = createOptimizedBvh();
+					bvh->deSerializeDouble(*trimesh->m_quantizedDoubleBvh);
+				}
+			}
+			btBvhTriangleMeshShape* trimeshShape = createBvhTriangleMeshShape(meshInterface,bvh); // bvh may still be 0 here; the factory handles that case
+			trimeshShape->setMargin(trimesh->m_collisionMargin);
+			shape = trimeshShape;
+			if (trimesh->m_triangleInfoMap)
+			{
+				btTriangleInfoMap* map = createTriangleInfoMap();
+				map->deSerialize(*trimesh->m_triangleInfoMap);
+				trimeshShape->setTriangleInfoMap(map);
+				gContactAddedCallback = btAdjustInternalEdgeContactsCallback; // overwrites the global contact-added callback
+			}
+			//printf("trimesh->m_collisionMargin=%f\n",trimesh->m_collisionMargin);
+			break;
+		}
+			{ // branch handling btCompoundShapeData
+				btCompoundShapeData* compoundData = (btCompoundShapeData*)shapeData;
+				btCompoundShape* compoundShape = createCompoundShape();
+				btCompoundShapeChildData* childShapeDataArray = &compoundData->m_childShapePtr[0]; // not used by the visible code
+				btAlignedObjectArray<btCollisionShape*> childShapes; // not used by the visible code
+				for (int i=0;i<compoundData->m_numChildShapes;i++)
+				{
+					btCompoundShapeChildData* ptr = &compoundData->m_childShapePtr[i]; // not used by the visible code
+					btCollisionShapeData* cd = compoundData->m_childShapePtr[i].m_childShape;
+					btCollisionShape* childShape = convertCollisionShape(cd); // children are converted recursively
+					if (childShape)
+					{
+						btTransform localTransform;
+						localTransform.deSerializeFloat(compoundData->m_childShapePtr[i].m_transform);
+						compoundShape->addChildShape(localTransform,childShape);
+					} else
+					{
+#ifdef _DEBUG // NOTE(review): the matching #endif is not visible in this chunk
+						printf("error: couldn't create childShape for compoundShape\n");
+					}
+				}
+				shape = compoundShape;
+				break;
+			}
+			{ // branch that deliberately produces no shape; its case label is not visible here
+				return 0;
+			}
+		default:
+			{
+#ifdef _DEBUG // NOTE(review): the matching #endif is not visible in this chunk
+				printf("unsupported shape type (%d)\n",shapeData->m_shapeType);
+			}
+		}
+		return shape;
+// Makes a NUL-terminated heap copy of `name`, records it in m_allocatedNames
+// (where deleteAllData() later releases it) and returns the copy.
+// A null `name` yields 0 and allocates nothing.
+char* btCollisionWorldImporter::duplicateName(const char* name)
+	if (!name)
+		return 0;
+	const int length = (int)strlen(name);
+	char* copy = new char[length+1];
+	memcpy(copy,name,length);
+	copy[length] = 0;
+	m_allocatedNames.push_back(copy);
+	return copy;
+// Builds a triangle index/vertex array from serialized mesh-part data.
+//
+// For every mesh part the index data (32-bit, 16-bit triplets, flat 16-bit or
+// 8-bit triplets) and the vertex data (float, otherwise double) are copied
+// into aligned buffers that the importer tracks and frees in deleteAllData().
+// A part is only added to the result when it ended up with both an index
+// buffer and a vertex buffer.
+//
+// The index and vertex base pointers are reset to 0 for each part before any
+// source array is examined, so the final "both present" test never reads a
+// pointer that was not assigned. A part that has neither float nor double
+// vertex data is skipped instead of dereferencing a null source array.
+btTriangleIndexVertexArray* btCollisionWorldImporter::createMeshInterface(btStridingMeshInterfaceData&  meshData)
+	btTriangleIndexVertexArray* meshInterface = createTriangleMeshContainer();
+	for (int i=0;i<meshData.m_numMeshParts;i++)
+	{
+		const btMeshPartData& srcPart = meshData.m_meshPartsPtr[i];
+		btIndexedMesh meshPart;
+		meshPart.m_triangleIndexBase = 0;
+		meshPart.m_vertexBase = 0;
+		meshPart.m_numTriangles = srcPart.m_numTriangles;
+		meshPart.m_numVertices = srcPart.m_numVertices;
+		if (srcPart.m_indices32)
+		{
+			meshPart.m_indexType = PHY_INTEGER;
+			meshPart.m_triangleIndexStride = 3*sizeof(int);
+			int* indexArray = (int*)btAlignedAlloc(sizeof(int)*3*meshPart.m_numTriangles,16);
+			m_indexArrays.push_back(indexArray);
+			for (int j=0;j<3*meshPart.m_numTriangles;j++)
+			{
+				indexArray[j] = srcPart.m_indices32[j].m_value;
+			}
+			meshPart.m_triangleIndexBase = (const unsigned char*)indexArray;
+		} else
+		{
+			// The narrower index formats are only considered when no 32-bit indices exist.
+			if (srcPart.m_3indices16)
+			{
+				meshPart.m_indexType = PHY_SHORT;
+				meshPart.m_triangleIndexStride = sizeof(short int)*3;//sizeof(btShortIntIndexTripletData);
+				short int* indexArray = (short int*)btAlignedAlloc(sizeof(short int)*3*meshPart.m_numTriangles,16);
+				m_shortIndexArrays.push_back(indexArray);
+				for (int j=0;j<meshPart.m_numTriangles;j++)
+				{
+					indexArray[3*j] = srcPart.m_3indices16[j].m_values[0];
+					indexArray[3*j+1] = srcPart.m_3indices16[j].m_values[1];
+					indexArray[3*j+2] = srcPart.m_3indices16[j].m_values[2];
+				}
+				meshPart.m_triangleIndexBase = (const unsigned char*)indexArray;
+			}
+			if (srcPart.m_indices16)
+			{
+				meshPart.m_indexType = PHY_SHORT;
+				meshPart.m_triangleIndexStride = 3*sizeof(short int);
+				short int* indexArray = (short int*)btAlignedAlloc(sizeof(short int)*3*meshPart.m_numTriangles,16);
+				m_shortIndexArrays.push_back(indexArray);
+				for (int j=0;j<3*meshPart.m_numTriangles;j++)
+				{
+					indexArray[j] = srcPart.m_indices16[j].m_value;
+				}
+				meshPart.m_triangleIndexBase = (const unsigned char*)indexArray;
+			}
+			if (srcPart.m_3indices8)
+			{
+				meshPart.m_indexType = PHY_UCHAR;
+				meshPart.m_triangleIndexStride = sizeof(unsigned char)*3;
+				unsigned char* indexArray = (unsigned char*)btAlignedAlloc(sizeof(unsigned char)*3*meshPart.m_numTriangles,16);
+				m_charIndexArrays.push_back(indexArray);
+				for (int j=0;j<meshPart.m_numTriangles;j++)
+				{
+					indexArray[3*j] = srcPart.m_3indices8[j].m_values[0];
+					indexArray[3*j+1] = srcPart.m_3indices8[j].m_values[1];
+					indexArray[3*j+2] = srcPart.m_3indices8[j].m_values[2];
+				}
+				meshPart.m_triangleIndexBase = (const unsigned char*)indexArray;
+			}
+		}
+		if (srcPart.m_vertices3f)
+		{
+			meshPart.m_vertexType = PHY_FLOAT;
+			meshPart.m_vertexStride = sizeof(btVector3FloatData);
+			btVector3FloatData* vertices = (btVector3FloatData*) btAlignedAlloc(sizeof(btVector3FloatData)*meshPart.m_numVertices,16);
+			m_floatVertexArrays.push_back(vertices);
+			for (int j=0;j<meshPart.m_numVertices;j++)
+			{
+				vertices[j].m_floats[0] = srcPart.m_vertices3f[j].m_floats[0];
+				vertices[j].m_floats[1] = srcPart.m_vertices3f[j].m_floats[1];
+				vertices[j].m_floats[2] = srcPart.m_vertices3f[j].m_floats[2];
+				vertices[j].m_floats[3] = srcPart.m_vertices3f[j].m_floats[3];
+			}
+			meshPart.m_vertexBase = (const unsigned char*)vertices;
+		} else if (srcPart.m_vertices3d)
+		{
+			meshPart.m_vertexType = PHY_DOUBLE;
+			meshPart.m_vertexStride = sizeof(btVector3DoubleData);
+			btVector3DoubleData* vertices = (btVector3DoubleData*) btAlignedAlloc(sizeof(btVector3DoubleData)*meshPart.m_numVertices,16);
+			m_doubleVertexArrays.push_back(vertices);
+			for (int j=0;j<meshPart.m_numVertices;j++)
+			{
+				vertices[j].m_floats[0] = srcPart.m_vertices3d[j].m_floats[0];
+				vertices[j].m_floats[1] = srcPart.m_vertices3d[j].m_floats[1];
+				vertices[j].m_floats[2] = srcPart.m_vertices3d[j].m_floats[2];
+				vertices[j].m_floats[3] = srcPart.m_vertices3d[j].m_floats[3];
+			}
+			meshPart.m_vertexBase = (const unsigned char*)vertices;
+		}
+		// Only complete parts (indices and vertices) become sub-meshes.
+		if (meshPart.m_triangleIndexBase && meshPart.m_vertexBase)
+		{
+			meshInterface->addIndexedMesh(meshPart,meshPart.m_indexType);
+		}
+	}
+	return meshInterface;
+btStridingMeshInterfaceData* btCollisionWorldImporter::createStridingMeshInterfaceData(btStridingMeshInterfaceData* interfaceData)
+	//create a new btStridingMeshInterfaceData that is an exact copy of shapedata and store it in the WorldImporter
+	btStridingMeshInterfaceData* newData = new btStridingMeshInterfaceData;
+	newData->m_scaling = interfaceData->m_scaling;
+	newData->m_numMeshParts = interfaceData->m_numMeshParts;
+	newData->m_meshPartsPtr = new btMeshPartData[newData->m_numMeshParts];
+	for(int i = 0;i < newData->m_numMeshParts;i++)
+	{
+		btMeshPartData* curPart = &interfaceData->m_meshPartsPtr[i];
+		btMeshPartData* curNewPart = &newData->m_meshPartsPtr[i];
+		curNewPart->m_numTriangles = curPart->m_numTriangles;
+		curNewPart->m_numVertices = curPart->m_numVertices;
+		if(curPart->m_vertices3f)
+		{
+			curNewPart->m_vertices3f = new btVector3FloatData[curNewPart->m_numVertices];
+			memcpy(curNewPart->m_vertices3f,curPart->m_vertices3f,sizeof(btVector3FloatData) * curNewPart->m_numVertices);
+		}
+		else
+			curNewPart->m_vertices3f = NULL;
+		if(curPart->m_vertices3d)
+		{
+			curNewPart->m_vertices3d = new btVector3DoubleData[curNewPart->m_numVertices];
+			memcpy(curNewPart->m_vertices3d,curPart->m_vertices3d,sizeof(btVector3DoubleData) * curNewPart->m_numVertices);
+		}
+		else
+			curNewPart->m_vertices3d = NULL;
+		int numIndices = curNewPart->m_numTriangles * 3;
+		///the m_3indices8 was not initialized in some Bullet versions, this can cause crashes at loading time
+		///we catch it by only dealing with m_3indices8 if none of the other indices are initialized
+		bool uninitialized3indices8Workaround =false;
+		if(curPart->m_indices32)
+		{
+			uninitialized3indices8Workaround=true;
+			curNewPart->m_indices32 = new btIntIndexData[numIndices];
+			memcpy(curNewPart->m_indices32,curPart->m_indices32,sizeof(btIntIndexData) * numIndices);
+		}
+		else
+			curNewPart->m_indices32 = NULL;
+		if(curPart->m_3indices16)
+		{
+			uninitialized3indices8Workaround=true;
+			curNewPart->m_3indices16 = new btShortIntIndexTripletData[curNewPart->m_numTriangles];
+			memcpy(curNewPart->m_3indices16,curPart->m_3indices16,sizeof(btShortIntIndexTripletData) * curNewPart->m_numTriangles);
+		}
+		else
+			curNewPart->m_3indices16 = NULL;
+		if(curPart->m_indices16)
+		{
+			uninitialized3indices8Workaround=true;
+			curNewPart->m_indices16 = new btShortIntIndexData[numIndices];
+			memcpy(curNewPart->m_indices16,curPart->m_indices16,sizeof(btShortIntIndexData) * numIndices);
+		}
+		else
+			curNewPart->m_indices16 = NULL;
+		if(!uninitialized3indices8Workaround && curPart->m_3indices8)
+		{
+			curNewPart->m_3indices8 = new btCharIndexTripletData[curNewPart->m_numTriangles];
+			memcpy(curNewPart->m_3indices8,curPart->m_3indices8,sizeof(btCharIndexTripletData) * curNewPart->m_numTriangles);
+		}
+		else
+			curNewPart->m_3indices8 = NULL;
+	}
+	m_allocatedbtStridingMeshInterfaceDatas.push_back(newData);
+	return(newData);
+extern ContactAddedCallback		gContactAddedCallback;
+static bool btAdjustInternalEdgeContactsCallback(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1) // Contact-added callback that forwards to btAdjustInternalEdgeContacts and always returns true. NOTE(review): this definition appears after its use in convertCollisionShape in this chunk — confirm the real ordering / build guard.
+	btAdjustInternalEdgeContacts(cp,colObj1,colObj0, partId1,index1); // objects are passed swapped (colObj1 first) together with the second object's part/triangle index; partId0 and index0 are unused
+		//btAdjustInternalEdgeContacts(cp,colObj1,colObj0, partId1,index1, BT_TRIANGLE_CONVEX_BACKFACE_MODE);
+		//btAdjustInternalEdgeContacts(cp,colObj1,colObj0, partId1,index1, BT_TRIANGLE_CONVEX_DOUBLE_SIDED+BT_TRIANGLE_CONCAVE_DOUBLE_SIDED);
+	return true;
+btRigidBody*  btWorldImporter::createRigidBody(bool isDynamic, btScalar mass, const btTransform& startTransform,btCollisionShape* shape,const char* bodyName) // Creates a rigid body for `shape`, optionally adds it to the dynamics world and registers its name. NOTE(review): qualified with btWorldImporter, not btCollisionWorldImporter — confirm whether this is meant to be compiled in this file.
+	btVector3 localInertia;
+	localInertia.setZero();
+	if (mass) // inertia is only computed for a non-zero mass; isDynamic is not consulted
+		shape->calculateLocalInertia(mass,localInertia);
+	btRigidBody* body = new btRigidBody(mass,0,shape,localInertia);
+	body->setWorldTransform(startTransform);
+	if (m_dynamicsWorld)
+		m_dynamicsWorld->addRigidBody(body);
+	if (bodyName)
+	{
+		char* newname = duplicateName(bodyName); // importer-owned copy of the name
+		m_objectNameMap.insert(body,newname);
+		m_nameBodyMap.insert(newname,body);
+	}
+	m_allocatedRigidBodies.push_back(body); // tracked so the importer can release it later
+	return body;
+// Returns the collision object registered under `name` in m_nameColObjMap.
+// A missing entry and a stored null pointer both yield 0.
+btCollisionObject* btCollisionWorldImporter::getCollisionObjectByName(const char* name)
+	btCollisionObject** entry = m_nameColObjMap.find(name);
+	if (!entry || !*entry)
+		return 0;
+	return *entry;
+// Creates a collision object at `startTransform` using `shape`, adds it to the
+// collision world when one is set, registers it under `bodyName` (if given)
+// and records it in m_allocatedCollisionObjects so deleteAllData() releases it.
+//
+// The world pointer is checked before use, matching deleteAllData(), which
+// already treats m_collisionWorld as optional.
+btCollisionObject* btCollisionWorldImporter::createCollisionObject(const btTransform& startTransform,btCollisionShape* shape, const char* bodyName)
+	btCollisionObject* colObj = new btCollisionObject();
+	colObj->setWorldTransform(startTransform);
+	colObj->setCollisionShape(shape);
+	if (m_collisionWorld)
+		m_collisionWorld->addCollisionObject(colObj);//todo: flags etc
+	if (bodyName)
+	{
+		// The maps store the importer-owned copy, not the caller's string.
+		char* newname = duplicateName(bodyName);
+		m_objectNameMap.insert(colObj,newname);
+		m_nameColObjMap.insert(newname,colObj);
+	}
+	m_allocatedCollisionObjects.push_back(colObj);
+	return colObj;
+// Allocates a static plane shape from the given normal and constant and
+// records it in m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createPlaneShape(const btVector3& planeNormal,btScalar planeConstant)
+	btStaticPlaneShape* plane = new btStaticPlaneShape(planeNormal,planeConstant);
+	m_allocatedCollisionShapes.push_back(plane);
+	return plane;
+// Allocates a box shape with the given half extents and records it in
+// m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createBoxShape(const btVector3& halfExtents)
+	btBoxShape* box = new btBoxShape(halfExtents);
+	m_allocatedCollisionShapes.push_back(box);
+	return box;
+// Allocates a sphere shape of the given radius and records it in
+// m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createSphereShape(btScalar radius)
+	btSphereShape* sphere = new btSphereShape(radius);
+	m_allocatedCollisionShapes.push_back(sphere);
+	return sphere;
+// Allocates a btCapsuleShapeX from radius and height and records it in
+// m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createCapsuleShapeX(btScalar radius, btScalar height)
+	btCapsuleShapeX* capsule = new btCapsuleShapeX(radius,height);
+	m_allocatedCollisionShapes.push_back(capsule);
+	return capsule;
+// Allocates a btCapsuleShape (the class used here for the Y variant) from
+// radius and height and records it in m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createCapsuleShapeY(btScalar radius, btScalar height)
+	btCapsuleShape* capsule = new btCapsuleShape(radius,height);
+	m_allocatedCollisionShapes.push_back(capsule);
+	return capsule;
+// Allocates a btCapsuleShapeZ from radius and height and records it in
+// m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createCapsuleShapeZ(btScalar radius, btScalar height)
+	btCapsuleShapeZ* capsule = new btCapsuleShapeZ(radius,height);
+	m_allocatedCollisionShapes.push_back(capsule);
+	return capsule;
+// Allocates a btCylinderShapeX whose constructor vector is
+// (height, radius, radius) and records it in m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createCylinderShapeX(btScalar radius,btScalar height)
+	const btVector3 extents(height,radius,radius);
+	btCylinderShapeX* cylinder = new btCylinderShapeX(extents);
+	m_allocatedCollisionShapes.push_back(cylinder);
+	return cylinder;
+// Allocates a btCylinderShape whose constructor vector is
+// (radius, height, radius) and records it in m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createCylinderShapeY(btScalar radius,btScalar height)
+	const btVector3 extents(radius,height,radius);
+	btCylinderShape* cylinder = new btCylinderShape(extents);
+	m_allocatedCollisionShapes.push_back(cylinder);
+	return cylinder;
+// Allocates a btCylinderShapeZ whose constructor vector is
+// (radius, radius, height) and records it in m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createCylinderShapeZ(btScalar radius,btScalar height)
+	const btVector3 extents(radius,radius,height);
+	btCylinderShapeZ* cylinder = new btCylinderShapeZ(extents);
+	m_allocatedCollisionShapes.push_back(cylinder);
+	return cylinder;
+// Allocates a btConeShapeX from radius and height and records it in
+// m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createConeShapeX(btScalar radius,btScalar height)
+	btConeShapeX* cone = new btConeShapeX(radius,height);
+	m_allocatedCollisionShapes.push_back(cone);
+	return cone;
+// Allocates a btConeShape (the class used here for the Y variant) from radius
+// and height and records it in m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createConeShapeY(btScalar radius,btScalar height)
+	btConeShape* cone = new btConeShape(radius,height);
+	m_allocatedCollisionShapes.push_back(cone);
+	return cone;
+// Allocates a btConeShapeZ from radius and height and records it in
+// m_allocatedCollisionShapes.
+btCollisionShape* btCollisionWorldImporter::createConeShapeZ(btScalar radius,btScalar height)
+	btConeShapeZ* cone = new btConeShapeZ(radius,height);
+	m_allocatedCollisionShapes.push_back(cone);
+	return cone;
+// Allocates an empty triangle index/vertex array and records it in
+// m_allocatedTriangleIndexArrays.
+btTriangleIndexVertexArray*	btCollisionWorldImporter::createTriangleMeshContainer()
+	btTriangleIndexVertexArray* container = new btTriangleIndexVertexArray();
+	m_allocatedTriangleIndexArrays.push_back(container);
+	return container;
+// Allocates an empty optimized BVH and records it in m_allocatedBvhs.
+btOptimizedBvh*	btCollisionWorldImporter::createOptimizedBvh()
+	btOptimizedBvh* newBvh = new btOptimizedBvh();
+	m_allocatedBvhs.push_back(newBvh);
+	return newBvh;
+// Allocates an empty triangle info map and records it in
+// m_allocatedTriangleInfoMaps.
+btTriangleInfoMap* btCollisionWorldImporter::createTriangleInfoMap()
+	btTriangleInfoMap* infoMap = new btTriangleInfoMap();
+	m_allocatedTriangleInfoMaps.push_back(infoMap);
+	return infoMap;
+// Allocates a BVH triangle mesh shape over `trimesh` and records it in
+// m_allocatedCollisionShapes.
+// Without a `bvh` the shape is constructed with (trimesh, true). With a `bvh`
+// it is constructed with (trimesh, bvh->isQuantized(), false) and the supplied
+// BVH is then attached through setOptimizedBvh().
+btBvhTriangleMeshShape* btCollisionWorldImporter::createBvhTriangleMeshShape(btStridingMeshInterface* trimesh, btOptimizedBvh* bvh)
+	if (!bvh)
+	{
+		btBvhTriangleMeshShape* selfBuilt = new btBvhTriangleMeshShape(trimesh,true);
+		m_allocatedCollisionShapes.push_back(selfBuilt);
+		return selfBuilt;
+	}
+	btBvhTriangleMeshShape* withBvh = new btBvhTriangleMeshShape(trimesh,bvh->isQuantized(), false);
+	withBvh->setOptimizedBvh(bvh);
+	m_allocatedCollisionShapes.push_back(withBvh);
+	return withBvh;
+// Stub: no convex triangle mesh shape is created; `trimesh` is ignored and a
+// null shape is always returned.
+btCollisionShape* btCollisionWorldImporter::createConvexTriangleMeshShape(btStridingMeshInterface* trimesh)
+	btCollisionShape* noShape = 0;
+	return noShape;
+// Allocates a GImpact mesh shape over `trimesh` and records it in
+// m_allocatedCollisionShapes.
+btGImpactMeshShape* btCollisionWorldImporter::createGimpactShape(btStridingMeshInterface* trimesh)
+	btGImpactMeshShape* gimpact = new btGImpactMeshShape(trimesh);
+	m_allocatedCollisionShapes.push_back(gimpact);
+	return gimpact;
+// Allocates an empty convex hull shape and records it in
+// m_allocatedCollisionShapes.
+btConvexHullShape* btCollisionWorldImporter::createConvexHullShape()
+	btConvexHullShape* hull = new btConvexHullShape();
+	m_allocatedCollisionShapes.push_back(hull);
+	return hull;
+// Allocates an empty compound shape and records it in
+// m_allocatedCollisionShapes.
+btCompoundShape* btCollisionWorldImporter::createCompoundShape()
+	btCompoundShape* compound = new btCompoundShape();
+	m_allocatedCollisionShapes.push_back(compound);
+	return compound;
+// Allocates a scaled wrapper around `meshShape` using `localScaling` and
+// records it in m_allocatedCollisionShapes.
+btScaledBvhTriangleMeshShape* btCollisionWorldImporter::createScaledTrangleMeshShape(btBvhTriangleMeshShape* meshShape,const btVector3& localScaling)
+	btScaledBvhTriangleMeshShape* scaled = new btScaledBvhTriangleMeshShape(meshShape,localScaling);
+	m_allocatedCollisionShapes.push_back(scaled);
+	return scaled;
+// Allocates a multi-sphere shape from `numSpheres` positions and radii and
+// records it in m_allocatedCollisionShapes.
+btMultiSphereShape* btCollisionWorldImporter::createMultiSphereShape(const btVector3* positions,const btScalar* radi,int numSpheres)
+	btMultiSphereShape* multiSphere = new btMultiSphereShape(positions, radi, numSpheres);
+	m_allocatedCollisionShapes.push_back(multiSphere);
+	return multiSphere;
+	// query for data
+// Number of entries in m_allocatedCollisionShapes.
+int	btCollisionWorldImporter::getNumCollisionShapes() const
+	const int shapeCount = m_allocatedCollisionShapes.size();
+	return shapeCount;
+// Returns the shape stored at `index` in m_allocatedCollisionShapes; the
+// index is not range-checked here.
+btCollisionShape* btCollisionWorldImporter::getCollisionShapeByIndex(int index)
+	btCollisionShape* shapeAtIndex = m_allocatedCollisionShapes[index];
+	return shapeAtIndex;
+// Returns the shape registered under `name` in m_nameShapeMap.
+// A missing entry and a stored null pointer both yield 0.
+btCollisionShape* btCollisionWorldImporter::getCollisionShapeByName(const char* name)
+	btCollisionShape** entry = m_nameShapeMap.find(name);
+	if (!entry || !*entry)
+		return 0;
+	return *entry;
+// Returns the name registered for `ptr` in m_objectNameMap.
+// A missing entry and a stored null name both yield 0.
+const char*	btCollisionWorldImporter::getNameForPointer(const void* ptr) const
+	const char*const * entry = m_objectNameMap.find(ptr);
+	if (!entry || !*entry)
+		return 0;
+	return *entry;
+// Number of entries in m_allocatedRigidBodies.
+int btCollisionWorldImporter::getNumRigidBodies() const
+	const int bodyCount = m_allocatedRigidBodies.size();
+	return bodyCount;
+// Returns the object stored at `index` in m_allocatedRigidBodies; the index
+// is not range-checked here.
+btCollisionObject* btCollisionWorldImporter::getRigidBodyByIndex(int index) const
+	btCollisionObject* bodyAtIndex = m_allocatedRigidBodies[index];
+	return bodyAtIndex;
+// Number of entries in m_allocatedBvhs.
+int btCollisionWorldImporter::getNumBvhs() const
+	const int bvhCount = m_allocatedBvhs.size();
+	return bvhCount;
+// Returns the BVH stored at `index` in m_allocatedBvhs; the index is not
+// range-checked here.
+ btOptimizedBvh* btCollisionWorldImporter::getBvhByIndex(int index) const
+	btOptimizedBvh* bvhAtIndex = m_allocatedBvhs[index];
+	return bvhAtIndex;
+// Number of entries in m_allocatedTriangleInfoMaps.
+int btCollisionWorldImporter::getNumTriangleInfoMaps() const
+	const int mapCount = m_allocatedTriangleInfoMaps.size();
+	return mapCount;
+// Returns the triangle info map stored at `index` in
+// m_allocatedTriangleInfoMaps; the index is not range-checked here.
+btTriangleInfoMap* btCollisionWorldImporter::getTriangleInfoMapByIndex(int index) const
+	btTriangleInfoMap* mapAtIndex = m_allocatedTriangleInfoMaps[index];
+	return mapAtIndex;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorldImporter.h b/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorldImporter.h
new file mode 100644
index 00000000..9a6d16fb
--- /dev/null
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCollisionWorldImporter.h
@@ -0,0 +1,190 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2014 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "LinearMath/btTransform.h"
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btHashMap.h"
+class btCollisionShape;
+class btCollisionObject;
+struct btBulletSerializedArrays;
+struct ConstraintInput;
+class btCollisionWorld;
+struct btCollisionShapeData;
+class btTriangleIndexVertexArray;
+class btStridingMeshInterface;
+struct btStridingMeshInterfaceData;
+class btGImpactMeshShape;
+class btOptimizedBvh;
+struct btTriangleInfoMap;
+class btBvhTriangleMeshShape;
+class btPoint2PointConstraint;
+class btHingeConstraint;
+class btConeTwistConstraint;
+class btGeneric6DofConstraint;
+class btGeneric6DofSpringConstraint;
+class btSliderConstraint;
+class btGearConstraint;
+struct btContactSolverInfo;
+class btCollisionWorldImporter
+	btCollisionWorld* m_collisionWorld;
+	int m_verboseMode;
+	btAlignedObjectArray<btCollisionShape*>  m_allocatedCollisionShapes;
+	btAlignedObjectArray<btCollisionObject*> m_allocatedRigidBodies;
+	btAlignedObjectArray<btOptimizedBvh*>	 m_allocatedBvhs;
+	btAlignedObjectArray<btTriangleInfoMap*> m_allocatedTriangleInfoMaps;
+	btAlignedObjectArray<btTriangleIndexVertexArray*> m_allocatedTriangleIndexArrays;
+	btAlignedObjectArray<btStridingMeshInterfaceData*> m_allocatedbtStridingMeshInterfaceDatas;
+	btAlignedObjectArray<btCollisionObject*> m_allocatedCollisionObjects;
+	btAlignedObjectArray<char*>				m_allocatedNames;
+	btAlignedObjectArray<int*>				m_indexArrays;
+	btAlignedObjectArray<short int*>		m_shortIndexArrays;
+	btAlignedObjectArray<unsigned char*>	m_charIndexArrays;
+	btAlignedObjectArray<btVector3FloatData*>	m_floatVertexArrays;
+	btAlignedObjectArray<btVector3DoubleData*>	m_doubleVertexArrays;
+	btHashMap<btHashPtr,btOptimizedBvh*>	m_bvhMap;
+	btHashMap<btHashPtr,btTriangleInfoMap*>	m_timMap;
+	btHashMap<btHashString,btCollisionShape*>	m_nameShapeMap;
+	btHashMap<btHashString,btCollisionObject*>	m_nameColObjMap;
+	btHashMap<btHashPtr,const char*>	m_objectNameMap;
+	btHashMap<btHashPtr,btCollisionShape*>	m_shapeMap;
+	btHashMap<btHashPtr,btCollisionObject*>	m_bodyMap;
+	//methods
+	char*	duplicateName(const char* name);
+	btCollisionShape* convertCollisionShape(  btCollisionShapeData* shapeData  );
+	btCollisionWorldImporter(btCollisionWorld* world);
+	virtual ~btCollisionWorldImporter();
+    bool	convertAllObjects( btBulletSerializedArrays* arrays);
+		///delete all memory (collision shapes, rigid bodies, constraints etc.) that was allocated during the load.
+	///make sure you don't use the collision world containing these objects after you call this method
+	virtual void deleteAllData();
+	void	setVerboseMode(int verboseMode)
+	{
+		m_verboseMode = verboseMode;
+	}
+	int getVerboseMode() const
+	{
+		return m_verboseMode;
+	}
+		// query for data
+	int	getNumCollisionShapes() const;
+	btCollisionShape* getCollisionShapeByIndex(int index);
+	int getNumRigidBodies() const;
+	btCollisionObject* getRigidBodyByIndex(int index) const;
+	int getNumConstraints() const;
+	int getNumBvhs() const;
+	btOptimizedBvh*  getBvhByIndex(int index) const;
+	int getNumTriangleInfoMaps() const;
+	btTriangleInfoMap* getTriangleInfoMapByIndex(int index) const;
+	// queries involving named objects
+	btCollisionShape* getCollisionShapeByName(const char* name);
+	btCollisionObject* getCollisionObjectByName(const char* name);
+	const char*	getNameForPointer(const void* ptr) const;
+	///those virtuals are called by load and can be overridden by the user
+	//bodies
+	virtual btCollisionObject*  createCollisionObject(	const btTransform& startTransform,	btCollisionShape* shape,const char* bodyName);
+	///shapes
+	virtual btCollisionShape* createPlaneShape(const btVector3& planeNormal,btScalar planeConstant);
+	virtual btCollisionShape* createBoxShape(const btVector3& halfExtents);
+	virtual btCollisionShape* createSphereShape(btScalar radius);
+	virtual btCollisionShape* createCapsuleShapeX(btScalar radius, btScalar height);
+	virtual btCollisionShape* createCapsuleShapeY(btScalar radius, btScalar height);
+	virtual btCollisionShape* createCapsuleShapeZ(btScalar radius, btScalar height);
+	virtual btCollisionShape* createCylinderShapeX(btScalar radius,btScalar height);
+	virtual btCollisionShape* createCylinderShapeY(btScalar radius,btScalar height);
+	virtual btCollisionShape* createCylinderShapeZ(btScalar radius,btScalar height);
+	virtual btCollisionShape* createConeShapeX(btScalar radius,btScalar height);
+	virtual btCollisionShape* createConeShapeY(btScalar radius,btScalar height);
+	virtual btCollisionShape* createConeShapeZ(btScalar radius,btScalar height);
+	virtual class btTriangleIndexVertexArray*	createTriangleMeshContainer();
+	virtual	btBvhTriangleMeshShape* createBvhTriangleMeshShape(btStridingMeshInterface* trimesh, btOptimizedBvh* bvh);
+	virtual btCollisionShape* createConvexTriangleMeshShape(btStridingMeshInterface* trimesh);
+	virtual btGImpactMeshShape* createGimpactShape(btStridingMeshInterface* trimesh);
+	virtual btStridingMeshInterfaceData* createStridingMeshInterfaceData(btStridingMeshInterfaceData* interfaceData);
+	virtual class btConvexHullShape* createConvexHullShape();
+	virtual class btCompoundShape* createCompoundShape();
+	virtual class btScaledBvhTriangleMeshShape* createScaledTrangleMeshShape(btBvhTriangleMeshShape* meshShape,const btVector3& localScalingbtBvhTriangleMeshShape);
+	virtual class btMultiSphereShape* createMultiSphereShape(const btVector3* positions,const btScalar* radi,int numSpheres);
+	virtual btTriangleIndexVertexArray* createMeshInterface(btStridingMeshInterfaceData& meshData);
+	///acceleration and connectivity structures
+	virtual btOptimizedBvh*	createOptimizedBvh();
+	virtual btTriangleInfoMap* createTriangleInfoMap();
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp
index 54889a63..13cddc11 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp
@@ -11,6 +11,7 @@ subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 #include "BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h"
@@ -20,30 +21,34 @@ subject to the following restrictions:
 #include "LinearMath/btIDebugDraw.h"
 #include "LinearMath/btAabbUtil2.h"
 #include "btManifoldResult.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
+btShapePairCallback gCompoundChildShapePairCallback = 0;
-btCompoundCollisionAlgorithm::btCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped)
+btCompoundCollisionAlgorithm::btCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped)
 	m_ownsManifold = false;
-	btCollisionObject* colObj = m_isSwapped? body1 : body0;
-	btAssert (colObj->getCollisionShape()->isCompound());
+	const btCollisionObjectWrapper* colObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	btAssert (colObjWrap->getCollisionShape()->isCompound());
-	btCompoundShape* compoundShape = static_cast<btCompoundShape*>(colObj->getCollisionShape());
+	const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(colObjWrap->getCollisionShape());
 	m_compoundShapeRevision = compoundShape->getUpdateRevision();
-	preallocateChildAlgorithms(body0,body1);
+	preallocateChildAlgorithms(body0Wrap,body1Wrap);
-void	btCompoundCollisionAlgorithm::preallocateChildAlgorithms(btCollisionObject* body0,btCollisionObject* body1)
+void	btCompoundCollisionAlgorithm::preallocateChildAlgorithms(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
-	btCollisionObject* colObj = m_isSwapped? body1 : body0;
-	btCollisionObject* otherObj = m_isSwapped? body0 : body1;
-	btAssert (colObj->getCollisionShape()->isCompound());
+	const btCollisionObjectWrapper* colObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* otherObjWrap = m_isSwapped? body0Wrap : body1Wrap;
+	btAssert (colObjWrap->getCollisionShape()->isCompound());
-	btCompoundShape* compoundShape = static_cast<btCompoundShape*>(colObj->getCollisionShape());
+	const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(colObjWrap->getCollisionShape());
 	int numChildren = compoundShape->getNumChildShapes();
 	int i;
@@ -56,11 +61,11 @@ void	btCompoundCollisionAlgorithm::preallocateChildAlgorithms(btCollisionObject*
 			m_childCollisionAlgorithms[i] = 0;
 		} else
-			btCollisionShape* tmpShape = colObj->getCollisionShape();
-			btCollisionShape* childShape = compoundShape->getChildShape(i);
-			colObj->internalSetTemporaryCollisionShape( childShape );
-			m_childCollisionAlgorithms[i] = m_dispatcher->findAlgorithm(colObj,otherObj,m_sharedManifold);
-			colObj->internalSetTemporaryCollisionShape( tmpShape );
+			const btCollisionShape* childShape = compoundShape->getChildShape(i);
+			btCollisionObjectWrapper childWrap(colObjWrap,childShape,colObjWrap->getCollisionObject(),colObjWrap->getWorldTransform(),-1,i);//wrong child trans, but unused (hopefully)
+			m_childCollisionAlgorithms[i] = m_dispatcher->findAlgorithm(&childWrap,otherObjWrap,m_sharedManifold);
@@ -92,19 +97,16 @@ struct	btCompoundLeafCallback : btDbvt::ICollide
-	btCollisionObject* m_compoundColObj;
-	btCollisionObject* m_otherObj;
+	const btCollisionObjectWrapper* m_compoundColObjWrap;
+	const btCollisionObjectWrapper* m_otherObjWrap;
 	btDispatcher* m_dispatcher;
 	const btDispatcherInfo& m_dispatchInfo;
 	btManifoldResult*	m_resultOut;
 	btCollisionAlgorithm**	m_childCollisionAlgorithms;
 	btPersistentManifold*	m_sharedManifold;
-	btCompoundLeafCallback (btCollisionObject* compoundObj,btCollisionObject* otherObj,btDispatcher* dispatcher,const btDispatcherInfo& dispatchInfo,btManifoldResult*	resultOut,btCollisionAlgorithm**	childCollisionAlgorithms,btPersistentManifold*	sharedManifold)
-		:m_compoundColObj(compoundObj),m_otherObj(otherObj),m_dispatcher(dispatcher),m_dispatchInfo(dispatchInfo),m_resultOut(resultOut),
+	btCompoundLeafCallback (const btCollisionObjectWrapper* compoundObjWrap,const btCollisionObjectWrapper* otherObjWrap,btDispatcher* dispatcher,const btDispatcherInfo& dispatchInfo,btManifoldResult*	resultOut,btCollisionAlgorithm**	childCollisionAlgorithms,btPersistentManifold*	sharedManifold)
+		:m_compoundColObjWrap(compoundObjWrap),m_otherObjWrap(otherObjWrap),m_dispatcher(dispatcher),m_dispatchInfo(dispatchInfo),m_resultOut(resultOut),
@@ -112,73 +114,95 @@ public:
-	void	ProcessChildShape(btCollisionShape* childShape,int index)
+	void	ProcessChildShape(const btCollisionShape* childShape,int index)
-		btCompoundShape* compoundShape = static_cast<btCompoundShape*>(m_compoundColObj->getCollisionShape());
+		const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(m_compoundColObjWrap->getCollisionShape());
-		btTransform	orgTrans = m_compoundColObj->getWorldTransform();
-		btTransform	orgInterpolationTrans = m_compoundColObj->getInterpolationWorldTransform();
+		btTransform	orgTrans = m_compoundColObjWrap->getWorldTransform();
 		const btTransform& childTrans = compoundShape->getChildTransform(index);
 		btTransform	newChildWorldTrans = orgTrans*childTrans ;
 		//perform an AABB check first
 		btVector3 aabbMin0,aabbMax0,aabbMin1,aabbMax1;
-		m_otherObj->getCollisionShape()->getAabb(m_otherObj->getWorldTransform(),aabbMin1,aabbMax1);
+		m_otherObjWrap->getCollisionShape()->getAabb(m_otherObjWrap->getWorldTransform(),aabbMin1,aabbMax1);
+		if (gCompoundChildShapePairCallback)
+		{
+			if (!gCompoundChildShapePairCallback(m_otherObjWrap->getCollisionShape(), childShape))
+				return;
+		}
 		if (TestAabbAgainstAabb2(aabbMin0,aabbMax0,aabbMin1,aabbMax1))
-			m_compoundColObj->setWorldTransform( newChildWorldTrans);
-			m_compoundColObj->setInterpolationWorldTransform(newChildWorldTrans);
+			btCollisionObjectWrapper compoundWrap(this->m_compoundColObjWrap,childShape,m_compoundColObjWrap->getCollisionObject(),newChildWorldTrans,-1,index);
-			//the contactpoint is still projected back using the original inverted worldtrans
-			btCollisionShape* tmpShape = m_compoundColObj->getCollisionShape();
-			m_compoundColObj->internalSetTemporaryCollisionShape( childShape );
+			//the contact point is still projected back using the original inverted world transform
 			if (!m_childCollisionAlgorithms[index])
-				m_childCollisionAlgorithms[index] = m_dispatcher->findAlgorithm(m_compoundColObj,m_otherObj,m_sharedManifold);
+				m_childCollisionAlgorithms[index] = m_dispatcher->findAlgorithm(&compoundWrap,m_otherObjWrap,m_sharedManifold);
+			const btCollisionObjectWrapper* tmpWrap = 0;
 			///detect swapping case
-			if (m_resultOut->getBody0Internal() == m_compoundColObj)
+			if (m_resultOut->getBody0Internal() == m_compoundColObjWrap->getCollisionObject())
+				tmpWrap = m_resultOut->getBody0Wrap();
+				m_resultOut->setBody0Wrap(&compoundWrap);
 			} else
+				tmpWrap = m_resultOut->getBody1Wrap();
+				m_resultOut->setBody1Wrap(&compoundWrap);
-			m_childCollisionAlgorithms[index]->processCollision(m_compoundColObj,m_otherObj,m_dispatchInfo,m_resultOut);
+			m_childCollisionAlgorithms[index]->processCollision(&compoundWrap,m_otherObjWrap,m_dispatchInfo,m_resultOut);
+#if 0
 			if (m_dispatchInfo.m_debugDraw && (m_dispatchInfo.m_debugDraw->getDebugMode() & btIDebugDraw::DBG_DrawAabb))
 				btVector3 worldAabbMin,worldAabbMax;
+			if (m_resultOut->getBody0Internal() == m_compoundColObjWrap->getCollisionObject())
+			{
+				m_resultOut->setBody0Wrap(tmpWrap);
+			} else
+			{
+				m_resultOut->setBody1Wrap(tmpWrap);
+			}
-			//revert back transform
-			m_compoundColObj->internalSetTemporaryCollisionShape( tmpShape);
-			m_compoundColObj->setWorldTransform(  orgTrans );
-			m_compoundColObj->setInterpolationWorldTransform(orgInterpolationTrans);
 	void		Process(const btDbvtNode* leaf)
 		int index = leaf->dataAsInt;
-		btCompoundShape* compoundShape = static_cast<btCompoundShape*>(m_compoundColObj->getCollisionShape());
-		btCollisionShape* childShape = compoundShape->getChildShape(index);
+		const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(m_compoundColObjWrap->getCollisionShape());
+		const btCollisionShape* childShape = compoundShape->getChildShape(index);
+#if 0
 		if (m_dispatchInfo.m_debugDraw && (m_dispatchInfo.m_debugDraw->getDebugMode() & btIDebugDraw::DBG_DrawAabb))
 			btVector3 worldAabbMin,worldAabbMax;
-			btTransform	orgTrans = m_compoundColObj->getWorldTransform();
+			btTransform	orgTrans = m_compoundColObjWrap->getWorldTransform();
@@ -189,15 +213,13 @@ public:
-void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btCompoundCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
-	btCollisionObject* colObj = m_isSwapped? body1 : body0;
-	btCollisionObject* otherObj = m_isSwapped? body0 : body1;
+	const btCollisionObjectWrapper* colObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* otherObjWrap = m_isSwapped? body0Wrap : body1Wrap;
-	btAssert (colObj->getCollisionShape()->isCompound());
-	btCompoundShape* compoundShape = static_cast<btCompoundShape*>(colObj->getCollisionShape());
+	btAssert (colObjWrap->getCollisionShape()->isCompound());
+	const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(colObjWrap->getCollisionShape());
 	///btCompoundShape might have changed:
 	////make sure the internal child collision algorithm caches are still valid
@@ -206,20 +228,23 @@ void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,bt
 		///clear and update all
-		preallocateChildAlgorithms(body0,body1);
+		preallocateChildAlgorithms(body0Wrap,body1Wrap);
+		m_compoundShapeRevision = compoundShape->getUpdateRevision();
-	btDbvt* tree = compoundShape->getDynamicAabbTree();
+    if (m_childCollisionAlgorithms.size()==0)
+        return;
+	const btDbvt* tree = compoundShape->getDynamicAabbTree();
 	//use a dynamic aabb tree to cull potential child-overlaps
-	btCompoundLeafCallback  callback(colObj,otherObj,m_dispatcher,dispatchInfo,resultOut,&m_childCollisionAlgorithms[0],m_sharedManifold);
+	btCompoundLeafCallback  callback(colObjWrap,otherObjWrap,m_dispatcher,dispatchInfo,resultOut,&m_childCollisionAlgorithms[0],m_sharedManifold);
 	///we need to refresh all contact manifolds
 	///note that we should actually recursively traverse all children, btCompoundShape can nested more then 1 level deep
 	///so we should add a 'refreshManifolds' in the btCollisionAlgorithm
 		int i;
-		btManifoldArray manifoldArray;
+		manifoldArray.resize(0);
 		for (i=0;i<m_childCollisionAlgorithms.size();i++)
 			if (m_childCollisionAlgorithms[i])
@@ -244,12 +269,12 @@ void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,bt
 		btVector3 localAabbMin,localAabbMax;
 		btTransform otherInCompoundSpace;
-		otherInCompoundSpace = colObj->getWorldTransform().inverse() * otherObj->getWorldTransform();
-		otherObj->getCollisionShape()->getAabb(otherInCompoundSpace,localAabbMin,localAabbMax);
+		otherInCompoundSpace = colObjWrap->getWorldTransform().inverse() * otherObjWrap->getWorldTransform();
+		otherObjWrap->getCollisionShape()->getAabb(otherInCompoundSpace,localAabbMin,localAabbMax);
 		const ATTRIBUTE_ALIGNED16(btDbvtVolume)	bounds=btDbvtVolume::FromMM(localAabbMin,localAabbMax);
 		//process all children, that overlap with  the given AABB bounds
-		tree->collideTV(tree->m_root,bounds,callback);
+		tree->collideTVNoStackAlloc(tree->m_root,bounds,stack2,callback);
 	} else
@@ -266,10 +291,10 @@ void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,bt
 				//iterate over all children, perform an AABB check inside ProcessChildShape
 		int numChildren = m_childCollisionAlgorithms.size();
 		int i;
-		btManifoldArray	manifoldArray;
-        btCollisionShape* childShape = 0;
+		manifoldArray.resize(0);
+        const btCollisionShape* childShape = 0;
         btTransform	orgTrans;
-        btTransform	orgInterpolationTrans;
         btTransform	newChildWorldTrans;
         btVector3 aabbMin0,aabbMax0,aabbMin1,aabbMax1;        
@@ -279,14 +304,14 @@ void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,bt
 				childShape = compoundShape->getChildShape(i);
 			//if not longer overlapping, remove the algorithm
-                orgTrans = colObj->getWorldTransform();
-                orgInterpolationTrans = colObj->getInterpolationWorldTransform();
+				orgTrans = colObjWrap->getWorldTransform();
 				const btTransform& childTrans = compoundShape->getChildTransform(i);
                 newChildWorldTrans = orgTrans*childTrans ;
 				//perform an AABB check first
-				otherObj->getCollisionShape()->getAabb(otherObj->getWorldTransform(),aabbMin1,aabbMax1);
+				otherObjWrap->getCollisionShape()->getAabb(otherObjWrap->getWorldTransform(),aabbMin1,aabbMax1);
 				if (!TestAabbAgainstAabb2(aabbMin0,aabbMax0,aabbMin1,aabbMax1))
@@ -301,7 +326,8 @@ void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,bt
 btScalar	btCompoundCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+	btAssert(0);
+	//needs to be fixed, using btCollisionObjectWrapper and NOT modifying internal data structures
 	btCollisionObject* colObj = m_isSwapped? body1 : body0;
 	btCollisionObject* otherObj = m_isSwapped? body0 : body1;
@@ -324,8 +350,7 @@ btScalar	btCompoundCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject*
     btScalar frac;
 	for (i=0;i<numChildren;i++)
-		//temporarily exchange parent btCollisionShape with childShape, and recurse
-		btCollisionShape* childShape = compoundShape->getChildShape(i);
+		//btCollisionShape* childShape = compoundShape->getChildShape(i);
         orgTrans = colObj->getWorldTransform();
@@ -334,15 +359,15 @@ btScalar	btCompoundCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject*
 		//btTransform	newChildWorldTrans = orgTrans*childTrans ;
 		colObj->setWorldTransform( orgTrans*childTrans );
-		btCollisionShape* tmpShape = colObj->getCollisionShape();
-		colObj->internalSetTemporaryCollisionShape( childShape );
+		//btCollisionShape* tmpShape = colObj->getCollisionShape();
+		//colObj->internalSetTemporaryCollisionShape( childShape );
         frac = m_childCollisionAlgorithms[i]->calculateTimeOfImpact(colObj,otherObj,dispatchInfo,resultOut);
 		if (frac<hitFraction)
 			hitFraction = frac;
 		//revert back
-		colObj->internalSetTemporaryCollisionShape( tmpShape);
+		//colObj->internalSetTemporaryCollisionShape( tmpShape);
 		colObj->setWorldTransform( orgTrans);
 	return hitFraction;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h
index 40457498..d2086fbc 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h
@@ -11,6 +11,7 @@ subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
@@ -25,31 +26,47 @@ class btDispatcher;
 #include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
 #include "btCollisionCreateFunc.h"
 #include "LinearMath/btAlignedObjectArray.h"
+#include "BulletCollision/BroadphaseCollision/btDbvt.h"
 class btDispatcher;
 class btCollisionObject;
+class btCollisionShape;
+typedef bool (*btShapePairCallback)(const btCollisionShape* pShape0, const btCollisionShape* pShape1);
+extern btShapePairCallback gCompoundChildShapePairCallback;
 /// btCompoundCollisionAlgorithm  supports collision between CompoundCollisionShapes and other collision shapes
 class btCompoundCollisionAlgorithm  : public btActivatingCollisionAlgorithm
+	btNodeStack stack2;
+	btManifoldArray manifoldArray;
 	btAlignedObjectArray<btCollisionAlgorithm*> m_childCollisionAlgorithms;
 	bool m_isSwapped;
 	class btPersistentManifold*	m_sharedManifold;
 	bool					m_ownsManifold;
 	int	m_compoundShapeRevision;//to keep track of changes, so that childAlgorithm array can be updated
 	void	removeChildAlgorithms();
-	void	preallocateChildAlgorithms(btCollisionObject* body0,btCollisionObject* body1);
+	void	preallocateChildAlgorithms(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
-	btCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped);
+	btCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
 	virtual ~btCompoundCollisionAlgorithm();
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	btCollisionAlgorithm* getChildAlgorithm (int n) const
+	{
+		return m_childCollisionAlgorithms[n];
+	}
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	btScalar	calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -63,21 +80,22 @@ public:
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btCompoundCollisionAlgorithm));
-			return new(mem) btCompoundCollisionAlgorithm(ci,body0,body1,false);
+			return new(mem) btCompoundCollisionAlgorithm(ci,body0Wrap,body1Wrap,false);
 	struct SwappedCreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btCompoundCollisionAlgorithm));
-			return new(mem) btCompoundCollisionAlgorithm(ci,body0,body1,true);
+			return new(mem) btCompoundCollisionAlgorithm(ci,body0Wrap,body1Wrap,true);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCompoundCompoundCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btCompoundCompoundCollisionAlgorithm.cpp
new file mode 100644
index 00000000..1d64d84b
--- /dev/null
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCompoundCompoundCollisionAlgorithm.cpp
@@ -0,0 +1,426 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "btCompoundCompoundCollisionAlgorithm.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionShapes/btCompoundShape.h"
+#include "BulletCollision/BroadphaseCollision/btDbvt.h"
+#include "LinearMath/btIDebugDraw.h"
+#include "LinearMath/btAabbUtil2.h"
+#include "BulletCollision/CollisionDispatch/btManifoldResult.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
+btShapePairCallback gCompoundCompoundChildShapePairCallback = 0; // optional global child-pair filter, unset (0) by default; when set, btCompoundCompoundLeafCallback::Process skips every child pair for which it returns false
+btCompoundCompoundCollisionAlgorithm::btCompoundCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped) // Constructor: creates the child-pair algorithm cache and records the update revision of both compound shapes.
+	void* ptr = btAlignedAlloc(sizeof(btHashedSimplePairCache),16); // raw storage for the cache (second argument 16 is presumably the alignment -- confirm)
+	m_childCollisionAlgorithmCache= new(ptr) btHashedSimplePairCache(); // placement-new into that storage, so teardown must call the destructor explicitly
+	const btCollisionObjectWrapper* col0ObjWrap = body0Wrap;
+	btAssert (col0ObjWrap->getCollisionShape()->isCompound()); // this algorithm only accepts compound shapes on both sides
+	const btCollisionObjectWrapper* col1ObjWrap = body1Wrap;
+	btAssert (col1ObjWrap->getCollisionShape()->isCompound());
+	const btCompoundShape* compoundShape0 = static_cast<const btCompoundShape*>(col0ObjWrap->getCollisionShape());
+	m_compoundShapeRevision0 = compoundShape0->getUpdateRevision(); // remembered so processCollision can detect that the shape changed
+	const btCompoundShape* compoundShape1 = static_cast<const btCompoundShape*>(col1ObjWrap->getCollisionShape());
+	m_compoundShapeRevision1 = compoundShape1->getUpdateRevision();
+	removeChildAlgorithms(); // NOTE(review): this line and the two below read as teardown (destructor body); the enclosing function header is not visible in this patch text -- confirm
+	m_childCollisionAlgorithmCache->~btHashedSimplePairCache(); // explicit destructor call, matching the placement-new above
+	btAlignedFree(m_childCollisionAlgorithmCache); // release the storage obtained from btAlignedAlloc
+void	btCompoundCompoundCollisionAlgorithm::getAllContactManifolds(btManifoldArray&	manifoldArray) // Forwards the request to every cached child algorithm, handing each of them the same manifoldArray.
+	int i;
+	btSimplePairArray& pairs = m_childCollisionAlgorithmCache->getOverlappingPairArray(); // one entry per cached (childIndex0, childIndex1) pair
+	for (i=0;i<pairs.size();i++)
+	{
+		if (pairs[i].m_userPointer) // m_userPointer carries the pair's btCollisionAlgorithm*; pairs without one are skipped
+		{
+			((btCollisionAlgorithm*)pairs[i].m_userPointer)->getAllContactManifolds(manifoldArray);
+		}
+	}
+void	btCompoundCompoundCollisionAlgorithm::removeChildAlgorithms() // Destroys every cached child algorithm, hands its memory back to the dispatcher, then empties the pair cache.
+	btSimplePairArray& pairs = m_childCollisionAlgorithmCache->getOverlappingPairArray();
+	int numChildren = pairs.size(); // number of cached child pairs, not the number of child shapes
+	int i;
+	for (i=0;i<numChildren;i++)
+	{
+		if (pairs[i].m_userPointer)
+		{
+			btCollisionAlgorithm* algo = (btCollisionAlgorithm*) pairs[i].m_userPointer;
+			algo->~btCollisionAlgorithm(); // explicit destructor call; the memory itself is returned on the next line
+			m_dispatcher->freeCollisionAlgorithm(algo);
+		}
+	}
+	m_childCollisionAlgorithmCache->removeAllPairs(); // the stored algorithm pointers are stale now, so drop the pairs as well
+struct	btCompoundCompoundLeafCallback : btDbvt::ICollide // Leaf-pair visitor: for each pair of tree leaves it finds or creates a child-vs-child algorithm, caches it, and runs it.
+	int m_numOverlapPairs; // incremented once per Process() call
+	const btCollisionObjectWrapper* m_compound0ColObjWrap; // wrapper of the first compound body
+	const btCollisionObjectWrapper* m_compound1ColObjWrap; // wrapper of the second compound body
+	btDispatcher* m_dispatcher; // used to look up an algorithm for a child pair that has none cached
+	const btDispatcherInfo& m_dispatchInfo; // passed through unchanged to the child algorithm
+	btManifoldResult*	m_resultOut; // shared result object; its body wrappers are swapped temporarily in Process()
+	class btHashedSimplePairCache*	m_childCollisionAlgorithmCache; // cache keyed by (childIndex0, childIndex1); not owned here
+	btPersistentManifold*	m_sharedManifold; // handed to findAlgorithm when a new child algorithm is created
+	btCompoundCompoundLeafCallback (const btCollisionObjectWrapper* compound1ObjWrap, // NOTE(review): named "1" but initialises m_compound0ColObjWrap (see initializer list)
+									const btCollisionObjectWrapper* compound0ObjWrap, // NOTE(review): named "0" but initialises m_compound1ColObjWrap
+									btDispatcher* dispatcher,
+									const btDispatcherInfo& dispatchInfo,
+									btManifoldResult*	resultOut,
+									btHashedSimplePairCache* childAlgorithmsCache,
+									btPersistentManifold*	sharedManifold)
+		:m_numOverlapPairs(0),m_compound0ColObjWrap(compound1ObjWrap),m_compound1ColObjWrap(compound0ObjWrap),m_dispatcher(dispatcher),m_dispatchInfo(dispatchInfo),m_resultOut(resultOut), // binding is positional: the first argument becomes member 0, the second becomes member 1, despite the swapped parameter names
+		m_childCollisionAlgorithmCache(childAlgorithmsCache),
+		m_sharedManifold(sharedManifold)
+	{
+	}
+	void		Process(const btDbvtNode* leaf0,const btDbvtNode* leaf1) // Handles one candidate leaf pair: leaf0 is looked up in compound 0, leaf1 in compound 1.
+	{
+		m_numOverlapPairs++;
+		int childIndex0 = leaf0->dataAsInt; // the leaf payload is used as the child-shape index
+		int childIndex1 = leaf1->dataAsInt;
+		btAssert(childIndex0>=0);
+		btAssert(childIndex1>=0);
+		const btCompoundShape* compoundShape0 = static_cast<const btCompoundShape*>(m_compound0ColObjWrap->getCollisionShape());
+		btAssert(childIndex0<compoundShape0->getNumChildShapes());
+		const btCompoundShape* compoundShape1 = static_cast<const btCompoundShape*>(m_compound1ColObjWrap->getCollisionShape());
+		btAssert(childIndex1<compoundShape1->getNumChildShapes());
+		const btCollisionShape* childShape0 = compoundShape0->getChildShape(childIndex0);
+		const btCollisionShape* childShape1 = compoundShape1->getChildShape(childIndex1);
+		//copy each compound's world transform and compose it with the child's local transform to get the child's world transform
+		btTransform	orgTrans0 = m_compound0ColObjWrap->getWorldTransform();
+		const btTransform& childTrans0 = compoundShape0->getChildTransform(childIndex0);
+		btTransform	newChildWorldTrans0 = orgTrans0*childTrans0 ;
+		btTransform	orgTrans1 = m_compound1ColObjWrap->getWorldTransform();
+		const btTransform& childTrans1 = compoundShape1->getChildTransform(childIndex1);
+		btTransform	newChildWorldTrans1 = orgTrans1*childTrans1 ;
+		//perform an AABB check first
+		btVector3 aabbMin0,aabbMax0,aabbMin1,aabbMax1;
+		childShape0->getAabb(newChildWorldTrans0,aabbMin0,aabbMax0);
+		childShape1->getAabb(newChildWorldTrans1,aabbMin1,aabbMax1);
+		if (gCompoundCompoundChildShapePairCallback) // optional global filter; note it is consulted only after both AABBs have been computed
+		{
+			if (!gCompoundCompoundChildShapePairCallback(childShape0,childShape1))
+				return; // filter rejected this child pair: no algorithm is looked up or run
+		}
+		if (TestAabbAgainstAabb2(aabbMin0,aabbMax0,aabbMin1,aabbMax1))
+		{
+			btCollisionObjectWrapper compoundWrap0(this->m_compound0ColObjWrap,childShape0, m_compound0ColObjWrap->getCollisionObject(),newChildWorldTrans0,-1,childIndex0); // stack-local wrapper for the child: parent wrapper, child shape, same collision object, child world transform
+			btCollisionObjectWrapper compoundWrap1(this->m_compound1ColObjWrap,childShape1,m_compound1ColObjWrap->getCollisionObject(),newChildWorldTrans1,-1,childIndex1);
+			btSimplePair* pair = m_childCollisionAlgorithmCache->findPair(childIndex0,childIndex1); // reuse an algorithm cached on an earlier call, if any
+			btCollisionAlgorithm* colAlgo = 0;
+			if (pair)
+			{
+				colAlgo = (btCollisionAlgorithm*)pair->m_userPointer;
+			} else
+			{
+				colAlgo = m_dispatcher->findAlgorithm(&compoundWrap0,&compoundWrap1,m_sharedManifold); // nothing cached: obtain an algorithm for these two child shapes
+				pair = m_childCollisionAlgorithmCache->addOverlappingPair(childIndex0,childIndex1);
+				btAssert(pair);
+				pair->m_userPointer = colAlgo; // remember it so later calls find it through findPair
+			}
+			btAssert(colAlgo);
+			const btCollisionObjectWrapper* tmpWrap0 = 0;
+			const btCollisionObjectWrapper* tmpWrap1 = 0;
+			tmpWrap0 = m_resultOut->getBody0Wrap(); // save the result's current wrappers so they can be restored after the child call
+			tmpWrap1 = m_resultOut->getBody1Wrap();
+			m_resultOut->setBody0Wrap(&compoundWrap0); // point the result at the child wrappers for the duration of the child call
+			m_resultOut->setBody1Wrap(&compoundWrap1);
+			m_resultOut->setShapeIdentifiersA(-1,childIndex0); // same (-1, childIndex) values that were given to the wrappers above
+			m_resultOut->setShapeIdentifiersB(-1,childIndex1);
+			colAlgo->processCollision(&compoundWrap0,&compoundWrap1,m_dispatchInfo,m_resultOut);
+			m_resultOut->setBody0Wrap(tmpWrap0); // restore: the child wrappers are locals and go out of scope at the end of this block
+			m_resultOut->setBody1Wrap(tmpWrap1);
+		}
+	}
+static DBVT_INLINE bool		MyIntersect(	const btDbvtAabbMm& a, // Overlap test between volume a and the bounds computed from volume b and xform.
+								  const btDbvtAabbMm& b, const btTransform& xform)
+	btVector3 newmin,newmax;
+	btTransformAabb(b.Mins(),b.Maxs(),0.f,xform,newmin,newmax); // fills newmin/newmax from b's bounds and xform (third argument 0.f is presumably a margin -- confirm)
+	btDbvtAabbMm newb = btDbvtAabbMm::FromMM(newmin,newmax); // rebuild a volume from the new min/max so Intersect can be reused
+	return Intersect(a,newb);
+static inline void		MycollideTT(	const btDbvtNode* root0, // Walks two trees together with an explicit stack and calls callback->Process for every leaf/leaf pair whose volumes pass MyIntersect.
+								  const btDbvtNode* root1,
+								  const btTransform& xform, // applied to the second tree's volumes inside MyIntersect before each overlap test
+								  btCompoundCompoundLeafCallback* callback)
+		if(root0&&root1) // nothing to do unless both trees have a root
+		{
+			int								depth=1; // number of pending node pairs on the stack
+			int								treshold=btDbvt::DOUBLE_STACKSIZE-4; // grow the stack once depth exceeds this; the margin of 4 matches the largest number of pushes per iteration
+			btAlignedObjectArray<btDbvt::sStkNN>	stkStack;
+			stkStack.resize(btDbvt::DOUBLE_STACKSIZE);
+			stkStack[0]=btDbvt::sStkNN(root0,root1);
+			do	{
+				btDbvt::sStkNN	p=stkStack[--depth]; // pop the next pair to examine
+				if(MyIntersect(p.a->volume,p.b->volume,xform)) // pairs whose volumes do not overlap are dropped together with their subtrees
+				{
+					if(depth>treshold)
+					{
+						stkStack.resize(stkStack.size()*2); // double the capacity before pushing more entries
+						treshold=stkStack.size()-4;
+					}
+					if(p.a->isinternal())
+					{
+						if(p.b->isinternal())
+						{					
+							stkStack[depth++]=btDbvt::sStkNN(p.a->childs[0],p.b->childs[0]); // both internal: push all four child combinations
+							stkStack[depth++]=btDbvt::sStkNN(p.a->childs[1],p.b->childs[0]);
+							stkStack[depth++]=btDbvt::sStkNN(p.a->childs[0],p.b->childs[1]);
+							stkStack[depth++]=btDbvt::sStkNN(p.a->childs[1],p.b->childs[1]);
+						}
+						else
+						{
+							stkStack[depth++]=btDbvt::sStkNN(p.a->childs[0],p.b); // only a is internal: descend a, keep leaf b
+							stkStack[depth++]=btDbvt::sStkNN(p.a->childs[1],p.b);
+						}
+					}
+					else
+					{
+						if(p.b->isinternal())
+						{
+							stkStack[depth++]=btDbvt::sStkNN(p.a,p.b->childs[0]); // only b is internal: keep leaf a, descend b
+							stkStack[depth++]=btDbvt::sStkNN(p.a,p.b->childs[1]);
+						}
+						else
+						{
+							callback->Process(p.a,p.b); // two leaves: hand the pair to the callback
+						}
+					}
+				}
+			} while(depth); // stop when the stack is empty
+		}
+void btCompoundCompoundCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut) // Collides two compound shapes child-by-child: refreshes cached manifolds, walks both AABB trees to process overlapping child pairs, then drops cached pairs whose child AABBs no longer overlap.
+	const btCollisionObjectWrapper* col0ObjWrap = body0Wrap;
+	const btCollisionObjectWrapper* col1ObjWrap= body1Wrap;
+	btAssert (col0ObjWrap->getCollisionShape()->isCompound());
+	btAssert (col1ObjWrap->getCollisionShape()->isCompound());
+	const btCompoundShape* compoundShape0 = static_cast<const btCompoundShape*>(col0ObjWrap->getCollisionShape());
+	const btCompoundShape* compoundShape1 = static_cast<const btCompoundShape*>(col1ObjWrap->getCollisionShape());
+	const btDbvt* tree0 = compoundShape0->getDynamicAabbTree();
+	const btDbvt* tree1 = compoundShape1->getDynamicAabbTree();
+	if (!tree0 || !tree1) // the tree walk below needs a tree on both sides
+	{
+		return btCompoundCollisionAlgorithm::processCollision(body0Wrap,body1Wrap,dispatchInfo,resultOut); // otherwise delegate to the base-class implementation
+	}
+	///btCompoundShape might have changed:
+	////make sure the internal child collision algorithm caches are still valid
+	if ((compoundShape0->getUpdateRevision() != m_compoundShapeRevision0) || (compoundShape1->getUpdateRevision() != m_compoundShapeRevision1))
+	{
+		///clear all
+		removeChildAlgorithms();
+		m_compoundShapeRevision0 = compoundShape0->getUpdateRevision();
+		m_compoundShapeRevision1 = compoundShape1->getUpdateRevision();
+	}
+	///we need to refresh all contact manifolds
+	///note that we should actually recursively traverse all children, btCompoundShape can be nested more than 1 level deep
+	///so we should add a 'refreshManifolds' in the btCollisionAlgorithm
+	{
+		int i;
+		btManifoldArray manifoldArray; // scratch list, emptied after each cached pair
+		btSimplePairArray& pairs = m_childCollisionAlgorithmCache->getOverlappingPairArray();
+		for (i=0;i<pairs.size();i++)
+		{
+			if (pairs[i].m_userPointer)
+			{
+				btCollisionAlgorithm* algo = (btCollisionAlgorithm*) pairs[i].m_userPointer;
+				algo->getAllContactManifolds(manifoldArray);
+				for (int m=0;m<manifoldArray.size();m++)
+				{
+					if (manifoldArray[m]->getNumContacts()) // manifolds without contacts are left untouched
+					{
+						resultOut->setPersistentManifold(manifoldArray[m]); // temporarily point resultOut at this manifold
+						resultOut->refreshContactPoints();
+						resultOut->setPersistentManifold(0); // and detach it again afterwards
+					}
+				}
+				manifoldArray.resize(0); // reuse the scratch list for the next pair
+			}
+		}
+	}
+	btCompoundCompoundLeafCallback callback(col0ObjWrap,col1ObjWrap,this->m_dispatcher,dispatchInfo,resultOut,this->m_childCollisionAlgorithmCache,m_sharedManifold);
+	const btTransform	xform=col0ObjWrap->getWorldTransform().inverse()*col1ObjWrap->getWorldTransform(); // body 1's transform expressed relative to body 0
+	MycollideTT(tree0->m_root,tree1->m_root,xform,&callback); // runs callback.Process on every overlapping leaf pair
+	//printf("#compound-compound child/leaf overlap =%d                      \r",callback.m_numOverlapPairs);
+	//remove non-overlapping child pairs
+	{
+		btAssert(m_removePairs.size()==0); // the scratch list is cleared at the end of this block, so it must start empty
+		//iterate over all cached child pairs and recompute both child AABBs in world space
+		btSimplePairArray& pairs = m_childCollisionAlgorithmCache->getOverlappingPairArray();
+		int i;
+		btManifoldArray	manifoldArray; // NOTE(review): declared but not used in this block
+        btVector3 aabbMin0,aabbMax0,aabbMin1,aabbMax1;        
+		for (i=0;i<pairs.size();i++)
+		{
+			if (pairs[i].m_userPointer)
+			{
+				btCollisionAlgorithm* algo = (btCollisionAlgorithm*)pairs[i].m_userPointer;
+				{
+					btTransform	orgTrans0;
+					const btCollisionShape* childShape0 = 0;
+					btTransform	newChildWorldTrans0;
+					btTransform	orgInterpolationTrans0;
+					childShape0 = compoundShape0->getChildShape(pairs[i].m_indexA); // m_indexA is the child index within compound 0
+					orgTrans0 = col0ObjWrap->getWorldTransform();
+					orgInterpolationTrans0 = col0ObjWrap->getWorldTransform(); // NOTE(review): assigned but never read afterwards
+					const btTransform& childTrans0 = compoundShape0->getChildTransform(pairs[i].m_indexA);
+					newChildWorldTrans0 = orgTrans0*childTrans0 ;
+					childShape0->getAabb(newChildWorldTrans0,aabbMin0,aabbMax0);
+				}
+				{
+					btTransform	orgInterpolationTrans1;
+					const btCollisionShape* childShape1 = 0;
+					btTransform	orgTrans1;
+					btTransform	newChildWorldTrans1;
+					childShape1 = compoundShape1->getChildShape(pairs[i].m_indexB); // m_indexB is the child index within compound 1
+					orgTrans1 = col1ObjWrap->getWorldTransform();
+					orgInterpolationTrans1 = col1ObjWrap->getWorldTransform(); // NOTE(review): assigned but never read afterwards
+					const btTransform& childTrans1 = compoundShape1->getChildTransform(pairs[i].m_indexB);
+					newChildWorldTrans1 = orgTrans1*childTrans1 ;
+					childShape1->getAabb(newChildWorldTrans1,aabbMin1,aabbMax1);
+				}
+				if (!TestAabbAgainstAabb2(aabbMin0,aabbMax0,aabbMin1,aabbMax1))
+				{
+					algo->~btCollisionAlgorithm(); // child AABBs no longer overlap: destroy the cached algorithm and return its memory
+					m_dispatcher->freeCollisionAlgorithm(algo);
+					m_removePairs.push_back(btSimplePair(pairs[i].m_indexA,pairs[i].m_indexB)); // only queued here; the cache entry is removed in the loop below, after the scan over pairs
+				}
+			}
+		}
+		for (int i=0;i<m_removePairs.size();i++) // this i shadows the outer i declared above
+		{
+			m_childCollisionAlgorithmCache->removeOverlappingPair(m_removePairs[i].m_indexA,m_removePairs[i].m_indexB);
+		}
+		m_removePairs.clear(); // leave the scratch list empty for the next call
+	}
+btScalar	btCompoundCompoundCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut) // Ignores all arguments: asserts unconditionally and returns 0.
+	btAssert(0); // presumably marks this path as unsupported for compound-compound pairs -- confirm
+	return 0.f;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btCompoundCompoundCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btCompoundCompoundCollisionAlgorithm.h
new file mode 100644
index 00000000..06a762f2
--- /dev/null
+++ b/src/bullet/BulletCollision/CollisionDispatch/btCompoundCompoundCollisionAlgorithm.h
@@ -0,0 +1,89 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "btCompoundCollisionAlgorithm.h"
+#include "BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h"
+#include "BulletCollision/BroadphaseCollision/btDispatcher.h"
+#include "BulletCollision/BroadphaseCollision/btBroadphaseInterface.h"
+#include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
+class btDispatcher;
+#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
+#include "BulletCollision/CollisionDispatch/btCollisionCreateFunc.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "BulletCollision/CollisionDispatch/btHashedSimplePairCache.h"
+class btDispatcher;
+class btCollisionObject;
+class btCollisionShape;
+typedef bool (*btShapePairCallback)(const btCollisionShape* pShape0, const btCollisionShape* pShape1); // predicate over a pair of child shapes; a false return makes the compound-compound leaf callback skip that pair
+extern btShapePairCallback gCompoundCompoundChildShapePairCallback; // global hook of that type, defined (initially 0) in btCompoundCompoundCollisionAlgorithm.cpp
+/// btCompoundCompoundCollisionAlgorithm  supports collision between two btCompoundShape shapes
+class btCompoundCompoundCollisionAlgorithm  : public btCompoundCollisionAlgorithm
+	class btHashedSimplePairCache*	m_childCollisionAlgorithmCache; // one cached child algorithm per (childIndex0, childIndex1) pair, stored in the pair's m_userPointer
+	btSimplePairArray m_removePairs; // scratch list used by processCollision to queue pairs for removal
+	int	m_compoundShapeRevision0;//to keep track of changes, so that childAlgorithm array can be updated
+	int	m_compoundShapeRevision1; // same, for the second compound shape
+	void	removeChildAlgorithms(); // destroys all cached child algorithms and empties the cache
+//	void	preallocateChildAlgorithms(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
+	btCompoundCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
+	virtual ~btCompoundCompoundCollisionAlgorithm();
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	btScalar	calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut); // the .cpp implementation only asserts and returns 0
+	virtual	void	getAllContactManifolds(btManifoldArray&	manifoldArray); // forwards to every cached child algorithm
+	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc // factory that builds the algorithm with isSwapped=false
+	{
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
+		{
+			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btCompoundCompoundCollisionAlgorithm)); // storage comes from the dispatcher
+			return new(mem) btCompoundCompoundCollisionAlgorithm(ci,body0Wrap,body1Wrap,false); // placement-new into that storage
+		}
+	};
+	struct SwappedCreateFunc :public 	btCollisionAlgorithmCreateFunc // same factory, but passes isSwapped=true
+	{
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
+		{
+			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btCompoundCompoundCollisionAlgorithm));
+			return new(mem) btCompoundCompoundCollisionAlgorithm(ci,body0Wrap,body1Wrap,true);
+		}
+	};
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.cpp
index db7f884a..1cb3d2e7 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.cpp
@@ -43,12 +43,10 @@ subject to the following restrictions:
 #include "BulletCollision/NarrowPhaseCollision/btGjkEpa2.h"
 #include "BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 btConvex2dConvex2dAlgorithm::CreateFunc::CreateFunc(btSimplexSolverInterface*			simplexSolver, btConvexPenetrationDepthSolver* pdSolver)
-	m_numPerturbationIterations = 0;
-	m_minimumPointsPerturbationThreshold = 3;
 	m_simplexSolver = simplexSolver;
 	m_pdSolver = pdSolver;
@@ -57,18 +55,16 @@ btConvex2dConvex2dAlgorithm::CreateFunc::~CreateFunc()
-btConvex2dConvex2dAlgorithm::btConvex2dConvex2dAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver,int numPerturbationIterations, int minimumPointsPerturbationThreshold)
-: btActivatingCollisionAlgorithm(ci,body0,body1),
+btConvex2dConvex2dAlgorithm::btConvex2dConvex2dAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver,int /* numPerturbationIterations */, int /* minimumPointsPerturbationThreshold */)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
 m_ownManifold (false),
- m_numPerturbationIterations(numPerturbationIterations),
-	(void)body0;
-	(void)body1;
+	(void)body0Wrap;
+	(void)body1Wrap;
@@ -96,13 +92,13 @@ extern btScalar gContactBreakingThreshold;
 // Convex-Convex collision algorithm
-void btConvex2dConvex2dAlgorithm ::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btConvex2dConvex2dAlgorithm ::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 	if (!m_manifoldPtr)
-		m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 		m_ownManifold = true;
@@ -111,8 +107,8 @@ void btConvex2dConvex2dAlgorithm ::processCollision (btCollisionObject* body0,bt
-	btConvexShape* min0 = static_cast<btConvexShape*>(body0->getCollisionShape());
-	btConvexShape* min1 = static_cast<btConvexShape*>(body1->getCollisionShape());
+	const btConvexShape* min0 = static_cast<const btConvexShape*>(body0Wrap->getCollisionShape());
+	const btConvexShape* min1 = static_cast<const btConvexShape*>(body1Wrap->getCollisionShape());
 	btVector3  normalOnB;
 	btVector3  pointOnBWorld;
@@ -132,9 +128,8 @@ void btConvex2dConvex2dAlgorithm ::processCollision (btCollisionObject* body0,bt
 			input.m_maximumDistanceSquared*= input.m_maximumDistanceSquared;
-		input.m_stackAlloc = dispatchInfo.m_stackAllocator;
-		input.m_transformA = body0->getWorldTransform();
-		input.m_transformB = body1->getWorldTransform();
+		input.m_transformA = body0Wrap->getWorldTransform();
+		input.m_transformB = body1Wrap->getWorldTransform();
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h
index 53d13b87..24d13367 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h
@@ -40,17 +40,14 @@ class btConvex2dConvex2dAlgorithm : public btActivatingCollisionAlgorithm
 	btPersistentManifold*	m_manifoldPtr;
 	bool			m_lowLevelOfDetail;
-	int m_numPerturbationIterations;
-	int m_minimumPointsPerturbationThreshold;
-	btConvex2dConvex2dAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver, int numPerturbationIterations, int minimumPointsPerturbationThreshold);
+	btConvex2dConvex2dAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver, int numPerturbationIterations, int minimumPointsPerturbationThreshold);
 	virtual ~btConvex2dConvex2dAlgorithm();
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -82,10 +79,10 @@ public:
 		virtual ~CreateFunc();
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btConvex2dConvex2dAlgorithm));
-			return new(mem) btConvex2dConvex2dAlgorithm(ci.m_manifold,ci,body0,body1,m_simplexSolver,m_pdSolver,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
+			return new(mem) btConvex2dConvex2dAlgorithm(ci.m_manifold,ci,body0Wrap,body1Wrap,m_simplexSolver,m_pdSolver,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.cpp
index d2b2c221..912a5285 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.cpp
@@ -25,11 +25,12 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "LinearMath/btIDebugDraw.h"
 #include "BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
-btConvexConcaveCollisionAlgorithm::btConvexConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1,bool isSwapped)
-: btActivatingCollisionAlgorithm(ci,body0,body1),
+btConvexConcaveCollisionAlgorithm::btConvexConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
@@ -46,17 +47,17 @@ void	btConvexConcaveCollisionAlgorithm::getAllContactManifolds(btManifoldArray&
-btConvexTriangleCallback::btConvexTriangleCallback(btDispatcher*  dispatcher,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped):
+btConvexTriangleCallback::btConvexTriangleCallback(btDispatcher*  dispatcher,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped):
-	m_convexBody = isSwapped? body1:body0;
-	m_triBody = isSwapped? body0:body1;
+	m_convexBodyWrap = isSwapped? body1Wrap:body0Wrap;
+	m_triBodyWrap = isSwapped? body0Wrap:body1Wrap;
 	  // create the manifold from the dispatcher 'manifold pool'
-	  m_manifoldPtr = m_dispatcher->getNewManifold(m_convexBody,m_triBody);
+	  m_manifoldPtr = m_dispatcher->getNewManifold(m_convexBodyWrap->getCollisionObject(),m_triBodyWrap->getCollisionObject());
@@ -75,26 +76,31 @@ void	btConvexTriangleCallback::clearCache()
-void btConvexTriangleCallback::processTriangle(btVector3* triangle,int partId, int triangleIndex)
+void btConvexTriangleCallback::processTriangle(btVector3* triangle,int
+partId, int triangleIndex)
-	//just for debugging purposes
-	//printf("triangle %d",m_triangleCount++);
+	if (!TestTriangleAgainstAabb2(triangle, m_aabbMin, m_aabbMax))
+	{
+		return;
+	}
+        //just for debugging purposes
+        //printf("triangle %d",m_triangleCount++);
-	//aabb filter is already applied!	
 	btCollisionAlgorithmConstructionInfo ci;
 	ci.m_dispatcher1 = m_dispatcher;
-	btCollisionObject* ob = static_cast<btCollisionObject*>(m_triBody);
 #if 0	
 	///debug drawing of the overlapping triangles
 	if (m_dispatchInfoPtr && m_dispatchInfoPtr->m_debugDraw && (m_dispatchInfoPtr->m_debugDraw->getDebugMode() &btIDebugDraw::DBG_DrawWireframe ))
+		const btCollisionObject* ob = const_cast<btCollisionObject*>(m_triBodyWrap->getCollisionObject());
 		btVector3 color(1,1,0);
 		btTransform& tr = ob->getWorldTransform();
@@ -103,46 +109,63 @@ void btConvexTriangleCallback::processTriangle(btVector3* triangle,int partId, i
-	if (m_convexBody->getCollisionShape()->isConvex())
+	if (m_convexBodyWrap->getCollisionShape()->isConvex())
 		btTriangleShape tm(triangle[0],triangle[1],triangle[2]);	
-		btCollisionShape* tmpShape = ob->getCollisionShape();
-		ob->internalSetTemporaryCollisionShape( &tm );
+		btCollisionObjectWrapper triObWrap(m_triBodyWrap,&tm,m_triBodyWrap->getCollisionObject(),m_triBodyWrap->getWorldTransform(),partId,triangleIndex);//correct transform?
+		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(m_convexBodyWrap,&triObWrap,m_manifoldPtr);
-		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(m_convexBody,m_triBody,m_manifoldPtr);
+		const btCollisionObjectWrapper* tmpWrap = 0;
-		if (m_resultOut->getBody0Internal() == m_triBody)
+		if (m_resultOut->getBody0Internal() == m_triBodyWrap->getCollisionObject())
+			tmpWrap = m_resultOut->getBody0Wrap();
+			m_resultOut->setBody0Wrap(&triObWrap);
+			tmpWrap = m_resultOut->getBody1Wrap();
+			m_resultOut->setBody1Wrap(&triObWrap);
-		colAlgo->processCollision(m_convexBody,m_triBody,*m_dispatchInfoPtr,m_resultOut);
+		colAlgo->processCollision(m_convexBodyWrap,&triObWrap,*m_dispatchInfoPtr,m_resultOut);
+		if (m_resultOut->getBody0Internal() == m_triBodyWrap->getCollisionObject())
+		{
+			m_resultOut->setBody0Wrap(tmpWrap);
+		} else
+		{
+			m_resultOut->setBody1Wrap(tmpWrap);
+		}
-		ob->internalSetTemporaryCollisionShape( tmpShape);
-void	btConvexTriangleCallback::setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void	btConvexTriangleCallback::setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,const btCollisionObjectWrapper* convexBodyWrap, const btCollisionObjectWrapper* triBodyWrap, btManifoldResult* resultOut)
+	m_convexBodyWrap = convexBodyWrap;
+	m_triBodyWrap = triBodyWrap;
 	m_dispatchInfoPtr = &dispatchInfo;
 	m_collisionMarginTriangle = collisionMarginTriangle;
 	m_resultOut = resultOut;
 	//recalc aabbs
 	btTransform convexInTriangleSpace;
-	convexInTriangleSpace = m_triBody->getWorldTransform().inverse() * m_convexBody->getWorldTransform();
-	btCollisionShape* convexShape = static_cast<btCollisionShape*>(m_convexBody->getCollisionShape());
+	convexInTriangleSpace = m_triBodyWrap->getWorldTransform().inverse() * m_convexBodyWrap->getWorldTransform();
+	const btCollisionShape* convexShape = static_cast<const btCollisionShape*>(m_convexBodyWrap->getCollisionShape());
 	//CollisionShape* triangleShape = static_cast<btCollisionShape*>(triBody->m_collisionShape);
 	btScalar extraMargin = collisionMarginTriangle;
@@ -159,35 +182,34 @@ void btConvexConcaveCollisionAlgorithm::clearCache()
-void btConvexConcaveCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btConvexConcaveCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
-	btCollisionObject* convexBody = m_isSwapped ? body1 : body0;
-	btCollisionObject* triBody = m_isSwapped ? body0 : body1;
+	const btCollisionObjectWrapper* convexBodyWrap = m_isSwapped ? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* triBodyWrap = m_isSwapped ? body0Wrap : body1Wrap;
-	if (triBody->getCollisionShape()->isConcave())
+	if (triBodyWrap->getCollisionShape()->isConcave())
-		btCollisionObject*	triOb = triBody;
-		btConcaveShape* concaveShape = static_cast<btConcaveShape*>( triOb->getCollisionShape());
-		if (convexBody->getCollisionShape()->isConvex())
+		const btConcaveShape* concaveShape = static_cast<const btConcaveShape*>( triBodyWrap->getCollisionShape());
+		if (convexBodyWrap->getCollisionShape()->isConvex())
 			btScalar collisionMarginTriangle = concaveShape->getMargin();
-			m_btConvexTriangleCallback.setTimeStepAndCounters(collisionMarginTriangle,dispatchInfo,resultOut);
+			m_btConvexTriangleCallback.setTimeStepAndCounters(collisionMarginTriangle,dispatchInfo,convexBodyWrap,triBodyWrap,resultOut);
-			//Disable persistency. previously, some older algorithm calculated all contacts in one go, so you can clear it here.
-			//m_dispatcher->clearManifold(m_btConvexTriangleCallback.m_manifoldPtr);
-			m_btConvexTriangleCallback.m_manifoldPtr->setBodies(convexBody,triBody);
+			m_btConvexTriangleCallback.m_manifoldPtr->setBodies(convexBodyWrap->getCollisionObject(),triBodyWrap->getCollisionObject());
 			concaveShape->processAllTriangles( &m_btConvexTriangleCallback,m_btConvexTriangleCallback.getAabbMin(),m_btConvexTriangleCallback.getAabbMax());
+			m_btConvexTriangleCallback.clearWrapperData();
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h
index f718d1de..93d842ef 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h
@@ -26,14 +26,16 @@ class btDispatcher;
 #include "btCollisionCreateFunc.h"
 ///For each triangle in the concave mesh that overlaps with the AABB of a convex (m_convexProxy), processTriangle is called.
-class btConvexTriangleCallback : public btTriangleCallback
+ATTRIBUTE_ALIGNED16(class)  btConvexTriangleCallback : public btTriangleCallback
-	btCollisionObject* m_convexBody;
-	btCollisionObject* m_triBody;
 	btVector3	m_aabbMin;
 	btVector3	m_aabbMax ;
+	const btCollisionObjectWrapper* m_convexBodyWrap;
+	const btCollisionObjectWrapper* m_triBodyWrap;
 	btManifoldResult* m_resultOut;
 	btDispatcher*	m_dispatcher;
@@ -41,14 +43,21 @@ class btConvexTriangleCallback : public btTriangleCallback
 	btScalar m_collisionMarginTriangle;
 int	m_triangleCount;
 	btPersistentManifold*	m_manifoldPtr;
-	btConvexTriangleCallback(btDispatcher* dispatcher,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped);
+	btConvexTriangleCallback(btDispatcher* dispatcher,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
-	void	setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	void	setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,const btCollisionObjectWrapper* convexBodyWrap, const btCollisionObjectWrapper* triBodyWrap, btManifoldResult* resultOut);
+	void	clearWrapperData()
+	{
+		m_convexBodyWrap = 0;
+		m_triBodyWrap = 0;
+	}
 	virtual ~btConvexTriangleCallback();
 	virtual void processTriangle(btVector3* triangle, int partId, int triangleIndex);
@@ -70,22 +79,24 @@ int	m_triangleCount;
 /// btConvexConcaveCollisionAlgorithm  supports collision between convex shapes and (concave) trianges meshes.
-class btConvexConcaveCollisionAlgorithm  : public btActivatingCollisionAlgorithm
+ATTRIBUTE_ALIGNED16(class)  btConvexConcaveCollisionAlgorithm  : public btActivatingCollisionAlgorithm
-	bool	m_isSwapped;
 	btConvexTriangleCallback m_btConvexTriangleCallback;
+	bool	m_isSwapped;
-	btConvexConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped);
+	btConvexConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
 	virtual ~btConvexConcaveCollisionAlgorithm();
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	btScalar	calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -95,19 +106,19 @@ public:
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btConvexConcaveCollisionAlgorithm));
-			return new(mem) btConvexConcaveCollisionAlgorithm(ci,body0,body1,false);
+			return new(mem) btConvexConcaveCollisionAlgorithm(ci,body0Wrap,body1Wrap,false);
 	struct SwappedCreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btConvexConcaveCollisionAlgorithm));
-			return new(mem) btConvexConcaveCollisionAlgorithm(ci,body0,body1,true);
+			return new(mem) btConvexConcaveCollisionAlgorithm(ci,body0Wrap,body1Wrap,true);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.cpp
index dd1f3e24..b9cc0418 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.cpp
@@ -52,7 +52,7 @@ subject to the following restrictions:
 #include "BulletCollision/NarrowPhaseCollision/btGjkEpa2.h"
 #include "BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h"
 #include "BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
@@ -191,8 +191,8 @@ btConvexConvexAlgorithm::CreateFunc::~CreateFunc()
-btConvexConvexAlgorithm::btConvexConvexAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver,int numPerturbationIterations, int minimumPointsPerturbationThreshold)
-: btActivatingCollisionAlgorithm(ci,body0,body1),
+btConvexConvexAlgorithm::btConvexConvexAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver,int numPerturbationIterations, int minimumPointsPerturbationThreshold)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
 m_ownManifold (false),
@@ -205,8 +205,8 @@ m_sepDistance((static_cast<btConvexShape*>(body0->getCollisionShape()))->getAngu
-	(void)body0;
-	(void)body1;
+	(void)body0Wrap;
+	(void)body1Wrap;
@@ -289,13 +289,13 @@ extern btScalar gContactBreakingThreshold;
 // Convex-Convex collision algorithm
-void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btConvexConvexAlgorithm ::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 	if (!m_manifoldPtr)
-		m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 		m_ownManifold = true;
@@ -304,8 +304,8 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
-	btConvexShape* min0 = static_cast<btConvexShape*>(body0->getCollisionShape());
-	btConvexShape* min1 = static_cast<btConvexShape*>(body1->getCollisionShape());
+	const btConvexShape* min0 = static_cast<const btConvexShape*>(body0Wrap->getCollisionShape());
+	const btConvexShape* min1 = static_cast<const btConvexShape*>(body1Wrap->getCollisionShape());
 	btVector3  normalOnB;
 		btVector3  pointOnBWorld;
@@ -314,14 +314,14 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 		btCapsuleShape* capsuleA = (btCapsuleShape*) min0;
 		btCapsuleShape* capsuleB = (btCapsuleShape*) min1;
-		btVector3 localScalingA = capsuleA->getLocalScaling();
-		btVector3 localScalingB = capsuleB->getLocalScaling();
+	//	btVector3 localScalingA = capsuleA->getLocalScaling();
+	//	btVector3 localScalingB = capsuleB->getLocalScaling();
 		btScalar threshold = m_manifoldPtr->getContactBreakingThreshold();
 		btScalar dist = capsuleCapsuleDistance(normalOnB,	pointOnBWorld,capsuleA->getHalfHeight(),capsuleA->getRadius(),
-			body0->getWorldTransform(),body1->getWorldTransform(),threshold);
+			body0Wrap->getWorldTransform(),body1Wrap->getWorldTransform(),threshold);
 		if (dist<threshold)
@@ -373,9 +373,8 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 		input.m_maximumDistanceSquared*= input.m_maximumDistanceSquared;
-	input.m_stackAlloc = dispatchInfo.m_stackAllocator;
-	input.m_transformA = body0->getWorldTransform();
-	input.m_transformB = body1->getWorldTransform();
+	input.m_transformA = body0Wrap->getWorldTransform();
+	input.m_transformB = body1Wrap->getWorldTransform();
@@ -407,9 +406,51 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
+		struct btWithoutMarginResult : public btDiscreteCollisionDetectorInterface::Result
+		{
+			btDiscreteCollisionDetectorInterface::Result* m_originalResult;
+			btVector3	m_reportedNormalOnWorld;
+			btScalar m_marginOnA;
+			btScalar m_marginOnB;
+			btScalar	m_reportedDistance;
+			bool		m_foundResult;
+			btWithoutMarginResult(btDiscreteCollisionDetectorInterface::Result* result, btScalar marginOnA, btScalar marginOnB)
+			:m_originalResult(result),
+			m_marginOnA(marginOnA),
+			m_marginOnB(marginOnB),
+			m_foundResult(false)
+			{
+			}
+			virtual void setShapeIdentifiersA(int partId0,int index0){}
+			virtual void setShapeIdentifiersB(int partId1,int index1){}
+			virtual void addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorldOrg,btScalar depthOrg) 
+			{
+				m_reportedDistance = depthOrg;
+				m_reportedNormalOnWorld = normalOnBInWorld;
+				btVector3 adjustedPointB = pointInWorldOrg - normalOnBInWorld*m_marginOnB;
+				m_reportedDistance = depthOrg+(m_marginOnA+m_marginOnB);
+				if (m_reportedDistance<0.f)
+				{
+					m_foundResult = true;					
+				}
+				m_originalResult->addContactPoint(normalOnBInWorld,adjustedPointB,m_reportedDistance);
+			}
+		};
 		btDummyResult dummy;
+///btBoxShape is an exception: its vertices are created WITH margin so don't subtract it
+		btScalar min0Margin = min0->getShapeType()==BOX_SHAPE_PROXYTYPE? 0.f : min0->getMargin();
+		btScalar min1Margin = min1->getShapeType()==BOX_SHAPE_PROXYTYPE? 0.f : min1->getMargin();
+		btWithoutMarginResult	withoutMargin(resultOut, min0Margin,min1Margin);
 		btPolyhedralConvexShape* polyhedronA = (btPolyhedralConvexShape*) min0;
 		btPolyhedralConvexShape* polyhedronB = (btPolyhedralConvexShape*) min1;
@@ -429,39 +470,44 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 				foundSepAxis = btPolyhedralContactClipping::findSeparatingAxis(
 					*polyhedronA->getConvexPolyhedron(), *polyhedronB->getConvexPolyhedron(),
-					body0->getWorldTransform(), 
-					body1->getWorldTransform(),
-					sepNormalWorldSpace);
+					body0Wrap->getWorldTransform(), 
+					body1Wrap->getWorldTransform(),
+					sepNormalWorldSpace,*resultOut);
 			} else
-				//gjkPairDetector.getClosestPoints(input,*resultOut,dispatchInfo.m_debugDraw);
-				gjkPairDetector.getClosestPoints(input,dummy,dispatchInfo.m_debugDraw);
+				gjkPairDetector.getClosestPoints(input,withoutMargin,dispatchInfo.m_debugDraw);
+				//gjkPairDetector.getClosestPoints(input,dummy,dispatchInfo.m_debugDraw);
 #endif //ZERO_MARGIN
-				btScalar l2 = gjkPairDetector.getCachedSeparatingAxis().length2();
-				if (l2>SIMD_EPSILON)
+				//btScalar l2 = gjkPairDetector.getCachedSeparatingAxis().length2();
+				//if (l2>SIMD_EPSILON)
-					sepNormalWorldSpace = gjkPairDetector.getCachedSeparatingAxis()*(1.f/l2);
+					sepNormalWorldSpace = withoutMargin.m_reportedNormalOnWorld;//gjkPairDetector.getCachedSeparatingAxis()*(1.f/l2);
 					//minDist = -1e30f;//gjkPairDetector.getCachedSeparatingDistance();
-					minDist = gjkPairDetector.getCachedSeparatingDistance()-min0->getMargin()-min1->getMargin();
+					minDist = withoutMargin.m_reportedDistance;//gjkPairDetector.getCachedSeparatingDistance()+min0->getMargin()+min1->getMargin();
 					foundSepAxis = true;//gjkPairDetector.getCachedSeparatingDistance()<0.f;
-					foundSepAxis = gjkPairDetector.getCachedSeparatingDistance()<(min0->getMargin()+min1->getMargin());
+					foundSepAxis = withoutMargin.m_foundResult && minDist<0;//-(min0->getMargin()+min1->getMargin());
 			if (foundSepAxis)
 //				printf("sepNormalWorldSpace=%f,%f,%f\n",sepNormalWorldSpace.getX(),sepNormalWorldSpace.getY(),sepNormalWorldSpace.getZ());
+				worldVertsB1.resize(0);
 				btPolyhedralContactClipping::clipHullAgainstHull(sepNormalWorldSpace, *polyhedronA->getConvexPolyhedron(), *polyhedronB->getConvexPolyhedron(),
-					body0->getWorldTransform(), 
-					body1->getWorldTransform(), minDist-threshold, threshold, *resultOut);
+					body0Wrap->getWorldTransform(), 
+																 body1Wrap->getWorldTransform(), minDist-threshold, threshold, worldVertsB1,worldVertsB2,
+																 *resultOut);
 			if (m_ownManifold)
@@ -478,9 +524,9 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 				btVertexArray vertices;
 				btTriangleShape* tri = (btTriangleShape*)polyhedronB;
-				vertices.push_back(	body1->getWorldTransform()*tri->m_vertices1[0]);
-				vertices.push_back(	body1->getWorldTransform()*tri->m_vertices1[1]);
-				vertices.push_back(	body1->getWorldTransform()*tri->m_vertices1[2]);
+				vertices.push_back(	body1Wrap->getWorldTransform()*tri->m_vertices1[0]);
+				vertices.push_back(	body1Wrap->getWorldTransform()*tri->m_vertices1[1]);
+				vertices.push_back(	body1Wrap->getWorldTransform()*tri->m_vertices1[2]);
@@ -496,9 +542,9 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 					 foundSepAxis = btPolyhedralContactClipping::findSeparatingAxis(
 					*polyhedronA->getConvexPolyhedron(), *polyhedronB->getConvexPolyhedron(),
-					body0->getWorldTransform(), 
-					body1->getWorldTransform(),
-					sepNormalWorldSpace);
+					body0Wrap->getWorldTransform(), 
+					body1Wrap->getWorldTransform(),
+					sepNormalWorldSpace,*resultOut);
 				//	 printf("sepNormalWorldSpace=%f,%f,%f\n",sepNormalWorldSpace.getX(),sepNormalWorldSpace.getY(),sepNormalWorldSpace.getZ());
 				} else
@@ -524,8 +570,9 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 			if (foundSepAxis)
+				worldVertsB2.resize(0);
 				btPolyhedralContactClipping::clipFaceAgainstHull(sepNormalWorldSpace, *polyhedronA->getConvexPolyhedron(), 
-					body0->getWorldTransform(), vertices, minDist-threshold, maxDist, *resultOut);
+					body0Wrap->getWorldTransform(), vertices, worldVertsB2,minDist-threshold, maxDist, *resultOut);
@@ -599,15 +646,15 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 				if (perturbeA)
-					input.m_transformA.setBasis(  btMatrix3x3(rotq.inverse()*perturbeRot*rotq)*body0->getWorldTransform().getBasis());
-					input.m_transformB = body1->getWorldTransform();
+					input.m_transformA.setBasis(  btMatrix3x3(rotq.inverse()*perturbeRot*rotq)*body0Wrap->getWorldTransform().getBasis());
+					input.m_transformB = body1Wrap->getWorldTransform();
 				} else
-					input.m_transformA = body0->getWorldTransform();
-					input.m_transformB.setBasis( btMatrix3x3(rotq.inverse()*perturbeRot*rotq)*body1->getWorldTransform().getBasis());
+					input.m_transformA = body0Wrap->getWorldTransform();
+					input.m_transformB.setBasis( btMatrix3x3(rotq.inverse()*perturbeRot*rotq)*body1Wrap->getWorldTransform().getBasis());
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h
index 4380b80e..d0ff3b3c 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h
@@ -24,6 +24,7 @@ subject to the following restrictions:
 #include "btCollisionCreateFunc.h"
 #include "btCollisionDispatcher.h"
 #include "LinearMath/btTransformUtil.h" //for btConvexSeparatingDistanceUtil
+#include "BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h"
 class btConvexPenetrationDepthSolver;
@@ -45,6 +46,8 @@ class btConvexConvexAlgorithm : public btActivatingCollisionAlgorithm
 	btSimplexSolverInterface*		m_simplexSolver;
 	btConvexPenetrationDepthSolver* m_pdSolver;
+	btVertexArray worldVertsB1;
+	btVertexArray worldVertsB2;
 	bool	m_ownManifold;
 	btPersistentManifold*	m_manifoldPtr;
@@ -59,12 +62,11 @@ class btConvexConvexAlgorithm : public btActivatingCollisionAlgorithm
-	btConvexConvexAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver, int numPerturbationIterations, int minimumPointsPerturbationThreshold);
+	btConvexConvexAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver, int numPerturbationIterations, int minimumPointsPerturbationThreshold);
 	virtual ~btConvexConvexAlgorithm();
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -96,10 +98,10 @@ public:
 		virtual ~CreateFunc();
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btConvexConvexAlgorithm));
-			return new(mem) btConvexConvexAlgorithm(ci.m_manifold,ci,body0,body1,m_simplexSolver,m_pdSolver,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
+			return new(mem) btConvexConvexAlgorithm(ci.m_manifold,ci,body0Wrap,body1Wrap,m_simplexSolver,m_pdSolver,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.cpp
index b2e9bfaf..cce2d95b 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.cpp
@@ -19,10 +19,11 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/CollisionShapes/btConvexShape.h"
 #include "BulletCollision/CollisionShapes/btStaticPlaneShape.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 //#include <stdio.h>
-btConvexPlaneCollisionAlgorithm::btConvexPlaneCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped, int numPerturbationIterations,int minimumPointsPerturbationThreshold)
+btConvexPlaneCollisionAlgorithm::btConvexPlaneCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap, bool isSwapped, int numPerturbationIterations,int minimumPointsPerturbationThreshold)
 : btCollisionAlgorithm(ci),
@@ -30,12 +31,12 @@ m_isSwapped(isSwapped),
-	btCollisionObject* convexObj = m_isSwapped? col1 : col0;
-	btCollisionObject* planeObj = m_isSwapped? col0 : col1;
+	const btCollisionObjectWrapper* convexObjWrap = m_isSwapped? col1Wrap : col0Wrap;
+	const btCollisionObjectWrapper* planeObjWrap = m_isSwapped? col0Wrap : col1Wrap;
-	if (!m_manifoldPtr && m_dispatcher->needsCollision(convexObj,planeObj))
+	if (!m_manifoldPtr && m_dispatcher->needsCollision(convexObjWrap->getCollisionObject(),planeObjWrap->getCollisionObject()))
-		m_manifoldPtr = m_dispatcher->getNewManifold(convexObj,planeObj);
+		m_manifoldPtr = m_dispatcher->getNewManifold(convexObjWrap->getCollisionObject(),planeObjWrap->getCollisionObject());
 		m_ownManifold = true;
@@ -50,25 +51,25 @@ btConvexPlaneCollisionAlgorithm::~btConvexPlaneCollisionAlgorithm()
-void btConvexPlaneCollisionAlgorithm::collideSingleContact (const btQuaternion& perturbeRot, btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btConvexPlaneCollisionAlgorithm::collideSingleContact (const btQuaternion& perturbeRot, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
-    btCollisionObject* convexObj = m_isSwapped? body1 : body0;
-	btCollisionObject* planeObj = m_isSwapped? body0: body1;
+    const btCollisionObjectWrapper* convexObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* planeObjWrap = m_isSwapped? body0Wrap: body1Wrap;
-	btConvexShape* convexShape = (btConvexShape*) convexObj->getCollisionShape();
-	btStaticPlaneShape* planeShape = (btStaticPlaneShape*) planeObj->getCollisionShape();
+	btConvexShape* convexShape = (btConvexShape*) convexObjWrap->getCollisionShape();
+	btStaticPlaneShape* planeShape = (btStaticPlaneShape*) planeObjWrap->getCollisionShape();
     bool hasCollision = false;
 	const btVector3& planeNormal = planeShape->getPlaneNormal();
 	const btScalar& planeConstant = planeShape->getPlaneConstant();
-	btTransform convexWorldTransform = convexObj->getWorldTransform();
+	btTransform convexWorldTransform = convexObjWrap->getWorldTransform();
 	btTransform convexInPlaneTrans;
-	convexInPlaneTrans= planeObj->getWorldTransform().inverse() * convexWorldTransform;
+	convexInPlaneTrans= planeObjWrap->getWorldTransform().inverse() * convexWorldTransform;
 	//now perturbe the convex-world transform
 	btTransform planeInConvex;
-	planeInConvex= convexWorldTransform.inverse() * planeObj->getWorldTransform();
+	planeInConvex= convexWorldTransform.inverse() * planeObjWrap->getWorldTransform();
 	btVector3 vtx = convexShape->localGetSupportingVertex(planeInConvex.getBasis()*-planeNormal);
@@ -76,53 +77,53 @@ void btConvexPlaneCollisionAlgorithm::collideSingleContact (const btQuaternion&
 	btScalar distance = (planeNormal.dot(vtxInPlane) - planeConstant);
 	btVector3 vtxInPlaneProjected = vtxInPlane - distance*planeNormal;
-	btVector3 vtxInPlaneWorld = planeObj->getWorldTransform() * vtxInPlaneProjected;
+	btVector3 vtxInPlaneWorld = planeObjWrap->getWorldTransform() * vtxInPlaneProjected;
 	hasCollision = distance < m_manifoldPtr->getContactBreakingThreshold();
 	if (hasCollision)
 		/// report a contact. internally this will be kept persistent, and contact reduction is done
-		btVector3 normalOnSurfaceB = planeObj->getWorldTransform().getBasis() * planeNormal;
+		btVector3 normalOnSurfaceB = planeObjWrap->getWorldTransform().getBasis() * planeNormal;
 		btVector3 pOnB = vtxInPlaneWorld;
-void btConvexPlaneCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btConvexPlaneCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 	if (!m_manifoldPtr)
-	btCollisionObject* convexObj = m_isSwapped? body1 : body0;
-	btCollisionObject* planeObj = m_isSwapped? body0: body1;
+	const btCollisionObjectWrapper* convexObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* planeObjWrap = m_isSwapped? body0Wrap: body1Wrap;
-	btConvexShape* convexShape = (btConvexShape*) convexObj->getCollisionShape();
-	btStaticPlaneShape* planeShape = (btStaticPlaneShape*) planeObj->getCollisionShape();
+	btConvexShape* convexShape = (btConvexShape*) convexObjWrap->getCollisionShape();
+	btStaticPlaneShape* planeShape = (btStaticPlaneShape*) planeObjWrap->getCollisionShape();
 	bool hasCollision = false;
 	const btVector3& planeNormal = planeShape->getPlaneNormal();
 	const btScalar& planeConstant = planeShape->getPlaneConstant();
 	btTransform planeInConvex;
-	planeInConvex= convexObj->getWorldTransform().inverse() * planeObj->getWorldTransform();
+	planeInConvex= convexObjWrap->getWorldTransform().inverse() * planeObjWrap->getWorldTransform();
 	btTransform convexInPlaneTrans;
-	convexInPlaneTrans= planeObj->getWorldTransform().inverse() * convexObj->getWorldTransform();
+	convexInPlaneTrans= planeObjWrap->getWorldTransform().inverse() * convexObjWrap->getWorldTransform();
 	btVector3 vtx = convexShape->localGetSupportingVertex(planeInConvex.getBasis()*-planeNormal);
 	btVector3 vtxInPlane = convexInPlaneTrans(vtx);
 	btScalar distance = (planeNormal.dot(vtxInPlane) - planeConstant);
 	btVector3 vtxInPlaneProjected = vtxInPlane - distance*planeNormal;
-	btVector3 vtxInPlaneWorld = planeObj->getWorldTransform() * vtxInPlaneProjected;
+	btVector3 vtxInPlaneWorld = planeObjWrap->getWorldTransform() * vtxInPlaneProjected;
 	hasCollision = distance < m_manifoldPtr->getContactBreakingThreshold();
 	if (hasCollision)
 		/// report a contact. internally this will be kept persistent, and contact reduction is done
-		btVector3 normalOnSurfaceB = planeObj->getWorldTransform().getBasis() * planeNormal;
+		btVector3 normalOnSurfaceB = planeObjWrap->getWorldTransform().getBasis() * planeNormal;
 		btVector3 pOnB = vtxInPlaneWorld;
@@ -148,7 +149,7 @@ void btConvexPlaneCollisionAlgorithm::processCollision (btCollisionObject* body0
 			btScalar iterationAngle = i*(SIMD_2_PI/btScalar(m_numPerturbationIterations));
 			btQuaternion rotq(planeNormal,iterationAngle);
-			collideSingleContact(rotq.inverse()*perturbeRot*rotq,body0,body1,dispatchInfo,resultOut);
+			collideSingleContact(rotq.inverse()*perturbeRot*rotq,body0Wrap,body1Wrap,dispatchInfo,resultOut);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h
index b9494f5a..d28c430c 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h
@@ -36,13 +36,13 @@ class btConvexPlaneCollisionAlgorithm : public btCollisionAlgorithm
-	btConvexPlaneCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped, int numPerturbationIterations,int minimumPointsPerturbationThreshold);
+	btConvexPlaneCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap, bool isSwapped, int numPerturbationIterations,int minimumPointsPerturbationThreshold);
 	virtual ~btConvexPlaneCollisionAlgorithm();
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
-	void collideSingleContact (const btQuaternion& perturbeRot, btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	void collideSingleContact (const btQuaternion& perturbeRot, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -65,15 +65,15 @@ public:
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btConvexPlaneCollisionAlgorithm));
 			if (!m_swapped)
-				return new(mem) btConvexPlaneCollisionAlgorithm(0,ci,body0,body1,false,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
+				return new(mem) btConvexPlaneCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,false,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
 			} else
-				return new(mem) btConvexPlaneCollisionAlgorithm(0,ci,body0,body1,true,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
+				return new(mem) btConvexPlaneCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,true,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.cpp b/src/bullet/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.cpp
index 7faee6fa..9a2e3394 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.cpp
@@ -19,6 +19,8 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h"
 #include "BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h"
 #include "BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h"
+#include "BulletCollision/CollisionDispatch/btCompoundCompoundCollisionAlgorithm.h"
 #include "BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h"
 #include "BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h"
 #include "BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h"
@@ -32,7 +34,6 @@ subject to the following restrictions:
-#include "LinearMath/btStackAlloc.h"
 #include "LinearMath/btPoolAllocator.h"
@@ -65,6 +66,10 @@ btDefaultCollisionConfiguration::btDefaultCollisionConfiguration(const btDefault
 	m_swappedConvexConcaveCreateFunc = new (mem)btConvexConcaveCollisionAlgorithm::SwappedCreateFunc;
 	mem = btAlignedAlloc(sizeof(btCompoundCollisionAlgorithm::CreateFunc),16);
 	m_compoundCreateFunc = new (mem)btCompoundCollisionAlgorithm::CreateFunc;
+	mem = btAlignedAlloc(sizeof(btCompoundCompoundCollisionAlgorithm::CreateFunc),16);
+	m_compoundCompoundCreateFunc = new (mem)btCompoundCompoundCollisionAlgorithm::CreateFunc;
 	mem = btAlignedAlloc(sizeof(btCompoundCollisionAlgorithm::SwappedCreateFunc),16);
 	m_swappedCompoundCreateFunc = new (mem)btCompoundCollisionAlgorithm::SwappedCreateFunc;
 	mem = btAlignedAlloc(sizeof(btEmptyAlgorithm::CreateFunc),16);
@@ -100,22 +105,12 @@ btDefaultCollisionConfiguration::btDefaultCollisionConfiguration(const btDefault
 	int maxSize = sizeof(btConvexConvexAlgorithm);
 	int maxSize2 = sizeof(btConvexConcaveCollisionAlgorithm);
 	int maxSize3 = sizeof(btCompoundCollisionAlgorithm);
-	int sl = sizeof(btConvexSeparatingDistanceUtil);
-	sl = sizeof(btGjkPairDetector);
+	int maxSize4 = sizeof(btCompoundCompoundCollisionAlgorithm);
 	int	collisionAlgorithmMaxElementSize = btMax(maxSize,constructionInfo.m_customCollisionAlgorithmMaxElementSize);
 	collisionAlgorithmMaxElementSize = btMax(collisionAlgorithmMaxElementSize,maxSize2);
 	collisionAlgorithmMaxElementSize = btMax(collisionAlgorithmMaxElementSize,maxSize3);
-	if (constructionInfo.m_stackAlloc)
-	{
-		m_ownsStackAllocator = false;
-		this->m_stackAlloc = constructionInfo.m_stackAlloc;
-	} else
-	{
-		m_ownsStackAllocator = true;
-		void* mem = btAlignedAlloc(sizeof(btStackAlloc),16);
-		m_stackAlloc = new(mem)btStackAlloc(constructionInfo.m_defaultStackAllocatorSize);
-	}
+	collisionAlgorithmMaxElementSize = btMax(collisionAlgorithmMaxElementSize,maxSize4);
 	if (constructionInfo.m_persistentManifoldPool)
@@ -128,6 +123,7 @@ btDefaultCollisionConfiguration::btDefaultCollisionConfiguration(const btDefault
 		m_persistentManifoldPool = new (mem) btPoolAllocator(sizeof(btPersistentManifold),constructionInfo.m_defaultMaxPersistentManifoldPoolSize);
+	collisionAlgorithmMaxElementSize = (collisionAlgorithmMaxElementSize+16)&0xffffffffffff0;
 	if (constructionInfo.m_collisionAlgorithmPool)
 		m_ownsCollisionAlgorithmPool = false;
@@ -144,12 +140,6 @@ btDefaultCollisionConfiguration::btDefaultCollisionConfiguration(const btDefault
-	if (m_ownsStackAllocator)
-	{
-		m_stackAlloc->destroy();
-		m_stackAlloc->~btStackAlloc();
-		btAlignedFree(m_stackAlloc);
-	}
 	if (m_ownsCollisionAlgorithmPool)
@@ -172,6 +162,9 @@ btDefaultCollisionConfiguration::~btDefaultCollisionConfiguration()
 	btAlignedFree( m_compoundCreateFunc);
+	m_compoundCompoundCreateFunc->~btCollisionAlgorithmCreateFunc();
+	btAlignedFree(m_compoundCompoundCreateFunc);
 	btAlignedFree( m_swappedCompoundCreateFunc);
@@ -275,6 +268,12 @@ btCollisionAlgorithmCreateFunc* btDefaultCollisionConfiguration::getCollisionAlg
 		return m_swappedConvexConcaveCreateFunc;
+	if (btBroadphaseProxy::isCompound(proxyType0) && btBroadphaseProxy::isCompound(proxyType1))
+	{
+		return m_compoundCompoundCreateFunc;
+	}
 	if (btBroadphaseProxy::isCompound(proxyType0))
 		return m_compoundCreateFunc;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.h b/src/bullet/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.h
index 81ed424a..2078420e 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.h
@@ -22,23 +22,19 @@ class btConvexPenetrationDepthSolver;
 struct	btDefaultCollisionConstructionInfo
-	btStackAlloc*		m_stackAlloc;
 	btPoolAllocator*	m_persistentManifoldPool;
 	btPoolAllocator*	m_collisionAlgorithmPool;
 	int					m_defaultMaxPersistentManifoldPoolSize;
 	int					m_defaultMaxCollisionAlgorithmPoolSize;
 	int					m_customCollisionAlgorithmMaxElementSize;
-	int					m_defaultStackAllocatorSize;
 	int					m_useEpaPenetrationAlgorithm;
-		:m_stackAlloc(0),
-		m_persistentManifoldPool(0),
+		:m_persistentManifoldPool(0),
-		m_defaultStackAllocatorSize(0),
@@ -56,8 +52,6 @@ protected:
 	int	m_persistentManifoldPoolSize;
-	btStackAlloc*	m_stackAlloc;
-	bool	m_ownsStackAllocator;
 	btPoolAllocator*	m_persistentManifoldPool;
 	bool	m_ownsPersistentManifoldPool;
@@ -75,13 +69,13 @@ protected:
 	btCollisionAlgorithmCreateFunc*	m_convexConcaveCreateFunc;
 	btCollisionAlgorithmCreateFunc*	m_swappedConvexConcaveCreateFunc;
 	btCollisionAlgorithmCreateFunc*	m_compoundCreateFunc;
+	btCollisionAlgorithmCreateFunc*	m_compoundCompoundCreateFunc;
 	btCollisionAlgorithmCreateFunc*	m_swappedCompoundCreateFunc;
 	btCollisionAlgorithmCreateFunc* m_emptyCreateFunc;
 	btCollisionAlgorithmCreateFunc* m_sphereSphereCF;
 	btCollisionAlgorithmCreateFunc* m_sphereBoxCF;
 	btCollisionAlgorithmCreateFunc* m_boxSphereCF;
 	btCollisionAlgorithmCreateFunc* m_boxBoxCF;
 	btCollisionAlgorithmCreateFunc*	m_sphereTriangleCF;
@@ -107,10 +101,6 @@ public:
 		return m_collisionAlgorithmPool;
-	virtual btStackAlloc*	getStackAllocator()
-	{
-		return m_stackAlloc;
-	}
 	virtual	btVoronoiSimplexSolver*	getSimplexSolver()
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.cpp
index 93605438..5fa1c8be 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.cpp
@@ -22,7 +22,7 @@ btEmptyAlgorithm::btEmptyAlgorithm(const btCollisionAlgorithmConstructionInfo& c
-void btEmptyAlgorithm::processCollision (btCollisionObject* ,btCollisionObject* ,const btDispatcherInfo& ,btManifoldResult* )
+void btEmptyAlgorithm::processCollision (const btCollisionObjectWrapper* ,const btCollisionObjectWrapper* ,const btDispatcherInfo& ,btManifoldResult* )
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h
index f03c9dc3..cb0f1521 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h
@@ -30,7 +30,7 @@ public:
 	btEmptyAlgorithm(const btCollisionAlgorithmConstructionInfo& ci);
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -40,10 +40,10 @@ public:
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
-		{
-			(void)body0;
-			(void)body1;
+        virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
+        {
+			(void)body0Wrap;
+			(void)body1Wrap;
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btEmptyAlgorithm));
 			return new(mem) btEmptyAlgorithm(ci);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btHashedSimplePairCache.cpp b/src/bullet/BulletCollision/CollisionDispatch/btHashedSimplePairCache.cpp
new file mode 100644
index 00000000..8c8a7c3c
--- /dev/null
+++ b/src/bullet/BulletCollision/CollisionDispatch/btHashedSimplePairCache.cpp
@@ -0,0 +1,276 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "btHashedSimplePairCache.h"
+#include <stdio.h>
+int	gOverlappingSimplePairs = 0;
+int gRemoveSimplePairs =0;
+int gAddedSimplePairs =0;
+int gFindSimplePairs =0;
+btHashedSimplePairCache::btHashedSimplePairCache() {
+	int initialAllocatedSize= 2;
+	m_overlappingPairArray.reserve(initialAllocatedSize);
+	growTables();
+void btHashedSimplePairCache::removeAllPairs()
+	m_overlappingPairArray.clear();
+	m_hashTable.clear();
+	m_next.clear();
+	int initialAllocatedSize= 2;
+	m_overlappingPairArray.reserve(initialAllocatedSize);
+	growTables();
+btSimplePair* btHashedSimplePairCache::findPair(int indexA, int indexB)
+	gFindSimplePairs++;
+	/*if (indexA > indexB) 
+		btSwap(indexA, indexB);*/
+	int hash = static_cast<int>(getHash(static_cast<unsigned int>(indexA), static_cast<unsigned int>(indexB)) & (m_overlappingPairArray.capacity()-1));
+	if (hash >= m_hashTable.size())
+	{
+		return NULL;
+	}
+	int index = m_hashTable[hash];
+	while (index != BT_SIMPLE_NULL_PAIR && equalsPair(m_overlappingPairArray[index], indexA, indexB) == false)
+	{
+		index = m_next[index];
+	}
+	if (index == BT_SIMPLE_NULL_PAIR)
+	{
+		return NULL;
+	}
+	btAssert(index < m_overlappingPairArray.size());
+	return &m_overlappingPairArray[index];
+//#include <stdio.h>
+void	btHashedSimplePairCache::growTables()
+	int newCapacity = m_overlappingPairArray.capacity();
+	if (m_hashTable.size() < newCapacity)
+	{
+		//grow hashtable and next table
+		int curHashtableSize = m_hashTable.size();
+		m_hashTable.resize(newCapacity);
+		m_next.resize(newCapacity);
+		int i;
+		for (i= 0; i < newCapacity; ++i)
+		{
+			m_hashTable[i] = BT_SIMPLE_NULL_PAIR;
+		}
+		for (i = 0; i < newCapacity; ++i)
+		{
+			m_next[i] = BT_SIMPLE_NULL_PAIR;
+		}
+		for(i=0;i<curHashtableSize;i++)
+		{
+			const btSimplePair& pair = m_overlappingPairArray[i];
+			int indexA = pair.m_indexA;
+			int indexB = pair.m_indexB;
+			int	hashValue = static_cast<int>(getHash(static_cast<unsigned int>(indexA),static_cast<unsigned int>(indexB)) & (m_overlappingPairArray.capacity()-1));	// New hash value with new mask
+			m_next[i] = m_hashTable[hashValue];
+			m_hashTable[hashValue] = i;
+		}
+	}
+btSimplePair* btHashedSimplePairCache::internalAddPair(int indexA, int indexB)
+	int	hash = static_cast<int>(getHash(static_cast<unsigned int>(indexA),static_cast<unsigned int>(indexB)) & (m_overlappingPairArray.capacity()-1));	// New hash value with new mask
+	btSimplePair* pair = internalFindPair(indexA, indexB, hash);
+	if (pair != NULL)
+	{
+		return pair;
+	}
+	int count = m_overlappingPairArray.size();
+	int oldCapacity = m_overlappingPairArray.capacity();
+	void* mem = &m_overlappingPairArray.expandNonInitializing();
+	int newCapacity = m_overlappingPairArray.capacity();
+	if (oldCapacity < newCapacity)
+	{
+		growTables();
+		//hash with new capacity
+		hash = static_cast<int>(getHash(static_cast<unsigned int>(indexA),static_cast<unsigned int>(indexB)) & (m_overlappingPairArray.capacity()-1));
+	}
+	pair = new (mem) btSimplePair(indexA,indexB);
+	pair->m_userPointer = 0;
+	m_next[count] = m_hashTable[hash];
+	m_hashTable[hash] = count;
+	return pair;
+void* btHashedSimplePairCache::removeOverlappingPair(int indexA, int indexB)
+	gRemoveSimplePairs++;
+	/*if (indexA > indexB) 
+		btSwap(indexA, indexB);*/
+	int	hash = static_cast<int>(getHash(static_cast<unsigned int>(indexA),static_cast<unsigned int>(indexB)) & (m_overlappingPairArray.capacity()-1));
+	btSimplePair* pair = internalFindPair(indexA, indexB, hash);
+	if (pair == NULL)
+	{
+		return 0;
+	}
+	void* userData = pair->m_userPointer;
+	int pairIndex = int(pair - &m_overlappingPairArray[0]);
+	btAssert(pairIndex < m_overlappingPairArray.size());
+	// Remove the pair from the hash table.
+	int index = m_hashTable[hash];
+	btAssert(index != BT_SIMPLE_NULL_PAIR);
+	int previous = BT_SIMPLE_NULL_PAIR;
+	while (index != pairIndex)
+	{
+		previous = index;
+		index = m_next[index];
+	}
+	if (previous != BT_SIMPLE_NULL_PAIR)
+	{
+		btAssert(m_next[previous] == pairIndex);
+		m_next[previous] = m_next[pairIndex];
+	}
+	else
+	{
+		m_hashTable[hash] = m_next[pairIndex];
+	}
+	// We now move the last pair into spot of the
+	// pair being removed. We need to fix the hash
+	// table indices to support the move.
+	int lastPairIndex = m_overlappingPairArray.size() - 1;
+	// If the removed pair is the last pair, we are done.
+	if (lastPairIndex == pairIndex)
+	{
+		m_overlappingPairArray.pop_back();
+		return userData;
+	}
+	// Remove the last pair from the hash table.
+	const btSimplePair* last = &m_overlappingPairArray[lastPairIndex];
+		/* missing swap here too, Nat. */ 
+	int lastHash = static_cast<int>(getHash(static_cast<unsigned int>(last->m_indexA), static_cast<unsigned int>(last->m_indexB)) & (m_overlappingPairArray.capacity()-1));
+	index = m_hashTable[lastHash];
+	btAssert(index != BT_SIMPLE_NULL_PAIR);
+	previous = BT_SIMPLE_NULL_PAIR;
+	while (index != lastPairIndex)
+	{
+		previous = index;
+		index = m_next[index];
+	}
+	if (previous != BT_SIMPLE_NULL_PAIR)
+	{
+		btAssert(m_next[previous] == lastPairIndex);
+		m_next[previous] = m_next[lastPairIndex];
+	}
+	else
+	{
+		m_hashTable[lastHash] = m_next[lastPairIndex];
+	}
+	// Copy the last pair into the remove pair's spot.
+	m_overlappingPairArray[pairIndex] = m_overlappingPairArray[lastPairIndex];
+	// Insert the last pair into the hash table
+	m_next[pairIndex] = m_hashTable[lastHash];
+	m_hashTable[lastHash] = pairIndex;
+	m_overlappingPairArray.pop_back();
+	return userData;
+//#include <stdio.h>
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btHashedSimplePairCache.h b/src/bullet/BulletCollision/CollisionDispatch/btHashedSimplePairCache.h
new file mode 100644
index 00000000..186964d7
--- /dev/null
+++ b/src/bullet/BulletCollision/CollisionDispatch/btHashedSimplePairCache.h
@@ -0,0 +1,172 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "LinearMath/btAlignedObjectArray.h"
+const int BT_SIMPLE_NULL_PAIR=0xffffffff;
+struct btSimplePair
+	btSimplePair(int indexA,int indexB)
+		:m_indexA(indexA),
+		m_indexB(indexB),
+		m_userPointer(0)
+	{
+	}
+	int m_indexA;
+	int m_indexB;
+	union
+	{
+		void*	m_userPointer;
+		int		m_userValue;
+	};
+typedef btAlignedObjectArray<btSimplePair>	btSimplePairArray;
+extern int gOverlappingSimplePairs;
+extern int gRemoveSimplePairs;
+extern int gAddedSimplePairs;
+extern int gFindSimplePairs;
+class btHashedSimplePairCache
+	btSimplePairArray	m_overlappingPairArray;
+	btAlignedObjectArray<int>	m_hashTable;
+	btAlignedObjectArray<int>	m_next;
+	btHashedSimplePairCache();
+	virtual ~btHashedSimplePairCache();
+	void removeAllPairs();
+	virtual void*	removeOverlappingPair(int indexA,int indexB);
+	// Add a pair and return the new pair. If the pair already exists,
+	// no new pair is created and the old one is returned.
+	virtual btSimplePair* 	addOverlappingPair(int indexA,int indexB)
+	{
+		gAddedSimplePairs++;
+		return internalAddPair(indexA,indexB);
+	}
+	virtual btSimplePair*	getOverlappingPairArrayPtr()
+	{
+		return &m_overlappingPairArray[0];
+	}
+	const btSimplePair*	getOverlappingPairArrayPtr() const
+	{
+		return &m_overlappingPairArray[0];
+	}
+	btSimplePairArray&	getOverlappingPairArray()
+	{
+		return m_overlappingPairArray;
+	}
+	const btSimplePairArray&	getOverlappingPairArray() const
+	{
+		return m_overlappingPairArray;
+	}
+	btSimplePair* findPair(int indexA,int indexB);
+	int GetCount() const { return m_overlappingPairArray.size(); }
+	int	getNumOverlappingPairs() const
+	{
+		return m_overlappingPairArray.size();
+	}
+	btSimplePair* 	internalAddPair(int indexA, int indexB);
+	void	growTables();
+	SIMD_FORCE_INLINE bool equalsPair(const btSimplePair& pair, int indexA, int indexB)
+	{	
+		return pair.m_indexA == indexA && pair.m_indexB == indexB;
+	}
+	SIMD_FORCE_INLINE	unsigned int getHash(unsigned int indexA, unsigned int indexB)
+	{
+		int key = static_cast<int>(((unsigned int)indexA) | (((unsigned int)indexB) <<16));
+		// Thomas Wang's hash
+		key += ~(key << 15);
+		key ^=  (key >> 10);
+		key +=  (key << 3);
+		key ^=  (key >> 6);
+		key += ~(key << 11);
+		key ^=  (key >> 16);
+		return static_cast<unsigned int>(key);
+	}
+	SIMD_FORCE_INLINE btSimplePair* internalFindPair(int proxyIdA , int proxyIdB, int hash)
+	{
+		int index = m_hashTable[hash];
+		while( index != BT_SIMPLE_NULL_PAIR && equalsPair(m_overlappingPairArray[index], proxyIdA, proxyIdB) == false)
+		{
+			index = m_next[index];
+		}
+		if ( index == BT_SIMPLE_NULL_PAIR )
+		{
+			return NULL;
+		}
+		btAssert(index < m_overlappingPairArray.size());
+		return &m_overlappingPairArray[index];
+	}
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp b/src/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp
index 4353cdac..6cba442c 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp
@@ -6,7 +6,7 @@
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/NarrowPhaseCollision/btManifoldPoint.h"
 #include "LinearMath/btIDebugDraw.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
@@ -193,7 +193,7 @@ struct btConnectivityProcessor : public btTriangleCallback
 				btScalar len2 = calculatedEdge.length2();
 				btScalar correctedAngle(0);
-				btVector3 calculatedNormalB = normalA;
+				//btVector3 calculatedNormalB = normalA;
 				bool isConvex = false;
 				if (len2<m_triangleInfoMap->m_planarEpsilon)
@@ -213,10 +213,6 @@ struct btConnectivityProcessor : public btTriangleCallback
 					isConvex = (dotA<0.);
 					correctedAngle = isConvex ? ang4 : -ang4;
-					btQuaternion orn2(calculatedEdge,-correctedAngle);
-					calculatedNormalB = btMatrix3x3(orn2)*normalA;
@@ -450,18 +446,18 @@ bool	btClampNormal(const btVector3& edge,const btVector3& tri_normal_org,const b
 /// Changes a btManifoldPoint collision normal to the normal from the mesh.
-void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject* colObj0,const btCollisionObject* colObj1, int partId0, int index0, int normalAdjustFlags)
+void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObjectWrapper* colObj0Wrap,const btCollisionObjectWrapper* colObj1Wrap, int partId0, int index0, int normalAdjustFlags)
 	//btAssert(colObj0->getCollisionShape()->getShapeType() == TRIANGLE_SHAPE_PROXYTYPE);
-	if (colObj0->getCollisionShape()->getShapeType() != TRIANGLE_SHAPE_PROXYTYPE)
+	if (colObj0Wrap->getCollisionShape()->getShapeType() != TRIANGLE_SHAPE_PROXYTYPE)
 	btBvhTriangleMeshShape* trimesh = 0;
-	if( colObj0->getRootCollisionShape()->getShapeType() == SCALED_TRIANGLE_MESH_SHAPE_PROXYTYPE )
-	   trimesh = ((btScaledBvhTriangleMeshShape*)colObj0->getRootCollisionShape())->getChildShape();
+	if( colObj0Wrap->getCollisionObject()->getCollisionShape()->getShapeType() == SCALED_TRIANGLE_MESH_SHAPE_PROXYTYPE )
+	   trimesh = ((btScaledBvhTriangleMeshShape*)colObj0Wrap->getCollisionObject()->getCollisionShape())->getChildShape();
-	   trimesh = (btBvhTriangleMeshShape*)colObj0->getRootCollisionShape();
+	   trimesh = (btBvhTriangleMeshShape*)colObj0Wrap->getCollisionObject()->getCollisionShape();
    	btTriangleInfoMap* triangleInfoMapPtr = (btTriangleInfoMap*) trimesh->getTriangleInfoMap();
 	if (!triangleInfoMapPtr)
@@ -476,13 +472,13 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 	btScalar frontFacing = (normalAdjustFlags & BT_TRIANGLE_CONVEX_BACKFACE_MODE)==0? 1.f : -1.f;
-	const btTriangleShape* tri_shape = static_cast<const btTriangleShape*>(colObj0->getCollisionShape());
+	const btTriangleShape* tri_shape = static_cast<const btTriangleShape*>(colObj0Wrap->getCollisionShape());
 	btVector3 v0,v1,v2;
-	btVector3 center = (v0+v1+v2)*btScalar(1./3.);
+	//btVector3 center = (v0+v1+v2)*btScalar(1./3.);
 	btVector3 red(1,0,0), green(0,1,0),blue(0,0,1),white(1,1,1),black(0,0,0);
 	btVector3 tri_normal;
@@ -505,7 +501,7 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 	int numConcaveEdgeHits = 0;
 	int numConvexEdgeHits = 0;
-	btVector3 localContactNormalOnB = colObj0->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
+	btVector3 localContactNormalOnB = colObj0Wrap->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
 	localContactNormalOnB.normalize();//is this necessary?
 	// Get closest edge
@@ -613,12 +609,12 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 						if (((normalAdjustFlags & BT_TRIANGLE_CONVEX_DOUBLE_SIDED)!=0) || (clampedLocalNormal.dot(frontFacing*tri_normal)>0))
-							btVector3 newNormal = colObj0->getWorldTransform().getBasis() * clampedLocalNormal;
+							btVector3 newNormal = colObj0Wrap->getWorldTransform().getBasis() * clampedLocalNormal;
 							//					cp.m_distance1 = cp.m_distance1 * newNormal.dot(cp.m_normalWorldOnB);
 							cp.m_normalWorldOnB = newNormal;
 							// Reproject collision point along normal. (what about cp.m_distance1?)
 							cp.m_positionWorldOnB = cp.m_positionWorldOnA - cp.m_normalWorldOnB * cp.m_distance1;
-							cp.m_localPointB = colObj0->getWorldTransform().invXform(cp.m_positionWorldOnB);
+							cp.m_localPointB = colObj0Wrap->getWorldTransform().invXform(cp.m_positionWorldOnB);
@@ -694,19 +690,19 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
-					btVector3 localContactNormalOnB = colObj0->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
+					btVector3 localContactNormalOnB = colObj0Wrap->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
 					btVector3 clampedLocalNormal;
 					bool isClamped = btClampNormal(edge,swapFactor*tri_normal,localContactNormalOnB, info->m_edgeV1V2Angle,clampedLocalNormal);
 					if (isClamped)
 						if (((normalAdjustFlags & BT_TRIANGLE_CONVEX_DOUBLE_SIDED)!=0) || (clampedLocalNormal.dot(frontFacing*tri_normal)>0))
-							btVector3 newNormal = colObj0->getWorldTransform().getBasis() * clampedLocalNormal;
+							btVector3 newNormal = colObj0Wrap->getWorldTransform().getBasis() * clampedLocalNormal;
 							//					cp.m_distance1 = cp.m_distance1 * newNormal.dot(cp.m_normalWorldOnB);
 							cp.m_normalWorldOnB = newNormal;
 							// Reproject collision point along normal.
 							cp.m_positionWorldOnB = cp.m_positionWorldOnA - cp.m_normalWorldOnB * cp.m_distance1;
-							cp.m_localPointB = colObj0->getWorldTransform().invXform(cp.m_positionWorldOnB);
+							cp.m_localPointB = colObj0Wrap->getWorldTransform().invXform(cp.m_positionWorldOnB);
@@ -779,19 +775,19 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 					//				printf("hitting convex edge\n");
-					btVector3 localContactNormalOnB = colObj0->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
+					btVector3 localContactNormalOnB = colObj0Wrap->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
 					btVector3 clampedLocalNormal;
 					bool isClamped = btClampNormal(edge,swapFactor*tri_normal,localContactNormalOnB,info->m_edgeV2V0Angle,clampedLocalNormal);
 					if (isClamped)
 						if (((normalAdjustFlags & BT_TRIANGLE_CONVEX_DOUBLE_SIDED)!=0) || (clampedLocalNormal.dot(frontFacing*tri_normal)>0))
-							btVector3 newNormal = colObj0->getWorldTransform().getBasis() * clampedLocalNormal;
+							btVector3 newNormal = colObj0Wrap->getWorldTransform().getBasis() * clampedLocalNormal;
 							//					cp.m_distance1 = cp.m_distance1 * newNormal.dot(cp.m_normalWorldOnB);
 							cp.m_normalWorldOnB = newNormal;
 							// Reproject collision point along normal.
 							cp.m_positionWorldOnB = cp.m_positionWorldOnA - cp.m_normalWorldOnB * cp.m_distance1;
-							cp.m_localPointB = colObj0->getWorldTransform().invXform(cp.m_positionWorldOnB);
+							cp.m_localPointB = colObj0Wrap->getWorldTransform().invXform(cp.m_positionWorldOnB);
@@ -820,7 +816,7 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 					tri_normal *= -1;
-				cp.m_normalWorldOnB = colObj0->getWorldTransform().getBasis()*tri_normal;
+				cp.m_normalWorldOnB = colObj0Wrap->getWorldTransform().getBasis()*tri_normal;
 			} else
 				btVector3 newNormal = tri_normal *frontFacing;
@@ -831,12 +827,12 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 				//modify the normal to be the triangle normal (or backfacing normal)
-				cp.m_normalWorldOnB = colObj0->getWorldTransform().getBasis() *newNormal;
+				cp.m_normalWorldOnB = colObj0Wrap->getWorldTransform().getBasis() *newNormal;
 			// Reproject collision point along normal.
 			cp.m_positionWorldOnB = cp.m_positionWorldOnA - cp.m_normalWorldOnB * cp.m_distance1;
-			cp.m_localPointB = colObj0->getWorldTransform().invXform(cp.m_positionWorldOnB);
+			cp.m_localPointB = colObj0Wrap->getWorldTransform().invXform(cp.m_positionWorldOnB);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.h b/src/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.h
index 9efb0122..7d9aafee 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.h
@@ -12,6 +12,7 @@
 class btBvhTriangleMeshShape;
 class btCollisionObject;
+struct btCollisionObjectWrapper;
 class btManifoldPoint;
 class btIDebugDraw;
@@ -31,7 +32,7 @@ void	btGenerateInternalEdgeInfo (btBvhTriangleMeshShape*trimeshShape, btTriangle
 ///Call the btFixMeshNormal to adjust the collision normal, using the triangle info map (generated using btGenerateInternalEdgeInfo)
 ///If this info map is missing, or the triangle is not store in this map, nothing will be done
-void	btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject* trimeshColObj0,const btCollisionObject* otherColObj1, int partId0, int index0, int normalAdjustFlags = 0);
+void	btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObjectWrapper* trimeshColObj0Wrap,const btCollisionObjectWrapper* otherColObj1Wrap, int partId0, int index0, int normalAdjustFlags = 0);
 ///Enable the BT_INTERNAL_EDGE_DEBUG_DRAW define and call btSetDebugDrawer, to get visual info to see if the internal edge utility works properly.
 ///If the utility doesn't work properly, you might have to adjust the threshold values in btTriangleInfoMap
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btManifoldResult.cpp b/src/bullet/BulletCollision/CollisionDispatch/btManifoldResult.cpp
index bf24246e..4b2986a0 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btManifoldResult.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btManifoldResult.cpp
@@ -17,13 +17,30 @@ subject to the following restrictions:
 #include "btManifoldResult.h"
 #include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 ///This is to allow MaterialCombiner/Custom Friction/Restitution values
 ContactAddedCallback		gContactAddedCallback=0;
+///User can override this material combiner by implementing gContactAddedCallback and setting body0->m_collisionFlags |= btCollisionObject::customMaterialCallback;
+inline btScalar	calculateCombinedRollingFriction(const btCollisionObject* body0,const btCollisionObject* body1)
+	btScalar friction = body0->getRollingFriction() * body1->getRollingFriction();
+	const btScalar MAX_FRICTION  = btScalar(10.);
+	if (friction < -MAX_FRICTION)
+		friction = -MAX_FRICTION;
+	if (friction > MAX_FRICTION)
+		friction = MAX_FRICTION;
+	return friction;
 ///User can override this material combiner by implementing gContactAddedCallback and setting body0->m_collisionFlags |= btCollisionObject::customMaterialCallback;
-inline btScalar	calculateCombinedFriction(const btCollisionObject* body0,const btCollisionObject* body1)
+btScalar	btManifoldResult::calculateCombinedFriction(const btCollisionObject* body0,const btCollisionObject* body1)
 	btScalar friction = body0->getFriction() * body1->getFriction();
@@ -36,17 +53,17 @@ inline btScalar	calculateCombinedFriction(const btCollisionObject* body0,const b
-inline btScalar	calculateCombinedRestitution(const btCollisionObject* body0,const btCollisionObject* body1)
+btScalar	btManifoldResult::calculateCombinedRestitution(const btCollisionObject* body0,const btCollisionObject* body1)
 	return body0->getRestitution() * body1->getRestitution();
-btManifoldResult::btManifoldResult(btCollisionObject* body0,btCollisionObject* body1)
+btManifoldResult::btManifoldResult(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
-		m_body0(body0),
-		m_body1(body1)
+		m_body0Wrap(body0Wrap),
+		m_body1Wrap(body1Wrap)
@@ -54,8 +71,6 @@ btManifoldResult::btManifoldResult(btCollisionObject* body0,btCollisionObject* b
-	m_rootTransA = body0->getWorldTransform();
-	m_rootTransB = body1->getWorldTransform();
@@ -68,7 +83,7 @@ void btManifoldResult::addContactPoint(const btVector3& normalOnBInWorld,const b
 //	if (depth > m_manifoldPtr->getContactProcessingThreshold())
-	bool isSwapped = m_manifoldPtr->getBody0() != m_body0;
+	bool isSwapped = m_manifoldPtr->getBody0() != m_body0Wrap->getCollisionObject();
 	btVector3 pointA = pointInWorld + normalOnBInWorld * depth;
@@ -77,12 +92,12 @@ void btManifoldResult::addContactPoint(const btVector3& normalOnBInWorld,const b
 	if (isSwapped)
-		localA = m_rootTransB.invXform(pointA );
-		localB = m_rootTransA.invXform(pointInWorld);
+		localA = m_body1Wrap->getCollisionObject()->getWorldTransform().invXform(pointA );
+		localB = m_body0Wrap->getCollisionObject()->getWorldTransform().invXform(pointInWorld);
 	} else
-		localA = m_rootTransA.invXform(pointA );
-		localB = m_rootTransB.invXform(pointInWorld);
+		localA = m_body0Wrap->getCollisionObject()->getWorldTransform().invXform(pointA );
+		localB = m_body1Wrap->getCollisionObject()->getWorldTransform().invXform(pointInWorld);
 	btManifoldPoint newPt(localA,localB,normalOnBInWorld,depth);
@@ -91,9 +106,13 @@ void btManifoldResult::addContactPoint(const btVector3& normalOnBInWorld,const b
 	int insertIndex = m_manifoldPtr->getCacheEntry(newPt);
-	newPt.m_combinedFriction = calculateCombinedFriction(m_body0,m_body1);
-	newPt.m_combinedRestitution = calculateCombinedRestitution(m_body0,m_body1);
+	newPt.m_combinedFriction = calculateCombinedFriction(m_body0Wrap->getCollisionObject(),m_body1Wrap->getCollisionObject());
+	newPt.m_combinedRestitution = calculateCombinedRestitution(m_body0Wrap->getCollisionObject(),m_body1Wrap->getCollisionObject());
+	newPt.m_combinedRollingFriction = calculateCombinedRollingFriction(m_body0Wrap->getCollisionObject(),m_body1Wrap->getCollisionObject());
+	btPlaneSpace1(newPt.m_normalWorldOnB,newPt.m_lateralFrictionDir1,newPt.m_lateralFrictionDir2);
    //BP mod, store contact triangles.
 	if (isSwapped)
@@ -122,13 +141,13 @@ void btManifoldResult::addContactPoint(const btVector3& normalOnBInWorld,const b
 	//User can override friction and/or restitution
 	if (gContactAddedCallback &&
 		//and if either of the two bodies requires custom material
-		 ((m_body0->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK) ||
-		   (m_body1->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)))
+		 ((m_body0Wrap->getCollisionObject()->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK) ||
+		   (m_body1Wrap->getCollisionObject()->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)))
 		//experimental feature info, for per-triangle material etc.
-		btCollisionObject* obj0 = isSwapped? m_body1 : m_body0;
-		btCollisionObject* obj1 = isSwapped? m_body0 : m_body1;
-		(*gContactAddedCallback)(m_manifoldPtr->getContactPoint(insertIndex),obj0,newPt.m_partId0,newPt.m_index0,obj1,newPt.m_partId1,newPt.m_index1);
+		const btCollisionObjectWrapper* obj0Wrap = isSwapped? m_body1Wrap : m_body0Wrap;
+		const btCollisionObjectWrapper* obj1Wrap = isSwapped? m_body0Wrap : m_body1Wrap;
+		(*gContactAddedCallback)(m_manifoldPtr->getContactPoint(insertIndex),obj0Wrap,newPt.m_partId0,newPt.m_index0,obj1Wrap,newPt.m_partId1,newPt.m_index1);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btManifoldResult.h b/src/bullet/BulletCollision/CollisionDispatch/btManifoldResult.h
index 18199b49..977b9a02 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btManifoldResult.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btManifoldResult.h
@@ -18,14 +18,18 @@ subject to the following restrictions:
 class btCollisionObject;
+struct btCollisionObjectWrapper;
 #include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
 class btManifoldPoint;
 #include "BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h"
 #include "LinearMath/btTransform.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
-typedef bool (*ContactAddedCallback)(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1);
+typedef bool (*ContactAddedCallback)(btManifoldPoint& cp,	const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1);
 extern ContactAddedCallback		gContactAddedCallback;
 //#define DEBUG_PART_INDEX 1
@@ -38,12 +42,8 @@ protected:
 	btPersistentManifold* m_manifoldPtr;
-	//we need this for compounds
-	btTransform	m_rootTransA;
-	btTransform	m_rootTransB;
-	btCollisionObject* m_body0;
-	btCollisionObject* m_body1;
+	const btCollisionObjectWrapper* m_body0Wrap;
+	const btCollisionObjectWrapper* m_body1Wrap;
 	int	m_partId0;
 	int m_partId1;
 	int m_index0;
@@ -63,7 +63,7 @@ public:
-	btManifoldResult(btCollisionObject* body0,btCollisionObject* body1);
+	btManifoldResult(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 	virtual ~btManifoldResult() {};
@@ -102,27 +102,49 @@ public:
 		if (!m_manifoldPtr->getNumContacts())
-		bool isSwapped = m_manifoldPtr->getBody0() != m_body0;
+		bool isSwapped = m_manifoldPtr->getBody0() != m_body0Wrap->getCollisionObject();
 		if (isSwapped)
-			m_manifoldPtr->refreshContactPoints(m_rootTransB,m_rootTransA);
+			m_manifoldPtr->refreshContactPoints(m_body1Wrap->getCollisionObject()->getWorldTransform(),m_body0Wrap->getCollisionObject()->getWorldTransform());
 		} else
-			m_manifoldPtr->refreshContactPoints(m_rootTransA,m_rootTransB);
+			m_manifoldPtr->refreshContactPoints(m_body0Wrap->getCollisionObject()->getWorldTransform(),m_body1Wrap->getCollisionObject()->getWorldTransform());
+	const btCollisionObjectWrapper* getBody0Wrap() const
+	{
+		return m_body0Wrap;
+	}
+	const btCollisionObjectWrapper* getBody1Wrap() const
+	{
+		return m_body1Wrap;
+	}
+	void setBody0Wrap(const btCollisionObjectWrapper* obj0Wrap)
+	{
+		m_body0Wrap = obj0Wrap;
+	}
+	void setBody1Wrap(const btCollisionObjectWrapper* obj1Wrap)
+	{
+		m_body1Wrap = obj1Wrap;
+	}
 	const btCollisionObject* getBody0Internal() const
-		return m_body0;
+		return m_body0Wrap->getCollisionObject();
 	const btCollisionObject* getBody1Internal() const
-		return m_body1;
+		return m_body1Wrap->getCollisionObject();
+	/// in the future we can let the user override the methods to combine restitution and friction
+	static btScalar	calculateCombinedRestitution(const btCollisionObject* body0,const btCollisionObject* body1);
+	static btScalar	calculateCombinedFriction(const btCollisionObject* body0,const btCollisionObject* body1);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btSimulationIslandManager.cpp b/src/bullet/BulletCollision/CollisionDispatch/btSimulationIslandManager.cpp
index 871c6441..13447822 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btSimulationIslandManager.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btSimulationIslandManager.cpp
@@ -319,8 +319,8 @@ void btSimulationIslandManager::buildIslands(btDispatcher* dispatcher,btCollisio
 		 btPersistentManifold* manifold = dispatcher->getManifoldByIndexInternal(i);
-		 btCollisionObject* colObj0 = static_cast<btCollisionObject*>(manifold->getBody0());
-		 btCollisionObject* colObj1 = static_cast<btCollisionObject*>(manifold->getBody1());
+		 const btCollisionObject* colObj0 = static_cast<const btCollisionObject*>(manifold->getBody0());
+		 const btCollisionObject* colObj1 = static_cast<const btCollisionObject*>(manifold->getBody1());
 		 ///@todo: check sleeping conditions!
 		 if (((colObj0) && colObj0->getActivationState() != ISLAND_SLEEPING) ||
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.cpp
index 8df87692..e8b567e0 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.cpp
@@ -18,20 +18,21 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "BulletCollision/CollisionShapes/btBoxShape.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 //#include <stdio.h>
-btSphereBoxCollisionAlgorithm::btSphereBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped)
-: btActivatingCollisionAlgorithm(ci,col0,col1),
+btSphereBoxCollisionAlgorithm::btSphereBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap, bool isSwapped)
+: btActivatingCollisionAlgorithm(ci,col0Wrap,col1Wrap),
-	btCollisionObject* sphereObj = m_isSwapped? col1 : col0;
-	btCollisionObject* boxObj = m_isSwapped? col0 : col1;
+	const btCollisionObjectWrapper* sphereObjWrap = m_isSwapped? col1Wrap : col0Wrap;
+	const btCollisionObjectWrapper* boxObjWrap = m_isSwapped? col0Wrap : col1Wrap;
-	if (!m_manifoldPtr && m_dispatcher->needsCollision(sphereObj,boxObj))
+	if (!m_manifoldPtr && m_dispatcher->needsCollision(sphereObjWrap->getCollisionObject(),boxObjWrap->getCollisionObject()))
-		m_manifoldPtr = m_dispatcher->getNewManifold(sphereObj,boxObj);
+		m_manifoldPtr = m_dispatcher->getNewManifold(sphereObjWrap->getCollisionObject(),boxObjWrap->getCollisionObject());
 		m_ownManifold = true;
@@ -48,36 +49,31 @@ btSphereBoxCollisionAlgorithm::~btSphereBoxCollisionAlgorithm()
-void btSphereBoxCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btSphereBoxCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap, const btCollisionObjectWrapper* body1Wrap, const btDispatcherInfo& dispatchInfo, btManifoldResult* resultOut)
 	if (!m_manifoldPtr)
-	btCollisionObject* sphereObj = m_isSwapped? body1 : body0;
-	btCollisionObject* boxObj = m_isSwapped? body0 : body1;
+	const btCollisionObjectWrapper* sphereObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* boxObjWrap = m_isSwapped? body0Wrap : body1Wrap;
-	btSphereShape* sphere0 = (btSphereShape*)sphereObj->getCollisionShape();
+	btVector3 pOnBox;
 	btVector3 normalOnSurfaceB;
-	btVector3 pOnBox,pOnSphere;
-	btVector3 sphereCenter = sphereObj->getWorldTransform().getOrigin();
+	btScalar penetrationDepth;
+	btVector3 sphereCenter = sphereObjWrap->getWorldTransform().getOrigin();
+	const btSphereShape* sphere0 = (const btSphereShape*)sphereObjWrap->getCollisionShape();
 	btScalar radius = sphere0->getRadius();
-	btScalar dist = getSphereDistance(boxObj,pOnBox,pOnSphere,sphereCenter,radius);
+	btScalar maxContactDistance = m_manifoldPtr->getContactBreakingThreshold();
-	if (dist < SIMD_EPSILON)
+	if (getSphereDistance(boxObjWrap, pOnBox, normalOnSurfaceB, penetrationDepth, sphereCenter, radius, maxContactDistance))
-		btVector3 normalOnSurfaceB = (pOnBox- pOnSphere).normalize();
 		/// report a contact. internally this will be kept persistent, and contact reduction is done
-		resultOut->addContactPoint(normalOnSurfaceB,pOnBox,dist);
+		resultOut->addContactPoint(normalOnSurfaceB, pOnBox, penetrationDepth);
 	if (m_ownManifold)
@@ -102,159 +98,117 @@ btScalar btSphereBoxCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject*
-btScalar btSphereBoxCollisionAlgorithm::getSphereDistance(btCollisionObject* boxObj, btVector3& pointOnBox, btVector3& v3PointOnSphere, const btVector3& sphereCenter, btScalar fRadius ) 
+bool btSphereBoxCollisionAlgorithm::getSphereDistance(const btCollisionObjectWrapper* boxObjWrap, btVector3& pointOnBox, btVector3& normal, btScalar& penetrationDepth, const btVector3& sphereCenter, btScalar fRadius, btScalar maxContactDistance ) 
-	btScalar margins;
-	btVector3 bounds[2];
-	btBoxShape* boxShape= (btBoxShape*)boxObj->getCollisionShape();
+	const btBoxShape* boxShape= (const btBoxShape*)boxObjWrap->getCollisionShape();
+	btVector3 const &boxHalfExtent = boxShape->getHalfExtentsWithoutMargin();
+	btScalar boxMargin = boxShape->getMargin();
+	penetrationDepth = 1.0f;
+	// convert the sphere position to the box's local space
+	btTransform const &m44T = boxObjWrap->getWorldTransform();
+	btVector3 sphereRelPos = m44T.invXform(sphereCenter);
+	// Determine the closest point to the sphere center in the box
+	btVector3 closestPoint = sphereRelPos;
+	closestPoint.setX( btMin(boxHalfExtent.getX(), closestPoint.getX()) );
+	closestPoint.setX( btMax(-boxHalfExtent.getX(), closestPoint.getX()) );
+	closestPoint.setY( btMin(boxHalfExtent.getY(), closestPoint.getY()) );
+	closestPoint.setY( btMax(-boxHalfExtent.getY(), closestPoint.getY()) );
+	closestPoint.setZ( btMin(boxHalfExtent.getZ(), closestPoint.getZ()) );
+	closestPoint.setZ( btMax(-boxHalfExtent.getZ(), closestPoint.getZ()) );
-	bounds[0] = -boxShape->getHalfExtentsWithoutMargin();
-	bounds[1] = boxShape->getHalfExtentsWithoutMargin();
-	margins = boxShape->getMargin();//also add sphereShape margin?
-	const btTransform&	m44T = boxObj->getWorldTransform();
-	btVector3	boundsVec[2];
-	btScalar	fPenetration;
-	boundsVec[0] = bounds[0];
-	boundsVec[1] = bounds[1];
-	btVector3	marginsVec( margins, margins, margins );
-	// add margins
-	bounds[0] += marginsVec;
-	bounds[1] -= marginsVec;
-	/////////////////////////////////////////////////
-	btVector3	tmp, prel, n[6], normal, v3P;
-	btScalar   fSep = btScalar(10000000.0), fSepThis;
+	btScalar intersectionDist = fRadius + boxMargin;
+	btScalar contactDist = intersectionDist + maxContactDistance;
+	normal = sphereRelPos - closestPoint;
-	n[0].setValue( btScalar(-1.0),  btScalar(0.0),  btScalar(0.0) );
-	n[1].setValue(  btScalar(0.0), btScalar(-1.0),  btScalar(0.0) );
-	n[2].setValue(  btScalar(0.0),  btScalar(0.0), btScalar(-1.0) );
-	n[3].setValue(  btScalar(1.0),  btScalar(0.0),  btScalar(0.0) );
-	n[4].setValue(  btScalar(0.0),  btScalar(1.0),  btScalar(0.0) );
-	n[5].setValue(  btScalar(0.0),  btScalar(0.0),  btScalar(1.0) );
-	// convert  point in local space
-	prel = m44T.invXform( sphereCenter);
-	bool	bFound = false;
+	//if the sphere is farther from the box than the contact distance, we are done
+	btScalar dist2 = normal.length2();
+	if (dist2 > contactDist * contactDist)
+	{
+		return false;
+	}
-	v3P = prel;
+	btScalar distance;
-	for (int i=0;i<6;i++)
+	//special case if the sphere center is inside the box
+	if (dist2 <= SIMD_EPSILON)
-		int j = i<3? 0:1;
-		if ( (fSepThis = ((v3P-bounds[j]) .dot(n[i]))) > btScalar(0.0) )
-		{
-			v3P = v3P - n[i]*fSepThis;		
-			bFound = true;
-		}
+		distance = -getSpherePenetration(boxHalfExtent, sphereRelPos, closestPoint, normal);
-	//
-	if ( bFound )
+	else //compute the penetration details
-		bounds[0] = boundsVec[0];
-		bounds[1] = boundsVec[1];
-		normal = (prel - v3P).normalize();
-		pointOnBox = v3P + normal*margins;
-		v3PointOnSphere = prel - normal*fRadius;
-		if ( ((v3PointOnSphere - pointOnBox) .dot (normal)) > btScalar(0.0) )
-		{
-			return btScalar(1.0);
-		}
-		// transform back in world space
-		tmp = m44T( pointOnBox);
-		pointOnBox    = tmp;
-		tmp  = m44T( v3PointOnSphere);		
-		v3PointOnSphere = tmp;
-		btScalar fSeps2 = (pointOnBox-v3PointOnSphere).length2();
-		//if this fails, fallback into deeper penetration case, below
-		if (fSeps2 > SIMD_EPSILON)
-		{
-			fSep = - btSqrt(fSeps2);
-			normal = (pointOnBox-v3PointOnSphere);
-			normal *= btScalar(1.)/fSep;
-		}
-		return fSep;
+		distance = normal.length();
+		normal /= distance;
-	//////////////////////////////////////////////////
-	// Deep penetration case
-	fPenetration = getSpherePenetration( boxObj,pointOnBox, v3PointOnSphere, sphereCenter, fRadius,bounds[0],bounds[1] );
+	pointOnBox = closestPoint + normal * boxMargin;
+//	v3PointOnSphere = sphereRelPos - (normal * fRadius);	
+	penetrationDepth = distance - intersectionDist;
-	bounds[0] = boundsVec[0];
-	bounds[1] = boundsVec[1];
-	if ( fPenetration <= btScalar(0.0) )
-		return (fPenetration-margins);
-	else
-		return btScalar(1.0);
+	// transform back in world space
+	btVector3 tmp = m44T(pointOnBox);
+	pointOnBox = tmp;
+//	tmp = m44T(v3PointOnSphere);
+//	v3PointOnSphere = tmp;
+	tmp = m44T.getBasis() * normal;
+	normal = tmp;
+	return true;
-btScalar btSphereBoxCollisionAlgorithm::getSpherePenetration( btCollisionObject* boxObj,btVector3& pointOnBox, btVector3& v3PointOnSphere, const btVector3& sphereCenter, btScalar fRadius, const btVector3& aabbMin, const btVector3& aabbMax) 
+btScalar btSphereBoxCollisionAlgorithm::getSpherePenetration( btVector3 const &boxHalfExtent, btVector3 const &sphereRelPos, btVector3 &closestPoint, btVector3& normal ) 
+	//project the center of the sphere on the closest face of the box
+	btScalar faceDist = boxHalfExtent.getX() - sphereRelPos.getX();
+	btScalar minDist = faceDist;
+	closestPoint.setX( boxHalfExtent.getX() );
+	normal.setValue(btScalar(1.0f),  btScalar(0.0f),  btScalar(0.0f));
+	faceDist = boxHalfExtent.getX() + sphereRelPos.getX();
+	if (faceDist < minDist)
+	{
+		minDist = faceDist;
+		closestPoint = sphereRelPos;
+		closestPoint.setX( -boxHalfExtent.getX() );
+		normal.setValue(btScalar(-1.0f),  btScalar(0.0f),  btScalar(0.0f));
+	}
-	btVector3 bounds[2];
-	bounds[0] = aabbMin;
-	bounds[1] = aabbMax;
-	btVector3	p0, tmp, prel, n[6], normal;
-	btScalar   fSep = btScalar(-10000000.0), fSepThis;
-	// set p0 and normal to a default value to shup up GCC
-	p0.setValue(btScalar(0.), btScalar(0.), btScalar(0.));
-	normal.setValue(btScalar(0.), btScalar(0.), btScalar(0.));
-	n[0].setValue( btScalar(-1.0),  btScalar(0.0),  btScalar(0.0) );
-	n[1].setValue(  btScalar(0.0), btScalar(-1.0),  btScalar(0.0) );
-	n[2].setValue(  btScalar(0.0),  btScalar(0.0), btScalar(-1.0) );
-	n[3].setValue(  btScalar(1.0),  btScalar(0.0),  btScalar(0.0) );
-	n[4].setValue(  btScalar(0.0),  btScalar(1.0),  btScalar(0.0) );
-	n[5].setValue(  btScalar(0.0),  btScalar(0.0),  btScalar(1.0) );
-	const btTransform&	m44T = boxObj->getWorldTransform();
-	// convert  point in local space
-	prel = m44T.invXform( sphereCenter);
-	///////////
-	for (int i=0;i<6;i++)
+	faceDist = boxHalfExtent.getY() - sphereRelPos.getY();
+	if (faceDist < minDist)
-		int j = i<3 ? 0:1;
-		if ( (fSepThis = ((prel-bounds[j]) .dot( n[i]))-fRadius) > btScalar(0.0) )	return btScalar(1.0);
-		if ( fSepThis > fSep )
-		{
-			p0 = bounds[j];	normal = (btVector3&)n[i];
-			fSep = fSepThis;
-		}
+		minDist = faceDist;
+		closestPoint = sphereRelPos;
+		closestPoint.setY( boxHalfExtent.getY() );
+		normal.setValue(btScalar(0.0f),  btScalar(1.0f),  btScalar(0.0f));
-	pointOnBox = prel - normal*(normal.dot((prel-p0)));
-	v3PointOnSphere = pointOnBox + normal*fSep;
+	faceDist = boxHalfExtent.getY() + sphereRelPos.getY();
+	if (faceDist < minDist)
+	{
+		minDist = faceDist;
+		closestPoint = sphereRelPos;
+		closestPoint.setY( -boxHalfExtent.getY() );
+		normal.setValue(btScalar(0.0f),  btScalar(-1.0f),  btScalar(0.0f));
+	}
-	// transform back in world space
-	tmp  = m44T( pointOnBox);		
-	pointOnBox    = tmp;
-	tmp  = m44T( v3PointOnSphere);		v3PointOnSphere = tmp;
-	normal = (pointOnBox-v3PointOnSphere).normalize();
+	faceDist = boxHalfExtent.getZ() - sphereRelPos.getZ();
+	if (faceDist < minDist)
+	{
+		minDist = faceDist;
+		closestPoint = sphereRelPos;
+		closestPoint.setZ( boxHalfExtent.getZ() );
+		normal.setValue(btScalar(0.0f),  btScalar(0.0f),  btScalar(1.0f));
+	}
-	return fSep;
+	faceDist = boxHalfExtent.getZ() + sphereRelPos.getZ();
+	if (faceDist < minDist)
+	{
+		minDist = faceDist;
+		closestPoint = sphereRelPos;
+		closestPoint.setZ( -boxHalfExtent.getZ() );
+		normal.setValue(btScalar(0.0f),  btScalar(0.0f),  btScalar(-1.0f));
+	}
+	return minDist;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h
index 60286ae0..eefaedc9 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h
@@ -34,11 +34,11 @@ class btSphereBoxCollisionAlgorithm : public btActivatingCollisionAlgorithm
-	btSphereBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped);
+	btSphereBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap, bool isSwapped);
 	virtual ~btSphereBoxCollisionAlgorithm();
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -50,21 +50,21 @@ public:
-	btScalar getSphereDistance( btCollisionObject* boxObj,btVector3& v3PointOnBox, btVector3& v3PointOnSphere, const btVector3& v3SphereCenter, btScalar fRadius );
+	bool getSphereDistance( const btCollisionObjectWrapper* boxObjWrap, btVector3& v3PointOnBox, btVector3& normal, btScalar& penetrationDepth, const btVector3& v3SphereCenter, btScalar fRadius, btScalar maxContactDistance );
-	btScalar getSpherePenetration( btCollisionObject* boxObj, btVector3& v3PointOnBox, btVector3& v3PointOnSphere, const btVector3& v3SphereCenter, btScalar fRadius, const btVector3& aabbMin, const btVector3& aabbMax);
+	btScalar getSpherePenetration( btVector3 const &boxHalfExtent, btVector3 const &sphereRelPos, btVector3 &closestPoint, btVector3& normal );
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSphereBoxCollisionAlgorithm));
 			if (!m_swapped)
-				return new(mem) btSphereBoxCollisionAlgorithm(0,ci,body0,body1,false);
+				return new(mem) btSphereBoxCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,false);
 			} else
-				return new(mem) btSphereBoxCollisionAlgorithm(0,ci,body0,body1,true);
+				return new(mem) btSphereBoxCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,true);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.cpp
index 5c4e78fe..36ba21f5 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.cpp
@@ -17,15 +17,16 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
-btSphereSphereCollisionAlgorithm::btSphereSphereCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1)
-: btActivatingCollisionAlgorithm(ci,col0,col1),
+btSphereSphereCollisionAlgorithm::btSphereSphereCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap)
+: btActivatingCollisionAlgorithm(ci,col0Wrap,col1Wrap),
 	if (!m_manifoldPtr)
-		m_manifoldPtr = m_dispatcher->getNewManifold(col0,col1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(col0Wrap->getCollisionObject(),col1Wrap->getCollisionObject());
 		m_ownManifold = true;
@@ -39,7 +40,7 @@ btSphereSphereCollisionAlgorithm::~btSphereSphereCollisionAlgorithm()
-void btSphereSphereCollisionAlgorithm::processCollision (btCollisionObject* col0,btCollisionObject* col1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btSphereSphereCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
@@ -48,10 +49,10 @@ void btSphereSphereCollisionAlgorithm::processCollision (btCollisionObject* col0
-	btSphereShape* sphere0 = (btSphereShape*)col0->getCollisionShape();
-	btSphereShape* sphere1 = (btSphereShape*)col1->getCollisionShape();
+	btSphereShape* sphere0 = (btSphereShape*)col0Wrap->getCollisionShape();
+	btSphereShape* sphere1 = (btSphereShape*)col1Wrap->getCollisionShape();
-	btVector3 diff = col0->getWorldTransform().getOrigin()-  col1->getWorldTransform().getOrigin();
+	btVector3 diff = col0Wrap->getWorldTransform().getOrigin()-  col1Wrap->getWorldTransform().getOrigin();
 	btScalar len = diff.length();
 	btScalar radius0 = sphere0->getRadius();
 	btScalar radius1 = sphere1->getRadius();
@@ -80,7 +81,7 @@ void btSphereSphereCollisionAlgorithm::processCollision (btCollisionObject* col0
 	///point on A (worldspace)
 	///btVector3 pos0 = col0->getWorldTransform().getOrigin() - radius0 * normalOnSurfaceB;
 	///point on B (worldspace)
-	btVector3 pos1 = col1->getWorldTransform().getOrigin() + radius1* normalOnSurfaceB;
+	btVector3 pos1 = col1Wrap->getWorldTransform().getOrigin() + radius1* normalOnSurfaceB;
 	/// report a contact. internally this will be kept persistent, and contact reduction is done
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h
index e55acf27..3517a568 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h
@@ -32,12 +32,12 @@ class btSphereSphereCollisionAlgorithm : public btActivatingCollisionAlgorithm
 	btPersistentManifold*	m_manifoldPtr;
-	btSphereSphereCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	btSphereSphereCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap);
 	btSphereSphereCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci)
 		: btActivatingCollisionAlgorithm(ci) {}
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -53,10 +53,10 @@ public:
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSphereSphereCollisionAlgorithm));
-			return new(mem) btSphereSphereCollisionAlgorithm(0,ci,body0,body1);
+			return new(mem) btSphereSphereCollisionAlgorithm(0,ci,col0Wrap,col1Wrap);
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.cpp b/src/bullet/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.cpp
index c327c3ff..280a4d35 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.cpp
@@ -19,17 +19,17 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "SphereTriangleDetector.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
-btSphereTriangleCollisionAlgorithm::btSphereTriangleCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1,bool swapped)
-: btActivatingCollisionAlgorithm(ci,col0,col1),
+btSphereTriangleCollisionAlgorithm::btSphereTriangleCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool swapped)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
 	if (!m_manifoldPtr)
-		m_manifoldPtr = m_dispatcher->getNewManifold(col0,col1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 		m_ownManifold = true;
@@ -43,16 +43,16 @@ btSphereTriangleCollisionAlgorithm::~btSphereTriangleCollisionAlgorithm()
-void btSphereTriangleCollisionAlgorithm::processCollision (btCollisionObject* col0,btCollisionObject* col1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btSphereTriangleCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 	if (!m_manifoldPtr)
-	btCollisionObject* sphereObj = m_swapped? col1 : col0;
-	btCollisionObject* triObj = m_swapped? col0 : col1;
+	const btCollisionObjectWrapper* sphereObjWrap = m_swapped? col1Wrap : col0Wrap;
+	const btCollisionObjectWrapper* triObjWrap = m_swapped? col0Wrap : col1Wrap;
-	btSphereShape* sphere = (btSphereShape*)sphereObj->getCollisionShape();
-	btTriangleShape* triangle = (btTriangleShape*)triObj->getCollisionShape();
+	btSphereShape* sphere = (btSphereShape*)sphereObjWrap->getCollisionShape();
+	btTriangleShape* triangle = (btTriangleShape*)triObjWrap->getCollisionShape();
 	/// report a contact. internally this will be kept persistent, and contact reduction is done
@@ -60,8 +60,8 @@ void btSphereTriangleCollisionAlgorithm::processCollision (btCollisionObject* co
 	btDiscreteCollisionDetectorInterface::ClosestPointInput input;
 	input.m_maximumDistanceSquared = btScalar(BT_LARGE_FLOAT);///@todo: tighter bounds
-	input.m_transformA = sphereObj->getWorldTransform();
-	input.m_transformB = triObj->getWorldTransform();
+	input.m_transformA = sphereObjWrap->getWorldTransform();
+	input.m_transformB = triObjWrap->getWorldTransform();
 	bool swapResults = m_swapped;
diff --git a/src/bullet/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h b/src/bullet/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h
index 7c6c4d8f..6b6e39a7 100644
--- a/src/bullet/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h
@@ -32,12 +32,12 @@ class btSphereTriangleCollisionAlgorithm : public btActivatingCollisionAlgorithm
 	bool	m_swapped;
-	btSphereTriangleCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool swapped);
+	btSphereTriangleCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool swapped);
 	btSphereTriangleCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci)
 		: btActivatingCollisionAlgorithm(ci) {}
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -54,12 +54,12 @@ public:
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSphereTriangleCollisionAlgorithm));
-			return new(mem) btSphereTriangleCollisionAlgorithm(ci.m_manifold,ci,body0,body1,m_swapped);
+			return new(mem) btSphereTriangleCollisionAlgorithm(ci.m_manifold,ci,body0Wrap,body1Wrap,m_swapped);
diff --git a/src/bullet/BulletCollision/CollisionShapes/btBox2dShape.h b/src/bullet/BulletCollision/CollisionShapes/btBox2dShape.h
index f4a9ca03..ce333783 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btBox2dShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btBox2dShape.h
@@ -23,7 +23,7 @@ subject to the following restrictions:
 #include "LinearMath/btMinMax.h"
 ///The btBox2dShape is a box primitive around the origin, its sides axis aligned with length specified by half extents, in local shape coordinates. When used as part of a btCollisionObject or btRigidBody it will be an oriented box in world space.
-class btBox2dShape: public btPolyhedralConvexShape
+ATTRIBUTE_ALIGNED16(class) btBox2dShape: public btPolyhedralConvexShape
 	//btVector3	m_boxHalfExtents1; //use m_implicitShapeDimensions instead
@@ -34,6 +34,8 @@ class btBox2dShape: public btPolyhedralConvexShape
 	btVector3 getHalfExtentsWithMargin() const
 		btVector3 halfExtents = getHalfExtentsWithoutMargin();
diff --git a/src/bullet/BulletCollision/CollisionShapes/btBoxShape.h b/src/bullet/BulletCollision/CollisionShapes/btBoxShape.h
index 0c5857da..715e3f2a 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btBoxShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btBoxShape.h
@@ -23,7 +23,7 @@ subject to the following restrictions:
 #include "LinearMath/btMinMax.h"
 ///The btBoxShape is a box primitive around the origin, its sides axis aligned with length specified by half extents, in local shape coordinates. When used as part of a btCollisionObject or btRigidBody it will be an oriented box in world space.
-class btBoxShape: public btPolyhedralConvexShape
+ATTRIBUTE_ALIGNED16(class) btBoxShape: public btPolyhedralConvexShape
 	//btVector3	m_boxHalfExtents1; //use m_implicitShapeDimensions instead
@@ -31,6 +31,8 @@ class btBoxShape: public btPolyhedralConvexShape
 	btVector3 getHalfExtentsWithMargin() const
 		btVector3 halfExtents = getHalfExtentsWithoutMargin();
diff --git a/src/bullet/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h b/src/bullet/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h
index d1c21629..1fa4995d 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h
@@ -21,7 +21,13 @@ subject to the following restrictions:
 #include "LinearMath/btAlignedAllocator.h"
 #include "btTriangleInfoMap.h"
-///The btBvhTriangleMeshShape is a static-triangle mesh shape with several optimizations, such as bounding volume hierarchy and cache friendly traversal for PlayStation 3 Cell SPU. It is recommended to enable useQuantizedAabbCompression for better memory usage.
+///The btBvhTriangleMeshShape is a static-triangle mesh shape, it can only be used for fixed/non-moving objects.
+///If you require moving concave triangle meshes, it is recommended to perform convex decomposition
+///using HACD, see Bullet/Demos/ConvexDecompositionDemo. 
+///Alternatively, you can use btGimpactMeshShape for moving concave triangle meshes.
+///btBvhTriangleMeshShape has several optimizations, such as bounding volume hierarchy and 
+///cache friendly traversal for PlayStation 3 Cell SPU. 
+///It is recommended to enable useQuantizedAabbCompression for better memory usage.
 ///It takes a triangle mesh as input, for example a btTriangleMesh or btTriangleIndexVertexArray. The btBvhTriangleMeshShape class allows for triangle mesh deformations by a refit or partialRefit method.
 ///Instead of building the bounding volume hierarchy acceleration structure, it is also possible to serialize (save) and deserialize (load) the structure from disk.
 ///See Demos\ConcaveDemo\ConcavePhysicsDemo.cpp for an example.
@@ -33,7 +39,11 @@ ATTRIBUTE_ALIGNED16(class) btBvhTriangleMeshShape : public btTriangleMeshShape
 	bool m_useQuantizedAabbCompression;
 	bool m_ownsBvh;
+#ifdef __clang__
+	bool m_pad[11] __attribute__((unused));////need padding due to alignment
 	bool m_pad[11];////need padding due to alignment
diff --git a/src/bullet/BulletCollision/CollisionShapes/btCapsuleShape.h b/src/bullet/BulletCollision/CollisionShapes/btCapsuleShape.h
index ab763abf..f8c55ace 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btCapsuleShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btCapsuleShape.h
@@ -23,7 +23,7 @@ subject to the following restrictions:
 ///The btCapsuleShape represents a capsule around the Y axis, there is also the btCapsuleShapeX aligned around the X axis and btCapsuleShapeZ around the Z axis.
 ///The total height is height+2*radius, so the height is just the height between the center of each 'sphere' of the capsule caps.
 ///The btCapsuleShape is a convex hull of two spheres. The btMultiSphereShape is a more general collision shape that takes the convex hull of multiple sphere, so it can also represent a capsule when just using two spheres.
-class btCapsuleShape : public btConvexInternalShape
+ATTRIBUTE_ALIGNED16(class) btCapsuleShape : public btConvexInternalShape
 	int	m_upAxis;
@@ -33,6 +33,9 @@ protected:
 	btCapsuleShape() : btConvexInternalShape() {m_shapeType = CAPSULE_SHAPE_PROXYTYPE;};
 	btCapsuleShape(btScalar radius,btScalar height);
 	///CollisionShape Interface
@@ -62,8 +65,8 @@ public:
 			halfExtents += btVector3(getMargin(),getMargin(),getMargin());
 			btMatrix3x3 abs_b = t.getBasis().absolute();  
 			btVector3 center = t.getOrigin();
-			btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));		  
+            btVector3 extent = halfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);
 			aabbMin = center - extent;
 			aabbMax = center + extent;
@@ -101,11 +104,20 @@ public:
+	virtual btVector3	getAnisotropicRollingFrictionDirection() const // returns a vector that is 1 on the capsule's up axis and 0 elsewhere
+	{
+		btVector3 aniDir(0,0,0); // start from the zero vector
+		aniDir[getUpAxis()]=1; // set only the component selected by getUpAxis()
+		return aniDir;
+	}
 	virtual	int	calculateSerializeBufferSize() const;
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
 	virtual	const char*	serialize(void* dataBuffer, btSerializer* serializer) const;
+	SIMD_FORCE_INLINE	void	deSerializeFloat(struct btCapsuleShapeData* dataBuffer);
@@ -170,4 +182,13 @@ SIMD_FORCE_INLINE	const char*	btCapsuleShape::serialize(void* dataBuffer, btSeri
 	return "btCapsuleShapeData";
+SIMD_FORCE_INLINE	void	btCapsuleShape::deSerializeFloat(btCapsuleShapeData* dataBuffer)
+	m_implicitShapeDimensions.deSerializeFloat(dataBuffer->m_convexInternalShapeData.m_implicitShapeDimensions);
+	m_collisionMargin = dataBuffer->m_convexInternalShapeData.m_collisionMargin;
+	m_localScaling.deSerializeFloat(dataBuffer->m_convexInternalShapeData.m_localScaling);
+	//it is best to already pre-allocate the matching btCapsuleShape*(X/Z) version to match m_upAxis
+	m_upAxis = dataBuffer->m_upAxis;
diff --git a/src/bullet/BulletCollision/CollisionShapes/btCollisionShape.h b/src/bullet/BulletCollision/CollisionShapes/btCollisionShape.h
index 865c1067..5e865680 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btCollisionShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btCollisionShape.h
@@ -24,15 +24,18 @@ class btSerializer;
 ///The btCollisionShape class provides an interface for collision shapes that can be shared among btCollisionObjects.
-class btCollisionShape
+ATTRIBUTE_ALIGNED16(class) btCollisionShape
 	int m_shapeType;
 	void* m_userPointer;
+	int m_userIndex;
-	btCollisionShape() : m_shapeType (INVALID_SHAPE_PROXYTYPE), m_userPointer(0)
+	btCollisionShape() : m_shapeType (INVALID_SHAPE_PROXYTYPE), m_userPointer(0), m_userIndex(-1)
@@ -107,6 +110,13 @@ public:
 	int		getShapeType() const { return m_shapeType; }
+	///getAnisotropicRollingFrictionDirection can be used in combination with setAnisotropicFriction.
+	///See Bullet/Demos/RollingFrictionDemo for an example.
+	virtual btVector3	getAnisotropicRollingFrictionDirection() const
+	{
+		return btVector3(1,1,1); // base implementation returns (1,1,1); virtual, so derived shapes can return their own direction
+	}
 	virtual void	setMargin(btScalar margin) = 0;
 	virtual btScalar	getMargin() const = 0;
@@ -121,6 +131,16 @@ public:
 		return m_userPointer;
+	void setUserIndex(int index) // stores the given integer in m_userIndex
+	{
+		m_userIndex = index;
+	}
+	int getUserIndex() const // returns m_userIndex, which the btCollisionShape constructor initialises to -1
+	{
+		return m_userIndex;
+	}
 	virtual	int	calculateSerializeBufferSize() const;
diff --git a/src/bullet/BulletCollision/CollisionShapes/btCompoundShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btCompoundShape.cpp
index 4eb860c5..e8c8c336 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btCompoundShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btCompoundShape.cpp
@@ -18,7 +18,7 @@ subject to the following restrictions:
 #include "BulletCollision/BroadphaseCollision/btDbvt.h"
 #include "LinearMath/btSerializer.h"
-btCompoundShape::btCompoundShape(bool enableDynamicAabbTree)
+btCompoundShape::btCompoundShape(bool enableDynamicAabbTree, const int initialChildCapacity)
 : m_localAabbMin(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT)),
@@ -34,6 +34,8 @@ m_localScaling(btScalar(1.),btScalar(1.),btScalar(1.))
 		m_dynamicAabbTree = new(mem) btDbvt();
+	m_children.reserve(initialChildCapacity);
@@ -77,8 +79,8 @@ void	btCompoundShape::addChildShape(const btTransform& localTransform,btCollisio
 	if (m_dynamicAabbTree)
 		const btDbvtVolume	bounds=btDbvtVolume::FromMM(localAabbMin,localAabbMax);
-		int index = m_children.size();
-		child.m_node = m_dynamicAabbTree->insert(bounds,(void*)index);
+		size_t index = m_children.size();
+		child.m_node = m_dynamicAabbTree->insert(bounds,reinterpret_cast<void*>(index) );
@@ -182,9 +184,7 @@ void btCompoundShape::getAabb(const btTransform& trans,btVector3& aabbMin,btVect
 	btVector3 center = trans(localCenter);
-	btVector3 extent = btVector3(abs_b[0].dot(localHalfExtents),
-		abs_b[1].dot(localHalfExtents),
-		abs_b[2].dot(localHalfExtents));
+    btVector3 extent = localHalfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);
 	aabbMin = center-extent;
 	aabbMax = center+extent;
@@ -275,6 +275,8 @@ void btCompoundShape::calculatePrincipalAxisTransform(btScalar* masses, btTransf
 void btCompoundShape::setLocalScaling(const btVector3& scaling)
@@ -285,7 +287,7 @@ void btCompoundShape::setLocalScaling(const btVector3& scaling)
 //		childScale = childScale * (childTrans.getBasis() * scaling);
 		childScale = childScale * scaling / m_localScaling;
-		childTrans.setOrigin((childTrans.getOrigin())*scaling);
+		childTrans.setOrigin((childTrans.getOrigin()) * scaling / m_localScaling);
 		updateChildTransform(i, childTrans,false);
@@ -312,7 +314,8 @@ void btCompoundShape::createAabbTreeFromChildren()
             const btDbvtVolume  bounds=btDbvtVolume::FromMM(localAabbMin,localAabbMax);
-            child.m_node = m_dynamicAabbTree->insert(bounds,(void*)index);
+			size_t index2 = index;
+            child.m_node = m_dynamicAabbTree->insert(bounds, reinterpret_cast<void*>(index2) );
diff --git a/src/bullet/BulletCollision/CollisionShapes/btCompoundShape.h b/src/bullet/BulletCollision/CollisionShapes/btCompoundShape.h
index 141034a8..4eef8dba 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btCompoundShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btCompoundShape.h
@@ -53,6 +53,7 @@ SIMD_FORCE_INLINE bool operator==(const btCompoundShapeChild& c1, const btCompou
 /// Currently, removal of child shapes is only supported when disabling the aabb tree (pass 'false' in the constructor of btCompoundShape)
 ATTRIBUTE_ALIGNED16(class) btCompoundShape	: public btCollisionShape
 	btAlignedObjectArray<btCompoundShapeChild> m_children;
 	btVector3						m_localAabbMin;
 	btVector3						m_localAabbMax;
@@ -64,13 +65,12 @@ ATTRIBUTE_ALIGNED16(class) btCompoundShape	: public btCollisionShape
 	btScalar	m_collisionMargin;
 	btVector3	m_localScaling;
-	btCompoundShape(bool enableDynamicAabbTree = true);
+	explicit btCompoundShape(bool enableDynamicAabbTree = true, const int initialChildCapacity = 0);
 	virtual ~btCompoundShape();
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConcaveShape.h b/src/bullet/BulletCollision/CollisionShapes/btConcaveShape.h
index 2a03241c..2917cc5b 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConcaveShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btConcaveShape.h
@@ -33,12 +33,14 @@ typedef enum PHY_ScalarType {
 ///The btConcaveShape class provides an interface for non-moving (static) concave shapes.
 ///It has been implemented by the btStaticPlaneShape, btBvhTriangleMeshShape and btHeightfieldTerrainShape.
-class btConcaveShape : public btCollisionShape
+ATTRIBUTE_ALIGNED16(class) btConcaveShape : public btCollisionShape
 	btScalar m_collisionMargin;
 	virtual ~btConcaveShape();
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConeShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btConeShape.cpp
index 5e83087b..2d83c8bf 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConeShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btConeShape.cpp
@@ -62,6 +62,10 @@ void	btConeShape::setConeUpIndex(int upIndex)
+	m_implicitShapeDimensions[m_coneIndices[0]] = m_radius;
+	m_implicitShapeDimensions[m_coneIndices[1]] = m_height;
+	m_implicitShapeDimensions[m_coneIndices[2]] = m_radius;
 btVector3 btConeShape::coneLocalSupport(const btVector3& v) const
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConeShape.h b/src/bullet/BulletCollision/CollisionShapes/btConeShape.h
index b69b5c5b..46d78d14 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConeShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btConeShape.h
@@ -20,7 +20,7 @@ subject to the following restrictions:
 #include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h" // for the types
 ///The btConeShape implements a cone shape primitive, centered around the origin and aligned with the Y axis. The btConeShapeX is aligned around the X axis and btConeShapeZ around the Z axis.
-class btConeShape : public btConvexInternalShape
+ATTRIBUTE_ALIGNED16(class) btConeShape : public btConvexInternalShape
@@ -32,6 +32,8 @@ class btConeShape : public btConvexInternalShape
 	btConeShape (btScalar radius,btScalar height);
 	virtual btVector3	localGetSupportingVertex(const btVector3& vec) const;
@@ -41,6 +43,15 @@ public:
 	btScalar getRadius() const { return m_radius;}
 	btScalar getHeight() const { return m_height;}
+	void setRadius(const btScalar radius) // overwrites the stored cone radius
+	{
+		m_radius = radius; // NOTE(review): only m_radius is assigned; m_implicitShapeDimensions (which appears to be filled from m_radius/m_height in setConeUpIndex) is not refreshed here - confirm callers do not rely on it
+	}
+	void setHeight(const btScalar height) // overwrites the stored cone height
+	{
+		m_height = height; // NOTE(review): only m_height is assigned; m_implicitShapeDimensions (which appears to be filled from m_radius/m_height in setConeUpIndex) is not refreshed here - confirm callers do not rely on it
+	}
 	virtual void	calculateLocalInertia(btScalar mass,btVector3& inertia) const
@@ -82,7 +93,19 @@ public:
 			return m_coneIndices[1];
+	virtual btVector3	getAnisotropicRollingFrictionDirection() const // always returns the Y axis
+	{
+		return btVector3 (0,1,0); // constant (0,1,0); no member state is read here
+	}
 	virtual void	setLocalScaling(const btVector3& scaling);
+	virtual	int	calculateSerializeBufferSize() const;
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+	virtual	const char*	serialize(void* dataBuffer, btSerializer* serializer) const;
@@ -91,13 +114,67 @@ class btConeShapeX : public btConeShape
 		btConeShapeX(btScalar radius,btScalar height);
+	virtual btVector3	getAnisotropicRollingFrictionDirection() const // always returns the X axis
+	{
+		return btVector3 (1,0,0); // constant (1,0,0); no member state is read here
+	}
+	//debugging: returns the fixed name string "ConeX"
+	virtual const char*	getName()const
+	{
+		return "ConeX";
+	}
 ///btConeShapeZ implements a Cone shape, around the Z axis
 class btConeShapeZ : public btConeShape
-	public:
-		btConeShapeZ(btScalar radius,btScalar height);
+	btConeShapeZ(btScalar radius,btScalar height);
+	virtual btVector3	getAnisotropicRollingFrictionDirection() const // always returns the Z axis
+	{
+		return btVector3 (0,0,1); // constant (0,0,1); no member state is read here
+	}
+	//debugging: returns the fixed name string "ConeZ"
+	virtual const char*	getName()const
+	{
+		return "ConeZ";
+	}
+///do not change these serialization structures: any change requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	btConeShapeData
+	btConvexInternalShapeData	m_convexInternalShapeData;
+	int	m_upIndex;
+	char	m_padding[4];
+SIMD_FORCE_INLINE	int	btConeShape::calculateSerializeBufferSize() const
+	return sizeof(btConeShapeData);
+///fills the dataBuffer and returns the struct name (and 0 on failure)
+SIMD_FORCE_INLINE	const char*	btConeShape::serialize(void* dataBuffer, btSerializer* serializer) const
+	btConeShapeData* shapeData = (btConeShapeData*) dataBuffer;
+	btConvexInternalShape::serialize(&shapeData->m_convexInternalShapeData,serializer);
+	shapeData->m_upIndex = m_coneIndices[1];
+	return "btConeShapeData";
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConvex2dShape.h b/src/bullet/BulletCollision/CollisionShapes/btConvex2dShape.h
index caf54329..bbd1caf4 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConvex2dShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btConvex2dShape.h
@@ -21,12 +21,14 @@ subject to the following restrictions:
 ///The btConvex2dShape allows to use arbitrary convex shapes as 2d convex shapes, with the Z component assumed to be 0.
 ///For 2d boxes, the btBox2dShape is recommended.
-class btConvex2dShape : public btConvexShape
+ATTRIBUTE_ALIGNED16(class) btConvex2dShape : public btConvexShape
 	btConvexShape*	m_childConvexShape;
 	btConvex2dShape(	btConvexShape* convexChildShape);
 	virtual ~btConvex2dShape();
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConvexHullShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btConvexHullShape.cpp
index 22624597..c1aa6ca4 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConvexHullShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btConvexHullShape.cpp
@@ -13,11 +13,17 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
+#if defined (_WIN32) || defined (__i386__)
+#define BT_USE_SSE_IN_API
 #include "btConvexHullShape.h"
 #include "BulletCollision/CollisionShapes/btCollisionMargin.h"
 #include "LinearMath/btQuaternion.h"
 #include "LinearMath/btSerializer.h"
+#include "btConvexPolyhedron.h"
+#include "LinearMath/btConvexHullComputer.h"
 btConvexHullShape ::btConvexHullShape (const btScalar* points,int numPoints,int stride) : btPolyhedralConvexAabbCachingShape ()
@@ -45,30 +51,28 @@ void btConvexHullShape::setLocalScaling(const btVector3& scaling)
-void btConvexHullShape::addPoint(const btVector3& point)
+void btConvexHullShape::addPoint(const btVector3& point, bool recalculateLocalAabb)
-	recalcLocalAabb();
+	if (recalculateLocalAabb)
+		recalcLocalAabb();
 btVector3	btConvexHullShape::localGetSupportingVertexWithoutMargin(const btVector3& vec)const
 	btVector3 supVec(btScalar(0.),btScalar(0.),btScalar(0.));
-	btScalar newDot,maxDot = btScalar(-BT_LARGE_FLOAT);
+	btScalar maxDot = btScalar(-BT_LARGE_FLOAT);
-	for (int i=0;i<m_unscaledPoints.size();i++)
-	{
-		btVector3 vtx = m_unscaledPoints[i] * m_localScaling;
+    // Here we take advantage of dot(a, b*c) = dot(a*b, c).  Note: This is true mathematically, but not numerically. 
+    if( 0 < m_unscaledPoints.size() )
+    {
+        btVector3 scaled = vec * m_localScaling;
+        int index = (int) scaled.maxDot( &m_unscaledPoints[0], m_unscaledPoints.size(), maxDot); // FIXME: may violate encapsulation of m_unscaledPoints
+        return m_unscaledPoints[index] * m_localScaling;
+    }
-		newDot = vec.dot(vtx);
-		if (newDot > maxDot)
-		{
-			maxDot = newDot;
-			supVec = vtx;
-		}
-	}
-	return supVec;
+    return supVec;
 void	btConvexHullShape::batchedUnitVectorGetSupportingVertexWithoutMargin(const btVector3* vectors,btVector3* supportVerticesOut,int numVectors) const
@@ -81,23 +85,19 @@ void	btConvexHullShape::batchedUnitVectorGetSupportingVertexWithoutMargin(const
 			supportVerticesOut[i][3] = btScalar(-BT_LARGE_FLOAT);
-	for (int i=0;i<m_unscaledPoints.size();i++)
-	{
-		btVector3 vtx = getScaledPoint(i);
-		for (int j=0;j<numVectors;j++)
-		{
-			const btVector3& vec = vectors[j];
-			newDot = vec.dot(vtx);
-			if (newDot > supportVerticesOut[j][3])
-			{
-				//WARNING: don't swap next lines, the w component would get overwritten!
-				supportVerticesOut[j] = vtx;
-				supportVerticesOut[j][3] = newDot;
-			}
-		}
-	}
+    for (int j=0;j<numVectors;j++)
+    {
+        btVector3 vec = vectors[j] * m_localScaling;        // dot(a*b,c) = dot(a,b*c)
+        if( 0 <  m_unscaledPoints.size() )
+        {
+            int i = (int) vec.maxDot( &m_unscaledPoints[0], m_unscaledPoints.size(), newDot);
+            supportVerticesOut[j] = getScaledPoint(i);
+            supportVerticesOut[j][3] = newDot;        
+        }
+        else
+            supportVerticesOut[j][3] = -BT_LARGE_FLOAT;
+    }
@@ -123,10 +123,17 @@ btVector3	btConvexHullShape::localGetSupportingVertex(const btVector3& vec)const
+void btConvexHullShape::optimizeConvexHull()
+	btConvexHullComputer conv;
+	conv.compute(&m_unscaledPoints[0].getX(), sizeof(btVector3),m_unscaledPoints.size(),0.f,0.f);
+	int numVerts = conv.vertices.size();
+	m_unscaledPoints.resize(0);
+	for (int i=0;i<numVerts;i++)
+    {
+        m_unscaledPoints.push_back(conv.vertices[i]);
+    }
@@ -208,13 +215,11 @@ const char*	btConvexHullShape::serialize(void* dataBuffer, btSerializer* seriali
 	return "btConvexHullShapeData";
-void btConvexHullShape::project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max) const
+void btConvexHullShape::project(const btTransform& trans, const btVector3& dir, btScalar& minProj, btScalar& maxProj, btVector3& witnesPtMin,btVector3& witnesPtMax) const
 #if 1
-	min = FLT_MAX;
-	max = -FLT_MAX;
-	btVector3 witnesPtMin;
-	btVector3 witnesPtMax;
+	minProj = FLT_MAX;
+	maxProj = -FLT_MAX;
 	int numVerts = m_unscaledPoints.size();
 	for(int i=0;i<numVerts;i++)
@@ -222,31 +227,30 @@ void btConvexHullShape::project(const btTransform& trans, const btVector3& dir,
 		btVector3 vtx = m_unscaledPoints[i] * m_localScaling;
 		btVector3 pt = trans * vtx;
 		btScalar dp = pt.dot(dir);
-		if(dp < min)	
+		if(dp < minProj)	
-			min = dp;
+			minProj = dp;
 			witnesPtMin = pt;
-		if(dp > max)	
+		if(dp > maxProj)	
-			max = dp;
+			maxProj = dp;
 	btVector3 localAxis = dir*trans.getBasis();
-	btVector3 vtx1 = trans(localGetSupportingVertex(localAxis));
-	btVector3 vtx2 = trans(localGetSupportingVertex(-localAxis));
+	witnesPtMin  = trans(localGetSupportingVertex(localAxis));
+	witnesPtMax = trans(localGetSupportingVertex(-localAxis));
-	min = vtx1.dot(dir);
-	max = vtx2.dot(dir);
+	minProj = witnesPtMin.dot(dir);
+	maxProj = witnesPtMax.dot(dir);
-	if(min>max)
+	if(minProj>maxProj)
-		btScalar tmp = min;
-		min = max;
-		max = tmp;
+		btSwap(minProj,maxProj);
+		btSwap(witnesPtMin,witnesPtMax);
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConvexHullShape.h b/src/bullet/BulletCollision/CollisionShapes/btConvexHullShape.h
index 95a2af6a..0c12aeef 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConvexHullShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btConvexHullShape.h
@@ -36,7 +36,7 @@ public:
 	///btConvexHullShape make an internal copy of the points.
 	btConvexHullShape(const btScalar* points=0,int numPoints=0, int stride=sizeof(btVector3));
-	void addPoint(const btVector3& point);
+	void addPoint(const btVector3& point, bool recalculateLocalAabb = true);
 	btVector3* getUnscaledPoints()
@@ -55,9 +55,8 @@ public:
 		return getUnscaledPoints();
+    void optimizeConvexHull();
 	SIMD_FORCE_INLINE	btVector3 getScaledPoint(int i) const
 		return m_unscaledPoints[i] * m_localScaling;
@@ -73,7 +72,7 @@ public:
 	virtual void	batchedUnitVectorGetSupportingVertexWithoutMargin(const btVector3* vectors,btVector3* supportVerticesOut,int numVectors) const;
-	virtual void project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max) const;
+	virtual void project(const btTransform& trans, const btVector3& dir, btScalar& minProj, btScalar& maxProj, btVector3& witnesPtMin,btVector3& witnesPtMax) const;
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConvexInternalShape.h b/src/bullet/BulletCollision/CollisionShapes/btConvexInternalShape.h
index 85cd9ef9..37e04f5f 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConvexInternalShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btConvexInternalShape.h
@@ -26,7 +26,7 @@ subject to the following restrictions:
 ///Note that when creating small shapes (derived from btConvexInternalShape), 
 ///you need to make sure to set a smaller collision margin, using the 'setMargin' API
 ///There is a automatic mechanism 'setSafeMargin' used by btBoxShape and btCylinderShape
-class btConvexInternalShape : public btConvexShape
+ATTRIBUTE_ALIGNED16(class) btConvexInternalShape : public btConvexShape
@@ -44,7 +44,7 @@ class btConvexInternalShape : public btConvexShape
 	virtual ~btConvexInternalShape()
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConvexPointCloudShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btConvexPointCloudShape.cpp
index c1b155ae..ad1d1bf7 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConvexPointCloudShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btConvexPointCloudShape.cpp
@@ -28,7 +28,7 @@ void btConvexPointCloudShape::setLocalScaling(const btVector3& scaling)
 btVector3	btConvexPointCloudShape::localGetSupportingVertexWithoutMargin(const btVector3& vec0)const
 	btVector3 supVec(btScalar(0.),btScalar(0.),btScalar(0.));
-	btScalar newDot,maxDot = btScalar(-BT_LARGE_FLOAT);
+	btScalar maxDot = btScalar(-BT_LARGE_FLOAT);
 	btVector3 vec = vec0;
 	btScalar lenSqr = vec.length2();
@@ -40,51 +40,33 @@ btVector3	btConvexPointCloudShape::localGetSupportingVertexWithoutMargin(const b
 		btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
 		vec *= rlen;
+    if( m_numPoints > 0 )
+    {
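+        // Here we take advantage of dot(a*b, c) = dot( a, b*c) to do less work. Note this transformation is true mathematically, not numerically. NOTE(review): the scaled direction on the next line is commented out, so the search below uses the unscaled 'vec' against unscaled points; with non-uniform m_localScaling this may select a different index than comparing against scaled points - confirm this is intended.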
+    //    btVector3 scaled = vec * m_localScaling;
+        int index = (int) vec.maxDot( &m_unscaledPoints[0], m_numPoints, maxDot);   //FIXME: may violate encapsulation of m_unscaledPoints
+        return getScaledPoint(index);
+    }
-	for (int i=0;i<m_numPoints;i++)
-	{
-		btVector3 vtx = getScaledPoint(i);
-		newDot = vec.dot(vtx);
-		if (newDot > maxDot)
-		{
-			maxDot = newDot;
-			supVec = vtx;
-		}
-	}
 	return supVec;
 void	btConvexPointCloudShape::batchedUnitVectorGetSupportingVertexWithoutMargin(const btVector3* vectors,btVector3* supportVerticesOut,int numVectors) const
-	btScalar newDot;
-	//use 'w' component of supportVerticesOut?
-	{
-		for (int i=0;i<numVectors;i++)
-		{
-			supportVerticesOut[i][3] = btScalar(-BT_LARGE_FLOAT);
-		}
-	}
-	for (int i=0;i<m_numPoints;i++)
-	{
-		btVector3 vtx = getScaledPoint(i);
-		for (int j=0;j<numVectors;j++)
-		{
-			const btVector3& vec = vectors[j];
-			newDot = vec.dot(vtx);
-			if (newDot > supportVerticesOut[j][3])
-			{
-				//WARNING: don't swap next lines, the w component would get overwritten!
-				supportVerticesOut[j] = vtx;
-				supportVerticesOut[j][3] = newDot;
-			}
-		}
-	}
+    for( int j = 0; j < numVectors; j++ )
+    {
+        const btVector3& vec = vectors[j] * m_localScaling;  // dot( a*c, b) = dot(a, b*c)
+        btScalar maxDot;
+        int index = (int) vec.maxDot( &m_unscaledPoints[0], m_numPoints, maxDot);
+        supportVerticesOut[j][3] = btScalar(-BT_LARGE_FLOAT);
+        if( 0 <= index )
+        {
+            //WARNING: don't swap next lines, the w component would get overwritten!
+            supportVerticesOut[j] = getScaledPoint(index);
+            supportVerticesOut[j][3] = maxDot;
+        }
+    }
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConvexPolyhedron.cpp b/src/bullet/BulletCollision/CollisionShapes/btConvexPolyhedron.cpp
index 1e26be53..4f45319a 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConvexPolyhedron.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btConvexPolyhedron.cpp
@@ -21,6 +21,7 @@ subject to the following restrictions:
 #include "btConvexPolyhedron.h"
 #include "LinearMath/btHashMap.h"
@@ -33,7 +34,7 @@ btConvexPolyhedron::~btConvexPolyhedron()
 inline bool IsAlmostZero(const btVector3& v)
-	if(fabsf(v.x())>1e-6 || fabsf(v.y())>1e-6 || fabsf(v.z())>1e-6)	return false;
+	if(btFabs(v.x())>1e-6 || btFabs(v.y())>1e-6 || btFabs(v.z())>1e-6)	return false;
 	return true;
@@ -274,23 +275,29 @@ void	btConvexPolyhedron::initialize()
-void btConvexPolyhedron::project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max) const
+void btConvexPolyhedron::project(const btTransform& trans, const btVector3& dir, btScalar& minProj, btScalar& maxProj, btVector3& witnesPtMin,btVector3& witnesPtMax) const
-	min = FLT_MAX;
-	max = -FLT_MAX;
+	minProj = FLT_MAX;
+	maxProj = -FLT_MAX;
 	int numVerts = m_vertices.size();
 	for(int i=0;i<numVerts;i++)
 		btVector3 pt = trans * m_vertices[i];
 		btScalar dp = pt.dot(dir);
-		if(dp < min)	min = dp;
-		if(dp > max)	max = dp;
+		if(dp < minProj)
+		{
+			minProj = dp;
+			witnesPtMin = pt;
+		}
+		if(dp > maxProj)
+		{
+			maxProj = dp;
+			witnesPtMax = pt;
+		}
-	if(min>max)
+	if(minProj>maxProj)
-		btScalar tmp = min;
-		min = max;
-		max = tmp;
+		btSwap(minProj,maxProj);
+		btSwap(witnesPtMin,witnesPtMax);
\ No newline at end of file
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConvexPolyhedron.h b/src/bullet/BulletCollision/CollisionShapes/btConvexPolyhedron.h
index 08db39a3..d3cd066a 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConvexPolyhedron.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btConvexPolyhedron.h
@@ -34,9 +34,12 @@ struct btFace
-class btConvexPolyhedron
+ATTRIBUTE_ALIGNED16(class) btConvexPolyhedron
 	virtual	~btConvexPolyhedron();
@@ -53,7 +56,7 @@ class btConvexPolyhedron
 	void	initialize();
 	bool testContainment() const;
-	void project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max) const;
+	void project(const btTransform& trans, const btVector3& dir, btScalar& minProj, btScalar& maxProj, btVector3& witnesPtMin,btVector3& witnesPtMax) const;
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConvexShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btConvexShape.cpp
index 8c67d8eb..b56d7291 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConvexShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btConvexShape.cpp
@@ -13,10 +13,15 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
+#if defined (_WIN32) || defined (__i386__)
+#define BT_USE_SSE_IN_API
 #include "btConvexShape.h"
 #include "btTriangleShape.h"
 #include "btSphereShape.h"
 #include "btCylinderShape.h"
+#include "btConeShape.h"
 #include "btCapsuleShape.h"
 #include "btConvexHullShape.h"
 #include "btConvexPointCloudShape.h"
@@ -43,7 +48,7 @@ btConvexShape::~btConvexShape()
-void btConvexShape::project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max) const
+void btConvexShape::project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max, btVector3& witnesPtMin,btVector3& witnesPtMax) const
 	btVector3 localAxis = dir*trans.getBasis();
 	btVector3 vtx1 = trans(localGetSupportingVertex(localAxis));
@@ -51,12 +56,16 @@ void btConvexShape::project(const btTransform& trans, const btVector3& dir, btSc
 	min = vtx1.dot(dir);
 	max = vtx2.dot(dir);
+	witnesPtMax = vtx2;
+	witnesPtMin = vtx1;
 		btScalar tmp = min;
 		min = max;
 		max = tmp;
+		witnesPtMax = vtx1;
+		witnesPtMin = vtx2;
@@ -109,19 +118,8 @@ static btVector3 convexHullSupport (const btVector3& localDirOrg, const btVector
 	return supVec;
-	btScalar newDot,maxDot = btScalar(-BT_LARGE_FLOAT);
-	int ptIndex = -1;
-	for (int i=0;i<numPoints;i++)
-	{
-		newDot = vec.dot(points[i]);
-		if (newDot > maxDot)
-		{
-			maxDot = newDot;
-			ptIndex = i;
-		}
-	}
+    btScalar maxDot;
+    long ptIndex = vec.maxDot( points, numPoints, maxDot);
 	btAssert(ptIndex >= 0);
 	btVector3 supVec = points[ptIndex] * localScaling;
 	return supVec;
@@ -141,16 +139,26 @@ btVector3 btConvexShape::localGetSupportVertexWithoutMarginNonVirtual (const btV
 		btBoxShape* convexShape = (btBoxShape*)this;
 		const btVector3& halfExtents = convexShape->getImplicitShapeDimensions();
+#if defined( __APPLE__ ) && (defined( BT_USE_SSE )||defined( BT_USE_NEON ))
+    #if defined( BT_USE_SSE )
+            return btVector3( _mm_xor_ps( _mm_and_ps( localDir.mVec128, (__m128){-0.0f, -0.0f, -0.0f, -0.0f }), halfExtents.mVec128 ));
+    #elif defined( BT_USE_NEON )
+            return btVector3( (float32x4_t) (((uint32x4_t) localDir.mVec128 & (uint32x4_t){ 0x80000000, 0x80000000, 0x80000000, 0x80000000}) ^ (uint32x4_t) halfExtents.mVec128 ));
+    #else
+        #error unknown vector arch
+    #endif
 		return btVector3(btFsels(localDir.x(), halfExtents.x(), -halfExtents.x()),
 			btFsels(localDir.y(), halfExtents.y(), -halfExtents.y()),
 			btFsels(localDir.z(), halfExtents.z(), -halfExtents.z()));
 		btTriangleShape* triangleShape = (btTriangleShape*)this;
 		btVector3 dir(localDir.getX(),localDir.getY(),localDir.getZ());
 		btVector3* vertices = &triangleShape->m_vertices1[0];
-		btVector3 dots(dir.dot(vertices[0]), dir.dot(vertices[1]), dir.dot(vertices[2]));
+        btVector3 dots = dir.dot3(vertices[0], vertices[1], vertices[2]);
 		btVector3 sup = vertices[dots.maxAxis()];
 		return btVector3(sup.getX(),sup.getY(),sup.getZ());
@@ -333,6 +341,11 @@ btScalar btConvexShape::getMarginNonVirtual () const
 		btCylinderShape* cylShape = (btCylinderShape*)this;
 		return cylShape->getMarginNV();
+	{
+		btConeShape* conShape = (btConeShape*)this;
+		return conShape->getMarginNV();
+	}
 		btCapsuleShape* capsuleShape = (btCapsuleShape*)this;
@@ -383,8 +396,8 @@ void btConvexShape::getAabbNonVirtual (const btTransform& t, btVector3& aabbMin,
 		halfExtents += btVector3(margin,margin,margin);
 		btMatrix3x3 abs_b = t.getBasis().absolute();  
 		btVector3 center = t.getOrigin();
-		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));
+        btVector3 extent = halfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);    
 		aabbMin = center - extent;
 		aabbMax = center + extent;
@@ -417,7 +430,7 @@ void btConvexShape::getAabbNonVirtual (const btTransform& t, btVector3& aabbMin,
 		halfExtents += btVector3(capsuleShape->getMarginNonVirtual(),capsuleShape->getMarginNonVirtual(),capsuleShape->getMarginNonVirtual());
 		btMatrix3x3 abs_b = t.getBasis().absolute();  
 		btVector3 center = t.getOrigin();
-		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));		  	
+        btVector3 extent = halfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);    
 		aabbMin = center - extent;
 		aabbMax = center + extent;
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConvexShape.h b/src/bullet/BulletCollision/CollisionShapes/btConvexShape.h
index 290cd9fd..875f2ac1 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConvexShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btConvexShape.h
@@ -52,7 +52,8 @@ public:
 	btScalar getMarginNonVirtual () const;
 	void getAabbNonVirtual (const btTransform& t, btVector3& aabbMin, btVector3& aabbMax) const;
-	virtual void project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max) const;
+	virtual void project(const btTransform& trans, const btVector3& dir, btScalar& minProj, btScalar& maxProj, btVector3& witnesPtMin,btVector3& witnesPtMax) const;
 	//notice that the vectors should be unit length
diff --git a/src/bullet/BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h b/src/bullet/BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h
index af5d0038..f338865c 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h
@@ -22,12 +22,14 @@ subject to the following restrictions:
 /// The btConvexTriangleMeshShape is a convex hull of a triangle mesh, but the performance is not as good as btConvexHullShape.
 /// A small benefit of this class is that it uses the btStridingMeshInterface, so you can avoid the duplication of the triangle mesh data. Nevertheless, most users should use the much better performing btConvexHullShape instead.
-class btConvexTriangleMeshShape : public btPolyhedralConvexAabbCachingShape
+ATTRIBUTE_ALIGNED16(class) btConvexTriangleMeshShape : public btPolyhedralConvexAabbCachingShape
 	class btStridingMeshInterface*	m_stridingMesh;
 	btConvexTriangleMeshShape(btStridingMeshInterface* meshInterface, bool calcAabb = true);
 	class btStridingMeshInterface*	getMeshInterface()
diff --git a/src/bullet/BulletCollision/CollisionShapes/btCylinderShape.h b/src/bullet/BulletCollision/CollisionShapes/btCylinderShape.h
index 125bfc78..6f796950 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btCylinderShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btCylinderShape.h
@@ -21,7 +21,7 @@ subject to the following restrictions:
 #include "LinearMath/btVector3.h"
 /// The btCylinderShape class implements a cylinder shape primitive, centered around the origin. Its central axis aligned with the Y axis. btCylinderShapeX is aligned with the X axis and btCylinderShapeZ around the Z axis.
-class btCylinderShape : public btConvexInternalShape
+ATTRIBUTE_ALIGNED16(class) btCylinderShape : public btConvexInternalShape
@@ -31,6 +31,8 @@ protected:
 	btVector3 getHalfExtentsWithMargin() const
 		btVector3 halfExtents = getHalfExtentsWithoutMargin();
@@ -95,6 +97,13 @@ public:
 		return m_upAxis;
+	virtual btVector3	getAnisotropicRollingFrictionDirection() const // returns a vector with 1 on the up axis and 0 elsewhere
+	{
+		btVector3 aniDir(0,0,0);
+		aniDir[getUpAxis()]=1; // only the component selected by getUpAxis() is set; the other two stay 0
+		return aniDir;
+	}
 	virtual btScalar getRadius() const
 		return getHalfExtentsWithMargin().getX();
@@ -128,6 +137,8 @@ public:
 class btCylinderShapeX : public btCylinderShape
 	btCylinderShapeX (const btVector3& halfExtents);
 	virtual btVector3	localGetSupportingVertexWithoutMargin(const btVector3& vec)const;
@@ -149,6 +160,8 @@ public:
 class btCylinderShapeZ : public btCylinderShape
 	btCylinderShapeZ (const btVector3& halfExtents);
 	virtual btVector3	localGetSupportingVertexWithoutMargin(const btVector3& vec)const;
diff --git a/src/bullet/BulletCollision/CollisionShapes/btEmptyShape.h b/src/bullet/BulletCollision/CollisionShapes/btEmptyShape.h
index 87b7b66d..069a7940 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btEmptyShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btEmptyShape.h
@@ -28,9 +28,11 @@ subject to the following restrictions:
 /// The btEmptyShape is a collision shape without actual collision detection shape, so most users should ignore this class.
 /// It can be replaced by another shape during runtime, but the inertia tensor should be recomputed.
-class btEmptyShape	: public btConcaveShape
+ATTRIBUTE_ALIGNED16(class) btEmptyShape	: public btConcaveShape
 	virtual ~btEmptyShape();
diff --git a/src/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp
index 95631c30..441a89c6 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp
@@ -38,7 +38,7 @@ btHeightfieldTerrainShape::btHeightfieldTerrainShape(int heightStickWidth, int h
 	// legacy constructor: support only float or unsigned char,
 	// 	and min height is zero
 	PHY_ScalarType hdt = (useFloatData) ? PHY_FLOAT : PHY_UCHAR;
-	btScalar minHeight = 0.0;
+	btScalar minHeight = 0.0f;
 	// previously, height = uchar * maxHeight / 65535.
 	// So to preserve legacy behavior, heightScale = maxHeight / 65535
@@ -59,15 +59,13 @@ PHY_ScalarType hdt, bool flipQuadEdges
 	// validation
-	btAssert(heightStickWidth > 1 && "bad width");
-	btAssert(heightStickLength > 1 && "bad length");
-	btAssert(heightfieldData && "null heightfield data");
+	btAssert(heightStickWidth > 1);// && "bad width");
+	btAssert(heightStickLength > 1);// && "bad length");
+	btAssert(heightfieldData);// && "null heightfield data");
 	// btAssert(heightScale) -- do we care?  Trust caller here
-	btAssert(minHeight <= maxHeight && "bad min/max height");
-	btAssert(upAxis >= 0 && upAxis < 3 &&
-	    "bad upAxis--should be in range [0,2]");
-	btAssert(hdt != PHY_UCHAR || hdt != PHY_FLOAT || hdt != PHY_SHORT &&
-	    "Bad height data type enum");
+	btAssert(minHeight <= maxHeight);// && "bad min/max height");
+	btAssert(upAxis >= 0 && upAxis < 3);// && "bad upAxis--should be in range [0,2]");
+	btAssert(hdt == PHY_UCHAR || hdt == PHY_FLOAT || hdt == PHY_SHORT);// && "Bad height data type enum"); the previous != chain was always true and never fired
 	// initialize member variables
@@ -82,6 +80,7 @@ PHY_ScalarType hdt, bool flipQuadEdges
 	m_heightDataType = hdt;
 	m_flipQuadEdges = flipQuadEdges;
 	m_useDiamondSubdivision = false;
+	m_useZigzagSubdivision = false;
 	m_upAxis = upAxis;
 	m_localScaling.setValue(btScalar(1.), btScalar(1.), btScalar(1.));
@@ -109,7 +108,7 @@ PHY_ScalarType hdt, bool flipQuadEdges
 			//need to get valid m_upAxis
-			btAssert(0 && "Bad m_upAxis");
+			btAssert(0);// && "Bad m_upAxis");
@@ -135,9 +134,7 @@ void btHeightfieldTerrainShape::getAabb(const btTransform& t,btVector3& aabbMin,
 	btMatrix3x3 abs_b = t.getBasis().absolute();  
 	btVector3 center = t.getOrigin();
-	btVector3 extent = btVector3(abs_b[0].dot(halfExtents),
-		   abs_b[1].dot(halfExtents),
-		  abs_b[2].dot(halfExtents));
+    btVector3 extent = halfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);
 	extent += btVector3(getMargin(),getMargin(),getMargin());
 	aabbMin = center - extent;
@@ -362,18 +359,19 @@ void	btHeightfieldTerrainShape::processAllTriangles(btTriangleCallback* callback
 		for(int x=startX; x<endX; x++)
 			btVector3 vertices[3];
-			if (m_flipQuadEdges || (m_useDiamondSubdivision && !((j+x) & 1)))
+			if (m_flipQuadEdges || (m_useDiamondSubdivision && !((j+x) & 1))|| (m_useZigzagSubdivision  && !(j & 1)))
         //first triangle
-        getVertex(x+1,j,vertices[1]);
-        getVertex(x+1,j+1,vertices[2]);
+		getVertex(x, j + 1, vertices[1]);
+		getVertex(x + 1, j + 1, vertices[2]);
         //second triangle
-        getVertex(x,j,vertices[0]);
+      //  getVertex(x,j,vertices[0]);//already got this vertex before, thanks to Danny Chapman
-        getVertex(x,j+1,vertices[2]);
-        callback->processTriangle(vertices,x,j);				
+		getVertex(x + 1, j, vertices[2]);
+		callback->processTriangle(vertices, x, j);
 			} else
         //first triangle
@@ -383,7 +381,7 @@ void	btHeightfieldTerrainShape::processAllTriangles(btTriangleCallback* callback
         //second triangle
-        getVertex(x,j+1,vertices[1]);
+        //getVertex(x,j+1,vertices[1]);
diff --git a/src/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h b/src/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h
index 78e231e0..4a7a4a4b 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h
@@ -68,7 +68,7 @@ subject to the following restrictions:
   For usage and testing see the TerrainDemo.
-class btHeightfieldTerrainShape : public btConcaveShape
+ATTRIBUTE_ALIGNED16(class) btHeightfieldTerrainShape : public btConcaveShape
 	btVector3	m_localAabbMin;
@@ -93,7 +93,8 @@ protected:
 	PHY_ScalarType	m_heightDataType;	
 	bool	m_flipQuadEdges;
-  bool  m_useDiamondSubdivision;
+  	bool  m_useDiamondSubdivision;
+	bool m_useZigzagSubdivision;
 	int	m_upAxis;
@@ -116,6 +117,9 @@ protected:
 	                PHY_ScalarType heightDataType, bool flipQuadEdges);
 	/// preferred constructor
 	  This constructor supports a range of heightfield
@@ -142,6 +146,8 @@ public:
 	void setUseDiamondSubdivision(bool useDiamondSubdivision=true) { m_useDiamondSubdivision = useDiamondSubdivision;}
+	///Enables or disables zigzag subdivision: when set, processAllTriangles() uses the same triangle split as m_flipQuadEdges on rows where j is even. Could help compatibility with Ogre heightfields. See https://code.google.com/p/bullet/issues/detail?id=625
+	void setUseZigzagSubdivision(bool useZigzagSubdivision=true) { m_useZigzagSubdivision = useZigzagSubdivision;}
 	virtual void getAabb(const btTransform& t,btVector3& aabbMin,btVector3& aabbMax) const;
diff --git a/src/bullet/BulletCollision/CollisionShapes/btMinkowskiSumShape.h b/src/bullet/BulletCollision/CollisionShapes/btMinkowskiSumShape.h
index 6c844e8c..a3f9a472 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btMinkowskiSumShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btMinkowskiSumShape.h
@@ -20,7 +20,7 @@ subject to the following restrictions:
 #include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h" // for the types
 /// The btMinkowskiSumShape is only for advanced users. This shape represents implicit based minkowski sum of two convex implicit shapes.
-class btMinkowskiSumShape : public btConvexInternalShape
+ATTRIBUTE_ALIGNED16(class) btMinkowskiSumShape : public btConvexInternalShape
 	btTransform	m_transA;
@@ -30,6 +30,8 @@ class btMinkowskiSumShape : public btConvexInternalShape
 	btMinkowskiSumShape(const btConvexShape* shapeA,const btConvexShape* shapeB);
 	virtual btVector3	localGetSupportingVertexWithoutMargin(const btVector3& vec)const;
diff --git a/src/bullet/BulletCollision/CollisionShapes/btMultiSphereShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btMultiSphereShape.cpp
index c996bfcd..88f6c4dc 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btMultiSphereShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btMultiSphereShape.cpp
@@ -13,7 +13,9 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
+#if defined (_WIN32) || defined (__i386__)
+#define BT_USE_SSE_IN_API
 #include "btMultiSphereShape.h"
 #include "BulletCollision/CollisionShapes/btCollisionMargin.h"
@@ -39,10 +41,11 @@ btMultiSphereShape::btMultiSphereShape (const btVector3* positions,const btScala
+#ifndef MIN
+	#define MIN( _a, _b)    ((_a) < (_b) ? (_a) : (_b))
  btVector3	btMultiSphereShape::localGetSupportingVertexWithoutMargin(const btVector3& vec0)const
-	int i;
 	btVector3 supVec(0,0,0);
 	btScalar maxDot(btScalar(-BT_LARGE_FLOAT));
@@ -66,18 +69,23 @@ btMultiSphereShape::btMultiSphereShape (const btVector3* positions,const btScala
 	const btScalar* rad = &m_radiArray[0];
 	int numSpheres = m_localPositionArray.size();
-	for (i=0;i<numSpheres;i++)
+	for( int k = 0; k < numSpheres; k+= 128 )
-		vtx = (*pos) +vec*m_localScaling*(*rad) - vec * getMargin();
-		pos++;
-		rad++;
-		newDot = vec.dot(vtx);
-		if (newDot > maxDot)
+		btVector3 temp[128];
+		int inner_count = MIN( numSpheres - k, 128 );
+        for( long i = 0; i < inner_count; i++ )
+        {
+            temp[i] = (*pos)*m_localScaling +vec*m_localScaling*(*rad) - vec * getMargin();
+            pos++;
+            rad++;
+        }
+        long i = vec.maxDot( temp, inner_count, newDot);
+        if( newDot > maxDot )
 			maxDot = newDot;
-			supVec = vtx;
+			supVec = temp[i];
-	}
+    }
 	return supVec;
@@ -98,18 +106,25 @@ btMultiSphereShape::btMultiSphereShape (const btVector3* positions,const btScala
 		const btVector3* pos = &m_localPositionArray[0];
 		const btScalar* rad = &m_radiArray[0];
 		int numSpheres = m_localPositionArray.size();
-		for (int i=0;i<numSpheres;i++)
-		{
-			vtx = (*pos) +vec*m_localScaling*(*rad) - vec * getMargin();
-			pos++;
-			rad++;
-			newDot = vec.dot(vtx);
-			if (newDot > maxDot)
-			{
-				maxDot = newDot;
-				supportVerticesOut[j] = vtx;
-			}
-		}
+        for( int k = 0; k < numSpheres; k+= 128 )
+        {
+            btVector3 temp[128];
+            int inner_count = MIN( numSpheres - k, 128 );
+            for( long i = 0; i < inner_count; i++ )
+            {
+                temp[i] = (*pos)*m_localScaling +vec*m_localScaling*(*rad) - vec * getMargin();
+                pos++;
+                rad++;
+            }
+            long i = vec.maxDot( temp, inner_count, newDot);
+            if( newDot > maxDot )
+            {
+                maxDot = newDot;
+                supportVerticesOut[j] = temp[i];
+            }
+        }
diff --git a/src/bullet/BulletCollision/CollisionShapes/btMultiSphereShape.h b/src/bullet/BulletCollision/CollisionShapes/btMultiSphereShape.h
index 06c5d16d..5d3b4026 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btMultiSphereShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btMultiSphereShape.h
@@ -25,13 +25,15 @@ subject to the following restrictions:
 ///The btMultiSphereShape represents the convex hull of a collection of spheres. You can create special capsules or other smooth volumes.
 ///It is possible to animate the spheres for deformation, but call 'recalcLocalAabb' after changing any sphere position/radius
-class btMultiSphereShape : public btConvexInternalAabbCachingShape
+ATTRIBUTE_ALIGNED16(class) btMultiSphereShape : public btConvexInternalAabbCachingShape
 	btAlignedObjectArray<btVector3> m_localPositionArray;
 	btAlignedObjectArray<btScalar>  m_radiArray;
 	btMultiSphereShape (const btVector3* positions,const btScalar* radi,int numSpheres);
 	///CollisionShape Interface
diff --git a/src/bullet/BulletCollision/CollisionShapes/btMultimaterialTriangleMeshShape.h b/src/bullet/BulletCollision/CollisionShapes/btMultimaterialTriangleMeshShape.h
index 2b92ab7d..5ebaede4 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btMultimaterialTriangleMeshShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btMultimaterialTriangleMeshShape.h
@@ -25,7 +25,6 @@ subject to the following restrictions:
 ATTRIBUTE_ALIGNED16(class) btMultimaterialTriangleMeshShape : public btBvhTriangleMeshShape
     btAlignedObjectArray <btMaterial*> m_materialList;
-    int ** m_triangleMaterials;
diff --git a/src/bullet/BulletCollision/CollisionShapes/btPolyhedralConvexShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btPolyhedralConvexShape.cpp
index 82def79c..4854f370 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btPolyhedralConvexShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btPolyhedralConvexShape.cpp
@@ -12,6 +12,9 @@ subject to the following restrictions:
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
+#if defined (_WIN32) || defined (__i386__)
+#define BT_USE_SSE_IN_API
 #include "BulletCollision/CollisionShapes/btPolyhedralConvexShape.h"
 #include "btConvexPolyhedron.h"
@@ -31,51 +34,58 @@ btPolyhedralConvexShape::~btPolyhedralConvexShape()
 	if (m_polyhedron)
+		m_polyhedron->~btConvexPolyhedron();
-bool	btPolyhedralConvexShape::initializePolyhedralFeatures()
+bool	btPolyhedralConvexShape::initializePolyhedralFeatures(int shiftVerticesByMargin)
 	if (m_polyhedron)
+	{
+		m_polyhedron->~btConvexPolyhedron();
+	}
 	void* mem = btAlignedAlloc(sizeof(btConvexPolyhedron),16);
 	m_polyhedron = new (mem) btConvexPolyhedron;
-		btAlignedObjectArray<btVector3> orgVertices;
+	btAlignedObjectArray<btVector3> orgVertices;
 	for (int i=0;i<getNumVertices();i++)
 		btVector3& newVertex = orgVertices.expand();
-#if 0
-	btAlignedObjectArray<btVector3> planeEquations;
-	btGeometryUtil::getPlaneEquationsFromVertices(orgVertices,planeEquations);
-	btAlignedObjectArray<btVector3> shiftedPlaneEquations;
-	for (int p=0;p<planeEquations.size();p++)
+	btConvexHullComputer conv;
+	if (shiftVerticesByMargin)
-		   btVector3 plane = planeEquations[p];
-		   plane[3] -= getMargin();
-		   shiftedPlaneEquations.push_back(plane);
-	}
+		btAlignedObjectArray<btVector3> planeEquations;
+		btGeometryUtil::getPlaneEquationsFromVertices(orgVertices,planeEquations);
-	btAlignedObjectArray<btVector3> tmpVertices;
+		btAlignedObjectArray<btVector3> shiftedPlaneEquations;
+		for (int p=0;p<planeEquations.size();p++)
+		{
+			   btVector3 plane = planeEquations[p];
+		//	   btScalar margin = getMargin();
+			   plane[3] -= getMargin();
+			   shiftedPlaneEquations.push_back(plane);
+		}
-	btGeometryUtil::getVerticesFromPlaneEquations(shiftedPlaneEquations,tmpVertices);
-	btConvexHullComputer conv;
-	conv.compute(&tmpVertices[0].getX(), sizeof(btVector3),tmpVertices.size(),0.f,0.f);
+		btAlignedObjectArray<btVector3> tmpVertices;
-	btConvexHullComputer conv;
-	conv.compute(&orgVertices[0].getX(), sizeof(btVector3),orgVertices.size(),0.f,0.f);
+		btGeometryUtil::getVerticesFromPlaneEquations(shiftedPlaneEquations,tmpVertices);
+		conv.compute(&tmpVertices[0].getX(), sizeof(btVector3),tmpVertices.size(),0.f,0.f);
+	} else
+	{
+		conv.compute(&orgVertices[0].getX(), sizeof(btVector3),orgVertices.size(),0.f,0.f);
+	}
@@ -107,9 +117,6 @@ bool	btPolyhedralConvexShape::initializePolyhedralFeatures()
 		int numEdges = 0;
 		//compute face normals
-		btScalar maxCross2 = 0.f;
-		int chosenEdge = -1;
@@ -192,7 +199,8 @@ bool	btPolyhedralConvexShape::initializePolyhedralFeatures()
 			//do the merge: use Graham Scan 2d convex hull
-			btAlignedObjectArray<GrahamVector2> orgpoints;
+			btAlignedObjectArray<GrahamVector3> orgpoints;
+			btVector3 averageFaceNormal(0,0,0);
 			for (int i=0;i<coplanarFaceGroup.size();i++)
@@ -200,16 +208,12 @@ bool	btPolyhedralConvexShape::initializePolyhedralFeatures()
 				btFace& face = tmpFaces[coplanarFaceGroup[i]];
 				btVector3 faceNormal(face.m_plane[0],face.m_plane[1],face.m_plane[2]);
-				btVector3 xyPlaneNormal(0,0,1);
-				btQuaternion rotationArc = shortestArcQuat(faceNormal,xyPlaneNormal);
+				averageFaceNormal+=faceNormal;
 				for (int f=0;f<face.m_indices.size();f++)
 					int orgIndex = face.m_indices[f];
 					btVector3 pt = m_polyhedron->m_vertices[orgIndex];
-					btVector3 rotatedPt =  quatRotate(rotationArc,pt);
-					rotatedPt.setZ(0);
 					bool found = false;
 					for (int i=0;i<orgpoints.size();i++)
@@ -222,34 +226,45 @@ bool	btPolyhedralConvexShape::initializePolyhedralFeatures()
 					if (!found)
-						orgpoints.push_back(GrahamVector2(rotatedPt,orgIndex));
+						orgpoints.push_back(GrahamVector3(pt,orgIndex));
 			btFace combinedFace;
 			for (int i=0;i<4;i++)
 				combinedFace.m_plane[i] = tmpFaces[coplanarFaceGroup[0]].m_plane[i];
-			btAlignedObjectArray<GrahamVector2> hull;
-			GrahamScanConvexHull2D(orgpoints,hull);
+			btAlignedObjectArray<GrahamVector3> hull;
+			averageFaceNormal.normalize();
+			GrahamScanConvexHull2D(orgpoints,hull,averageFaceNormal);
 			for (int i=0;i<hull.size();i++)
-				for(int k = 0; k < orgpoints.size(); k++) {
-					if(orgpoints[k].m_orgIndex == hull[i].m_orgIndex) {
+				for(int k = 0; k < orgpoints.size(); k++) 
+				{
+					if(orgpoints[k].m_orgIndex == hull[i].m_orgIndex) 
+					{
 						orgpoints[k].m_orgIndex = -1; // invalidate...
-			}
+					}
 			// are there rejected vertices?
 			bool reject_merge = false;
 			for(int i = 0; i < orgpoints.size(); i++) {
 				if(orgpoints[i].m_orgIndex == -1)
 					continue; // this is in the hull...
 				// this vertex is rejected -- is anybody else using this vertex?
 				for(int j = 0; j < tmpFaces.size(); j++) {
 					btFace& face = tmpFaces[j];
 					// is this a face of the current coplanar group?
 					bool is_in_current_group = false;
@@ -275,20 +290,23 @@ bool	btPolyhedralConvexShape::initializePolyhedralFeatures()
-			if(!reject_merge) {
+			if (!reject_merge)
+			{
 				// do this merge!
 				did_merge = true;
-			m_polyhedron->m_faces.push_back(combinedFace);
+				m_polyhedron->m_faces.push_back(combinedFace);
 			for (int i=0;i<coplanarFaceGroup.size();i++)
-				m_polyhedron->m_faces.push_back(tmpFaces[coplanarFaceGroup[i]]);
+				btFace face = tmpFaces[coplanarFaceGroup[i]];
+				m_polyhedron->m_faces.push_back(face);
-		}
+		} 
@@ -299,6 +317,9 @@ bool	btPolyhedralConvexShape::initializePolyhedralFeatures()
 	return true;
+#ifndef MIN
+    #define MIN(_a, _b)     ((_a) < (_b) ? (_a) : (_b))
 btVector3	btPolyhedralConvexShape::localGetSupportingVertexWithoutMargin(const btVector3& vec0)const
@@ -323,17 +344,19 @@ btVector3	btPolyhedralConvexShape::localGetSupportingVertexWithoutMargin(const b
 	btVector3 vtx;
 	btScalar newDot;
-	for (i=0;i<getNumVertices();i++)
-	{
-		getVertex(i,vtx);
-		newDot = vec.dot(vtx);
+    for( int k = 0; k < getNumVertices(); k += 128 ) // walk the vertices in batches of at most 128
+    {
+        btVector3 temp[128]; // batch buffer handed to vec.maxDot() below
+        int inner_count = MIN(getNumVertices() - k, 128); // the last batch may be shorter than 128
+        for( i = 0; i < inner_count; i++ )
+            getVertex(k + i,temp[i]); // k + i indexes the whole shape; plain i re-read the first batch every time
+        i = (int) vec.maxDot( temp, inner_count, newDot); // i is now an index into temp, newDot the matching dot product
 		if (newDot > maxDot)
 			maxDot = newDot;
-			supVec = vtx;
-		}
-	}
+			supVec = temp[i];
+		}        
+    }
 #endif //__SPU__
 	return supVec;
@@ -356,21 +379,23 @@ void	btPolyhedralConvexShape::batchedUnitVectorGetSupportingVertexWithoutMargin(
 	for (int j=0;j<numVectors;j++)
-		const btVector3& vec = vectors[j];
-		for (i=0;i<getNumVertices();i++)
-		{
-			getVertex(i,vtx);
-			newDot = vec.dot(vtx);
-			if (newDot > supportVerticesOut[j][3])
-			{
-				//WARNING: don't swap next lines, the w component would get overwritten!
-				supportVerticesOut[j] = vtx;
+        const btVector3& vec = vectors[j];
+        for( int k = 0; k < getNumVertices(); k += 128 ) // walk the vertices in batches of at most 128
+        {
+            btVector3 temp[128]; // batch buffer handed to vec.maxDot() below
+            int inner_count = MIN(getNumVertices() - k, 128); // the last batch may be shorter than 128
+            for( i = 0; i < inner_count; i++ )
+                getVertex(k + i,temp[i]); // k + i indexes the whole shape; plain i re-read the first batch every time
+            i = (int) vec.maxDot( temp, inner_count, newDot); // i is now an index into temp, newDot the matching dot product
+            if (newDot > supportVerticesOut[j][3])
+            {
+				supportVerticesOut[j] = temp[i]; // must come before the [3] write below, which this assignment would otherwise overwrite
 				supportVerticesOut[j][3] = newDot;
-			}
-		}
-	}
+            }        
+        }
+    }
 #endif //__SPU__
diff --git a/src/bullet/BulletCollision/CollisionShapes/btPolyhedralConvexShape.h b/src/bullet/BulletCollision/CollisionShapes/btPolyhedralConvexShape.h
index ee2e1e28..961d001a 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btPolyhedralConvexShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btPolyhedralConvexShape.h
@@ -22,7 +22,7 @@ class btConvexPolyhedron;
 ///The btPolyhedralConvexShape is an internal interface class for polyhedral convex shapes.
-class btPolyhedralConvexShape : public btConvexInternalShape
+ATTRIBUTE_ALIGNED16(class) btPolyhedralConvexShape : public btConvexInternalShape
@@ -31,13 +31,17 @@ protected:
 	btConvexPolyhedron* m_polyhedron;
 	virtual ~btPolyhedralConvexShape();
 	///optional method mainly used to generate multiple contact points by clipping polyhedral features (faces/edges)
-	virtual bool	initializePolyhedralFeatures();
+	///experimental/work-in-progress
+	virtual bool	initializePolyhedralFeatures(int shiftVerticesByMargin=0);
 	const btConvexPolyhedron*	getConvexPolyhedron() const
diff --git a/src/bullet/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.cpp
index 25d58d61..6a337c78 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.cpp
@@ -98,9 +98,7 @@ void	btScaledBvhTriangleMeshShape::getAabb(const btTransform& trans,btVector3& a
 	btVector3 center = trans(localCenter);
-	btVector3 extent = btVector3(abs_b[0].dot(localHalfExtents),
-		   abs_b[1].dot(localHalfExtents),
-		  abs_b[2].dot(localHalfExtents));
+    btVector3 extent = localHalfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);
 	aabbMin = center - extent;
 	aabbMax = center + extent;
diff --git a/src/bullet/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h b/src/bullet/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h
index ff86ef31..39049eaf 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h
@@ -31,6 +31,8 @@ ATTRIBUTE_ALIGNED16(class) btScaledBvhTriangleMeshShape : public btConcaveShape
 	btScaledBvhTriangleMeshShape(btBvhTriangleMeshShape* childShape,const btVector3& localScaling);
diff --git a/src/bullet/BulletCollision/CollisionShapes/btShapeHull.h b/src/bullet/BulletCollision/CollisionShapes/btShapeHull.h
index 642a2887..e959f198 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btShapeHull.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btShapeHull.h
@@ -25,7 +25,7 @@ subject to the following restrictions:
 ///The btShapeHull class takes a btConvexShape, builds a simplified convex hull using btConvexHull and provides triangle indices and vertices.
 ///It can be useful for to simplify a complex convex object and for visualization of a non-polyhedral convex object.
 ///It approximates the convex hull using the supporting vertex of 42 directions.
-class btShapeHull
+ATTRIBUTE_ALIGNED16(class) btShapeHull
@@ -37,6 +37,8 @@ protected:
 	static btVector3* getUnitSpherePoints();
 	btShapeHull (const btConvexShape* shape);
 	~btShapeHull ();
diff --git a/src/bullet/BulletCollision/CollisionShapes/btStaticPlaneShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btStaticPlaneShape.cpp
index 38ef8f03..d17141e3 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btStaticPlaneShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btStaticPlaneShape.cpp
@@ -21,7 +21,7 @@ subject to the following restrictions:
 btStaticPlaneShape::btStaticPlaneShape(const btVector3& planeNormal,btScalar planeConstant)
 : btConcaveShape (), m_planeNormal(planeNormal.normalized()),
 	//	btAssert( btFuzzyZero(m_planeNormal.length() - btScalar(1.)) );
diff --git a/src/bullet/BulletCollision/CollisionShapes/btStaticPlaneShape.h b/src/bullet/BulletCollision/CollisionShapes/btStaticPlaneShape.h
index b13825e6..e6e32883 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btStaticPlaneShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btStaticPlaneShape.h
@@ -31,6 +31,8 @@ protected:
 	btVector3	m_localScaling;
 	btStaticPlaneShape(const btVector3& planeNormal,btScalar planeConstant);
 	virtual ~btStaticPlaneShape();
diff --git a/src/bullet/BulletCollision/CollisionShapes/btStridingMeshInterface.cpp b/src/bullet/BulletCollision/CollisionShapes/btStridingMeshInterface.cpp
index dd22fc56..b3d44967 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btStridingMeshInterface.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btStridingMeshInterface.cpp
@@ -242,7 +242,7 @@ const char*	btStridingMeshInterface::serialize(void* dataBuffer, btSerializer* s
 		int gfxindex;
 	//	btVector3 triangle[3];
-		btVector3 meshScaling = getScaling();
+	//	btVector3 meshScaling = getScaling();
 		///if the number of parts is big, the performance might drop due to the innerloop switch on indextype
 		for (part=0;part<graphicssubparts ;part++,memPtr++)
diff --git a/src/bullet/BulletCollision/CollisionShapes/btStridingMeshInterface.h b/src/bullet/BulletCollision/CollisionShapes/btStridingMeshInterface.h
index f2b27ade..9fbe1397 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btStridingMeshInterface.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btStridingMeshInterface.h
@@ -27,13 +27,15 @@ subject to the following restrictions:
 ///	The btStridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with btBvhTriangleMeshShape and some other collision shapes.
 /// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips.
 /// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory.
-class  btStridingMeshInterface
+ATTRIBUTE_ALIGNED16(class ) btStridingMeshInterface
 		btVector3 m_scaling;
 		btStridingMeshInterface() :m_scaling(btScalar(1.),btScalar(1.),btScalar(1.))
diff --git a/src/bullet/BulletCollision/CollisionShapes/btTetrahedronShape.h b/src/bullet/BulletCollision/CollisionShapes/btTetrahedronShape.h
index 6b7128ef..b6920983 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btTetrahedronShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btTetrahedronShape.h
@@ -22,7 +22,7 @@ subject to the following restrictions:
 ///The btBU_Simplex1to4 implements tetrahedron, triangle, line, vertex collision shapes. In most cases it is better to use btConvexHullShape instead.
-class btBU_Simplex1to4 : public btPolyhedralConvexAabbCachingShape
+ATTRIBUTE_ALIGNED16(class) btBU_Simplex1to4 : public btPolyhedralConvexAabbCachingShape
@@ -30,6 +30,8 @@ protected:
 	btVector3	m_vertices[4];
 	btBU_Simplex1to4(const btVector3& pt0);
diff --git a/src/bullet/BulletCollision/CollisionShapes/btTriangleInfoMap.h b/src/bullet/BulletCollision/CollisionShapes/btTriangleInfoMap.h
index 1cea7045..17deef89 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btTriangleInfoMap.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btTriangleInfoMap.h
@@ -123,11 +123,11 @@ SIMD_FORCE_INLINE	int	btTriangleInfoMap::calculateSerializeBufferSize() const
 SIMD_FORCE_INLINE	const char*	btTriangleInfoMap::serialize(void* dataBuffer, btSerializer* serializer) const
 	btTriangleInfoMapData* tmapData = (btTriangleInfoMapData*) dataBuffer;
-	tmapData->m_convexEpsilon = m_convexEpsilon;
-	tmapData->m_planarEpsilon = m_planarEpsilon;
-	tmapData->m_equalVertexThreshold = m_equalVertexThreshold;
-	tmapData->m_edgeDistanceThreshold = m_edgeDistanceThreshold;
-	tmapData->m_zeroAreaThreshold = m_zeroAreaThreshold;
+	tmapData->m_convexEpsilon = (float)m_convexEpsilon;
+	tmapData->m_planarEpsilon = (float)m_planarEpsilon;
+	tmapData->m_equalVertexThreshold =(float) m_equalVertexThreshold;
+	tmapData->m_edgeDistanceThreshold = (float)m_edgeDistanceThreshold;
+	tmapData->m_zeroAreaThreshold = (float)m_zeroAreaThreshold;
 	tmapData->m_hashTableSize = m_hashTable.size();
@@ -172,9 +172,9 @@ SIMD_FORCE_INLINE	const char*	btTriangleInfoMap::serialize(void* dataBuffer, btS
 		btTriangleInfoData* memPtr = (btTriangleInfoData*)chunk->m_oldPtr;
 		for (int i=0;i<numElem;i++,memPtr++)
-			memPtr->m_edgeV0V1Angle = m_valueArray[i].m_edgeV0V1Angle;
-			memPtr->m_edgeV1V2Angle = m_valueArray[i].m_edgeV1V2Angle;
-			memPtr->m_edgeV2V0Angle = m_valueArray[i].m_edgeV2V0Angle;
+			memPtr->m_edgeV0V1Angle = (float)m_valueArray[i].m_edgeV0V1Angle;
+			memPtr->m_edgeV1V2Angle = (float)m_valueArray[i].m_edgeV1V2Angle;
+			memPtr->m_edgeV2V0Angle = (float)m_valueArray[i].m_edgeV2V0Angle;
 			memPtr->m_flags = m_valueArray[i].m_flags;
 		serializer->finalizeChunk(chunk,"btTriangleInfoData",BT_ARRAY_CODE,(void*) &m_valueArray[0]);
diff --git a/src/bullet/BulletCollision/CollisionShapes/btTriangleMesh.cpp b/src/bullet/BulletCollision/CollisionShapes/btTriangleMesh.cpp
index b29e0f71..e4de7320 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btTriangleMesh.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btTriangleMesh.cpp
@@ -75,6 +75,13 @@ void	btTriangleMesh::addIndex(int index)
+void	btTriangleMesh::addTriangleIndices(int index1, int index2, int index3 ) // Appends one triangle described by three vertex indices.
+	m_indexedMeshes[0].m_numTriangles++; // bump the triangle count of mesh part 0, then store the three indices
+	addIndex( index1 );
+	addIndex( index2 );
+	addIndex( index3 );
 int	btTriangleMesh::findOrAddVertex(const btVector3& vertex, bool removeDuplicateVertices)
@@ -111,10 +118,10 @@ int	btTriangleMesh::findOrAddVertex(const btVector3& vertex, bool removeDuplicat
 					return i/3;
-	}
-		m_3componentVertices.push_back((float)vertex.getX());
-		m_3componentVertices.push_back((float)vertex.getY());
-		m_3componentVertices.push_back((float)vertex.getZ());
+		}
+		m_3componentVertices.push_back(vertex.getX());
+		m_3componentVertices.push_back(vertex.getY());
+		m_3componentVertices.push_back(vertex.getZ());
 		m_indexedMeshes[0].m_vertexBase = (unsigned char*)&m_3componentVertices[0];
 		return (m_3componentVertices.size()/3)-1;
@@ -138,3 +145,25 @@ int btTriangleMesh::getNumTriangles() const
 	return m_16bitIndices.size() / 3;
+void btTriangleMesh::preallocateVertices(int numverts) // Reserves storage for numverts vertices in whichever vertex array is in use.
+	if (m_use4componentVertices)
+	{
+		m_4componentVertices.reserve(numverts); // element type of this array is btVector3
+	} else
+	{
+		m_3componentVertices.reserve(numverts*3); // flat scalar array: findOrAddVertex pushes x, y and z separately, so three slots per vertex
+	}
+void btTriangleMesh::preallocateIndices(int numindices) // Reserves room for numindices entries in whichever index array is in use.
+	if (m_use32bitIndices)
+	{
+		m_32bitIndices.reserve(numindices); // unsigned int index storage
+	} else
+	{
+		m_16bitIndices.reserve(numindices); // unsigned short index storage
+	}
diff --git a/src/bullet/BulletCollision/CollisionShapes/btTriangleMesh.h b/src/bullet/BulletCollision/CollisionShapes/btTriangleMesh.h
index f623157f..ac4afa7f 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btTriangleMesh.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btTriangleMesh.h
@@ -27,7 +27,7 @@ subject to the following restrictions:
 class btTriangleMesh : public btTriangleIndexVertexArray
 	btAlignedObjectArray<btVector3>	m_4componentVertices;
-	btAlignedObjectArray<float>		m_3componentVertices;
+	btAlignedObjectArray<btScalar>	m_3componentVertices;
 	btAlignedObjectArray<unsigned int>		m_32bitIndices;
 	btAlignedObjectArray<unsigned short int>		m_16bitIndices;
@@ -52,11 +52,14 @@ class btTriangleMesh : public btTriangleIndexVertexArray
 		///By default addTriangle won't search for duplicate vertices, because the search is very slow for large triangle meshes.
 		///In general it is better to directly use btTriangleIndexVertexArray instead.
 		void	addTriangle(const btVector3& vertex0,const btVector3& vertex1,const btVector3& vertex2, bool removeDuplicateVertices=false);
+		///Add a triangle using its indices. Make sure the indices are pointing within the vertices array, so add the vertices first (and to be sure, avoid removal of duplicate vertices)	
+		void	addTriangleIndices(int index1, int index2, int index3 );
 		int getNumTriangles() const;
-		virtual void	preallocateVertices(int numverts){(void) numverts;}
-		virtual void	preallocateIndices(int numindices){(void) numindices;}
+		virtual void	preallocateVertices(int numverts);
+		virtual void	preallocateIndices(int numindices);
 		///findOrAddVertex is an internal method, use addTriangle instead
 		int		findOrAddVertex(const btVector3& vertex, bool removeDuplicateVertices);
diff --git a/src/bullet/BulletCollision/CollisionShapes/btTriangleMeshShape.cpp b/src/bullet/BulletCollision/CollisionShapes/btTriangleMeshShape.cpp
index 683684da..0e179514 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btTriangleMeshShape.cpp
+++ b/src/bullet/BulletCollision/CollisionShapes/btTriangleMeshShape.cpp
@@ -55,13 +55,9 @@ void btTriangleMeshShape::getAabb(const btTransform& trans,btVector3& aabbMin,bt
 	btVector3 center = trans(localCenter);
-	btVector3 extent = btVector3(abs_b[0].dot(localHalfExtents),
-		   abs_b[1].dot(localHalfExtents),
-		  abs_b[2].dot(localHalfExtents));
+    btVector3 extent = localHalfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);
 	aabbMin = center - extent;
 	aabbMax = center + extent;
 void	btTriangleMeshShape::recalcLocalAabb()
diff --git a/src/bullet/BulletCollision/CollisionShapes/btTriangleMeshShape.h b/src/bullet/BulletCollision/CollisionShapes/btTriangleMeshShape.h
index c8caf8fe..453e5800 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btTriangleMeshShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btTriangleMeshShape.h
@@ -21,7 +21,7 @@ subject to the following restrictions:
 ///The btTriangleMeshShape is an internal concave triangle mesh interface. Don't use this class directly, use btBvhTriangleMeshShape instead.
-class btTriangleMeshShape : public btConcaveShape
+ATTRIBUTE_ALIGNED16(class) btTriangleMeshShape : public btConcaveShape
 	btVector3	m_localAabbMin;
@@ -33,6 +33,7 @@ protected:
 	btTriangleMeshShape(btStridingMeshInterface* meshInterface);
 	virtual ~btTriangleMeshShape();
diff --git a/src/bullet/BulletCollision/CollisionShapes/btTriangleShape.h b/src/bullet/BulletCollision/CollisionShapes/btTriangleShape.h
index 71b05573..a8a80f82 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btTriangleShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btTriangleShape.h
@@ -25,6 +25,8 @@ ATTRIBUTE_ALIGNED16(class) btTriangleShape : public btPolyhedralConvexShape
 	btVector3	m_vertices1[3];
 	virtual int getNumVertices() const
@@ -66,7 +68,7 @@ public:
 	btVector3 localGetSupportingVertexWithoutMargin(const btVector3& dir)const 
-		btVector3 dots(dir.dot(m_vertices1[0]), dir.dot(m_vertices1[1]), dir.dot(m_vertices1[2]));
+        btVector3 dots = dir.dot3(m_vertices1[0], m_vertices1[1], m_vertices1[2]);
 	  	return m_vertices1[dots.maxAxis()];
@@ -76,7 +78,7 @@ public:
 		for (int i=0;i<numVectors;i++)
 			const btVector3& dir = vectors[i];
-			btVector3 dots(dir.dot(m_vertices1[0]), dir.dot(m_vertices1[1]), dir.dot(m_vertices1[2]));
+            btVector3 dots = dir.dot3(m_vertices1[0], m_vertices1[1], m_vertices1[2]);
   			supportVerticesOut[i] = m_vertices1[dots.maxAxis()];
diff --git a/src/bullet/BulletCollision/CollisionShapes/btUniformScalingShape.h b/src/bullet/BulletCollision/CollisionShapes/btUniformScalingShape.h
index cbf7e6fd..a10f58d2 100644
--- a/src/bullet/BulletCollision/CollisionShapes/btUniformScalingShape.h
+++ b/src/bullet/BulletCollision/CollisionShapes/btUniformScalingShape.h
@@ -21,7 +21,7 @@ subject to the following restrictions:
 ///The btUniformScalingShape allows to re-use uniform scaled instances of btConvexShape in a memory efficient way.
 ///Istead of using btUniformScalingShape, it is better to use the non-uniform setLocalScaling method on convex shapes that implement it.
-class btUniformScalingShape : public btConvexShape
+ATTRIBUTE_ALIGNED16(class) btUniformScalingShape : public btConvexShape
 	btConvexShape*	m_childConvexShape;
@@ -29,6 +29,8 @@ class btUniformScalingShape : public btConvexShape
 	btUniformScalingShape(	btConvexShape* convexChildShape, btScalar uniformScalingFactor);
 	virtual ~btUniformScalingShape();
diff --git a/src/bullet/BulletCollision/Gimpact/btBoxCollision.h b/src/bullet/BulletCollision/Gimpact/btBoxCollision.h
index d5676aaa..0a0357e5 100644
--- a/src/bullet/BulletCollision/Gimpact/btBoxCollision.h
+++ b/src/bullet/BulletCollision/Gimpact/btBoxCollision.h
@@ -218,9 +218,7 @@ public:
 	SIMD_FORCE_INLINE btVector3 transform(const btVector3 & point) const
-		return btVector3(m_R1to0[0].dot(point) + m_T1to0.x(),
-			m_R1to0[1].dot(point) + m_T1to0.y(),
-			m_R1to0[2].dot(point) + m_T1to0.z());
+        return point.dot3( m_R1to0[0], m_R1to0[1], m_R1to0[2] ) + m_T1to0;
@@ -364,9 +362,9 @@ public:
 		// Compute new center
 		center = trans(center);
-		btVector3 textends(extends.dot(trans.getBasis().getRow(0).absolute()),
- 				 extends.dot(trans.getBasis().getRow(1).absolute()),
-				 extends.dot(trans.getBasis().getRow(2).absolute()));
+        btVector3 textends = extends.dot3(trans.getBasis().getRow(0).absolute(), 
+                                          trans.getBasis().getRow(1).absolute(), 
+                                          trans.getBasis().getRow(2).absolute());
 		m_min = center - textends;
 		m_max = center + textends;
@@ -381,10 +379,10 @@ public:
 		// Compute new center
 		center = trans.transform(center);
-		btVector3 textends(extends.dot(trans.m_R1to0.getRow(0).absolute()),
- 				 extends.dot(trans.m_R1to0.getRow(1).absolute()),
-				 extends.dot(trans.m_R1to0.getRow(2).absolute()));
+        btVector3 textends = extends.dot3(trans.m_R1to0.getRow(0).absolute(), 
+                                          trans.m_R1to0.getRow(1).absolute(), 
+                                          trans.m_R1to0.getRow(2).absolute());
 		m_min = center - textends;
 		m_max = center + textends;
diff --git a/src/bullet/BulletCollision/Gimpact/btCompoundFromGimpact.h b/src/bullet/BulletCollision/Gimpact/btCompoundFromGimpact.h
new file mode 100644
index 00000000..02f8b678
--- /dev/null
+++ b/src/bullet/BulletCollision/Gimpact/btCompoundFromGimpact.h
@@ -0,0 +1,93 @@
+#include "BulletCollision/CollisionShapes/btCompoundShape.h"
+#include "btGImpactShape.h"
+#include "BulletCollision/NarrowPhaseCollision/btRaycastCallback.h"
+struct MyCallback : public btTriangleRaycastCallback // Triangle ray callback that skips hits on one given (part, triangle) pair.
+		{
+			int	m_ignorePart; // part id of the triangle to skip
+			int	m_ignoreTriangleIndex; // triangle index (within that part) to skip
+			MyCallback(const btVector3& from, const btVector3& to, int ignorePart, int ignoreTriangleIndex) // from/to are forwarded to the base callback; the ignore pair is stored
+			:btTriangleRaycastCallback(from,to),
+			m_ignorePart(ignorePart),
+			m_ignoreTriangleIndex(ignoreTriangleIndex)
+			{
+			}
+			virtual btScalar reportHit(const btVector3& hitNormalLocal, btScalar hitFraction, int partId, int triangleIndex) // returns hitFraction for an accepted closer hit, otherwise m_hitFraction; hitNormalLocal is unused
+			{
+				if (partId!=m_ignorePart || triangleIndex!=m_ignoreTriangleIndex) // true for every triangle except the ignored one
+				{
+					if (hitFraction < m_hitFraction) // m_hitFraction is not declared here; presumably inherited from the base callback - TODO confirm
+						return hitFraction;
+				}
+				return m_hitFraction; // ignored triangle or farther hit: hand back the current fraction unchanged
+			}
+		};
+		struct MyInternalTriangleIndexCallback :public btInternalTriangleIndexCallback // Per-triangle callback that adds one tetrahedron child shape to a compound for every mesh triangle.
+		{
+			const btGImpactMeshShape*		m_gimpactShape; // mesh that is scaled from and ray-tested against
+			btCompoundShape*			m_colShape; // compound shape receiving the generated tetrahedra
+			btScalar	m_depth; // length of the ray cast from each triangle against its normal
+			MyInternalTriangleIndexCallback (btCompoundShape* colShape, const btGImpactMeshShape* meshShape, btScalar depth) // initializers follow member declaration order (avoids -Wreorder)
+			:m_gimpactShape(meshShape),
+			m_colShape(colShape),
+			m_depth(depth)
+			{
+			}
+			virtual void internalProcessTriangleIndex(btVector3* triangle,int partId,int  triangleIndex) // reads triangle[0..2]; partId/triangleIndex identify the triangle so the ray test can skip it
+			{
+				btVector3 scale = m_gimpactShape->getLocalScaling();
+				btVector3 v0=triangle[0]*scale; // apply the mesh's local scaling to each vertex
+				btVector3 v1=triangle[1]*scale;
+				btVector3 v2=triangle[2]*scale;
+				btVector3 centroid = (v0+v1+v2)/3;
+				btVector3 normal = (v1-v0).cross(v2-v0); // NOTE(review): a zero-area triangle yields a zero vector here - confirm normalize() tolerates it
+				normal.normalize();
+				btVector3 rayFrom = centroid;
+				btVector3 rayTo = centroid-normal*m_depth; // default apex: m_depth away from the centroid, opposite the normal
+				MyCallback cb(rayFrom,rayTo,partId,triangleIndex); // ray callback that ignores the triangle being processed
+				m_gimpactShape->processAllTrianglesRay(&cb,rayFrom, rayTo);
+				if (cb.m_hitFraction<1) // a fraction below 1 is treated as a hit on another triangle: pull the apex back to that point
+				{
+					rayTo.setInterpolate3(cb.m_from,cb.m_to,cb.m_hitFraction);
+					//rayTo = cb.m_from;
+					//rayTo = rayTo.lerp(cb.m_to,cb.m_hitFraction);
+					//gDebugDraw.drawLine(tr(centroid),tr(centroid+normal),btVector3(1,0,0));
+				}
+				btBU_Simplex1to4* tet = new btBU_Simplex1to4(v0,v1,v2,rayTo); // NOTE(review): heap-allocated, no matching delete visible here - confirm who frees the child shapes
+				btTransform ident;
+				ident.setIdentity();
+				m_colShape->addChildShape(ident,tet); // child is added with an identity local transform
+			}
+		};
+inline btCompoundShape*	btCreateCompoundFromGimpactShape(const btGImpactMeshShape* gimpactMesh, btScalar depth) // Builds a new compound shape with one tetrahedron per mesh triangle; inline because this definition lives in a header.
+	btCompoundShape* colShape = new btCompoundShape(); // returned as a raw pointer; NOTE(review): caller presumably owns it - confirm
+		btTransform tr;
+		tr.setIdentity();
+		MyInternalTriangleIndexCallback cb(colShape,gimpactMesh, depth); // callback that appends a child shape per visited triangle
+		btVector3 aabbMin,aabbMax;
+		gimpactMesh->getAabb(tr,aabbMin,aabbMax); // AABB under an identity transform, used as the query box below
+		gimpactMesh->getMeshInterface()->InternalProcessAllTriangles(&cb,aabbMin,aabbMax);
+	return colShape;	
\ No newline at end of file
diff --git a/src/bullet/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.cpp b/src/bullet/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.cpp
index 2f2c09ff..2e87475e 100644
--- a/src/bullet/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.cpp
+++ b/src/bullet/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.cpp
@@ -51,7 +51,7 @@ public:
-	void get_plane_equation_transformed(const btTransform & trans,btVector4 &equation)
+	void get_plane_equation_transformed(const btTransform & trans,btVector4 &equation) const 
 		equation[0] = trans.getBasis().getRow(0).dot(m_planeNormal);
 		equation[1] = trans.getBasis().getRow(1).dot(m_planeNormal);
@@ -89,7 +89,7 @@ Declared here due of insuficent space on Pool allocators
 class GIM_ShapeRetriever
-	btGImpactShapeInterface * m_gim_shape;
+	const btGImpactShapeInterface * m_gim_shape;
 	btTriangleShapeEx m_trishape;
 	btTetrahedronShapeEx m_tetrashape;
@@ -98,7 +98,7 @@ public:
 		GIM_ShapeRetriever * m_parent;
-		virtual btCollisionShape * getChildShape(int index)
+		virtual const btCollisionShape * getChildShape(int index)
 			return m_parent->m_gim_shape->getChildShape(index);
@@ -133,7 +133,7 @@ public:
 	TetraShapeRetriever  m_tetra_retriever;
 	ChildShapeRetriever * m_current_retriever;
-	GIM_ShapeRetriever(btGImpactShapeInterface * gim_shape)
+	GIM_ShapeRetriever(const btGImpactShapeInterface * gim_shape)
 		m_gim_shape = gim_shape;
 		//select retriever
@@ -153,7 +153,7 @@ public:
 		m_current_retriever->m_parent = this;
-	btCollisionShape * getChildShape(int index)
+	const btCollisionShape * getChildShape(int index)
 		return m_current_retriever->getChildShape(index);
@@ -193,8 +193,8 @@ float btGImpactCollisionAlgorithm::getAverageTriangleCollisionTime()
-btGImpactCollisionAlgorithm::btGImpactCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
-: btActivatingCollisionAlgorithm(ci,body0,body1)
+btGImpactCollisionAlgorithm::btGImpactCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap)
 	m_manifoldPtr = NULL;
 	m_convex_algorithm = NULL;
@@ -209,71 +209,58 @@ btGImpactCollisionAlgorithm::~btGImpactCollisionAlgorithm()
-void btGImpactCollisionAlgorithm::addContactPoint(btCollisionObject * body0,
-				btCollisionObject * body1,
+void btGImpactCollisionAlgorithm::addContactPoint(const btCollisionObjectWrapper * body0Wrap,
+				const btCollisionObjectWrapper * body1Wrap,
 				const btVector3 & point,
 				const btVector3 & normal,
 				btScalar distance)
-	checkManifold(body0,body1);
+	checkManifold(body0Wrap,body1Wrap);
 void btGImpactCollisionAlgorithm::shape_vs_shape_collision(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btCollisionShape * shape0,
-					  btCollisionShape * shape1)
+					  const btCollisionObjectWrapper * body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btCollisionShape * shape0,
+					  const btCollisionShape * shape1)
-	btCollisionShape* tmpShape0 = body0->getCollisionShape();
-	btCollisionShape* tmpShape1 = body1->getCollisionShape();
-	body0->internalSetTemporaryCollisionShape(shape0);
-	body1->internalSetTemporaryCollisionShape(shape1);
-		btCollisionAlgorithm* algor = newAlgorithm(body0,body1);
+		btCollisionAlgorithm* algor = newAlgorithm(body0Wrap,body1Wrap);
 		// post :	checkManifold is called
-		algor->processCollision(body0,body1,*m_dispatchInfo,m_resultOut);
+		algor->processCollision(body0Wrap,body1Wrap,*m_dispatchInfo,m_resultOut);
-	body0->internalSetTemporaryCollisionShape(tmpShape0);
-	body1->internalSetTemporaryCollisionShape(tmpShape1);
 void btGImpactCollisionAlgorithm::convex_vs_convex_collision(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btCollisionShape * shape0,
-					  btCollisionShape * shape1)
+					  const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btCollisionShape* shape0,
+					  const btCollisionShape* shape1)
-	btCollisionShape* tmpShape0 = body0->getCollisionShape();
-	btCollisionShape* tmpShape1 = body1->getCollisionShape();
-	body0->internalSetTemporaryCollisionShape(shape0);
-	body1->internalSetTemporaryCollisionShape(shape1);
-	checkConvexAlgorithm(body0,body1);
-	m_convex_algorithm->processCollision(body0,body1,*m_dispatchInfo,m_resultOut);
+	btCollisionObjectWrapper ob0(body0Wrap,shape0,body0Wrap->getCollisionObject(),body0Wrap->getWorldTransform(),m_part0,m_triface0);
+	btCollisionObjectWrapper ob1(body1Wrap,shape1,body1Wrap->getCollisionObject(),body1Wrap->getWorldTransform(),m_part1,m_triface1);
+	checkConvexAlgorithm(&ob0,&ob1);
+	m_convex_algorithm->processCollision(&ob0,&ob1,*m_dispatchInfo,m_resultOut);
-	body0->internalSetTemporaryCollisionShape(tmpShape0);
-	body1->internalSetTemporaryCollisionShape(tmpShape1);
@@ -283,8 +270,8 @@ void btGImpactCollisionAlgorithm::convex_vs_convex_collision(
 void btGImpactCollisionAlgorithm::gimpact_vs_gimpact_find_pairs(
 					  const btTransform & trans0,
 					  const btTransform & trans1,
-					  btGImpactShapeInterface * shape0,
-					  btGImpactShapeInterface * shape1,btPairSet & pairset)
+					  const btGImpactShapeInterface * shape0,
+					  const btGImpactShapeInterface * shape1,btPairSet & pairset)
 	if(shape0->hasBoxSet() && shape1->hasBoxSet())
@@ -320,8 +307,8 @@ void btGImpactCollisionAlgorithm::gimpact_vs_gimpact_find_pairs(
 void btGImpactCollisionAlgorithm::gimpact_vs_shape_find_pairs(
 					  const btTransform & trans0,
 					  const btTransform & trans1,
-					  btGImpactShapeInterface * shape0,
-					  btCollisionShape * shape1,
+					  const btGImpactShapeInterface * shape0,
+					  const btCollisionShape * shape1,
 					  btAlignedObjectArray<int> & collided_primitives)
@@ -359,10 +346,10 @@ void btGImpactCollisionAlgorithm::gimpact_vs_shape_find_pairs(
-void btGImpactCollisionAlgorithm::collide_gjk_triangles(btCollisionObject * body0,
-				  btCollisionObject * body1,
-				  btGImpactMeshShapePart * shape0,
-				  btGImpactMeshShapePart * shape1,
+void btGImpactCollisionAlgorithm::collide_gjk_triangles(const btCollisionObjectWrapper * body0Wrap,
+				  const btCollisionObjectWrapper * body1Wrap,
+				  const btGImpactMeshShapePart * shape0,
+				  const btGImpactMeshShapePart * shape1,
 				  const int * pairs, int pair_count)
 	btTriangleShapeEx tri0;
@@ -389,7 +376,7 @@ void btGImpactCollisionAlgorithm::collide_gjk_triangles(btCollisionObject * body
 		//collide two convex shapes
-			convex_vs_convex_collision(body0,body1,&tri0,&tri1);
+			convex_vs_convex_collision(body0Wrap,body1Wrap,&tri0,&tri1);
@@ -398,14 +385,14 @@ void btGImpactCollisionAlgorithm::collide_gjk_triangles(btCollisionObject * body
-void btGImpactCollisionAlgorithm::collide_sat_triangles(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactMeshShapePart * shape0,
-					  btGImpactMeshShapePart * shape1,
+void btGImpactCollisionAlgorithm::collide_sat_triangles(const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btGImpactMeshShapePart * shape0,
+					  const btGImpactMeshShapePart * shape1,
 					  const int * pairs, int pair_count)
-	btTransform orgtrans0 = body0->getWorldTransform();
-	btTransform orgtrans1 = body1->getWorldTransform();
+	btTransform orgtrans0 = body0Wrap->getWorldTransform();
+	btTransform orgtrans1 = body1Wrap->getWorldTransform();
 	btPrimitiveTriangle ptri0;
 	btPrimitiveTriangle ptri1;
@@ -451,7 +438,7 @@ void btGImpactCollisionAlgorithm::collide_sat_triangles(btCollisionObject * body
-					addContactPoint(body0, body1,
+					addContactPoint(body0Wrap, body1Wrap,
@@ -472,20 +459,20 @@ void btGImpactCollisionAlgorithm::collide_sat_triangles(btCollisionObject * body
 void btGImpactCollisionAlgorithm::gimpact_vs_gimpact(
-						btCollisionObject * body0,
-					   	btCollisionObject * body1,
-					  	btGImpactShapeInterface * shape0,
-					  	btGImpactShapeInterface * shape1)
+						const btCollisionObjectWrapper* body0Wrap,
+					   	const btCollisionObjectWrapper * body1Wrap,
+					  	const btGImpactShapeInterface * shape0,
+					  	const btGImpactShapeInterface * shape1)
-		btGImpactMeshShape * meshshape0 = static_cast<btGImpactMeshShape *>(shape0);
+		const btGImpactMeshShape * meshshape0 = static_cast<const btGImpactMeshShape *>(shape0);
 		m_part0 = meshshape0->getMeshPartCount();
-			gimpact_vs_gimpact(body0,body1,meshshape0->getMeshPart(m_part0),shape1);
+			gimpact_vs_gimpact(body0Wrap,body1Wrap,meshshape0->getMeshPart(m_part0),shape1);
@@ -493,13 +480,13 @@ void btGImpactCollisionAlgorithm::gimpact_vs_gimpact(
-		btGImpactMeshShape * meshshape1 = static_cast<btGImpactMeshShape *>(shape1);
+		const btGImpactMeshShape * meshshape1 = static_cast<const btGImpactMeshShape *>(shape1);
 		m_part1 = meshshape1->getMeshPartCount();
-			gimpact_vs_gimpact(body0,body1,shape0,meshshape1->getMeshPart(m_part1));
+			gimpact_vs_gimpact(body0Wrap,body1Wrap,shape0,meshshape1->getMeshPart(m_part1));
@@ -507,8 +494,8 @@ void btGImpactCollisionAlgorithm::gimpact_vs_gimpact(
-	btTransform orgtrans0 = body0->getWorldTransform();
-	btTransform orgtrans1 = body1->getWorldTransform();
+	btTransform orgtrans0 = body0Wrap->getWorldTransform();
+	btTransform orgtrans1 = body1Wrap->getWorldTransform();
 	btPairSet pairset;
@@ -519,13 +506,13 @@ void btGImpactCollisionAlgorithm::gimpact_vs_gimpact(
 	if(shape0->getGImpactShapeType() == CONST_GIMPACT_TRIMESH_SHAPE_PART &&
 		shape1->getGImpactShapeType() == CONST_GIMPACT_TRIMESH_SHAPE_PART)
-		btGImpactMeshShapePart * shapepart0 = static_cast<btGImpactMeshShapePart * >(shape0);
-		btGImpactMeshShapePart * shapepart1 = static_cast<btGImpactMeshShapePart * >(shape1);
+		const btGImpactMeshShapePart * shapepart0 = static_cast<const btGImpactMeshShapePart * >(shape0);
+		const btGImpactMeshShapePart * shapepart1 = static_cast<const btGImpactMeshShapePart * >(shape1);
 		//specialized function
-		collide_gjk_triangles(body0,body1,shapepart0,shapepart1,&pairset[0].m_index1,pairset.size());
+		collide_gjk_triangles(body0Wrap,body1Wrap,shapepart0,shapepart1,&pairset[0].m_index1,pairset.size());
-		collide_sat_triangles(body0,body1,shapepart0,shapepart1,&pairset[0].m_index1,pairset.size());
+		collide_sat_triangles(body0Wrap,body1Wrap,shapepart0,shapepart1,&pairset[0].m_index1,pairset.size());
@@ -548,55 +535,49 @@ void btGImpactCollisionAlgorithm::gimpact_vs_gimpact(
 		GIM_PAIR * pair = &pairset[i];
 		m_triface0 = pair->m_index1;
 		m_triface1 = pair->m_index2;
-		btCollisionShape * colshape0 = retriever0.getChildShape(m_triface0);
-		btCollisionShape * colshape1 = retriever1.getChildShape(m_triface1);
-		if(child_has_transform0)
-		{
-			body0->setWorldTransform(orgtrans0*shape0->getChildTransform(m_triface0));
-		}
-		if(child_has_transform1)
-		{
-			body1->setWorldTransform(orgtrans1*shape1->getChildTransform(m_triface1));
-		}
-		//collide two convex shapes
-		convex_vs_convex_collision(body0,body1,colshape0,colshape1);
+		const btCollisionShape * colshape0 = retriever0.getChildShape(m_triface0);
+		const btCollisionShape * colshape1 = retriever1.getChildShape(m_triface1);
+		btTransform tr0 = body0Wrap->getWorldTransform();
+		btTransform tr1 = body1Wrap->getWorldTransform();
-			body0->setWorldTransform(orgtrans0);
+			tr0 = orgtrans0*shape0->getChildTransform(m_triface0);
-			body1->setWorldTransform(orgtrans1);
+			tr1 = orgtrans1*shape1->getChildTransform(m_triface1);
+		btCollisionObjectWrapper ob0(body0Wrap,colshape0,body0Wrap->getCollisionObject(),tr0,m_part0,m_triface0);
+		btCollisionObjectWrapper ob1(body1Wrap,colshape1,body1Wrap->getCollisionObject(),tr1,m_part1,m_triface1);
+		//collide two convex shapes
+		convex_vs_convex_collision(&ob0,&ob1,colshape0,colshape1);
-void btGImpactCollisionAlgorithm::gimpact_vs_shape(btCollisionObject * body0,
-				  btCollisionObject * body1,
-				  btGImpactShapeInterface * shape0,
-				  btCollisionShape * shape1,bool swapped)
+void btGImpactCollisionAlgorithm::gimpact_vs_shape(const btCollisionObjectWrapper* body0Wrap,
+				  const btCollisionObjectWrapper * body1Wrap,
+				  const btGImpactShapeInterface * shape0,
+				  const btCollisionShape * shape1,bool swapped)
-		btGImpactMeshShape * meshshape0 = static_cast<btGImpactMeshShape *>(shape0);
+		const btGImpactMeshShape * meshshape0 = static_cast<const btGImpactMeshShape *>(shape0);
 		int& part = swapped ? m_part1 : m_part0;
 		part = meshshape0->getMeshPartCount();
-			gimpact_vs_shape(body0,
-				  body1,
+			gimpact_vs_shape(body0Wrap,
+				  body1Wrap,
@@ -609,9 +590,9 @@ void btGImpactCollisionAlgorithm::gimpact_vs_shape(btCollisionObject * body0,
 	if(shape0->getGImpactShapeType() == CONST_GIMPACT_TRIMESH_SHAPE_PART &&
 		shape1->getShapeType() == STATIC_PLANE_PROXYTYPE)
-		btGImpactMeshShapePart * shapepart = static_cast<btGImpactMeshShapePart *>(shape0);
-		btStaticPlaneShape * planeshape = static_cast<btStaticPlaneShape * >(shape1);
-		gimpacttrimeshpart_vs_plane_collision(body0,body1,shapepart,planeshape,swapped);
+		const btGImpactMeshShapePart * shapepart = static_cast<const btGImpactMeshShapePart *>(shape0);
+		const btStaticPlaneShape * planeshape = static_cast<const btStaticPlaneShape * >(shape1);
+		gimpacttrimeshpart_vs_plane_collision(body0Wrap,body1Wrap,shapepart,planeshape,swapped);
@@ -621,21 +602,21 @@ void btGImpactCollisionAlgorithm::gimpact_vs_shape(btCollisionObject * body0,
-		btCompoundShape * compoundshape = static_cast<btCompoundShape *>(shape1);
-		gimpact_vs_compoundshape(body0,body1,shape0,compoundshape,swapped);
+		const btCompoundShape * compoundshape = static_cast<const btCompoundShape *>(shape1);
+		gimpact_vs_compoundshape(body0Wrap,body1Wrap,shape0,compoundshape,swapped);
 	else if(shape1->isConcave())
-		btConcaveShape * concaveshape = static_cast<btConcaveShape *>(shape1);
-		gimpact_vs_concave(body0,body1,shape0,concaveshape,swapped);
+		const btConcaveShape * concaveshape = static_cast<const btConcaveShape *>(shape1);
+		gimpact_vs_concave(body0Wrap,body1Wrap,shape0,concaveshape,swapped);
-	btTransform orgtrans0 = body0->getWorldTransform();
+	btTransform orgtrans0 = body0Wrap->getWorldTransform();
-	btTransform orgtrans1 = body1->getWorldTransform();
+	btTransform orgtrans1 = body1Wrap->getWorldTransform();
 	btAlignedObjectArray<int> collided_results;
@@ -662,28 +643,38 @@ void btGImpactCollisionAlgorithm::gimpact_vs_shape(btCollisionObject * body0,
             m_triface0 = child_index;
-		btCollisionShape * colshape0 = retriever0.getChildShape(child_index);
+		const btCollisionShape * colshape0 = retriever0.getChildShape(child_index);
+		btTransform tr0 = body0Wrap->getWorldTransform();
-			body0->setWorldTransform(orgtrans0*shape0->getChildTransform(child_index));
+			tr0 = orgtrans0*shape0->getChildTransform(child_index);
+		}
+		btCollisionObjectWrapper ob0(body0Wrap,colshape0,body0Wrap->getCollisionObject(),body0Wrap->getWorldTransform(),m_part0,m_triface0);
+		const btCollisionObjectWrapper* prevObj0 = m_resultOut->getBody0Wrap();
+		if (m_resultOut->getBody0Wrap()->getCollisionObject()==ob0.getCollisionObject())
+		{
+			m_resultOut->setBody0Wrap(&ob0);
+		} else
+		{
+			m_resultOut->setBody1Wrap(&ob0);
 		//collide two shapes
-			shape_vs_shape_collision(body1,body0,shape1,colshape0);
+			shape_vs_shape_collision(body1Wrap,&ob0,shape1,colshape0);
-			shape_vs_shape_collision(body0,body1,colshape0,shape1);
-		}
-		//restore transforms
-		if(child_has_transform0)
-		{
-			body0->setWorldTransform(orgtrans0);
+			shape_vs_shape_collision(&ob0,body1Wrap,colshape0,shape1);
+		m_resultOut->setBody0Wrap(prevObj0);
@@ -691,44 +682,58 @@ void btGImpactCollisionAlgorithm::gimpact_vs_shape(btCollisionObject * body0,
-void btGImpactCollisionAlgorithm::gimpact_vs_compoundshape(btCollisionObject * body0,
-				  btCollisionObject * body1,
-				  btGImpactShapeInterface * shape0,
-				  btCompoundShape * shape1,bool swapped)
+void btGImpactCollisionAlgorithm::gimpact_vs_compoundshape(const btCollisionObjectWrapper* body0Wrap,
+				  const btCollisionObjectWrapper* body1Wrap,
+				  const btGImpactShapeInterface * shape0,
+				  const btCompoundShape * shape1,bool swapped)
-	btTransform orgtrans1 = body1->getWorldTransform();
+	btTransform orgtrans1 = body1Wrap->getWorldTransform();
 	int i = shape1->getNumChildShapes();
-		btCollisionShape * colshape1 = shape1->getChildShape(i);
+		const btCollisionShape * colshape1 = shape1->getChildShape(i);
 		btTransform childtrans1 = orgtrans1*shape1->getChildTransform(i);
-		body1->setWorldTransform(childtrans1);
+		btCollisionObjectWrapper ob1(body1Wrap,colshape1,body1Wrap->getCollisionObject(),childtrans1,-1,i);
+		const btCollisionObjectWrapper* tmp = 0;
+		if (m_resultOut->getBody0Wrap()->getCollisionObject()==ob1.getCollisionObject())
+		{
+			tmp = m_resultOut->getBody0Wrap();
+			m_resultOut->setBody0Wrap(&ob1);
+		} else
+		{
+			tmp = m_resultOut->getBody1Wrap();
+			m_resultOut->setBody1Wrap(&ob1);
+		}
 		//collide child shape
-		gimpact_vs_shape(body0, body1,
+		gimpact_vs_shape(body0Wrap, &ob1,
-		//restore transforms
-		body1->setWorldTransform(orgtrans1);
+		if (m_resultOut->getBody0Wrap()->getCollisionObject()==ob1.getCollisionObject())
+		{
+			m_resultOut->setBody0Wrap(tmp);
+		} else
+		{
+			m_resultOut->setBody1Wrap(tmp);
+		}
 void btGImpactCollisionAlgorithm::gimpacttrimeshpart_vs_plane_collision(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactMeshShapePart * shape0,
-					  btStaticPlaneShape * shape1,bool swapped)
+					  const btCollisionObjectWrapper * body0Wrap,
+					  const btCollisionObjectWrapper * body1Wrap,
+					  const btGImpactMeshShapePart * shape0,
+					  const btStaticPlaneShape * shape1,bool swapped)
-	btTransform orgtrans0 = body0->getWorldTransform();
-	btTransform orgtrans1 = body1->getWorldTransform();
+	btTransform orgtrans0 = body0Wrap->getWorldTransform();
+	btTransform orgtrans1 = body1Wrap->getWorldTransform();
-	btPlaneShape * planeshape = static_cast<btPlaneShape *>(shape1);
+	const btPlaneShape * planeshape = static_cast<const btPlaneShape *>(shape1);
 	btVector4 plane;
@@ -757,14 +762,14 @@ void btGImpactCollisionAlgorithm::gimpacttrimeshpart_vs_plane_collision(
-				addContactPoint(body1, body0,
+				addContactPoint(body1Wrap, body0Wrap,
-				addContactPoint(body0, body1,
+				addContactPoint(body0Wrap, body1Wrap,
@@ -782,9 +787,9 @@ class btGImpactTriangleCallback: public btTriangleCallback
 	btGImpactCollisionAlgorithm * algorithm;
-	btCollisionObject * body0;
-	btCollisionObject * body1;
-	btGImpactShapeInterface * gimpactshape0;
+	const btCollisionObjectWrapper * body0Wrap;
+	const btCollisionObjectWrapper * body1Wrap;
+	const btGImpactShapeInterface * gimpactshape0;
 	bool swapped;
 	btScalar margin;
@@ -802,8 +807,31 @@ public:
+		btCollisionObjectWrapper ob1Wrap(body1Wrap,&tri1,body1Wrap->getCollisionObject(),body1Wrap->getWorldTransform(),partId,triangleIndex);
+		const btCollisionObjectWrapper * tmp = 0;
+		if (algorithm->internalGetResultOut()->getBody0Wrap()->getCollisionObject()==ob1Wrap.getCollisionObject())
+		{
+			tmp = algorithm->internalGetResultOut()->getBody0Wrap();
+			algorithm->internalGetResultOut()->setBody0Wrap(&ob1Wrap);
+		} else
+		{
+			tmp = algorithm->internalGetResultOut()->getBody1Wrap();
+			algorithm->internalGetResultOut()->setBody1Wrap(&ob1Wrap);
+		}
-							body0,body1,gimpactshape0,&tri1,swapped);
+							body0Wrap,&ob1Wrap,gimpactshape0,&tri1,swapped);
+		if (algorithm->internalGetResultOut()->getBody0Wrap()->getCollisionObject()==ob1Wrap.getCollisionObject())
+		{
+			algorithm->internalGetResultOut()->setBody0Wrap(tmp);
+		} else
+		{
+			algorithm->internalGetResultOut()->setBody1Wrap(tmp);
+		}
@@ -811,16 +839,16 @@ public:
 void btGImpactCollisionAlgorithm::gimpact_vs_concave(
-				  btCollisionObject * body0,
-				  btCollisionObject * body1,
-				  btGImpactShapeInterface * shape0,
-				  btConcaveShape * shape1,bool swapped)
+				  const btCollisionObjectWrapper* body0Wrap,
+				  const btCollisionObjectWrapper * body1Wrap,
+				  const btGImpactShapeInterface * shape0,
+				  const btConcaveShape * shape1,bool swapped)
 	//create the callback
 	btGImpactTriangleCallback tricallback;
 	tricallback.algorithm = this;
-	tricallback.body0 = body0;
-	tricallback.body1 = body1;
+	tricallback.body0Wrap = body0Wrap;
+	tricallback.body1Wrap = body1Wrap;
 	tricallback.gimpactshape0 = shape0;
 	tricallback.swapped = swapped;
 	tricallback.margin = shape1->getMargin();
@@ -828,7 +856,7 @@ void btGImpactCollisionAlgorithm::gimpact_vs_concave(
 	//getting the trimesh AABB
 	btTransform gimpactInConcaveSpace;
-	gimpactInConcaveSpace = body1->getWorldTransform().inverse() * body0->getWorldTransform();
+	gimpactInConcaveSpace = body1Wrap->getWorldTransform().inverse() * body0Wrap->getWorldTransform();
 	btVector3 minAABB,maxAABB;
@@ -839,36 +867,36 @@ void btGImpactCollisionAlgorithm::gimpact_vs_concave(
-void btGImpactCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btGImpactCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
     m_resultOut = resultOut;
 	m_dispatchInfo = &dispatchInfo;
-    btGImpactShapeInterface * gimpactshape0;
-    btGImpactShapeInterface * gimpactshape1;
+    const btGImpactShapeInterface * gimpactshape0;
+    const btGImpactShapeInterface * gimpactshape1;
-	if (body0->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE)
+	if (body0Wrap->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE)
-		gimpactshape0 = static_cast<btGImpactShapeInterface *>(body0->getCollisionShape());
+		gimpactshape0 = static_cast<const btGImpactShapeInterface *>(body0Wrap->getCollisionShape());
-		if( body1->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE )
+		if( body1Wrap->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE )
-			gimpactshape1 = static_cast<btGImpactShapeInterface *>(body1->getCollisionShape());
+			gimpactshape1 = static_cast<const btGImpactShapeInterface *>(body1Wrap->getCollisionShape());
-			gimpact_vs_gimpact(body0,body1,gimpactshape0,gimpactshape1);
+			gimpact_vs_gimpact(body0Wrap,body1Wrap,gimpactshape0,gimpactshape1);
-			gimpact_vs_shape(body0,body1,gimpactshape0,body1->getCollisionShape(),false);
+			gimpact_vs_shape(body0Wrap,body1Wrap,gimpactshape0,body1Wrap->getCollisionShape(),false);
-	else if (body1->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE )
+	else if (body1Wrap->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE )
-		gimpactshape1 = static_cast<btGImpactShapeInterface *>(body1->getCollisionShape());
+		gimpactshape1 = static_cast<const btGImpactShapeInterface *>(body1Wrap->getCollisionShape());
-		gimpact_vs_shape(body1,body0,gimpactshape1,body0->getCollisionShape(),true);
+		gimpact_vs_shape(body1Wrap,body0Wrap,gimpactshape1,body0Wrap->getCollisionShape(),true);
diff --git a/src/bullet/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.h b/src/bullet/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.h
index 6b6e07c9..f85a94cb 100644
--- a/src/bullet/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.h
+++ b/src/bullet/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.h
@@ -40,7 +40,7 @@ class btDispatcher;
 #include "BulletCollision/CollisionShapes/btCompoundShape.h"
 #include "BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h"
 #include "LinearMath/btIDebugDraw.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 //! Collision Algorithm for GImpact Shapes
@@ -65,7 +65,7 @@ protected:
 	//! Creates a new contact point
-	SIMD_FORCE_INLINE btPersistentManifold* newContactManifold(btCollisionObject* body0,btCollisionObject* body1)
+	SIMD_FORCE_INLINE btPersistentManifold* newContactManifold(const btCollisionObject* body0,const btCollisionObject* body1)
 		m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
 		return m_manifoldPtr;
@@ -106,38 +106,38 @@ protected:
 	// Call before process collision
-	SIMD_FORCE_INLINE void checkManifold(btCollisionObject* body0,btCollisionObject* body1)
+	SIMD_FORCE_INLINE void checkManifold(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		if(getLastManifold() == 0)
-			newContactManifold(body0,body1);
+			newContactManifold(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 	// Call before process collision
-	SIMD_FORCE_INLINE btCollisionAlgorithm * newAlgorithm(btCollisionObject* body0,btCollisionObject* body1)
+	SIMD_FORCE_INLINE btCollisionAlgorithm * newAlgorithm(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
-		checkManifold(body0,body1);
+		checkManifold(body0Wrap,body1Wrap);
 		btCollisionAlgorithm * convex_algorithm = m_dispatcher->findAlgorithm(
-				body0,body1,getLastManifold());
+				body0Wrap,body1Wrap,getLastManifold());
 		return convex_algorithm ;
 	// Call before process collision
-	SIMD_FORCE_INLINE void checkConvexAlgorithm(btCollisionObject* body0,btCollisionObject* body1)
+	SIMD_FORCE_INLINE void checkConvexAlgorithm(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		if(m_convex_algorithm) return;
-		m_convex_algorithm = newAlgorithm(body0,body1);
+		m_convex_algorithm = newAlgorithm(body0Wrap,body1Wrap);
-	void addContactPoint(btCollisionObject * body0,
-					btCollisionObject * body1,
+	void addContactPoint(const btCollisionObjectWrapper * body0Wrap,
+					const btCollisionObjectWrapper * body1Wrap,
 					const btVector3 & point,
 					const btVector3 & normal,
 					btScalar distance);
@@ -145,62 +145,62 @@ protected:
 //! Collision routines
-	void collide_gjk_triangles(btCollisionObject * body0,
-				  btCollisionObject * body1,
-				  btGImpactMeshShapePart * shape0,
-				  btGImpactMeshShapePart * shape1,
+	void collide_gjk_triangles(const btCollisionObjectWrapper* body0Wrap,
+				  const btCollisionObjectWrapper* body1Wrap,
+				  const btGImpactMeshShapePart * shape0,
+				  const btGImpactMeshShapePart * shape1,
 				  const int * pairs, int pair_count);
-	void collide_sat_triangles(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactMeshShapePart * shape0,
-					  btGImpactMeshShapePart * shape1,
+	void collide_sat_triangles(const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btGImpactMeshShapePart * shape0,
+					  const btGImpactMeshShapePart * shape1,
 					  const int * pairs, int pair_count);
 	void shape_vs_shape_collision(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btCollisionShape * shape0,
-					  btCollisionShape * shape1);
+					  const btCollisionObjectWrapper* body0,
+					  const btCollisionObjectWrapper* body1,
+					  const btCollisionShape * shape0,
+					  const btCollisionShape * shape1);
-	void convex_vs_convex_collision(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btCollisionShape * shape0,
-					  btCollisionShape * shape1);
+	void convex_vs_convex_collision(const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btCollisionShape* shape0,
+					  const btCollisionShape* shape1);
 	void gimpact_vs_gimpact_find_pairs(
 					  const btTransform & trans0,
 					  const btTransform & trans1,
-					  btGImpactShapeInterface * shape0,
-					  btGImpactShapeInterface * shape1,btPairSet & pairset);
+					  const btGImpactShapeInterface * shape0,
+					  const btGImpactShapeInterface * shape1,btPairSet & pairset);
 	void gimpact_vs_shape_find_pairs(
 					  const btTransform & trans0,
 					  const btTransform & trans1,
-					  btGImpactShapeInterface * shape0,
-					  btCollisionShape * shape1,
+					  const btGImpactShapeInterface * shape0,
+					  const btCollisionShape * shape1,
 					  btAlignedObjectArray<int> & collided_primitives);
 	void gimpacttrimeshpart_vs_plane_collision(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactMeshShapePart * shape0,
-					  btStaticPlaneShape * shape1,bool swapped);
+					  const btCollisionObjectWrapper * body0Wrap,
+					  const btCollisionObjectWrapper * body1Wrap,
+					  const btGImpactMeshShapePart * shape0,
+					  const btStaticPlaneShape * shape1,bool swapped);
-	btGImpactCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	btGImpactCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 	virtual ~btGImpactCollisionAlgorithm();
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	btScalar	calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -210,13 +210,17 @@ public:
+	btManifoldResult*	internalGetResultOut()
+	{
+		return m_resultOut;
+	}
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btGImpactCollisionAlgorithm));
-			return new(mem) btGImpactCollisionAlgorithm(ci,body0,body1);
+			return new(mem) btGImpactCollisionAlgorithm(ci,body0Wrap,body1Wrap);
@@ -236,26 +240,26 @@ public:
-	void gimpact_vs_gimpact(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactShapeInterface * shape0,
-					  btGImpactShapeInterface * shape1);
+	void gimpact_vs_gimpact(const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper * body1Wrap,
+					  const btGImpactShapeInterface * shape0,
+					  const btGImpactShapeInterface * shape1);
-	void gimpact_vs_shape(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactShapeInterface * shape0,
-					  btCollisionShape * shape1,bool swapped);
+	void gimpact_vs_shape(const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btGImpactShapeInterface * shape0,
+					  const btCollisionShape * shape1,bool swapped);
-	void gimpact_vs_compoundshape(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactShapeInterface * shape0,
-					  btCompoundShape * shape1,bool swapped);
+	void gimpact_vs_compoundshape(const btCollisionObjectWrapper * body0Wrap,
+					  const btCollisionObjectWrapper * body1Wrap,
+					  const btGImpactShapeInterface * shape0,
+					  const btCompoundShape * shape1,bool swapped);
 	void gimpact_vs_concave(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactShapeInterface * shape0,
-					  btConcaveShape * shape1,bool swapped);
+					  const btCollisionObjectWrapper * body0Wrap,
+					  const btCollisionObjectWrapper * body1Wrap,
+					  const btGImpactShapeInterface * shape0,
+					  const btConcaveShape * shape1,bool swapped);
diff --git a/src/bullet/BulletCollision/Gimpact/btGImpactQuantizedBvh.cpp b/src/bullet/BulletCollision/Gimpact/btGImpactQuantizedBvh.cpp
index cd4dfdb6..4528758c 100644
--- a/src/bullet/BulletCollision/Gimpact/btGImpactQuantizedBvh.cpp
+++ b/src/bullet/BulletCollision/Gimpact/btGImpactQuantizedBvh.cpp
@@ -384,7 +384,7 @@ bool btGImpactQuantizedBvh::rayQuery(
 SIMD_FORCE_INLINE bool _quantized_node_collision(
-	btGImpactQuantizedBvh * boxset0, btGImpactQuantizedBvh * boxset1,
+	const btGImpactQuantizedBvh * boxset0, const btGImpactQuantizedBvh * boxset1,
 	const BT_BOX_BOX_TRANSFORM_CACHE & trans_cache_1to0,
 	int node0 ,int node1, bool complete_primitive_tests)
@@ -402,7 +402,7 @@ SIMD_FORCE_INLINE bool _quantized_node_collision(
 //stackless recursive collision routine
 static void _find_quantized_collision_pairs_recursive(
-	btGImpactQuantizedBvh * boxset0, btGImpactQuantizedBvh * boxset1,
+	const btGImpactQuantizedBvh * boxset0, const btGImpactQuantizedBvh * boxset1,
 	btPairSet * collision_pairs,
 	const BT_BOX_BOX_TRANSFORM_CACHE & trans_cache_1to0,
 	int node0, int node1, bool complete_primitive_tests)
@@ -501,8 +501,8 @@ static void _find_quantized_collision_pairs_recursive(
-void btGImpactQuantizedBvh::find_collision(btGImpactQuantizedBvh * boxset0, const btTransform & trans0,
-		btGImpactQuantizedBvh * boxset1, const btTransform & trans1,
+void btGImpactQuantizedBvh::find_collision(const btGImpactQuantizedBvh * boxset0, const btTransform & trans0,
+		const btGImpactQuantizedBvh * boxset1, const btTransform & trans1,
 		btPairSet & collision_pairs)
diff --git a/src/bullet/BulletCollision/Gimpact/btGImpactQuantizedBvh.h b/src/bullet/BulletCollision/Gimpact/btGImpactQuantizedBvh.h
index 9c990774..e6e52fff 100644
--- a/src/bullet/BulletCollision/Gimpact/btGImpactQuantizedBvh.h
+++ b/src/bullet/BulletCollision/Gimpact/btGImpactQuantizedBvh.h
@@ -363,8 +363,8 @@ public:
 	static float getAverageTreeCollisionTime();
-	static void find_collision(btGImpactQuantizedBvh * boxset1, const btTransform & trans1,
-		btGImpactQuantizedBvh * boxset2, const btTransform & trans2,
+	static void find_collision(const btGImpactQuantizedBvh * boxset1, const btTransform & trans1,
+		const btGImpactQuantizedBvh * boxset2, const btTransform & trans2,
 		btPairSet & collision_pairs);
diff --git a/src/bullet/BulletCollision/Gimpact/btGImpactShape.cpp b/src/bullet/BulletCollision/Gimpact/btGImpactShape.cpp
index cceace55..ac8efdf3 100644
--- a/src/bullet/BulletCollision/Gimpact/btGImpactShape.cpp
+++ b/src/bullet/BulletCollision/Gimpact/btGImpactShape.cpp
@@ -25,6 +25,7 @@ subject to the following restrictions:
 void btGImpactCompoundShape::calculateLocalInertia(btScalar mass,btVector3& inertia) const
@@ -144,6 +145,31 @@ void btGImpactMeshShape::rayTest(const btVector3& rayFrom, const btVector3& rayT
+void btGImpactMeshShapePart::processAllTrianglesRay(btTriangleCallback* callback,const btVector3& rayFrom, const btVector3& rayTo) const
+	lockChildShapes();
+	btAlignedObjectArray<int> collided;
+	btVector3 rayDir(rayTo - rayFrom);
+	rayDir.normalize();
+	m_box_set.rayQuery(rayDir, rayFrom, collided);
+	if(collided.size()==0)
+	{
+		unlockChildShapes();
+		return;
+	}
+	int part = (int)getPart();
+	btPrimitiveTriangle triangle;
+	int i = collided.size();
+	while(i--)
+	{
+		getPrimitiveTriangle(collided[i],triangle);
+		callback->processTriangle(triangle.m_vertices,part,collided[i]);
+	}
+	unlockChildShapes();
 void btGImpactMeshShapePart::processAllTriangles(btTriangleCallback* callback,const btVector3& aabbMin,const btVector3& aabbMax) const
@@ -182,6 +208,15 @@ void btGImpactMeshShape::processAllTriangles(btTriangleCallback* callback,const
+void btGImpactMeshShape::processAllTrianglesRay(btTriangleCallback* callback,const btVector3& rayFrom, const btVector3& rayTo) const
+	int i = m_mesh_parts.size();
+	while(i--)
+	{
+		m_mesh_parts[i]->processAllTrianglesRay(callback, rayFrom, rayTo);
+	}
 ///fills the dataBuffer and returns the struct name (and 0 on failure)
 const char*	btGImpactMeshShape::serialize(void* dataBuffer, btSerializer* serializer) const
diff --git a/src/bullet/BulletCollision/Gimpact/btGImpactShape.h b/src/bullet/BulletCollision/Gimpact/btGImpactShape.h
index 90015bb9..3d1f48d4 100644
--- a/src/bullet/BulletCollision/Gimpact/btGImpactShape.h
+++ b/src/bullet/BulletCollision/Gimpact/btGImpactShape.h
@@ -51,6 +51,7 @@ enum eGIMPACT_SHAPE_TYPE
 //! Helper class for tetrahedrons
 class btTetrahedronShapeEx:public btBU_Simplex1to4
@@ -192,7 +193,7 @@ public:
 	virtual eGIMPACT_SHAPE_TYPE getGImpactShapeType() const = 0 ;
 	//! gets boxset
-	SIMD_FORCE_INLINE btGImpactBoxSet * getBoxSet()
+	SIMD_FORCE_INLINE const btGImpactBoxSet * getBoxSet() const
 		return &m_box_set;
@@ -288,6 +289,15 @@ public:
         (void) callback; (void) aabbMin; (void) aabbMax;
+	//! Function for retrieving triangles.
+	/*!
+	It gives the triangles in local space
+	*/
+	virtual void processAllTrianglesRay(btTriangleCallback* /*callback*/,const btVector3& /*rayFrom*/, const btVector3& /*rayTo*/) const
+	{
+	}
@@ -635,25 +645,25 @@ public:
 			return (int )numverts;
-		SIMD_FORCE_INLINE void get_indices(int face_index,int &i0,int &i1,int &i2) const
+		SIMD_FORCE_INLINE void get_indices(int face_index,unsigned int &i0,unsigned int &i1,unsigned int &i2) const
 			if(indicestype == PHY_SHORT)
-				short * s_indices = (short *)(indexbase + face_index*indexstride);
+				unsigned short* s_indices = (unsigned short *)(indexbase + face_index * indexstride);
 				i0 = s_indices[0];
 				i1 = s_indices[1];
 				i2 = s_indices[2];
-				int * i_indices = (int *)(indexbase + face_index*indexstride);
+				unsigned int * i_indices = (unsigned int *)(indexbase + face_index*indexstride);
 				i0 = i_indices[0];
 				i1 = i_indices[1];
 				i2 = i_indices[2];
-		SIMD_FORCE_INLINE void get_vertex(int vertex_index, btVector3 & vertex) const
+		SIMD_FORCE_INLINE void get_vertex(unsigned int vertex_index, btVector3 & vertex) const
 			if(type == PHY_DOUBLE)
@@ -682,7 +692,7 @@ public:
 		virtual void get_primitive_triangle(int prim_index,btPrimitiveTriangle & triangle) const
-			int indices[3];
+			unsigned int indices[3];
@@ -692,7 +702,7 @@ public:
 		SIMD_FORCE_INLINE void get_bullet_triangle(int prim_index,btTriangleShapeEx & triangle) const
-			int indices[3];
+			unsigned int indices[3];
@@ -885,6 +895,7 @@ public:
 	virtual void	processAllTriangles(btTriangleCallback* callback,const btVector3& aabbMin,const btVector3& aabbMax) const;
+	virtual void	processAllTrianglesRay(btTriangleCallback* callback,const btVector3& rayFrom,const btVector3& rayTo) const;
@@ -1141,6 +1152,8 @@ public:
 	virtual void	processAllTriangles(btTriangleCallback* callback,const btVector3& aabbMin,const btVector3& aabbMax) const;
+	virtual void	processAllTrianglesRay (btTriangleCallback* callback,const btVector3& rayFrom,const btVector3& rayTo) const;
 	virtual	int	calculateSerializeBufferSize() const;
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
diff --git a/src/bullet/BulletCollision/Gimpact/gim_array.h b/src/bullet/BulletCollision/Gimpact/gim_array.h
index cfd5da8f..27e6f32f 100644
--- a/src/bullet/BulletCollision/Gimpact/gim_array.h
+++ b/src/bullet/BulletCollision/Gimpact/gim_array.h
@@ -285,18 +285,16 @@ public:
 	    m_data[index] = obj;
-	inline void resize(GUINT size, bool call_constructor = true)
+	inline void resize(GUINT size, bool call_constructor = true, const T& fillData=T())
-            	T obj;
-                    m_data[m_size] = obj;
+                    m_data[m_size] = fillData;
diff --git a/src/bullet/BulletCollision/Gimpact/gim_basic_geometry_operations.h b/src/bullet/BulletCollision/Gimpact/gim_basic_geometry_operations.h
index 91527740..d98051da 100644
--- a/src/bullet/BulletCollision/Gimpact/gim_basic_geometry_operations.h
+++ b/src/bullet/BulletCollision/Gimpact/gim_basic_geometry_operations.h
@@ -404,12 +404,12 @@ SIMD_FORCE_INLINE void SEGMENT_COLLISION(
 	CLASS_POINT & vPointA,
 	CLASS_POINT & vPointB)
     vec4f _M;//plane
-    VEC_CROSS(_N,_AD,_BD);
-    GREAL _tp = VEC_DOT(_N,_N);
+    VEC_CROSS(n,_AD,_BD);
+    GREAL _tp = VEC_DOT(n,n);
     	//project B over A
@@ -424,10 +424,10 @@ SIMD_FORCE_INLINE void SEGMENT_COLLISION(
     	_M[2] = VEC_DOT(vA1,_AD);
     	_M[3] = VEC_DOT(vA2,_AD);
     	//mid points
-    	_N[0] = (_M[0]+_M[1])*0.5f;
-    	_N[1] = (_M[2]+_M[3])*0.5f;
+    	n[0] = (_M[0]+_M[1])*0.5f;
+    	n[1] = (_M[2]+_M[3])*0.5f;
-    	if(_N[0]<_N[1])
+    	if(n[0]<n[1])
-    VEC_CROSS(_M,_N,_BD);
+    VEC_CROSS(_M,n,_BD);
     _M[3] = VEC_DOT(_M,vB1);
     LINE_PLANE_COLLISION(_M,_AD,vA1,vPointA,_tp,btScalar(0), btScalar(1));
diff --git a/src/bullet/BulletCollision/Gimpact/gim_box_collision.h b/src/bullet/BulletCollision/Gimpact/gim_box_collision.h
index b360dd47..9c572638 100644
--- a/src/bullet/BulletCollision/Gimpact/gim_box_collision.h
+++ b/src/bullet/BulletCollision/Gimpact/gim_box_collision.h
@@ -186,9 +186,7 @@ public:
 	SIMD_FORCE_INLINE btVector3 transform(const btVector3 & point)
-		return btVector3(m_R1to0[0].dot(point) + m_T1to0.x(),
-			m_R1to0[1].dot(point) + m_T1to0.y(),
-			m_R1to0[2].dot(point) + m_T1to0.z());
+        return point.dot3(m_R1to0[0], m_R1to0[1], m_R1to0[2]) + m_T1to0;
@@ -332,10 +330,10 @@ public:
 		// Compute new center
 		center = trans(center);
-		btVector3 textends(extends.dot(trans.getBasis().getRow(0).absolute()),
- 				 extends.dot(trans.getBasis().getRow(1).absolute()),
-				 extends.dot(trans.getBasis().getRow(2).absolute()));
+        btVector3 textends = extends.dot3(trans.getBasis().getRow(0).absolute(), 
+                                          trans.getBasis().getRow(1).absolute(), 
+                                          trans.getBasis().getRow(2).absolute());
 		m_min = center - textends;
 		m_max = center + textends;
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btComputeGjkEpaPenetration.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btComputeGjkEpaPenetration.h
new file mode 100644
index 00000000..9eb880b8
--- /dev/null
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btComputeGjkEpaPenetration.h
@@ -0,0 +1,369 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2014 Erwin Coumans http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "LinearMath/btTransform.h" // Note that btVector3 might be double precision...
+#include "btGjkEpa3.h"
+#include "btGjkCollisionDescription.h"
+#include "BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h"
+template <typename btConvexTemplate>
+bool btGjkEpaCalcPenDepth(const btConvexTemplate& a, const btConvexTemplate& b,
+                          const btGjkCollisionDescription& colDesc,
+                          btVector3& v, btVector3& wWitnessOnA, btVector3& wWitnessOnB)
+    (void)v;
+    //	const btScalar				radialmargin(btScalar(0.));
+    btVector3	guessVector(b.getWorldTransform().getOrigin()-a.getWorldTransform().getOrigin());//?? why not use the GJK input?
+    btGjkEpaSolver3::sResults	results;
+    if(btGjkEpaSolver3_Penetration(a,b,guessVector,results))
+    {
+        //	debugDraw->drawLine(results.witnesses[1],results.witnesses[1]+results.normal,btVector3(255,0,0));
+        //resultOut->addContactPoint(results.normal,results.witnesses[1],-results.depth);
+        wWitnessOnA = results.witnesses[0];
+        wWitnessOnB = results.witnesses[1];
+        v = results.normal;
+        return true;
+    } else
+    {
+        if(btGjkEpaSolver3_Distance(a,b,guessVector,results))
+        {
+            wWitnessOnA = results.witnesses[0];
+            wWitnessOnB = results.witnesses[1];
+            v = results.normal;
+            return false;
+        }
+    }
+    return false;
+template <typename btConvexTemplate, typename btGjkDistanceTemplate>
+int	btComputeGjkEpaPenetration(const btConvexTemplate& a, const btConvexTemplate& b, const btGjkCollisionDescription& colDesc, btVoronoiSimplexSolver& simplexSolver, btGjkDistanceTemplate* distInfo)
+    bool m_catchDegeneracies  = true;
+    btScalar m_cachedSeparatingDistance = 0.f;
+    btScalar distance=btScalar(0.);
+    btVector3	normalInB(btScalar(0.),btScalar(0.),btScalar(0.));
+    btVector3 pointOnA,pointOnB;
+    btTransform	localTransA = a.getWorldTransform();
+    btTransform localTransB = b.getWorldTransform();
+    btScalar marginA = a.getMargin();
+    btScalar marginB = b.getMargin();
+    int m_curIter = 0;
+    int gGjkMaxIter = colDesc.m_maxGjkIterations;//this is to catch invalid input, perhaps check for #NaN?
+    btVector3 m_cachedSeparatingAxis = colDesc.m_firstDir;
+    bool isValid = false;
+    bool checkSimplex = false;
+    bool checkPenetration = true;
+    int m_degenerateSimplex = 0;
+    int m_lastUsedMethod = -1;
+    {
+        btScalar squaredDistance = BT_LARGE_FLOAT;
+        btScalar delta = btScalar(0.);
+        btScalar margin = marginA + marginB;
+        simplexSolver.reset();
+        for ( ; ; )
+            //while (true)
+        {
+            btVector3 seperatingAxisInA = (-m_cachedSeparatingAxis)* localTransA.getBasis();
+            btVector3 seperatingAxisInB = m_cachedSeparatingAxis* localTransB.getBasis();
+            btVector3 pInA = a.getLocalSupportWithoutMargin(seperatingAxisInA);
+            btVector3 qInB = b.getLocalSupportWithoutMargin(seperatingAxisInB);
+            btVector3  pWorld = localTransA(pInA);
+            btVector3  qWorld = localTransB(qInB);
+            btVector3 w	= pWorld - qWorld;
+            delta = m_cachedSeparatingAxis.dot(w);
+            // potential exit, they don't overlap
+            if ((delta > btScalar(0.0)) && (delta * delta > squaredDistance * colDesc.m_maximumDistanceSquared))
+            {
+                m_degenerateSimplex = 10;
+                checkSimplex=true;
+                //checkPenetration = false;
+                break;
+            }
+            //exit 0: the new point is already in the simplex, or we didn't come any closer
+            if (simplexSolver.inSimplex(w))
+            {
+                m_degenerateSimplex = 1;
+                checkSimplex = true;
+                break;
+            }
+            // are we getting any closer ?
+            btScalar f0 = squaredDistance - delta;
+            btScalar f1 = squaredDistance * colDesc.m_gjkRelError2;
+            if (f0 <= f1)
+            {
+                if (f0 <= btScalar(0.))
+                {
+                    m_degenerateSimplex = 2;
+                } else
+                {
+                    m_degenerateSimplex = 11;
+                }
+                checkSimplex = true;
+                break;
+            }
+            //add current vertex to simplex
+            simplexSolver.addVertex(w, pWorld, qWorld);
+            btVector3 newCachedSeparatingAxis;
+            //calculate the closest point to the origin (update vector v)
+            if (!simplexSolver.closest(newCachedSeparatingAxis))
+            {
+                m_degenerateSimplex = 3;
+                checkSimplex = true;
+                break;
+            }
+            if(newCachedSeparatingAxis.length2()<colDesc.m_gjkRelError2)
+            {
+                m_cachedSeparatingAxis = newCachedSeparatingAxis;
+                m_degenerateSimplex = 6;
+                checkSimplex = true;
+                break;
+            }
+            btScalar previousSquaredDistance = squaredDistance;
+            squaredDistance = newCachedSeparatingAxis.length2();
+#if 0
+            ///warning: this termination condition leads to some problems in 2d test case see Bullet/Demos/Box2dDemo
+            if (squaredDistance>previousSquaredDistance)
+            {
+                m_degenerateSimplex = 7;
+                squaredDistance = previousSquaredDistance;
+                checkSimplex = false;
+                break;
+            }
+#endif //
+            //redundant m_simplexSolver->compute_points(pointOnA, pointOnB);
+            //are we getting any closer ?
+            if (previousSquaredDistance - squaredDistance <= SIMD_EPSILON * previousSquaredDistance)
+            {
+                //				m_simplexSolver->backup_closest(m_cachedSeparatingAxis);
+                checkSimplex = true;
+                m_degenerateSimplex = 12;
+                break;
+            }
+            m_cachedSeparatingAxis = newCachedSeparatingAxis;
+            //degeneracy, this is typically due to invalid/uninitialized worldtransforms for a btCollisionObject
+            if (m_curIter++ > gGjkMaxIter)
+            {
+#if defined(DEBUG) || defined (_DEBUG)
+                printf("btGjkPairDetector maxIter exceeded:%i\n",m_curIter);
+                printf("sepAxis=(%f,%f,%f), squaredDistance = %f\n",
+                       m_cachedSeparatingAxis.getX(),
+                       m_cachedSeparatingAxis.getY(),
+                       m_cachedSeparatingAxis.getZ(),
+                       squaredDistance);
+                break;
+            }
+            bool check = (!simplexSolver.fullSimplex());
+            //bool check = (!m_simplexSolver->fullSimplex() && squaredDistance > SIMD_EPSILON * m_simplexSolver->maxVertex());
+            if (!check)
+            {
+                //do we need this backup_closest here ?
+                //				m_simplexSolver->backup_closest(m_cachedSeparatingAxis);
+                m_degenerateSimplex = 13;
+                break;
+            }
+        }
+        if (checkSimplex)
+        {
+            simplexSolver.compute_points(pointOnA, pointOnB);
+            normalInB = m_cachedSeparatingAxis;
+            btScalar lenSqr =m_cachedSeparatingAxis.length2();
+            //valid normal
+            if (lenSqr < 0.0001)
+            {
+                m_degenerateSimplex = 5;
+            }
+            if (lenSqr > SIMD_EPSILON*SIMD_EPSILON)
+            {
+                btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
+                normalInB *= rlen; //normalize
+                btScalar s = btSqrt(squaredDistance);
+                btAssert(s > btScalar(0.0));
+                pointOnA -= m_cachedSeparatingAxis * (marginA / s);
+                pointOnB += m_cachedSeparatingAxis * (marginB / s);
+                distance = ((btScalar(1.)/rlen) - margin);
+                isValid = true;
+                m_lastUsedMethod = 1;
+            } else
+            {
+                m_lastUsedMethod = 2;
+            }
+        }
+        bool catchDegeneratePenetrationCase =
+        (m_catchDegeneracies &&  m_degenerateSimplex && ((distance+margin) < 0.01));
+        //if (checkPenetration && !isValid)
+        if (checkPenetration && (!isValid || catchDegeneratePenetrationCase ))
+        {
+            //penetration case
+            //if there is no way to handle penetrations, bail out
+            // Penetration depth case.
+            btVector3 tmpPointOnA,tmpPointOnB;
+            m_cachedSeparatingAxis.setZero();
+            bool isValid2 = btGjkEpaCalcPenDepth(a,b,
+                                                 colDesc,
+                                                 m_cachedSeparatingAxis, tmpPointOnA, tmpPointOnB);
+            if (isValid2)
+            {
+                btVector3 tmpNormalInB = tmpPointOnB-tmpPointOnA;
+                btScalar lenSqr = tmpNormalInB.length2();
+                if (lenSqr <= (SIMD_EPSILON*SIMD_EPSILON))
+                {
+                    tmpNormalInB = m_cachedSeparatingAxis;
+                    lenSqr = m_cachedSeparatingAxis.length2();
+                }
+                if (lenSqr > (SIMD_EPSILON*SIMD_EPSILON))
+                {
+                    tmpNormalInB /= btSqrt(lenSqr);
+                    btScalar distance2 = -(tmpPointOnA-tmpPointOnB).length();
+                    //only replace valid penetrations when the result is deeper (check)
+                    if (!isValid || (distance2 < distance))
+                    {
+                        distance = distance2;
+                        pointOnA = tmpPointOnA;
+                        pointOnB = tmpPointOnB;
+                        normalInB = tmpNormalInB;
+                        isValid = true;
+                        m_lastUsedMethod = 3;
+                    } else
+                    {
+                        m_lastUsedMethod = 8;
+                    }
+                } else
+                {
+                    m_lastUsedMethod = 9;
+                }
+            } else
+            {
+                ///this is another degenerate case, where the initial GJK calculation reports a degenerate case
+                ///EPA reports no penetration, and the second GJK (using the supporting vector without margin)
+                ///reports a valid positive distance. Use the results of the second GJK instead of failing.
+                ///thanks to Jacob.Langford for the reproduction case
+                ///http://code.google.com/p/bullet/issues/detail?id=250
+                if (m_cachedSeparatingAxis.length2() > btScalar(0.))
+                {
+                    btScalar distance2 = (tmpPointOnA-tmpPointOnB).length()-margin;
+                    //only replace valid distances when the distance is less
+                    if (!isValid || (distance2 < distance))
+                    {
+                        distance = distance2;
+                        pointOnA = tmpPointOnA;
+                        pointOnB = tmpPointOnB;
+                        pointOnA -= m_cachedSeparatingAxis * marginA ;
+                        pointOnB += m_cachedSeparatingAxis * marginB ;
+                        normalInB = m_cachedSeparatingAxis;
+                        normalInB.normalize();
+                        isValid = true;
+                        m_lastUsedMethod = 6;
+                    } else
+                    {
+                        m_lastUsedMethod = 5;
+                    }
+                }
+            }
+        }
+    }
+    if (isValid && ((distance < 0) || (distance*distance < colDesc.m_maximumDistanceSquared)))
+    {
+        m_cachedSeparatingAxis = normalInB;
+        m_cachedSeparatingDistance = distance;
+        distInfo->m_distance = distance;
+        distInfo->m_normalBtoA = normalInB;
+        distInfo->m_pointOnB = pointOnB;
+        distInfo->m_pointOnA = pointOnB+normalInB*distance;
+        return 0;
+    }
+    return -m_lastUsedMethod;
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.cpp b/src/bullet/BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.cpp
index 91fcea57..940282f5 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.cpp
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.cpp
@@ -62,7 +62,6 @@ void btContinuousConvexCollision::computeClosestPoints( const btTransform& trans
 		const btConvexShape* convexShape = m_convexA;
 		const btStaticPlaneShape* planeShape = m_planeShape;
-		bool hasCollision = false;
 		const btVector3& planeNormal = planeShape->getPlaneNormal();
 		const btScalar& planeConstant = planeShape->getPlaneConstant();
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btConvexPenetrationDepthSolver.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btConvexPenetrationDepthSolver.h
index 72eb5aec..29620abf 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btConvexPenetrationDepthSolver.h
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btConvexPenetrationDepthSolver.h
@@ -17,7 +17,6 @@ subject to the following restrictions:
-class btStackAlloc;
 class btVector3;
 #include "btSimplexSolverInterface.h"
 class btConvexShape;
@@ -33,8 +32,7 @@ public:
 		const btConvexShape* convexA,const btConvexShape* convexB,
 					const btTransform& transA,const btTransform& transB,
 				btVector3& v, btVector3& pa, btVector3& pb,
-				class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc
-				) = 0;
+				class btIDebugDraw* debugDraw) = 0;
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h
index f958cc52..46ce1ab7 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h
@@ -19,7 +19,6 @@ subject to the following restrictions:
 #include "LinearMath/btTransform.h"
 #include "LinearMath/btVector3.h"
-class btStackAlloc;
 /// This interface is made to be used by an iterative approach to do TimeOfImpact calculations
 /// This interface allows to query for closest points and penetration depth between two (convex) objects
@@ -43,15 +42,13 @@ struct btDiscreteCollisionDetectorInterface
 	struct ClosestPointInput
-			:m_maximumDistanceSquared(btScalar(BT_LARGE_FLOAT)),
-			m_stackAlloc(0)
+			:m_maximumDistanceSquared(btScalar(BT_LARGE_FLOAT))
 		btTransform m_transformA;
 		btTransform m_transformB;
 		btScalar	m_maximumDistanceSquared;
-		btStackAlloc* m_stackAlloc;
 	virtual ~btDiscreteCollisionDetectorInterface() {};
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkCollisionDescription.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkCollisionDescription.h
new file mode 100644
index 00000000..0b49b0ec
--- /dev/null
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkCollisionDescription.h
@@ -0,0 +1,41 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2014 Erwin Coumans http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "LinearMath/btVector3.h"
+/// Bundle of tuning parameters handed to the GJK/EPA distance query.
+/// The default constructor fills in every field; callers override what they need.
+struct btGjkCollisionDescription
+    // NOTE(review): presumably the initial search direction for GJK - confirm against the caller.
+    btVector3	m_firstDir;
+    // NOTE(review): presumably an upper bound on GJK iterations - confirm against the caller.
+    int			m_maxGjkIterations;
+    // Squared distance limit; the caller only reports a result when distance < 0
+    // or distance*distance is below this value.
+    btScalar	m_maximumDistanceSquared;
+    // NOTE(review): presumably a squared relative error tolerance for GJK termination - confirm.
+    btScalar	m_gjkRelError2;
+    /// Defaults: first direction +Y, 1000 iterations, 1e30 squared distance limit, 1e-6 relative error.
+    btGjkCollisionDescription()
+    :m_firstDir(0,1,0),
+    m_maxGjkIterations(1000),
+    m_maximumDistanceSquared(1e30f),
+    m_gjkRelError2(1.0e-6)
+    {
+    }
+    /// Virtual so that derived descriptions can be destroyed through a base pointer.
+    virtual ~btGjkCollisionDescription()
+    {
+    }
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa2.cpp b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa2.cpp
index f74261d4..eefb974b 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa2.cpp
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa2.cpp
@@ -41,21 +41,38 @@ namespace gjkepa2_impl
 	/* GJK	*/ 
-#define GJK_ACCURARY		((btScalar)0.0001)
-#define GJK_MIN_DISTANCE	((btScalar)0.0001)
-#define GJK_DUPLICATED_EPS	((btScalar)0.0001)
+	#define GJK_ACCURACY		((btScalar)1e-12)
+	#define GJK_MIN_DISTANCE	((btScalar)1e-12)
+	#define GJK_DUPLICATED_EPS	((btScalar)1e-12)
+	#define GJK_ACCURACY		((btScalar)0.0001)
+	#define GJK_MIN_DISTANCE	((btScalar)0.0001)
+	#define GJK_DUPLICATED_EPS	((btScalar)0.0001)
 #define GJK_SIMPLEX2_EPS	((btScalar)0.0)
 #define GJK_SIMPLEX3_EPS	((btScalar)0.0)
 #define GJK_SIMPLEX4_EPS	((btScalar)0.0)
 	/* EPA	*/ 
-#define EPA_MAX_VERTICES	64
+#define EPA_MAX_VERTICES	128
-#define EPA_ACCURACY		((btScalar)0.0001)
-#define EPA_PLANE_EPS		((btScalar)0.00001)
-#define EPA_INSIDE_EPS		((btScalar)0.01)
+	#define EPA_ACCURACY		((btScalar)1e-12)
+	#define EPA_PLANE_EPS		((btScalar)1e-14)
+	#define EPA_INSIDE_EPS		((btScalar)1e-9)
+	#define EPA_ACCURACY		((btScalar)0.0001)
+	#define EPA_PLANE_EPS		((btScalar)0.00001)
+	#define EPA_INSIDE_EPS		((btScalar)0.01)
+#define EPA_FALLBACK            (10*EPA_ACCURACY)
+#define EPA_MAX_FACES           (EPA_MAX_VERTICES*2)
 	// Shorthands
@@ -242,7 +259,7 @@ namespace gjkepa2_impl
 					/* Check for termination				*/ 
 					const btScalar	omega=btDot(m_ray,w)/rl;
-					if(((rl-alpha)-(GJK_ACCURARY*rl))<=0)
+					if(((rl-alpha)-(GJK_ACCURACY*rl))<=0)
 					{/* Return old simplex				*/ 
@@ -511,7 +528,6 @@ namespace gjkepa2_impl
 			btVector3	n;
 			btScalar	d;
-			btScalar	p;
 			sSV*		c[3];
 			sFace*		f[3];
 			sFace*		l[2];
@@ -657,7 +673,7 @@ namespace gjkepa2_impl
-										if(best->p>=outer.p) outer=*best;
+										outer=*best;
 									} else { m_status=eStatus::InvalidHull;break; }
 								} else { m_status=eStatus::AccuraryReached;break; }
 							} else { m_status=eStatus::OutOfVertices;break; }
@@ -696,6 +712,42 @@ namespace gjkepa2_impl
+			/// Tests the origin against edge a->b of `face`.
+			/// Returns false (leaving `dist` untouched) when the origin is not on the
+			/// negative side of the in-plane edge normal. Otherwise writes to `dist` the
+			/// distance from the origin to the closest feature of the segment
+			/// (vertex a, vertex b, or the edge interior) and returns true.
+			bool getedgedist(sFace* face, sSV* a, sSV* b, btScalar& dist)
+			{
+				const btVector3 edge = b->w - a->w;
+				// Edge normal lying in the triangle plane; only its sign is used, so it is not normalized.
+				const btVector3 edgeNormal = btCross(edge, face->n);
+				if(!(btDot(a->w, edgeNormal) < 0))
+				{
+					return false;
+				}
+				// The origin is outside edge a->b: classify against the segment end points.
+				if(btDot(a->w, edge) > 0)
+				{
+					// Closest feature is vertex a.
+					dist = a->w.length();
+				}
+				else if(btDot(b->w, edge) < 0)
+				{
+					// Closest feature is vertex b.
+					dist = b->w.length();
+				}
+				else
+				{
+					// Closest feature is the edge interior: |a x b|^2 / |b - a|^2, clamped at zero before the root.
+					const btScalar edgeLenSq = edge.length2();
+					const btScalar aDotB = btDot(a->w, b->w);
+					dist = btSqrt(btMax((a->w.length2() * b->w.length2() - aDotB * aDotB) / edgeLenSq, (btScalar)0));
+				}
+				return true;
+			}
 			sFace*				newface(sSV* a,sSV* b,sSV* c,bool forced)
@@ -710,41 +762,48 @@ namespace gjkepa2_impl
 					face->n		=	btCross(b->w-a->w,c->w-a->w);
 					const btScalar	l=face->n.length();
 					const bool		v=l>EPA_ACCURACY;
-					face->p		=	btMin(btMin(
-						btDot(a->w,btCross(face->n,a->w-b->w)),
-						btDot(b->w,btCross(face->n,b->w-c->w))),
-						btDot(c->w,btCross(face->n,c->w-a->w)))	/
-						(v?l:1);
-					face->p		=	face->p>=-EPA_INSIDE_EPS?0:face->p;
-						face->d		=	btDot(a->w,face->n)/l;
-						face->n		/=	l;
-						if(forced||(face->d>=-EPA_PLANE_EPS))
+						if(!(getedgedist(face, a, b, face->d) ||
+							 getedgedist(face, b, c, face->d) ||
+							 getedgedist(face, c, a, face->d)))
-							return(face);
-						} else m_status=eStatus::NonConvex;
-					} else m_status=eStatus::Degenerated;
-					remove(m_hull,face);
-					append(m_stock,face);
-					return(0);
+							// Origin projects to the interior of the triangle
+							// Use distance to triangle plane
+							face->d = btDot(a->w, face->n) / l;
+						}
+						face->n /= l;
+						if(forced || (face->d >= -EPA_PLANE_EPS))
+						{
+							return face;
+						}
+						else
+							m_status=eStatus::NonConvex;
+					}
+					else
+						m_status=eStatus::Degenerated;
+					remove(m_hull, face);
+					append(m_stock, face);
+					return 0;
-				m_status=m_stock.root?eStatus::OutOfVertices:eStatus::OutOfFaces;
-				return(0);
+				m_status = m_stock.root ? eStatus::OutOfVertices : eStatus::OutOfFaces;
+				return 0;
 			sFace*				findbest()
 				sFace*		minf=m_hull.root;
 				btScalar	mind=minf->d*minf->d;
-				btScalar	maxp=minf->p;
 				for(sFace* f=minf->l[1];f;f=f->l[1])
 					const btScalar	sqd=f->d*f->d;
-					if((f->p>=maxp)&&(sqd<mind))
+					if(sqd<mind)
-						maxp=f->p;
@@ -973,7 +1032,7 @@ bool	btGjkEpaSolver2::SignedDistance(const btConvexShape*	shape0,
 /* Symbols cleanup		*/ 
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa3.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa3.h
new file mode 100644
index 00000000..ce1f24bc
--- /dev/null
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa3.h
@@ -0,0 +1,1035 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2014 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the
+use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software in a
+product, an acknowledgment in the product documentation would be appreciated
+but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+Initial GJK-EPA collision solver by Nathanael Presson, 2008
+Improvements and refactoring by Erwin Coumans, 2008-2014
+#ifndef BT_GJK_EPA3_H
+#define BT_GJK_EPA3_H
+#include "LinearMath/btTransform.h"
+#include "btGjkCollisionDescription.h"
+struct	btGjkEpaSolver3
+/// Outcome of a GJK/EPA query: a status code plus the geometric result fields.
+struct	sResults
+	{
+	enum eStatus
+		{
+		Separated,		/* Shapes do not penetrate												*/ 
+		Penetrating,	/* Shapes are penetrating												*/ 
+		GJK_Failed,		/* GJK phase failed, no big issue, shapes are probably just 'touching'	*/ 
+		EPA_Failed		/* EPA phase failed, bigger problem, need to save parameters and debug	*/ 
+		}		status;
+	// NOTE(review): presumably one witness point per shape - confirm against the solver that fills them in.
+	btVector3	witnesses[2];
+	// NOTE(review): presumably the separating / penetration normal - confirm.
+	btVector3	normal;
+	// NOTE(review): presumably signed distance (negative when penetrating) - confirm.
+	btScalar	distance;
+	};
+#if defined(DEBUG) || defined (_DEBUG)
+#include <stdio.h> //for debug printf
+#ifdef __SPU__
+#include <spu_printf.h>
+#define printf spu_printf
+#endif //__SPU__
+    // Config
+    /* GJK	*/
+// Relative tolerance of the GJK termination test: (rl-alpha) <= GJK_ACCURARY*rl.
+// The misspelling is kept because GJK::Evaluate references this exact name.
+#define GJK_ACCURARY		((btScalar)0.0001)
+// Ray length below which GJK::Evaluate reports eGjkInside (touching or inside).
+#define GJK_MIN_DISTANCE	((btScalar)0.0001)
+// Threshold on the squared distance between a new support point and the recent ones;
+// below it the point counts as a duplicate and the iteration stops.
+#define GJK_DUPLICATED_EPS	((btScalar)0.0001)
+// Degeneracy thresholds used by the projectorigin overloads:
+// squared edge length (segment), squared normal length (triangle), |determinant| (tetrahedron).
+#define GJK_SIMPLEX2_EPS	((btScalar)0.0)
+#define GJK_SIMPLEX3_EPS	((btScalar)0.0)
+#define GJK_SIMPLEX4_EPS	((btScalar)0.0)
+    /* EPA	*/
+// Capacity of the EPA support-vertex store (EPA::m_sv_store).
+#define EPA_MAX_VERTICES	64
+// NOTE(review): the three tolerances below are not referenced in the part of the
+// file visible here; their meaning should be confirmed where EPA uses them.
+#define EPA_ACCURACY		((btScalar)0.0001)
+#define EPA_PLANE_EPS		((btScalar)0.00001)
+#define EPA_INSIDE_EPS		((btScalar)0.01)
+    // Shorthands
+    typedef unsigned int	U;
+    typedef unsigned char	U1;
+    // MinkowskiDiff
+    /// Support mapping of the Minkowski difference of two convex shapes.
+    /// Holds non-owning pointers to both shapes; the shapes must outlive this object.
+    /// m_toshape1 and m_toshape0 are NOT set by the constructor and must be assigned
+    /// by the caller before Support1()/Support() are used.
+    template <typename btConvexTemplate>
+    struct	MinkowskiDiff
+    {
+        const btConvexTemplate* m_convexAPtr;
+        const btConvexTemplate* m_convexBPtr;
+        // Applied to the query direction before asking shape B for its support point.
+        btMatrix3x3				m_toshape1;
+        // Applied to shape B's support point before it is returned.
+        btTransform				m_toshape0;
+        // Set through EnableMargin(); the Support* methods below do not consult it
+        // and always query the shapes with margin.
+        bool					m_enableMargin;
+        /// Binds the two shapes. m_enableMargin is initialised here so that copying
+        /// the object (GJK::Evaluate assigns it by value) never reads an
+        /// indeterminate bool.
+        MinkowskiDiff(const btConvexTemplate& a, const btConvexTemplate& b)
+        :m_convexAPtr(&a),
+        m_convexBPtr(&b),
+        m_enableMargin(true)
+        {
+        }
+        /// Records the margin flag.
+        void					EnableMargin(bool enable)
+        {
+            m_enableMargin = enable;
+        }
+        /// Support point of shape A (with margin) in direction d.
+        inline btVector3		Support0(const btVector3& d) const
+        {
+            return m_convexAPtr->getLocalSupportWithMargin(d);
+        }
+        /// Support point of shape B (with margin): d is mapped by m_toshape1,
+        /// and the resulting point is mapped back by m_toshape0.
+        inline btVector3		Support1(const btVector3& d) const
+        {
+            return m_toshape0*m_convexBPtr->getLocalSupportWithMargin(m_toshape1*d);
+        }
+        /// Support point of the difference: Support0(d) - Support1(-d).
+        inline btVector3		Support(const btVector3& d) const
+        {
+            return(Support0(d)-Support1(-d));
+        }
+        /// Support point of a single shape: index 0 selects A, any non-zero index selects B.
+        btVector3				Support(const btVector3& d,U index) const
+        {
+            if(index)
+                return(Support1(d));
+            else
+                return(Support0(d));
+        }
+    };
+/// Status reported by GJK::Evaluate().
+enum	eGjkStatus
+    eGjkValid,   // set at the start of Evaluate(); kept while the iteration can continue
+    eGjkInside,  // ray shorter than GJK_MIN_DISTANCE, or all four simplex vertices retained
+    eGjkFailed   // initial state from Initialize(), or the iteration limit was reached
+    // GJK
+    template <typename btConvexTemplate>
+    struct	GJK
+    {
+        /* Types		*/
+        // Support vertex: d is the normalized query direction, w the support point returned for it.
+        struct	sSV
+        {
+            btVector3	d,w;
+        };
+        // Simplex of up to four support vertices with one weight per vertex.
+        struct	sSimplex
+        {
+            sSV*		c[4];   // vertices in use (entries [0, rank))
+            btScalar	p[4];   // weight of each vertex
+            U			rank;   // number of vertices in use
+        };
+        /* Fields		*/
+        MinkowskiDiff<btConvexTemplate>			m_shape;   // copy of the shape pair being evaluated
+        btVector3		m_ray;          // current weighted sum of the simplex vertices
+        btScalar		m_distance;     // |m_ray| when the status is eGjkValid, 0 when eGjkInside
+        sSimplex		m_simplices[2]; // two simplices used alternately (current / next)
+        sSV				m_store[4];     // backing storage for the support vertices
+        sSV*			m_free[4];      // stack of unused entries of m_store
+        U				m_nfree;        // number of entries on the m_free stack
+        U				m_current;      // index of the current simplex in m_simplices
+        sSimplex*		m_simplex;      // set by Evaluate() to the final simplex
+        eGjkStatus      m_status;
+        /* Methods		*/
+        /// Binds the solver to shapes a and b and resets its state via Initialize().
+        GJK(const btConvexTemplate& a, const btConvexTemplate& b)
+        :m_shape(a,b)
+        {
+            Initialize();
+        }
+        /// Resets the solver: failed status, zero ray and distance,
+        /// empty free list, simplex index 0.
+        void				Initialize()
+        {
+            m_status	=	eGjkFailed;
+            m_distance	=	0;
+            m_current	=	0;
+            m_nfree		=	0;
+            m_ray		=	btVector3(0,0,0);
+        }
+        /// Runs the GJK iteration on `shapearg`, starting from the support point in
+        /// direction -guess (or +X when guess is the zero vector).
+        /// Returns the final status, which is also stored in m_status. On return
+        /// m_simplex points at the last simplex; m_distance is |m_ray| for eGjkValid
+        /// and 0 for eGjkInside.
+        eGjkStatus			Evaluate(const MinkowskiDiff<btConvexTemplate>& shapearg,const btVector3& guess)
+        {
+            U			iterations=0;
+            btScalar	sqdist=0;
+            // Running maximum of omega, used by the termination test below.
+            btScalar	alpha=0;
+            // Ring buffer of the most recent support points, for duplicate detection.
+            btVector3	lastw[4];
+            U			clastw=0;
+            /* Initialize solver		*/
+            // Put all four storage slots on the free stack.
+            m_free[0]			=	&m_store[0];
+            m_free[1]			=	&m_store[1];
+            m_free[2]			=	&m_store[2];
+            m_free[3]			=	&m_store[3];
+            m_nfree				=	4;
+            m_current			=	0;
+            m_status			=	eGjkValid;
+            m_shape				=	shapearg;
+            m_distance			=	0;
+            /* Initialize simplex		*/
+            m_simplices[0].rank	=	0;
+            m_ray				=	guess;
+            const btScalar	sqrl=	m_ray.length2();
+            // A zero guess cannot be normalized, so fall back to the +X axis.
+            appendvertice(m_simplices[0],sqrl>0?-m_ray:btVector3(1,0,0));
+            m_simplices[0].p[0]	=	1;
+            m_ray				=	m_simplices[0].c[0]->w;
+            sqdist				=	sqrl;
+            lastw[0]			=
+            lastw[1]			=
+            lastw[2]			=
+            lastw[3]			=	m_ray;
+            /* Loop						*/
+            do	{
+                // The two simplices alternate: cs is read and extended, ns receives the reduced result.
+                const U		next=1-m_current;
+                sSimplex&	cs=m_simplices[m_current];
+                sSimplex&	ns=m_simplices[next];
+                /* Check zero							*/
+                const btScalar	rl=m_ray.length();
+                if(rl<GJK_MIN_DISTANCE)
+                {/* Touching or inside				*/
+                    m_status=eGjkInside;
+                    break;
+                }
+                /* Append new vertice in -'v' direction	*/
+                appendvertice(cs,-m_ray);
+                const btVector3&	w=cs.c[cs.rank-1]->w;
+                // Stop when the new support point (almost) repeats one of the last four.
+                bool				found=false;
+                for(U i=0;i<4;++i)
+                {
+                    if((w-lastw[i]).length2()<GJK_DUPLICATED_EPS)
+                    { found=true;break; }
+                }
+                if(found)
+                {/* Return old simplex				*/
+                    removevertice(m_simplices[m_current]);
+                    break;
+                }
+                else
+                {/* Update lastw					*/
+                    lastw[clastw=(clastw+1)&3]=w;
+                }
+                /* Check for termination				*/
+                // omega is the component of w along the current ray direction.
+                const btScalar	omega=btDot(m_ray,w)/rl;
+                alpha=btMax(omega,alpha);
+                if(((rl-alpha)-(GJK_ACCURARY*rl))<=0)
+                {/* Return old simplex				*/
+                    removevertice(m_simplices[m_current]);
+                    break;
+                }
+                /* Reduce simplex						*/
+                // Project the origin onto the simplex; `mask` flags the vertices to keep
+                // and `weights` holds their weights. There is no case for other ranks,
+                // in which case sqdist keeps its previous value.
+                btScalar	weights[4];
+                U			mask=0;
+                switch(cs.rank)
+                {
+                    case	2:	sqdist=projectorigin(	cs.c[0]->w,
+                                                     cs.c[1]->w,
+                                                     weights,mask);break;
+                    case	3:	sqdist=projectorigin(	cs.c[0]->w,
+                                                     cs.c[1]->w,
+                                                     cs.c[2]->w,
+                                                     weights,mask);break;
+                    case	4:	sqdist=projectorigin(	cs.c[0]->w,
+                                                     cs.c[1]->w,
+                                                     cs.c[2]->w,
+                                                     cs.c[3]->w,
+                                                     weights,mask);break;
+                }
+                if(sqdist>=0)
+                {/* Valid	*/
+                    // Copy the kept vertices into the next simplex, rebuild the ray as
+                    // their weighted sum, and return the dropped vertices to the free stack.
+                    ns.rank		=	0;
+                    m_ray		=	btVector3(0,0,0);
+                    m_current	=	next;
+                    for(U i=0,ni=cs.rank;i<ni;++i)
+                    {
+                        if(mask&(1<<i))
+                        {
+                            ns.c[ns.rank]		=	cs.c[i];
+                            ns.p[ns.rank++]		=	weights[i];
+                            m_ray				+=	cs.c[i]->w*weights[i];
+                        }
+                        else
+                        {
+                            m_free[m_nfree++]	=	cs.c[i];
+                        }
+                    }
+                    // All four vertices kept: the tetrahedron projection reported the origin inside.
+                    if(mask==15) m_status=eGjkInside;
+                }
+                else
+                {/* Return old simplex				*/
+                    removevertice(m_simplices[m_current]);
+                    break;
+                }
+                // Give up once the iteration limit is reached (GJK_MAX_ITERATIONS is defined outside this excerpt).
+                m_status=((++iterations)<GJK_MAX_ITERATIONS)?m_status:eGjkFailed;
+            } while(m_status==eGjkValid);
+            m_simplex=&m_simplices[m_current];
+            switch(m_status)
+            {
+                case	eGjkValid:		m_distance=m_ray.length();break;
+                case	eGjkInside:	m_distance=0;break;
+                default:
+                {
+                    // eGjkFailed: m_distance is left at the value set during initialisation.
+                }
+            }
+            return(m_status);
+        }
+        /// Recursively extends m_simplex with extra support points until it is a
+        /// tetrahedron with non-zero determinant. Returns true on success; on
+        /// failure every vertex added by this call has been removed again.
+        /// Note that the rank-4 test only checks for non-zero volume.
+        bool					EncloseOrigin()
+        {
+            switch(m_simplex->rank)
+            {
+                case	1:
+                {
+                    // Single point: try each coordinate axis in both directions.
+                    for(U i=0;i<3;++i)
+                    {
+                        btVector3		axis=btVector3(0,0,0);
+                        axis[i]=1;
+                        appendvertice(*m_simplex, axis);
+                        if(EncloseOrigin())	return(true);
+                        removevertice(*m_simplex);
+                        appendvertice(*m_simplex,-axis);
+                        if(EncloseOrigin())	return(true);
+                        removevertice(*m_simplex);
+                    }
+                }
+                    break;
+                case	2:
+                {
+                    // Segment: try directions perpendicular to it, built from each coordinate axis.
+                    const btVector3	d=m_simplex->c[1]->w-m_simplex->c[0]->w;
+                    for(U i=0;i<3;++i)
+                    {
+                        btVector3		axis=btVector3(0,0,0);
+                        axis[i]=1;
+                        const btVector3	p=btCross(d,axis);
+                        // Skip axes parallel to the segment (zero cross product cannot be normalized).
+                        if(p.length2()>0)
+                        {
+                            appendvertice(*m_simplex, p);
+                            if(EncloseOrigin())	return(true);
+                            removevertice(*m_simplex);
+                            appendvertice(*m_simplex,-p);
+                            if(EncloseOrigin())	return(true);
+                            removevertice(*m_simplex);
+                        }
+                    }
+                }
+                    break;
+                case	3:
+                {
+                    // Triangle: try its normal in both directions.
+                    const btVector3	n=btCross(m_simplex->c[1]->w-m_simplex->c[0]->w,
+                                              m_simplex->c[2]->w-m_simplex->c[0]->w);
+                    if(n.length2()>0)
+                    {
+                        appendvertice(*m_simplex,n);
+                        if(EncloseOrigin())	return(true);
+                        removevertice(*m_simplex);
+                        appendvertice(*m_simplex,-n);
+                        if(EncloseOrigin())	return(true);
+                        removevertice(*m_simplex);
+                    }
+                }
+                    break;
+                case	4:
+                {
+                    // Tetrahedron: accept it when the edge vectors from vertex 3 are linearly independent.
+                    if(btFabs(det(	m_simplex->c[0]->w-m_simplex->c[3]->w,
+                                  m_simplex->c[1]->w-m_simplex->c[3]->w,
+                                  m_simplex->c[2]->w-m_simplex->c[3]->w))>0)
+                        return(true);
+                }
+                    break;
+            }
+            return(false);
+        }
+        /* Internals	*/
+        /// Fills `sv` with the normalized direction d/|d| and the support point of
+        /// m_shape in that direction. d must be non-zero (its length is divided by).
+        void				getsupport(const btVector3& d,sSV& sv) const
+        {
+            const btScalar	len=d.length();
+            sv.d	=	d/len;
+            sv.w	=	m_shape.Support(sv.d);
+        }
+        /// Drops the last vertex of `simplex` and pushes it back on the free stack.
+        void				removevertice(sSimplex& simplex)
+        {
+            --simplex.rank;
+            m_free[m_nfree]=simplex.c[simplex.rank];
+            ++m_nfree;
+        }
+        /// Pops a vertex from the free stack, appends it to `simplex` with weight 0,
+        /// and fills it with the support point in direction v.
+        void				appendvertice(sSimplex& simplex,const btVector3& v)
+        {
+            const U		slot=simplex.rank;
+            --m_nfree;
+            sSV* const	vertex=m_free[m_nfree];
+            simplex.p[slot]=0;
+            simplex.c[slot]=vertex;
+            ++simplex.rank;
+            getsupport(v,*vertex);
+        }
+        /// Determinant of the 3x3 matrix whose rows are a, b and c
+        /// (equivalently the scalar triple product a . (b x c)).
+        static btScalar		det(const btVector3& a,const btVector3& b,const btVector3& c)
+        {
+            return(	a.y()*b.z()*c.x()+a.z()*b.x()*c.y()-
+                   a.x()*b.z()*c.y()-a.y()*b.x()*c.z()+
+                   a.x()*b.y()*c.z()-a.z()*b.y()*c.x());
+        }
+        /// Projects the origin onto segment a-b.
+        /// On success writes the two weights to w[0..1], sets m to the bitmask of
+        /// the vertices that carry weight (1 = a, 2 = b, 3 = both) and returns the
+        /// squared distance to the closest point. Returns -1 when the squared
+        /// segment length does not exceed GJK_SIMPLEX2_EPS.
+        static btScalar		projectorigin(	const btVector3& a,
+                                          const btVector3& b,
+                                          btScalar* w,U& m)
+        {
+            const btVector3	edge=b-a;
+            const btScalar	lenSq=edge.length2();
+            if(!(lenSq>GJK_SIMPLEX2_EPS))
+            {
+                return(-1);
+            }
+            // Parameter of the closest point along a + t*edge.
+            const btScalar	t(lenSq>0?-btDot(a,edge)/lenSq:0);
+            if(t>=1)
+            {
+                // Closest point is b.
+                w[0]=0;
+                w[1]=1;
+                m=2;
+                return(b.length2());
+            }
+            if(t<=0)
+            {
+                // Closest point is a.
+                w[0]=1;
+                w[1]=0;
+                m=1;
+                return(a.length2());
+            }
+            // Closest point lies strictly inside the segment.
+            w[1]=t;
+            w[0]=1-t;
+            m=3;
+            return((a+edge*t).length2());
+        }
+        /// Projects the origin onto triangle a-b-c.
+        /// On success writes the weights to w[0..2], sets m to the bitmask of the
+        /// vertices that carry weight and returns the squared distance to the
+        /// closest point. Returns -1 when the squared length of the triangle normal
+        /// does not exceed GJK_SIMPLEX3_EPS.
+        static btScalar		projectorigin(	const btVector3& a,
+                                          const btVector3& b,
+                                          const btVector3& c,
+                                          btScalar* w,U& m)
+        {
+            // imd3[i] is the index following i, cyclically (0->1->2->0).
+            static const U		imd3[]={1,2,0};
+            const btVector3*	vt[]={&a,&b,&c};
+            const btVector3		dl[]={a-b,b-c,c-a};
+            const btVector3		n=btCross(dl[0],dl[1]);
+            const btScalar		l=n.length2();
+            if(l>GJK_SIMPLEX3_EPS)
+            {
+                // mindist stays negative until some edge yields a candidate.
+                btScalar	mindist=-1;
+                btScalar	subw[2]={0.f,0.f};
+                U			subm(0);
+                for(U i=0;i<3;++i)
+                {
+                    // Sign test of the origin against edge i; when positive, the closest
+                    // point is searched on that edge with the segment overload.
+                    if(btDot(*vt[i],btCross(dl[i],n))>0)
+                    {
+                        const U			j=imd3[i];
+                        const btScalar	subd(projectorigin(*vt[i],*vt[j],subw,subm));
+                        if((mindist<0)||(subd<mindist))
+                        {
+                            mindist		=	subd;
+                            // Translate the segment's local mask bits to triangle vertex bits.
+                            m			=	static_cast<U>(((subm&1)?1<<i:0)+((subm&2)?1<<j:0));
+                            w[i]		=	subw[0];
+                            w[j]		=	subw[1];
+                            w[imd3[j]]	=	0;
+                        }
+                    }
+                }
+                if(mindist<0)
+                {
+                    // No edge test fired: use the projection p of the origin onto the
+                    // triangle plane, with weights from cross-product lengths relative to |n|.
+                    const btScalar	d=btDot(a,n);
+                    const btScalar	s=btSqrt(l);
+                    const btVector3	p=n*(d/l);
+                    mindist	=	p.length2();
+                    m		=	7;
+                    w[0]	=	(btCross(dl[1],b-p)).length()/s;
+                    w[1]	=	(btCross(dl[2],c-p)).length()/s;
+                    w[2]	=	1-(w[0]+w[1]);
+                }
+                return(mindist);
+            }
+            return(-1);
+        }
+        /// Projects the origin onto tetrahedron a-b-c-d.
+        /// On success writes the weights to w[0..3], sets m to the bitmask of the
+        /// vertices that carry weight (15 when the origin is reported inside) and
+        /// returns the squared distance (0 when inside). Returns -1 when the
+        /// orientation test `ng` fails or |determinant| does not exceed GJK_SIMPLEX4_EPS.
+        static btScalar		projectorigin(	const btVector3& a,
+                                          const btVector3& b,
+                                          const btVector3& c,
+                                          const btVector3& d,
+                                          btScalar* w,U& m)
+        {
+            // imd3[i] is the index following i, cyclically (0->1->2->0).
+            static const U		imd3[]={1,2,0};
+            const btVector3*	vt[]={&a,&b,&c,&d};
+            const btVector3		dl[]={a-d,b-d,c-d};
+            // Determinant of the three edge vectors that meet at d.
+            const btScalar		vl=det(dl[0],dl[1],dl[2]);
+            const bool			ng=(vl*btDot(a,btCross(b-c,a-b)))<=0;
+            if(ng&&(btFabs(vl)>GJK_SIMPLEX4_EPS))
+            {
+                // mindist stays negative until some face yields a candidate.
+                btScalar	mindist=-1;
+                btScalar	subw[3]={0.f,0.f,0.f};
+                U			subm(0);
+                for(U i=0;i<3;++i)
+                {
+                    // Sign test of the origin against the face (vt[i], vt[j], d); when
+                    // positive, the closest point is searched on that face with the triangle overload.
+                    const U			j=imd3[i];
+                    const btScalar	s=vl*btDot(d,btCross(dl[i],dl[j]));
+                    if(s>0)
+                    {
+                        const btScalar	subd=projectorigin(*vt[i],*vt[j],d,subw,subm);
+                        if((mindist<0)||(subd<mindist))
+                        {
+                            mindist		=	subd;
+                            // Translate the triangle's local mask bits to tetrahedron vertex bits (d is bit 8).
+                            m			=	static_cast<U>((subm&1?1<<i:0)+
+                                                           (subm&2?1<<j:0)+
+                                                           (subm&4?8:0));
+                            w[i]		=	subw[0];
+                            w[j]		=	subw[1];
+                            w[imd3[j]]	=	0;
+                            w[3]		=	subw[2];
+                        }
+                    }
+                }
+                if(mindist<0)
+                {
+                    // No face test fired: report the origin as inside, with weights
+                    // computed as determinant ratios.
+                    mindist	=	0;
+                    m		=	15;
+                    w[0]	=	det(c,b,d)/vl;
+                    w[1]	=	det(a,c,d)/vl;
+                    w[2]	=	det(b,a,d)/vl;
+                    w[3]	=	1-(w[0]+w[1]+w[2]);
+                }
+                return(mindist);
+            }
+            return(-1);
+        }
+    };
+/// Status codes of the EPA solver (stored in EPA::m_status).
+/// NOTE(review): only the use of eEpaFailed is visible in this part of the file;
+/// the meaning of the other values should be confirmed where EPA sets them.
+enum	eEpaStatus
+    eEpaValid,
+    eEpaTouching,
+    eEpaDegenerated,
+    eEpaNonConvex,
+    eEpaInvalidHull,
+    eEpaOutOfFaces,
+    eEpaOutOfVertices,
+    eEpaAccuraryReached, // spelling (sic) kept: the identifier is part of the interface
+    eEpaFallBack,
+    eEpaFailed           // value assigned by EPA::Initialize()
+    // EPA
+template <typename btConvexTemplate>
+    struct	EPA
+    {
+        /* Types		*/
+        struct	sFace // One triangle of the expanding hull; nodes live in m_fc_store and are linked into m_hull or m_stock
+        {
+            btVector3	n; // face normal; newface divides the edge cross product by its length
+            btScalar	d; // distance value computed in newface and used by findbest to rank faces
+            typename GJK<btConvexTemplate>::sSV*		c[3]; // the three support vertices of the triangle
+            sFace*		f[3]; // f[i]: face bound across edge i (written by bind)
+            sFace*		l[2]; // list links: l[0] previous, l[1] next (written by append/remove)
+            U1			e[3]; // e[i]: index of the shared edge inside f[i] (written by bind)
+            U1			pass; // pass stamp, compared against the current pass in expand
+        };
+        struct	sList // Doubly linked list of faces, linked through sFace::l
+        {
+            sFace*		root; // head of the list; 0 when the list is empty
+            U			count; // number of faces currently linked in
+            sList() : root(0),count(0)	{} // a new list starts empty
+        };
+        struct	sHorizon // Accumulator filled by expand() as it creates the faces around the horizon
+        {
+            sFace*		cf; // most recently created horizon face
+            sFace*		ff; // first horizon face created
+            U			nf; // number of horizon faces created so far
+            sHorizon() : cf(0),ff(0),nf(0)	{} // starts with no faces
+        };
+        /* Fields		*/
+        eEpaStatus		m_status; // latest status; written by Initialize, Evaluate and (on failure) newface
+        typename GJK<btConvexTemplate>::sSimplex	m_result; // resulting feature: vertices c[] with weights p[] and its rank
+        btVector3		m_normal; // normal of the selected face, or the normalised -guess in the fallback path
+        btScalar		m_depth; // distance of the selected face; 0 in the fallback path
+        typename GJK<btConvexTemplate>::sSV				m_sv_store[EPA_MAX_VERTICES]; // storage for support vertices created during expansion
+        sFace			m_fc_store[EPA_MAX_FACES]; // storage for every face node
+        U				m_nextsv; // index of the next unused slot in m_sv_store
+        sList			m_hull; // faces currently forming the hull
+        sList			m_stock; // free faces available to newface
+        /* Methods		*/
+        EPA()
+        {
+            // All state is set up by Initialize(); the constructor only delegates.
+            this->Initialize();
+        }
+        static inline void		bind(sFace* fa,U ea,sFace* fb,U eb)
+        {
+            // Make edge `ea` of `fa` and edge `eb` of `fb` refer to each other.
+            fa->e[ea] = static_cast<U1>(eb);
+            fa->f[ea] = fb;
+            fb->e[eb] = static_cast<U1>(ea);
+            fb->f[eb] = fa;
+        }
+        static inline void		append(sList& list,sFace* face)
+        {
+            // Push `face` at the head of `list`.
+            sFace* const	oldHead = list.root;
+            face->l[0] = 0;
+            face->l[1] = oldHead;
+            if(oldHead != 0)
+            {
+                oldHead->l[0] = face;
+            }
+            list.root = face;
+            list.count += 1;
+        }
+        static inline void		remove(sList& list,sFace* face)
+        {
+            // Unlink `face` from `list`, patching both neighbours and the head.
+            sFace* const	before = face->l[0];
+            sFace* const	after  = face->l[1];
+            if(after != 0)  { after->l[0]  = before; }
+            if(before != 0) { before->l[1] = after; }
+            if(list.root == face) { list.root = after; }
+            list.count -= 1;
+        }
+        void				Initialize()
+        {
+            // Reset the result to "failed" with a zero normal and depth.
+            m_nextsv	=	0;
+            m_depth		=	0;
+            m_normal	=	btVector3(0,0,0);
+            m_status	=	eEpaFailed;
+            // Push every face node onto the free list, highest index first,
+            // so that m_fc_store[0] ends up at the head of m_stock.
+            for(U remaining=EPA_MAX_FACES;remaining>0;--remaining)
+            {
+                append(m_stock,&m_fc_store[remaining-1]);
+            }
+        }
+        eEpaStatus			Evaluate(GJK<btConvexTemplate>& gjk,const btVector3& guess) // Expands gjk's simplex into a hull; fills m_result, m_normal, m_depth and returns m_status
+        {
+            typename GJK<btConvexTemplate>::sSimplex&	simplex=*gjk.m_simplex;
+            if((simplex.rank>1)&&gjk.EncloseOrigin()) // EncloseOrigin is defined outside this section; c[0..3] are read below, so it presumably leaves a 4-vertex simplex
+            {
+                /* Clean up				*/
+                while(m_hull.root) // return every face of a previous run to the free stock
+                {
+                    sFace*	f = m_hull.root;
+                    remove(m_hull,f);
+                    append(m_stock,f);
+                }
+                m_status	=	eEpaValid;
+                m_nextsv	=	0;
+                /* Orient simplex		*/
+                if(gjk.det(	simplex.c[0]->w-simplex.c[3]->w,
+                           simplex.c[1]->w-simplex.c[3]->w,
+                           simplex.c[2]->w-simplex.c[3]->w)<0) // negative determinant: swap two vertices (and their weights) to flip the orientation
+                {
+                    btSwap(simplex.c[0],simplex.c[1]);
+                    btSwap(simplex.p[0],simplex.p[1]);
+                }
+                /* Build initial hull	*/
+                sFace*	tetra[]={newface(simplex.c[0],simplex.c[1],simplex.c[2],true),
+                    newface(simplex.c[1],simplex.c[0],simplex.c[3],true),
+                    newface(simplex.c[2],simplex.c[1],simplex.c[3],true),
+                    newface(simplex.c[0],simplex.c[2],simplex.c[3],true)};
+                if(m_hull.count==4) // newface un-links a face it rejects, so 4 means all four faces were created
+                {
+                    sFace*		best=findbest();
+                    sFace		outer=*best; // by-value snapshot of the best face; survives the node being recycled
+                    U			pass=0;
+                    U			iterations=0;
+                    bind(tetra[0],0,tetra[1],0); // connect the six shared edges of the tetrahedron
+                    bind(tetra[0],1,tetra[2],0);
+                    bind(tetra[0],2,tetra[3],0);
+                    bind(tetra[1],1,tetra[3],2);
+                    bind(tetra[1],2,tetra[2],1);
+                    bind(tetra[2],2,tetra[3],1);
+                    m_status=eEpaValid;
+                    for(;iterations<EPA_MAX_ITERATIONS;++iterations)
+                    {
+                        if(m_nextsv<EPA_MAX_VERTICES)
+                        {
+                            sHorizon		horizon;
+                            typename GJK<btConvexTemplate>::sSV*			w=&m_sv_store[m_nextsv++];
+                            bool			valid=true;
+                            best->pass	=	(U1)(++pass); // stamp best so expand() does not re-enter it during this pass
+                            gjk.getsupport(best->n,*w); // new support vertex along best's normal
+                            const btScalar	wdist=btDot(best->n,w->w)-best->d; // how far w lies beyond best along its normal
+                            if(wdist>EPA_ACCURACY)
+                            {
+                                for(U j=0;(j<3)&&valid;++j) // expand through each of best's three neighbours
+                                {
+                                    valid&=expand(	pass,w,
+                                                  best->f[j],best->e[j],
+                                                  horizon);
+                                }
+                                if(valid&&(horizon.nf>=3))
+                                {
+                                    bind(horizon.cf,1,horizon.ff,2); // close the ring: last new face to first new face
+                                    remove(m_hull,best);
+                                    append(m_stock,best);
+                                    best=findbest();
+                                    outer=*best;
+                                } else { m_status=eEpaInvalidHull;break; }
+                            } else { m_status=eEpaAccuraryReached;break; }
+                        } else { m_status=eEpaOutOfVertices;break; }
+                    }
+                    const btVector3	projection=outer.n*outer.d; // point at distance d from the origin along the face normal
+                    m_normal	=	outer.n;
+                    m_depth		=	outer.d;
+                    m_result.rank	=	3;
+                    m_result.c[0]	=	outer.c[0];
+                    m_result.c[1]	=	outer.c[1];
+                    m_result.c[2]	=	outer.c[2];
+                    m_result.p[0]	=	btCross(	outer.c[1]->w-projection,
+                                                outer.c[2]->w-projection).length(); // weights from cross-product lengths of the sub-triangles around projection
+                    m_result.p[1]	=	btCross(	outer.c[2]->w-projection,
+                                                outer.c[0]->w-projection).length();
+                    m_result.p[2]	=	btCross(	outer.c[0]->w-projection,
+                                                outer.c[1]->w-projection).length();
+                    const btScalar	sum=m_result.p[0]+m_result.p[1]+m_result.p[2]; // NOTE(review): not checked against zero before the divisions below
+                    m_result.p[0]	/=	sum;
+                    m_result.p[1]	/=	sum;
+                    m_result.p[2]	/=	sum;
+                    return(m_status);
+                }
+            }
+            /* Fallback		*/
+            m_status	=	eEpaFallBack;
+            m_normal	=	-guess;
+            const btScalar	nl=m_normal.length();
+            if(nl>0)
+                m_normal	=	m_normal/nl;
+            else
+                m_normal	=	btVector3(1,0,0); // zero-length guess: use the +X axis
+            m_depth	=	0;
+            m_result.rank=1; // single-vertex result with full weight
+            m_result.c[0]=simplex.c[0];
+            m_result.p[0]=1;
+            return(m_status);
+        }
+        bool getedgedist(sFace* face, typename GJK<btConvexTemplate>::sSV* a, typename GJK<btConvexTemplate>::sSV* b, btScalar& dist) // Returns true and writes dist when the origin is on the outer side of edge a->b; otherwise returns false and leaves dist untouched
+        {
+            const btVector3 ba = b->w - a->w;
+            const btVector3 n_ab = btCross(ba, face->n); // Outward facing edge normal direction, on triangle plane
+            const btScalar a_dot_nab = btDot(a->w, n_ab); // Only care about the sign to determine inside/outside, so no normalization is required
+            if(a_dot_nab < 0)
+            {
+                // Outside of edge a->b
+                const btScalar ba_l2 = ba.length2();
+                const btScalar a_dot_ba = btDot(a->w, ba);
+                const btScalar b_dot_ba = btDot(b->w, ba);
+                if(a_dot_ba > 0)
+                {
+                    // Pick distance vertex a
+                    dist = a->w.length();
+                }
+                else if(b_dot_ba < 0)
+                {
+                    // Pick distance vertex b
+                    dist = b->w.length();
+                }
+                else
+                {
+                    // Pick distance to edge a->b
+                    const btScalar a_dot_b = btDot(a->w, b->w);
+                    dist = btSqrt(btMax((a->w.length2() * b->w.length2() - a_dot_b * a_dot_b) / ba_l2, (btScalar)0)); // |a x b| / |b - a|, clamped at 0 before the square root
+                }
+                return true;
+            }
+            return false;
+        }
+        // Takes a face node from m_stock, links it into m_hull and initialises it
+        // as triangle (a,b,c).
+        //   forced : when true the face is kept even if its distance is below
+        //            -EPA_PLANE_EPS.
+        // Returns the new face, or 0 after setting m_status to:
+        //   eEpaDegenerated - the triangle normal is not longer than EPA_ACCURACY,
+        //   eEpaNonConvex   - a non-forced face has a distance below -EPA_PLANE_EPS,
+        //   eEpaOutOfFaces  - no free face is left in m_stock.
+        // A rejected face is moved back from m_hull to m_stock before returning.
+        sFace*				newface(typename GJK<btConvexTemplate>::sSV* a,typename GJK<btConvexTemplate>::sSV* b,typename GJK<btConvexTemplate>::sSV* c,bool forced)
+        {
+            if(m_stock.root)
+            {
+                sFace*	face=m_stock.root;
+                remove(m_stock,face);
+                append(m_hull,face);
+                face->pass	=	0;
+                face->c[0]	=	a;
+                face->c[1]	=	b;
+                face->c[2]	=	c;
+                face->n		=	btCross(b->w-a->w,c->w-a->w);
+                const btScalar	l=face->n.length();
+                const bool		v=l>EPA_ACCURACY;
+                if(v)
+                {
+                    // The first edge that has the origin on its outer side
+                    // supplies the distance; face->n is still unnormalised here.
+                    if(!(getedgedist(face, a, b, face->d) ||
+                         getedgedist(face, b, c, face->d) ||
+                         getedgedist(face, c, a, face->d)))
+                    {
+                        // Origin projects to the interior of the triangle
+                        // Use distance to triangle plane
+                        face->d = btDot(a->w, face->n) / l;
+                    }
+                    face->n /= l;
+                    if(forced || (face->d >= -EPA_PLANE_EPS))
+                    {
+                        return face;
+                    }
+                    else
+                        m_status=eEpaNonConvex;
+                }
+                else
+                    m_status=eEpaDegenerated;
+                remove(m_hull, face);
+                append(m_stock, face);
+                return 0;
+            }
+            // Every path through the branch above returns, so m_stock.root is
+            // null here: the only possible cause is an exhausted face stock.
+            m_status = eEpaOutOfFaces;
+            return 0;
+        }
+        sFace*				findbest()
+        {
+            // Linear scan of the hull list for the face with the smallest d*d.
+            // The hull is expected to be non-empty: the head is dereferenced unchecked.
+            sFace*		closest=m_hull.root;
+            btScalar	closestSq=closest->d*closest->d;
+            sFace*		cursor=closest->l[1];
+            while(cursor!=0)
+            {
+                const btScalar	cursorSq=cursor->d*cursor->d;
+                if(cursorSq<closestSq)
+                {
+                    closest=cursor;
+                    closestSq=cursorSq;
+                }
+                cursor=cursor->l[1];
+            }
+            return closest;
+        }
+        bool				expand(U pass,typename GJK<btConvexTemplate>::sSV* w,sFace* f,U e,sHorizon& horizon) // Recursive expansion step toward new vertex w, entering face f through its edge e; false on failure or when f already carries this pass stamp
+        {
+            static const U	i1m3[]={1,2,0}; // (i+1) mod 3
+            static const U	i2m3[]={2,0,1}; // (i+2) mod 3
+            if(f->pass!=pass)
+            {
+                const U	e1=i1m3[e];
+                if((btDot(f->n,w->w)-f->d)<-EPA_PLANE_EPS) // w is more than EPA_PLANE_EPS below f: f is kept and edge e lies on the horizon
+                {
+                    sFace*	nf=newface(f->c[e1],f->c[e],w,false); // triangle spanning edge e and w; non-forced, so newface may reject it
+                    if(nf)
+                    {
+                        bind(nf,0,f,e);
+                        if(horizon.cf) bind(horizon.cf,1,nf,2); else horizon.ff=nf; // chain to the previous new face, or remember the first one
+                        horizon.cf=nf;
+                        ++horizon.nf;
+                        return(true);
+                    }
+                }
+                else // otherwise: stamp f, continue through its two other edges, and retire f if both succeed
+                {
+                    const U	e2=i2m3[e];
+                    f->pass		=	(U1)pass;
+                    if(	expand(pass,w,f->f[e1],f->e[e1],horizon)&&
+                       expand(pass,w,f->f[e2],f->e[e2],horizon))
+                    {
+                        remove(m_hull,f);
+                        append(m_stock,f);
+                        return(true);
+                    }
+                }
+            }
+            return(false);
+        }
+    };
+    template <typename btConvexTemplate>
+    static void	Initialize(	const btConvexTemplate& a, const btConvexTemplate& b,
+                           btGjkEpaSolver3::sResults& results,
+                           MinkowskiDiff<btConvexTemplate>& shape)
+    {
+        // Results: both witness points at the origin, status starts as Separated.
+        const btVector3	origin(0,0,0);
+        results.witnesses[1]	=	origin;
+        results.witnesses[0]	=	origin;
+        results.status			=	btGjkEpaSolver3::sResults::Separated;
+        // Shape: relative rotation and relative transform built from the two world transforms.
+        shape.m_toshape1		=	b.getWorldTransform().getBasis().transposeTimes(a.getWorldTransform().getBasis());
+        shape.m_toshape0		=	a.getWorldTransform().inverseTimes(b.getWorldTransform());
+    }
+// Api: btGjkEpaSolver3_Distance runs GJK on (a,b) starting from `guess`; on eGjkValid it fills witnesses, normal and distance and returns true, otherwise it sets results.status and returns false
+template <typename btConvexTemplate>
+bool		btGjkEpaSolver3_Distance(const btConvexTemplate& a, const btConvexTemplate& b,
+                                      const btVector3& guess,
+                                      btGjkEpaSolver3::sResults& results)
+    MinkowskiDiff<btConvexTemplate>			shape(a,b);
+    Initialize(a,b,results,shape); // zero witnesses, status Separated, relative transforms on shape
+    GJK<btConvexTemplate>				gjk(a,b);
+    eGjkStatus	gjk_status=gjk.Evaluate(shape,guess);
+    if(gjk_status==eGjkValid)
+    {
+        btVector3	w0=btVector3(0,0,0);
+        btVector3	w1=btVector3(0,0,0);
+        for(U i=0;i<gjk.m_simplex->rank;++i) // accumulate support points of each simplex vertex, weighted by p[i]
+        {
+            const btScalar	p=gjk.m_simplex->p[i];
+            w0+=shape.Support( gjk.m_simplex->c[i]->d,0)*p;
+            w1+=shape.Support(-gjk.m_simplex->c[i]->d,1)*p;
+        }
+        results.witnesses[0]	=	a.getWorldTransform()*w0;
+        results.witnesses[1]	=	a.getWorldTransform()*w1; // NOTE(review): a's transform is applied to w1 as well; presumably Support(...,1) already returns points in a's frame — confirm
+        results.normal			=	w0-w1;
+        results.distance		=	results.normal.length();
+        results.normal			/=	results.distance>GJK_MIN_DISTANCE?results.distance:1; // normalise only when the distance exceeds GJK_MIN_DISTANCE
+        return(true);
+    }
+    else
+    {
+        results.status	=	gjk_status==eGjkInside?
+        btGjkEpaSolver3::sResults::Penetrating	:
+        btGjkEpaSolver3::sResults::GJK_Failed	;
+        return(false);
+    }
+template <typename btConvexTemplate>
+bool	btGjkEpaSolver3_Penetration(const btConvexTemplate& a, // Runs GJK then EPA with -guess; returns true only when GJK reports eGjkInside and EPA does not report eEpaFailed
+                                     const btConvexTemplate& b,
+                                     const btVector3& guess,
+                                     btGjkEpaSolver3::sResults& results)
+    MinkowskiDiff<btConvexTemplate>			shape(a,b);
+    Initialize(a,b,results,shape);
+    GJK<btConvexTemplate>				gjk(a,b);
+    eGjkStatus	gjk_status=gjk.Evaluate(shape,-guess);
+    switch(gjk_status)
+    {
+        case	eGjkInside:
+        {
+            EPA<btConvexTemplate>				epa;
+            eEpaStatus	epa_status=epa.Evaluate(gjk,-guess);
+            if(epa_status!=eEpaFailed)
+            {
+                btVector3	w0=btVector3(0,0,0);
+                for(U i=0;i<epa.m_result.rank;++i) // weighted sum of the support points of the EPA result vertices
+                {
+                    w0+=shape.Support(epa.m_result.c[i]->d,0)*epa.m_result.p[i];
+                }
+                results.status			=	btGjkEpaSolver3::sResults::Penetrating;
+                results.witnesses[0]	=	a.getWorldTransform()*w0;
+                results.witnesses[1]	=	a.getWorldTransform()*(w0-epa.m_normal*epa.m_depth); // second witness: w0 moved by depth against the EPA normal
+                results.normal			=	-epa.m_normal;
+                results.distance		=	-epa.m_depth; // penetration is reported as a negated depth
+                return(true);
+            } else results.status=btGjkEpaSolver3::sResults::EPA_Failed;
+        }
+            break;
+        case	eGjkFailed:
+            results.status=btGjkEpaSolver3::sResults::GJK_Failed;
+            break;
+        default: // any other GJK status keeps the status written by Initialize (Separated)
+        {
+        }
+    }
+    return(false);
+#if 0 // NOTE(review): the matching #endif is not visible in this section; presumably only btComputeGjkEpaPenetration2 is compiled out — confirm
+int	btComputeGjkEpaPenetration2(const btCollisionDescription& colDesc, btDistanceInfo* distInfo) // Returns 0 after filling distInfo from a penetration result, -1 otherwise
+    btGjkEpaSolver3::sResults results;
+    btVector3 guess = colDesc.m_firstDir;
+    bool res = btGjkEpaSolver3::Penetration(colDesc.m_objA,colDesc.m_objB,
+                                            colDesc.m_transformA,colDesc.m_transformB,
+                                            colDesc.m_localSupportFuncA,colDesc.m_localSupportFuncB,
+                                            guess,
+                                            results);
+    if (res)
+    {
+        if ((results.status==btGjkEpaSolver3::sResults::Penetrating) || results.status==GJK::eStatus::Inside)
+        {
+            //normal could be 'swapped'
+            distInfo->m_distance = results.distance;
+            distInfo->m_normalBtoA = results.normal;
+            btVector3 tmpNormalInB = results.witnesses[1]-results.witnesses[0];
+            btScalar lenSqr = tmpNormalInB.length2();
+            if (lenSqr <= (SIMD_EPSILON*SIMD_EPSILON)) // witnesses coincide: fall back to the solver's normal
+            {
+                tmpNormalInB = results.normal;
+                lenSqr = results.normal.length2();
+            }
+            if (lenSqr > (SIMD_EPSILON*SIMD_EPSILON))
+            {
+                tmpNormalInB /= btSqrt(lenSqr);
+                btScalar distance2 = -(results.witnesses[0]-results.witnesses[1]).length(); // negative distance denotes penetration
+                //only replace valid penetrations when the result is deeper (check)
+                //if ((distance2 < results.distance))
+                {
+                    distInfo->m_distance = distance2;
+                    distInfo->m_pointOnA= results.witnesses[0];
+                    distInfo->m_pointOnB= results.witnesses[1];
+                    distInfo->m_normalBtoA= tmpNormalInB;
+                    return 0;
+                }
+            }
+        }
+    }
+    return -1;
+template <typename btConvexTemplate, typename btDistanceInfoTemplate>
+int	btComputeGjkDistance(const btConvexTemplate& a, const btConvexTemplate& b, // Wrapper over btGjkEpaSolver3_Distance: returns 0 and fills distInfo on success, -1 otherwise
+                         const btGjkCollisionDescription& colDesc, btDistanceInfoTemplate* distInfo)
+    btGjkEpaSolver3::sResults results;
+    btVector3 guess = colDesc.m_firstDir; // initial search direction taken from the collision description
+    bool isSeparated = btGjkEpaSolver3_Distance(	a,b,
+                                                 guess,
+                                                 results);
+    if (isSeparated)
+    {
+        distInfo->m_distance = results.distance;
+        distInfo->m_pointOnA= results.witnesses[0];
+        distInfo->m_pointOnB= results.witnesses[1];
+        distInfo->m_normalBtoA= results.normal;
+        return 0;
+    }
+    return -1; // distInfo is left untouched when the distance query fails
+/* Symbols cleanup		*/ 
+#endif //BT_GJK_EPA3_H
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.cpp b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.cpp
index c6dc3f3a..572ec36f 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.cpp
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.cpp
@@ -25,7 +25,7 @@ bool btGjkEpaPenetrationDepthSolver::calcPenDepth( btSimplexSolverInterface& sim
 											  const btConvexShape* pConvexA, const btConvexShape* pConvexB,
 											  const btTransform& transformA, const btTransform& transformB,
 											  btVector3& v, btVector3& wWitnessOnA, btVector3& wWitnessOnB,
-											  class btIDebugDraw* debugDraw, btStackAlloc* stackAlloc )
+											  class btIDebugDraw* debugDraw)
@@ -34,7 +34,7 @@ bool btGjkEpaPenetrationDepthSolver::calcPenDepth( btSimplexSolverInterface& sim
 //	const btScalar				radialmargin(btScalar(0.));
-	btVector3	guessVector(transformA.getOrigin()-transformB.getOrigin());
+	btVector3	guessVector(transformB.getOrigin()-transformA.getOrigin());
 	btGjkEpaSolver2::sResults	results;
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h
index a49689a1..1ed6340a 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h
@@ -33,7 +33,7 @@ class btGjkEpaPenetrationDepthSolver : public btConvexPenetrationDepthSolver
 									  const btConvexShape* pConvexA, const btConvexShape* pConvexB,
 									  const btTransform& transformA, const btTransform& transformB,
 									  btVector3& v, btVector3& wWitnessOnA, btVector3& wWitnessOnB,
-									  class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc );
+									  class btIDebugDraw* debugDraw);
 	private :
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.cpp b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.cpp
index 8af16b9c..603ad167 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.cpp
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.cpp
@@ -26,12 +26,15 @@ subject to the following restrictions:
 #ifdef __SPU__
 #include <spu_printf.h>
 #define printf spu_printf
 #endif //__SPU__
 //must be above the machine epsilon
-#define REL_ERROR2 btScalar(1.0e-6)
+	#define REL_ERROR2 btScalar(1.0e-12)
+	#define REL_ERROR2 btScalar(1.0e-6)
 //temp globals, to improve GJK/EPA/penetration calculations
 int gNumDeepPenetrationChecks = 0;
@@ -50,7 +53,8 @@ m_marginA(objectA->getMargin()),
 btGjkPairDetector::btGjkPairDetector(const btConvexShape* objectA,const btConvexShape* objectB,int shapeTypeA,int shapeTypeB,btScalar marginA, btScalar marginB, btSimplexSolverInterface* simplexSolver,btConvexPenetrationDepthSolver*	penetrationDepthSolver)
@@ -65,7 +69,8 @@ m_marginA(marginA),
@@ -79,17 +84,18 @@ void	btGjkPairDetector::getClosestPoints(const ClosestPointInput& input,Result&
 #ifdef __SPU__
 void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& input,Result& output,class btIDebugDraw* debugDraw)
-void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& input,Result& output,class btIDebugDraw* debugDraw)
+void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& input, Result& output, class btIDebugDraw* debugDraw)
 	m_cachedSeparatingDistance = 0.f;
 	btScalar distance=btScalar(0.);
 	btVector3	normalInB(btScalar(0.),btScalar(0.),btScalar(0.));
 	btVector3 pointOnA,pointOnB;
 	btTransform	localTransA = input.m_transformA;
 	btTransform localTransB = input.m_transformB;
-	btVector3 positionOffset = (localTransA.getOrigin() + localTransB.getOrigin()) * btScalar(0.5);
+	btVector3 positionOffset=(localTransA.getOrigin() + localTransB.getOrigin()) * btScalar(0.5);
 	localTransA.getOrigin() -= positionOffset;
 	localTransB.getOrigin() -= positionOffset;
@@ -100,17 +106,11 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
-	spu_printf("inside gjk\n");
 	//for CCD we don't use margins
 	if (m_ignoreMargin)
 		marginA = btScalar(0.);
 		marginB = btScalar(0.);
-		spu_printf("ignoring margin\n");
 	m_curIter = 0;
@@ -141,37 +141,13 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
 			btVector3 seperatingAxisInA = (-m_cachedSeparatingAxis)* input.m_transformA.getBasis();
 			btVector3 seperatingAxisInB = m_cachedSeparatingAxis* input.m_transformB.getBasis();
-#if 1
 			btVector3 pInA = m_minkowskiA->localGetSupportVertexWithoutMarginNonVirtual(seperatingAxisInA);
 			btVector3 qInB = m_minkowskiB->localGetSupportVertexWithoutMarginNonVirtual(seperatingAxisInB);
-//			btVector3 pInA  = localGetSupportingVertexWithoutMargin(m_shapeTypeA, m_minkowskiA, seperatingAxisInA,input.m_convexVertexData[0]);//, &featureIndexA);
-//			btVector3 qInB  = localGetSupportingVertexWithoutMargin(m_shapeTypeB, m_minkowskiB, seperatingAxisInB,input.m_convexVertexData[1]);//, &featureIndexB);
-#ifdef __SPU__
-			btVector3 pInA = m_minkowskiA->localGetSupportVertexWithoutMarginNonVirtual(seperatingAxisInA);
-			btVector3 qInB = m_minkowskiB->localGetSupportVertexWithoutMarginNonVirtual(seperatingAxisInB);
-			btVector3 pInA = m_minkowskiA->localGetSupportingVertexWithoutMargin(seperatingAxisInA);
-			btVector3 qInB = m_minkowskiB->localGetSupportingVertexWithoutMargin(seperatingAxisInB);
-			btVector3 pInAv = m_minkowskiA->localGetSupportingVertexWithoutMargin(seperatingAxisInA);
-			btVector3 qInBv = m_minkowskiB->localGetSupportingVertexWithoutMargin(seperatingAxisInB);
-			btAssert((pInAv-pInA).length() < 0.0001);
-			btAssert((qInBv-qInB).length() < 0.0001);
-#endif //
-#endif //__SPU__
 			btVector3  pWorld = localTransA(pInA);	
 			btVector3  qWorld = localTransB(qInB);
-		spu_printf("got local supporting vertices\n");
 			if (check2d)
@@ -215,14 +191,8 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
-		spu_printf("addVertex 1\n");
 			//add current vertex to simplex
 			m_simplexSolver->addVertex(w, pWorld, qWorld);
-		spu_printf("addVertex 2\n");
 			btVector3 newCachedSeparatingAxis;
 			//calculate the closest point to the origin (update vector v)
@@ -272,7 +242,7 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
 			  //degeneracy, this is typically due to invalid/uninitialized worldtransforms for a btCollisionObject   
               if (m_curIter++ > gGjkMaxIter)   
-                      #if defined(DEBUG) || defined (_DEBUG) || defined (DEBUG_SPU_COLLISION_DETECTION)
+                      #if defined(DEBUG) || defined (_DEBUG)
                               printf("btGjkPairDetector maxIter exceeded:%i\n",m_curIter);   
                               printf("sepAxis=(%f,%f,%f), squaredDistance = %f, shapeTypeA=%i,shapeTypeB=%i\n",   
@@ -305,6 +275,7 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
 			m_simplexSolver->compute_points(pointOnA, pointOnB);
 			normalInB = m_cachedSeparatingAxis;
 			btScalar lenSqr =m_cachedSeparatingAxis.length2();
 			//valid normal
@@ -316,6 +287,7 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
 				btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
 				normalInB *= rlen; //normalize
 				btScalar s = btSqrt(squaredDistance);
 				btAssert(s > btScalar(0.0));
@@ -353,7 +325,7 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
 					m_cachedSeparatingAxis, tmpPointOnA, tmpPointOnB,
-					debugDraw,input.m_stackAlloc
+					debugDraw
@@ -371,6 +343,7 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
 						tmpNormalInB /= btSqrt(lenSqr);
 						btScalar distance2 = -(tmpPointOnA-tmpPointOnB).length();
+						m_lastUsedMethod = 3;
 						//only replace valid penetrations when the result is deeper (check)
 						if (!isValid || (distance2 < distance))
@@ -378,8 +351,48 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
 							pointOnA = tmpPointOnA;
 							pointOnB = tmpPointOnB;
 							normalInB = tmpNormalInB;
+							///todo: need to track down this EPA penetration solver degeneracy
+							///the penetration solver reports penetration but the contact normal
+							///connecting the contact points is pointing in the opposite direction
+							///until then, detect the issue and revert the normal
+							{
+								btScalar d1=0;
+								{
+									btVector3 seperatingAxisInA = (normalInB)* input.m_transformA.getBasis();
+									btVector3 seperatingAxisInB = -normalInB* input.m_transformB.getBasis();
+									btVector3 pInA = m_minkowskiA->localGetSupportVertexWithoutMarginNonVirtual(seperatingAxisInA);
+									btVector3 qInB = m_minkowskiB->localGetSupportVertexWithoutMarginNonVirtual(seperatingAxisInB);
+									btVector3  pWorld = localTransA(pInA);	
+									btVector3  qWorld = localTransB(qInB);
+									btVector3 w	= pWorld - qWorld;
+									d1 = (-normalInB).dot(w);
+								}
+								btScalar d0 = 0.f;
+								{
+									btVector3 seperatingAxisInA = (-normalInB)* input.m_transformA.getBasis();
+									btVector3 seperatingAxisInB = normalInB* input.m_transformB.getBasis();
+									btVector3 pInA = m_minkowskiA->localGetSupportVertexWithoutMarginNonVirtual(seperatingAxisInA);
+									btVector3 qInB = m_minkowskiB->localGetSupportVertexWithoutMarginNonVirtual(seperatingAxisInB);
+									btVector3  pWorld = localTransA(pInA);	
+									btVector3  qWorld = localTransB(qInB);
+									btVector3 w	= pWorld - qWorld;
+									d0 = normalInB.dot(w);
+								}
+								if (d1>d0)
+								{
+									m_lastUsedMethod = 10;
+									normalInB*=-1;
+								} 
+							}
 							isValid = true;
-							m_lastUsedMethod = 3;
 						} else
 							m_lastUsedMethod = 8;
@@ -411,6 +424,7 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
 							pointOnB += m_cachedSeparatingAxis * marginB ;
 							normalInB = m_cachedSeparatingAxis;
 							isValid = true;
 							m_lastUsedMethod = 6;
 						} else
@@ -429,14 +443,6 @@ void btGjkPairDetector::getClosestPointsNonVirtual(const ClosestPointInput& inpu
 	if (isValid && ((distance < 0) || (distance*distance < input.m_maximumDistanceSquared)))
-#if 0
-///some debugging
-//		if (check2d)
-		{
-			printf("n = %2.3f,%2.3f,%2.3f. ",normalInB[0],normalInB[1],normalInB[2]);
-			printf("distance = %2.3f exit=%d deg=%d\n",distance,m_lastUsedMethod,m_degenerateSimplex);
-		}
 		m_cachedSeparatingAxis = normalInB;
 		m_cachedSeparatingDistance = distance;
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h
index 2277a19d..feeae686 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h
@@ -52,7 +52,7 @@ public:
 	int			m_curIter;
 	int			m_degenerateSimplex;
 	int			m_catchDegeneracies;
+	int			m_fixContactNormalDirection;
 	btGjkPairDetector(const btConvexShape* objectA,const btConvexShape* objectB,btSimplexSolverInterface* simplexSolver,btConvexPenetrationDepthSolver*	penetrationDepthSolver);
 	btGjkPairDetector(const btConvexShape* objectA,const btConvexShape* objectB,int shapeTypeA,int shapeTypeB,btScalar marginA, btScalar marginB, btSimplexSolverInterface* simplexSolver,btConvexPenetrationDepthSolver*	penetrationDepthSolver);
@@ -63,12 +63,12 @@ public:
 	void	getClosestPointsNonVirtual(const ClosestPointInput& input,Result& output,class btIDebugDraw* debugDraw);
-	void setMinkowskiA(btConvexShape* minkA)
+	void setMinkowskiA(const btConvexShape* minkA)
 		m_minkowskiA = minkA;
-	void setMinkowskiB(btConvexShape* minkB)
+	void setMinkowskiB(const btConvexShape* minkB)
 		m_minkowskiB = minkB;
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h
index 0ce9dd25..1bb7a7b9 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h
@@ -35,7 +35,12 @@ typedef sce::PhysicsEffects::PfxConstraintRow btConstraintRow;
 	typedef btConstraintRow PfxConstraintRow;
+enum btContactPointFlags
 /// ManifoldContactPoint collects and maintains persistent contactpoints.
 /// used to improve stability and performance of rigidbody dynamics response.
@@ -44,14 +49,15 @@ class btManifoldPoint
+				m_contactPointFlags(0),
-				m_lateralFrictionInitialized(false),
-				m_appliedImpulseLateral1(0.f),
+                m_appliedImpulseLateral1(0.f),
-				m_contactCFM1(0.f),
-				m_contactCFM2(0.f),
+				m_contactCFM(0.f),
+				m_contactERP(0.f),
+				m_frictionCFM(0.f),
@@ -64,21 +70,21 @@ class btManifoldPoint
 					m_normalWorldOnB( normal ), 
 					m_distance1( distance ),
+					m_combinedRollingFriction(btScalar(0.)),
+					m_contactPointFlags(0),
-					m_lateralFrictionInitialized(false),
-					m_appliedImpulseLateral1(0.f),
+                    m_appliedImpulseLateral1(0.f),
-					m_contactCFM1(0.f),
-					m_contactCFM2(0.f),
+					m_contactCFM(0.f),
+					m_contactERP(0.f),
+					m_frictionCFM(0.f),
-				mConstraintRow[0].m_accumImpulse = 0.f;
-				mConstraintRow[1].m_accumImpulse = 0.f;
-				mConstraintRow[2].m_accumImpulse = 0.f;
@@ -92,24 +98,28 @@ class btManifoldPoint
 			btScalar	m_distance1;
 			btScalar	m_combinedFriction;
+			btScalar	m_combinedRollingFriction;
 			btScalar	m_combinedRestitution;
-         //BP mod, store contact triangles.
-         int	   m_partId0;
-         int      m_partId1;
-         int      m_index0;
-         int      m_index1;
+			//BP mod, store contact triangles.
+			int			m_partId0;
+			int			m_partId1;
+			int			m_index0;
+			int			m_index1;
 			mutable void*	m_userPersistentData;
+			//bool			m_lateralFrictionInitialized;
+			int				m_contactPointFlags;
 			btScalar		m_appliedImpulse;
-			bool			m_lateralFrictionInitialized;
 			btScalar		m_appliedImpulseLateral1;
 			btScalar		m_appliedImpulseLateral2;
 			btScalar		m_contactMotion1;
 			btScalar		m_contactMotion2;
-			btScalar		m_contactCFM1;
-			btScalar		m_contactCFM2;
+			btScalar		m_contactCFM;
+			btScalar		m_contactERP;
+			btScalar		m_frictionCFM;
 			int				m_lifeTime;//lifetime of the contactpoint in frames
@@ -118,8 +128,6 @@ class btManifoldPoint
-			btConstraintRow mConstraintRow[3];
 			btScalar getDistance() const
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.cpp b/src/bullet/BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.cpp
index fe31f08d..fa45f490 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.cpp
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.cpp
@@ -26,11 +26,10 @@ bool btMinkowskiPenetrationDepthSolver::calcPenDepth(btSimplexSolverInterface& s
 												   const btConvexShape* convexA,const btConvexShape* convexB,
 												   const btTransform& transA,const btTransform& transB,
 												   btVector3& v, btVector3& pa, btVector3& pb,
-												   class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc
+												   class btIDebugDraw* debugDraw
-	(void)stackAlloc;
 	bool check2d= convexA->isConvex2d() && convexB->isConvex2d();
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.h
index 6a8fe52f..fd533b4f 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.h
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.h
@@ -32,7 +32,7 @@ public:
 	const btConvexShape* convexA,const btConvexShape* convexB,
 				const btTransform& transA,const btTransform& transB,
 			btVector3& v, btVector3& pa, btVector3& pb,
-			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc
+			class btIDebugDraw* debugDraw
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btMprPenetration.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btMprPenetration.h
new file mode 100644
index 00000000..a22a0bae
--- /dev/null
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btMprPenetration.h
@@ -0,0 +1,908 @@
+ * ---------------------------------
+ * Copyright (c)2012 Daniel Fiser <danfis@danfis.cz>
+ *
+ *  This file was ported from mpr.c file, part of libccd.
+ *  The Minkowski Portal Refinement implementation was ported 
+ *  to OpenCL by Erwin Coumans for the Bullet 3 Physics library.
+ *  The original MPR idea and implementation is by Gary Snethen
+ *  in XenoCollide, see http://github.com/erwincoumans/xenocollide
+ *
+ *  Distributed under the OSI-approved BSD License (the "License");
+ *  see <http://www.opensource.org/licenses/bsd-license.php>.
+ *  This software is distributed WITHOUT ANY WARRANTY; without even the
+ *  See the License for more information.
+ */
+///2014 Oct, Erwin Coumans, Use templates to avoid void* casts
+#define BT_DEBUG_MPR1
+#include "LinearMath/btTransform.h"
+#include "LinearMath/btAlignedObjectArray.h"
+struct btMprCollisionDescription
+    btVector3	m_firstDir;
+    int			m_maxGjkIterations;
+    btScalar	m_maximumDistanceSquared;
+    btScalar	m_gjkRelError2;
+    btMprCollisionDescription()
+    :	m_firstDir(0,1,0),
+        m_maxGjkIterations(1000),
+        m_maximumDistanceSquared(1e30f),
+        m_gjkRelError2(1.0e-6)
+    {
+    }
+    virtual ~btMprCollisionDescription()
+    {
+    }
+struct btMprDistanceInfo
+    btVector3	m_pointOnA;
+    btVector3	m_pointOnB;
+    btVector3	m_normalBtoA;
+    btScalar	m_distance;
+#ifdef __cplusplus
+#define BT_MPR_SQRT sqrtf
+#define BT_MPR_SQRT sqrt
+#define BT_MPR_FMIN(x, y) ((x) < (y) ? (x) : (y))
+#define BT_MPR_FABS fabs
+#define BT_MPR_TOLERANCE 1E-6f
+struct _btMprSupport_t 
+    btVector3 v;  //!< Support point in minkowski sum
+    btVector3 v1; //!< Support point in obj1
+    btVector3 v2; //!< Support point in obj2
+typedef struct _btMprSupport_t btMprSupport_t;
+struct _btMprSimplex_t 
+    btMprSupport_t ps[4];
+    int last; //!< index of last added point
+typedef struct _btMprSimplex_t btMprSimplex_t;
+inline btMprSupport_t* btMprSimplexPointW(btMprSimplex_t *s, int idx)
+    return &s->ps[idx];
+inline void btMprSimplexSetSize(btMprSimplex_t *s, int size)
+    s->last = size - 1;
+#ifdef DEBUG_MPR
+inline void btPrintPortalVertex(_btMprSimplex_t* portal, int index)
+    printf("portal[%d].v = %f,%f,%f, v1=%f,%f,%f, v2=%f,%f,%f\n", index, portal->ps[index].v.x(),portal->ps[index].v.y(),portal->ps[index].v.z(),
+           portal->ps[index].v1.x(),portal->ps[index].v1.y(),portal->ps[index].v1.z(),
+           portal->ps[index].v2.x(),portal->ps[index].v2.y(),portal->ps[index].v2.z());
+#endif //DEBUG_MPR
+inline int btMprSimplexSize(const btMprSimplex_t *s)
+    return s->last + 1;
+inline const btMprSupport_t* btMprSimplexPoint(const btMprSimplex_t* s, int idx)
+    // note: idx is not bounds-checked here
+    return &s->ps[idx];
+inline void btMprSupportCopy(btMprSupport_t *d, const btMprSupport_t *s)
+    *d = *s;
+inline void btMprSimplexSet(btMprSimplex_t *s, size_t pos, const btMprSupport_t *a)
+    btMprSupportCopy(s->ps + pos, a);
+inline void btMprSimplexSwap(btMprSimplex_t *s, size_t pos1, size_t pos2)
+    btMprSupport_t supp;
+    btMprSupportCopy(&supp, &s->ps[pos1]);
+    btMprSupportCopy(&s->ps[pos1], &s->ps[pos2]);
+    btMprSupportCopy(&s->ps[pos2], &supp);
+inline int btMprIsZero(float val)
+    return BT_MPR_FABS(val) < FLT_EPSILON;
+inline int btMprEq(float _a, float _b)
+    float ab;
+    float a, b;
+    ab = BT_MPR_FABS(_a - _b);
+    if (BT_MPR_FABS(ab) < FLT_EPSILON)
+        return 1;
+    a = BT_MPR_FABS(_a);
+    b = BT_MPR_FABS(_b);
+    if (b > a){
+        return ab < FLT_EPSILON * b;
+    }else{
+        return ab < FLT_EPSILON * a;
+    }
+inline int btMprVec3Eq(const btVector3* a, const btVector3 *b)
+    return btMprEq((*a).x(), (*b).x())
+            && btMprEq((*a).y(), (*b).y())
+            && btMprEq((*a).z(), (*b).z());
+template <typename btConvexTemplate>
+inline void btFindOrigin(const btConvexTemplate& a, const btConvexTemplate& b, const btMprCollisionDescription& colDesc,btMprSupport_t *center)
+	center->v1 = a.getObjectCenterInWorld();
+    center->v2 = b.getObjectCenterInWorld();
+    center->v = center->v1 - center->v2;
+inline void btMprVec3Set(btVector3 *v, float x, float y, float z)
+	v->setValue(x,y,z);
+inline void btMprVec3Add(btVector3 *v, const btVector3 *w)
+	*v += *w;
+inline void btMprVec3Copy(btVector3 *v, const btVector3 *w)
+    *v = *w;
+inline void btMprVec3Scale(btVector3 *d, float k)
+    *d *= k;
+inline float btMprVec3Dot(const btVector3 *a, const btVector3 *b)
+    float dot;
+	dot = btDot(*a,*b);
+    return dot;
+inline float btMprVec3Len2(const btVector3 *v)
+    return btMprVec3Dot(v, v);
+inline void btMprVec3Normalize(btVector3 *d)
+    float k = 1.f / BT_MPR_SQRT(btMprVec3Len2(d));
+    btMprVec3Scale(d, k);
+inline void btMprVec3Cross(btVector3 *d, const btVector3 *a, const btVector3 *b)
+	*d = btCross(*a,*b);
+inline void btMprVec3Sub2(btVector3 *d, const btVector3 *v, const btVector3 *w)
+	*d = *v - *w;
+inline void btPortalDir(const btMprSimplex_t *portal, btVector3 *dir)
+    btVector3 v2v1, v3v1;
+    btMprVec3Sub2(&v2v1, &btMprSimplexPoint(portal, 2)->v,
+                       &btMprSimplexPoint(portal, 1)->v);
+    btMprVec3Sub2(&v3v1, &btMprSimplexPoint(portal, 3)->v,
+                       &btMprSimplexPoint(portal, 1)->v);
+    btMprVec3Cross(dir, &v2v1, &v3v1);
+    btMprVec3Normalize(dir);
+inline int portalEncapsulesOrigin(const btMprSimplex_t *portal,
+                                       const btVector3 *dir)
+    float dot;
+    dot = btMprVec3Dot(dir, &btMprSimplexPoint(portal, 1)->v);
+    return btMprIsZero(dot) || dot > 0.f;
+inline int portalReachTolerance(const btMprSimplex_t *portal,
+                                     const btMprSupport_t *v4,
+                                     const btVector3 *dir)
+    float dv1, dv2, dv3, dv4;
+    float dot1, dot2, dot3;
+    // find the smallest dot product of dir and {v1-v4, v2-v4, v3-v4}
+    dv1 = btMprVec3Dot(&btMprSimplexPoint(portal, 1)->v, dir);
+    dv2 = btMprVec3Dot(&btMprSimplexPoint(portal, 2)->v, dir);
+    dv3 = btMprVec3Dot(&btMprSimplexPoint(portal, 3)->v, dir);
+    dv4 = btMprVec3Dot(&v4->v, dir);
+    dot1 = dv4 - dv1;
+    dot2 = dv4 - dv2;
+    dot3 = dv4 - dv3;
+    dot1 = BT_MPR_FMIN(dot1, dot2);
+    dot1 = BT_MPR_FMIN(dot1, dot3);
+    return btMprEq(dot1, BT_MPR_TOLERANCE) || dot1 < BT_MPR_TOLERANCE;
+inline int portalCanEncapsuleOrigin(const btMprSimplex_t *portal,
+                                         const btMprSupport_t *v4,
+                                         const btVector3 *dir)
+    float dot;
+    dot = btMprVec3Dot(&v4->v, dir);
+    return btMprIsZero(dot) || dot > 0.f;
+inline void btExpandPortal(btMprSimplex_t *portal,
+                              const btMprSupport_t *v4)
+    float dot;
+    btVector3 v4v0;
+    btMprVec3Cross(&v4v0, &v4->v, &btMprSimplexPoint(portal, 0)->v);
+    dot = btMprVec3Dot(&btMprSimplexPoint(portal, 1)->v, &v4v0);
+    if (dot > 0.f){
+        dot = btMprVec3Dot(&btMprSimplexPoint(portal, 2)->v, &v4v0);
+        if (dot > 0.f){
+            btMprSimplexSet(portal, 1, v4);
+        }else{
+            btMprSimplexSet(portal, 3, v4);
+        }
+    }else{
+        dot = btMprVec3Dot(&btMprSimplexPoint(portal, 3)->v, &v4v0);
+        if (dot > 0.f){
+            btMprSimplexSet(portal, 2, v4);
+        }else{
+            btMprSimplexSet(portal, 1, v4);
+        }
+    }
+template <typename btConvexTemplate>
+inline void btMprSupport(const btConvexTemplate& a, const btConvexTemplate& b,
+                         const btMprCollisionDescription& colDesc,
+													const btVector3& dir, btMprSupport_t *supp)
+	btVector3 seperatingAxisInA = dir* a.getWorldTransform().getBasis();
+	btVector3 seperatingAxisInB = -dir* b.getWorldTransform().getBasis();
+	btVector3 pInA = a.getLocalSupportWithMargin(seperatingAxisInA);
+	btVector3 qInB = b.getLocalSupportWithMargin(seperatingAxisInB);
+	supp->v1 = a.getWorldTransform()(pInA);
+	supp->v2 = b.getWorldTransform()(qInB);
+	supp->v = supp->v1 - supp->v2;
+template <typename btConvexTemplate>
+static int btDiscoverPortal(const btConvexTemplate& a, const btConvexTemplate& b,
+                            const btMprCollisionDescription& colDesc,
+													btMprSimplex_t *portal)
+    btVector3 dir, va, vb;
+    float dot;
+    int cont;
+    // vertex 0 is center of portal
+    btFindOrigin(a,b,colDesc, btMprSimplexPointW(portal, 0));
+    // vertex 0 is center of portal
+    btMprSimplexSetSize(portal, 1);
+	btVector3 zero = btVector3(0,0,0);
+	btVector3* org = &zero;
+    if (btMprVec3Eq(&btMprSimplexPoint(portal, 0)->v, org)){
+        // Portal's center lies on origin (0,0,0) => we know that objects
+        // intersect but we would need to know penetration info.
+        // So move center little bit...
+        btMprVec3Set(&va, FLT_EPSILON * 10.f, 0.f, 0.f);
+        btMprVec3Add(&btMprSimplexPointW(portal, 0)->v, &va);
+    }
+    // vertex 1 = support in direction of origin
+    btMprVec3Copy(&dir, &btMprSimplexPoint(portal, 0)->v);
+    btMprVec3Scale(&dir, -1.f);
+    btMprVec3Normalize(&dir);
+    btMprSupport(a,b,colDesc, dir, btMprSimplexPointW(portal, 1));
+    btMprSimplexSetSize(portal, 2);
+    // test if origin isn't outside of v1
+    dot = btMprVec3Dot(&btMprSimplexPoint(portal, 1)->v, &dir);
+    if (btMprIsZero(dot) || dot < 0.f)
+        return -1;
+    // vertex 2
+    btMprVec3Cross(&dir, &btMprSimplexPoint(portal, 0)->v,
+                       &btMprSimplexPoint(portal, 1)->v);
+    if (btMprIsZero(btMprVec3Len2(&dir))){
+        if (btMprVec3Eq(&btMprSimplexPoint(portal, 1)->v, org)){
+            // origin lies on v1
+            return 1;
+        }else{
+            // origin lies on v0-v1 segment
+            return 2;
+        }
+    }
+    btMprVec3Normalize(&dir);
+    btMprSupport(a,b,colDesc, dir, btMprSimplexPointW(portal, 2));
+    dot = btMprVec3Dot(&btMprSimplexPoint(portal, 2)->v, &dir);
+    if (btMprIsZero(dot) || dot < 0.f)
+        return -1;
+    btMprSimplexSetSize(portal, 3);
+    // vertex 3 direction
+    btMprVec3Sub2(&va, &btMprSimplexPoint(portal, 1)->v,
+                     &btMprSimplexPoint(portal, 0)->v);
+    btMprVec3Sub2(&vb, &btMprSimplexPoint(portal, 2)->v,
+                     &btMprSimplexPoint(portal, 0)->v);
+    btMprVec3Cross(&dir, &va, &vb);
+    btMprVec3Normalize(&dir);
+    // it is better to form portal faces to be oriented "outside" origin
+    dot = btMprVec3Dot(&dir, &btMprSimplexPoint(portal, 0)->v);
+    if (dot > 0.f){
+        btMprSimplexSwap(portal, 1, 2);
+        btMprVec3Scale(&dir, -1.f);
+    }
+    while (btMprSimplexSize(portal) < 4){
+		 btMprSupport(a,b,colDesc, dir, btMprSimplexPointW(portal, 3));
+        dot = btMprVec3Dot(&btMprSimplexPoint(portal, 3)->v, &dir);
+        if (btMprIsZero(dot) || dot < 0.f)
+            return -1;
+        cont = 0;
+        // test if origin is outside (v1, v0, v3) - set v2 as v3 and
+        // continue
+        btMprVec3Cross(&va, &btMprSimplexPoint(portal, 1)->v,
+                          &btMprSimplexPoint(portal, 3)->v);
+        dot = btMprVec3Dot(&va, &btMprSimplexPoint(portal, 0)->v);
+        if (dot < 0.f && !btMprIsZero(dot)){
+            btMprSimplexSet(portal, 2, btMprSimplexPoint(portal, 3));
+            cont = 1;
+        }
+        if (!cont){
+            // test if origin is outside (v3, v0, v2) - set v1 as v3 and
+            // continue
+            btMprVec3Cross(&va, &btMprSimplexPoint(portal, 3)->v,
+                              &btMprSimplexPoint(portal, 2)->v);
+            dot = btMprVec3Dot(&va, &btMprSimplexPoint(portal, 0)->v);
+            if (dot < 0.f && !btMprIsZero(dot)){
+                btMprSimplexSet(portal, 1, btMprSimplexPoint(portal, 3));
+                cont = 1;
+            }
+        }
+        if (cont){
+            btMprVec3Sub2(&va, &btMprSimplexPoint(portal, 1)->v,
+                             &btMprSimplexPoint(portal, 0)->v);
+            btMprVec3Sub2(&vb, &btMprSimplexPoint(portal, 2)->v,
+                             &btMprSimplexPoint(portal, 0)->v);
+            btMprVec3Cross(&dir, &va, &vb);
+            btMprVec3Normalize(&dir);
+        }else{
+            btMprSimplexSetSize(portal, 4);
+        }
+    }
+    return 0;
+template <typename btConvexTemplate>
+static int btRefinePortal(const btConvexTemplate& a, const btConvexTemplate& b,const btMprCollisionDescription& colDesc,
+							btMprSimplex_t *portal)
+    btVector3 dir;
+    btMprSupport_t v4;
+	for (int i=0;i<BT_MPR_MAX_ITERATIONS;i++)
+    //while (1)
+	{
+        // compute direction outside the portal (from v0 through v1,v2,v3
+        // face)
+        btPortalDir(portal, &dir);
+        // test if origin is inside the portal
+        if (portalEncapsulesOrigin(portal, &dir))
+            return 0;
+        // get next support point
+		 btMprSupport(a,b,colDesc, dir, &v4);
+        // test if v4 can expand portal to contain origin and if portal
+        // expanding doesn't reach given tolerance
+        if (!portalCanEncapsuleOrigin(portal, &v4, &dir)
+                || portalReachTolerance(portal, &v4, &dir))
+		{
+            return -1;
+        }
+        // v1-v2-v3 triangle must be rearranged to face outside Minkowski
+        // difference (direction from v0).
+        btExpandPortal(portal, &v4);
+    }
+    return -1;
+static void btFindPos(const btMprSimplex_t *portal, btVector3 *pos)
+	btVector3 zero = btVector3(0,0,0);
+	btVector3* origin = &zero;
+    btVector3 dir;
+    size_t i;
+    float b[4], sum, inv;
+    btVector3 vec, p1, p2;
+    btPortalDir(portal, &dir);
+    // use barycentric coordinates of tetrahedron to find origin
+    btMprVec3Cross(&vec, &btMprSimplexPoint(portal, 1)->v,
+                       &btMprSimplexPoint(portal, 2)->v);
+    b[0] = btMprVec3Dot(&vec, &btMprSimplexPoint(portal, 3)->v);
+    btMprVec3Cross(&vec, &btMprSimplexPoint(portal, 3)->v,
+                       &btMprSimplexPoint(portal, 2)->v);
+    b[1] = btMprVec3Dot(&vec, &btMprSimplexPoint(portal, 0)->v);
+    btMprVec3Cross(&vec, &btMprSimplexPoint(portal, 0)->v,
+                       &btMprSimplexPoint(portal, 1)->v);
+    b[2] = btMprVec3Dot(&vec, &btMprSimplexPoint(portal, 3)->v);
+    btMprVec3Cross(&vec, &btMprSimplexPoint(portal, 2)->v,
+                       &btMprSimplexPoint(portal, 1)->v);
+    b[3] = btMprVec3Dot(&vec, &btMprSimplexPoint(portal, 0)->v);
+	sum = b[0] + b[1] + b[2] + b[3];
+    if (btMprIsZero(sum) || sum < 0.f){
+		b[0] = 0.f;
+        btMprVec3Cross(&vec, &btMprSimplexPoint(portal, 2)->v,
+                           &btMprSimplexPoint(portal, 3)->v);
+        b[1] = btMprVec3Dot(&vec, &dir);
+        btMprVec3Cross(&vec, &btMprSimplexPoint(portal, 3)->v,
+                           &btMprSimplexPoint(portal, 1)->v);
+        b[2] = btMprVec3Dot(&vec, &dir);
+        btMprVec3Cross(&vec, &btMprSimplexPoint(portal, 1)->v,
+                           &btMprSimplexPoint(portal, 2)->v);
+        b[3] = btMprVec3Dot(&vec, &dir);
+		sum = b[1] + b[2] + b[3];
+	}
+	inv = 1.f / sum;
+    btMprVec3Copy(&p1, origin);
+    btMprVec3Copy(&p2, origin);
+    for (i = 0; i < 4; i++){
+        btMprVec3Copy(&vec, &btMprSimplexPoint(portal, i)->v1);
+        btMprVec3Scale(&vec, b[i]);
+        btMprVec3Add(&p1, &vec);
+        btMprVec3Copy(&vec, &btMprSimplexPoint(portal, i)->v2);
+        btMprVec3Scale(&vec, b[i]);
+        btMprVec3Add(&p2, &vec);
+    }
+    btMprVec3Scale(&p1, inv);
+    btMprVec3Scale(&p2, inv);
+    btMprVec3Copy(pos, &p1);
+    btMprVec3Add(pos, &p2);
+    btMprVec3Scale(pos, 0.5);
+    btMprVec3Copy(pos, &p2);
+inline float btMprVec3Dist2(const btVector3 *a, const btVector3 *b)
+    btVector3 ab;
+    btMprVec3Sub2(&ab, a, b);
+    return btMprVec3Len2(&ab);
+inline float _btMprVec3PointSegmentDist2(const btVector3 *P,
+                                                  const btVector3 *x0,
+                                                  const btVector3 *b,
+                                                  btVector3 *witness)
+    // The computation comes from solving equation of segment:
+    //      S(t) = x0 + t.d
+    //          where - x0 is initial point of segment
+    //                - d is direction of segment from x0 (|d| > 0)
+    //                - t belongs to <0, 1> interval
+    // 
+    // Then the squared distance from P to the segment point S(t) is:
+    //      D(t) = |x0 + t.d - P|^2
+    //          which is defined for any point on the segment. Minimizing
+    //          this function gives the squared distance from P to the segment.
+    // Minimization of D(t) leads to a simple quadratic equation whose
+    // solution is straightforward.
+    //
+    // A bonus of this method is that the witness point comes for free.
+    float dist, t;
+    btVector3 d, a;
+    // direction of segment
+    btMprVec3Sub2(&d, b, x0);
+    // precompute vector from P to x0
+    btMprVec3Sub2(&a, x0, P);
+    t  = -1.f * btMprVec3Dot(&a, &d);
+    t /= btMprVec3Len2(&d);
+    if (t < 0.f || btMprIsZero(t)){
+        dist = btMprVec3Dist2(x0, P);
+        if (witness)
+            btMprVec3Copy(witness, x0);
+    }else if (t > 1.f || btMprEq(t, 1.f)){
+        dist = btMprVec3Dist2(b, P);
+        if (witness)
+            btMprVec3Copy(witness, b);
+    }else{
+        if (witness){
+            btMprVec3Copy(witness, &d);
+            btMprVec3Scale(witness, t);
+            btMprVec3Add(witness, x0);
+            dist = btMprVec3Dist2(witness, P);
+        }else{
+            // recycling variables
+            btMprVec3Scale(&d, t);
+            btMprVec3Add(&d, &a);
+            dist = btMprVec3Len2(&d);
+        }
+    }
+    return dist;
+inline float btMprVec3PointTriDist2(const btVector3 *P,
+                                const btVector3 *x0, const btVector3 *B,
+                                const btVector3 *C,
+                                btVector3 *witness)
+    // Computation comes from analytic expression for triangle (x0, B, C)
+    //      T(s, t) = x0 + s.d1 + t.d2, where d1 = B - x0 and d2 = C - x0 and
+    // Then equation for distance is:
+    //      D(s, t) = | T(s, t) - P |^2
+    // This leads to minimization of quadratic function of two variables.
+    // The resulting solution is used only if s is between 0 and 1, t is
+    // between 0 and 1 and t + s < 1; otherwise the distance to the
+    // triangle's edge segments is computed instead.
+    btVector3 d1, d2, a;
+    float u, v, w, p, q, r;
+    float s, t, dist, dist2;
+    btVector3 witness2;
+    btMprVec3Sub2(&d1, B, x0);
+    btMprVec3Sub2(&d2, C, x0);
+    btMprVec3Sub2(&a, x0, P);
+    u = btMprVec3Dot(&a, &a);
+    v = btMprVec3Dot(&d1, &d1);
+    w = btMprVec3Dot(&d2, &d2);
+    p = btMprVec3Dot(&a, &d1);
+    q = btMprVec3Dot(&a, &d2);
+    r = btMprVec3Dot(&d1, &d2);
+	btScalar div = (w * v - r * r);
+	if (btMprIsZero(div))
+	{
+		s=-1;
+	} else
+	{
+		s = (q * r - w * p) / div;
+		t = (-s * r - q) / w;
+	}
+    if ((btMprIsZero(s) || s > 0.f)
+            && (btMprEq(s, 1.f) || s < 1.f)
+            && (btMprIsZero(t) || t > 0.f)
+            && (btMprEq(t, 1.f) || t < 1.f)
+            && (btMprEq(t + s, 1.f) || t + s < 1.f)){
+        if (witness){
+            btMprVec3Scale(&d1, s);
+            btMprVec3Scale(&d2, t);
+            btMprVec3Copy(witness, x0);
+            btMprVec3Add(witness, &d1);
+            btMprVec3Add(witness, &d2);
+            dist = btMprVec3Dist2(witness, P);
+        }else{
+            dist  = s * s * v;
+            dist += t * t * w;
+            dist += 2.f * s * t * r;
+            dist += 2.f * s * p;
+            dist += 2.f * t * q;
+            dist += u;
+        }
+    }else{
+        dist = _btMprVec3PointSegmentDist2(P, x0, B, witness);
+        dist2 = _btMprVec3PointSegmentDist2(P, x0, C, &witness2);
+        if (dist2 < dist){
+            dist = dist2;
+            if (witness)
+                btMprVec3Copy(witness, &witness2);
+        }
+        dist2 = _btMprVec3PointSegmentDist2(P, B, C, &witness2);
+        if (dist2 < dist){
+            dist = dist2;
+            if (witness)
+                btMprVec3Copy(witness, &witness2);
+        }
+    }
+    return dist;
+template <typename btConvexTemplate>
+static void btFindPenetr(const btConvexTemplate& a, const btConvexTemplate& b,
+                         const btMprCollisionDescription& colDesc,
+                         btMprSimplex_t *portal,
+                         float *depth, btVector3 *pdir, btVector3 *pos)
+    btVector3 dir;
+    btMprSupport_t v4;
+    unsigned long iterations;
+	btVector3 zero = btVector3(0,0,0);
+	btVector3* origin = &zero;
+    iterations = 1UL;
+	for (int i=0;i<BT_MPR_MAX_ITERATIONS;i++)
+    //while (1)
+	{
+        // compute portal direction and obtain next support point
+        btPortalDir(portal, &dir);
+		 btMprSupport(a,b,colDesc, dir, &v4);
+        // reached tolerance -> find penetration info
+        if (portalReachTolerance(portal, &v4, &dir)
+                || iterations ==BT_MPR_MAX_ITERATIONS)
+		{
+            *depth = btMprVec3PointTriDist2(origin,&btMprSimplexPoint(portal, 1)->v,&btMprSimplexPoint(portal, 2)->v,&btMprSimplexPoint(portal, 3)->v,pdir);
+            *depth = BT_MPR_SQRT(*depth);
+			if (btMprIsZero((*pdir).x()) && btMprIsZero((*pdir).y()) && btMprIsZero((*pdir).z()))
+			{
+				*pdir = dir;
+			} 
+			btMprVec3Normalize(pdir);
+            // barycentric coordinates:
+            btFindPos(portal, pos);
+            return;
+        }
+        btExpandPortal(portal, &v4);
+        iterations++;
+    }
+static void btFindPenetrTouch(btMprSimplex_t *portal,float *depth, btVector3 *dir, btVector3 *pos)
+    // Touching contact on portal's v1 - so depth is zero and direction
+    // is unimportant and pos can be guessed
+    *depth = 0.f;
+    btVector3 zero = btVector3(0,0,0);
+	btVector3* origin = &zero;
+	btMprVec3Copy(dir, origin);
+    btMprVec3Copy(pos, &btMprSimplexPoint(portal, 1)->v1);
+    btMprVec3Add(pos, &btMprSimplexPoint(portal, 1)->v2);
+    btMprVec3Scale(pos, 0.5);
+     btMprVec3Copy(pos, &btMprSimplexPoint(portal, 1)->v2);
+static void btFindPenetrSegment(btMprSimplex_t *portal,
+                              float *depth, btVector3 *dir, btVector3 *pos)
+    // Origin lies on v0-v1 segment.
+    // Depth is distance to v1, direction also and position must be
+    // computed
+    btMprVec3Copy(pos, &btMprSimplexPoint(portal, 1)->v1);
+    btMprVec3Add(pos, &btMprSimplexPoint(portal, 1)->v2);
+    btMprVec3Scale(pos, 0.5f);
+     btMprVec3Copy(pos, &btMprSimplexPoint(portal, 1)->v2);
+    btMprVec3Copy(dir, &btMprSimplexPoint(portal, 1)->v);
+    *depth = BT_MPR_SQRT(btMprVec3Len2(dir));
+    btMprVec3Normalize(dir);
+template <typename btConvexTemplate>
+inline int btMprPenetration( const btConvexTemplate& a, const btConvexTemplate& b,
+                            const btMprCollisionDescription& colDesc,
+					float *depthOut, btVector3* dirOut, btVector3* posOut)
+	 btMprSimplex_t portal;
+    // Phase 1: Portal discovery
+    int result = btDiscoverPortal(a,b,colDesc, &portal);
+	//sepAxis[pairIndex] = *pdir;//or -dir?
+	switch (result)
+	{
+	case 0:
+		{
+			// Phase 2: Portal refinement
+			result = btRefinePortal(a,b,colDesc, &portal);
+			if (result < 0)
+				return -1;
+			// Phase 3. Penetration info
+			btFindPenetr(a,b,colDesc, &portal, depthOut, dirOut, posOut);
+			break;
+		}
+	case 1:
+		{
+			 // Touching contact on portal's v1.
+			btFindPenetrTouch(&portal, depthOut, dirOut, posOut);
+			result=0;
+			break;
+		}
+	case 2:
+		{
+			btFindPenetrSegment( &portal, depthOut, dirOut, posOut);
+			result=0;
+			break;
+		}
+	default:
+		{
+			//if (res < 0)
+			//{
+				// Origin isn't inside portal - no collision.
+				result = -1;
+			//}
+		}
+	};
+	return result;
+template<typename btConvexTemplate, typename btMprDistanceTemplate>
+inline int	btComputeMprPenetration( const btConvexTemplate& a, const btConvexTemplate& b, const
+                                    btMprCollisionDescription& colDesc, btMprDistanceTemplate* distInfo)
+	btVector3 dir,pos;
+	float depth;
+	int res = btMprPenetration(a,b,colDesc,&depth, &dir, &pos);
+	if (res==0)
+	{
+		distInfo->m_distance = -depth;
+		distInfo->m_pointOnB = pos;
+		distInfo->m_normalBtoA = -dir;
+		distInfo->m_pointOnA = pos-distInfo->m_distance*dir;
+		return 0;
+	}
+	return -1;
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btPersistentManifold.cpp b/src/bullet/BulletCollision/NarrowPhaseCollision/btPersistentManifold.cpp
index 954b8395..4d92e853 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btPersistentManifold.cpp
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btPersistentManifold.cpp
@@ -205,10 +205,13 @@ int btPersistentManifold::getCacheEntry(const btManifoldPoint& newPoint) const
 	return nearestPoint;
-int btPersistentManifold::addManifoldPoint(const btManifoldPoint& newPoint)
+int btPersistentManifold::addManifoldPoint(const btManifoldPoint& newPoint, bool isPredictive)
-	btAssert(validContactDistance(newPoint));
+	if (!isPredictive)
+	{
+		btAssert(validContactDistance(newPoint));
+	}
 	int insertIndex = getNumContacts();
 	if (insertIndex == MANIFOLD_CACHE_SIZE)
@@ -287,7 +290,7 @@ void btPersistentManifold::refreshContactPoints(const btTransform& trA,const btT
 				//contact point processed callback
 				if (gContactProcessedCallback)
-					(*gContactProcessedCallback)(manifoldPoint,m_body0,m_body1);
+					(*gContactProcessedCallback)(manifoldPoint,(void*)m_body0,(void*)m_body1);
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h
index d877f099..d220f299 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h
@@ -20,6 +20,7 @@ subject to the following restrictions:
 #include "LinearMath/btVector3.h"
 #include "LinearMath/btTransform.h"
 #include "btManifoldPoint.h"
+class btCollisionObject;
 #include "LinearMath/btAlignedAllocator.h"
 struct btCollisionResult;
@@ -57,9 +58,8 @@ ATTRIBUTE_ALIGNED128( class) btPersistentManifold : public btTypedObject
 	btManifoldPoint m_pointCache[MANIFOLD_CACHE_SIZE];
 	/// this two body pointers can point to the physics rigidbody class.
-	/// void* will allow any rigidbody class
-	void* m_body0;
-	void* m_body1;
+	const btCollisionObject* m_body0;
+	const btCollisionObject* m_body1;
 	int	m_cachedPoints;
@@ -83,7 +83,7 @@ public:
-	btPersistentManifold(void* body0,void* body1,int , btScalar contactBreakingThreshold,btScalar contactProcessingThreshold)
+	btPersistentManifold(const btCollisionObject* body0,const btCollisionObject* body1,int , btScalar contactBreakingThreshold,btScalar contactProcessingThreshold)
@@ -91,13 +91,10 @@ public:
-	SIMD_FORCE_INLINE void* getBody0() { return m_body0;}
-	SIMD_FORCE_INLINE void* getBody1() { return m_body1;}
+	SIMD_FORCE_INLINE const btCollisionObject* getBody0() const { return m_body0;}
+	SIMD_FORCE_INLINE const btCollisionObject* getBody1() const { return m_body1;}
-	SIMD_FORCE_INLINE const void* getBody0() const { return m_body0;}
-	SIMD_FORCE_INLINE const void* getBody1() const { return m_body1;}
-	void	setBodies(void* body0,void* body1)
+	void	setBodies(const btCollisionObject* body0,const btCollisionObject* body1)
 		m_body0 = body0;
 		m_body1 = body1;
@@ -110,6 +107,12 @@ public:
 #endif //
 	SIMD_FORCE_INLINE int	getNumContacts() const { return m_cachedPoints;}
+	/// the setNumContacts API is usually not used, except when you gather/fill all contacts manually
+	void setNumContacts(int cachedPoints)
+	{
+		m_cachedPoints = cachedPoints;
+	}
 	SIMD_FORCE_INLINE const btManifoldPoint& getContactPoint(int index) const
@@ -131,9 +134,22 @@ public:
 		return m_contactProcessingThreshold;
+	void setContactBreakingThreshold(btScalar contactBreakingThreshold)
+	{
+		m_contactBreakingThreshold = contactBreakingThreshold;
+	}
+	void setContactProcessingThreshold(btScalar	contactProcessingThreshold)
+	{
+		m_contactProcessingThreshold = contactProcessingThreshold;
+	}
 	int getCacheEntry(const btManifoldPoint& newPoint) const;
-	int addManifoldPoint( const btManifoldPoint& newPoint);
+	int addManifoldPoint( const btManifoldPoint& newPoint, bool isPredictive=false);
 	void removeContactPoint (int index)
@@ -146,12 +162,8 @@ public:
 			m_pointCache[index] = m_pointCache[lastUsedIndex]; 
 			//get rid of duplicated userPersistentData pointer
 			m_pointCache[lastUsedIndex].m_userPersistentData = 0;
-			m_pointCache[lastUsedIndex].mConstraintRow[0].m_accumImpulse = 0.f;
-			m_pointCache[lastUsedIndex].mConstraintRow[1].m_accumImpulse = 0.f;
-			m_pointCache[lastUsedIndex].mConstraintRow[2].m_accumImpulse = 0.f;
 			m_pointCache[lastUsedIndex].m_appliedImpulse = 0.f;
-			m_pointCache[lastUsedIndex].m_lateralFrictionInitialized = false;
+			m_pointCache[lastUsedIndex].m_contactPointFlags = 0;
 			m_pointCache[lastUsedIndex].m_appliedImpulseLateral1 = 0.f;
 			m_pointCache[lastUsedIndex].m_appliedImpulseLateral2 = 0.f;
 			m_pointCache[lastUsedIndex].m_lifeTime = 0;
@@ -167,9 +179,9 @@ public:
 		int	lifeTime = m_pointCache[insertIndex].getLifeTime();
-		btScalar	appliedImpulse = m_pointCache[insertIndex].mConstraintRow[0].m_accumImpulse;
-		btScalar	appliedLateralImpulse1 = m_pointCache[insertIndex].mConstraintRow[1].m_accumImpulse;
-		btScalar	appliedLateralImpulse2 = m_pointCache[insertIndex].mConstraintRow[2].m_accumImpulse;
+		btScalar	appliedImpulse = m_pointCache[insertIndex].m_appliedImpulse;
+		btScalar	appliedLateralImpulse1 = m_pointCache[insertIndex].m_appliedImpulseLateral1;
+		btScalar	appliedLateralImpulse2 = m_pointCache[insertIndex].m_appliedImpulseLateral2;
 //		bool isLateralFrictionInitialized = m_pointCache[insertIndex].m_lateralFrictionInitialized;
@@ -178,16 +190,11 @@ public:
 		void* cache = m_pointCache[insertIndex].m_userPersistentData;
 		m_pointCache[insertIndex] = newPoint;
 		m_pointCache[insertIndex].m_userPersistentData = cache;
 		m_pointCache[insertIndex].m_appliedImpulse = appliedImpulse;
 		m_pointCache[insertIndex].m_appliedImpulseLateral1 = appliedLateralImpulse1;
 		m_pointCache[insertIndex].m_appliedImpulseLateral2 = appliedLateralImpulse2;
-		m_pointCache[insertIndex].mConstraintRow[0].m_accumImpulse =  appliedImpulse;
-		m_pointCache[insertIndex].mConstraintRow[1].m_accumImpulse = appliedLateralImpulse1;
-		m_pointCache[insertIndex].mConstraintRow[2].m_accumImpulse = appliedLateralImpulse2;
 		m_pointCache[insertIndex].m_lifeTime = lifeTime;
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.cpp b/src/bullet/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.cpp
index db190911..ea380bc5 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.cpp
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.cpp
@@ -77,21 +77,36 @@ void btPolyhedralContactClipping::clipFace(const btVertexArray& pVtxIn, btVertex
-static bool TestSepAxis(const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btVector3& sep_axis, btScalar& depth)
+static bool TestSepAxis(const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btVector3& sep_axis, btScalar& depth, btVector3& witnessPointA, btVector3& witnessPointB)
 	btScalar Min0,Max0;
 	btScalar Min1,Max1;
-	hullA.project(transA,sep_axis, Min0, Max0);
-	hullB.project(transB, sep_axis, Min1, Max1);
+	btVector3 witnesPtMinA,witnesPtMaxA;
+	btVector3 witnesPtMinB,witnesPtMaxB;
+	hullA.project(transA,sep_axis, Min0, Max0,witnesPtMinA,witnesPtMaxA);
+	hullB.project(transB, sep_axis, Min1, Max1,witnesPtMinB,witnesPtMaxB);
 	if(Max0<Min1 || Max1<Min0)
 		return false;
 	btScalar d0 = Max0 - Min1;
-	assert(d0>=0.0f);
+	btAssert(d0>=0.0f);
 	btScalar d1 = Max1 - Min0;
-	assert(d1>=0.0f);
-	depth = d0<d1 ? d0:d1;
+	btAssert(d1>=0.0f);
+	if (d0<d1)
+	{
+		depth = d0;
+		witnessPointA = witnesPtMaxA;
+		witnessPointB = witnesPtMinB;
+	} else
+	{
+		depth = d1;
+		witnessPointA = witnesPtMinA;
+		witnessPointB = witnesPtMaxB;
+	}
 	return true;
@@ -101,7 +116,7 @@ static int gActualSATPairTests=0;
 inline bool IsAlmostZero(const btVector3& v)
-	if(fabsf(v.x())>1e-6 || fabsf(v.y())>1e-6 || fabsf(v.z())>1e-6)	return false;
+	if(btFabs(v.x())>1e-6 || btFabs(v.y())>1e-6 || btFabs(v.z())>1e-6)	return false;
 	return true;
@@ -163,8 +178,66 @@ void InverseTransformPoint3x3(btVector3& out, const btVector3& in, const btTrans
+ SIMD_FORCE_INLINE void btSegmentsClosestPoints(
+	btVector3& ptsVector,
+	btVector3& offsetA,
+	btVector3& offsetB,
+	btScalar& tA, btScalar& tB,
+	const btVector3& translation,
+	const btVector3& dirA, btScalar hlenA,
+	const btVector3& dirB, btScalar hlenB )
+	// compute the parameters of the closest points on each line segment
+	btScalar dirA_dot_dirB = btDot(dirA,dirB);
+	btScalar dirA_dot_trans = btDot(dirA,translation);
+	btScalar dirB_dot_trans = btDot(dirB,translation);
+	btScalar denom = 1.0f - dirA_dot_dirB * dirA_dot_dirB;
+	if ( denom == 0.0f ) {
+		tA = 0.0f;
+	} else {
+		tA = ( dirA_dot_trans - dirB_dot_trans * dirA_dot_dirB ) / denom;
+		if ( tA < -hlenA )
+			tA = -hlenA;
+		else if ( tA > hlenA )
+			tA = hlenA;
+	}
+	tB = tA * dirA_dot_dirB - dirB_dot_trans;
-bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, btVector3& sep)
+	if ( tB < -hlenB ) {
+		tB = -hlenB;
+		tA = tB * dirA_dot_dirB + dirA_dot_trans;
+		if ( tA < -hlenA )
+			tA = -hlenA;
+		else if ( tA > hlenA )
+			tA = hlenA;
+	} else if ( tB > hlenB ) {
+		tB = hlenB;
+		tA = tB * dirA_dot_dirB + dirA_dot_trans;
+		if ( tA < -hlenA )
+			tA = -hlenA;
+		else if ( tA > hlenA )
+			tA = hlenA;
+	}
+	// compute the closest points relative to segment centers.
+	offsetA = dirA * tA;
+	offsetB = dirB * tB;
+	ptsVector = translation - offsetA + offsetB;
+bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, btVector3& sep, btDiscreteCollisionDetectorInterface::Result& resultOut)
@@ -182,9 +255,9 @@ bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron&
 	for(int i=0;i<numFacesA;i++)
 		const btVector3 Normal(hullA.m_faces[i].m_plane[0], hullA.m_faces[i].m_plane[1], hullA.m_faces[i].m_plane[2]);
-		const btVector3 faceANormalWS = transA.getBasis() * Normal;
+		btVector3 faceANormalWS = transA.getBasis() * Normal;
 		if (DeltaC2.dot(faceANormalWS)<0)
-			continue;
+			faceANormalWS*=-1.f;
@@ -195,7 +268,8 @@ bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron&
 		btScalar d;
-		if(!TestSepAxis( hullA, hullB, transA,transB, faceANormalWS, d))
+		btVector3 wA,wB;
+		if(!TestSepAxis( hullA, hullB, transA,transB, faceANormalWS, d,wA,wB))
 			return false;
@@ -210,9 +284,9 @@ bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron&
 	for(int i=0;i<numFacesB;i++)
 		const btVector3 Normal(hullB.m_faces[i].m_plane[0], hullB.m_faces[i].m_plane[1], hullB.m_faces[i].m_plane[2]);
-		const btVector3 WorldNormal = transB.getBasis() * Normal;
+		btVector3 WorldNormal = transB.getBasis() * Normal;
 		if (DeltaC2.dot(WorldNormal)<0)
-			continue;
+			WorldNormal *=-1.f;
@@ -223,7 +297,8 @@ bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron&
 		btScalar d;
-		if(!TestSepAxis(hullA, hullB,transA,transB, WorldNormal,d))
+		btVector3 wA,wB;
+		if(!TestSepAxis(hullA, hullB,transA,transB, WorldNormal,d,wA,wB))
 			return false;
@@ -234,6 +309,12 @@ bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron&
 	btVector3 edgeAstart,edgeAend,edgeBstart,edgeBend;
+	int edgeA=-1;
+	int edgeB=-1;
+	btVector3 worldEdgeA;
+	btVector3 worldEdgeB;
+	btVector3 witnessPointA(0,0,0),witnessPointB(0,0,0);
 	int curEdgeEdge = 0;
 	// Test edges
@@ -252,7 +333,7 @@ bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron&
 				Cross = Cross.normalize();
 				if (DeltaC2.dot(Cross)<0)
-					continue;
+					Cross *= -1.f;
@@ -263,29 +344,76 @@ bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron&
 				btScalar dist;
-				if(!TestSepAxis( hullA, hullB, transA,transB, Cross, dist))
+				btVector3 wA,wB;
+				if(!TestSepAxis( hullA, hullB, transA,transB, Cross, dist,wA,wB))
 					return false;
 					dmin = dist;
 					sep = Cross;
+					edgeA=e0;
+					edgeB=e1;
+					worldEdgeA = WorldEdge0;
+					worldEdgeB = WorldEdge1;
+					witnessPointA=wA;
+					witnessPointB=wB;
-	const btVector3 deltaC = transB.getOrigin() - transA.getOrigin();
-	if((deltaC.dot(sep))>0.0f)
+	if (edgeA>=0&&edgeB>=0)
+	{
+//		printf("edge-edge\n");
+		//add an edge-edge contact
+		btVector3 ptsVector;
+		btVector3 offsetA;
+		btVector3 offsetB;
+		btScalar tA;
+		btScalar tB;
+		btVector3 translation = witnessPointB-witnessPointA;
+		btVector3 dirA = worldEdgeA;
+		btVector3 dirB = worldEdgeB;
+		btScalar hlenB = 1e30f;
+		btScalar hlenA = 1e30f;
+		btSegmentsClosestPoints(ptsVector,offsetA,offsetB,tA,tB,
+			translation,
+			dirA, hlenA,
+			dirB,hlenB);
+		btScalar nlSqrt = ptsVector.length2();
+		if (nlSqrt>SIMD_EPSILON)
+		{
+			btScalar nl = btSqrt(nlSqrt);
+			ptsVector *= 1.f/nl;
+			if (ptsVector.dot(DeltaC2)<0.f)
+			{
+				ptsVector*=-1.f;
+			}
+			btVector3 ptOnB = witnessPointB + offsetB;
+			btScalar distance = nl;
+			resultOut.addContactPoint(ptsVector, ptOnB,-distance);
+		}
+	}
+	if((DeltaC2.dot(sep))<0.0f)
 		sep = -sep;
 	return true;
-void	btPolyhedralContactClipping::clipFaceAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA,  const btTransform& transA, btVertexArray& worldVertsB1, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut)
+void	btPolyhedralContactClipping::clipFaceAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA,  const btTransform& transA, btVertexArray& worldVertsB1,btVertexArray& worldVertsB2, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut)
-	btVertexArray worldVertsB2;
+	worldVertsB2.resize(0);
 	btVertexArray* pVtxIn = &worldVertsB1;
 	btVertexArray* pVtxOut = &worldVertsB2;
@@ -312,7 +440,6 @@ void	btPolyhedralContactClipping::clipFaceAgainstHull(const btVector3& separatin
 	const btFace& polyA = hullA.m_faces[closestFaceA];
 		// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
-	int numContacts = pVtxIn->size();
 	int numVerticesA = polyA.m_indices.size();
 	for(int e0=0;e0<numVerticesA;e0++)
@@ -361,8 +488,8 @@ void	btPolyhedralContactClipping::clipFaceAgainstHull(const btVector3& separatin
 		btScalar planeEqWS=localPlaneEq-planeNormalWS.dot(transA.getOrigin());
 		for (int i=0;i<pVtxIn->size();i++)
-			btScalar depth = planeNormalWS.dot(pVtxIn->at(i))+planeEqWS;
+			btVector3 vtx = pVtxIn->at(i);
+			btScalar depth = planeNormalWS.dot(vtx)+planeEqWS;
 			if (depth <=minDist)
 //				printf("clamped: depth=%f to minDist=%f\n",depth,minDist);
@@ -397,16 +524,19 @@ void	btPolyhedralContactClipping::clipFaceAgainstHull(const btVector3& separatin
-void	btPolyhedralContactClipping::clipHullAgainstHull(const btVector3& separatingNormal1, const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut)
+void	btPolyhedralContactClipping::clipHullAgainstHull(const btVector3& separatingNormal1, const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btScalar minDist, btScalar maxDist,btVertexArray& worldVertsB1,btVertexArray& worldVertsB2,btDiscreteCollisionDetectorInterface::Result& resultOut)
 	btVector3 separatingNormal = separatingNormal1.normalized();
-	const btVector3 c0 = transA * hullA.m_localCenter;
-	const btVector3 c1 = transB * hullB.m_localCenter;
-	const btVector3 DeltaC2 = c0 - c1;
+//	const btVector3 c0 = transA * hullA.m_localCenter;
+//	const btVector3 c1 = transB * hullB.m_localCenter;
+	//const btVector3 DeltaC2 = c0 - c1;
-	btScalar curMaxDist=maxDist;
 	int closestFaceB=-1;
 	btScalar dmax = -FLT_MAX;
@@ -422,7 +552,7 @@ void	btPolyhedralContactClipping::clipHullAgainstHull(const btVector3& separatin
-				btVertexArray worldVertsB1;
+	worldVertsB1.resize(0);
 					const btFace& polyB = hullB.m_faces[closestFaceB];
 					const int numVertices = polyB.m_indices.size();
@@ -435,6 +565,6 @@ void	btPolyhedralContactClipping::clipHullAgainstHull(const btVector3& separatin
 	if (closestFaceB>=0)
-		clipFaceAgainstHull(separatingNormal, hullA, transA,worldVertsB1, minDist, maxDist,resultOut);
+		clipFaceAgainstHull(separatingNormal, hullA, transA,worldVertsB1, worldVertsB2,minDist, maxDist,resultOut);
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h
index 99103df2..30e3db68 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h
@@ -32,10 +32,13 @@ typedef btAlignedObjectArray<btVector3> btVertexArray;
 // Clips a face to the back of a plane
 struct btPolyhedralContactClipping
-	static void clipHullAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btScalar minDist, btScalar maxDist, btDiscreteCollisionDetectorInterface::Result& resultOut);
-	static void	clipFaceAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA,  const btTransform& transA, btVertexArray& worldVertsB1, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut);
-	static bool findSeparatingAxis(	const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, btVector3& sep);
+	static void	clipHullAgainstHull(const btVector3& separatingNormal1, const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btScalar minDist, btScalar maxDist,btVertexArray& worldVertsB1,btVertexArray& worldVertsB2,btDiscreteCollisionDetectorInterface::Result& resultOut);
+	static void	clipFaceAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA,  const btTransform& transA, btVertexArray& worldVertsB1,btVertexArray& worldVertsB2, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut);
+	static bool findSeparatingAxis(	const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, btVector3& sep, btDiscreteCollisionDetectorInterface::Result& resultOut);
 	///the clipFace method is used internally
 	static void clipFace(const btVertexArray& pVtxIn, btVertexArray& ppVtxOut, const btVector3& planeNormalWS,btScalar planeEqWS);
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btRaycastCallback.cpp b/src/bullet/BulletCollision/NarrowPhaseCollision/btRaycastCallback.cpp
index fbe579ce..786efd18 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btRaycastCallback.cpp
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btRaycastCallback.cpp
@@ -57,12 +57,13 @@ void btTriangleRaycastCallback::processTriangle(btVector3* triangle,int partId,
 		return ; // same sign
-   //@BP Mod - Backface filtering
-   if (((m_flags & kF_FilterBackfaces) != 0) && (dist_a > btScalar(0.0)))
-   {
-      // Backface, skip check
-      return;
-   }
+	if (((m_flags & kF_FilterBackfaces) != 0) && (dist_a <= btScalar(0.0)))
+	{
+		// Backface, skip check
+		return;
+	}
 	const btScalar proj_length=dist_a-dist_b;
 	const btScalar distance = (dist_a)/(proj_length);
@@ -97,18 +98,18 @@ void btTriangleRaycastCallback::processTriangle(btVector3* triangle,int partId,
 					if ( (btScalar)(cp2.dot(triangleNormal)) >=edge_tolerance) 
-                  //@BP Mod
-                  // Triangle normal isn't normalized
+					  //@BP Mod
+					  // Triangle normal isn't normalized
-                  //@BP Mod - Allow for unflipped normal when raycasting against backfaces
-                  if (((m_flags & kF_KeepUnflippedNormal) != 0) || (dist_a <= btScalar(0.0)))
+					 //@BP Mod - Allow for unflipped normal when raycasting against backfaces
+						if (((m_flags & kF_KeepUnflippedNormal) == 0) && (dist_a <= btScalar(0.0)))
 							m_hitFraction = reportHit(-triangleNormal,distance,partId,triangleIndex);
-                     m_hitFraction = reportHit(triangleNormal,distance,partId,triangleIndex);
+							m_hitFraction = reportHit(triangleNormal,distance,partId,triangleIndex);
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btRaycastCallback.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btRaycastCallback.h
index f012889a..f2ed0cd3 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btRaycastCallback.h
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btRaycastCallback.h
@@ -32,10 +32,12 @@ public:
    //@BP Mod - allow backface filtering and unflipped normals
    enum EFlags
-      kF_None                 = 0,
+	  kF_None                 = 0,
       kF_FilterBackfaces      = 1 << 0,
       kF_KeepUnflippedNormal  = 1 << 1,   // Prevents returned face normal getting flipped when a ray hits a back-facing triangle
+		///SubSimplexConvexCastRaytest is the default, even if kF_None is set.
+	  kF_UseSubSimplexConvexCastRaytest = 1 << 2,   // Uses an approximate but faster ray versus convex intersection algorithm
+	  kF_UseGjkConvexCastRaytest = 1 << 3,
       kF_Terminator        = 0xFFFFFFFF
    unsigned int m_flags;
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.cpp b/src/bullet/BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.cpp
index 18eb662d..ec638f60 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.cpp
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.cpp
@@ -65,10 +65,10 @@ bool	btSubsimplexConvexCast::calcTimeOfImpact(
 	btVector3 n;
-	bool hasResult = false;
 	btVector3 c;
-	btScalar lastLambda = lambda;
 	btScalar dist2 = v.length2();
@@ -109,9 +109,9 @@ bool	btSubsimplexConvexCast::calcTimeOfImpact(
 				//check next line
 				 w = supVertexA-supVertexB;
-				lastLambda = lambda;
 				n = v;
-				hasResult = true;
 		///Just like regular GJK only add the vertex if it isn't already (close) to current vertex, it would lead to divisions by zero and NaN etc.
@@ -121,7 +121,7 @@ bool	btSubsimplexConvexCast::calcTimeOfImpact(
 		if (m_simplexSolver->closest(v))
 			dist2 = v.length2();
-			hasResult = true;
 			//todo: check this normal for validity
 			//printf("V=%f , %f, %f\n",v[0],v[1],v[2]);
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.cpp b/src/bullet/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.cpp
index a775198a..23b4f79c 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.cpp
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.cpp
@@ -294,7 +294,10 @@ bool btVoronoiSimplexSolver::inSimplex(const btVector3& w)
 		if (m_simplexVectorW[i] == w)
+		{
 			found = true;
+			break;
+		}
 	//check in case lastW is already removed
diff --git a/src/bullet/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h b/src/bullet/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h
index f1c7613e..2f389e27 100644
--- a/src/bullet/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h
+++ b/src/bullet/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h
@@ -92,13 +92,15 @@ struct	btSubSimplexClosestResult
 /// btVoronoiSimplexSolver is an implementation of the closest point distance algorithm from a 1-4 points simplex to the origin.
 /// Can be used with GJK, as an alternative to Johnson distance algorithm.
-class btVoronoiSimplexSolver
+ATTRIBUTE_ALIGNED16(class) btVoronoiSimplexSolver
-class btVoronoiSimplexSolver : public btSimplexSolverInterface
+ATTRIBUTE_ALIGNED16(class) btVoronoiSimplexSolver : public btSimplexSolverInterface
 	int	m_numVertices;
 	btVector3	m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS];
diff --git a/src/bullet/BulletDynamics/Character/btCharacterControllerInterface.h b/src/bullet/BulletDynamics/Character/btCharacterControllerInterface.h
index c81813c9..c3a3ac6c 100644
--- a/src/bullet/BulletDynamics/Character/btCharacterControllerInterface.h
+++ b/src/bullet/BulletDynamics/Character/btCharacterControllerInterface.h
@@ -31,15 +31,16 @@ public:
 	virtual void	setWalkDirection(const btVector3& walkDirection) = 0;
 	virtual void	setVelocityForTimeInterval(const btVector3& velocity, btScalar timeInterval) = 0;
-	virtual void	reset () = 0;
+	virtual void	reset ( btCollisionWorld* collisionWorld ) = 0;
 	virtual void	warp (const btVector3& origin) = 0;
 	virtual void	preStep ( btCollisionWorld* collisionWorld) = 0;
 	virtual void	playerStep (btCollisionWorld* collisionWorld, btScalar dt) = 0;
 	virtual bool	canJump () const = 0;
-	virtual void	jump () = 0;
+	virtual void	jump(const btVector3& dir = btVector3()) = 0;
 	virtual bool	onGround () const = 0;
+	virtual void	setUpInterpolate (bool value) = 0;
diff --git a/src/bullet/BulletDynamics/Character/btKinematicCharacterController.cpp b/src/bullet/BulletDynamics/Character/btKinematicCharacterController.cpp
index f733dc0c..68fa5206 100644
--- a/src/bullet/BulletDynamics/Character/btKinematicCharacterController.cpp
+++ b/src/bullet/BulletDynamics/Character/btKinematicCharacterController.cpp
@@ -14,6 +14,7 @@ subject to the following restrictions:
+#include <stdio.h>
 #include "LinearMath/btIDebugDraw.h"
 #include "BulletCollision/CollisionDispatch/btGhostObject.h"
 #include "BulletCollision/CollisionShapes/btMultiSphereShape.h"
@@ -28,9 +29,10 @@ subject to the following restrictions:
 static btVector3
 getNormalizedVector(const btVector3& v)
-	btVector3 n = v.normalized();
-	if (n.length() < SIMD_EPSILON) {
-		n.setValue(0, 0, 0);
+	btVector3 n(0, 0, 0);
+	if (v.length() > SIMD_EPSILON) {
+		n = v.normalized();
 	return n;
@@ -77,6 +79,9 @@ public:
 		if (convexResult.m_hitCollisionObject == m_me)
 			return btScalar(1.0);
+		if (!convexResult.m_hitCollisionObject->hasContactResponse())
+			return btScalar(1.0);
 		btVector3 hitNormalWorld;
 		if (normalInWorldSpace)
@@ -127,26 +132,37 @@ btVector3 btKinematicCharacterController::perpindicularComponent (const btVector
 	return direction - parallelComponent(direction, normal);
-btKinematicCharacterController::btKinematicCharacterController (btPairCachingGhostObject* ghostObject,btConvexShape* convexShape,btScalar stepHeight, int upAxis)
+btKinematicCharacterController::btKinematicCharacterController (btPairCachingGhostObject* ghostObject,btConvexShape* convexShape,btScalar stepHeight, const btVector3& up)
-	m_upAxis = upAxis;
-	m_addedMargin = 0.02;
-	m_walkDirection.setValue(0,0,0);
-	m_useGhostObjectSweepTest = true;
 	m_ghostObject = ghostObject;
-	m_stepHeight = stepHeight;
+	m_up.setValue(0.0f, 0.0f, 1.0f);
+	m_jumpAxis.setValue(0.0f, 0.0f, 1.0f);
+	setUp(up);
+	setStepHeight(stepHeight);
+	m_addedMargin = 0.02;
+	m_walkDirection.setValue(0.0,0.0,0.0);
+	m_AngVel.setValue(0.0, 0.0, 0.0);
+	m_useGhostObjectSweepTest = true;	
 	m_turnAngle = btScalar(0.0);
 	m_useWalkDirection = true;	// use walk direction by default, legacy behavior
 	m_velocityTimeInterval = 0.0;
 	m_verticalVelocity = 0.0;
 	m_verticalOffset = 0.0;
-	m_gravity = 9.8 * 3 ; // 3G acceleration.
+	m_gravity = 9.8 * 3.0 ; // 3G acceleration.
 	m_fallSpeed = 55.0; // Terminal velocity of a sky diver in m/s.
 	m_jumpSpeed = 10.0; // ?
+	m_SetjumpSpeed = m_jumpSpeed;
 	m_wasOnGround = false;
 	m_wasJumping = false;
+	m_interpolateUp = true;
+	m_currentStepOffset = 0.0;
+	m_maxPenetrationDepth = 0.2;
+	full_drop = false;
+	bounce_fix = false;
+	m_linearDamping = btScalar(0.0);
+	m_angularDamping = btScalar(0.0);
 btKinematicCharacterController::~btKinematicCharacterController ()
@@ -160,19 +176,42 @@ btPairCachingGhostObject* btKinematicCharacterController::getGhostObject()
 bool btKinematicCharacterController::recoverFromPenetration ( btCollisionWorld* collisionWorld)
+	// Here we must refresh the overlapping paircache as the penetrating movement itself or the
+	// previous recovery iteration might have used setWorldTransform and pushed us into an object
+	// that is not in the previous cache contents from the last timestep, as will happen if we
+	// are pushed into a new AABB overlap. Left unhandled, this means the next convex sweep gets stuck.
+	//
+	// Do this by calling the broadphase's setAabb with the moved AABB, this will update the broadphase
+	// paircache and the ghostobject's internal paircache at the same time.    /BW
+	btVector3 minAabb, maxAabb;
+	m_convexShape->getAabb(m_ghostObject->getWorldTransform(), minAabb,maxAabb);
+	collisionWorld->getBroadphase()->setAabb(m_ghostObject->getBroadphaseHandle(), 
+						 minAabb, 
+						 maxAabb, 
+						 collisionWorld->getDispatcher());
 	bool penetration = false;
 	collisionWorld->getDispatcher()->dispatchAllCollisionPairs(m_ghostObject->getOverlappingPairCache(), collisionWorld->getDispatchInfo(), collisionWorld->getDispatcher());
 	m_currentPosition = m_ghostObject->getWorldTransform().getOrigin();
-	btScalar maxPen = btScalar(0.0);
+//	btScalar maxPen = btScalar(0.0);
 	for (int i = 0; i < m_ghostObject->getOverlappingPairCache()->getNumOverlappingPairs(); i++)
 		btBroadphasePair* collisionPair = &m_ghostObject->getOverlappingPairCache()->getOverlappingPairArray()[i];
+		btCollisionObject* obj0 = static_cast<btCollisionObject*>(collisionPair->m_pProxy0->m_clientObject);
+        btCollisionObject* obj1 = static_cast<btCollisionObject*>(collisionPair->m_pProxy1->m_clientObject);
+		if ((obj0 && !obj0->hasContactResponse()) || (obj1 && !obj1->hasContactResponse()))
+			continue;
+		if (!needsCollision(obj0, obj1))
+			continue;
 		if (collisionPair->m_algorithm)
@@ -188,14 +227,15 @@ bool btKinematicCharacterController::recoverFromPenetration ( btCollisionWorld*
 				btScalar dist = pt.getDistance();
-				if (dist < 0.0)
+				if (dist < -m_maxPenetrationDepth)
-					if (dist < maxPen)
-					{
-						maxPen = dist;
-						m_touchingNormal = pt.m_normalWorldOnB * directionSign;//??
+					// TODO: this causes problems on slopes; not sure if it is needed
+					//if (dist < maxPen)
+					//{
+					//	maxPen = dist;
+					//	m_touchingNormal = pt.m_normalWorldOnB * directionSign;//??
-					}
+					//}
 					m_currentPosition += pt.m_normalWorldOnB * directionSign * dist * btScalar(0.2);
 					penetration = true;
 				} else {
@@ -215,18 +255,28 @@ bool btKinematicCharacterController::recoverFromPenetration ( btCollisionWorld*
 void btKinematicCharacterController::stepUp ( btCollisionWorld* world)
+	btScalar stepHeight = 0.0f;
+	if (m_verticalVelocity < 0.0)
+		stepHeight = m_stepHeight;
 	// phase 1: up
 	btTransform start, end;
-	m_targetPosition = m_currentPosition + getUpAxisDirections()[m_upAxis] * (m_stepHeight + (m_verticalOffset > 0.f?m_verticalOffset:0.f));
 	start.setIdentity ();
 	end.setIdentity ();
 	/* FIXME: Handle penetration properly */
-	start.setOrigin (m_currentPosition + getUpAxisDirections()[m_upAxis] * (m_convexShape->getMargin() + m_addedMargin));
+	start.setOrigin(m_currentPosition);
+	m_targetPosition = m_currentPosition + m_up * (stepHeight) + m_jumpAxis * ((m_verticalOffset > 0.f ? m_verticalOffset : 0.f));
+	m_currentPosition = m_targetPosition;
 	end.setOrigin (m_targetPosition);
-	btKinematicClosestNotMeConvexResultCallback callback (m_ghostObject, -getUpAxisDirections()[m_upAxis], btScalar(0.7071));
+	start.setRotation(m_currentOrientation);
+	end.setRotation(m_targetOrientation);
+	btKinematicClosestNotMeConvexResultCallback callback(m_ghostObject, -m_up, m_maxSlopeCosine);
 	callback.m_collisionFilterGroup = getGhostObject()->getBroadphaseHandle()->m_collisionFilterGroup;
 	callback.m_collisionFilterMask = getGhostObject()->getBroadphaseHandle()->m_collisionFilterMask;
@@ -236,26 +286,61 @@ void btKinematicCharacterController::stepUp ( btCollisionWorld* world)
-		world->convexSweepTest (m_convexShape, start, end, callback);
+		world->convexSweepTest(m_convexShape, start, end, callback, world->getDispatchInfo().m_allowedCcdPenetration);
-	if (callback.hasHit())
+	if (callback.hasHit() && m_ghostObject->hasContactResponse() && needsCollision(m_ghostObject, callback.m_hitCollisionObject))
 		// Only modify the position if the hit was a slope and not a wall or ceiling.
-		if(callback.m_hitNormalWorld.dot(getUpAxisDirections()[m_upAxis]) > 0.0)
+		if (callback.m_hitNormalWorld.dot(m_up) > 0.0)
 			// we moved up only a fraction of the step height
-			m_currentStepOffset = m_stepHeight * callback.m_closestHitFraction;
-			m_currentPosition.setInterpolate3 (m_currentPosition, m_targetPosition, callback.m_closestHitFraction);
+			m_currentStepOffset = stepHeight * callback.m_closestHitFraction;
+			if (m_interpolateUp == true)
+				m_currentPosition.setInterpolate3 (m_currentPosition, m_targetPosition, callback.m_closestHitFraction);
+			else
+				m_currentPosition = m_targetPosition;
+		}
+		btTransform& xform = m_ghostObject->getWorldTransform();
+		xform.setOrigin(m_currentPosition);
+		m_ghostObject->setWorldTransform(xform);
+		// fix penetration if we hit a ceiling for example
+		int numPenetrationLoops = 0;
+		m_touchingContact = false;
+		while (recoverFromPenetration(world))
+		{
+			numPenetrationLoops++;
+			m_touchingContact = true;
+			if (numPenetrationLoops > 4)
+			{
+				//printf("character could not recover from penetration = %d\n", numPenetrationLoops);
+				break;
+			}
+		}
+		m_targetPosition = m_ghostObject->getWorldTransform().getOrigin();
+		m_currentPosition = m_targetPosition;
+		if (m_verticalOffset > 0)
+		{
+			m_verticalOffset = 0.0;
+			m_verticalVelocity = 0.0;
+			m_currentStepOffset = m_stepHeight;
-		m_verticalVelocity = 0.0;
-		m_verticalOffset = 0.0;
 	} else {
-		m_currentStepOffset = m_stepHeight;
+		m_currentStepOffset = stepHeight;
 		m_currentPosition = m_targetPosition;
+bool btKinematicCharacterController::needsCollision(const btCollisionObject* body0, const btCollisionObject* body1)
+	bool collides = (body0->getBroadphaseHandle()->m_collisionFilterGroup & body1->getBroadphaseHandle()->m_collisionFilterMask) != 0;
+	collides = collides && (body1->getBroadphaseHandle()->m_collisionFilterGroup & body0->getBroadphaseHandle()->m_collisionFilterMask);
+	return collides;
 void btKinematicCharacterController::updateTargetPositionBasedOnCollision (const btVector3& hitNormal, btScalar tangentMag, btScalar normalMag)
 	btVector3 movementDirection = m_targetPosition - m_currentPosition;
@@ -298,6 +383,7 @@ void btKinematicCharacterController::stepForwardAndStrafe ( btCollisionWorld* co
 	// 	m_normalizedDirection[0],m_normalizedDirection[1],m_normalizedDirection[2]);
 	// phase 2: forward and strafe
 	btTransform start, end;
 	m_targetPosition = m_currentPosition + walkMove;
 	start.setIdentity ();
@@ -307,14 +393,6 @@ void btKinematicCharacterController::stepForwardAndStrafe ( btCollisionWorld* co
 	btScalar distance2 = (m_currentPosition-m_targetPosition).length2();
 //	printf("distance2=%f\n",distance2);
-	if (m_touchingContact)
-	{
-		if (m_normalizedDirection.dot(m_touchingNormal) > btScalar(0.0))
-		{
-			updateTargetPositionBasedOnCollision (m_touchingNormal);
-		}
-	}
 	int maxIter = 10;
 	while (fraction > btScalar(0.01) && maxIter-- > 0)
@@ -323,6 +401,9 @@ void btKinematicCharacterController::stepForwardAndStrafe ( btCollisionWorld* co
 		end.setOrigin (m_targetPosition);
 		btVector3 sweepDirNegative(m_currentPosition - m_targetPosition);
+		start.setRotation(m_currentOrientation);
+		end.setRotation(m_targetOrientation);
 		btKinematicClosestNotMeConvexResultCallback callback (m_ghostObject, sweepDirNegative, btScalar(0.0));
 		callback.m_collisionFilterGroup = getGhostObject()->getBroadphaseHandle()->m_collisionFilterGroup;
 		callback.m_collisionFilterMask = getGhostObject()->getBroadphaseHandle()->m_collisionFilterMask;
@@ -331,25 +412,27 @@ void btKinematicCharacterController::stepForwardAndStrafe ( btCollisionWorld* co
 		btScalar margin = m_convexShape->getMargin();
 		m_convexShape->setMargin(margin + m_addedMargin);
-		if (m_useGhostObjectSweepTest)
+		if (!(start == end))
-			m_ghostObject->convexSweepTest (m_convexShape, start, end, callback, collisionWorld->getDispatchInfo().m_allowedCcdPenetration);
-		} else
-		{
-			collisionWorld->convexSweepTest (m_convexShape, start, end, callback, collisionWorld->getDispatchInfo().m_allowedCcdPenetration);
+			if (m_useGhostObjectSweepTest)
+			{
+				m_ghostObject->convexSweepTest(m_convexShape, start, end, callback, collisionWorld->getDispatchInfo().m_allowedCcdPenetration);
+			}
+			else
+			{
+				collisionWorld->convexSweepTest(m_convexShape, start, end, callback, collisionWorld->getDispatchInfo().m_allowedCcdPenetration);
+			}
 		fraction -= callback.m_closestHitFraction;
-		if (callback.hasHit())
+		if (callback.hasHit() && m_ghostObject->hasContactResponse() && needsCollision(m_ghostObject, callback.m_hitCollisionObject))
 			// we moved only a fraction
-			btScalar hitDistance;
-			hitDistance = (callback.m_hitPointWorld - m_currentPosition).length();
+			//btScalar hitDistance;
+			//hitDistance = (callback.m_hitPointWorld - m_currentPosition).length();
 //			m_currentPosition.setInterpolate3 (m_currentPosition, m_targetPosition, callback.m_closestHitFraction);
@@ -370,66 +453,153 @@ void btKinematicCharacterController::stepForwardAndStrafe ( btCollisionWorld* co
-		} else {
-			// we moved whole way
-			m_currentPosition = m_targetPosition;
-	//	if (callback.m_closestHitFraction == 0.f)
-	//		break;
+        else
+        {
+            m_currentPosition = m_targetPosition;
+		}
 void btKinematicCharacterController::stepDown ( btCollisionWorld* collisionWorld, btScalar dt)
-	btTransform start, end;
+	btTransform start, end, end_double;
+	bool runonce = false;
 	// phase 3: down
 	/*btScalar additionalDownStep = (m_wasOnGround && !onGround()) ? m_stepHeight : 0.0;
-	btVector3 step_drop = getUpAxisDirections()[m_upAxis] * (m_currentStepOffset + additionalDownStep);
+	btVector3 step_drop = m_up * (m_currentStepOffset + additionalDownStep);
 	btScalar downVelocity = (additionalDownStep == 0.0 && m_verticalVelocity<0.0?-m_verticalVelocity:0.0) * dt;
-	btVector3 gravity_drop = getUpAxisDirections()[m_upAxis] * downVelocity; 
+	btVector3 gravity_drop = m_up * downVelocity; 
 	m_targetPosition -= (step_drop + gravity_drop);*/
+	btVector3 orig_position = m_targetPosition;
 	btScalar downVelocity = (m_verticalVelocity<0.f?-m_verticalVelocity:0.f) * dt;
-	if(downVelocity > 0.0 && downVelocity < m_stepHeight
+	if (m_verticalVelocity > 0.0)
+		return;
+	if(downVelocity > 0.0 && downVelocity > m_fallSpeed
 		&& (m_wasOnGround || !m_wasJumping))
-	{
-		downVelocity = m_stepHeight;
-	}
+		downVelocity = m_fallSpeed;
-	btVector3 step_drop = getUpAxisDirections()[m_upAxis] * (m_currentStepOffset + downVelocity);
+	btVector3 step_drop = m_up * (m_currentStepOffset + downVelocity);
 	m_targetPosition -= step_drop;
-	start.setIdentity ();
-	end.setIdentity ();
+	btKinematicClosestNotMeConvexResultCallback callback(m_ghostObject, m_up, m_maxSlopeCosine);
+        callback.m_collisionFilterGroup = getGhostObject()->getBroadphaseHandle()->m_collisionFilterGroup;
+        callback.m_collisionFilterMask = getGhostObject()->getBroadphaseHandle()->m_collisionFilterMask;
-	start.setOrigin (m_currentPosition);
-	end.setOrigin (m_targetPosition);
+	btKinematicClosestNotMeConvexResultCallback callback2(m_ghostObject, m_up, m_maxSlopeCosine);
+        callback2.m_collisionFilterGroup = getGhostObject()->getBroadphaseHandle()->m_collisionFilterGroup;
+        callback2.m_collisionFilterMask = getGhostObject()->getBroadphaseHandle()->m_collisionFilterMask;
-	btKinematicClosestNotMeConvexResultCallback callback (m_ghostObject, getUpAxisDirections()[m_upAxis], m_maxSlopeCosine);
-	callback.m_collisionFilterGroup = getGhostObject()->getBroadphaseHandle()->m_collisionFilterGroup;
-	callback.m_collisionFilterMask = getGhostObject()->getBroadphaseHandle()->m_collisionFilterMask;
-	if (m_useGhostObjectSweepTest)
+	while (1)
-		m_ghostObject->convexSweepTest (m_convexShape, start, end, callback, collisionWorld->getDispatchInfo().m_allowedCcdPenetration);
-	} else
-	{
-		collisionWorld->convexSweepTest (m_convexShape, start, end, callback, collisionWorld->getDispatchInfo().m_allowedCcdPenetration);
+		start.setIdentity ();
+		end.setIdentity ();
+		end_double.setIdentity ();
+		start.setOrigin (m_currentPosition);
+		end.setOrigin (m_targetPosition);
+		start.setRotation(m_currentOrientation);
+		end.setRotation(m_targetOrientation);
+		//set double test for 2x the step drop, to check for a large drop vs small drop
+		end_double.setOrigin (m_targetPosition - step_drop);
+		if (m_useGhostObjectSweepTest)
+		{
+			m_ghostObject->convexSweepTest (m_convexShape, start, end, callback, collisionWorld->getDispatchInfo().m_allowedCcdPenetration);
+			if (!callback.hasHit() && m_ghostObject->hasContactResponse())
+			{
+				//test a double fall height, to see if the character should interpolate it's fall (full) or not (partial)
+				m_ghostObject->convexSweepTest (m_convexShape, start, end_double, callback2, collisionWorld->getDispatchInfo().m_allowedCcdPenetration);
+			}
+		} else
+		{
+			collisionWorld->convexSweepTest (m_convexShape, start, end, callback, collisionWorld->getDispatchInfo().m_allowedCcdPenetration);
+			if (!callback.hasHit() && m_ghostObject->hasContactResponse())
+			{
+				//test a double fall height, to see if the character should interpolate it's fall (large) or not (small)
+				collisionWorld->convexSweepTest (m_convexShape, start, end_double, callback2, collisionWorld->getDispatchInfo().m_allowedCcdPenetration);
+			}
+		}
+		btScalar downVelocity2 = (m_verticalVelocity<0.f?-m_verticalVelocity:0.f) * dt;
+		bool has_hit;
+		if (bounce_fix == true)
+			has_hit = (callback.hasHit() || callback2.hasHit()) && m_ghostObject->hasContactResponse() && needsCollision(m_ghostObject, callback.m_hitCollisionObject);
+		else
+			has_hit = callback2.hasHit() && m_ghostObject->hasContactResponse() && needsCollision(m_ghostObject, callback2.m_hitCollisionObject);
+		btScalar stepHeight = 0.0f;
+		if (m_verticalVelocity < 0.0)
+			stepHeight = m_stepHeight;
+		if (downVelocity2 > 0.0 && downVelocity2 < stepHeight && has_hit == true && runonce == false
+					&& (m_wasOnGround || !m_wasJumping))
+		{
+			//redo the velocity calculation when falling a small amount, for fast stairs motion
+			//for larger falls, use the smoother/slower interpolated movement by not touching the target position
+			m_targetPosition = orig_position;
+			downVelocity = stepHeight;
+			step_drop = m_up * (m_currentStepOffset + downVelocity);
+			m_targetPosition -= step_drop;
+			runonce = true;
+			continue; //re-run previous tests
+		}
+		break;
-	if (callback.hasHit())
+	if (m_ghostObject->hasContactResponse() && (callback.hasHit() && needsCollision(m_ghostObject, callback.m_hitCollisionObject)) || runonce == true)
 		// we dropped a fraction of the height -> hit floor
-		m_currentPosition.setInterpolate3 (m_currentPosition, m_targetPosition, callback.m_closestHitFraction);
+		btScalar fraction = (m_currentPosition.getY() - callback.m_hitPointWorld.getY()) / 2;
+		//printf("hitpoint: %g - pos %g\n", callback.m_hitPointWorld.getY(), m_currentPosition.getY());
+		if (bounce_fix == true)
+		{
+			if (full_drop == true)
+				m_currentPosition.setInterpolate3 (m_currentPosition, m_targetPosition, callback.m_closestHitFraction);
+            else
+				//due to errors in the closestHitFraction variable when used with large polygons, calculate the hit fraction manually
+				m_currentPosition.setInterpolate3 (m_currentPosition, m_targetPosition, fraction);
+		}
+		else
+			m_currentPosition.setInterpolate3 (m_currentPosition, m_targetPosition, callback.m_closestHitFraction);
+		full_drop = false;
 		m_verticalVelocity = 0.0;
 		m_verticalOffset = 0.0;
 		m_wasJumping = false;
 	} else {
 		// we dropped the full height
+		full_drop = true;
+		if (bounce_fix == true)
+		{
+			downVelocity = (m_verticalVelocity<0.f?-m_verticalVelocity:0.f) * dt;
+			if (downVelocity > m_fallSpeed && (m_wasOnGround || !m_wasJumping))
+			{
+				m_targetPosition += step_drop; //undo previous target change
+				downVelocity = m_fallSpeed;
+				step_drop = m_up * (m_currentStepOffset + downVelocity);
+				m_targetPosition -= step_drop;
+			}
+		}
+		//printf("full drop - %g, %g\n", m_currentPosition.getY(), m_targetPosition.getY());
 		m_currentPosition = m_targetPosition;
@@ -462,13 +632,66 @@ btScalar timeInterval
 	m_useWalkDirection = false;
 	m_walkDirection = velocity;
 	m_normalizedDirection = getNormalizedVector(m_walkDirection);
-	m_velocityTimeInterval = timeInterval;
+	m_velocityTimeInterval += timeInterval;
+void btKinematicCharacterController::setAngularVelocity(const btVector3& velocity)
+	m_AngVel = velocity;
+const btVector3& btKinematicCharacterController::getAngularVelocity() const
+	return m_AngVel;
-void btKinematicCharacterController::reset ()
+void btKinematicCharacterController::setLinearVelocity(const btVector3& velocity)
+	m_walkDirection = velocity;
+	// HACK: if we are moving in the direction of the up, treat it as a jump :(
+	if (m_walkDirection.length2() > 0)
+	{
+		btVector3 w = velocity.normalized();
+		btScalar c = w.dot(m_up);
+		if (c != 0)
+		{
+			//there is a component in walkdirection for vertical velocity
+			btVector3 upComponent = m_up * (sinf(SIMD_HALF_PI - acosf(c)) * m_walkDirection.length());
+			m_walkDirection -= upComponent;
+			m_verticalVelocity = (c < 0.0f ? -1 : 1) * upComponent.length();
+			if (c > 0.0f)
+			{
+				m_wasJumping = true;
+				m_jumpPosition = m_ghostObject->getWorldTransform().getOrigin();
+			}
+		}
+	}
+	else
+		m_verticalVelocity = 0.0f;
+btVector3 btKinematicCharacterController::getLinearVelocity() const
+	return m_walkDirection + (m_verticalVelocity * m_up);
+void btKinematicCharacterController::reset ( btCollisionWorld* collisionWorld )
+    m_verticalVelocity = 0.0;
+    m_verticalOffset = 0.0;
+    m_wasOnGround = false;
+    m_wasJumping = false;
+    m_walkDirection.setValue(0,0,0);
+    m_velocityTimeInterval = 0.0;
+    //clear pair cache
+    btHashedOverlappingPairCache *cache = m_ghostObject->getOverlappingPairCache();
+    while (cache->getOverlappingPairArray().size() > 0)
+    {
+            cache->removeOverlappingPair(cache->getOverlappingPairArray()[0].m_pProxy0, cache->getOverlappingPairArray()[0].m_pProxy1, collisionWorld->getDispatcher());
+    }
 void btKinematicCharacterController::warp (const btVector3& origin)
@@ -482,62 +705,99 @@ void btKinematicCharacterController::warp (const btVector3& origin)
 void btKinematicCharacterController::preStep (  btCollisionWorld* collisionWorld)
-	int numPenetrationLoops = 0;
-	m_touchingContact = false;
-	while (recoverFromPenetration (collisionWorld))
-	{
-		numPenetrationLoops++;
-		m_touchingContact = true;
-		if (numPenetrationLoops > 4)
-		{
-			//printf("character could not recover from penetration = %d\n", numPenetrationLoops);
-			break;
-		}
-	}
 	m_currentPosition = m_ghostObject->getWorldTransform().getOrigin();
 	m_targetPosition = m_currentPosition;
-//	printf("m_targetPosition=%f,%f,%f\n",m_targetPosition[0],m_targetPosition[1],m_targetPosition[2]);
+	m_currentOrientation = m_ghostObject->getWorldTransform().getRotation();
+	m_targetOrientation = m_currentOrientation;
+//	printf("m_targetPosition=%f,%f,%f\n",m_targetPosition[0],m_targetPosition[1],m_targetPosition[2]);
-#include <stdio.h>
 void btKinematicCharacterController::playerStep (  btCollisionWorld* collisionWorld, btScalar dt)
 //	printf("playerStep(): ");
 //	printf("  dt = %f", dt);
+	if (m_AngVel.length2() > 0.0f)
+	{
+		m_AngVel *= btPow(btScalar(1) - m_angularDamping, dt);
+	}
+	// integrate for angular velocity
+	if (m_AngVel.length2() > 0.0f)
+	{
+		btTransform xform;
+		xform = m_ghostObject->getWorldTransform();
+		btQuaternion rot(m_AngVel.normalized(), m_AngVel.length() * dt);
+		btQuaternion orn = rot * xform.getRotation();
+		xform.setRotation(orn);
+		m_ghostObject->setWorldTransform(xform);
+		m_currentPosition = m_ghostObject->getWorldTransform().getOrigin();
+		m_targetPosition = m_currentPosition;
+		m_currentOrientation = m_ghostObject->getWorldTransform().getRotation();
+		m_targetOrientation = m_currentOrientation;
+	}
 	// quick check...
-	if (!m_useWalkDirection && m_velocityTimeInterval <= 0.0) {
+	if (!m_useWalkDirection && (m_velocityTimeInterval <= 0.0)) {
 //		printf("\n");
 		return;		// no motion
 	m_wasOnGround = onGround();
+	//btVector3 lvel = m_walkDirection;
+	btScalar c = 0.0f;
+	if (m_walkDirection.length2() > 0)
+	{
+		// apply damping
+		m_walkDirection *= btPow(btScalar(1) - m_linearDamping, dt);
+	}
+	m_verticalVelocity *= btPow(btScalar(1) - m_linearDamping, dt);
 	// Update fall velocity.
 	m_verticalVelocity -= m_gravity * dt;
-	if(m_verticalVelocity > 0.0 && m_verticalVelocity > m_jumpSpeed)
+	if (m_verticalVelocity > 0.0 && m_verticalVelocity > m_jumpSpeed)
 		m_verticalVelocity = m_jumpSpeed;
-	if(m_verticalVelocity < 0.0 && btFabs(m_verticalVelocity) > btFabs(m_fallSpeed))
+	if (m_verticalVelocity < 0.0 && btFabs(m_verticalVelocity) > btFabs(m_fallSpeed))
 		m_verticalVelocity = -btFabs(m_fallSpeed);
 	m_verticalOffset = m_verticalVelocity * dt;
 	btTransform xform;
-	xform = m_ghostObject->getWorldTransform ();
+	xform = m_ghostObject->getWorldTransform();
 //	printf("walkDirection(%f,%f,%f)\n",walkDirection[0],walkDirection[1],walkDirection[2]);
 //	printf("walkSpeed=%f\n",walkSpeed);
-	stepUp (collisionWorld);
+	stepUp(collisionWorld);
+	//todo: Experimenting with behavior of controller when it hits a ceiling..
+	//bool hitUp = stepUp (collisionWorld);	
+	//if (hitUp)
+	//{
+	//	m_verticalVelocity -= m_gravity * dt;
+	//	if (m_verticalVelocity > 0.0 && m_verticalVelocity > m_jumpSpeed)
+	//	{
+	//		m_verticalVelocity = m_jumpSpeed;
+	//	}
+	//	if (m_verticalVelocity < 0.0 && btFabs(m_verticalVelocity) > btFabs(m_fallSpeed))
+	//	{
+	//		m_verticalVelocity = -btFabs(m_fallSpeed);
+	//	}
+	//	m_verticalOffset = m_verticalVelocity * dt;
+	//	xform = m_ghostObject->getWorldTransform();
+	//}
 	if (m_useWalkDirection) {
 		stepForwardAndStrafe (collisionWorld, m_walkDirection);
 	} else {
@@ -557,10 +817,38 @@ void btKinematicCharacterController::playerStep (  btCollisionWorld* collisionWo
 	stepDown (collisionWorld, dt);
+	//todo: Experimenting with max jump height
+	//if (m_wasJumping)
+	//{
+	//	btScalar ds = m_currentPosition[m_upAxis] - m_jumpPosition[m_upAxis];
+	//	if (ds > m_maxJumpHeight)
+	//	{
+	//		// substract the overshoot
+	//		m_currentPosition[m_upAxis] -= ds - m_maxJumpHeight;
+	//		// max height was reached, so potential energy is at max 
+	//		// and kinematic energy is 0, thus velocity is 0.
+	//		if (m_verticalVelocity > 0.0)
+	//			m_verticalVelocity = 0.0;
+	//	}
+	//}
 	// printf("\n");
 	xform.setOrigin (m_currentPosition);
 	m_ghostObject->setWorldTransform (xform);
+	int numPenetrationLoops = 0;
+	m_touchingContact = false;
+	while (recoverFromPenetration(collisionWorld))
+	{
+		numPenetrationLoops++;
+		m_touchingContact = true;
+		if (numPenetrationLoops > 4)
+		{
+			//printf("character could not recover from penetration = %d\n", numPenetrationLoops);
+			break;
+		}
+	}
 void btKinematicCharacterController::setFallSpeed (btScalar fallSpeed)
@@ -571,6 +859,7 @@ void btKinematicCharacterController::setFallSpeed (btScalar fallSpeed)
 void btKinematicCharacterController::setJumpSpeed (btScalar jumpSpeed)
 	m_jumpSpeed = jumpSpeed;
+	m_SetjumpSpeed = m_jumpSpeed;
 void btKinematicCharacterController::setMaxJumpHeight (btScalar maxJumpHeight)
@@ -583,14 +872,16 @@ bool btKinematicCharacterController::canJump () const
 	return onGround();
-void btKinematicCharacterController::jump ()
+void btKinematicCharacterController::jump(const btVector3& v)
-	if (!canJump())
-		return;
+	m_jumpSpeed = v.length2() == 0 ? m_SetjumpSpeed : v.length();
 	m_verticalVelocity = m_jumpSpeed;
 	m_wasJumping = true;
+	m_jumpAxis = v.length2() == 0 ? m_up : v.normalized();
+	m_jumpPosition = m_ghostObject->getWorldTransform().getOrigin();
 #if 0
 	currently no jumping.
 	btTransform xform;
@@ -602,14 +893,16 @@ void btKinematicCharacterController::jump ()
-void btKinematicCharacterController::setGravity(btScalar gravity)
+void btKinematicCharacterController::setGravity(const btVector3& gravity)
-	m_gravity = gravity;
+	if (gravity.length2() > 0) setUpVector(-gravity);
+	m_gravity = gravity.length();
-btScalar btKinematicCharacterController::getGravity() const
+btVector3 btKinematicCharacterController::getGravity() const
-	return m_gravity;
+	return -m_gravity * m_up;
 void btKinematicCharacterController::setMaxSlope(btScalar slopeRadians)
@@ -623,11 +916,25 @@ btScalar btKinematicCharacterController::getMaxSlope() const
 	return m_maxSlopeRadians;
+void btKinematicCharacterController::setMaxPenetrationDepth(btScalar d)
+	m_maxPenetrationDepth = d;
+btScalar btKinematicCharacterController::getMaxPenetrationDepth() const
+	return m_maxPenetrationDepth;
 bool btKinematicCharacterController::onGround () const
-	return m_verticalVelocity == 0.0 && m_verticalOffset == 0.0;
+	return (fabs(m_verticalVelocity) < SIMD_EPSILON) && (fabs(m_verticalOffset) < SIMD_EPSILON);
+void btKinematicCharacterController::setStepHeight(btScalar h) 
+	m_stepHeight = h;
 btVector3* btKinematicCharacterController::getUpAxisDirections()
@@ -639,3 +946,54 @@ btVector3* btKinematicCharacterController::getUpAxisDirections()
 void btKinematicCharacterController::debugDraw(btIDebugDraw* debugDrawer)
+void btKinematicCharacterController::setUpInterpolate(bool value)
+	m_interpolateUp = value;
+void btKinematicCharacterController::setUp(const btVector3& up)
+	if (up.length2() > 0 && m_gravity > 0.0f)
+	{
+		setGravity(-m_gravity * up.normalized());
+		return;
+	}
+	setUpVector(up);
+void btKinematicCharacterController::setUpVector(const btVector3& up)
+	if (m_up == up)
+		return;
+	btVector3 u = m_up;
+	if (up.length2() > 0)
+		m_up = up.normalized();
+	else
+		m_up = btVector3(0.0, 0.0, 0.0);
+	if (!m_ghostObject) return;
+	btQuaternion rot = getRotation(m_up, u);
+	//set orientation with new up
+	btTransform xform;
+	xform = m_ghostObject->getWorldTransform();
+	btQuaternion orn = rot.inverse() * xform.getRotation();
+	xform.setRotation(orn);
+	m_ghostObject->setWorldTransform(xform);
+btQuaternion btKinematicCharacterController::getRotation(btVector3& v0, btVector3& v1) const
+	if (v0.length2() == 0.0f || v1.length2() == 0.0f)
+	{
+		btQuaternion q;
+		return q;
+	}
+	return shortestArcQuatNormalize2(v0, v1);
diff --git a/src/bullet/BulletDynamics/Character/btKinematicCharacterController.h b/src/bullet/BulletDynamics/Character/btKinematicCharacterController.h
index ef01f8a3..3d677e64 100644
--- a/src/bullet/BulletDynamics/Character/btKinematicCharacterController.h
+++ b/src/bullet/BulletDynamics/Character/btKinematicCharacterController.h
@@ -34,7 +34,7 @@ class btPairCachingGhostObject;
 ///btKinematicCharacterController is an object that supports a sliding motion in a world.
 ///It uses a ghost object and convex sweep test to test for upcoming collisions. This is combined with discrete collision detection to recover from penetrations.
 ///Interaction between btKinematicCharacterController and dynamic rigid bodies needs to be explicity implemented by the user.
-class btKinematicCharacterController : public btCharacterControllerInterface
+ATTRIBUTE_ALIGNED16(class) btKinematicCharacterController : public btCharacterControllerInterface
@@ -43,10 +43,12 @@ protected:
 	btPairCachingGhostObject* m_ghostObject;
 	btConvexShape*	m_convexShape;//is also in m_ghostObject, but it needs to be convex, so we store it here to avoid upcast
+	btScalar m_maxPenetrationDepth;
 	btScalar m_verticalVelocity;
 	btScalar m_verticalOffset;
 	btScalar m_fallSpeed;
 	btScalar m_jumpSpeed;
+	btScalar m_SetjumpSpeed;
 	btScalar m_maxJumpHeight;
 	btScalar m_maxSlopeRadians; // Slope angle that is set (used for returning the exact value)
 	btScalar m_maxSlopeCosine;  // Cosine equivalent of m_maxSlopeRadians (calculated once when set, for optimization)
@@ -61,26 +63,39 @@ protected:
 	///this is the desired walk direction, set by the user
 	btVector3	m_walkDirection;
 	btVector3	m_normalizedDirection;
+	btVector3	m_AngVel;
+	btVector3	m_jumpPosition;
 	//some internal variables
 	btVector3 m_currentPosition;
 	btScalar  m_currentStepOffset;
 	btVector3 m_targetPosition;
+	btQuaternion m_currentOrientation;
+	btQuaternion m_targetOrientation;
 	///keep track of the contact manifolds
 	btManifoldArray	m_manifoldArray;
 	bool m_touchingContact;
 	btVector3 m_touchingNormal;
+	btScalar m_linearDamping;
+	btScalar m_angularDamping;
 	bool  m_wasOnGround;
 	bool  m_wasJumping;
 	bool	m_useGhostObjectSweepTest;
 	bool	m_useWalkDirection;
 	btScalar	m_velocityTimeInterval;
-	int m_upAxis;
+	btVector3 m_up;
+	btVector3 m_jumpAxis;
 	static btVector3* getUpAxisDirections();
+	bool  m_interpolateUp;
+	bool  full_drop;
+	bool  bounce_fix;
 	btVector3 computeReflectionDirection (const btVector3& direction, const btVector3& normal);
 	btVector3 parallelComponent (const btVector3& direction, const btVector3& normal);
@@ -91,8 +106,18 @@ protected:
 	void updateTargetPositionBasedOnCollision (const btVector3& hit_normal, btScalar tangentMag = btScalar(0.0), btScalar normalMag = btScalar(1.0));
 	void stepForwardAndStrafe (btCollisionWorld* collisionWorld, const btVector3& walkMove);
 	void stepDown (btCollisionWorld* collisionWorld, btScalar dt);
+	virtual bool needsCollision(const btCollisionObject* body0, const btCollisionObject* body1);
+	void setUpVector(const btVector3& up);
+	btQuaternion getRotation(btVector3& v0, btVector3& v1) const;
-	btKinematicCharacterController (btPairCachingGhostObject* ghostObject,btConvexShape* convexShape,btScalar stepHeight, int upAxis = 1);
+	btKinematicCharacterController (btPairCachingGhostObject* ghostObject,btConvexShape* convexShape,btScalar stepHeight, const btVector3& up = btVector3(1.0,0.0,0.0));
 	~btKinematicCharacterController ();
@@ -106,14 +131,9 @@ public:
 	///btActionInterface interface
 	void	debugDraw(btIDebugDraw* debugDrawer);
-	void setUpAxis (int axis)
-	{
-		if (axis < 0)
-			axis = 0;
-		if (axis > 2)
-			axis = 2;
-		m_upAxis = axis;
-	}
+	void setUp(const btVector3& up);
+	const btVector3& getUp() { return m_up; }
 	/// This should probably be called setPositionIncrementPerSimulatorStep.
 	/// This is neither a direction nor a velocity, but the amount to
@@ -130,27 +150,47 @@ public:
 	virtual void setVelocityForTimeInterval(const btVector3& velocity,
 				btScalar timeInterval);
-	void reset ();
+	virtual void setAngularVelocity(const btVector3& velocity);
+	virtual const btVector3& getAngularVelocity() const;
+	virtual void setLinearVelocity(const btVector3& velocity);
+	virtual btVector3 getLinearVelocity() const;
+	void setLinearDamping(btScalar d) { m_linearDamping = btClamped(d, (btScalar)btScalar(0.0), (btScalar)btScalar(1.0)); }
+	btScalar getLinearDamping() const { return  m_linearDamping; }
+	void setAngularDamping(btScalar d) { m_angularDamping = btClamped(d, (btScalar)btScalar(0.0), (btScalar)btScalar(1.0)); }
+	btScalar getAngularDamping() const { return  m_angularDamping; }
+	void reset ( btCollisionWorld* collisionWorld );
 	void warp (const btVector3& origin);
 	void preStep (  btCollisionWorld* collisionWorld);
 	void playerStep ( btCollisionWorld* collisionWorld, btScalar dt);
+	void setStepHeight(btScalar h);
+	btScalar getStepHeight() const { return m_stepHeight; }
 	void setFallSpeed (btScalar fallSpeed);
+	btScalar getFallSpeed() const { return m_fallSpeed; }
 	void setJumpSpeed (btScalar jumpSpeed);
+	btScalar getJumpSpeed() const { return m_jumpSpeed; }
 	void setMaxJumpHeight (btScalar maxJumpHeight);
 	bool canJump () const;
-	void jump ();
+	void jump(const btVector3& v = btVector3());
-	void setGravity(btScalar gravity);
-	btScalar getGravity() const;
+	void applyImpulse(const btVector3& v) { jump(v); }
+	void setGravity(const btVector3& gravity);
+	btVector3 getGravity() const;
 	/// The max slope determines the maximum angle that the controller can walk up.
 	/// The slope angle is measured in radians.
 	void setMaxSlope(btScalar slopeRadians);
 	btScalar getMaxSlope() const;
+	void setMaxPenetrationDepth(btScalar d);
+	btScalar getMaxPenetrationDepth() const;
 	btPairCachingGhostObject* getGhostObject();
 	void	setUseGhostSweepTest(bool useGhostObjectSweepTest)
@@ -158,6 +198,7 @@ public:
 	bool onGround () const;
+	void setUpInterpolate (bool value);
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btConeTwistConstraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btConeTwistConstraint.cpp
index 755544f0..09b7388b 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btConeTwistConstraint.cpp
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btConeTwistConstraint.cpp
@@ -53,6 +53,7 @@ btConeTwistConstraint::btConeTwistConstraint(btRigidBody& rbA,const btTransform&
 	m_rbBFrame = m_rbAFrame;
+	m_rbBFrame.setOrigin(btVector3(0., 0., 0.));
@@ -136,6 +137,9 @@ void btConeTwistConstraint::getInfo2NonVirtual (btConstraintInfo2* info,const bt
 		btVector3 a1neg = -a1;
+    info->m_J2linearAxis[0] = -1;
+    info->m_J2linearAxis[info->rowskip+1] = -1;
+    info->m_J2linearAxis[2*info->rowskip+2] = -1;
 	btVector3 a2 = transB.getBasis() * m_rbBFrame.getOrigin();
 		btVector3* angular0 = (btVector3*)(info->m_J2angularAxis);
@@ -210,7 +214,7 @@ void btConeTwistConstraint::getInfo2NonVirtual (btConstraintInfo2* info,const bt
 			// m_swingCorrection is always positive or 0
 			info->m_lowerLimit[srow] = 0;
-			info->m_upperLimit[srow] = SIMD_INFINITY;
+			info->m_upperLimit[srow] = (m_bMotorEnabled && m_maxMotorImpulse >= 0.0f) ? m_maxMotorImpulse : SIMD_INFINITY;
 			srow += info->rowskip;
@@ -304,7 +308,7 @@ void	btConeTwistConstraint::buildJacobian()
-void	btConeTwistConstraint::solveConstraintObsolete(btRigidBody& bodyA,btRigidBody& bodyB,btScalar	timeStep)
+void	btConeTwistConstraint::solveConstraintObsolete(btSolverBody& bodyA,btSolverBody& bodyB,btScalar	timeStep)
 	#ifndef __SPU__
 	if (m_useSolveConstraintObsolete)
@@ -506,7 +510,7 @@ void	btConeTwistConstraint::solveConstraintObsolete(btRigidBody& bodyA,btRigidBo
 				m_accTwistLimitImpulse = btMax(m_accTwistLimitImpulse + impulseMag, btScalar(0.0) );
 				impulseMag = m_accTwistLimitImpulse - temp;
-				btVector3 impulse = m_twistAxis * impulseMag;
+		//		btVector3 impulse = m_twistAxis * impulseMag;
 				bodyA.internalApplyImpulse(btVector3(0,0,0), m_rbA.getInvInertiaTensorWorld()*m_twistAxis,impulseMag);
 				bodyB.internalApplyImpulse(btVector3(0,0,0), m_rbB.getInvInertiaTensorWorld()*m_twistAxis,-impulseMag);
@@ -536,8 +540,8 @@ void btConeTwistConstraint::calcAngleInfo()
 	m_solveTwistLimit = false;
 	m_solveSwingLimit = false;
-	btVector3 b1Axis1,b1Axis2,b1Axis3;
-	btVector3 b2Axis1,b2Axis2;
+	btVector3 b1Axis1(0,0,0),b1Axis2(0,0,0),b1Axis3(0,0,0);
+	btVector3 b2Axis1(0,0,0),b2Axis2(0,0,0);
 	b1Axis1 = getRigidBodyA().getCenterOfMassTransform().getBasis() * this->m_rbAFrame.getBasis().getColumn(0);
 	b2Axis1 = getRigidBodyB().getCenterOfMassTransform().getBasis() * this->m_rbBFrame.getBasis().getColumn(0);
@@ -725,7 +729,8 @@ void btConeTwistConstraint::calcAngleInfo2(const btTransform& transA, const btTr
 				if(m_swingSpan1 < m_fixThresh)
 				{ // hinge around Y axis
-					if(!(btFuzzyZero(y)))
+//					if(!(btFuzzyZero(y)))
+					if((!(btFuzzyZero(x))) || (!(btFuzzyZero(z))))
 						m_solveSwingLimit = true;
 						if(m_swingSpan2 >= m_fixThresh)
@@ -747,7 +752,8 @@ void btConeTwistConstraint::calcAngleInfo2(const btTransform& transA, const btTr
 				{ // hinge around Z axis
-					if(!btFuzzyZero(z))
+//					if(!btFuzzyZero(z))
+					if((!(btFuzzyZero(x))) || (!(btFuzzyZero(y))))
 						m_solveSwingLimit = true;
 						if(m_swingSpan1 >= m_fixThresh)
@@ -772,8 +778,10 @@ void btConeTwistConstraint::calcAngleInfo2(const btTransform& transA, const btTr
 				target[2] = x * ivA[2] + y * jvA[2] + z * kvA[2];
 				m_swingAxis = -ivB.cross(target);
-				m_swingCorrection = m_swingAxis.length();
-				m_swingAxis.normalize();
+                                m_swingCorrection = m_swingAxis.length();
+                                if (!btFuzzyZero(m_swingCorrection))
+                                    m_swingAxis.normalize();
@@ -828,12 +836,11 @@ void btConeTwistConstraint::computeConeLimitInfo(const btQuaternion& qCone,
 		vSwingAxis = btVector3(qCone.x(), qCone.y(), qCone.z());
-		if (fabs(vSwingAxis.x()) > SIMD_EPSILON)
-		{
-			// non-zero twist?! this should never happen.
-			int wtf = 0; wtf = wtf;
-		}
+#if 0
+        // non-zero twist?! this should never happen.
+       btAssert(fabs(vSwingAxis.x()) <= SIMD_EPSILON));
 		// Compute limit for given swing. tricky:
 		// Given a swing axis, we're looking for the intersection with the bounding cone ellipse.
 		// (Since we're dealing with angles, this ellipse is embedded on the surface of a sphere.)
@@ -877,8 +884,10 @@ void btConeTwistConstraint::computeConeLimitInfo(const btQuaternion& qCone,
 	else if (swingAngle < 0)
 		// this should never happen!
-		int wtf = 0; wtf = wtf;
-	}
+#if 0
+        btAssert(0);
+ 	}
 btVector3 btConeTwistConstraint::GetPointForAngle(btScalar fAngleInRadians, btScalar fLength) const
@@ -929,7 +938,9 @@ void btConeTwistConstraint::computeTwistLimitInfo(const btQuaternion& qTwist,
 	if (twistAngle < 0)
 		// this should never happen
-		int wtf = 0; wtf = wtf;			
+#if 0
+        btAssert(0);
 	vTwistAxis = btVector3(qMinTwist.x(), qMinTwist.y(), qMinTwist.z());
@@ -974,12 +985,12 @@ void btConeTwistConstraint::adjustSwingAxisToUseEllipseNormal(btVector3& vSwingA
 void btConeTwistConstraint::setMotorTarget(const btQuaternion &q)
-	btTransform trACur = m_rbA.getCenterOfMassTransform();
-	btTransform trBCur = m_rbB.getCenterOfMassTransform();
-	btTransform trABCur = trBCur.inverse() * trACur;
-	btQuaternion qABCur = trABCur.getRotation();
-	btTransform trConstraintCur = (trBCur * m_rbBFrame).inverse() * (trACur * m_rbAFrame);
-	btQuaternion qConstraintCur = trConstraintCur.getRotation();
+	//btTransform trACur = m_rbA.getCenterOfMassTransform();
+	//btTransform trBCur = m_rbB.getCenterOfMassTransform();
+//	btTransform trABCur = trBCur.inverse() * trACur;
+//	btQuaternion qABCur = trABCur.getRotation();
+//	btTransform trConstraintCur = (trBCur * m_rbBFrame).inverse() * (trACur * m_rbAFrame);
+	//btQuaternion qConstraintCur = trConstraintCur.getRotation();
 	btQuaternion qConstraint = m_rbBFrame.getRotation().inverse() * q * m_rbAFrame.getRotation();
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btConeTwistConstraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btConeTwistConstraint.h
index 868e62f0..b7636180 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btConeTwistConstraint.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btConeTwistConstraint.h
@@ -40,6 +40,15 @@ and swing 1 and 2 are along the z and y axes respectively.
 #include "btJacobianEntry.h"
 #include "btTypedConstraint.h"
+#define btConeTwistConstraintData2	btConeTwistConstraintDoubleData
+#define btConeTwistConstraintDataName	"btConeTwistConstraintDoubleData"
+#define btConeTwistConstraintData2	btConeTwistConstraintData 
+#define btConeTwistConstraintDataName	"btConeTwistConstraintData" 
 class btRigidBody;
 enum btConeTwistFlags
@@ -50,7 +59,7 @@ enum btConeTwistFlags
 ///btConeTwistConstraint can be used to simulate ragdoll joints (upper arm, leg etc)
-class btConeTwistConstraint : public btTypedConstraint
+ATTRIBUTE_ALIGNED16(class) btConeTwistConstraint : public btTypedConstraint
@@ -126,6 +135,8 @@ protected:
 	btConeTwistConstraint(btRigidBody& rbA,btRigidBody& rbB,const btTransform& rbAFrame, const btTransform& rbBFrame);
 	btConeTwistConstraint(btRigidBody& rbA,const btTransform& rbAFrame);
@@ -140,8 +151,9 @@ public:
 	void	getInfo2NonVirtual(btConstraintInfo2* info,const btTransform& transA,const btTransform& transB,const btMatrix3x3& invInertiaWorldA,const btMatrix3x3& invInertiaWorldB);
-	virtual	void	solveConstraintObsolete(btRigidBody& bodyA,btRigidBody& bodyB,btScalar	timeStep);
+	virtual	void	solveConstraintObsolete(btSolverBody& bodyA,btSolverBody& bodyB,btScalar	timeStep);
 	void	updateRHS(btScalar	timeStep);
@@ -158,6 +170,11 @@ public:
 		m_angularOnly = angularOnly;
+	bool    getAngularOnly() const
+	{
+	    return m_angularOnly;
+	}
 	void	setLimit(int limitIndex,btScalar limitValue)
@@ -184,6 +201,33 @@ public:
+    btScalar getLimit(int limitIndex) const
+	{
+		switch (limitIndex)
+		{
+		case 3:
+			{
+				return m_twistSpan;
+				break;
+			}
+		case 4:
+			{
+				return m_swingSpan2;
+				break;
+			}
+		case 5:
+			{
+				return m_swingSpan1;
+				break;
+			}
+		default:
+			{
+			    btAssert(0 && "Invalid limitIndex specified for btConeTwistConstraint");
+			    return 0.0;
+			}
+		};
+	}
 	// setLimit(), a few notes:
 	// _softness:
 	//		0->1, recommend ~0.8->1.
@@ -206,8 +250,8 @@ public:
 		m_relaxationFactor = _relaxationFactor;
-	const btTransform& getAFrame() { return m_rbAFrame; };	
-	const btTransform& getBFrame() { return m_rbBFrame; };
+	const btTransform& getAFrame() const { return m_rbAFrame; };	
+	const btTransform& getBFrame() const { return m_rbBFrame; };
 	inline int getSolveTwistLimit()
@@ -227,27 +271,43 @@ public:
 	void calcAngleInfo();
 	void calcAngleInfo2(const btTransform& transA, const btTransform& transB,const btMatrix3x3& invInertiaWorldA,const btMatrix3x3& invInertiaWorldB);
-	inline btScalar getSwingSpan1()
+	inline btScalar getSwingSpan1() const
 		return m_swingSpan1;
-	inline btScalar getSwingSpan2()
+	inline btScalar getSwingSpan2() const
 		return m_swingSpan2;
-	inline btScalar getTwistSpan()
+	inline btScalar getTwistSpan() const
 		return m_twistSpan;
-	inline btScalar getTwistAngle()
+	inline btScalar getLimitSoftness() const
+	{
+		return m_limitSoftness;
+	}
+	inline btScalar getBiasFactor() const
+	{
+		return m_biasFactor;
+	}
+	inline btScalar getRelaxationFactor() const
+	{
+		return m_relaxationFactor;
+	}
+	inline btScalar getTwistAngle() const
 		return m_twistAngle;
 	bool isPastSwingLimit() { return m_solveSwingLimit; }
+	btScalar getDamping() const { return m_damping; }
 	void setDamping(btScalar damping) { m_damping = damping; }
 	void enableMotor(bool b) { m_bMotorEnabled = b; }
+	bool isMotorEnabled() const { return m_bMotorEnabled; }
+	btScalar getMaxMotorImpulse() const { return m_maxMotorImpulse; }
+	bool isMaxMotorImpulseNormalized() const { return m_bNormalizedMotorStrength; }
 	void setMaxMotorImpulse(btScalar maxMotorImpulse) { m_maxMotorImpulse = maxMotorImpulse; m_bNormalizedMotorStrength = false; }
 	void setMaxMotorImpulseNormalized(btScalar maxMotorImpulse) { m_maxMotorImpulse = maxMotorImpulse; m_bNormalizedMotorStrength = true; }
@@ -259,6 +319,7 @@ public:
 	// note: if q violates the joint limits, the internal target is clamped to avoid conflicting impulses (very bad for stability)
 	// note: don't forget to enableMotor()
 	void setMotorTarget(const btQuaternion &q);
+	const btQuaternion& getMotorTarget() const { return m_qTarget; }
 	// same as above, but q is the desired rotation of frameA wrt frameB in constraint space
 	void setMotorTargetInConstraintSpace(const btQuaternion &q);
@@ -285,6 +346,11 @@ public:
 	///return the local value of parameter
 	virtual	btScalar getParam(int num, int axis = -1) const;
+	int getFlags() const
+	{
+		return m_flags;
+	}
 	virtual	int	calculateSerializeBufferSize() const;
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
@@ -292,7 +358,30 @@ public:
-///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	btConeTwistConstraintDoubleData
+	btTypedConstraintDoubleData	m_typeConstraintData;
+	btTransformDoubleData m_rbAFrame;
+	btTransformDoubleData m_rbBFrame;
+	//limits
+	double	m_swingSpan1;
+	double	m_swingSpan2;
+	double	m_twistSpan;
+	double	m_limitSoftness;
+	double	m_biasFactor;
+	double	m_relaxationFactor;
+	double	m_damping;
+///this structure is not used, except for loading pre-2.82 .bullet files
 struct	btConeTwistConstraintData
 	btTypedConstraintData	m_typeConstraintData;
@@ -312,12 +401,12 @@ struct	btConeTwistConstraintData
 	char m_pad[4];
 SIMD_FORCE_INLINE int	btConeTwistConstraint::calculateSerializeBufferSize() const
-	return sizeof(btConeTwistConstraintData);
+	return sizeof(btConeTwistConstraintData2);
@@ -325,21 +414,21 @@ SIMD_FORCE_INLINE int	btConeTwistConstraint::calculateSerializeBufferSize() cons
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
 SIMD_FORCE_INLINE const char*	btConeTwistConstraint::serialize(void* dataBuffer, btSerializer* serializer) const
-	btConeTwistConstraintData* cone = (btConeTwistConstraintData*) dataBuffer;
+	btConeTwistConstraintData2* cone = (btConeTwistConstraintData2*) dataBuffer;
-	m_rbAFrame.serializeFloat(cone->m_rbAFrame);
-	m_rbBFrame.serializeFloat(cone->m_rbBFrame);
+	m_rbAFrame.serialize(cone->m_rbAFrame);
+	m_rbBFrame.serialize(cone->m_rbBFrame);
-	cone->m_swingSpan1 = float(m_swingSpan1);
-	cone->m_swingSpan2 = float(m_swingSpan2);
-	cone->m_twistSpan = float(m_twistSpan);
-	cone->m_limitSoftness = float(m_limitSoftness);
-	cone->m_biasFactor = float(m_biasFactor);
-	cone->m_relaxationFactor = float(m_relaxationFactor);
-	cone->m_damping = float(m_damping);
-	return "btConeTwistConstraintData";
+	cone->m_swingSpan1 = m_swingSpan1;
+	cone->m_swingSpan2 = m_swingSpan2;
+	cone->m_twistSpan = m_twistSpan;
+	cone->m_limitSoftness = m_limitSoftness;
+	cone->m_biasFactor = m_biasFactor;
+	cone->m_relaxationFactor = m_relaxationFactor;
+	cone->m_damping = m_damping;
+	return btConeTwistConstraintDataName;
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btConstraintSolver.h b/src/bullet/BulletDynamics/ConstraintSolver/btConstraintSolver.h
index 6f673102..890afe6d 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btConstraintSolver.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btConstraintSolver.h
@@ -28,6 +28,15 @@ class btIDebugDraw;
 class btStackAlloc;
 class	btDispatcher;
 /// btConstraintSolver provides solver interface
+enum btConstraintSolverType
 class btConstraintSolver
@@ -38,12 +47,16 @@ public:
 	virtual void prepareSolve (int /* numBodies */, int /* numManifolds */) {;}
 	///solve a group of constraints
-	virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints, const btContactSolverInfo& info,class btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher) = 0;
+	virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints, const btContactSolverInfo& info,class btIDebugDraw* debugDrawer,btDispatcher* dispatcher) = 0;
-	virtual void allSolved (const btContactSolverInfo& /* info */,class btIDebugDraw* /* debugDrawer */, btStackAlloc* /* stackAlloc */) {;}
+	virtual void allSolved (const btContactSolverInfo& /* info */,class btIDebugDraw* /* debugDrawer */) {;}
 	///clear internal cached data and reset random seed
 	virtual	void	reset() = 0;
+	virtual btConstraintSolverType	getSolverType() const=0;
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btContactConstraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btContactConstraint.cpp
index 88859182..1098d0c9 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btContactConstraint.cpp
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btContactConstraint.cpp
@@ -70,7 +70,7 @@ void	btContactConstraint::buildJacobian()
-//response  between two dynamic objects without friction, assuming 0 penetration depth
+//response  between two dynamic objects without friction and no restitution, assuming 0 penetration depth
 btScalar resolveSingleCollision(
         btRigidBody* body1,
         btCollisionObject* colObj2,
@@ -93,7 +93,7 @@ btScalar resolveSingleCollision(
     btScalar rel_vel;
     rel_vel = normal.dot(vel);
-    btScalar combinedRestitution = body1->getRestitution() * colObj2->getRestitution();
+    btScalar combinedRestitution = 0.f;
     btScalar restitution = combinedRestitution* -rel_vel;
     btScalar positionalError = solverInfo.m_erp *-distance /solverInfo.m_timeStep ;
@@ -155,8 +155,7 @@ void resolveSingleBilateral(btRigidBody& body1, const btVector3& pos1,
 		body1.getCenterOfMassTransform().getBasis().transpose() * body1.getAngularVelocity(),
 		body2.getCenterOfMassTransform().getBasis().transpose() * body2.getAngularVelocity()); 
-	btScalar a;
-	a=jacDiagABInv;
 	rel_vel = normal.dot(vel);
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btContactSolverInfo.h b/src/bullet/BulletDynamics/ConstraintSolver/btContactSolverInfo.h
index 6204cb3d..a3a0fa67 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btContactSolverInfo.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btContactSolverInfo.h
@@ -16,18 +16,20 @@ subject to the following restrictions:
+#include "LinearMath/btScalar.h"
 enum	btSolverMode
-	SOLVER_SIMD = 256,	//enabled for Windows, the solver innerloop is branchless SIMD, 40% faster than FPU/scalar version
-	SOLVER_CUDA = 512	//will be open sourced during Game Developers Conference 2009. Much faster.
+	SOLVER_SIMD = 256,
 struct btContactSolverInfoData
@@ -47,12 +49,15 @@ struct btContactSolverInfoData
 	btScalar	m_globalCfm;//constraint force mixing
 	int			m_splitImpulse;
 	btScalar	m_splitImpulsePenetrationThreshold;
+	btScalar	m_splitImpulseTurnErp;
 	btScalar	m_linearSlop;
 	btScalar	m_warmstartingFactor;
 	int			m_solverMode;
 	int	m_restingContactRestitutionThreshold;
 	int			m_minimumSolverBatchSize;
+	btScalar	m_maxGyroscopicForce;
+	btScalar	m_singleAxisRollingFrictionThreshold;
@@ -67,21 +72,88 @@ struct btContactSolverInfo : public btContactSolverInfoData
 		m_tau = btScalar(0.6);
 		m_damping = btScalar(1.0);
 		m_friction = btScalar(0.3);
+		m_timeStep = btScalar(1.f/60.f);
 		m_restitution = btScalar(0.);
 		m_maxErrorReduction = btScalar(20.);
 		m_numIterations = 10;
 		m_erp = btScalar(0.2);
-		m_erp2 = btScalar(0.1);
+		m_erp2 = btScalar(0.8);
 		m_globalCfm = btScalar(0.);
 		m_sor = btScalar(1.);
-		m_splitImpulse = false;
-		m_splitImpulsePenetrationThreshold = -0.02f;
+		m_splitImpulse = true;
+		m_splitImpulsePenetrationThreshold = -.04f;
+		m_splitImpulseTurnErp = 0.1f;
 		m_linearSlop = btScalar(0.0);
-		m_restingContactRestitutionThreshold = 2;//resting contact lifetime threshold to disable restitution
+		m_restingContactRestitutionThreshold = 2;//unused as of 2.81
 		m_minimumSolverBatchSize = 128; //try to combine islands until the amount of constraints reaches this limit
+		m_maxGyroscopicForce = 100.f; ///it is only used for 'explicit' version of gyroscopic force
+		m_singleAxisRollingFrictionThreshold = 1e30f;///if the velocity is above this threshold, it will use a single constraint row (axis), otherwise 3 rows.
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct btContactSolverInfoDoubleData
+	double		m_tau;
+	double		m_damping;//global non-contact constraint damping, can be locally overridden by constraints during 'getInfo2'.
+	double		m_friction;
+	double		m_timeStep;
+	double		m_restitution;
+	double		m_maxErrorReduction;
+	double		m_sor;
+	double		m_erp;//used as Baumgarte factor
+	double		m_erp2;//used in Split Impulse
+	double		m_globalCfm;//constraint force mixing
+	double		m_splitImpulsePenetrationThreshold;
+	double		m_splitImpulseTurnErp;
+	double		m_linearSlop;
+	double		m_warmstartingFactor;
+	double		m_maxGyroscopicForce;///it is only used for 'explicit' version of gyroscopic force
+	double		m_singleAxisRollingFrictionThreshold;
+	int			m_numIterations;
+	int			m_solverMode;
+	int			m_restingContactRestitutionThreshold;
+	int			m_minimumSolverBatchSize;
+	int			m_splitImpulse;
+	char		m_padding[4];
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct btContactSolverInfoFloatData
+	float		m_tau;
+	float		m_damping;//global non-contact constraint damping, can be locally overridden by constraints during 'getInfo2'.
+	float		m_friction;
+	float		m_timeStep;
+	float		m_restitution;
+	float		m_maxErrorReduction;
+	float		m_sor;
+	float		m_erp;//used as Baumgarte factor
+	float		m_erp2;//used in Split Impulse
+	float		m_globalCfm;//constraint force mixing
+	float		m_splitImpulsePenetrationThreshold;
+	float		m_splitImpulseTurnErp;
+	float		m_linearSlop;
+	float		m_warmstartingFactor;
+	float		m_maxGyroscopicForce;
+	float		m_singleAxisRollingFrictionThreshold;
+	int			m_numIterations;
+	int			m_solverMode;
+	int			m_restingContactRestitutionThreshold;
+	int			m_minimumSolverBatchSize;
+	int			m_splitImpulse;
+	char		m_padding[4];
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btFixedConstraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btFixedConstraint.cpp
new file mode 100644
index 00000000..75d81cc0
--- /dev/null
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btFixedConstraint.cpp
@@ -0,0 +1,37 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "btFixedConstraint.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+#include "LinearMath/btTransformUtil.h"
+#include <new>
+btFixedConstraint::btFixedConstraint(btRigidBody& rbA,btRigidBody& rbB, const btTransform& frameInA,const btTransform& frameInB)
+	setAngularLowerLimit(btVector3(0,0,0));
+	setAngularUpperLimit(btVector3(0,0,0));
+	setLinearLowerLimit(btVector3(0,0,0));
+	setLinearUpperLimit(btVector3(0,0,0));
+btFixedConstraint::~btFixedConstraint ()
diff --git a/src/bullet/BulletMultiThreaded/PpuAddressSpace.h b/src/bullet/BulletDynamics/ConstraintSolver/btFixedConstraint.h
similarity index 60%
rename from src/bullet/BulletMultiThreaded/PpuAddressSpace.h
rename to src/bullet/BulletDynamics/ConstraintSolver/btFixedConstraint.h
index 6f228274..bff2008b 100644
--- a/src/bullet/BulletMultiThreaded/PpuAddressSpace.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btFixedConstraint.h
@@ -1,6 +1,6 @@
 Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2010 Erwin Coumans  http://bulletphysics.org
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
@@ -13,25 +13,21 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
+#include "btGeneric6DofSpring2Constraint.h"
-#ifdef _WIN32
-//stop those casting warnings until we have a better solution for ppu_address_t / void* / uint64 conversions
-#pragma warning (disable: 4311)
-#pragma warning (disable: 4312)
-#endif //_WIN32
+ATTRIBUTE_ALIGNED16(class) btFixedConstraint : public btGeneric6DofSpring2Constraint
+	btFixedConstraint(btRigidBody& rbA,btRigidBody& rbB, const btTransform& frameInA,const btTransform& frameInB);
-#if defined(_WIN64)
-	typedef unsigned __int64 ppu_address_t;
-#elif defined(__LP64__) || defined(__x86_64__)
-	typedef uint64_t ppu_address_t;
-	typedef uint32_t ppu_address_t;
-#endif //defined(_WIN64)
+	virtual ~btFixedConstraint();
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btGearConstraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btGearConstraint.cpp
new file mode 100644
index 00000000..bcd457b6
--- /dev/null
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btGearConstraint.cpp
@@ -0,0 +1,54 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2012 Advanced Micro Devices, Inc.  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+/// Implemented by Erwin Coumans. The idea for the constraint comes from Dimitris Papavasiliou.
+#include "btGearConstraint.h"
+btGearConstraint::btGearConstraint(btRigidBody& rbA, btRigidBody& rbB, const btVector3& axisInA,const btVector3& axisInB, btScalar ratio)
+btGearConstraint::~btGearConstraint ()
+void btGearConstraint::getInfo1 (btConstraintInfo1* info)
+	info->m_numConstraintRows = 1;
+	info->nub = 1;
+void btGearConstraint::getInfo2 (btConstraintInfo2* info)
+	btVector3 globalAxisA, globalAxisB;
+	globalAxisA = m_rbA.getWorldTransform().getBasis()*this->m_axisInA;
+	globalAxisB = m_rbB.getWorldTransform().getBasis()*this->m_axisInB;
+	info->m_J1angularAxis[0] = globalAxisA[0];
+	info->m_J1angularAxis[1] = globalAxisA[1];
+	info->m_J1angularAxis[2] = globalAxisA[2];
+	info->m_J2angularAxis[0] = m_ratio*globalAxisB[0];
+	info->m_J2angularAxis[1] = m_ratio*globalAxisB[1];
+	info->m_J2angularAxis[2] = m_ratio*globalAxisB[2];
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btGearConstraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btGearConstraint.h
new file mode 100644
index 00000000..f9afcb91
--- /dev/null
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btGearConstraint.h
@@ -0,0 +1,152 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2012 Advanced Micro Devices, Inc.  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
+#define btGearConstraintData	btGearConstraintDoubleData
+#define btGearConstraintDataName	"btGearConstraintDoubleData"
+#define btGearConstraintData	btGearConstraintFloatData
+#define btGearConstraintDataName	"btGearConstraintFloatData"
+///The btGearConstraint will couple the angular velocity for two bodies around given local axis and ratio.
+///See Bullet/Demos/ConstraintDemo for an example use.
+class btGearConstraint : public btTypedConstraint
+	btVector3	m_axisInA;
+	btVector3	m_axisInB;
+	bool		m_useFrameA;
+	btScalar	m_ratio;
+	btGearConstraint(btRigidBody& rbA, btRigidBody& rbB, const btVector3& axisInA,const btVector3& axisInB, btScalar ratio=1.f);
+	virtual ~btGearConstraint ();
+	///internal method used by the constraint solver, don't use them directly
+	virtual void getInfo1 (btConstraintInfo1* info);
+	///internal method used by the constraint solver, don't use them directly
+	virtual void getInfo2 (btConstraintInfo2* info);
+	void setAxisA(btVector3& axisA) 
+	{
+		m_axisInA = axisA;
+	}
+	void setAxisB(btVector3& axisB)
+	{
+		m_axisInB = axisB;
+	}
+	void setRatio(btScalar ratio)
+	{
+		m_ratio = ratio;
+	}
+	const btVector3& getAxisA() const
+	{
+		return m_axisInA;
+	}
+	const btVector3& getAxisB() const
+	{
+		return m_axisInB;
+	}
+	btScalar getRatio() const
+	{
+		return m_ratio;
+	}
+	virtual	void	setParam(int num, btScalar value, int axis = -1) 
+	{
+		(void) num;
+		(void) value;
+		(void) axis;
+		btAssert(0);
+	}
+	///return the local value of parameter
+	virtual	btScalar getParam(int num, int axis = -1) const 
+	{ 
+		(void) num;
+		(void) axis;
+		btAssert(0);
+		return 0.f;
+	}
+	virtual	int	calculateSerializeBufferSize() const;
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+	virtual	const char*	serialize(void* dataBuffer, btSerializer* serializer) const;
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct btGearConstraintFloatData
+	btTypedConstraintFloatData	m_typeConstraintData;
+	btVector3FloatData			m_axisInA;
+	btVector3FloatData			m_axisInB;
+	float							m_ratio;
+	char							m_padding[4];
+struct btGearConstraintDoubleData
+	btTypedConstraintDoubleData	m_typeConstraintData;
+	btVector3DoubleData			m_axisInA;
+	btVector3DoubleData			m_axisInB;
+	double						m_ratio;
+SIMD_FORCE_INLINE	int	btGearConstraint::calculateSerializeBufferSize() const
+	return sizeof(btGearConstraintData);
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+SIMD_FORCE_INLINE	const char*	btGearConstraint::serialize(void* dataBuffer, btSerializer* serializer) const
+	btGearConstraintData* gear = (btGearConstraintData*)dataBuffer;
+	btTypedConstraint::serialize(&gear->m_typeConstraintData,serializer);
+	m_axisInA.serialize( gear->m_axisInA );
+	m_axisInB.serialize( gear->m_axisInB );
+	gear->m_ratio = m_ratio;
+	return btGearConstraintDataName;
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.cpp
index 8ff9940b..bc2b5a85 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.cpp
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.cpp
@@ -174,10 +174,8 @@ btScalar btRotationalLimitMotor::solveAngularLimits(
 	// current velocity difference
-	btVector3 angVelA;
-	body0->internalGetAngularVelocity(angVelA);
-	btVector3 angVelB;
-	body1->internalGetAngularVelocity(angVelB);
+	btVector3 angVelA = body0->getAngularVelocity();
+	btVector3 angVelB = body1->getAngularVelocity();
 	btVector3 vel_diff;
 	vel_diff = angVelA-angVelB;
@@ -225,12 +223,8 @@ btScalar btRotationalLimitMotor::solveAngularLimits(
 	btVector3 motorImp = clippedMotorImpulse * axis;
-	//body0->applyTorqueImpulse(motorImp);
-	//body1->applyTorqueImpulse(-motorImp);
-	body0->internalApplyImpulse(btVector3(0,0,0), body0->getInvInertiaTensorWorld()*axis,clippedMotorImpulse);
-	body1->internalApplyImpulse(btVector3(0,0,0), body1->getInvInertiaTensorWorld()*axis,-clippedMotorImpulse);
+	body0->applyTorqueImpulse(motorImp);
+	body1->applyTorqueImpulse(-motorImp);
 	return clippedMotorImpulse;
@@ -292,10 +286,8 @@ btScalar btTranslationalLimitMotor::solveLinearAxis(
 	btVector3 rel_pos1 = anchorPos - body1.getCenterOfMassPosition();
 	btVector3 rel_pos2 = anchorPos - body2.getCenterOfMassPosition();
-	btVector3 vel1;
-	body1.internalGetVelocityInLocalPointObsolete(rel_pos1,vel1);
-	btVector3 vel2;
-	body2.internalGetVelocityInLocalPointObsolete(rel_pos2,vel2);
+	btVector3 vel1 = body1.getVelocityInLocalPoint(rel_pos1);
+	btVector3 vel2 = body2.getVelocityInLocalPoint(rel_pos2);
 	btVector3 vel = vel1 - vel2;
 	btScalar rel_vel = axis_normal_on_a.dot(vel);
@@ -348,16 +340,10 @@ btScalar btTranslationalLimitMotor::solveLinearAxis(
 	normalImpulse = m_accumulatedImpulse[limit_index] - oldNormalImpulse;
 	btVector3 impulse_vector = axis_normal_on_a * normalImpulse;
-	//body1.applyImpulse( impulse_vector, rel_pos1);
-	//body2.applyImpulse(-impulse_vector, rel_pos2);
-	btVector3 ftorqueAxis1 = rel_pos1.cross(axis_normal_on_a);
-	btVector3 ftorqueAxis2 = rel_pos2.cross(axis_normal_on_a);
-	body1.internalApplyImpulse(axis_normal_on_a*body1.getInvMass(), body1.getInvInertiaTensorWorld()*ftorqueAxis1,normalImpulse);
-	body2.internalApplyImpulse(axis_normal_on_a*body2.getInvMass(), body2.getInvInertiaTensorWorld()*ftorqueAxis2,-normalImpulse);
+	body1.applyImpulse( impulse_vector, rel_pos1);
+	body2.applyImpulse(-impulse_vector, rel_pos2);
 	return normalImpulse;
@@ -795,17 +781,16 @@ int btGeneric6DofConstraint::get_limit_motor_info2(
     if (powered || limit)
     {   // if the joint is powered, or has joint limits, add in the extra row
         btScalar *J1 = rotational ? info->m_J1angularAxis : info->m_J1linearAxis;
-        btScalar *J2 = rotational ? info->m_J2angularAxis : 0;
+        btScalar *J2 = rotational ? info->m_J2angularAxis : info->m_J2linearAxis;
         J1[srow+0] = ax1[0];
         J1[srow+1] = ax1[1];
         J1[srow+2] = ax1[2];
-        if(rotational)
-        {
-            J2[srow+0] = -ax1[0];
-            J2[srow+1] = -ax1[1];
-            J2[srow+2] = -ax1[2];
-        }
-        if((!rotational))
+        J2[srow+0] = -ax1[0];
+        J2[srow+1] = -ax1[1];
+        J2[srow+2] = -ax1[2];
+		if((!rotational))
 			if (m_useOffsetForConstraintFrame)
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h
index b4410811..bea8629c 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h
@@ -35,6 +35,14 @@ class btRigidBody;
+#define btGeneric6DofConstraintData2		btGeneric6DofConstraintDoubleData2
+#define btGeneric6DofConstraintDataName	"btGeneric6DofConstraintDoubleData2"
+#define btGeneric6DofConstraintData2		btGeneric6DofConstraintData
+#define btGeneric6DofConstraintDataName	"btGeneric6DofConstraintData"
 //! Rotation Limit structure for generic joints
 class btRotationalLimitMotor
@@ -103,14 +111,14 @@ public:
 	//! Is limited
-    bool isLimited()
+    bool isLimited() const
     	if(m_loLimit > m_hiLimit) return false;
     	return true;
 	//! Need apply correction
-    bool needApplyTorques()
+    bool needApplyTorques() const
     	if(m_currentLimit == 0 && m_enableMotor == false) return false;
     	return true;
@@ -199,11 +207,11 @@ public:
     - limited means upper > lower
     - limitIndex: first 3 are linear, next 3 are angular
-    inline bool	isLimited(int limitIndex)
+    inline bool	isLimited(int limitIndex) const
        return (m_upperLimit[limitIndex] >= m_lowerLimit[limitIndex]);
-    inline bool needApplyForce(int limitIndex)
+    inline bool needApplyForce(int limitIndex) const
     	if(m_currentLimit[limitIndex] == 0 && m_enableMotor[limitIndex] == false) return false;
     	return true;
@@ -268,7 +276,7 @@ This brings support for limit parameters and motors. </li>
-class btGeneric6DofConstraint : public btTypedConstraint
+ATTRIBUTE_ALIGNED16(class) btGeneric6DofConstraint : public btTypedConstraint
@@ -346,6 +354,8 @@ protected:
 	///for backwards compatibility during the transition to 'getInfo/getInfo2'
 	bool		m_useSolveConstraintObsolete;
@@ -447,7 +457,7 @@ public:
     	m_linearLimits.m_lowerLimit = linearLower;
-	void	getLinearLowerLimit(btVector3& linearLower)
+	void	getLinearLowerLimit(btVector3& linearLower) const
 		linearLower = m_linearLimits.m_lowerLimit;
@@ -457,7 +467,7 @@ public:
 		m_linearLimits.m_upperLimit = linearUpper;
-	void	getLinearUpperLimit(btVector3& linearUpper)
+	void	getLinearUpperLimit(btVector3& linearUpper) const
 		linearUpper = m_linearLimits.m_upperLimit;
@@ -468,7 +478,7 @@ public:
 			m_angularLimits[i].m_loLimit = btNormalizeAngle(angularLower[i]);
-	void	getAngularLowerLimit(btVector3& angularLower)
+	void	getAngularLowerLimit(btVector3& angularLower) const
 		for(int i = 0; i < 3; i++) 
 			angularLower[i] = m_angularLimits[i].m_loLimit;
@@ -480,7 +490,7 @@ public:
 			m_angularLimits[i].m_hiLimit = btNormalizeAngle(angularUpper[i]);
-	void	getAngularUpperLimit(btVector3& angularUpper)
+	void	getAngularUpperLimit(btVector3& angularUpper) const
 		for(int i = 0; i < 3; i++)
 			angularUpper[i] = m_angularLimits[i].m_hiLimit;
@@ -522,7 +532,7 @@ public:
     - limited means upper > lower
     - limitIndex: first 3 are linear, next 3 are angular
-    bool	isLimited(int limitIndex)
+    bool	isLimited(int limitIndex) const
@@ -539,8 +549,11 @@ public:
 								btConstraintInfo2 *info, int row, btVector3& ax1, int rotational, int rotAllowed = false);
 	// access for UseFrameOffset
-	bool getUseFrameOffset() { return m_useOffsetForConstraintFrame; }
+	bool getUseFrameOffset() const { return m_useOffsetForConstraintFrame; }
 	void setUseFrameOffset(bool frameOffsetOnOff) { m_useOffsetForConstraintFrame = frameOffsetOnOff; }
+	bool getUseLinearReferenceFrameA() const { return m_useLinearReferenceFrameA; }
+	void setUseLinearReferenceFrameA(bool linearReferenceFrameA) { m_useLinearReferenceFrameA = linearReferenceFrameA; }
 	///override the default global value of a parameter (such as ERP or CFM), optionally provide the axis (0..5). 
 	///If no axis is provided, it uses the default axis for this constraint.
@@ -550,6 +563,10 @@ public:
 	void setAxis( const btVector3& axis1, const btVector3& axis2);
+    	virtual	int getFlags() const
+    	{
+        	return m_flags;
+	}
 	virtual	int	calculateSerializeBufferSize() const;
@@ -559,7 +576,7 @@ public:
-///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
 struct btGeneric6DofConstraintData
 	btTypedConstraintData	m_typeConstraintData;
@@ -576,35 +593,51 @@ struct btGeneric6DofConstraintData
 	int m_useOffsetForConstraintFrame;
+struct btGeneric6DofConstraintDoubleData2
+	btTypedConstraintDoubleData	m_typeConstraintData;
+	btTransformDoubleData m_rbAFrame; // constraint axii. Assumes z is hinge axis.
+	btTransformDoubleData m_rbBFrame;
+	btVector3DoubleData	m_linearUpperLimit;
+	btVector3DoubleData	m_linearLowerLimit;
+	btVector3DoubleData	m_angularUpperLimit;
+	btVector3DoubleData	m_angularLowerLimit;
+	int	m_useLinearReferenceFrameA;
+	int m_useOffsetForConstraintFrame;
 SIMD_FORCE_INLINE	int	btGeneric6DofConstraint::calculateSerializeBufferSize() const
-	return sizeof(btGeneric6DofConstraintData);
+	return sizeof(btGeneric6DofConstraintData2);
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
 SIMD_FORCE_INLINE	const char*	btGeneric6DofConstraint::serialize(void* dataBuffer, btSerializer* serializer) const
-	btGeneric6DofConstraintData* dof = (btGeneric6DofConstraintData*)dataBuffer;
+	btGeneric6DofConstraintData2* dof = (btGeneric6DofConstraintData2*)dataBuffer;
-	m_frameInA.serializeFloat(dof->m_rbAFrame);
-	m_frameInB.serializeFloat(dof->m_rbBFrame);
+	m_frameInA.serialize(dof->m_rbAFrame);
+	m_frameInB.serialize(dof->m_rbBFrame);
 	int i;
 	for (i=0;i<3;i++)
-		dof->m_angularLowerLimit.m_floats[i] =  float(m_angularLimits[i].m_loLimit);
-		dof->m_angularUpperLimit.m_floats[i] =  float(m_angularLimits[i].m_hiLimit);
-		dof->m_linearLowerLimit.m_floats[i] = float(m_linearLimits.m_lowerLimit[i]);
-		dof->m_linearUpperLimit.m_floats[i] = float(m_linearLimits.m_upperLimit[i]);
+		dof->m_angularLowerLimit.m_floats[i] =  m_angularLimits[i].m_loLimit;
+		dof->m_angularUpperLimit.m_floats[i] =  m_angularLimits[i].m_hiLimit;
+		dof->m_linearLowerLimit.m_floats[i] = m_linearLimits.m_lowerLimit[i];
+		dof->m_linearUpperLimit.m_floats[i] = m_linearLimits.m_upperLimit[i];
 	dof->m_useLinearReferenceFrameA = m_useLinearReferenceFrameA? 1 : 0;
 	dof->m_useOffsetForConstraintFrame = m_useOffsetForConstraintFrame ? 1 : 0;
-	return "btGeneric6DofConstraintData";
+	return btGeneric6DofConstraintDataName;
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.cpp
new file mode 100644
index 00000000..49ff78c2
--- /dev/null
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.cpp
@@ -0,0 +1,1121 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+2014 May: btGeneric6DofSpring2Constraint is created from the original (2.82.2712) btGeneric6DofConstraint by Gabor Puhr and Tamas Umenhoffer
+- Much more accurate and stable in a lot of situations. (Especially when a sleeping chain of RBs connected with 6dof2 is pulled)
+- Stable and accurate spring with minimal energy loss that works with all of the solvers. (latter is not true for the original 6dof spring)
+- Servo motor functionality
+- Much more accurate bouncing. 0 really means zero bouncing (not true for the original 6dof) and there is only a minimal energy loss when the value is 1 (because of the solvers' precision)
+- Rotation order for the Euler system can be set. (One axis' freedom is still limited to pi/2)
+- It is slower than the original 6dof. There is no exact ratio, but half speed is a good estimation. (with PGS)
+- At bouncing the correct velocity is calculated, but not the correct position. (This is because the solver can correct position or velocity, but not both.)
+/// 2009 March: btGeneric6DofConstraint refactored by Roman Ponomarev
+/// Added support for generic constraint solver through getInfo1/getInfo2 methods
+btGeneric6DofConstraint Refactored by Francisco León
+email: projectileman@yahoo.com
+#include "btGeneric6DofSpring2Constraint.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+#include "LinearMath/btTransformUtil.h"
+#include <new>
+btGeneric6DofSpring2Constraint::btGeneric6DofSpring2Constraint(btRigidBody& rbA, btRigidBody& rbB, const btTransform& frameInA, const btTransform& frameInB, RotateOrder rotOrder) // two-body constructor: stores both constraint frames and the Euler rotation order
+	: btTypedConstraint(D6_SPRING_2_CONSTRAINT_TYPE, rbA, rbB)
+	, m_frameInA(frameInA) // later combined as transA * m_frameInA in calculateTransforms()
+	, m_frameInB(frameInB) // later combined as transB * m_frameInB in calculateTransforms()
+	, m_rotateOrder(rotOrder)	
+	, m_flags(0) // no per-axis ERP/CFM override bits set yet (setParam() sets them)
+	calculateTransforms(); // fill the cached transforms and limit state from the bodies' current poses
+btGeneric6DofSpring2Constraint::btGeneric6DofSpring2Constraint(btRigidBody& rbB, const btTransform& frameInB, RotateOrder rotOrder) // single-body constructor: body A is whatever getFixedBody() returns
+	: btTypedConstraint(D6_SPRING_2_CONSTRAINT_TYPE, getFixedBody(), rbB)
+	, m_frameInB(frameInB)
+	, m_rotateOrder(rotOrder)
+	, m_flags(0) // no per-axis ERP/CFM override bits set yet (setParam() sets them)
+	///not providing rigidbody A means implicitly using worldspace for body A
+	m_frameInA = rbB.getCenterOfMassTransform() * m_frameInB; // frame A is initialised to body B's current transform composed with frame B
+	calculateTransforms(); // fill the cached transforms and limit state from the bodies' current poses
+btScalar btGeneric6DofSpring2Constraint::btGetMatrixElem(const btMatrix3x3& mat, int index) // maps a flat index (callers pass 0..8) to mat[index%3][index/3]
+	int i = index%3; // first subscript
+	int j = index/3; // second subscript
+	return mat[i][j];
+// MatrixToEulerXYZ from http://www.geometrictools.com/LibFoundation/Mathematics/Wm4Matrix3.inl.html
+bool btGeneric6DofSpring2Constraint::matrixToEulerXYZ(const btMatrix3x3& mat,btVector3& xyz) // writes three Euler angles into xyz; returns true only in the non-degenerate branch (-1 < fi < 1)
+	// rot =  cy*cz          -cy*sz           sy
+	//        cz*sx*sy+cx*sz  cx*cz-sx*sy*sz -cy*sx
+	//       -cx*cz*sy+sx*sz  cz*sx+cx*sy*sz  cx*cy
+	btScalar fi = btGetMatrixElem(mat,2); // the element passed to btAsin for xyz[1]
+	if (fi < btScalar(1.0f))
+	{
+		if (fi > btScalar(-1.0f))
+		{
+			xyz[0] = btAtan2(-btGetMatrixElem(mat,5),btGetMatrixElem(mat,8));
+			xyz[1] = btAsin(btGetMatrixElem(mat,2));
+			xyz[2] = btAtan2(-btGetMatrixElem(mat,1),btGetMatrixElem(mat,0));
+			return true;
+		}
+		else
+		{
+			// WARNING.  Not unique.  XA - ZA = -atan2(r10,r11)
+			xyz[0] = -btAtan2(btGetMatrixElem(mat,3),btGetMatrixElem(mat,4));
+			xyz[1] = -SIMD_HALF_PI; // fi <= -1: middle angle pinned to -SIMD_HALF_PI
+			xyz[2] = btScalar(0.0); // the non-unique remainder is assigned entirely to xyz[0]
+			return false;
+		}
+	}
+	else
+	{
+		// WARNING.  Not unique.  XAngle + ZAngle = atan2(r10,r11)
+		xyz[0] = btAtan2(btGetMatrixElem(mat,3),btGetMatrixElem(mat,4));
+		xyz[1] = SIMD_HALF_PI; // fi >= 1: middle angle pinned to +SIMD_HALF_PI
+		xyz[2] = 0.0;
+	}
+	return false; // degenerate case: the returned angles are one of many valid solutions
+bool btGeneric6DofSpring2Constraint::matrixToEulerXZY(const btMatrix3x3& mat,btVector3& xyz) // writes three Euler angles into xyz; returns true only in the non-degenerate branch (-1 < fi < 1)
+	// rot =  cy*cz          -sz           sy*cz
+	//        cy*cx*sz+sx*sy  cx*cz        sy*cx*sz-cy*sx
+	//        cy*sx*sz-cx*sy  sx*cz        sy*sx*sz+cx*cy
+	btScalar fi = btGetMatrixElem(mat,1); // the element passed (negated) to btAsin for xyz[2]
+	if (fi < btScalar(1.0f))
+	{
+		if (fi > btScalar(-1.0f))
+		{
+			xyz[0] = btAtan2(btGetMatrixElem(mat,7),btGetMatrixElem(mat,4));
+			xyz[1] = btAtan2(btGetMatrixElem(mat,2),btGetMatrixElem(mat,0));
+			xyz[2] = btAsin(-btGetMatrixElem(mat,1));
+			return true;
+		}
+		else
+		{
+			xyz[0] = -btAtan2(-btGetMatrixElem(mat,6),btGetMatrixElem(mat,8));
+			xyz[1] = btScalar(0.0); // the non-unique remainder is assigned entirely to xyz[0]
+			xyz[2] = SIMD_HALF_PI; // fi <= -1: asin(-fi) pinned to +SIMD_HALF_PI
+			return false;
+		}
+	}
+	else
+	{
+		xyz[0] = btAtan2(-btGetMatrixElem(mat,6),btGetMatrixElem(mat,8));
+		xyz[1] = 0.0;
+		xyz[2] = -SIMD_HALF_PI; // fi >= 1: asin(-fi) pinned to -SIMD_HALF_PI
+	}
+	return false; // degenerate case: the returned angles are one of many valid solutions
+bool btGeneric6DofSpring2Constraint::matrixToEulerYXZ(const btMatrix3x3& mat,btVector3& xyz) // writes three Euler angles into xyz; returns true only in the non-degenerate branch (-1 < fi < 1)
+	// rot =  cy*cz+sy*sx*sz  cz*sy*sx-cy*sz  cx*sy
+	//        cx*sz           cx*cz           -sx
+	//        cy*sx*sz-cz*sy  sy*sz+cy*cz*sx  cy*cx
+	btScalar fi = btGetMatrixElem(mat,5); // the element passed (negated) to btAsin for xyz[0]
+	if (fi < btScalar(1.0f))
+	{
+		if (fi > btScalar(-1.0f))
+		{
+			xyz[0] = btAsin(-btGetMatrixElem(mat,5));
+			xyz[1] = btAtan2(btGetMatrixElem(mat,2),btGetMatrixElem(mat,8));
+			xyz[2] = btAtan2(btGetMatrixElem(mat,3),btGetMatrixElem(mat,4));
+			return true;
+		}
+		else
+		{
+			xyz[0] = SIMD_HALF_PI; // fi <= -1: asin(-fi) pinned to +SIMD_HALF_PI
+			xyz[1] = -btAtan2(-btGetMatrixElem(mat,1),btGetMatrixElem(mat,0));
+			xyz[2] = btScalar(0.0); // the non-unique remainder is assigned entirely to xyz[1]
+			return false;
+		}
+	}
+	else
+	{
+		xyz[0] = -SIMD_HALF_PI; // fi >= 1: asin(-fi) pinned to -SIMD_HALF_PI
+		xyz[1] = btAtan2(-btGetMatrixElem(mat,1),btGetMatrixElem(mat,0));
+		xyz[2] = 0.0;
+	}
+	return false; // degenerate case: the returned angles are one of many valid solutions
+bool btGeneric6DofSpring2Constraint::matrixToEulerYZX(const btMatrix3x3& mat,btVector3& xyz) // writes three Euler angles into xyz; returns true only in the non-degenerate branch (-1 < fi < 1)
+	// rot =  cy*cz   sy*sx-cy*cx*sz   cx*sy+cy*sz*sx
+	//        sz           cz*cx           -cz*sx
+	//        -cz*sy  cy*sx+cx*sy*sz   cy*cx-sy*sz*sx
+	btScalar fi = btGetMatrixElem(mat,3); // the element passed to btAsin for xyz[2]
+	if (fi < btScalar(1.0f))
+	{
+		if (fi > btScalar(-1.0f))
+		{
+			xyz[0] = btAtan2(-btGetMatrixElem(mat,5),btGetMatrixElem(mat,4));
+			xyz[1] = btAtan2(-btGetMatrixElem(mat,6),btGetMatrixElem(mat,0));
+			xyz[2] = btAsin(btGetMatrixElem(mat,3));
+			return true;
+		}
+		else
+		{
+			xyz[0] = btScalar(0.0); // the non-unique remainder is assigned entirely to xyz[1]
+			xyz[1] = -btAtan2(btGetMatrixElem(mat,7),btGetMatrixElem(mat,8));
+			xyz[2] = -SIMD_HALF_PI; // fi <= -1: asin(fi) pinned to -SIMD_HALF_PI
+			return false;
+		}
+	}
+	else
+	{
+		xyz[0] = btScalar(0.0);
+		xyz[1] = btAtan2(btGetMatrixElem(mat,7),btGetMatrixElem(mat,8));
+		xyz[2] = SIMD_HALF_PI; // fi >= 1: asin(fi) pinned to +SIMD_HALF_PI
+	}
+	return false; // degenerate case: the returned angles are one of many valid solutions
+bool btGeneric6DofSpring2Constraint::matrixToEulerZXY(const btMatrix3x3& mat,btVector3& xyz) // writes three Euler angles into xyz; returns true only in the non-degenerate branch (-1 < fi < 1)
+	// rot =  cz*cy-sz*sx*sy    -cx*sz   cz*sy+cy*sz*sx
+	//        cy*sz+cz*sx*sy     cz*cx   sz*sy-cz*cy*sx
+	//        -cx*sy              sx     cx*cy
+	btScalar fi = btGetMatrixElem(mat,7); // the element passed to btAsin for xyz[0]
+	if (fi < btScalar(1.0f))
+	{
+		if (fi > btScalar(-1.0f))
+		{
+			xyz[0] = btAsin(btGetMatrixElem(mat,7));
+			xyz[1] = btAtan2(-btGetMatrixElem(mat,6),btGetMatrixElem(mat,8));
+			xyz[2] = btAtan2(-btGetMatrixElem(mat,1),btGetMatrixElem(mat,4));
+			return true;
+		}
+		else
+		{
+			xyz[0] = -SIMD_HALF_PI; // fi <= -1: asin(fi) pinned to -SIMD_HALF_PI
+			xyz[1] = btScalar(0.0); // the non-unique remainder is assigned entirely to xyz[2]
+			xyz[2] = -btAtan2(btGetMatrixElem(mat,2),btGetMatrixElem(mat,0));
+			return false;
+		}
+	}
+	else
+	{
+		xyz[0] = SIMD_HALF_PI; // fi >= 1: asin(fi) pinned to +SIMD_HALF_PI
+		xyz[1] = btScalar(0.0);
+		xyz[2] = btAtan2(btGetMatrixElem(mat,2),btGetMatrixElem(mat,0));
+	}
+	return false; // degenerate case: the returned angles are one of many valid solutions
+bool btGeneric6DofSpring2Constraint::matrixToEulerZYX(const btMatrix3x3& mat,btVector3& xyz) // writes three Euler angles into xyz; returns true only in the non-degenerate branch (-1 < fi < 1)
+	// rot =  cz*cy   cz*sy*sx-cx*sz   sz*sx+cz*cx*sy
+	//        cy*sz   cz*cx+sz*sy*sx   cx*sz*sy-cz*sx
+	//        -sy          cy*sx         cy*cx
+	btScalar fi = btGetMatrixElem(mat,6); // the element passed (negated) to btAsin for xyz[1]
+	if (fi < btScalar(1.0f))
+	{
+		if (fi > btScalar(-1.0f))
+		{
+			xyz[0] = btAtan2(btGetMatrixElem(mat,7), btGetMatrixElem(mat,8));
+			xyz[1] = btAsin(-btGetMatrixElem(mat,6));
+			xyz[2] = btAtan2(btGetMatrixElem(mat,3),btGetMatrixElem(mat,0));
+			return true;
+		}
+		else
+		{
+			xyz[0] = btScalar(0.0); // the non-unique remainder is assigned entirely to xyz[2]
+			xyz[1] = SIMD_HALF_PI; // fi <= -1: asin(-fi) pinned to +SIMD_HALF_PI
+			xyz[2] = -btAtan2(btGetMatrixElem(mat,1),btGetMatrixElem(mat,2));
+			return false;
+		}
+	}
+	else
+	{
+		xyz[0] = btScalar(0.0);
+		xyz[1] = -SIMD_HALF_PI; // fi >= 1: asin(-fi) pinned to -SIMD_HALF_PI
+		xyz[2] = btAtan2(-btGetMatrixElem(mat,1),-btGetMatrixElem(mat,2));
+	}
+	return false; // degenerate case: the returned angles are one of many valid solutions
+void btGeneric6DofSpring2Constraint::calculateAngleInfo() // updates m_calculatedAxisAngleDiff (Euler angles) and m_calculatedAxis[0..2] (constraint axes) from the cached transforms
+	btMatrix3x3 relative_frame = m_calculatedTransformA.getBasis().inverse()*m_calculatedTransformB.getBasis(); // basis of frame B relative to frame A
+	switch (m_rotateOrder)
+	{
+		case RO_XYZ : matrixToEulerXYZ(relative_frame,m_calculatedAxisAngleDiff); break; // the bool result (degenerate-case flag) is ignored in every case
+		case RO_XZY : matrixToEulerXZY(relative_frame,m_calculatedAxisAngleDiff); break;
+		case RO_YXZ : matrixToEulerYXZ(relative_frame,m_calculatedAxisAngleDiff); break;
+		case RO_YZX : matrixToEulerYZX(relative_frame,m_calculatedAxisAngleDiff); break;
+		case RO_ZXY : matrixToEulerZXY(relative_frame,m_calculatedAxisAngleDiff); break;
+		case RO_ZYX : matrixToEulerZYX(relative_frame,m_calculatedAxisAngleDiff); break;
+		default : btAssert(false);
+	}
+	// in euler angle mode we do not actually constrain the angular velocity
+	// along the axes axis[0] and axis[2] (although we do use axis[1]) :
+	//
+	//    to get			constrain w2-w1 along		...not
+	//    ------			---------------------		------
+	//    d(angle[0])/dt = 0	ax[1] x ax[2]			ax[0]
+	//    d(angle[1])/dt = 0	ax[1]
+	//    d(angle[2])/dt = 0	ax[0] x ax[1]			ax[2]
+	//
+	// constraining w2-w1 along an axis 'a' means that a'*(w2-w1)=0.
+	// to prove the result for angle[0], write the expression for angle[0] from
+	// GetInfo1 then take the derivative. to prove this for angle[2] it is
+	// easier to take the euler rate expression for d(angle[2])/dt with respect
+	// to the components of w and set that to 0.
+	switch (m_rotateOrder)
+	{
+	case RO_XYZ :
+		{
+			//Is this the "line of nodes" calculation choosing planes YZ (B coordinate system) and xy (A coordinate system)? (http://en.wikipedia.org/wiki/Euler_angles)
+			//The two planes are non-homologous, so this is a Tait-Bryan angle formalism and not a proper Euler
+			//Extrinsic rotations are equal to the reversed order intrinsic rotations so the above xyz extrinsic rotations (axes are fixed) are the same as the zy'x" intrinsic rotations (axes are refreshed after each rotation)
+			//that is why xy and YZ planes are chosen (this will describe a zy'x" intrinsic rotation) (see the figure on the left at http://en.wikipedia.org/wiki/Euler_angles under Tait-Bryan angles)
+			// x' = Nperp = N.cross(axis2)
+			// y' = N = axis2.cross(axis0)
+			// z' = z
+			//
+			// x" = X
+			// y" = y'
+			// z" = ??
+			//in other words:
+			//first rotate around z
+			//second rotate around y'= z.cross(X)
+			//third rotate around x" = X
+			//Original XYZ extrinsic rotation order.
+			//Planes: xy and YZ normals: z, X.  Plane intersection (N) is z.cross(X)
+			btVector3 axis0 = m_calculatedTransformB.getBasis().getColumn(0); // column 0 of frame B's basis
+			btVector3 axis2 = m_calculatedTransformA.getBasis().getColumn(2); // column 2 of frame A's basis
+			m_calculatedAxis[1] = axis2.cross(axis0);
+			m_calculatedAxis[0] = m_calculatedAxis[1].cross(axis2);
+			m_calculatedAxis[2] = axis0.cross(m_calculatedAxis[1]);
+			break;
+		}
+	case RO_XZY :
+		{
+			//planes: xz,ZY normals: y, X
+			//first rotate around y
+			//second rotate around z'= y.cross(X)
+			//third rotate around x" = X
+			btVector3 axis0 = m_calculatedTransformB.getBasis().getColumn(0);
+			btVector3 axis1 = m_calculatedTransformA.getBasis().getColumn(1);
+			m_calculatedAxis[2] = axis0.cross(axis1);
+			m_calculatedAxis[0] = axis1.cross(m_calculatedAxis[2]);
+			m_calculatedAxis[1] = m_calculatedAxis[2].cross(axis0);
+			break;
+		}
+	case RO_YXZ :
+		{
+			//planes: yx,XZ normals: z, Y
+			//first rotate around z
+			//second rotate around x'= z.cross(Y)
+			//third rotate around y" = Y
+			btVector3 axis1 = m_calculatedTransformB.getBasis().getColumn(1);
+			btVector3 axis2 = m_calculatedTransformA.getBasis().getColumn(2);
+			m_calculatedAxis[0] = axis1.cross(axis2);
+			m_calculatedAxis[1] = axis2.cross(m_calculatedAxis[0]);
+			m_calculatedAxis[2] = m_calculatedAxis[0].cross(axis1);
+			break;
+		}
+	case RO_YZX :
+		{
+			//planes: yz,ZX normals: x, Y
+			//first rotate around x
+			//second rotate around z'= x.cross(Y)
+			//third rotate around y" = Y
+			btVector3 axis0 = m_calculatedTransformA.getBasis().getColumn(0);
+			btVector3 axis1 = m_calculatedTransformB.getBasis().getColumn(1);
+			m_calculatedAxis[2] = axis0.cross(axis1);
+			m_calculatedAxis[0] = axis1.cross(m_calculatedAxis[2]);
+			m_calculatedAxis[1] = m_calculatedAxis[2].cross(axis0);
+			break;
+		}
+	case RO_ZXY :
+		{
+			//planes: zx,XY normals: y, Z
+			//first rotate around y
+			//second rotate around x'= y.cross(Z)
+			//third rotate around z" = Z
+			btVector3 axis1 = m_calculatedTransformA.getBasis().getColumn(1);
+			btVector3 axis2 = m_calculatedTransformB.getBasis().getColumn(2);
+			m_calculatedAxis[0] = axis1.cross(axis2);
+			m_calculatedAxis[1] = axis2.cross(m_calculatedAxis[0]);
+			m_calculatedAxis[2] = m_calculatedAxis[0].cross(axis1);
+			break;
+		}
+	case RO_ZYX :
+		{
+			//planes: zy,YX normals: x, Z
+			//first rotate around x
+			//second rotate around y' = x.cross(Z)
+			//third rotate around z" = Z
+			btVector3 axis0 = m_calculatedTransformA.getBasis().getColumn(0);
+			btVector3 axis2 = m_calculatedTransformB.getBasis().getColumn(2);
+			m_calculatedAxis[1] = axis2.cross(axis0);
+			m_calculatedAxis[0] = m_calculatedAxis[1].cross(axis2);
+			m_calculatedAxis[2] = axis0.cross(m_calculatedAxis[1]);
+			break;
+		}
+	default:
+		btAssert(false);
+	}
+	m_calculatedAxis[0].normalize(); // NOTE(review): the cross products above are zero-length when the two source axes are parallel; normalize() behaviour for that input is not visible here - confirm
+	m_calculatedAxis[1].normalize();
+	m_calculatedAxis[2].normalize();
+void btGeneric6DofSpring2Constraint::calculateTransforms() // convenience overload: uses both bodies' current center-of-mass transforms
+	calculateTransforms(m_rbA.getCenterOfMassTransform(),m_rbB.getCenterOfMassTransform());
+void btGeneric6DofSpring2Constraint::calculateTransforms(const btTransform& transA,const btTransform& transB) // recomputes the cached frames, linear/angular diffs and the mass-weighting factors
+	m_calculatedTransformA = transA * m_frameInA; // constraint frame A composed with the supplied body-A transform
+	m_calculatedTransformB = transB * m_frameInB; // constraint frame B composed with the supplied body-B transform
+	calculateLinearInfo();
+	calculateAngleInfo();
+	btScalar miA = getRigidBodyA().getInvMass();
+	btScalar miB = getRigidBodyB().getInvMass();
+	m_hasStaticBody = (miA < SIMD_EPSILON) || (miB < SIMD_EPSILON); // a near-zero inverse mass is treated as a static body
+	btScalar miS = miA + miB;
+	if(miS > btScalar(0.f))
+	{
+		m_factA = miB / miS; // share of the total inverse mass contributed by body B
+	}
+	else 
+	{
+		m_factA = btScalar(0.5f); // no positive inverse mass on either body: split evenly
+	}
+	m_factB = btScalar(1.0f) - m_factA; // the two factors always sum to one
+void btGeneric6DofSpring2Constraint::testAngularLimitMotor(int axis_index) // refreshes the limit state of one angular axis from the current Euler angle
+	btScalar angle = m_calculatedAxisAngleDiff[axis_index]; // angle computed by calculateAngleInfo()
+	angle = btAdjustAngleToLimits(angle, m_angularLimits[axis_index].m_loLimit, m_angularLimits[axis_index].m_hiLimit); // NOTE(review): presumably picks the 2*pi-equivalent angle nearest the limit range - helper not visible here
+	m_angularLimits[axis_index].m_currentPosition = angle;
+	m_angularLimits[axis_index].testLimitValue(angle);
+void btGeneric6DofSpring2Constraint::getInfo1 (btConstraintInfo1* info) // counts the constraint rows this constraint needs and stores the total in info->m_numConstraintRows
+	//prepare constraint
+	calculateTransforms(m_rbA.getCenterOfMassTransform(),m_rbB.getCenterOfMassTransform());
+	info->m_numConstraintRows = 0;
+	info->nub = 0;
+	int i;
+	//test linear limits
+	for(i = 0; i < 3; i++)
+	{
+		     if (m_linearLimits.m_currentLimit[i]==4) info->m_numConstraintRows += 2; // state 4 needs two rows (get_limit_motor_info2 emits a low and a high row)
+		else if (m_linearLimits.m_currentLimit[i]!=0) info->m_numConstraintRows += 1; // NOTE(review): counts states 1/2 as a row too, but get_limit_motor_info2 only emits limit rows for 3 and 4 - confirm testLimitValue never yields 1 or 2
+		if (m_linearLimits.m_enableMotor[i] ) info->m_numConstraintRows += 1; // one row per enabled motor
+		if (m_linearLimits.m_enableSpring[i]) info->m_numConstraintRows += 1; // one row per enabled spring
+	}
+	//test angular limits
+	for (i=0;i<3 ;i++ )
+	{
+		testAngularLimitMotor(i); // refresh m_currentLimit for this axis before counting
+		     if (m_angularLimits[i].m_currentLimit==4) info->m_numConstraintRows += 2;
+		else if (m_angularLimits[i].m_currentLimit!=0) info->m_numConstraintRows += 1;
+		if (m_angularLimits[i].m_enableMotor ) info->m_numConstraintRows += 1;
+		if (m_angularLimits[i].m_enableSpring) info->m_numConstraintRows += 1;
+	}
+void btGeneric6DofSpring2Constraint::getInfo2 (btConstraintInfo2* info) // fills the solver rows: angular rows first, then linear rows
+	const btTransform& transA = m_rbA.getCenterOfMassTransform();
+	const btTransform& transB = m_rbB.getCenterOfMassTransform();
+	const btVector3& linVelA = m_rbA.getLinearVelocity();
+	const btVector3& linVelB = m_rbB.getLinearVelocity();
+	const btVector3& angVelA = m_rbA.getAngularVelocity();
+	const btVector3& angVelB = m_rbB.getAngularVelocity();
+	// for stability better to solve angular limits first
+	int row = setAngularLimits(info, 0,transA,transB,linVelA,linVelB,angVelA,angVelB); // returns the next free row index
+	setLinearLimits(info, row, transA,transB,linVelA,linVelB,angVelA,angVelB); // linear rows continue from there; the returned row count is unused
+int btGeneric6DofSpring2Constraint::setLinearLimits(btConstraintInfo2* info, int row, const btTransform& transA,const btTransform& transB,const btVector3& linVelA,const btVector3& linVelB,const btVector3& angVelA,const btVector3& angVelB) // writes the rows for the three linear axes starting at 'row'; returns the next free row index
+	//solve linear limits
+	btRotationalLimitMotor2 limot; // scratch motor, refilled for each active linear axis
+	for (int i=0;i<3 ;i++ )
+	{
+		if(m_linearLimits.m_currentLimit[i] || m_linearLimits.m_enableMotor[i] || m_linearLimits.m_enableSpring[i])
+		{ // re-use rotational motor code
+			limot.m_bounce                 = m_linearLimits.m_bounce[i];
+			limot.m_currentLimit           = m_linearLimits.m_currentLimit[i];
+			limot.m_currentPosition        = m_linearLimits.m_currentLinearDiff[i];
+			limot.m_currentLimitError      = m_linearLimits.m_currentLimitError[i];
+			limot.m_currentLimitErrorHi    = m_linearLimits.m_currentLimitErrorHi[i];
+			limot.m_enableMotor            = m_linearLimits.m_enableMotor[i];
+			limot.m_servoMotor             = m_linearLimits.m_servoMotor[i];
+			limot.m_servoTarget            = m_linearLimits.m_servoTarget[i];
+			limot.m_enableSpring           = m_linearLimits.m_enableSpring[i];
+			limot.m_springStiffness        = m_linearLimits.m_springStiffness[i];
+			limot.m_springStiffnessLimited = m_linearLimits.m_springStiffnessLimited[i];
+			limot.m_springDamping          = m_linearLimits.m_springDamping[i];
+			limot.m_springDampingLimited   = m_linearLimits.m_springDampingLimited[i];
+			limot.m_equilibriumPoint       = m_linearLimits.m_equilibriumPoint[i];
+			limot.m_hiLimit                = m_linearLimits.m_upperLimit[i];
+			limot.m_loLimit                = m_linearLimits.m_lowerLimit[i];
+			limot.m_maxMotorForce          = m_linearLimits.m_maxMotorForce[i];
+			limot.m_targetVelocity         = m_linearLimits.m_targetVelocity[i];
+			btVector3 axis = m_calculatedTransformA.getBasis().getColumn(i); // linear axes are the columns of frame A's basis
+			int flags = m_flags >> (i * BT_6DOF_FLAGS_AXIS_SHIFT2); // move this axis' override bits down to the low bits
+			limot.m_stopCFM  = (flags & BT_6DOF_FLAGS_CFM_STOP2) ? m_linearLimits.m_stopCFM[i] : info->cfm[0]; // per-axis override if its flag is set, otherwise the value supplied in info
+			limot.m_stopERP  = (flags & BT_6DOF_FLAGS_ERP_STOP2) ? m_linearLimits.m_stopERP[i] : info->erp;
+			limot.m_motorCFM = (flags & BT_6DOF_FLAGS_CFM_MOTO2) ? m_linearLimits.m_motorCFM[i] : info->cfm[0];
+			limot.m_motorERP = (flags & BT_6DOF_FLAGS_ERP_MOTO2) ? m_linearLimits.m_motorERP[i] : info->erp;
+			//rotAllowed is a bit of magic from the original 6dof. The calculation of it here is something that imitates the original behavior as much as possible.
+			int indx1 = (i + 1) % 3;
+			int indx2 = (i + 2) % 3;
+			int rotAllowed = 1; // rotations around orthos to current axis (it is used only when one of the bodies is static)
+			bool indx1Violated = m_angularLimits[indx1].m_currentLimit == 1 ||
+				m_angularLimits[indx1].m_currentLimit == 2 ||
+				( m_angularLimits[indx1].m_currentLimit == 3 && ( m_angularLimits[indx1].m_currentLimitError < -D6_LIMIT_ERROR_THRESHOLD_FOR_ROTATION || m_angularLimits[indx1].m_currentLimitError > D6_LIMIT_ERROR_THRESHOLD_FOR_ROTATION ) ) ||
+				( m_angularLimits[indx1].m_currentLimit == 4 && ( m_angularLimits[indx1].m_currentLimitError < -D6_LIMIT_ERROR_THRESHOLD_FOR_ROTATION || m_angularLimits[indx1].m_currentLimitErrorHi > D6_LIMIT_ERROR_THRESHOLD_FOR_ROTATION ) );
+			bool indx2Violated = m_angularLimits[indx2].m_currentLimit == 1 ||
+				m_angularLimits[indx2].m_currentLimit == 2 ||
+				( m_angularLimits[indx2].m_currentLimit == 3 && ( m_angularLimits[indx2].m_currentLimitError < -D6_LIMIT_ERROR_THRESHOLD_FOR_ROTATION || m_angularLimits[indx2].m_currentLimitError > D6_LIMIT_ERROR_THRESHOLD_FOR_ROTATION ) ) ||
+				( m_angularLimits[indx2].m_currentLimit == 4 && ( m_angularLimits[indx2].m_currentLimitError < -D6_LIMIT_ERROR_THRESHOLD_FOR_ROTATION || m_angularLimits[indx2].m_currentLimitErrorHi > D6_LIMIT_ERROR_THRESHOLD_FOR_ROTATION ) );
+			if( indx1Violated && indx2Violated ) // rotation is disallowed only when both orthogonal angular axes are violated
+			{
+				rotAllowed = 0;
+			}
+			row += get_limit_motor_info2(&limot, transA,transB,linVelA,linVelB,angVelA,angVelB, info, row, axis, 0, rotAllowed); // advance by the number of rows written
+		}
+	}
+	return row;
+int btGeneric6DofSpring2Constraint::setAngularLimits(btConstraintInfo2 *info, int row_offset, const btTransform& transA,const btTransform& transB,const btVector3& linVelA,const btVector3& linVelB,const btVector3& angVelA,const btVector3& angVelB) // writes the rows for the three angular axes starting at row_offset; returns the next free row index
+	int row = row_offset;
+	//order of rotational constraint rows
+	int cIdx[] = {0, 1, 2}; // default order, kept if m_rotateOrder matches no case
+	switch(m_rotateOrder)
+	{
+		case RO_XYZ : cIdx[0] = 0; cIdx[1] = 1; cIdx[2] = 2; break;
+		case RO_XZY : cIdx[0] = 0; cIdx[1] = 2; cIdx[2] = 1; break;
+		case RO_YXZ : cIdx[0] = 1; cIdx[1] = 0; cIdx[2] = 2; break;
+		case RO_YZX : cIdx[0] = 1; cIdx[1] = 2; cIdx[2] = 0; break;
+		case RO_ZXY : cIdx[0] = 2; cIdx[1] = 0; cIdx[2] = 1; break;
+		case RO_ZYX : cIdx[0] = 2; cIdx[1] = 1; cIdx[2] = 0; break;
+		default : btAssert(false);
+	}
+	for (int ii = 0; ii < 3 ; ii++ )
+	{
+		int i = cIdx[ii]; // axis index in the order selected above
+		if(m_angularLimits[i].m_currentLimit || m_angularLimits[i].m_enableMotor || m_angularLimits[i].m_enableSpring)
+		{
+			btVector3 axis = getAxis(i);
+			int flags = m_flags >> ((i + 3) * BT_6DOF_FLAGS_AXIS_SHIFT2); // angular axes use flag slots 3..5, matching setParam()
+			if(!(flags & BT_6DOF_FLAGS_CFM_STOP2)) // no override set: overwrite the stored member with the value supplied in info
+			{
+				m_angularLimits[i].m_stopCFM = info->cfm[0];
+			}
+			if(!(flags & BT_6DOF_FLAGS_ERP_STOP2))
+			{
+				m_angularLimits[i].m_stopERP = info->erp;
+			}
+			if(!(flags & BT_6DOF_FLAGS_CFM_MOTO2))
+			{
+				m_angularLimits[i].m_motorCFM = info->cfm[0];
+			}
+			if(!(flags & BT_6DOF_FLAGS_ERP_MOTO2))
+			{
+				m_angularLimits[i].m_motorERP = info->erp;
+			}
+			row += get_limit_motor_info2(&m_angularLimits[i],transA,transB,linVelA,linVelB,angVelA,angVelB, info,row,axis,1); // rotational=1; advance by the number of rows written
+		}
+	}
+	return row;
+void btGeneric6DofSpring2Constraint::setFrames(const btTransform& frameA, const btTransform& frameB) // replaces both constraint frames and refreshes the derived state
+	m_frameInA = frameA;
+	m_frameInB = frameB;
+	buildJacobian(); // NOTE(review): defined outside this excerpt - effect not visible here
+	calculateTransforms(); // recompute cached transforms and limit state for the new frames
+void btGeneric6DofSpring2Constraint::calculateLinearInfo() // computes the frame-B-minus-frame-A offset in frame A's basis and tests each axis against its limits
+	m_calculatedLinearDiff = m_calculatedTransformB.getOrigin() - m_calculatedTransformA.getOrigin(); // origin difference between the two cached frames
+	m_calculatedLinearDiff = m_calculatedTransformA.getBasis().inverse() * m_calculatedLinearDiff; // re-expressed in frame A's basis
+	for(int i = 0; i < 3; i++)
+	{
+		m_linearLimits.m_currentLinearDiff[i] = m_calculatedLinearDiff[i];
+		m_linearLimits.testLimitValue(i, m_calculatedLinearDiff[i]);
+	}
+void btGeneric6DofSpring2Constraint::calculateJacobi(btRotationalLimitMotor2 * limot, const btTransform& transA,const btTransform& transB, btConstraintInfo2 *info, int srow, btVector3& ax1, int rotational, int rotAllowed) // writes the Jacobian entries of one row (offset srow) for axis ax1; limot is not read here
+	btScalar *J1 = rotational ? info->m_J1angularAxis : info->m_J1linearAxis; // angular rows fill the angular block, linear rows the linear block
+	btScalar *J2 = rotational ? info->m_J2angularAxis : info->m_J2linearAxis;
+	J1[srow+0] = ax1[0];
+	J1[srow+1] = ax1[1];
+	J1[srow+2] = ax1[2];
+	J2[srow+0] = -ax1[0]; // body B gets the opposite sign
+	J2[srow+1] = -ax1[1];
+	J2[srow+2] = -ax1[2];
+	if(!rotational) // linear rows also need an angular part from the lever arms
+	{
+		btVector3 tmpA, tmpB, relA, relB;
+		// get vector from bodyB to frameB in WCS
+		relB = m_calculatedTransformB.getOrigin() - transB.getOrigin();
+		// same for bodyA
+		relA = m_calculatedTransformA.getOrigin() - transA.getOrigin();
+		tmpA = relA.cross(ax1);
+		tmpB = relB.cross(ax1);
+		if(m_hasStaticBody && (!rotAllowed)) // scale the angular terms by the mass factors computed in calculateTransforms()
+		{
+			tmpA *= m_factA;
+			tmpB *= m_factB;
+		}
+		int i;
+		for (i=0; i<3; i++) info->m_J1angularAxis[srow+i] = tmpA[i];
+		for (i=0; i<3; i++) info->m_J2angularAxis[srow+i] = -tmpB[i];
+	}
+int btGeneric6DofSpring2Constraint::get_limit_motor_info2( // emits the limit, motor, servo and spring rows for one axis starting at 'row'; returns how many rows were written
+	btRotationalLimitMotor2 * limot,
+	const btTransform& transA,const btTransform& transB,const btVector3& linVelA,const btVector3& linVelB,const btVector3& angVelA,const btVector3& angVelB,
+	btConstraintInfo2 *info, int row, btVector3& ax1, int rotational,int rotAllowed)
+	int count = 0; // rows written so far
+	int srow = row * info->rowskip; // offset of the current row in the info arrays
+	if (limot->m_currentLimit==4) // state 4: two one-sided rows, first using m_currentLimitError, then m_currentLimitErrorHi
+	{
+		btScalar vel = rotational ? angVelA.dot(ax1) - angVelB.dot(ax1) : linVelA.dot(ax1) - linVelB.dot(ax1); // relative velocity (A minus B) along the axis
+		calculateJacobi(limot,transA,transB,info,srow,ax1,rotational,rotAllowed);
+		info->m_constraintError[srow] = info->fps * limot->m_stopERP * limot->m_currentLimitError * (rotational ? -1 : 1);
+		if (rotational) {
+			if (info->m_constraintError[srow]-vel*limot->m_stopERP > 0) {
+				btScalar bounceerror = -limot->m_bounce* vel;
+				if (bounceerror > info->m_constraintError[srow]) info->m_constraintError[srow] = bounceerror; // the bounce term replaces the error only when it is larger
+			}
+		} else {
+			if (info->m_constraintError[srow]-vel*limot->m_stopERP < 0) {
+				btScalar bounceerror = -limot->m_bounce* vel;
+				if (bounceerror < info->m_constraintError[srow]) info->m_constraintError[srow] = bounceerror; // mirrored comparison for the linear case
+			}
+		}
+		info->m_lowerLimit[srow] = rotational ? 0 : -SIMD_INFINITY; // one-sided impulse bounds
+		info->m_upperLimit[srow] = rotational ? SIMD_INFINITY : 0;
+		info->cfm[srow] = limot->m_stopCFM;
+		srow += info->rowskip;
+		++count;
+		calculateJacobi(limot,transA,transB,info,srow,ax1,rotational,rotAllowed);
+		info->m_constraintError[srow] = info->fps * limot->m_stopERP * limot->m_currentLimitErrorHi * (rotational ? -1 : 1);
+		if (rotational) {
+			if (info->m_constraintError[srow]-vel*limot->m_stopERP < 0) {
+				btScalar bounceerror = -limot->m_bounce* vel;
+				if (bounceerror < info->m_constraintError[srow]) info->m_constraintError[srow] = bounceerror;
+			}
+		} else {
+			if (info->m_constraintError[srow]-vel*limot->m_stopERP > 0) {
+				btScalar bounceerror = -limot->m_bounce* vel;
+				if (bounceerror > info->m_constraintError[srow]) info->m_constraintError[srow] = bounceerror;
+			}
+		}
+		info->m_lowerLimit[srow] = rotational ? -SIMD_INFINITY : 0; // bounds mirrored relative to the first row
+		info->m_upperLimit[srow] = rotational ? 0 : SIMD_INFINITY;
+		info->cfm[srow] = limot->m_stopCFM;
+		srow += info->rowskip;
+		++count;
+	} else
+	if (limot->m_currentLimit==3) // state 3: a single row with unbounded impulse in both directions
+	{
+		calculateJacobi(limot,transA,transB,info,srow,ax1,rotational,rotAllowed);
+		info->m_constraintError[srow] = info->fps * limot->m_stopERP * limot->m_currentLimitError * (rotational ? -1 : 1);
+		info->m_lowerLimit[srow] = -SIMD_INFINITY;
+		info->m_upperLimit[srow] = SIMD_INFINITY;
+		info->cfm[srow] = limot->m_stopCFM;
+		srow += info->rowskip;
+		++count;
+	}
+	if (limot->m_enableMotor && !limot->m_servoMotor) // plain velocity motor row
+	{
+		calculateJacobi(limot,transA,transB,info,srow,ax1,rotational,rotAllowed);
+		btScalar tag_vel = rotational ? limot->m_targetVelocity : -limot->m_targetVelocity;
+		btScalar mot_fact = getMotorFactor(limot->m_currentPosition, 
+			limot->m_loLimit,
+			limot->m_hiLimit,
+			tag_vel,
+			info->fps * limot->m_motorERP);
+		info->m_constraintError[srow] = mot_fact * limot->m_targetVelocity;
+		info->m_lowerLimit[srow] = -limot->m_maxMotorForce; // impulse clamped to +/- m_maxMotorForce
+		info->m_upperLimit[srow] = limot->m_maxMotorForce;
+		info->cfm[srow] = limot->m_motorCFM;
+		srow += info->rowskip;
+		++count;
+	}
+	if (limot->m_enableMotor && limot->m_servoMotor) // servo motor row: drives the position towards m_servoTarget
+	{
+		btScalar error = limot->m_currentPosition - limot->m_servoTarget;
+		calculateJacobi(limot,transA,transB,info,srow,ax1,rotational,rotAllowed);
+		btScalar targetvelocity = error<0 ? -limot->m_targetVelocity : limot->m_targetVelocity; // sign follows the side of the target the position is on
+		btScalar tag_vel = -targetvelocity;
+		btScalar mot_fact;
+		if(error != 0)
+		{
+			btScalar lowLimit;
+			btScalar hiLimit;
+			if(limot->m_loLimit > limot->m_hiLimit) // lo > hi: only the servo target bounds the motion, on one side
+			{
+				lowLimit = error > 0 ? limot->m_servoTarget : -SIMD_INFINITY;
+				hiLimit  = error < 0 ? limot->m_servoTarget :  SIMD_INFINITY;
+			}
+			else
+			{
+				lowLimit = error > 0 && limot->m_servoTarget>limot->m_loLimit ? limot->m_servoTarget : limot->m_loLimit; // the servo target tightens a limit only when it lies inside it
+				hiLimit  = error < 0 && limot->m_servoTarget<limot->m_hiLimit ? limot->m_servoTarget : limot->m_hiLimit;
+			}
+			mot_fact = getMotorFactor(limot->m_currentPosition, lowLimit, hiLimit, tag_vel, info->fps * limot->m_motorERP);
+		} 
+		else 
+		{
+			mot_fact = 0; // exactly at the servo target: no drive
+		}
+		info->m_constraintError[srow] = mot_fact * targetvelocity * (rotational ? -1 : 1);
+		info->m_lowerLimit[srow] = -limot->m_maxMotorForce;
+		info->m_upperLimit[srow] = limot->m_maxMotorForce;
+		info->cfm[srow] = limot->m_motorCFM;
+		srow += info->rowskip;
+		++count;
+	}
+	if (limot->m_enableSpring) // spring row: stiffness and damping terms with limits derived from them
+	{
+		btScalar error = limot->m_currentPosition - limot->m_equilibriumPoint; // displacement from the spring's equilibrium point
+		calculateJacobi(limot,transA,transB,info,srow,ax1,rotational,rotAllowed);
+		//btScalar cfm = 1.0 / ((1.0/info->fps)*limot->m_springStiffness+ limot->m_springDamping);
+		//if(cfm > 0.99999)
+		//	cfm = 0.99999;
+		//btScalar erp = (1.0/info->fps)*limot->m_springStiffness / ((1.0/info->fps)*limot->m_springStiffness + limot->m_springDamping);
+		//info->m_constraintError[srow] = info->fps * erp * error * (rotational ? -1.0 : 1.0);
+		//info->m_lowerLimit[srow] = -SIMD_INFINITY;
+		//info->m_upperLimit[srow] = SIMD_INFINITY;
+		btScalar dt = BT_ONE / info->fps; // time step
+		btScalar kd = limot->m_springDamping;
+		btScalar ks = limot->m_springStiffness;
+		btScalar vel = rotational ? angVelA.dot(ax1) - angVelB.dot(ax1) : linVelA.dot(ax1) - linVelB.dot(ax1);
+//		btScalar erp = 0.1;
+		btScalar cfm = BT_ZERO;
+		btScalar mA = BT_ONE / m_rbA.getInvMass(); // NOTE(review): divides by the inverse mass without a zero check - presumably relies on floating-point infinity for static bodies; confirm
+		btScalar mB = BT_ONE / m_rbB.getInvMass();
+		btScalar m = mA > mB ? mB : mA; // the smaller of the two masses
+		btScalar angularfreq = sqrt(ks / m);
+		//limit stiffness (the spring should not be sampled faster than the quarter of its angular frequency)
+		if(limot->m_springStiffnessLimited && 0.25 < angularfreq * dt)
+		{
+			ks = BT_ONE / dt / dt / btScalar(16.0) * m; // the stiffness at which angularfreq * dt equals 0.25
+		}
+		//avoid damping that would blow up the spring
+		if(limot->m_springDampingLimited && kd * dt > m)
+		{
+			kd = m / dt;
+		}
+		btScalar fs = ks * error * dt; // stiffness term over one step
+		btScalar fd = -kd * (vel) * (rotational ? -1 : 1) * dt; // damping term over one step
+		btScalar f = (fs+fd);
+		info->m_constraintError[srow] = (vel + f * (rotational ? -1 : 1)) ;
+		btScalar minf = f < fd ? f : fd; // min(f, fd)
+		btScalar maxf = f < fd ? fd : f; // max(f, fd)
+		if(!rotational)
+		{
+			info->m_lowerLimit[srow] = minf > 0 ? 0 : minf; // the limit interval always contains zero
+			info->m_upperLimit[srow] = maxf < 0 ? 0 : maxf;
+		}
+		else
+		{
+			info->m_lowerLimit[srow] = -maxf > 0 ? 0 : -maxf; // same interval, negated for rotational rows
+			info->m_upperLimit[srow] = -minf < 0 ? 0 : -minf;
+		}
+		info->cfm[srow] = cfm;
+		srow += info->rowskip;
+		++count;
+	}
+	return count;
+//override the default global value of a parameter (such as ERP or CFM), optionally provide the axis (0..5).
+//If no axis is provided, it uses the default axis for this constraint.
+void btGeneric6DofSpring2Constraint::setParam(int num, btScalar value, int axis) // axis 0..2 writes m_linearLimits, axis 3..5 writes m_angularLimits[axis-3]; each write also sets that axis' override bit in m_flags
+	if((axis >= 0) && (axis < 3))
+	{
+		switch(num)
+		{
+				m_linearLimits.m_stopERP[axis] = value;
+				m_flags |= BT_6DOF_FLAGS_ERP_STOP2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2); // mark the stop-ERP override for this axis
+				break;
+				m_linearLimits.m_stopCFM[axis] = value;
+				m_flags |= BT_6DOF_FLAGS_CFM_STOP2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2); // mark the stop-CFM override for this axis
+				break;
+				m_linearLimits.m_motorERP[axis] = value;
+				m_flags |= BT_6DOF_FLAGS_ERP_MOTO2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2); // mark the motor-ERP override for this axis
+				break;
+				m_linearLimits.m_motorCFM[axis] = value;
+				m_flags |= BT_6DOF_FLAGS_CFM_MOTO2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2); // mark the motor-CFM override for this axis
+				break;
+			default : 
+				btAssertConstrParams(0); // unsupported parameter id
+		}
+	}
+	else if((axis >=3) && (axis < 6))
+	{
+		switch(num)
+		{
+				m_angularLimits[axis - 3].m_stopERP = value;
+				m_flags |= BT_6DOF_FLAGS_ERP_STOP2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2); // shift uses axis 3..5 directly, matching (i + 3) in setAngularLimits()
+				break;
+				m_angularLimits[axis - 3].m_stopCFM = value;
+				m_flags |= BT_6DOF_FLAGS_CFM_STOP2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2);
+				break;
+				m_angularLimits[axis - 3].m_motorERP = value;
+				m_flags |= BT_6DOF_FLAGS_ERP_MOTO2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2);
+				break;
+				m_angularLimits[axis - 3].m_motorCFM = value;
+				m_flags |= BT_6DOF_FLAGS_CFM_MOTO2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2);
+				break;
+			default : 
+				btAssertConstrParams(0); // unsupported parameter id
+		}
+	}
+	else
+	{
+		btAssertConstrParams(0); // axis outside 0..5
+	}
+//Return the locally stored value of a parameter for one axis (0..2 linear, 3..5 angular); each case first asserts that the matching per-axis bit is set in m_flags.
+btScalar btGeneric6DofSpring2Constraint::getParam(int num, int axis) const 
+	btScalar retVal = 0; // stays 0 when num or axis is not handled below
+	if((axis >= 0) && (axis < 3)) // linear axes
+	{
+		switch(num)
+		{
+				btAssertConstrParams(m_flags & (BT_6DOF_FLAGS_ERP_STOP2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2))); // same bit that setParam sets for this parameter and axis
+				retVal = m_linearLimits.m_stopERP[axis];
+				break;
+				btAssertConstrParams(m_flags & (BT_6DOF_FLAGS_CFM_STOP2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2)));
+				retVal = m_linearLimits.m_stopCFM[axis];
+				break;
+				btAssertConstrParams(m_flags & (BT_6DOF_FLAGS_ERP_MOTO2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2)));
+				retVal = m_linearLimits.m_motorERP[axis];
+				break;
+				btAssertConstrParams(m_flags & (BT_6DOF_FLAGS_CFM_MOTO2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2)));
+				retVal = m_linearLimits.m_motorCFM[axis];
+				break;
+			default : // num is not one of the handled parameter ids
+				btAssertConstrParams(0);
+		}
+	}
+	else if((axis >=3) && (axis < 6)) // angular axes, stored at index axis - 3
+	{
+		switch(num)
+		{
+				btAssertConstrParams(m_flags & (BT_6DOF_FLAGS_ERP_STOP2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2)));
+				retVal = m_angularLimits[axis - 3].m_stopERP;
+				break;
+				btAssertConstrParams(m_flags & (BT_6DOF_FLAGS_CFM_STOP2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2)));
+				retVal = m_angularLimits[axis - 3].m_stopCFM;
+				break;
+				btAssertConstrParams(m_flags & (BT_6DOF_FLAGS_ERP_MOTO2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2)));
+				retVal = m_angularLimits[axis - 3].m_motorERP;
+				break;
+				btAssertConstrParams(m_flags & (BT_6DOF_FLAGS_CFM_MOTO2 << (axis * BT_6DOF_FLAGS_AXIS_SHIFT2)));
+				retVal = m_angularLimits[axis - 3].m_motorCFM;
+				break;
+			default : // num is not one of the handled parameter ids
+				btAssertConstrParams(0);
+		}
+	}
+	else
+	{
+		btAssertConstrParams(0); // axis outside 0..5 (including the declared default of -1)
+	}
+	return retVal;
+void btGeneric6DofSpring2Constraint::setAxis(const btVector3& axis1,const btVector3& axis2) // rebuilds m_frameInA / m_frameInB from two world-space axes
+	btVector3 zAxis = axis1.normalized(); // axis1 becomes the frame's Z axis
+	btVector3 yAxis = axis2.normalized(); // axis2 becomes the frame's Y axis
+	btVector3 xAxis = yAxis.cross(zAxis); // we want right coordinate system
+	btTransform frameInW;
+	frameInW.setIdentity(); // only the basis is overwritten below; the origin keeps its identity value
+	frameInW.getBasis().setValue( xAxis[0], yAxis[0], zAxis[0], // each argument row holds one component of x/y/z, i.e. the axes are laid out as columns (assuming setValue takes rows - TODO confirm)
+	                              xAxis[1], yAxis[1], zAxis[1],
+	                              xAxis[2], yAxis[2], zAxis[2]);
+	// now get constraint frame in local coordinate systems
+	m_frameInA = m_rbA.getCenterOfMassTransform().inverse() * frameInW;
+	m_frameInB = m_rbB.getCenterOfMassTransform().inverse() * frameInW;
+	calculateTransforms(); // refresh the cached calculated transforms for the new frames
+void btGeneric6DofSpring2Constraint::setBounce(int index, btScalar bounce) // stores the bounce value for one DOF
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	if (index<3) // 0..2: linear axis
+		m_linearLimits.m_bounce[index] = bounce;
+	else // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3].m_bounce = bounce;
+void btGeneric6DofSpring2Constraint::enableMotor(int index, bool onOff) // stores the motor on/off flag for one DOF
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	if (index<3) // 0..2: linear axis
+		m_linearLimits.m_enableMotor[index] = onOff;
+	else // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3].m_enableMotor = onOff;
+void btGeneric6DofSpring2Constraint::setServo(int index, bool onOff) // stores the servo-mode flag for one DOF; it does not touch m_enableMotor
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	if (index<3) // 0..2: linear axis
+		m_linearLimits.m_servoMotor[index] = onOff;
+	else // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3].m_servoMotor = onOff;
+void btGeneric6DofSpring2Constraint::setTargetVelocity(int index, btScalar velocity) // stores the motor target velocity for one DOF
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	if (index<3) // 0..2: linear axis
+		m_linearLimits.m_targetVelocity[index] = velocity;
+	else // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3].m_targetVelocity = velocity;
+void btGeneric6DofSpring2Constraint::setServoTarget(int index, btScalar target) // stores the servo target for one DOF
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	if (index<3) // 0..2: linear axis
+		m_linearLimits.m_servoTarget[index] = target;
+	else // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3].m_servoTarget = target;
+void btGeneric6DofSpring2Constraint::setMaxMotorForce(int index, btScalar force) // stores the maximum motor force for one DOF
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	if (index<3) // 0..2: linear axis
+		m_linearLimits.m_maxMotorForce[index] = force;
+	else // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3].m_maxMotorForce = force;
+void btGeneric6DofSpring2Constraint::enableSpring(int index, bool onOff) // stores the spring on/off flag for one DOF
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	if (index<3) // 0..2: linear axis
+		m_linearLimits.m_enableSpring[index] = onOff;
+	else // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3] .m_enableSpring = onOff;
+void btGeneric6DofSpring2Constraint::setStiffness(int index, btScalar stiffness, bool limitIfNeeded) // stores the spring stiffness and its "may be limited" flag for one DOF
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	if (index<3) { // 0..2: linear axis
+		m_linearLimits.m_springStiffness[index] = stiffness;
+		m_linearLimits.m_springStiffnessLimited[index] = limitIfNeeded; // only recorded here; no clamping happens in this setter
+	} else { // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3].m_springStiffness = stiffness;
+		m_angularLimits[index - 3].m_springStiffnessLimited = limitIfNeeded;
+	}
+void btGeneric6DofSpring2Constraint::setDamping(int index, btScalar damping, bool limitIfNeeded) // stores the spring damping and its "may be limited" flag for one DOF
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	if (index<3) { // 0..2: linear axis
+		m_linearLimits.m_springDamping[index] = damping;
+		m_linearLimits.m_springDampingLimited[index] = limitIfNeeded; // only recorded here; no clamping happens in this setter
+	} else { // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3].m_springDamping = damping;
+		m_angularLimits[index - 3].m_springDampingLimited = limitIfNeeded;
+	}
+void btGeneric6DofSpring2Constraint::setEquilibriumPoint() // makes the current relative pose the spring equilibrium for all six DOF
+	calculateTransforms(); // refresh m_calculatedLinearDiff / m_calculatedAxisAngleDiff before copying them
+	int i;
+	for( i = 0; i < 3; i++) // linear DOF
+		m_linearLimits.m_equilibriumPoint[i] = m_calculatedLinearDiff[i];
+	for(i = 0; i < 3; i++) // angular DOF
+		m_angularLimits[i].m_equilibriumPoint = m_calculatedAxisAngleDiff[i];
+void btGeneric6DofSpring2Constraint::setEquilibriumPoint(int index) // makes the current relative pose the spring equilibrium for one DOF
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	calculateTransforms(); // refresh the calculated differences before copying one of them
+	if (index<3) // 0..2: linear axis
+		m_linearLimits.m_equilibriumPoint[index] = m_calculatedLinearDiff[index];
+	else // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3] .m_equilibriumPoint = m_calculatedAxisAngleDiff[index - 3];
+void btGeneric6DofSpring2Constraint::setEquilibriumPoint(int index, btScalar val) // stores an explicit spring equilibrium value for one DOF; no transforms are recalculated
+	btAssert((index >= 0) && (index < 6)); // valid DOF indices are 0..5
+	if (index<3) // 0..2: linear axis
+		m_linearLimits.m_equilibriumPoint[index] = val;
+	else // 3..5: angular axis, stored at index - 3
+		m_angularLimits[index - 3] .m_equilibriumPoint = val;
+//////////////////////////// btRotationalLimitMotor2 ////////////////////////////////////
+void btRotationalLimitMotor2::testLimitValue(btScalar test_value) // classifies the axis as free/locked/limited and stores test_value's offset from the limit(s)
+	//we can't normalize the angles here because we would lose the sign that we use later, but it doesn't seem to be a problem
+	if(m_loLimit > m_hiLimit) { // free axis: no limit error
+		m_currentLimit = 0;
+		m_currentLimitError = btScalar(0.f);
+	}
+	else if(m_loLimit == m_hiLimit) { // locked axis: single error against the shared limit value
+		m_currentLimitError = test_value - m_loLimit;
+		m_currentLimit = 3;
+	} else { // limited axis: errors against both the lower and the upper limit
+		m_currentLimitError = test_value - m_loLimit;
+		m_currentLimitErrorHi = test_value - m_hiLimit; // only this branch updates m_currentLimitErrorHi
+		m_currentLimit = 4;
+	}
+//////////////////////////// btTranslationalLimitMotor2 ////////////////////////////////////
+void btTranslationalLimitMotor2::testLimitValue(int limitIndex, btScalar test_value) // per-axis version: classifies axis limitIndex as free/locked/limited and stores the limit error(s)
+	btScalar loLimit = m_lowerLimit[limitIndex];
+	btScalar hiLimit = m_upperLimit[limitIndex];
+	if(loLimit > hiLimit) { // free axis: no limit error
+		m_currentLimitError[limitIndex] = 0;
+		m_currentLimit[limitIndex] = 0;
+	}
+	else if(loLimit == hiLimit) { // locked axis: single error against the shared limit value
+		m_currentLimitError[limitIndex] = test_value - loLimit;
+		m_currentLimit[limitIndex] = 3;
+	} else { // limited axis: errors against both the lower and the upper limit
+		m_currentLimitError[limitIndex] = test_value - loLimit;
+		m_currentLimitErrorHi[limitIndex] = test_value - hiLimit; // only this branch updates m_currentLimitErrorHi
+		m_currentLimit[limitIndex] = 4;
+	}
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.h
new file mode 100644
index 00000000..193e51e3
--- /dev/null
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.h
@@ -0,0 +1,674 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+2014 May: btGeneric6DofSpring2Constraint is created from the original (2.82.2712) btGeneric6DofConstraint by Gabor Puhr and Tamas Umenhoffer
+- Much more accurate and stable in a lot of situations. (Especially when a sleeping chain of RBs connected with 6dof2 is pulled)
+- Stable and accurate spring with minimal energy loss that works with all of the solvers. (latter is not true for the original 6dof spring)
+- Servo motor functionality
+- Much more accurate bouncing. 0 really means zero bouncing (not true for the original 6dof) and there is only a minimal energy loss when the value is 1 (because of the solvers' precision)
+- Rotation order for the Euler system can be set. (One axis' freedom is still limited to pi/2)
+- It is slower than the original 6dof. There is no exact ratio, but half speed is a good estimation.
+- At bouncing the correct velocity is calculated, but not the correct position. (This is because the solver can correct position or velocity, but not both.)
+/// 2009 March: btGeneric6DofConstraint refactored by Roman Ponomarev
+/// Added support for generic constraint solver through getInfo1/getInfo2 methods
+btGeneric6DofConstraint Refactored by Francisco León
+email: projectileman@yahoo.com
+#include "LinearMath/btVector3.h"
+#include "btJacobianEntry.h"
+#include "btTypedConstraint.h"
+class btRigidBody;
+#define btGeneric6DofSpring2ConstraintData2		btGeneric6DofSpring2ConstraintDoubleData2
+#define btGeneric6DofSpring2ConstraintDataName	"btGeneric6DofSpring2ConstraintDoubleData2"
+#define btGeneric6DofSpring2ConstraintData2		btGeneric6DofSpring2ConstraintData
+#define btGeneric6DofSpring2ConstraintDataName	"btGeneric6DofSpring2ConstraintData"
+enum RotateOrder
+	RO_XYZ=0,
+class btRotationalLimitMotor2 // limit, motor, servo and spring settings plus cached limit state for a single rotational axis
+// upper < lower means free
+// upper == lower means locked
+// upper > lower means limited
+	btScalar m_loLimit;
+	btScalar m_hiLimit;
+	btScalar m_bounce;
+	btScalar m_stopERP;
+	btScalar m_stopCFM;
+	btScalar m_motorERP;
+	btScalar m_motorCFM;
+	bool     m_enableMotor;
+	btScalar m_targetVelocity;
+	btScalar m_maxMotorForce;
+	bool     m_servoMotor;
+	btScalar m_servoTarget;
+	bool     m_enableSpring;
+	btScalar m_springStiffness;
+	bool     m_springStiffnessLimited;
+	btScalar m_springDamping;
+	bool     m_springDampingLimited;
+	btScalar m_equilibriumPoint;
+	btScalar m_currentLimitError; // written by testLimitValue(): test value minus m_loLimit (0 when the axis is free)
+	btScalar m_currentLimitErrorHi; // written by testLimitValue() only in the limited case: test value minus m_hiLimit
+	btScalar m_currentPosition;
+	int      m_currentLimit; // written by testLimitValue(): 0 = free, 3 = locked, 4 = limited
+	btRotationalLimitMotor2() // defaults: free axis, motor/servo/spring all disabled
+	{
+		m_loLimit                = 1.0f; // lo > hi, so the axis starts out free (isLimited() returns false)
+		m_hiLimit                = -1.0f;
+		m_bounce                 = 0.0f;
+		m_stopERP                = 0.2f;
+		m_stopCFM                = 0.f;
+		m_motorERP               = 0.9f;
+		m_motorCFM               = 0.f;
+		m_enableMotor            = false;
+		m_targetVelocity         = 0;
+		m_maxMotorForce          = 0.1f;
+		m_servoMotor             = false;
+		m_servoTarget            = 0;
+		m_enableSpring           = false;
+		m_springStiffness        = 0;
+		m_springStiffnessLimited = false;
+		m_springDamping          = 0;
+		m_springDampingLimited   = false;
+		m_equilibriumPoint       = 0;
+		m_currentLimitError   = 0;
+		m_currentLimitErrorHi = 0;
+		m_currentPosition     = 0;
+		m_currentLimit        = 0;
+	}
+	btRotationalLimitMotor2(const btRotationalLimitMotor2 & limot) // member-by-member copy of every field, including the cached limit state
+	{
+		m_loLimit                = limot.m_loLimit;
+		m_hiLimit                = limot.m_hiLimit;
+		m_bounce                 = limot.m_bounce;
+		m_stopERP                = limot.m_stopERP;
+		m_stopCFM                = limot.m_stopCFM;
+		m_motorERP               = limot.m_motorERP;
+		m_motorCFM               = limot.m_motorCFM;
+		m_enableMotor            = limot.m_enableMotor;
+		m_targetVelocity         = limot.m_targetVelocity;
+		m_maxMotorForce          = limot.m_maxMotorForce;
+		m_servoMotor             = limot.m_servoMotor;
+		m_servoTarget            = limot.m_servoTarget;
+		m_enableSpring           = limot.m_enableSpring;
+		m_springStiffness        = limot.m_springStiffness;
+		m_springStiffnessLimited = limot.m_springStiffnessLimited;
+		m_springDamping          = limot.m_springDamping;
+		m_springDampingLimited   = limot.m_springDampingLimited;
+		m_equilibriumPoint       = limot.m_equilibriumPoint;
+		m_currentLimitError   = limot.m_currentLimitError;
+		m_currentLimitErrorHi = limot.m_currentLimitErrorHi;
+		m_currentPosition     = limot.m_currentPosition;
+		m_currentLimit        = limot.m_currentLimit;
+	}
+	bool isLimited() // true for locked and limited axes, false only for a free axis
+	{
+		if(m_loLimit > m_hiLimit) return false; // free
+		return true;
+	}
+	void testLimitValue(btScalar test_value); // updates m_currentLimit and the limit error(s) for test_value
+class btTranslationalLimitMotor2 // limit, motor, servo and spring settings plus cached limit state for the three linear axes (one component per axis)
+// upper < lower means free
+// upper == lower means locked
+// upper > lower means limited
+	btVector3 m_lowerLimit;
+	btVector3 m_upperLimit;
+	btVector3 m_bounce;
+	btVector3 m_stopERP;
+	btVector3 m_stopCFM;
+	btVector3 m_motorERP;
+	btVector3 m_motorCFM;
+	bool      m_enableMotor[3];
+	bool      m_servoMotor[3];
+	bool      m_enableSpring[3];
+	btVector3 m_servoTarget;
+	btVector3 m_springStiffness;
+	bool      m_springStiffnessLimited[3];
+	btVector3 m_springDamping;
+	bool      m_springDampingLimited[3];
+	btVector3 m_equilibriumPoint;
+	btVector3 m_targetVelocity;
+	btVector3 m_maxMotorForce;
+	btVector3 m_currentLimitError; // written by testLimitValue(): test value minus lower limit (0 when the axis is free)
+	btVector3 m_currentLimitErrorHi; // written by testLimitValue() only in the limited case: test value minus upper limit
+	btVector3 m_currentLinearDiff;
+	int       m_currentLimit[3]; // written by testLimitValue(): 0 = free, 3 = locked, 4 = limited
+	btTranslationalLimitMotor2() // defaults: all three axes locked (lower == upper == 0), motor/servo/spring disabled
+	{
+		m_lowerLimit         .setValue(0.f , 0.f , 0.f );
+		m_upperLimit         .setValue(0.f , 0.f , 0.f );
+		m_bounce             .setValue(0.f , 0.f , 0.f );
+		m_stopERP            .setValue(0.2f, 0.2f, 0.2f);
+		m_stopCFM            .setValue(0.f , 0.f , 0.f );
+		m_motorERP           .setValue(0.9f, 0.9f, 0.9f);
+		m_motorCFM           .setValue(0.f , 0.f , 0.f );
+		m_currentLimitError  .setValue(0.f , 0.f , 0.f );
+		m_currentLimitErrorHi.setValue(0.f , 0.f , 0.f );
+		m_currentLinearDiff  .setValue(0.f , 0.f , 0.f );
+		for(int i=0; i < 3; i++) 
+		{
+			m_enableMotor[i]            = false;
+			m_servoMotor[i]             = false;
+			m_enableSpring[i]           = false;
+			m_servoTarget[i]            = btScalar(0.f);
+			m_springStiffness[i]        = btScalar(0.f);
+			m_springStiffnessLimited[i] = false;
+			m_springDamping[i]          = btScalar(0.f);
+			m_springDampingLimited[i]   = false;
+			m_equilibriumPoint[i]       = btScalar(0.f);
+			m_targetVelocity[i]         = btScalar(0.f);
+			m_maxMotorForce[i]          = btScalar(0.f);
+			m_currentLimit[i]     = 0;
+		}
+	}
+	btTranslationalLimitMotor2(const btTranslationalLimitMotor2 & other ) // copies every field; whole vectors first, then the per-axis entries
+	{
+		m_lowerLimit          = other.m_lowerLimit;
+		m_upperLimit          = other.m_upperLimit;
+		m_bounce              = other.m_bounce;
+		m_stopERP             = other.m_stopERP;
+		m_stopCFM             = other.m_stopCFM;
+		m_motorERP            = other.m_motorERP;
+		m_motorCFM            = other.m_motorCFM;
+		m_currentLimitError   = other.m_currentLimitError;
+		m_currentLimitErrorHi = other.m_currentLimitErrorHi;
+		m_currentLinearDiff   = other.m_currentLinearDiff;
+		for(int i=0; i < 3; i++) 
+		{
+			m_enableMotor[i]            = other.m_enableMotor[i];
+			m_servoMotor[i]             = other.m_servoMotor[i];
+			m_enableSpring[i]           = other.m_enableSpring[i];
+			m_servoTarget[i]            = other.m_servoTarget[i];
+			m_springStiffness[i]        = other.m_springStiffness[i];
+			m_springStiffnessLimited[i] = other.m_springStiffnessLimited[i];
+			m_springDamping[i]          = other.m_springDamping[i];
+			m_springDampingLimited[i]   = other.m_springDampingLimited[i];
+			m_equilibriumPoint[i]       = other.m_equilibriumPoint[i];
+			m_targetVelocity[i]         = other.m_targetVelocity[i];
+			m_maxMotorForce[i]          = other.m_maxMotorForce[i];
+			m_currentLimit[i]     = other.m_currentLimit[i];
+		}
+	}
+	inline bool isLimited(int limitIndex) // true for locked and limited axes, false only when upper < lower (free)
+	{
+		return (m_upperLimit[limitIndex] >= m_lowerLimit[limitIndex]);
+	}
+	void testLimitValue(int limitIndex, btScalar test_value); // updates m_currentLimit and the limit error(s) of axis limitIndex for test_value
+enum bt6DofFlags2
+#define BT_6DOF_FLAGS_AXIS_SHIFT2 4 // bits per axis
+ATTRIBUTE_ALIGNED16(class) btGeneric6DofSpring2Constraint : public btTypedConstraint // 6-DOF constraint: DOF 0..2 are linear, DOF 3..5 are angular
+	btTransform m_frameInA; // constraint frame relative to body A
+	btTransform m_frameInB; // constraint frame relative to body B
+	btJacobianEntry m_jacLinear[3];
+	btJacobianEntry m_jacAng[3];
+	btTranslationalLimitMotor2 m_linearLimits; // settings for DOF 0..2
+	btRotationalLimitMotor2 m_angularLimits[3]; // settings for DOF 3..5, stored at index DOF - 3
+	RotateOrder m_rotateOrder;
+	btTransform  m_calculatedTransformA;
+	btTransform  m_calculatedTransformB;
+	btVector3    m_calculatedAxisAngleDiff;
+	btVector3    m_calculatedAxis[3];
+	btVector3    m_calculatedLinearDiff;
+	btScalar     m_factA;
+	btScalar     m_factB;
+	bool         m_hasStaticBody;
+	int          m_flags; // per-axis bit groups recording which ERP/CFM parameters were set via setParam()
+	btGeneric6DofSpring2Constraint&	operator=(btGeneric6DofSpring2Constraint&) // assignment is not supported: asserts and copies nothing
+	{
+		btAssert(0);
+		return *this;
+	}
+	int setAngularLimits(btConstraintInfo2 *info, int row_offset,const btTransform& transA,const btTransform& transB,const btVector3& linVelA,const btVector3& linVelB,const btVector3& angVelA,const btVector3& angVelB);
+	int setLinearLimits(btConstraintInfo2 *info, int row, const btTransform& transA,const btTransform& transB,const btVector3& linVelA,const btVector3& linVelB,const btVector3& angVelA,const btVector3& angVelB);
+	void calculateLinearInfo();
+	void calculateAngleInfo();
+	void testAngularLimitMotor(int axis_index);
+	void calculateJacobi(btRotationalLimitMotor2* limot, const btTransform& transA,const btTransform& transB, btConstraintInfo2* info, int srow, btVector3& ax1, int rotational, int rotAllowed);
+	int get_limit_motor_info2(btRotationalLimitMotor2* limot,
+		const btTransform& transA,const btTransform& transB,const btVector3& linVelA,const btVector3& linVelB,const btVector3& angVelA,const btVector3& angVelB,
+		btConstraintInfo2* info, int row, btVector3& ax1, int rotational, int rotAllowed = false);
+    btGeneric6DofSpring2Constraint(btRigidBody& rbA, btRigidBody& rbB, const btTransform& frameInA, const btTransform& frameInB, RotateOrder rotOrder = RO_XYZ);
+    btGeneric6DofSpring2Constraint(btRigidBody& rbB, const btTransform& frameInB, RotateOrder rotOrder = RO_XYZ);
+	virtual void buildJacobian() {} // intentionally empty in this class
+	virtual void getInfo1 (btConstraintInfo1* info);
+	virtual void getInfo2 (btConstraintInfo2* info);
+	virtual int calculateSerializeBufferSize() const;
+	virtual const char* serialize(void* dataBuffer, btSerializer* serializer) const;
+	btRotationalLimitMotor2* getRotationalLimitMotor(int index) { return &m_angularLimits[index]; } // index is 0..2 and is not range-checked here
+	btTranslationalLimitMotor2* getTranslationalLimitMotor() { return &m_linearLimits; }
+	// Calculates the global transform for the joint offset for body A and B, and also calculates the angle differences between the bodies.
+	void calculateTransforms(const btTransform& transA,const btTransform& transB);
+	void calculateTransforms();
+	// Gets the global transform of the offset for body A
+	const btTransform & getCalculatedTransformA() const { return m_calculatedTransformA; }
+	// Gets the global transform of the offset for body B
+	const btTransform & getCalculatedTransformB() const { return m_calculatedTransformB; }
+	const btTransform & getFrameOffsetA() const { return m_frameInA; }
+	const btTransform & getFrameOffsetB() const { return m_frameInB; }
+	btTransform & getFrameOffsetA() { return m_frameInA; } // mutable access: callers can edit the frame in place
+	btTransform & getFrameOffsetB() { return m_frameInB; } // mutable access: callers can edit the frame in place
+	// Get the rotation axis in global coordinates ( btGeneric6DofSpring2Constraint::calculateTransforms() must be called previously )
+	btVector3 getAxis(int axis_index) const { return m_calculatedAxis[axis_index]; }
+	// Get the relative Euler angle ( btGeneric6DofSpring2Constraint::calculateTransforms() must be called previously )
+	btScalar getAngle(int axis_index) const { return m_calculatedAxisAngleDiff[axis_index]; }
+	// Get the relative position of the constraint pivot ( btGeneric6DofSpring2Constraint::calculateTransforms() must be called previously )
+	btScalar getRelativePivotPosition(int axis_index) const { return m_calculatedLinearDiff[axis_index]; }
+	void setFrames(const btTransform & frameA, const btTransform & frameB);
+	void setLinearLowerLimit(const btVector3& linearLower) { m_linearLimits.m_lowerLimit = linearLower; }
+	void getLinearLowerLimit(btVector3& linearLower) { linearLower = m_linearLimits.m_lowerLimit; }
+	void setLinearUpperLimit(const btVector3& linearUpper) { m_linearLimits.m_upperLimit = linearUpper; }
+	void getLinearUpperLimit(btVector3& linearUpper) { linearUpper = m_linearLimits.m_upperLimit; }
+	void setAngularLowerLimit(const btVector3& angularLower) // stores each component, passed through btNormalizeAngle, as that axis' m_loLimit
+	{
+		for(int i = 0; i < 3; i++) 
+			m_angularLimits[i].m_loLimit = btNormalizeAngle(angularLower[i]);
+	}
+	void setAngularLowerLimitReversed(const btVector3& angularLower) // reversed convention: the negated lower limit is stored as m_hiLimit
+	{
+		for(int i = 0; i < 3; i++) 
+			m_angularLimits[i].m_hiLimit = btNormalizeAngle(-angularLower[i]);
+	}
+	void getAngularLowerLimit(btVector3& angularLower) // out-parameter receives the three m_loLimit values
+	{
+		for(int i = 0; i < 3; i++) 
+			angularLower[i] = m_angularLimits[i].m_loLimit;
+	}
+	void getAngularLowerLimitReversed(btVector3& angularLower) // out-parameter receives the negated m_hiLimit values
+	{
+		for(int i = 0; i < 3; i++)
+			angularLower[i] = -m_angularLimits[i].m_hiLimit;
+	}
+	void setAngularUpperLimit(const btVector3& angularUpper) // stores each component, passed through btNormalizeAngle, as that axis' m_hiLimit
+	{
+		for(int i = 0; i < 3; i++)
+			m_angularLimits[i].m_hiLimit = btNormalizeAngle(angularUpper[i]);
+	}
+	void setAngularUpperLimitReversed(const btVector3& angularUpper) // reversed convention: the negated upper limit is stored as m_loLimit
+	{
+		for(int i = 0; i < 3; i++)
+			m_angularLimits[i].m_loLimit = btNormalizeAngle(-angularUpper[i]);
+	}
+	void getAngularUpperLimit(btVector3& angularUpper) // out-parameter receives the three m_hiLimit values
+	{
+		for(int i = 0; i < 3; i++)
+			angularUpper[i] = m_angularLimits[i].m_hiLimit;
+	}
+	void getAngularUpperLimitReversed(btVector3& angularUpper) // out-parameter receives the negated m_loLimit values
+	{
+		for(int i = 0; i < 3; i++)
+			angularUpper[i] = -m_angularLimits[i].m_loLimit;
+	}
+	//first 3 are linear, next 3 are angular
+	void setLimit(int axis, btScalar lo, btScalar hi) // axis is not range-checked here; any axis >= 3 is treated as angular index axis - 3
+	{
+		if(axis<3)
+		{
+			m_linearLimits.m_lowerLimit[axis] = lo;
+			m_linearLimits.m_upperLimit[axis] = hi;
+		}
+		else
+		{
+			lo = btNormalizeAngle(lo); // angular limits are passed through btNormalizeAngle before being stored
+			hi = btNormalizeAngle(hi);
+			m_angularLimits[axis-3].m_loLimit = lo;
+			m_angularLimits[axis-3].m_hiLimit = hi;
+		}
+	}
+	void setLimitReversed(int axis, btScalar lo, btScalar hi) // same as setLimit for linear axes; angular limits are negated and swapped
+	{
+		if(axis<3)
+		{
+			m_linearLimits.m_lowerLimit[axis] = lo; // linear branch is identical to setLimit()
+			m_linearLimits.m_upperLimit[axis] = hi;
+		}
+		else
+		{
+			lo = btNormalizeAngle(lo);
+			hi = btNormalizeAngle(hi);
+			m_angularLimits[axis-3].m_hiLimit = -lo; // negation is applied after normalization
+			m_angularLimits[axis-3].m_loLimit = -hi;
+		}
+	}
+	bool isLimited(int limitIndex) // forwards to the linear (0..2) or angular (3..5) limit object
+	{
+		if(limitIndex<3)
+		{
+			return m_linearLimits.isLimited(limitIndex);
+		}
+		return m_angularLimits[limitIndex-3].isLimited();
+	}
+	void setRotationOrder(RotateOrder order) { m_rotateOrder = order; }
+	RotateOrder getRotationOrder() { return m_rotateOrder; }
+	void setAxis( const btVector3& axis1, const btVector3& axis2);
+	void setBounce(int index, btScalar bounce);
+	void enableMotor(int index, bool onOff);
+	void setServo(int index, bool onOff); // set the type of the motor (servo or not) (the motor has to be turned on for servo also)
+	void setTargetVelocity(int index, btScalar velocity);
+	void setServoTarget(int index, btScalar target);
+	void setMaxMotorForce(int index, btScalar force);
+	void enableSpring(int index, bool onOff);
+	void setStiffness(int index, btScalar stiffness, bool limitIfNeeded = true); // if limitIfNeeded is true the system will automatically limit the stiffness in necessary situations where otherwise the spring would move unrealistically too widely
+	void setDamping(int index, btScalar damping, bool limitIfNeeded = true); // if limitIfNeeded is true the system will automatically limit the damping in necessary situations where otherwise the spring would blow up
+	void setEquilibriumPoint(); // set the current constraint position/orientation as an equilibrium point for all DOF
+	void setEquilibriumPoint(int index);  // set the current constraint position/orientation as an equilibrium point for given DOF
+	void setEquilibriumPoint(int index, btScalar val); // set an explicit equilibrium value for the given DOF
+	//override the default global value of a parameter (such as ERP or CFM) for one axis (0..5).
+	//axis 0..2 = linear, 3..5 = angular; the default of -1 lies outside both ranges and is rejected via btAssertConstrParams(0) in the implementation.
+	virtual void setParam(int num, btScalar value, int axis = -1);
+	virtual btScalar getParam(int num, int axis = -1) const;
+    static btScalar btGetMatrixElem(const btMatrix3x3& mat, int index);
+    static bool matrixToEulerXYZ(const btMatrix3x3& mat,btVector3& xyz);
+    static bool matrixToEulerXZY(const btMatrix3x3& mat,btVector3& xyz);
+    static bool matrixToEulerYXZ(const btMatrix3x3& mat,btVector3& xyz);
+    static bool matrixToEulerYZX(const btMatrix3x3& mat,btVector3& xyz);
+    static bool matrixToEulerZXY(const btMatrix3x3& mat,btVector3& xyz);
+    static bool matrixToEulerZYX(const btMatrix3x3& mat,btVector3& xyz);
+struct btGeneric6DofSpring2ConstraintData // serialization record built from the *FloatData member types; one of the two layouts the btGeneric6DofSpring2ConstraintData2 macro can name
+	btTypedConstraintData m_typeConstraintData;
+	btTransformFloatData m_rbAFrame;
+	btTransformFloatData m_rbBFrame;
+	btVector3FloatData m_linearUpperLimit;
+	btVector3FloatData m_linearLowerLimit;
+	btVector3FloatData m_linearBounce;
+	btVector3FloatData m_linearStopERP;
+	btVector3FloatData m_linearStopCFM;
+	btVector3FloatData m_linearMotorERP;
+	btVector3FloatData m_linearMotorCFM;
+	btVector3FloatData m_linearTargetVelocity;
+	btVector3FloatData m_linearMaxMotorForce;
+	btVector3FloatData m_linearServoTarget;
+	btVector3FloatData m_linearSpringStiffness;
+	btVector3FloatData m_linearSpringDamping;
+	btVector3FloatData m_linearEquilibriumPoint;
+	char               m_linearEnableMotor[4]; // flag arrays: serialize() writes 0/1 into entries 0..2 and 0 into entry 3
+	char               m_linearServoMotor[4];
+	char               m_linearEnableSpring[4];
+	char               m_linearSpringStiffnessLimited[4];
+	char               m_linearSpringDampingLimited[4];
+	char               m_padding1[4]; // not written by btGeneric6DofSpring2Constraint::serialize()
+	btVector3FloatData m_angularUpperLimit; // angular vectors: serialize() stores axis i in m_floats[i] and 0 in m_floats[3]
+	btVector3FloatData m_angularLowerLimit;
+	btVector3FloatData m_angularBounce;
+	btVector3FloatData m_angularStopERP;
+	btVector3FloatData m_angularStopCFM;
+	btVector3FloatData m_angularMotorERP;
+	btVector3FloatData m_angularMotorCFM;
+	btVector3FloatData m_angularTargetVelocity;
+	btVector3FloatData m_angularMaxMotorForce;
+	btVector3FloatData m_angularServoTarget;
+	btVector3FloatData m_angularSpringStiffness;
+	btVector3FloatData m_angularSpringDamping;
+	btVector3FloatData m_angularEquilibriumPoint;
+	char               m_angularEnableMotor[4]; // flag arrays: serialize() writes 0/1 into entries 0..2 and 0 into entry 3
+	char               m_angularServoMotor[4];
+	char               m_angularEnableSpring[4];
+	char               m_angularSpringStiffnessLimited[4];
+	char               m_angularSpringDampingLimited[4];
+	int                m_rotateOrder; // the constraint's RotateOrder value stored as int
+struct btGeneric6DofSpring2ConstraintDoubleData2 // serialization record built from the *DoubleData member types; one of the two layouts the btGeneric6DofSpring2ConstraintData2 macro can name
+	btTypedConstraintDoubleData m_typeConstraintData;
+	btTransformDoubleData m_rbAFrame;
+	btTransformDoubleData m_rbBFrame;
+	btVector3DoubleData m_linearUpperLimit;
+	btVector3DoubleData m_linearLowerLimit;
+	btVector3DoubleData m_linearBounce;
+	btVector3DoubleData m_linearStopERP;
+	btVector3DoubleData m_linearStopCFM;
+	btVector3DoubleData m_linearMotorERP;
+	btVector3DoubleData m_linearMotorCFM;
+	btVector3DoubleData m_linearTargetVelocity;
+	btVector3DoubleData m_linearMaxMotorForce;
+	btVector3DoubleData m_linearServoTarget;
+	btVector3DoubleData m_linearSpringStiffness;
+	btVector3DoubleData m_linearSpringDamping;
+	btVector3DoubleData m_linearEquilibriumPoint;
+	char                m_linearEnableMotor[4]; // flag arrays: serialize() writes 0/1 into entries 0..2 and 0 into entry 3
+	char                m_linearServoMotor[4];
+	char                m_linearEnableSpring[4];
+	char                m_linearSpringStiffnessLimited[4];
+	char                m_linearSpringDampingLimited[4];
+	char                m_padding1[4]; // not written by btGeneric6DofSpring2Constraint::serialize()
+	btVector3DoubleData m_angularUpperLimit; // angular vectors: serialize() stores axis i in m_floats[i] and 0 in m_floats[3]
+	btVector3DoubleData m_angularLowerLimit;
+	btVector3DoubleData m_angularBounce;
+	btVector3DoubleData m_angularStopERP;
+	btVector3DoubleData m_angularStopCFM;
+	btVector3DoubleData m_angularMotorERP;
+	btVector3DoubleData m_angularMotorCFM;
+	btVector3DoubleData m_angularTargetVelocity;
+	btVector3DoubleData m_angularMaxMotorForce;
+	btVector3DoubleData m_angularServoTarget;
+	btVector3DoubleData m_angularSpringStiffness;
+	btVector3DoubleData m_angularSpringDamping;
+	btVector3DoubleData m_angularEquilibriumPoint;
+	char                m_angularEnableMotor[4]; // flag arrays: serialize() writes 0/1 into entries 0..2 and 0 into entry 3
+	char                m_angularServoMotor[4];
+	char                m_angularEnableSpring[4];
+	char                m_angularSpringStiffnessLimited[4];
+	char                m_angularSpringDampingLimited[4];
+	int                 m_rotateOrder; // the constraint's RotateOrder value stored as int
+SIMD_FORCE_INLINE int btGeneric6DofSpring2Constraint::calculateSerializeBufferSize() const // size in bytes of the record that serialize() fills
+	return sizeof(btGeneric6DofSpring2ConstraintData2); // btGeneric6DofSpring2ConstraintData2 is a macro naming the float or the double record struct
+SIMD_FORCE_INLINE const char* btGeneric6DofSpring2Constraint::serialize(void* dataBuffer, btSerializer* serializer) const // fills dataBuffer with this constraint's settings and returns the record's struct name
+	btGeneric6DofSpring2ConstraintData2* dof = (btGeneric6DofSpring2ConstraintData2*)dataBuffer; // caller-provided buffer is treated as the record struct; presumably sized via calculateSerializeBufferSize() - TODO confirm
+	btTypedConstraint::serialize(&dof->m_typeConstraintData,serializer); // base-class part first
+	m_frameInA.serialize(dof->m_rbAFrame);
+	m_frameInB.serialize(dof->m_rbBFrame);
+	int i;
+	for (i=0;i<3;i++) // angular settings live in three separate limit objects, so gather them component by component
+	{
+		dof->m_angularLowerLimit.m_floats[i]       = m_angularLimits[i].m_loLimit;
+		dof->m_angularUpperLimit.m_floats[i]       = m_angularLimits[i].m_hiLimit;
+		dof->m_angularBounce.m_floats[i]           = m_angularLimits[i].m_bounce;
+		dof->m_angularStopERP.m_floats[i]          = m_angularLimits[i].m_stopERP;
+		dof->m_angularStopCFM.m_floats[i]          = m_angularLimits[i].m_stopCFM;
+		dof->m_angularMotorERP.m_floats[i]         = m_angularLimits[i].m_motorERP;
+		dof->m_angularMotorCFM.m_floats[i]         = m_angularLimits[i].m_motorCFM;
+		dof->m_angularTargetVelocity.m_floats[i]   = m_angularLimits[i].m_targetVelocity;
+		dof->m_angularMaxMotorForce.m_floats[i]    = m_angularLimits[i].m_maxMotorForce;
+		dof->m_angularServoTarget.m_floats[i]      = m_angularLimits[i].m_servoTarget;
+		dof->m_angularSpringStiffness.m_floats[i]  = m_angularLimits[i].m_springStiffness;
+		dof->m_angularSpringDamping.m_floats[i]    = m_angularLimits[i].m_springDamping;
+		dof->m_angularEquilibriumPoint.m_floats[i] = m_angularLimits[i].m_equilibriumPoint;
+	}
+	dof->m_angularLowerLimit.m_floats[3]       = 0; // fourth component of every angular vector is zeroed so the record holds no uninitialised data
+	dof->m_angularUpperLimit.m_floats[3]       = 0;
+	dof->m_angularBounce.m_floats[3]           = 0;
+	dof->m_angularStopERP.m_floats[3]          = 0;
+	dof->m_angularStopCFM.m_floats[3]          = 0;
+	dof->m_angularMotorERP.m_floats[3]         = 0;
+	dof->m_angularMotorCFM.m_floats[3]         = 0;
+	dof->m_angularTargetVelocity.m_floats[3]   = 0;
+	dof->m_angularMaxMotorForce.m_floats[3]    = 0;
+	dof->m_angularServoTarget.m_floats[3]      = 0;
+	dof->m_angularSpringStiffness.m_floats[3]  = 0;
+	dof->m_angularSpringDamping.m_floats[3]    = 0;
+	dof->m_angularEquilibriumPoint.m_floats[3] = 0;
+	for (i=0;i<4;i++) // angular bool flags become 0/1 chars; the fourth entry is always 0
+	{
+		dof->m_angularEnableMotor[i]            = i < 3 ? ( m_angularLimits[i].m_enableMotor ? 1 : 0 ) : 0;
+		dof->m_angularServoMotor[i]             = i < 3 ? ( m_angularLimits[i].m_servoMotor ? 1 : 0 ) : 0;
+		dof->m_angularEnableSpring[i]           = i < 3 ? ( m_angularLimits[i].m_enableSpring ? 1 : 0 ) : 0;
+		dof->m_angularSpringStiffnessLimited[i] = i < 3 ? ( m_angularLimits[i].m_springStiffnessLimited ? 1 : 0 ) : 0;
+		dof->m_angularSpringDampingLimited[i]   = i < 3 ? ( m_angularLimits[i].m_springDampingLimited ? 1 : 0 ) : 0;
+	}
+	m_linearLimits.m_lowerLimit.serialize( dof->m_linearLowerLimit ); // linear settings are already btVector3 members, so each is serialized as a whole
+	m_linearLimits.m_upperLimit.serialize( dof->m_linearUpperLimit );
+	m_linearLimits.m_bounce.serialize( dof->m_linearBounce );
+	m_linearLimits.m_stopERP.serialize( dof->m_linearStopERP );
+	m_linearLimits.m_stopCFM.serialize( dof->m_linearStopCFM );
+	m_linearLimits.m_motorERP.serialize( dof->m_linearMotorERP );
+	m_linearLimits.m_motorCFM.serialize( dof->m_linearMotorCFM );
+	m_linearLimits.m_targetVelocity.serialize( dof->m_linearTargetVelocity );
+	m_linearLimits.m_maxMotorForce.serialize( dof->m_linearMaxMotorForce );
+	m_linearLimits.m_servoTarget.serialize( dof->m_linearServoTarget );
+	m_linearLimits.m_springStiffness.serialize( dof->m_linearSpringStiffness );
+	m_linearLimits.m_springDamping.serialize( dof->m_linearSpringDamping );
+	m_linearLimits.m_equilibriumPoint.serialize( dof->m_linearEquilibriumPoint );
+	for (i=0;i<4;i++) // linear bool flags become 0/1 chars; the fourth entry is always 0
+	{
+		dof->m_linearEnableMotor[i]            = i < 3 ? ( m_linearLimits.m_enableMotor[i] ? 1 : 0 ) : 0;
+		dof->m_linearServoMotor[i]             = i < 3 ? ( m_linearLimits.m_servoMotor[i] ? 1 : 0 ) : 0;
+		dof->m_linearEnableSpring[i]           = i < 3 ? ( m_linearLimits.m_enableSpring[i] ? 1 : 0 ) : 0;
+		dof->m_linearSpringStiffnessLimited[i] = i < 3 ? ( m_linearLimits.m_springStiffnessLimited[i] ? 1 : 0 ) : 0;
+		dof->m_linearSpringDampingLimited[i]   = i < 3 ? ( m_linearLimits.m_springDampingLimited[i] ? 1 : 0 ) : 0;
+	}
+	dof->m_rotateOrder = m_rotateOrder; // enum value stored as int
+	return btGeneric6DofSpring2ConstraintDataName; // string macro naming the record struct that was filled
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.cpp
index 2b387149..6f765884 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.cpp
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.cpp
@@ -118,7 +118,7 @@ void btGeneric6DofSpringConstraint::internalUpdateSprings(btConstraintInfo2* inf
 	// it is assumed that calculateTransforms() have been called before this call
 	int i;
-	btVector3 relVel = m_rbB.getLinearVelocity() - m_rbA.getLinearVelocity();
+	//btVector3 relVel = m_rbB.getLinearVelocity() - m_rbA.getLinearVelocity();
 	for(i = 0; i < 3; i++)
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h
index 31e0cd53..dac59c68 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h
@@ -21,6 +21,15 @@ subject to the following restrictions:
 #include "btTypedConstraint.h"
 #include "btGeneric6DofConstraint.h"
+#define btGeneric6DofSpringConstraintData2		btGeneric6DofSpringConstraintDoubleData2
+#define btGeneric6DofSpringConstraintDataName	"btGeneric6DofSpringConstraintDoubleData2"
+#define btGeneric6DofSpringConstraintData2		btGeneric6DofSpringConstraintData
+#define btGeneric6DofSpringConstraintDataName	"btGeneric6DofSpringConstraintData"
 /// Generic 6 DOF constraint that allows to set spring motors to any translational and rotational DOF
@@ -32,7 +41,7 @@ subject to the following restrictions:
 /// 4 : rotation Y (2nd Euler rotational around new position of Y axis, range [-PI/2+epsilon, PI/2-epsilon] )
 /// 5 : rotation Z (1st Euler rotational around Z axis, range [-PI+epsilon, PI-epsilon] )
-class btGeneric6DofSpringConstraint : public btGeneric6DofConstraint
+ATTRIBUTE_ALIGNED16(class) btGeneric6DofSpringConstraint : public btGeneric6DofConstraint
 	bool		m_springEnabled[6];
@@ -42,6 +51,9 @@ protected:
 	void init();
 	void internalUpdateSprings(btConstraintInfo2* info);
     btGeneric6DofSpringConstraint(btRigidBody& rbA, btRigidBody& rbB, const btTransform& frameInA, const btTransform& frameInB ,bool useLinearReferenceFrameA);
     btGeneric6DofSpringConstraint(btRigidBody& rbB, const btTransform& frameInB, bool useLinearReferenceFrameB);
 	void enableSpring(int index, bool onOff);
@@ -51,6 +63,26 @@ public:
 	void setEquilibriumPoint(int index);  // set the current constraint position/orientation as an equilibrium point for given DOF
 	void setEquilibriumPoint(int index, btScalar val);
+	bool isSpringEnabled(int index) const // true when the spring flag stored for DOF `index` is set
+	{
+	    return m_springEnabled[index]; // unchecked lookup; m_springEnabled holds 6 flags, so index is expected in 0..5
+	}
+	btScalar getStiffness(int index) const // reads back the stiffness value stored for DOF `index`
+	{
+	    return m_springStiffness[index]; // unchecked lookup; presumably index is 0..5 like m_springEnabled -- confirm
+	}
+	btScalar getDamping(int index) const // reads back the damping value stored for DOF `index`
+	{
+	    return m_springDamping[index]; // unchecked lookup; presumably index is 0..5 like m_springEnabled -- confirm
+	}
+	btScalar getEquilibriumPoint(int index) const // reads back the equilibrium point stored for DOF `index`
+	{
+	    return m_equilibriumPoint[index]; // unchecked lookup; presumably index is 0..5 like m_springEnabled -- confirm
+	}
 	virtual void setAxis( const btVector3& axis1, const btVector3& axis2);
 	virtual void getInfo2 (btConstraintInfo2* info);
@@ -62,7 +94,6 @@ public:
-///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
 struct btGeneric6DofSpringConstraintData
 	btGeneric6DofConstraintData	m_6dofData;
@@ -73,15 +104,26 @@ struct btGeneric6DofSpringConstraintData
 	float		m_springDamping[6];
+struct btGeneric6DofSpringConstraintDoubleData2 // one of the two layouts the btGeneric6DofSpringConstraintData2 macro can name
+	btGeneric6DofConstraintDoubleData2	m_6dofData; // nested record of type btGeneric6DofConstraintDoubleData2
+	int			m_springEnabled[6]; // serialize() stores 1 or 0 per entry
+	double		m_equilibriumPoint[6]; // one value per entry, six entries
+	double		m_springStiffness[6]; // serialize() copies the constraint's m_springStiffness[i] into entry i
+	double		m_springDamping[6]; // one value per entry, six entries
 SIMD_FORCE_INLINE	int	btGeneric6DofSpringConstraint::calculateSerializeBufferSize() const
-	return sizeof(btGeneric6DofSpringConstraintData);
+	return sizeof(btGeneric6DofSpringConstraintData2);
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
 SIMD_FORCE_INLINE	const char*	btGeneric6DofSpringConstraint::serialize(void* dataBuffer, btSerializer* serializer) const
-	btGeneric6DofSpringConstraintData* dof = (btGeneric6DofSpringConstraintData*)dataBuffer;
+	btGeneric6DofSpringConstraintData2* dof = (btGeneric6DofSpringConstraintData2*)dataBuffer;
 	int i;
@@ -92,7 +134,7 @@ SIMD_FORCE_INLINE	const char*	btGeneric6DofSpringConstraint::serialize(void* dat
 		dof->m_springEnabled[i] = m_springEnabled[i]? 1 : 0;
 		dof->m_springStiffness[i] = m_springStiffness[i];
-	return "btGeneric6DofConstraintData";
+	return btGeneric6DofSpringConstraintDataName;
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btHinge2Constraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btHinge2Constraint.cpp
index 29123d52..4be2aabe 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btHinge2Constraint.cpp
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btHinge2Constraint.cpp
@@ -25,7 +25,7 @@ subject to the following restrictions:
 // anchor, axis1 and axis2 are in world coordinate system
 // axis1 must be orthogonal to axis2
 btHinge2Constraint::btHinge2Constraint(btRigidBody& rbA, btRigidBody& rbB, btVector3& anchor, btVector3& axis1, btVector3& axis2)
-: btGeneric6DofSpringConstraint(rbA, rbB, btTransform::getIdentity(), btTransform::getIdentity(), true),
+: btGeneric6DofSpring2Constraint(rbA, rbB, btTransform::getIdentity(), btTransform::getIdentity(),RO_XYZ),
@@ -59,7 +59,7 @@ btHinge2Constraint::btHinge2Constraint(btRigidBody& rbA, btRigidBody& rbB, btVec
 	setAngularUpperLimit(btVector3(-1.f, 0.f,  SIMD_HALF_PI * 0.5f));
 	// enable suspension
 	enableSpring(2, true);
-	setStiffness(2, SIMD_PI * SIMD_PI * 4.f); // period 1 sec for 1 kilogramm weel :-)
+	setStiffness(2, SIMD_PI * SIMD_PI * 4.f);
 	setDamping(2, 0.01f);
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btHinge2Constraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btHinge2Constraint.h
index a76452dd..06a8e3ec 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btHinge2Constraint.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btHinge2Constraint.h
@@ -20,7 +20,7 @@ subject to the following restrictions:
 #include "LinearMath/btVector3.h"
 #include "btTypedConstraint.h"
-#include "btGeneric6DofSpringConstraint.h"
+#include "btGeneric6DofSpring2Constraint.h"
@@ -29,13 +29,15 @@ subject to the following restrictions:
 // 2 rotational degrees of freedom, similar to Euler rotations around Z (axis 1) and X (axis 2)
 // 1 translational (along axis Z) with suspension spring
-class btHinge2Constraint : public btGeneric6DofSpringConstraint
+ATTRIBUTE_ALIGNED16(class) btHinge2Constraint : public btGeneric6DofSpring2Constraint
 	btVector3	m_anchor;
 	btVector3	m_axis1;
 	btVector3	m_axis2;
 	// constructor
 	// anchor, axis1 and axis2 are in world coordinate system
 	// axis1 must be orthogonal to axis2
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btHingeConstraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btHingeConstraint.cpp
index 9e3a2bae..76a15094 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btHingeConstraint.cpp
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btHingeConstraint.cpp
@@ -45,7 +45,11 @@ btHingeConstraint::btHingeConstraint(btRigidBody& rbA,btRigidBody& rbB, const bt
-									 m_flags(0)
+									 m_flags(0),
+									 m_normalCFM(0),
+									 m_normalERP(0),
+									 m_stopCFM(0),
+									 m_stopERP(0)
 	m_rbAFrame.getOrigin() = pivotInA;
@@ -101,7 +105,11 @@ m_angularOnly(false), m_enableAngularMotor(false),
 	// since no frame is given, assume this to be zero angle and just pick rb transform axis
@@ -151,7 +159,11 @@ m_enableAngularMotor(false),
 	//start with free
@@ -177,7 +189,11 @@ m_enableAngularMotor(false),
 	///not providing rigidbody B means implicitly using worldspace for body B
@@ -285,8 +301,60 @@ void	btHingeConstraint::buildJacobian()
 #endif //__SPU__
+static inline btScalar btNormalizeAnglePositive(btScalar angle) // wraps `angle` with two btFmod passes of period 2*SIMD_PI
+  return btFmod(btFmod(angle, btScalar(2.0*SIMD_PI)) + btScalar(2.0*SIMD_PI), btScalar(2.0*SIMD_PI)); // one period is added between the passes; presumably so a negative first remainder ends up non-negative (assumes btFmod behaves like fmod -- confirm)
+static btScalar btShortestAngularDistance(btScalar accAngle, btScalar curAngle) // signed step from accAngle towards curAngle
+	btScalar result = btNormalizeAngle(btNormalizeAnglePositive(btNormalizeAnglePositive(curAngle) - // wrap both inputs, subtract (cur minus acc), wrap the difference again,
+	btNormalizeAnglePositive(accAngle))); // then hand it to btNormalizeAngle (defined outside this view)
+	return result;
+static btScalar btShortestAngleUpdate(btScalar accAngle, btScalar curAngle) // next accumulated angle given the previous one and the current reading
+	btScalar tol(0.3); // largest step that is still accumulated; presumably radians -- confirm
+	btScalar result = btShortestAngularDistance(accAngle, curAngle); // signed step between the two angles
+	  if (btFabs(result) > tol) // step exceeds tol: drop the accumulation and restart from curAngle
+		return curAngle;
+	  else // step within tol: extend the accumulated angle by it
+		return accAngle + result;
+	return curAngle; // NOTE(review): looks unreachable, both branches above return -- confirm against the full source
+btScalar btHingeAccumulatedAngleConstraint::getAccumulatedHingeAngle() // not a pure getter: refreshes m_accumulatedAngle before returning it
+	btScalar hingeAngle = getHingeAngle(); // current hinge angle reading
+	m_accumulatedAngle = btShortestAngleUpdate(m_accumulatedAngle,hingeAngle); // fold the reading into the stored value
+	return m_accumulatedAngle;
+void	btHingeAccumulatedAngleConstraint::setAccumulatedHingeAngle(btScalar accAngle) // overwrites the stored accumulated angle
+	m_accumulatedAngle  = accAngle; // stored as given, no wrapping or validation here
+void btHingeAccumulatedAngleConstraint::getInfo1(btConstraintInfo1* info)
+	//fold the current hinge angle into m_accumulatedAngle before delegating to the base class
+	btScalar curHingeAngle = getHingeAngle();
+	m_accumulatedAngle = btShortestAngleUpdate(m_accumulatedAngle,curHingeAngle);
+	btHingeConstraint::getInfo1(info); // the base implementation does the remaining work on `info`
 void btHingeConstraint::getInfo1(btConstraintInfo1* info)
 	if (m_useSolveConstraintObsolete)
 		info->m_numConstraintRows = 0;
@@ -369,6 +437,10 @@ void btHingeConstraint::getInfo2Internal(btConstraintInfo2* info, const btTransf
+			info->m_J2linearAxis[i*skip]=0;
+			info->m_J2linearAxis[i*skip+1]=0;
+			info->m_J2linearAxis[i*skip+2]=0;
@@ -384,6 +456,10 @@ void btHingeConstraint::getInfo2Internal(btConstraintInfo2* info, const btTransf
 		info->m_J1linearAxis[0] = 1;
 		info->m_J1linearAxis[skip + 1] = 1;
 		info->m_J1linearAxis[2 * skip + 2] = 1;
+		info->m_J2linearAxis[0] = -1;
+		info->m_J2linearAxis[skip + 1] = -1;
+		info->m_J2linearAxis[2 * skip + 2] = -1;
@@ -405,7 +481,9 @@ void btHingeConstraint::getInfo2Internal(btConstraintInfo2* info, const btTransf
 	// linear RHS
-    btScalar k = info->fps * info->erp;
+	btScalar normalErp = (m_flags & BT_HINGE_FLAGS_ERP_NORM) ? m_normalERP : info->erp;
+    btScalar k = info->fps * normalErp;
 	if (!m_angularOnly)
 		for(i = 0; i < 3; i++)
@@ -502,7 +580,7 @@ void btHingeConstraint::getInfo2Internal(btConstraintInfo2* info, const btTransf
 			powered = 0;
 		info->m_constraintError[srow] = btScalar(0.0f);
-		btScalar currERP = (m_flags & BT_HINGE_FLAGS_ERP_STOP) ? m_stopERP : info->erp;
+		btScalar currERP = (m_flags & BT_HINGE_FLAGS_ERP_STOP) ? m_stopERP : normalErp;
 			if(m_flags & BT_HINGE_FLAGS_CFM_NORM)
@@ -598,6 +676,8 @@ void	btHingeConstraint::updateRHS(btScalar	timeStep)
 btScalar btHingeConstraint::getHingeAngle()
 	return getHingeAngle(m_rbA.getCenterOfMassTransform(),m_rbB.getCenterOfMassTransform());
@@ -702,8 +782,8 @@ void btHingeConstraint::getInfo2InternalUsingFrameOffset(btConstraintInfo2* info
 	btTransform trA = transA*m_rbAFrame;
 	btTransform trB = transB*m_rbBFrame;
 	// pivot point
-	btVector3 pivotAInW = trA.getOrigin();
-	btVector3 pivotBInW = trB.getOrigin();
+//	btVector3 pivotAInW = trA.getOrigin();
+//	btVector3 pivotBInW = trB.getOrigin();
 #if 1
 	// difference between frames in WCS
 	btVector3 ofs = trB.getOrigin() - trA.getOrigin();
@@ -790,14 +870,19 @@ void btHingeConstraint::getInfo2InternalUsingFrameOffset(btConstraintInfo2* info
 	for (i=0; i<3; i++) info->m_J1angularAxis[s2+i] = tmpA[i];
     for (i=0; i<3; i++) info->m_J2angularAxis[s2+i] = -tmpB[i];
-	btScalar k = info->fps * info->erp;
+	btScalar normalErp = (m_flags & BT_HINGE_FLAGS_ERP_NORM)? m_normalERP : info->erp;
+	btScalar k = info->fps * normalErp;
 	if (!m_angularOnly)
 		for (i=0; i<3; i++) info->m_J1linearAxis[s0+i] = p[i];
 		for (i=0; i<3; i++) info->m_J1linearAxis[s1+i] = q[i];
 		for (i=0; i<3; i++) info->m_J1linearAxis[s2+i] = ax1[i];
+		for (i=0; i<3; i++) info->m_J2linearAxis[s0+i] = -p[i];
+		for (i=0; i<3; i++) info->m_J2linearAxis[s1+i] = -q[i];
+		for (i=0; i<3; i++) info->m_J2linearAxis[s2+i] = -ax1[i];
 	// compute three elements of right hand side
 		btScalar rhs = k * p.dot(ofs);
@@ -844,7 +929,8 @@ void btHingeConstraint::getInfo2InternalUsingFrameOffset(btConstraintInfo2* info
 	//    angular_velocity  = (erp*fps) * (ax1 x ax2)
 	// ax1 x ax2 is in the plane space of ax1, so we project the angular
 	// velocity to p and q to find the right hand side.
-	k = info->fps * info->erp;
+	k = info->fps * normalErp;//??
 	btVector3 u = ax1A.cross(ax1B);
 	info->m_constraintError[s3] = k * u.dot(p);
 	info->m_constraintError[s4] = k * u.dot(q);
@@ -889,7 +975,7 @@ void btHingeConstraint::getInfo2InternalUsingFrameOffset(btConstraintInfo2* info
 			powered = 0;
 		info->m_constraintError[srow] = btScalar(0.0f);
-		btScalar currERP = (m_flags & BT_HINGE_FLAGS_ERP_STOP) ? m_stopERP : info->erp;
+		btScalar currERP = (m_flags & BT_HINGE_FLAGS_ERP_STOP) ? m_stopERP : normalErp;
 			if(m_flags & BT_HINGE_FLAGS_CFM_NORM)
@@ -990,6 +1076,10 @@ void btHingeConstraint::setParam(int num, btScalar value, int axis)
 				m_normalCFM = value;
 				m_flags |= BT_HINGE_FLAGS_CFM_NORM;
+				m_normalERP = value;
+				m_flags |= BT_HINGE_FLAGS_ERP_NORM;
+				break;
 			default : 
@@ -1020,6 +1110,10 @@ btScalar btHingeConstraint::getParam(int num, int axis) const
 				btAssertConstrParams(m_flags & BT_HINGE_FLAGS_CFM_NORM);
 				retVal = m_normalCFM;
+				btAssertConstrParams(m_flags & BT_HINGE_FLAGS_ERP_NORM);
+				retVal = m_normalERP;
+				break;
 			default : 
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btHingeConstraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btHingeConstraint.h
index cb2973e1..f26e7210 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btHingeConstraint.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btHingeConstraint.h
@@ -28,8 +28,8 @@ subject to the following restrictions:
 class btRigidBody;
-#define btHingeConstraintData	btHingeConstraintDoubleData
-#define btHingeConstraintDataName	"btHingeConstraintDoubleData"
+#define btHingeConstraintData	btHingeConstraintDoubleData2 //rename to 2 for backwards compatibility, so we can still load the 'btHingeConstraintDoubleData' version
+#define btHingeConstraintDataName	"btHingeConstraintDoubleData2" 
 #define btHingeConstraintData	btHingeConstraintFloatData
 #define btHingeConstraintDataName	"btHingeConstraintFloatData"
@@ -41,7 +41,8 @@ enum btHingeFlags
@@ -94,12 +95,15 @@ public:
 	int			m_flags;
 	btScalar	m_normalCFM;
+	btScalar	m_normalERP;
 	btScalar	m_stopCFM;
 	btScalar	m_stopERP;
 	btHingeConstraint(btRigidBody& rbA,btRigidBody& rbB, const btVector3& pivotInA,const btVector3& pivotInB, const btVector3& axisInA,const btVector3& axisInB, bool useReferenceFrameA = false);
 	btHingeConstraint(btRigidBody& rbA,const btVector3& pivotInA,const btVector3& axisInA, bool useReferenceFrameA = false);
@@ -173,6 +177,7 @@ public:
 	//       maintain a given angular target.
 	void enableMotor(bool enableMotor) 	{ m_enableAngularMotor = enableMotor; }
 	void setMaxMotorImpulse(btScalar maxMotorImpulse) { m_maxMotorImpulse = maxMotorImpulse; }
+	void setMotorTargetVelocity(btScalar motorTargetVelocity) { m_motorTargetVelocity = motorTargetVelocity; } // stores the value in m_motorTargetVelocity; does not touch m_enableAngularMotor
 	void setMotorTarget(const btQuaternion& qAinB, btScalar dt); // qAinB is rotation of body A wrt body B.
 	void setMotorTarget(btScalar targetAngle, btScalar dt);
@@ -189,6 +194,33 @@ public:
 		m_relaxationFactor = _relaxationFactor;
+	btScalar getLimitSoftness() const // NOTE(review): two alternative returns follow; presumably a preprocessor conditional not shown in this view picks one -- confirm
+	{
+		return m_limit.getSoftness(); // variant that reads the value from the m_limit object
+		return m_limitSoftness; // variant that reads the plain member
+	}
+	btScalar getLimitBiasFactor() const // NOTE(review): two alternative returns follow; presumably a preprocessor conditional not shown in this view picks one -- confirm
+	{
+		return m_limit.getBiasFactor(); // variant that reads the value from the m_limit object
+		return m_biasFactor; // variant that reads the plain member
+	}
+	btScalar getLimitRelaxationFactor() const // NOTE(review): two alternative returns follow; presumably a preprocessor conditional not shown in this view picks one -- confirm
+	{
+		return m_limit.getRelaxationFactor(); // variant that reads the value from the m_limit object
+		return m_relaxationFactor; // variant that reads the plain member
+	}
 	void	setAxis(btVector3& axisInA)
@@ -215,6 +247,14 @@ public:
+    bool hasLimit() const { // NOTE(review): two alternative returns follow; presumably a preprocessor conditional not shown in this view picks one -- confirm
+        return m_limit.getHalfRange() > 0; // m_limit variant: true when the half range is positive
+        return m_lowerLimit <= m_upperLimit; // plain-member variant: true when lower does not exceed upper
+    }
 	btScalar	getLowerLimit() const
@@ -234,6 +274,7 @@ public:
+	///The getHingeAngle gives the hinge angle in range [-PI,PI]
 	btScalar getHingeAngle();
 	btScalar getHingeAngle(const btTransform& transA,const btTransform& transB);
@@ -284,13 +325,20 @@ public:
 	// access for UseFrameOffset
 	bool getUseFrameOffset() { return m_useOffsetForConstraintFrame; }
 	void setUseFrameOffset(bool frameOffsetOnOff) { m_useOffsetForConstraintFrame = frameOffsetOnOff; }
+	// read/write access to the m_useReferenceFrameA flag
+	bool getUseReferenceFrameA() const { return m_useReferenceFrameA; }
+	void setUseReferenceFrameA(bool useReferenceFrameA) { m_useReferenceFrameA = useReferenceFrameA; } // plain store, nothing else is recomputed here
 	///override the default global value of a parameter (such as ERP or CFM), optionally provide the axis (0..5). 
 	///If no axis is provided, it uses the default axis for this constraint.
 	virtual	void	setParam(int num, btScalar value, int axis = -1);
 	///return the local value of parameter
 	virtual	btScalar getParam(int num, int axis = -1) const;
+	virtual	int getFlags() const // exposes the raw m_flags word, i.e. the BT_HINGE_FLAGS_* bits tested by this class
+	{
+  	    return m_flags;
+	}
 	virtual	int	calculateSerializeBufferSize() const;
@@ -300,7 +348,10 @@ public:
-///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+//only for backward compatibility
+///this structure is not used, except for loading pre-2.82 .bullet files
 struct	btHingeConstraintDoubleData
 	btTypedConstraintData	m_typeConstraintData;
@@ -319,7 +370,46 @@ struct	btHingeConstraintDoubleData
 	float	m_relaxationFactor;
-///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+///btHingeAccumulatedAngleConstraint adds m_accumulatedAngle to btHingeConstraint; getAccumulatedHingeAngle returns the accumulated hinge angle, taking rotation across the -PI/PI boundary into account
+ATTRIBUTE_ALIGNED16(class) btHingeAccumulatedAngleConstraint : public btHingeConstraint
+	btScalar	m_accumulatedAngle; // running angle; every constructor below seeds it from getHingeAngle()
+	btHingeAccumulatedAngleConstraint(btRigidBody& rbA,btRigidBody& rbB, const btVector3& pivotInA,const btVector3& pivotInB, const btVector3& axisInA,const btVector3& axisInB, bool useReferenceFrameA = false)
+	:btHingeConstraint(rbA,rbB,pivotInA,pivotInB, axisInA,axisInB, useReferenceFrameA ) // two bodies, pivots and axes: all arguments forwarded to the base
+	{
+		m_accumulatedAngle=getHingeAngle(); // start from the current hinge angle
+	}
+	btHingeAccumulatedAngleConstraint(btRigidBody& rbA,const btVector3& pivotInA,const btVector3& axisInA, bool useReferenceFrameA = false)
+	:btHingeConstraint(rbA,pivotInA,axisInA, useReferenceFrameA) // single body, pivot and axis: all arguments forwarded to the base
+	{
+		m_accumulatedAngle=getHingeAngle(); // start from the current hinge angle
+	}
+	btHingeAccumulatedAngleConstraint(btRigidBody& rbA,btRigidBody& rbB, const btTransform& rbAFrame, const btTransform& rbBFrame, bool useReferenceFrameA = false)
+	:btHingeConstraint(rbA,rbB, rbAFrame, rbBFrame, useReferenceFrameA ) // two bodies, explicit frames: all arguments forwarded to the base
+	{
+		m_accumulatedAngle=getHingeAngle(); // start from the current hinge angle
+	}
+	btHingeAccumulatedAngleConstraint(btRigidBody& rbA,const btTransform& rbAFrame, bool useReferenceFrameA = false)
+	:btHingeConstraint(rbA,rbAFrame, useReferenceFrameA ) // single body, explicit frame: all arguments forwarded to the base
+	{
+		m_accumulatedAngle=getHingeAngle(); // start from the current hinge angle
+	}
+	btScalar getAccumulatedHingeAngle(); // declared non-const
+	void	setAccumulatedHingeAngle(btScalar accAngle); // caller-supplied replacement for the stored angle
+	virtual void getInfo1 (btConstraintInfo1* info); // declared virtual; same signature as btHingeConstraint::getInfo1
 struct	btHingeConstraintFloatData
 	btTypedConstraintData	m_typeConstraintData;
@@ -342,6 +432,30 @@ struct	btHingeConstraintFloatData
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	btHingeConstraintDoubleData2
+	btTypedConstraintDoubleData	m_typeConstraintData;
+	btTransformDoubleData m_rbAFrame; // constraint axes. Assumes z is hinge axis.
+	btTransformDoubleData m_rbBFrame;
+	int			m_useReferenceFrameA; // the three flags are stored as int
+	int			m_angularOnly;
+	int			m_enableAngularMotor;
+	double		m_motorTargetVelocity;
+	double		m_maxMotorImpulse;
+	double		m_lowerLimit;
+	double		m_upperLimit;
+	double		m_limitSoftness;
+	double		m_biasFactor;
+	double		m_relaxationFactor;
+	char	m_padding1[4]; // explicit padding bytes; presumably for size/alignment of the serialized layout -- confirm
 SIMD_FORCE_INLINE	int	btHingeConstraint::calculateSerializeBufferSize() const
 	return sizeof(btHingeConstraintData);
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btJacobianEntry.h b/src/bullet/BulletDynamics/ConstraintSolver/btJacobianEntry.h
index f1994a2d..125580d1 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btJacobianEntry.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btJacobianEntry.h
@@ -16,8 +16,7 @@ subject to the following restrictions:
-#include "LinearMath/btVector3.h"
-#include "BulletDynamics/Dynamics/btRigidBody.h"
+#include "LinearMath/btMatrix3x3.h"
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.cpp
new file mode 100644
index 00000000..f110cd48
--- /dev/null
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.cpp
@@ -0,0 +1,463 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "btNNCGConstraintSolver.h"
+btScalar btNNCGConstraintSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
+	btScalar val = btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup( bodies,numBodies,manifoldPtr, numManifolds, constraints,numConstraints,infoGlobal,debugDrawer); // run the base-class setup first; its result is returned unchanged below
+	m_pNC.resizeNoInitialize(m_tmpSolverNonContactConstraintPool.size()); // m_p* arrays: one slot per entry of the matching constraint pool
+	m_pC.resizeNoInitialize(m_tmpSolverContactConstraintPool.size());
+	m_pCF.resizeNoInitialize(m_tmpSolverContactFrictionConstraintPool.size());
+	m_pCRF.resizeNoInitialize(m_tmpSolverContactRollingFrictionConstraintPool.size());
+	m_deltafNC.resizeNoInitialize(m_tmpSolverNonContactConstraintPool.size()); // m_deltaf* arrays: sized the same way, pool by pool
+	m_deltafC.resizeNoInitialize(m_tmpSolverContactConstraintPool.size());
+	m_deltafCF.resizeNoInitialize(m_tmpSolverContactFrictionConstraintPool.size());
+	m_deltafCRF.resizeNoInitialize(m_tmpSolverContactRollingFrictionConstraintPool.size());
+	return val;
+btScalar btNNCGConstraintSolver::solveSingleIteration(int iteration, btCollisionObject** /*bodies */,int /*numBodies*/,btPersistentManifold** /*manifoldPtr*/, int /*numManifolds*/,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* /*debugDrawer*/)
+	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
+	int numConstraintPool = m_tmpSolverContactConstraintPool.size();
+	int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
+	if (infoGlobal.m_solverMode & SOLVER_RANDMIZE_ORDER)
+	{
+		if (1)			// uncomment this for a bit less random ((iteration & 7) == 0)
+		{
+			for (int j=0; j<numNonContactPool; ++j) {
+				int tmp = m_orderNonContactConstraintPool[j];
+				int swapi = btRandInt2(j+1);
+				m_orderNonContactConstraintPool[j] = m_orderNonContactConstraintPool[swapi];
+				m_orderNonContactConstraintPool[swapi] = tmp;
+			}
+			//contact/friction constraints are not solved more than infoGlobal.m_numIterations times
+			if (iteration< infoGlobal.m_numIterations)
+			{
+				for (int j=0; j<numConstraintPool; ++j) {
+					int tmp = m_orderTmpConstraintPool[j];
+					int swapi = btRandInt2(j+1);
+					m_orderTmpConstraintPool[j] = m_orderTmpConstraintPool[swapi];
+					m_orderTmpConstraintPool[swapi] = tmp;
+				}
+				for (int j=0; j<numFrictionPool; ++j) {
+					int tmp = m_orderFrictionConstraintPool[j];
+					int swapi = btRandInt2(j+1);
+					m_orderFrictionConstraintPool[j] = m_orderFrictionConstraintPool[swapi];
+					m_orderFrictionConstraintPool[swapi] = tmp;
+				}
+			}
+		}
+	}
+	btScalar deltaflengthsqr = 0;
+	if (infoGlobal.m_solverMode & SOLVER_SIMD)
+	{
+		for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++)
+		{
+			btSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[m_orderNonContactConstraintPool[j]];
+			if (iteration < constraint.m_overrideNumSolverIterations) 
+			{
+				btScalar deltaf = resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
+				m_deltafNC[j] = deltaf;
+				deltaflengthsqr += deltaf * deltaf;
+			}
+		}
+	} else 
+	{
+		for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++)
+		{
+			btSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[m_orderNonContactConstraintPool[j]];
+			if (iteration < constraint.m_overrideNumSolverIterations) 
+			{
+				btScalar deltaf = resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
+				m_deltafNC[j] = deltaf;
+				deltaflengthsqr += deltaf * deltaf;
+			}
+		}
+	}
+	if (m_onlyForNoneContact) 
+	{
+		if (iteration==0) 
+		{
+			for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++) m_pNC[j] = m_deltafNC[j];
+		} else {
+			// m_deltafLengthSqrPrev can be 0 only if the solver solved the problem exactly in the previous iteration. In that case we should have quit, but, mainly for debugging reasons, this 'hack' (forcing beta to 2) allows the calculation to continue
+			btScalar beta = m_deltafLengthSqrPrev>0 ? deltaflengthsqr / m_deltafLengthSqrPrev : 2;
+			if (beta>1) 
+			{
+				for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++) m_pNC[j] = 0;
+			} else 
+			{
+				for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++)
+				{
+					btSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[m_orderNonContactConstraintPool[j]];
+					if (iteration < constraint.m_overrideNumSolverIterations) 
+					{
+						btScalar additionaldeltaimpulse = beta * m_pNC[j];
+						constraint.m_appliedImpulse = btScalar(constraint.m_appliedImpulse) + additionaldeltaimpulse;
+						m_pNC[j] = beta * m_pNC[j] + m_deltafNC[j];
+						btSolverBody& body1 = m_tmpSolverBodyPool[constraint.m_solverBodyIdA];
+						btSolverBody& body2 = m_tmpSolverBodyPool[constraint.m_solverBodyIdB];
+						const btSolverConstraint& c = constraint;
+						body1.internalApplyImpulse(c.m_contactNormal1*body1.internalGetInvMass(),c.m_angularComponentA,additionaldeltaimpulse);
+						body2.internalApplyImpulse(c.m_contactNormal2*body2.internalGetInvMass(),c.m_angularComponentB,additionaldeltaimpulse);
+					}
+				}
+			}
+		}
+		m_deltafLengthSqrPrev = deltaflengthsqr;
+	}
+	if (infoGlobal.m_solverMode & SOLVER_SIMD)
+	{
+		if (iteration< infoGlobal.m_numIterations)
+		{
+			for (int j=0;j<numConstraints;j++)
+			{
+				if (constraints[j]->isEnabled())
+				{
+					int bodyAid = getOrInitSolverBody(constraints[j]->getRigidBodyA(),infoGlobal.m_timeStep);
+					int bodyBid = getOrInitSolverBody(constraints[j]->getRigidBodyB(),infoGlobal.m_timeStep);
+					btSolverBody& bodyA = m_tmpSolverBodyPool[bodyAid];
+					btSolverBody& bodyB = m_tmpSolverBodyPool[bodyBid];
+					constraints[j]->solveConstraintObsolete(bodyA,bodyB,infoGlobal.m_timeStep);
+				}
+			}
+			///solve all contact constraints using SIMD, if available
+			{
+				int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+				int multiplier = (infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS)? 2 : 1;
+				for (int c=0;c<numPoolConstraints;c++)
+				{
+					btScalar totalImpulse =0;
+					{
+						const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[c]];
+						btScalar deltaf = resolveSingleConstraintRowLowerLimitSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);						
+						m_deltafC[c] = deltaf;
+						deltaflengthsqr += deltaf*deltaf;
+						totalImpulse = solveManifold.m_appliedImpulse;
+					}
+					bool applyFriction = true;
+					if (applyFriction)
+					{
+						{
+							btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[c*multiplier]];
+							if (totalImpulse>btScalar(0))
+							{
+								solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+								solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+								btScalar deltaf = resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+								m_deltafCF[c*multiplier] = deltaf;
+								deltaflengthsqr += deltaf*deltaf;
+							} else {
+								m_deltafCF[c*multiplier] = 0;
+							}
+						}
+						if (infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS)
+						{
+							btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[c*multiplier+1]];
+							if (totalImpulse>btScalar(0))
+							{
+								solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+								solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+								btScalar deltaf = resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+								m_deltafCF[c*multiplier+1] = deltaf;
+								deltaflengthsqr += deltaf*deltaf;
+							} else {
+								m_deltafCF[c*multiplier+1] = 0;
+							}
+						}
+					}
+				}
+			}
+			{
+				//solve the friction constraints after all contact constraints, don't interleave them
+				int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+				int j;
+				for (j=0;j<numPoolConstraints;j++)
+				{
+					const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
+					//resolveSingleConstraintRowLowerLimitSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+					btScalar deltaf = resolveSingleConstraintRowLowerLimit(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+					m_deltafC[j] = deltaf;
+					deltaflengthsqr += deltaf*deltaf;
+				}
+				///solve all friction constraints, using SIMD, if available
+				int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
+				for (j=0;j<numFrictionPoolConstraints;j++)
+				{
+					btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
+					btScalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
+					if (totalImpulse>btScalar(0))
+					{
+						solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+						solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+						//resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+						btScalar deltaf = resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+						m_deltafCF[j] = deltaf;
+						deltaflengthsqr += deltaf*deltaf;
+					} else {
+						m_deltafCF[j] = 0;
+					}
+				}
+				int numRollingFrictionPoolConstraints = m_tmpSolverContactRollingFrictionConstraintPool.size();
+				for (j=0;j<numRollingFrictionPoolConstraints;j++)
+				{
+					btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[j];
+					btScalar totalImpulse = m_tmpSolverContactConstraintPool[rollingFrictionConstraint.m_frictionIndex].m_appliedImpulse;
+					if (totalImpulse>btScalar(0))
+					{
+						btScalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse;
+						if (rollingFrictionMagnitude>rollingFrictionConstraint.m_friction)
+							rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+						rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+						rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+						btScalar deltaf = resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdA],m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdB],rollingFrictionConstraint);
+						m_deltafCRF[j] = deltaf;
+						deltaflengthsqr += deltaf*deltaf;
+					} else {
+						m_deltafCRF[j] = 0;
+					}
+				}
+			}
+		}
+	} else
+	{
+		if (iteration< infoGlobal.m_numIterations)
+		{
+			for (int j=0;j<numConstraints;j++)
+			{
+				if (constraints[j]->isEnabled())
+				{
+					int bodyAid = getOrInitSolverBody(constraints[j]->getRigidBodyA(),infoGlobal.m_timeStep);
+					int bodyBid = getOrInitSolverBody(constraints[j]->getRigidBodyB(),infoGlobal.m_timeStep);
+					btSolverBody& bodyA = m_tmpSolverBodyPool[bodyAid];
+					btSolverBody& bodyB = m_tmpSolverBodyPool[bodyBid];
+					constraints[j]->solveConstraintObsolete(bodyA,bodyB,infoGlobal.m_timeStep);
+				}
+			}
+			///solve all contact constraints
+			int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+			for (int j=0;j<numPoolConstraints;j++)
+			{
+				const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
+				btScalar deltaf = resolveSingleConstraintRowLowerLimit(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+				m_deltafC[j] = deltaf;
+				deltaflengthsqr += deltaf*deltaf;
+			}
+			///solve all friction constraints
+			int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
+			for (int j=0;j<numFrictionPoolConstraints;j++)
+			{
+				btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
+				btScalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
+				if (totalImpulse>btScalar(0))
+				{
+					solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+					solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+					btScalar deltaf = resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+					m_deltafCF[j] = deltaf;
+					deltaflengthsqr += deltaf*deltaf;
+				} else {
+					m_deltafCF[j] = 0;
+				}
+			}
+			int numRollingFrictionPoolConstraints = m_tmpSolverContactRollingFrictionConstraintPool.size();
+			for (int j=0;j<numRollingFrictionPoolConstraints;j++)
+			{
+				btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[j];
+				btScalar totalImpulse = m_tmpSolverContactConstraintPool[rollingFrictionConstraint.m_frictionIndex].m_appliedImpulse;
+				if (totalImpulse>btScalar(0))
+				{
+					btScalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse;
+					if (rollingFrictionMagnitude>rollingFrictionConstraint.m_friction)
+						rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+					rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+					rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+					btScalar deltaf = resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdA],m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdB],rollingFrictionConstraint);
+					m_deltafCRF[j] = deltaf;
+					deltaflengthsqr += deltaf*deltaf;
+				} else {
+					m_deltafCRF[j] = 0;
+				}
+			}
+		}
+	}
+	if (!m_onlyForNoneContact) 
+	{
+		if (iteration==0) 
+		{
+			for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++) m_pNC[j] = m_deltafNC[j];
+			for (int j=0;j<m_tmpSolverContactConstraintPool.size();j++) m_pC[j] = m_deltafC[j];
+			for (int j=0;j<m_tmpSolverContactFrictionConstraintPool.size();j++) m_pCF[j] = m_deltafCF[j];
+			if ( (infoGlobal.m_solverMode & SOLVER_SIMD) ==0 || (infoGlobal.m_solverMode & SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS) == 0 ) 
+			{
+				for (int j=0;j<m_tmpSolverContactRollingFrictionConstraintPool.size();j++) m_pCRF[j] = m_deltafCRF[j];
+			}
+		} else 
+		{
+			// m_deltafLengthSqrPrev can be 0 only if the solver solved the problem exactly in the previous iteration. In that case we should already have quit; mainly for debugging reasons, this 'hack' (falling back to beta = 2, which takes the beta>1 branch and zeroes the m_p* arrays) allows the calculation to continue.
+			btScalar beta = m_deltafLengthSqrPrev>0 ? deltaflengthsqr / m_deltafLengthSqrPrev : 2;
+			if (beta>1) {
+				for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++) m_pNC[j] = 0;
+				for (int j=0;j<m_tmpSolverContactConstraintPool.size();j++) m_pC[j] = 0;
+				for (int j=0;j<m_tmpSolverContactFrictionConstraintPool.size();j++) m_pCF[j] = 0;
+				if ( (infoGlobal.m_solverMode & SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS) == 0 ) {
+					for (int j=0;j<m_tmpSolverContactRollingFrictionConstraintPool.size();j++) m_pCRF[j] = 0;
+				}
+			} else {
+				for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++)
+				{
+					btSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[m_orderNonContactConstraintPool[j]];
+					if (iteration < constraint.m_overrideNumSolverIterations) {
+						btScalar additionaldeltaimpulse = beta * m_pNC[j];
+						constraint.m_appliedImpulse = btScalar(constraint.m_appliedImpulse) + additionaldeltaimpulse;
+						m_pNC[j] = beta * m_pNC[j] + m_deltafNC[j];
+						btSolverBody& body1 = m_tmpSolverBodyPool[constraint.m_solverBodyIdA];
+						btSolverBody& body2 = m_tmpSolverBodyPool[constraint.m_solverBodyIdB];
+						const btSolverConstraint& c = constraint;
+						body1.internalApplyImpulse(c.m_contactNormal1*body1.internalGetInvMass(),c.m_angularComponentA,additionaldeltaimpulse);
+						body2.internalApplyImpulse(c.m_contactNormal2*body2.internalGetInvMass(),c.m_angularComponentB,additionaldeltaimpulse);
+					}
+				}
+				for (int j=0;j<m_tmpSolverContactConstraintPool.size();j++)
+				{
+					btSolverConstraint& constraint = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
+					if (iteration< infoGlobal.m_numIterations) {
+						btScalar additionaldeltaimpulse = beta * m_pC[j];
+						constraint.m_appliedImpulse = btScalar(constraint.m_appliedImpulse) + additionaldeltaimpulse;
+						m_pC[j] = beta * m_pC[j] + m_deltafC[j];
+						btSolverBody& body1 = m_tmpSolverBodyPool[constraint.m_solverBodyIdA];
+						btSolverBody& body2 = m_tmpSolverBodyPool[constraint.m_solverBodyIdB];
+						const btSolverConstraint& c = constraint;
+						body1.internalApplyImpulse(c.m_contactNormal1*body1.internalGetInvMass(),c.m_angularComponentA,additionaldeltaimpulse);
+						body2.internalApplyImpulse(c.m_contactNormal2*body2.internalGetInvMass(),c.m_angularComponentB,additionaldeltaimpulse);
+					}
+				}
+				for (int j=0;j<m_tmpSolverContactFrictionConstraintPool.size();j++)
+				{
+					btSolverConstraint& constraint = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
+					if (iteration< infoGlobal.m_numIterations) {
+						btScalar additionaldeltaimpulse = beta * m_pCF[j];
+						constraint.m_appliedImpulse = btScalar(constraint.m_appliedImpulse) + additionaldeltaimpulse;
+						m_pCF[j] = beta * m_pCF[j] + m_deltafCF[j];
+						btSolverBody& body1 = m_tmpSolverBodyPool[constraint.m_solverBodyIdA];
+						btSolverBody& body2 = m_tmpSolverBodyPool[constraint.m_solverBodyIdB];
+						const btSolverConstraint& c = constraint;
+						body1.internalApplyImpulse(c.m_contactNormal1*body1.internalGetInvMass(),c.m_angularComponentA,additionaldeltaimpulse);
+						body2.internalApplyImpulse(c.m_contactNormal2*body2.internalGetInvMass(),c.m_angularComponentB,additionaldeltaimpulse);
+					}
+				}
+				if ( (infoGlobal.m_solverMode & SOLVER_SIMD) ==0 || (infoGlobal.m_solverMode & SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS) == 0 ) {
+					for (int j=0;j<m_tmpSolverContactRollingFrictionConstraintPool.size();j++)
+					{
+						btSolverConstraint& constraint = m_tmpSolverContactRollingFrictionConstraintPool[j];
+						if (iteration< infoGlobal.m_numIterations) {
+							btScalar additionaldeltaimpulse = beta * m_pCRF[j];
+							constraint.m_appliedImpulse = btScalar(constraint.m_appliedImpulse) + additionaldeltaimpulse;
+							m_pCRF[j] = beta * m_pCRF[j] + m_deltafCRF[j];
+							btSolverBody& body1 = m_tmpSolverBodyPool[constraint.m_solverBodyIdA];
+							btSolverBody& body2 = m_tmpSolverBodyPool[constraint.m_solverBodyIdB];
+							const btSolverConstraint& c = constraint;
+							body1.internalApplyImpulse(c.m_contactNormal1*body1.internalGetInvMass(),c.m_angularComponentA,additionaldeltaimpulse);
+							body2.internalApplyImpulse(c.m_contactNormal2*body2.internalGetInvMass(),c.m_angularComponentB,additionaldeltaimpulse);
+						}
+					}
+				}
+			}
+		}
+		m_deltafLengthSqrPrev = deltaflengthsqr;
+	}
+	return deltaflengthsqr;
+btScalar btNNCGConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal) // Resizes every per-constraint scratch array of this solver to length 0, then returns the result of the base-class finish step.
+	m_pNC.resizeNoInitialize(0); // "p" arrays, one per constraint category: non-contact, contact, contact friction, contact rolling friction
+	m_pC.resizeNoInitialize(0);
+	m_pCF.resizeNoInitialize(0);
+	m_pCRF.resizeNoInitialize(0);
+	m_deltafNC.resizeNoInitialize(0); // "deltaf" arrays for the same four categories
+	m_deltafC.resizeNoInitialize(0);
+	m_deltafCF.resizeNoInitialize(0);
+	m_deltafCRF.resizeNoInitialize(0);
+	return btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(bodies, numBodies, infoGlobal); // arguments are forwarded unchanged
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.h b/src/bullet/BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.h
new file mode 100644
index 00000000..a300929c
--- /dev/null
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.h
@@ -0,0 +1,64 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "btSequentialImpulseConstraintSolver.h"
+///Constraint solver derived from btSequentialImpulseConstraintSolver that overrides the setup, single-iteration and finish hooks.
+///For each constraint category it keeps a "p" array and the most recent per-row impulse change ("deltaf").
+ATTRIBUTE_ALIGNED16(class) btNNCGConstraintSolver : public btSequentialImpulseConstraintSolver
+	btScalar m_deltafLengthSqrPrev; // sum of squared deltaf values recorded at the end of the previous solveSingleIteration pass
+	btAlignedObjectArray<btScalar> m_pNC;  // p for non-contact constraints
+	btAlignedObjectArray<btScalar> m_pC;   // p for contact constraints
+	btAlignedObjectArray<btScalar> m_pCF;  // p for contact friction constraints
+	btAlignedObjectArray<btScalar> m_pCRF; // p for contact rolling friction constraints
+	//These are recalculated in every iteration. We keep them as members only to prevent reallocation in each iteration.
+	btAlignedObjectArray<btScalar> m_deltafNC;  // deltaf for non-contact constraints
+	btAlignedObjectArray<btScalar> m_deltafC;   // deltaf for contact constraints
+	btAlignedObjectArray<btScalar> m_deltafCF;  // deltaf for contact friction constraints
+	btAlignedObjectArray<btScalar> m_deltafCRF; // deltaf for contact rolling friction constraints
+	virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal);
+	virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
+	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
+	btNNCGConstraintSolver() : btSequentialImpulseConstraintSolver(), m_deltafLengthSqrPrev(btScalar(0)), m_onlyForNoneContact(false) {} // m_deltafLengthSqrPrev is zero-initialised so it is never indeterminate; solveSingleIteration already treats a non-positive value as "reset p"
+	virtual btConstraintSolverType getSolverType() const
+	{
+		return BT_NNCG_SOLVER;
+	}
+	bool m_onlyForNoneContact; // when true, solveSingleIteration skips the p/beta update step it guards with !m_onlyForNoneContact
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.cpp
index 7e0d93b9..3c0430b9 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.cpp
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.cpp
@@ -116,15 +116,14 @@ void btPoint2PointConstraint::getInfo2NonVirtual (btConstraintInfo2* info, const
-	/*info->m_J2linearAxis[0] = -1;
-    info->m_J2linearAxis[s+1] = -1;
-    info->m_J2linearAxis[2*s+2] = -1;
-	*/
+	info->m_J2linearAxis[0] = -1;
+    info->m_J2linearAxis[info->rowskip+1] = -1;
+    info->m_J2linearAxis[2*info->rowskip+2] = -1;
 	btVector3 a2 = body1_trans.getBasis()*getPivotInB();
-		btVector3 a2n = -a2;
+	//	btVector3 a2n = -a2;
 		btVector3* angular0 = (btVector3*)(info->m_J2angularAxis);
 		btVector3* angular1 = (btVector3*)(info->m_J2angularAxis+info->rowskip);
 		btVector3* angular2 = (btVector3*)(info->m_J2angularAxis+2*info->rowskip);
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h
index b3bda03e..8fa03d71 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h
@@ -24,10 +24,10 @@ class btRigidBody;
-#define btPoint2PointConstraintData	btPoint2PointConstraintDoubleData
-#define btPoint2PointConstraintDataName	"btPoint2PointConstraintDoubleData"
+#define btPoint2PointConstraintData2	btPoint2PointConstraintDoubleData2
+#define btPoint2PointConstraintDataName	"btPoint2PointConstraintDoubleData2"
-#define btPoint2PointConstraintData	btPoint2PointConstraintFloatData
+#define btPoint2PointConstraintData2	btPoint2PointConstraintFloatData
 #define btPoint2PointConstraintDataName	"btPoint2PointConstraintFloatData"
@@ -67,6 +67,8 @@ public:
 	///for backwards compatibility during the transition to 'getInfo/getInfo2'
 	bool		m_useSolveConstraintObsolete;
@@ -114,6 +116,11 @@ public:
 	virtual	void	setParam(int num, btScalar value, int axis = -1);
 	///return the local value of parameter
 	virtual	btScalar getParam(int num, int axis = -1) const;
+	virtual	int getFlags() const // Returns the current value of m_flags.
+	{
+        	return m_flags;
+    	}
 	virtual	int	calculateSerializeBufferSize() const;
@@ -131,6 +138,17 @@ struct	btPoint2PointConstraintFloatData
 	btVector3FloatData	m_pivotInB;
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	btPoint2PointConstraintDoubleData2
+	btTypedConstraintDoubleData	m_typeConstraintData;
+	btVector3DoubleData	m_pivotInA;
+	btVector3DoubleData	m_pivotInB;
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+///this structure is not used, except for loading pre-2.82 .bullet files
 ///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
 struct	btPoint2PointConstraintDoubleData
@@ -138,18 +156,19 @@ struct	btPoint2PointConstraintDoubleData
 	btVector3DoubleData	m_pivotInA;
 	btVector3DoubleData	m_pivotInB;
 SIMD_FORCE_INLINE	int	btPoint2PointConstraint::calculateSerializeBufferSize() const
-	return sizeof(btPoint2PointConstraintData);
+	return sizeof(btPoint2PointConstraintData2);
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
 SIMD_FORCE_INLINE	const char*	btPoint2PointConstraint::serialize(void* dataBuffer, btSerializer* serializer) const
-	btPoint2PointConstraintData* p2pData = (btPoint2PointConstraintData*)dataBuffer;
+	btPoint2PointConstraintData2* p2pData = (btPoint2PointConstraintData2*)dataBuffer;
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
index ab074224..fe45af42 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
@@ -4,8 +4,8 @@ Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
@@ -14,177 +14,301 @@ subject to the following restrictions:
 //It is not necessary (redundant) to refresh contact manifolds, this refresh has been moved to the collision algorithms.
 #include "btSequentialImpulseConstraintSolver.h"
 #include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
-#include "BulletDynamics/Dynamics/btRigidBody.h"
-#include "btContactConstraint.h"
-#include "btSolve2LinearConstraint.h"
-#include "btContactSolverInfo.h"
 #include "LinearMath/btIDebugDraw.h"
-#include "btJacobianEntry.h"
+#include "LinearMath/btCpuFeatureUtility.h"
+//#include "btJacobianEntry.h"
 #include "LinearMath/btMinMax.h"
 #include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
 #include <new>
 #include "LinearMath/btStackAlloc.h"
 #include "LinearMath/btQuickprof.h"
-#include "btSolverBody.h"
-#include "btSolverConstraint.h"
+//#include "btSolverBody.h"
+//#include "btSolverConstraint.h"
 #include "LinearMath/btAlignedObjectArray.h"
 #include <string.h> //for memset
 int		gNumSplitImpulseRecoveries = 0;
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+///Scalar reference implementation of solving a single constraint row: the inner loop of the Projected Gauss-Seidel / Sequential Impulse constraint solver.
+///Optional SSE2 and SSE4/FMA3 versions follow below. We assume most hardware has SSE2; for SSE4/FMA3 we perform a CPU feature check.
+static btSimdScalar gResolveSingleConstraintRowGeneric_scalar_reference(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c) // returns the impulse change that was actually applied to this row
+	btScalar deltaImpulse = c.m_rhs - btScalar(c.m_appliedImpulse)*c.m_cfm; // start from the row's rhs minus the cfm-scaled accumulated impulse
+	const btScalar deltaVel1Dotn = c.m_contactNormal1.dot(body1.internalGetDeltaLinearVelocity()) + c.m_relpos1CrossNormal.dot(body1.internalGetDeltaAngularVelocity()); // body1's delta velocity (linear + angular) projected onto this row
+	const btScalar deltaVel2Dotn = c.m_contactNormal2.dot(body2.internalGetDeltaLinearVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetDeltaAngularVelocity()); // same projection for body2
+	//	const btScalar delta_rel_vel	=	deltaVel1Dotn-deltaVel2Dotn;
+	deltaImpulse -= deltaVel1Dotn*c.m_jacDiagABInv; // the two contributions are subtracted separately, in this order
+	deltaImpulse -= deltaVel2Dotn*c.m_jacDiagABInv;
+	const btScalar sum = btScalar(c.m_appliedImpulse) + deltaImpulse; // candidate new accumulated impulse
+	if (sum < c.m_lowerLimit) // below the lower limit: clamp the accumulated impulse and shrink deltaImpulse to the clamped step
+	{
+		deltaImpulse = c.m_lowerLimit - c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_lowerLimit; // written through a const reference; NOTE(review): presumably m_appliedImpulse is declared mutable - confirm in btSolverConstraint
+	}
+	else if (sum > c.m_upperLimit) // above the upper limit: clamp the same way
+	{
+		deltaImpulse = c.m_upperLimit - c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_upperLimit;
+	}
+	else
+	{
+		c.m_appliedImpulse = sum; // within limits: accept the unclamped value
+	}
+	body1.internalApplyImpulse(c.m_contactNormal1*body1.internalGetInvMass(), c.m_angularComponentA, deltaImpulse); // the same (possibly clamped) deltaImpulse is applied to both bodies
+	body2.internalApplyImpulse(c.m_contactNormal2*body2.internalGetInvMass(), c.m_angularComponentB, deltaImpulse);
+	return deltaImpulse;
+static btSimdScalar gResolveSingleConstraintRowLowerLimit_scalar_reference(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c) // Scalar row solve that clamps only against c.m_lowerLimit (m_upperLimit is never read); returns the impulse change that was applied.
+	btScalar deltaImpulse = c.m_rhs - btScalar(c.m_appliedImpulse)*c.m_cfm; // start from the row's rhs minus the cfm-scaled accumulated impulse
+	const btScalar deltaVel1Dotn = c.m_contactNormal1.dot(body1.internalGetDeltaLinearVelocity()) + c.m_relpos1CrossNormal.dot(body1.internalGetDeltaAngularVelocity()); // body1's delta velocity (linear + angular) projected onto this row
+	const btScalar deltaVel2Dotn = c.m_contactNormal2.dot(body2.internalGetDeltaLinearVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetDeltaAngularVelocity()); // same projection for body2
+	deltaImpulse -= deltaVel1Dotn*c.m_jacDiagABInv; // the two contributions are subtracted separately, in this order
+	deltaImpulse -= deltaVel2Dotn*c.m_jacDiagABInv;
+	const btScalar sum = btScalar(c.m_appliedImpulse) + deltaImpulse; // candidate new accumulated impulse
+	if (sum < c.m_lowerLimit) // below the lower limit: clamp the accumulated impulse and shrink deltaImpulse to the clamped step
+	{
+		deltaImpulse = c.m_lowerLimit - c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_lowerLimit; // written through a const reference; NOTE(review): presumably m_appliedImpulse is declared mutable - confirm in btSolverConstraint
+	}
+	else
+	{
+		c.m_appliedImpulse = sum; // no upper clamp in this variant
+	}
+	body1.internalApplyImpulse(c.m_contactNormal1*body1.internalGetInvMass(), c.m_angularComponentA, deltaImpulse); // the same (possibly clamped) deltaImpulse is applied to both bodies
+	body2.internalApplyImpulse(c.m_contactNormal2*body2.internalGetInvMass(), c.m_angularComponentB, deltaImpulse);
+	return deltaImpulse;
 #ifdef USE_SIMD
 #include <emmintrin.h>
 #define btVecSplat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))
 static inline __m128 btSimdDot3( __m128 vec0, __m128 vec1 )
 	__m128 result = _mm_mul_ps( vec0, vec1);
 	return _mm_add_ps( btVecSplat( result, 0 ), _mm_add_ps( btVecSplat( result, 1 ), btVecSplat( result, 2 ) ) );
+#if defined (BT_ALLOW_SSE4)
+#include <intrin.h>
+#define USE_FMA					1
+#define USE_FMA3_INSTEAD_FMA4	1
+#define USE_SSE4_DOT			1
+#define SSE4_DP(a, b)			_mm_dp_ps(a, b, 0x7f)
+#define SSE4_DP_FP(a, b)		_mm_cvtss_f32(_mm_dp_ps(a, b, 0x7f))
+#define DOT_PRODUCT(a, b)		SSE4_DP(a, b)
+#define DOT_PRODUCT(a, b)		btSimdDot3(a, b)
+#if USE_FMA
+// a*b + c
+#define FMADD(a, b, c)		_mm_fmadd_ps(a, b, c)
+// -(a*b) + c
+#define FMNADD(a, b, c)		_mm_fnmadd_ps(a, b, c)
+#else // USE_FMA3
+// a*b + c
+#define FMADD(a, b, c)		_mm_macc_ps(a, b, c)
+// -(a*b) + c
+#define FMNADD(a, b, c)		_mm_nmacc_ps(a, b, c)
+#else // USE_FMA
+// c + a*b
+#define FMADD(a, b, c)		_mm_add_ps(c, _mm_mul_ps(a, b))
+// c - a*b
+#define FMNADD(a, b, c)		_mm_sub_ps(c, _mm_mul_ps(a, b))
 // Project Gauss Seidel or the equivalent Sequential Impulse
-void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGenericSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
+static btSimdScalar gResolveSingleConstraintRowGeneric_sse2(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c) // SSE2 row solve with lower and upper clamping; every scalar is broadcast to all four lanes, and the applied impulse change is returned.
-#ifdef USE_SIMD
 	__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
 	__m128	lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
 	__m128	upperLimit1 = _mm_set1_ps(c.m_upperLimit);
-	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm)));
-	__m128 deltaVel1Dotn	=	_mm_add_ps(btSimdDot3(c.m_contactNormal.mVec128,body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetDeltaAngularVelocity().mVec128));
-	__m128 deltaVel2Dotn	=	_mm_sub_ps(btSimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetDeltaAngularVelocity().mVec128),btSimdDot3((c.m_contactNormal).mVec128,body2.internalGetDeltaLinearVelocity().mVec128));
-	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
-	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
-	btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
-	btSimdScalar resultLowerLess,resultUpperLess;
-	resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1);
-	resultUpperLess = _mm_cmplt_ps(sum,upperLimit1);
-	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
-	deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
-	c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
-	__m128 upperMinApplied = _mm_sub_ps(upperLimit1,cpAppliedImp);
-	deltaImpulse = _mm_or_ps( _mm_and_ps(resultUpperLess, deltaImpulse), _mm_andnot_ps(resultUpperLess, upperMinApplied) );
-	c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultUpperLess, c.m_appliedImpulse), _mm_andnot_ps(resultUpperLess, upperLimit1) );
-	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,body1.internalGetInvMass().mVec128);
-	__m128	linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128,body2.internalGetInvMass().mVec128);
+	btSimdScalar deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse), _mm_set1_ps(c.m_cfm))); // rhs - appliedImpulse*cfm in every lane
+	__m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128)); // body1's delta velocity (linear + angular) projected onto this row
+	__m128 deltaVel2Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128)); // body2 uses its own m_contactNormal2 and the terms are added (the removed code subtracted a shared m_contactNormal term)
+	deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
+	deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
+	btSimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse); // candidate new accumulated impulse
+	btSimdScalar resultLowerLess, resultUpperLess;
+	resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1); // lane mask: sum < lower limit
+	resultUpperLess = _mm_cmplt_ps(sum, upperLimit1); // lane mask: sum < upper limit (i.e. upper limit NOT exceeded)
+	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp);
+	deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse)); // and/andnot/or select: lowMinApplied where sum < lower, else keep deltaImpulse
+	c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum)); // lower limit where sum < lower, else sum
+	__m128 upperMinApplied = _mm_sub_ps(upperLimit1, cpAppliedImp);
+	deltaImpulse = _mm_or_ps(_mm_and_ps(resultUpperLess, deltaImpulse), _mm_andnot_ps(resultUpperLess, upperMinApplied)); // keep deltaImpulse where sum < upper, else upperMinApplied
+	c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultUpperLess, c.m_appliedImpulse), _mm_andnot_ps(resultUpperLess, upperLimit1)); // keep the value chosen above where sum < upper, else the upper limit
+	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128);
+	__m128	linearComponentB = _mm_mul_ps((c.m_contactNormal2).mVec128, body2.internalGetInvMass().mVec128);
 	__m128 impulseMagnitude = deltaImpulse;
-	body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
-	body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
-	body2.internalGetDeltaLinearVelocity().mVec128 = _mm_sub_ps(body2.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
-	body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
-	resolveSingleConstraintRowGeneric(body1,body2,c);
+	body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude)); // the delta velocities of both bodies are updated in place
+	body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude));
+	body2.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude)); // added, not subtracted as in the removed code; NOTE(review): presumably m_contactNormal2 already carries the opposite sign - confirm
+	body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude));
+	return deltaImpulse;
-// Project Gauss Seidel or the equivalent Sequential Impulse
- void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGeneric(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
+// Enhanced version of gResolveSingleConstraintRowGeneric_sse2 with SSE4.1 and FMA3: same clamped row solve, written with the DOT_PRODUCT / FMADD / FMNADD macros and _mm_blendv_ps selects. Returns the applied impulse change.
+static btSimdScalar gResolveSingleConstraintRowGeneric_sse4_1_fma3(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c)
-	btScalar deltaImpulse = c.m_rhs-btScalar(c.m_appliedImpulse)*c.m_cfm;
-	const btScalar deltaVel1Dotn	=	c.m_contactNormal.dot(body1.internalGetDeltaLinearVelocity()) 	+ c.m_relpos1CrossNormal.dot(body1.internalGetDeltaAngularVelocity());
-	const btScalar deltaVel2Dotn	=	-c.m_contactNormal.dot(body2.internalGetDeltaLinearVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetDeltaAngularVelocity());
+#if defined (BT_ALLOW_SSE4)
+	__m128 tmp					= _mm_set_ps1(c.m_jacDiagABInv); // tmp first holds jacDiagABInv in every lane; it is reused for the sum further down
+	__m128 deltaImpulse			= _mm_set_ps1(c.m_rhs - btScalar(c.m_appliedImpulse)*c.m_cfm); // rhs - appliedImpulse*cfm, computed in scalar and then broadcast
+	const __m128 lowerLimit		= _mm_set_ps1(c.m_lowerLimit);
+	const __m128 upperLimit		= _mm_set_ps1(c.m_upperLimit);
+	const __m128 deltaVel1Dotn	= _mm_add_ps(DOT_PRODUCT(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128)); // body1's delta velocity (linear + angular) projected onto this row
+	const __m128 deltaVel2Dotn	= _mm_add_ps(DOT_PRODUCT(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128)); // same projection for body2
+	deltaImpulse				= FMNADD(deltaVel1Dotn, tmp, deltaImpulse); // deltaImpulse -= deltaVel1Dotn * jacDiagABInv
+	deltaImpulse				= FMNADD(deltaVel2Dotn, tmp, deltaImpulse); // deltaImpulse -= deltaVel2Dotn * jacDiagABInv
+	tmp							= _mm_add_ps(c.m_appliedImpulse, deltaImpulse); // sum
+	const __m128 maskLower		= _mm_cmpgt_ps(tmp, lowerLimit); // lane mask: sum > lower limit (lower clamp NOT needed)
+	const __m128 maskUpper		= _mm_cmpgt_ps(upperLimit, tmp); // lane mask: upper limit > sum (upper clamp NOT needed)
+	deltaImpulse				= _mm_blendv_ps(_mm_sub_ps(lowerLimit, c.m_appliedImpulse), _mm_blendv_ps(_mm_sub_ps(upperLimit, c.m_appliedImpulse), deltaImpulse, maskUpper), maskLower); // blendv takes its 2nd operand where the mask is set: lower-applied if sum is not above lower, else upper-applied if sum is not below upper, else deltaImpulse unchanged
+	c.m_appliedImpulse			= _mm_blendv_ps(lowerLimit, _mm_blendv_ps(upperLimit, tmp, maskUpper), maskLower); // same selection for the accumulated impulse: lower limit, upper limit, or sum
+	body1.internalGetDeltaLinearVelocity().mVec128	= FMADD(_mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128), deltaImpulse, body1.internalGetDeltaLinearVelocity().mVec128); // the delta velocities of both bodies are updated in place
+	body1.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentA.mVec128, deltaImpulse, body1.internalGetDeltaAngularVelocity().mVec128);
+	body2.internalGetDeltaLinearVelocity().mVec128	= FMADD(_mm_mul_ps(c.m_contactNormal2.mVec128, body2.internalGetInvMass().mVec128), deltaImpulse, body2.internalGetDeltaLinearVelocity().mVec128);
+	body2.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentB.mVec128, deltaImpulse, body2.internalGetDeltaAngularVelocity().mVec128);
+	return deltaImpulse;
+	return gResolveSingleConstraintRowGeneric_sse2(body1,body2,c); // NOTE(review): presumably the fallback branch used when BT_ALLOW_SSE4 is not defined; the separating #else/#endif are not visible here - confirm
-//	const btScalar delta_rel_vel	=	deltaVel1Dotn-deltaVel2Dotn;
-	deltaImpulse	-=	deltaVel1Dotn*c.m_jacDiagABInv;
-	deltaImpulse	-=	deltaVel2Dotn*c.m_jacDiagABInv;
-	const btScalar sum = btScalar(c.m_appliedImpulse) + deltaImpulse;
-	if (sum < c.m_lowerLimit)
-	{
-		deltaImpulse = c.m_lowerLimit-c.m_appliedImpulse;
-		c.m_appliedImpulse = c.m_lowerLimit;
-	}
-	else if (sum > c.m_upperLimit) 
-	{
-		deltaImpulse = c.m_upperLimit-c.m_appliedImpulse;
-		c.m_appliedImpulse = c.m_upperLimit;
-	}
-	else
-	{
-		c.m_appliedImpulse = sum;
-	}
-		body1.internalApplyImpulse(c.m_contactNormal*body1.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
-		body2.internalApplyImpulse(-c.m_contactNormal*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
- void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
+static btSimdScalar gResolveSingleConstraintRowLowerLimit_sse2(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c)
-#ifdef USE_SIMD
 	__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
 	__m128	lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
 	__m128	upperLimit1 = _mm_set1_ps(c.m_upperLimit);
-	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm)));
-	__m128 deltaVel1Dotn	=	_mm_add_ps(btSimdDot3(c.m_contactNormal.mVec128,body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetDeltaAngularVelocity().mVec128));
-	__m128 deltaVel2Dotn	=	_mm_sub_ps(btSimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetDeltaAngularVelocity().mVec128),btSimdDot3((c.m_contactNormal).mVec128,body2.internalGetDeltaLinearVelocity().mVec128));
-	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
-	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
-	btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
-	btSimdScalar resultLowerLess,resultUpperLess;
-	resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1);
-	resultUpperLess = _mm_cmplt_ps(sum,upperLimit1);
-	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
-	deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
-	c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
-	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,body1.internalGetInvMass().mVec128);
-	__m128	linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128,body2.internalGetInvMass().mVec128);
+	btSimdScalar deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse), _mm_set1_ps(c.m_cfm)));
+	__m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128));
+	__m128 deltaVel2Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128));
+	deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
+	deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
+	btSimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse);
+	btSimdScalar resultLowerLess, resultUpperLess;
+	resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1);
+	resultUpperLess = _mm_cmplt_ps(sum, upperLimit1);
+	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp);
+	deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse));
+	c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum));
+	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128);
+	__m128	linearComponentB = _mm_mul_ps(c.m_contactNormal2.mVec128, body2.internalGetInvMass().mVec128);
 	__m128 impulseMagnitude = deltaImpulse;
-	body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
-	body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
-	body2.internalGetDeltaLinearVelocity().mVec128 = _mm_sub_ps(body2.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
-	body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
+	body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude));
+	body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude));
+	body2.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude));
+	body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude));
+	return deltaImpulse;
+// Enhanced version of gResolveSingleConstraintRowLowerLimit_sse2 with SSE4.1 and FMA3
+static btSimdScalar gResolveSingleConstraintRowLowerLimit_sse4_1_fma3(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c) // Solves one constraint row, clamping the accumulated impulse against c.m_lowerLimit only; updates both bodies' delta velocities and returns the delta impulse that was applied.
+#ifdef BT_ALLOW_SSE4
+	__m128 tmp					= _mm_set_ps1(c.m_jacDiagABInv); // broadcast m_jacDiagABInv to all four lanes
+	__m128 deltaImpulse			= _mm_set_ps1(c.m_rhs - btScalar(c.m_appliedImpulse)*c.m_cfm); // starting value: rhs - appliedImpulse*cfm
+	const __m128 lowerLimit		= _mm_set_ps1(c.m_lowerLimit);
+	const __m128 deltaVel1Dotn	= _mm_add_ps(DOT_PRODUCT(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128)); // body1: linear + angular delta velocity along the row (DOT_PRODUCT is a macro defined outside this hunk)
+	const __m128 deltaVel2Dotn	= _mm_add_ps(DOT_PRODUCT(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128)); // body2: same, using its own normal m_contactNormal2
+	deltaImpulse				= FMNADD(deltaVel1Dotn, tmp, deltaImpulse); // presumably deltaImpulse - deltaVel1Dotn*tmp, mirroring the SSE2 routine; FMNADD is defined elsewhere -- confirm
+	deltaImpulse				= FMNADD(deltaVel2Dotn, tmp, deltaImpulse);
+	tmp							= _mm_add_ps(c.m_appliedImpulse, deltaImpulse); // tmp is reused: now the candidate accumulated impulse
+	const __m128 mask			= _mm_cmpgt_ps(tmp, lowerLimit); // lanes set where the candidate sum is above the lower limit
+	deltaImpulse				= _mm_blendv_ps(_mm_sub_ps(lowerLimit, c.m_appliedImpulse), deltaImpulse, mask); // keep deltaImpulse where mask is set, otherwise use lowerLimit - appliedImpulse
+	c.m_appliedImpulse			= _mm_blendv_ps(lowerLimit, tmp, mask); // accumulated impulse becomes the sum, or lowerLimit when clamped
+	body1.internalGetDeltaLinearVelocity().mVec128	= FMADD(_mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128), deltaImpulse, body1.internalGetDeltaLinearVelocity().mVec128); // presumably a*b+c accumulation; FMADD is defined elsewhere -- confirm
+	body1.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentA.mVec128, deltaImpulse, body1.internalGetDeltaAngularVelocity().mVec128);
+	body2.internalGetDeltaLinearVelocity().mVec128	= FMADD(_mm_mul_ps(c.m_contactNormal2.mVec128, body2.internalGetInvMass().mVec128), deltaImpulse, body2.internalGetDeltaLinearVelocity().mVec128);
+	body2.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentB.mVec128, deltaImpulse, body2.internalGetDeltaAngularVelocity().mVec128);
+	return deltaImpulse;
+	return gResolveSingleConstraintRowLowerLimit_sse2(body1,body2,c); // fallback to the SSE2 routine; presumably the branch used when BT_ALLOW_SSE4 is not defined (the #else is not visible in this hunk) -- confirm
+#endif //BT_ALLOW_SSE4
+#endif //USE_SIMD
+btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGenericSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c)
+#ifdef USE_SIMD
+	return m_resolveSingleConstraintRowGeneric(body1, body2, c);
-	resolveSingleConstraintRowLowerLimit(body1,body2,c);
+	return resolveSingleConstraintRowGeneric(body1,body2,c);
 // Project Gauss Seidel or the equivalent Sequential Impulse
- void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimit(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
+btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGeneric(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c)
-	btScalar deltaImpulse = c.m_rhs-btScalar(c.m_appliedImpulse)*c.m_cfm;
-	const btScalar deltaVel1Dotn	=	c.m_contactNormal.dot(body1.internalGetDeltaLinearVelocity()) 	+ c.m_relpos1CrossNormal.dot(body1.internalGetDeltaAngularVelocity());
-	const btScalar deltaVel2Dotn	=	-c.m_contactNormal.dot(body2.internalGetDeltaLinearVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetDeltaAngularVelocity());
+	return gResolveSingleConstraintRowGeneric_scalar_reference(body1, body2, c);
-	deltaImpulse	-=	deltaVel1Dotn*c.m_jacDiagABInv;
-	deltaImpulse	-=	deltaVel2Dotn*c.m_jacDiagABInv;
-	const btScalar sum = btScalar(c.m_appliedImpulse) + deltaImpulse;
-	if (sum < c.m_lowerLimit)
-	{
-		deltaImpulse = c.m_lowerLimit-c.m_appliedImpulse;
-		c.m_appliedImpulse = c.m_lowerLimit;
-	}
-	else
-	{
-		c.m_appliedImpulse = sum;
-	}
-	body1.internalApplyImpulse(c.m_contactNormal*body1.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
-	body2.internalApplyImpulse(-c.m_contactNormal*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
+btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c) // Lower-limit row solve; forwards to the routine stored in m_resolveSingleConstraintRowLowerLimit.
+#ifdef USE_SIMD
+	return m_resolveSingleConstraintRowLowerLimit(body1, body2, c); // function pointer assigned in the constructor
+	return resolveSingleConstraintRowLowerLimit(body1,body2,c); // presumably the non-SIMD branch (its #else is not visible in this hunk) -- confirm
+btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimit(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c) // Lower-limit row solve that always uses the scalar reference routine.
+	return gResolveSingleConstraintRowLowerLimit_scalar_reference(body1,body2,c);
 void	btSequentialImpulseConstraintSolver::resolveSplitPenetrationImpulseCacheFriendly(
-        btRigidBody& body1,
-        btRigidBody& body2,
+        btSolverBody& body1,
+        btSolverBody& body2,
         const btSolverConstraint& c)
 		if (c.m_rhsPenetration)
 			btScalar deltaImpulse = c.m_rhsPenetration-btScalar(c.m_appliedPushImpulse)*c.m_cfm;
-			const btScalar deltaVel1Dotn	=	c.m_contactNormal.dot(body1.internalGetPushVelocity()) 	+ c.m_relpos1CrossNormal.dot(body1.internalGetTurnVelocity());
-			const btScalar deltaVel2Dotn	=	-c.m_contactNormal.dot(body2.internalGetPushVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetTurnVelocity());
+			const btScalar deltaVel1Dotn	=	c.m_contactNormal1.dot(body1.internalGetPushVelocity()) 	+ c.m_relpos1CrossNormal.dot(body1.internalGetTurnVelocity());
+			const btScalar deltaVel2Dotn	=	c.m_contactNormal2.dot(body2.internalGetPushVelocity())		+ c.m_relpos2CrossNormal.dot(body2.internalGetTurnVelocity());
 			deltaImpulse	-=	deltaVel1Dotn*c.m_jacDiagABInv;
 			deltaImpulse	-=	deltaVel2Dotn*c.m_jacDiagABInv;
@@ -198,12 +322,12 @@ void	btSequentialImpulseConstraintSolver::resolveSplitPenetrationImpulseCacheFri
 				c.m_appliedPushImpulse = sum;
-			body1.internalApplyPushImpulse(c.m_contactNormal*body1.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
-			body2.internalApplyPushImpulse(-c.m_contactNormal*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
+			body1.internalApplyPushImpulse(c.m_contactNormal1*body1.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
+			body2.internalApplyPushImpulse(c.m_contactNormal2*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
- void btSequentialImpulseConstraintSolver::resolveSplitPenetrationSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
+ void btSequentialImpulseConstraintSolver::resolveSplitPenetrationSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c)
 #ifdef USE_SIMD
 	if (!c.m_rhsPenetration)
@@ -215,8 +339,8 @@ void	btSequentialImpulseConstraintSolver::resolveSplitPenetrationImpulseCacheFri
 	__m128	lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
 	__m128	upperLimit1 = _mm_set1_ps(c.m_upperLimit);
 	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhsPenetration), _mm_mul_ps(_mm_set1_ps(c.m_appliedPushImpulse),_mm_set1_ps(c.m_cfm)));
-	__m128 deltaVel1Dotn	=	_mm_add_ps(btSimdDot3(c.m_contactNormal.mVec128,body1.internalGetPushVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetTurnVelocity().mVec128));
-	__m128 deltaVel2Dotn	=	_mm_sub_ps(btSimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetTurnVelocity().mVec128),btSimdDot3((c.m_contactNormal).mVec128,body2.internalGetPushVelocity().mVec128));
+	__m128 deltaVel1Dotn	=	_mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128,body1.internalGetPushVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetTurnVelocity().mVec128));
+	__m128 deltaVel2Dotn	=	_mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128,body2.internalGetPushVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetTurnVelocity().mVec128));
 	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
 	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
 	btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
@@ -226,12 +350,12 @@ void	btSequentialImpulseConstraintSolver::resolveSplitPenetrationImpulseCacheFri
 	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
 	deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
 	c.m_appliedPushImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
-	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,body1.internalGetInvMass().mVec128);
-	__m128	linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128,body2.internalGetInvMass().mVec128);
+	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128,body1.internalGetInvMass().mVec128);
+	__m128	linearComponentB = _mm_mul_ps(c.m_contactNormal2.mVec128,body2.internalGetInvMass().mVec128);
 	__m128 impulseMagnitude = deltaImpulse;
 	body1.internalGetPushVelocity().mVec128 = _mm_add_ps(body1.internalGetPushVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
 	body1.internalGetTurnVelocity().mVec128 = _mm_add_ps(body1.internalGetTurnVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
-	body2.internalGetPushVelocity().mVec128 = _mm_sub_ps(body2.internalGetPushVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
+	body2.internalGetPushVelocity().mVec128 = _mm_add_ps(body2.internalGetPushVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
 	body2.internalGetTurnVelocity().mVec128 = _mm_add_ps(body2.internalGetTurnVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
@@ -239,6 +363,63 @@ void	btSequentialImpulseConstraintSolver::resolveSplitPenetrationImpulseCacheFri
+ btSequentialImpulseConstraintSolver::btSequentialImpulseConstraintSolver() // Constructor: selects which constraint-row solver routines this instance will call.
+	 : m_resolveSingleConstraintRowGeneric(gResolveSingleConstraintRowGeneric_scalar_reference), // default: scalar reference routine
+	 m_resolveSingleConstraintRowLowerLimit(gResolveSingleConstraintRowLowerLimit_scalar_reference), // default: scalar reference routine
+	 m_btSeed2(0)
+ {
+#ifdef USE_SIMD
+	 m_resolveSingleConstraintRowGeneric = gResolveSingleConstraintRowGeneric_sse2; // SIMD builds replace the scalar defaults with the SSE2 routines
+	 m_resolveSingleConstraintRowLowerLimit=gResolveSingleConstraintRowLowerLimit_sse2;
+#endif //USE_SIMD
+#ifdef BT_ALLOW_SSE4
+	 int cpuFeatures = btCpuFeatureUtility::getCpuFeatures(); // feature bit mask, queried when the solver is constructed
+	 if ((cpuFeatures & btCpuFeatureUtility::CPU_FEATURE_FMA3) && (cpuFeatures & btCpuFeatureUtility::CPU_FEATURE_SSE4_1)) // both the FMA3 and the SSE4.1 bits must be set
+	 {
+		m_resolveSingleConstraintRowGeneric = gResolveSingleConstraintRowGeneric_sse4_1_fma3; // switch to the SSE4.1/FMA3 routines
+		m_resolveSingleConstraintRowLowerLimit = gResolveSingleConstraintRowLowerLimit_sse4_1_fma3;
+	 }
+ }
+ btSequentialImpulseConstraintSolver::~btSequentialImpulseConstraintSolver() // Destructor with an empty body.
+ {
+ }
+ btSingleConstraintRowSolver	btSequentialImpulseConstraintSolver::getScalarConstraintRowSolverGeneric() // Returns the scalar reference routine for generic rows.
+ {
+	 return gResolveSingleConstraintRowGeneric_scalar_reference;
+ }
+ btSingleConstraintRowSolver	btSequentialImpulseConstraintSolver::getScalarConstraintRowSolverLowerLimit() // Returns the scalar reference routine for lower-limit rows.
+ {
+	 return gResolveSingleConstraintRowLowerLimit_scalar_reference;
+ }
+#ifdef USE_SIMD
+ btSingleConstraintRowSolver	btSequentialImpulseConstraintSolver::getSSE2ConstraintRowSolverGeneric() // Returns the SSE2 routine for generic rows; only compiled when USE_SIMD is defined.
+ {
+	 return gResolveSingleConstraintRowGeneric_sse2;
+ }
+ btSingleConstraintRowSolver	btSequentialImpulseConstraintSolver::getSSE2ConstraintRowSolverLowerLimit() // Returns the SSE2 routine for lower-limit rows; only compiled when USE_SIMD is defined.
+ {
+	 return gResolveSingleConstraintRowLowerLimit_sse2;
+ }
+#ifdef BT_ALLOW_SSE4
+ btSingleConstraintRowSolver	btSequentialImpulseConstraintSolver::getSSE4_1ConstraintRowSolverGeneric() // Returns the SSE4.1/FMA3 routine for generic rows; only compiled when USE_SIMD and BT_ALLOW_SSE4 are both defined.
+ {
+	 return gResolveSingleConstraintRowGeneric_sse4_1_fma3;
+ }
+ btSingleConstraintRowSolver	btSequentialImpulseConstraintSolver::getSSE4_1ConstraintRowSolverLowerLimit() // Returns the SSE4.1/FMA3 routine for lower-limit rows; only compiled when USE_SIMD and BT_ALLOW_SSE4 are both defined.
+ {
+	 return gResolveSingleConstraintRowLowerLimit_sse4_1_fma3;
+ }
+#endif //BT_ALLOW_SSE4
+#endif //USE_SIMD
 unsigned long btSequentialImpulseConstraintSolver::btRand2()
@@ -277,9 +458,10 @@ int btSequentialImpulseConstraintSolver::btRandInt2 (int n)
-#if 0
-void	btSequentialImpulseConstraintSolver::initSolverBody(btSolverBody* solverBody, btCollisionObject* collisionObject)
+void	btSequentialImpulseConstraintSolver::initSolverBody(btSolverBody* solverBody, btCollisionObject* collisionObject, btScalar timeStep)
 	btRigidBody* rb = collisionObject? btRigidBody::upcast(collisionObject) : 0;
@@ -289,17 +471,32 @@ void	btSequentialImpulseConstraintSolver::initSolverBody(btSolverBody* solverBod
 	if (rb)
-		solverBody->internalGetInvMass() = btVector3(rb->getInvMass(),rb->getInvMass(),rb->getInvMass())*rb->getLinearFactor();
+		solverBody->m_worldTransform = rb->getWorldTransform();
+		solverBody->internalSetInvMass(btVector3(rb->getInvMass(),rb->getInvMass(),rb->getInvMass())*rb->getLinearFactor());
 		solverBody->m_originalBody = rb;
 		solverBody->m_angularFactor = rb->getAngularFactor();
+		solverBody->m_linearFactor = rb->getLinearFactor();
+		solverBody->m_linearVelocity = rb->getLinearVelocity();
+		solverBody->m_angularVelocity = rb->getAngularVelocity();
+		solverBody->m_externalForceImpulse = rb->getTotalForce()*rb->getInvMass()*timeStep;
+		solverBody->m_externalTorqueImpulse = rb->getTotalTorque()*rb->getInvInertiaTensorWorld()*timeStep ;
 	} else
-		solverBody->internalGetInvMass().setValue(0,0,0);
+		solverBody->m_worldTransform.setIdentity();
+		solverBody->internalSetInvMass(btVector3(0,0,0));
 		solverBody->m_originalBody = 0;
+		solverBody->m_linearFactor.setValue(1,1,1);
+		solverBody->m_linearVelocity.setValue(0,0,0);
+		solverBody->m_angularVelocity.setValue(0,0,0);
+		solverBody->m_externalForceImpulse.setValue(0,0,0);
+		solverBody->m_externalTorqueImpulse.setValue(0,0,0);
@@ -313,10 +510,11 @@ btScalar btSequentialImpulseConstraintSolver::restitutionCurve(btScalar rel_vel,
-void	applyAnisotropicFriction(btCollisionObject* colObj,btVector3& frictionDirection);
-void	applyAnisotropicFriction(btCollisionObject* colObj,btVector3& frictionDirection)
+void	btSequentialImpulseConstraintSolver::applyAnisotropicFriction(btCollisionObject* colObj,btVector3& frictionDirection, int frictionMode)
-	if (colObj && colObj->hasAnisotropicFriction())
+	if (colObj && colObj->hasAnisotropicFriction(frictionMode))
 		// transform to local coordinates
 		btVector3 loc_lateral = frictionDirection * colObj->getWorldTransform().getBasis();
@@ -326,20 +524,24 @@ void	applyAnisotropicFriction(btCollisionObject* colObj,btVector3& frictionDirec
 		// ... and transform it back to global coordinates
 		frictionDirection = colObj->getWorldTransform().getBasis() * loc_lateral;
-void btSequentialImpulseConstraintSolver::setupFrictionConstraint(btSolverConstraint& solverConstraint, const btVector3& normalAxis,btRigidBody* solverBodyA,btRigidBody* solverBodyB,btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, btScalar desiredVelocity, btScalar cfmSlip)
+void btSequentialImpulseConstraintSolver::setupFrictionConstraint(btSolverConstraint& solverConstraint, const btVector3& normalAxis,int  solverBodyIdA,int solverBodyIdB,btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, btScalar desiredVelocity, btScalar cfmSlip)
-	btRigidBody* body0=btRigidBody::upcast(colObj0);
-	btRigidBody* body1=btRigidBody::upcast(colObj1);
+	btSolverBody& solverBodyA = m_tmpSolverBodyPool[solverBodyIdA];
+	btSolverBody& solverBodyB = m_tmpSolverBodyPool[solverBodyIdB];
-	solverConstraint.m_contactNormal = normalAxis;
+	btRigidBody* body0 = m_tmpSolverBodyPool[solverBodyIdA].m_originalBody;
+	btRigidBody* body1 = m_tmpSolverBodyPool[solverBodyIdB].m_originalBody;
-	solverConstraint.m_solverBodyA = body0 ? body0 : &getFixedBody();
-	solverConstraint.m_solverBodyB = body1 ? body1 : &getFixedBody();
+	solverConstraint.m_solverBodyIdA = solverBodyIdA;
+	solverConstraint.m_solverBodyIdB = solverBodyIdB;
 	solverConstraint.m_friction = cp.m_combinedFriction;
 	solverConstraint.m_originalContactPoint = 0;
@@ -347,56 +549,139 @@ void btSequentialImpulseConstraintSolver::setupFrictionConstraint(btSolverConstr
 	solverConstraint.m_appliedImpulse = 0.f;
 	solverConstraint.m_appliedPushImpulse = 0.f;
+	if (body0)
-		btVector3 ftorqueAxis1 = rel_pos1.cross(solverConstraint.m_contactNormal);
+		solverConstraint.m_contactNormal1 = normalAxis;
+		btVector3 ftorqueAxis1 = rel_pos1.cross(solverConstraint.m_contactNormal1);
 		solverConstraint.m_relpos1CrossNormal = ftorqueAxis1;
-		solverConstraint.m_angularComponentA = body0 ? body0->getInvInertiaTensorWorld()*ftorqueAxis1*body0->getAngularFactor() : btVector3(0,0,0);
+		solverConstraint.m_angularComponentA = body0->getInvInertiaTensorWorld()*ftorqueAxis1*body0->getAngularFactor();
+	}else
+	{
+		solverConstraint.m_contactNormal1.setZero();
+		solverConstraint.m_relpos1CrossNormal.setZero();
+		solverConstraint.m_angularComponentA .setZero();
+	if (body1)
-		btVector3 ftorqueAxis1 = rel_pos2.cross(-solverConstraint.m_contactNormal);
+		solverConstraint.m_contactNormal2 = -normalAxis;
+		btVector3 ftorqueAxis1 = rel_pos2.cross(solverConstraint.m_contactNormal2);
 		solverConstraint.m_relpos2CrossNormal = ftorqueAxis1;
-		solverConstraint.m_angularComponentB = body1 ? body1->getInvInertiaTensorWorld()*ftorqueAxis1*body1->getAngularFactor() : btVector3(0,0,0);
+		solverConstraint.m_angularComponentB = body1->getInvInertiaTensorWorld()*ftorqueAxis1*body1->getAngularFactor();
+	} else
+	{
+		solverConstraint.m_contactNormal2.setZero();
+		solverConstraint.m_relpos2CrossNormal.setZero();
+		solverConstraint.m_angularComponentB.setZero();
-	btScalar denom0 = rb0->computeImpulseDenominator(pos1,solverConstraint.m_contactNormal);
-	btScalar denom1 = rb1->computeImpulseDenominator(pos2,solverConstraint.m_contactNormal);
-	btVector3 vec;
-	btScalar denom0 = 0.f;
-	btScalar denom1 = 0.f;
-	if (body0)
-		vec = ( solverConstraint.m_angularComponentA).cross(rel_pos1);
-		denom0 = body0->getInvMass() + normalAxis.dot(vec);
+		btVector3 vec;
+		btScalar denom0 = 0.f;
+		btScalar denom1 = 0.f;
+		if (body0)
+		{
+			vec = ( solverConstraint.m_angularComponentA).cross(rel_pos1);
+			denom0 = body0->getInvMass() + normalAxis.dot(vec);
+		}
+		if (body1)
+		{
+			vec = ( -solverConstraint.m_angularComponentB).cross(rel_pos2);
+			denom1 = body1->getInvMass() + normalAxis.dot(vec);
+		}
+		btScalar denom = relaxation/(denom0+denom1);
+		solverConstraint.m_jacDiagABInv = denom;
-	if (body1)
-		vec = ( -solverConstraint.m_angularComponentB).cross(rel_pos2);
-		denom1 = body1->getInvMass() + normalAxis.dot(vec);
+		btScalar rel_vel;
+		btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(body0?solverBodyA.m_linearVelocity+solverBodyA.m_externalForceImpulse:btVector3(0,0,0))
+			+ solverConstraint.m_relpos1CrossNormal.dot(body0?solverBodyA.m_angularVelocity:btVector3(0,0,0));
+		btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(body1?solverBodyB.m_linearVelocity+solverBodyB.m_externalForceImpulse:btVector3(0,0,0))
+			+ solverConstraint.m_relpos2CrossNormal.dot(body1?solverBodyB.m_angularVelocity:btVector3(0,0,0));
+		rel_vel = vel1Dotn+vel2Dotn;
+//		btScalar positionalError = 0.f;
+		btScalar velocityError =  desiredVelocity - rel_vel;
+		btScalar velocityImpulse = velocityError * solverConstraint.m_jacDiagABInv;
+		solverConstraint.m_rhs = velocityImpulse;
+		solverConstraint.m_rhsPenetration = 0.f;
+		solverConstraint.m_cfm = cfmSlip;
+		solverConstraint.m_lowerLimit = -solverConstraint.m_friction;
+		solverConstraint.m_upperLimit = solverConstraint.m_friction;
+btSolverConstraint&	btSequentialImpulseConstraintSolver::addFrictionConstraint(const btVector3& normalAxis,int solverBodyIdA,int solverBodyIdB,int frictionIndex,btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, btScalar desiredVelocity, btScalar cfmSlip) // Appends one friction row to m_tmpSolverContactFrictionConstraintPool, fills it via setupFrictionConstraint and returns a reference to it.
+	btSolverConstraint& solverConstraint = m_tmpSolverContactFrictionConstraintPool.expandNonInitializing(); // new pool slot; going by the method name it is presumably left uninitialized, so the fields are set below -- confirm
+	solverConstraint.m_frictionIndex = frictionIndex; // stored as passed in by the caller
+	setupFrictionConstraint(solverConstraint, normalAxis, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2,
+							colObj0, colObj1, relaxation, desiredVelocity, cfmSlip); // all remaining arguments are forwarded unchanged
+	return solverConstraint;
-	btScalar denom = relaxation/(denom0+denom1);
-	solverConstraint.m_jacDiagABInv = denom;
-	solverConstraint.m_jac =  btJacobianEntry (
-		rel_pos1,rel_pos2,solverConstraint.m_contactNormal,
-		body0->getInvInertiaDiagLocal(),
-		body0->getInvMass(),
-		body1->getInvInertiaDiagLocal(),
-		body1->getInvMass());
-#endif //_USE_JACOBIAN
+void btSequentialImpulseConstraintSolver::setupRollingFrictionConstraint(	btSolverConstraint& solverConstraint, const btVector3& normalAxis1,int solverBodyIdA,int  solverBodyIdB,
+									btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,
+									btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation,
+									btScalar desiredVelocity, btScalar cfmSlip)
+	btVector3 normalAxis(0,0,0);
+	solverConstraint.m_contactNormal1 = normalAxis;
+	solverConstraint.m_contactNormal2 = -normalAxis;
+	btSolverBody& solverBodyA = m_tmpSolverBodyPool[solverBodyIdA];
+	btSolverBody& solverBodyB = m_tmpSolverBodyPool[solverBodyIdB];
+	btRigidBody* body0 = m_tmpSolverBodyPool[solverBodyIdA].m_originalBody;
+	btRigidBody* body1 = m_tmpSolverBodyPool[solverBodyIdB].m_originalBody;
+	solverConstraint.m_solverBodyIdA = solverBodyIdA;
+	solverConstraint.m_solverBodyIdB = solverBodyIdB;
+	solverConstraint.m_friction = cp.m_combinedRollingFriction;
+	solverConstraint.m_originalContactPoint = 0;
+	solverConstraint.m_appliedImpulse = 0.f;
+	solverConstraint.m_appliedPushImpulse = 0.f;
+	{
+		btVector3 ftorqueAxis1 = -normalAxis1;
+		solverConstraint.m_relpos1CrossNormal = ftorqueAxis1;
+		solverConstraint.m_angularComponentA = body0 ? body0->getInvInertiaTensorWorld()*ftorqueAxis1*body0->getAngularFactor() : btVector3(0,0,0);
+	}
+	{
+		btVector3 ftorqueAxis1 = normalAxis1;
+		solverConstraint.m_relpos2CrossNormal = ftorqueAxis1;
+		solverConstraint.m_angularComponentB = body1 ? body1->getInvInertiaTensorWorld()*ftorqueAxis1*body1->getAngularFactor() : btVector3(0,0,0);
+	}
+	{
+		btVector3 iMJaA = body0?body0->getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal:btVector3(0,0,0);
+		btVector3 iMJaB = body1?body1->getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal:btVector3(0,0,0);
+		btScalar sum = 0;
+		sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
+		sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
+		solverConstraint.m_jacDiagABInv = btScalar(1.)/sum;
+	}
 		btScalar rel_vel;
-		btScalar vel1Dotn = solverConstraint.m_contactNormal.dot(body0?body0->getLinearVelocity():btVector3(0,0,0)) 
-			+ solverConstraint.m_relpos1CrossNormal.dot(body0?body0->getAngularVelocity():btVector3(0,0,0));
-		btScalar vel2Dotn = -solverConstraint.m_contactNormal.dot(body1?body1->getLinearVelocity():btVector3(0,0,0)) 
-			+ solverConstraint.m_relpos2CrossNormal.dot(body1?body1->getAngularVelocity():btVector3(0,0,0));
+		btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(body0?solverBodyA.m_linearVelocity+solverBodyA.m_externalForceImpulse:btVector3(0,0,0))
+			+ solverConstraint.m_relpos1CrossNormal.dot(body0?solverBodyA.m_angularVelocity:btVector3(0,0,0));
+		btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(body1?solverBodyB.m_linearVelocity+solverBodyB.m_externalForceImpulse:btVector3(0,0,0))
+			+ solverConstraint.m_relpos2CrossNormal.dot(body1?solverBodyB.m_angularVelocity:btVector3(0,0,0));
 		rel_vel = vel1Dotn+vel2Dotn;
@@ -406,81 +691,107 @@ void btSequentialImpulseConstraintSolver::setupFrictionConstraint(btSolverConstr
 		btSimdScalar	velocityImpulse = velocityError * btSimdScalar(solverConstraint.m_jacDiagABInv);
 		solverConstraint.m_rhs = velocityImpulse;
 		solverConstraint.m_cfm = cfmSlip;
-		solverConstraint.m_lowerLimit = 0;
-		solverConstraint.m_upperLimit = 1e10f;
+		solverConstraint.m_lowerLimit = -solverConstraint.m_friction;
+		solverConstraint.m_upperLimit = solverConstraint.m_friction;
-btSolverConstraint&	btSequentialImpulseConstraintSolver::addFrictionConstraint(const btVector3& normalAxis,btRigidBody* solverBodyA,btRigidBody* solverBodyB,int frictionIndex,btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, btScalar desiredVelocity, btScalar cfmSlip)
+btSolverConstraint&	btSequentialImpulseConstraintSolver::addRollingFrictionConstraint(const btVector3& normalAxis,int solverBodyIdA,int solverBodyIdB,int frictionIndex,btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, btScalar desiredVelocity, btScalar cfmSlip)
-	btSolverConstraint& solverConstraint = m_tmpSolverContactFrictionConstraintPool.expandNonInitializing();
+	btSolverConstraint& solverConstraint = m_tmpSolverContactRollingFrictionConstraintPool.expandNonInitializing();
 	solverConstraint.m_frictionIndex = frictionIndex;
-	setupFrictionConstraint(solverConstraint, normalAxis, solverBodyA, solverBodyB, cp, rel_pos1, rel_pos2, 
+	setupRollingFrictionConstraint(solverConstraint, normalAxis, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2,
 							colObj0, colObj1, relaxation, desiredVelocity, cfmSlip);
 	return solverConstraint;
-int	btSequentialImpulseConstraintSolver::getOrInitSolverBody(btCollisionObject& body)
+int	btSequentialImpulseConstraintSolver::getOrInitSolverBody(btCollisionObject& body,btScalar timeStep)
-#if 0
 	int solverBodyIdA = -1;
 	if (body.getCompanionId() >= 0)
 		//body has already been converted
 		solverBodyIdA = body.getCompanionId();
+        btAssert(solverBodyIdA < m_tmpSolverBodyPool.size());
 	} else
 		btRigidBody* rb = btRigidBody::upcast(&body);
-		if (rb && rb->getInvMass())
+		//convert both active and kinematic objects (for their velocity)
+		if (rb && (rb->getInvMass() || rb->isKinematicObject()))
 			solverBodyIdA = m_tmpSolverBodyPool.size();
 			btSolverBody& solverBody = m_tmpSolverBodyPool.expand();
-			initSolverBody(&solverBody,&body);
+			initSolverBody(&solverBody,&body,timeStep);
 		} else
-			return 0;//assume first one is a fixed solver body
+			if (m_fixedBodyId<0)
+			{
+				m_fixedBodyId = m_tmpSolverBodyPool.size();
+				btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
+				initSolverBody(&fixedBody,0,timeStep);
+			}
+			return m_fixedBodyId;
+//			return 0;//assume first one is a fixed solver body
 	return solverBodyIdA;
-	return 0;
 #include <stdio.h>
-void btSequentialImpulseConstraintSolver::setupContactConstraint(btSolverConstraint& solverConstraint, 
-																 btCollisionObject* colObj0, btCollisionObject* colObj1,
+void btSequentialImpulseConstraintSolver::setupContactConstraint(btSolverConstraint& solverConstraint,
+																 int solverBodyIdA, int solverBodyIdB,
 																 btManifoldPoint& cp, const btContactSolverInfo& infoGlobal,
-																 btVector3& vel, btScalar& rel_vel, btScalar& relaxation,
-																 btVector3& rel_pos1, btVector3& rel_pos2)
+																 btScalar& relaxation,
+																 const btVector3& rel_pos1, const btVector3& rel_pos2)
-			btRigidBody* rb0 = btRigidBody::upcast(colObj0);
-			btRigidBody* rb1 = btRigidBody::upcast(colObj1);
-			const btVector3& pos1 = cp.getPositionWorldOnA();
-			const btVector3& pos2 = cp.getPositionWorldOnB();
+		//	const btVector3& pos1 = cp.getPositionWorldOnA();
+		//	const btVector3& pos2 = cp.getPositionWorldOnB();
+			btSolverBody* bodyA = &m_tmpSolverBodyPool[solverBodyIdA];
+			btSolverBody* bodyB = &m_tmpSolverBodyPool[solverBodyIdB];
+			btRigidBody* rb0 = bodyA->m_originalBody;
+			btRigidBody* rb1 = bodyB->m_originalBody;
-//			btVector3 rel_pos1 = pos1 - colObj0->getWorldTransform().getOrigin(); 
+//			btVector3 rel_pos1 = pos1 - colObj0->getWorldTransform().getOrigin();
 //			btVector3 rel_pos2 = pos2 - colObj1->getWorldTransform().getOrigin();
-			rel_pos1 = pos1 - colObj0->getWorldTransform().getOrigin(); 
-			rel_pos2 = pos2 - colObj1->getWorldTransform().getOrigin();
+			//rel_pos1 = pos1 - bodyA->getWorldTransform().getOrigin();
+			//rel_pos2 = pos2 - bodyB->getWorldTransform().getOrigin();
+			relaxation = infoGlobal.m_sor;
+			btScalar invTimeStep = btScalar(1)/infoGlobal.m_timeStep;
+			btScalar cfm = (cp.m_contactPointFlags&BT_CONTACT_FLAG_HAS_CONTACT_CFM)?cp.m_contactCFM:infoGlobal.m_globalCfm;
+			cfm *= invTimeStep;
-			relaxation = 1.f;
+			btScalar erp = (cp.m_contactPointFlags&BT_CONTACT_FLAG_HAS_CONTACT_ERP)?cp.m_contactERP:infoGlobal.m_erp2;
 			btVector3 torqueAxis0 = rel_pos1.cross(cp.m_normalWorldOnB);
 			solverConstraint.m_angularComponentA = rb0 ? rb0->getInvInertiaTensorWorld()*torqueAxis0*rb0->getAngularFactor() : btVector3(0,0,0);
-			btVector3 torqueAxis1 = rel_pos2.cross(cp.m_normalWorldOnB);		
+			btVector3 torqueAxis1 = rel_pos2.cross(cp.m_normalWorldOnB);
 			solverConstraint.m_angularComponentB = rb1 ? rb1->getInvInertiaTensorWorld()*-torqueAxis1*rb1->getAngularFactor() : btVector3(0,0,0);
 					btScalar denom0 = rb0->computeImpulseDenominator(pos1,cp.m_normalWorldOnB);
 					btScalar denom1 = rb1->computeImpulseDenominator(pos2,cp.m_normalWorldOnB);
 					btVector3 vec;
 					btScalar denom0 = 0.f;
 					btScalar denom1 = 0.f;
@@ -494,36 +805,49 @@ void btSequentialImpulseConstraintSolver::setupContactConstraint(btSolverConstra
 						vec = ( -solverConstraint.m_angularComponentB).cross(rel_pos2);
 						denom1 = rb1->getInvMass() + cp.m_normalWorldOnB.dot(vec);
-					btScalar denom = relaxation/(denom0+denom1);
+					btScalar denom = relaxation/(denom0+denom1+cfm);
 					solverConstraint.m_jacDiagABInv = denom;
-				solverConstraint.m_contactNormal = cp.m_normalWorldOnB;
-				solverConstraint.m_relpos1CrossNormal = rel_pos1.cross(cp.m_normalWorldOnB);
-				solverConstraint.m_relpos2CrossNormal = rel_pos2.cross(-cp.m_normalWorldOnB);
+				if (rb0)
+				{
+					solverConstraint.m_contactNormal1 = cp.m_normalWorldOnB;
+					solverConstraint.m_relpos1CrossNormal = torqueAxis0;
+				} else
+				{
+					solverConstraint.m_contactNormal1.setZero();
+					solverConstraint.m_relpos1CrossNormal.setZero();
+				}
+				if (rb1)
+				{
+					solverConstraint.m_contactNormal2 = -cp.m_normalWorldOnB;
+					solverConstraint.m_relpos2CrossNormal = -torqueAxis1;
+				}else
+				{
+					solverConstraint.m_contactNormal2.setZero();
+					solverConstraint.m_relpos2CrossNormal.setZero();
+				}
+				btScalar restitution = 0.f;
+				btScalar penetration = cp.getDistance()+infoGlobal.m_linearSlop;
+				{
+					btVector3 vel1,vel2;
+					vel1 = rb0? rb0->getVelocityInLocalPoint(rel_pos1) : btVector3(0,0,0);
+					vel2 = rb1? rb1->getVelocityInLocalPoint(rel_pos2) : btVector3(0,0,0);
-			btVector3 vel1 = rb0 ? rb0->getVelocityInLocalPoint(rel_pos1) : btVector3(0,0,0);
-			btVector3 vel2 = rb1 ? rb1->getVelocityInLocalPoint(rel_pos2) : btVector3(0,0,0);
-			vel  = vel1 - vel2;
-			rel_vel = cp.m_normalWorldOnB.dot(vel);
+	//			btVector3 vel2 = rb1 ? rb1->getVelocityInLocalPoint(rel_pos2) : btVector3(0,0,0);
+					btVector3 vel  = vel1 - vel2;
+					btScalar rel_vel = cp.m_normalWorldOnB.dot(vel);
-				btScalar penetration = cp.getDistance()+infoGlobal.m_linearSlop;
-				solverConstraint.m_friction = cp.m_combinedFriction;
+					solverConstraint.m_friction = cp.m_combinedFriction;
-				btScalar restitution = 0.f;
-				if (cp.m_lifeTime>infoGlobal.m_restingContactRestitutionThreshold)
-				{
-					restitution = 0.f;
-				} else
-				{
 					restitution =  restitutionCurve(rel_vel, cp.m_combinedRestitution);
 					if (restitution <= btScalar(0.))
@@ -537,9 +861,9 @@ void btSequentialImpulseConstraintSolver::setupContactConstraint(btSolverConstra
 					solverConstraint.m_appliedImpulse = cp.m_appliedImpulse * infoGlobal.m_warmstartingFactor;
 					if (rb0)
-						rb0->internalApplyImpulse(solverConstraint.m_contactNormal*rb0->getInvMass()*rb0->getLinearFactor(),solverConstraint.m_angularComponentA,solverConstraint.m_appliedImpulse);
+						bodyA->internalApplyImpulse(solverConstraint.m_contactNormal1*bodyA->internalGetInvMass()*rb0->getLinearFactor(),solverConstraint.m_angularComponentA,solverConstraint.m_appliedImpulse);
 					if (rb1)
-						rb1->internalApplyImpulse(solverConstraint.m_contactNormal*rb1->getInvMass()*rb1->getLinearFactor(),-solverConstraint.m_angularComponentB,-(btScalar)solverConstraint.m_appliedImpulse);
+						bodyB->internalApplyImpulse(-solverConstraint.m_contactNormal2*bodyB->internalGetInvMass()*rb1->getLinearFactor(),-solverConstraint.m_angularComponentB,-(btScalar)solverConstraint.m_appliedImpulse);
 				} else
 					solverConstraint.m_appliedImpulse = 0.f;
@@ -548,40 +872,51 @@ void btSequentialImpulseConstraintSolver::setupContactConstraint(btSolverConstra
 				solverConstraint.m_appliedPushImpulse = 0.f;
-					btScalar rel_vel;
-					btScalar vel1Dotn = solverConstraint.m_contactNormal.dot(rb0?rb0->getLinearVelocity():btVector3(0,0,0)) 
-						+ solverConstraint.m_relpos1CrossNormal.dot(rb0?rb0->getAngularVelocity():btVector3(0,0,0));
-					btScalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rb1?rb1->getLinearVelocity():btVector3(0,0,0)) 
-						+ solverConstraint.m_relpos2CrossNormal.dot(rb1?rb1->getAngularVelocity():btVector3(0,0,0));
-					rel_vel = vel1Dotn+vel2Dotn;
+					btVector3 externalForceImpulseA = bodyA->m_originalBody ? bodyA->m_externalForceImpulse: btVector3(0,0,0);
+					btVector3 externalTorqueImpulseA = bodyA->m_originalBody ? bodyA->m_externalTorqueImpulse: btVector3(0,0,0);
+					btVector3 externalForceImpulseB = bodyB->m_originalBody ? bodyB->m_externalForceImpulse: btVector3(0,0,0);
+					btVector3 externalTorqueImpulseB = bodyB->m_originalBody ?bodyB->m_externalTorqueImpulse : btVector3(0,0,0);
+					btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(bodyA->m_linearVelocity+externalForceImpulseA)
+						+ solverConstraint.m_relpos1CrossNormal.dot(bodyA->m_angularVelocity+externalTorqueImpulseA);
+					btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(bodyB->m_linearVelocity+externalForceImpulseB)
+						+ solverConstraint.m_relpos2CrossNormal.dot(bodyB->m_angularVelocity+externalTorqueImpulseB);
+					btScalar rel_vel = vel1Dotn+vel2Dotn;
 					btScalar positionalError = 0.f;
 					btScalar	velocityError = restitution - rel_vel;// * damping;
 					if (penetration>0)
 						positionalError = 0;
-						velocityError -= penetration / infoGlobal.m_timeStep;
+						velocityError -= penetration *invTimeStep;
 					} else
-						positionalError = -penetration * infoGlobal.m_erp/infoGlobal.m_timeStep;
+						positionalError = -penetration * erp*invTimeStep;
 					btScalar  penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
 					btScalar velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
 					if (!infoGlobal.m_splitImpulse || (penetration > infoGlobal.m_splitImpulsePenetrationThreshold))
 						//combine position and velocity into rhs
-						solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
+						solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;//-solverConstraint.m_contactNormal1.dot(bodyA->m_externalForce*bodyA->m_invMass-bodyB->m_externalForce/bodyB->m_invMass)*solverConstraint.m_jacDiagABInv;
 						solverConstraint.m_rhsPenetration = 0.f;
 					} else
 						//split position and velocity into rhs and m_rhsPenetration
 						solverConstraint.m_rhs = velocityImpulse;
 						solverConstraint.m_rhsPenetration = penetrationImpulse;
-					solverConstraint.m_cfm = 0.f;
+					solverConstraint.m_cfm = cfm*solverConstraint.m_jacDiagABInv;
 					solverConstraint.m_lowerLimit = 0;
 					solverConstraint.m_upperLimit = 1e10f;
@@ -593,52 +928,47 @@ void btSequentialImpulseConstraintSolver::setupContactConstraint(btSolverConstra
-void btSequentialImpulseConstraintSolver::setFrictionConstraintImpulse( btSolverConstraint& solverConstraint, 
-																		btRigidBody* rb0, btRigidBody* rb1, 
+void btSequentialImpulseConstraintSolver::setFrictionConstraintImpulse( btSolverConstraint& solverConstraint,
+																		int solverBodyIdA, int solverBodyIdB,
 																 btManifoldPoint& cp, const btContactSolverInfo& infoGlobal)
-					if (infoGlobal.m_solverMode & SOLVER_USE_FRICTION_WARMSTARTING)
-					{
-						{
-							btSolverConstraint& frictionConstraint1 = m_tmpSolverContactFrictionConstraintPool[solverConstraint.m_frictionIndex];
-							if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
-							{
-								frictionConstraint1.m_appliedImpulse = cp.m_appliedImpulseLateral1 * infoGlobal.m_warmstartingFactor;
-								if (rb0)
-									rb0->internalApplyImpulse(frictionConstraint1.m_contactNormal*rb0->getInvMass()*rb0->getLinearFactor(),frictionConstraint1.m_angularComponentA,frictionConstraint1.m_appliedImpulse);
-								if (rb1)
-									rb1->internalApplyImpulse(frictionConstraint1.m_contactNormal*rb1->getInvMass()*rb1->getLinearFactor(),-frictionConstraint1.m_angularComponentB,-(btScalar)frictionConstraint1.m_appliedImpulse);
-							} else
-							{
-								frictionConstraint1.m_appliedImpulse = 0.f;
-							}
-						}
-						if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
-						{
-							btSolverConstraint& frictionConstraint2 = m_tmpSolverContactFrictionConstraintPool[solverConstraint.m_frictionIndex+1];
-							if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
-							{
-								frictionConstraint2.m_appliedImpulse = cp.m_appliedImpulseLateral2 * infoGlobal.m_warmstartingFactor;
-								if (rb0)
-									rb0->internalApplyImpulse(frictionConstraint2.m_contactNormal*rb0->getInvMass(),frictionConstraint2.m_angularComponentA,frictionConstraint2.m_appliedImpulse);
-								if (rb1)
-									rb1->internalApplyImpulse(frictionConstraint2.m_contactNormal*rb1->getInvMass(),-frictionConstraint2.m_angularComponentB,-(btScalar)frictionConstraint2.m_appliedImpulse);
-							} else
-							{
-								frictionConstraint2.m_appliedImpulse = 0.f;
-							}
-						}
-					} else
-					{
-						btSolverConstraint& frictionConstraint1 = m_tmpSolverContactFrictionConstraintPool[solverConstraint.m_frictionIndex];
-						frictionConstraint1.m_appliedImpulse = 0.f;
-						if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
-						{
-							btSolverConstraint& frictionConstraint2 = m_tmpSolverContactFrictionConstraintPool[solverConstraint.m_frictionIndex+1];
-							frictionConstraint2.m_appliedImpulse = 0.f;
-						}
-					}
+	btSolverBody* bodyA = &m_tmpSolverBodyPool[solverBodyIdA];
+	btSolverBody* bodyB = &m_tmpSolverBodyPool[solverBodyIdB];
+	btRigidBody* rb0 = bodyA->m_originalBody;
+	btRigidBody* rb1 = bodyB->m_originalBody;
+	{
+		btSolverConstraint& frictionConstraint1 = m_tmpSolverContactFrictionConstraintPool[solverConstraint.m_frictionIndex];
+		if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
+		{
+			frictionConstraint1.m_appliedImpulse = cp.m_appliedImpulseLateral1 * infoGlobal.m_warmstartingFactor;
+			if (rb0)
+				bodyA->internalApplyImpulse(frictionConstraint1.m_contactNormal1*rb0->getInvMass()*rb0->getLinearFactor(),frictionConstraint1.m_angularComponentA,frictionConstraint1.m_appliedImpulse);
+			if (rb1)
+				bodyB->internalApplyImpulse(-frictionConstraint1.m_contactNormal2*rb1->getInvMass()*rb1->getLinearFactor(),-frictionConstraint1.m_angularComponentB,-(btScalar)frictionConstraint1.m_appliedImpulse);
+		} else
+		{
+			frictionConstraint1.m_appliedImpulse = 0.f;
+		}
+	}
+	if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
+	{
+		btSolverConstraint& frictionConstraint2 = m_tmpSolverContactFrictionConstraintPool[solverConstraint.m_frictionIndex+1];
+		if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
+		{
+			frictionConstraint2.m_appliedImpulse = cp.m_appliedImpulseLateral2  * infoGlobal.m_warmstartingFactor;
+			if (rb0)
+				bodyA->internalApplyImpulse(frictionConstraint2.m_contactNormal1*rb0->getInvMass(),frictionConstraint2.m_angularComponentA,frictionConstraint2.m_appliedImpulse);
+			if (rb1)
+				bodyB->internalApplyImpulse(-frictionConstraint2.m_contactNormal2*rb1->getInvMass(),-frictionConstraint2.m_angularComponentB,-(btScalar)frictionConstraint2.m_appliedImpulse);
+		} else
+		{
+			frictionConstraint2.m_appliedImpulse = 0.f;
+		}
+	}
@@ -651,14 +981,22 @@ void	btSequentialImpulseConstraintSolver::convertContact(btPersistentManifold* m
 	colObj0 = (btCollisionObject*)manifold->getBody0();
 	colObj1 = (btCollisionObject*)manifold->getBody1();
+	int solverBodyIdA = getOrInitSolverBody(*colObj0,infoGlobal.m_timeStep);
+	int solverBodyIdB = getOrInitSolverBody(*colObj1,infoGlobal.m_timeStep);
+//	btRigidBody* bodyA = btRigidBody::upcast(colObj0);
+//	btRigidBody* bodyB = btRigidBody::upcast(colObj1);
+	btSolverBody* solverBodyA = &m_tmpSolverBodyPool[solverBodyIdA];
+	btSolverBody* solverBodyB = &m_tmpSolverBodyPool[solverBodyIdB];
-	btRigidBody* solverBodyA = btRigidBody::upcast(colObj0);
-	btRigidBody* solverBodyB = btRigidBody::upcast(colObj1);
 	///avoid collision response between two static objects
-	if ((!solverBodyA || !solverBodyA->getInvMass()) && (!solverBodyB || !solverBodyB->getInvMass()))
+	if (!solverBodyA || (solverBodyA->m_invMass.fuzzyZero() && (!solverBodyB || solverBodyB->m_invMass.fuzzyZero())))
+	int rollingFriction=1;
 	for (int j=0;j<manifold->getNumContacts();j++)
@@ -669,18 +1007,35 @@ void	btSequentialImpulseConstraintSolver::convertContact(btPersistentManifold* m
 			btVector3 rel_pos1;
 			btVector3 rel_pos2;
 			btScalar relaxation;
-			btScalar rel_vel;
-			btVector3 vel;
 			int frictionIndex = m_tmpSolverContactConstraintPool.size();
 			btSolverConstraint& solverConstraint = m_tmpSolverContactConstraintPool.expandNonInitializing();
 			btRigidBody* rb0 = btRigidBody::upcast(colObj0);
 			btRigidBody* rb1 = btRigidBody::upcast(colObj1);
-			solverConstraint.m_solverBodyA = rb0? rb0 : &getFixedBody();
-			solverConstraint.m_solverBodyB = rb1? rb1 : &getFixedBody();
+			solverConstraint.m_solverBodyIdA = solverBodyIdA;
+			solverConstraint.m_solverBodyIdB = solverBodyIdB;
 			solverConstraint.m_originalContactPoint = &cp;
-			setupContactConstraint(solverConstraint, colObj0, colObj1, cp, infoGlobal, vel, rel_vel, relaxation, rel_pos1, rel_pos2);
+			const btVector3& pos1 = cp.getPositionWorldOnA();
+			const btVector3& pos2 = cp.getPositionWorldOnB();
+			rel_pos1 = pos1 - colObj0->getWorldTransform().getOrigin();
+			rel_pos2 = pos2 - colObj1->getWorldTransform().getOrigin();
+			btVector3 vel1;// = rb0 ? rb0->getVelocityInLocalPoint(rel_pos1) : btVector3(0,0,0);
+			btVector3 vel2;// = rb1 ? rb1->getVelocityInLocalPoint(rel_pos2) : btVector3(0,0,0);
+			solverBodyA->getVelocityInLocalPointNoDelta(rel_pos1,vel1);
+			solverBodyB->getVelocityInLocalPointNoDelta(rel_pos2,vel2 );
+			btVector3 vel  = vel1 - vel2;
+			btScalar rel_vel = cp.m_normalWorldOnB.dot(vel);
+			setupContactConstraint(solverConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal, relaxation, rel_pos1, rel_pos2);
 //			const btVector3& pos1 = cp.getPositionWorldOnA();
 //			const btVector3& pos2 = cp.getPositionWorldOnB();
@@ -689,95 +1044,250 @@ void	btSequentialImpulseConstraintSolver::convertContact(btPersistentManifold* m
 			solverConstraint.m_frictionIndex = m_tmpSolverContactFrictionConstraintPool.size();
-			if (!(infoGlobal.m_solverMode & SOLVER_ENABLE_FRICTION_DIRECTION_CACHING) || !cp.m_lateralFrictionInitialized)
+			btVector3 angVelA(0,0,0),angVelB(0,0,0);
+			if (rb0)
+				angVelA = rb0->getAngularVelocity();
+			if (rb1)
+				angVelB = rb1->getAngularVelocity();
+			btVector3 relAngVel = angVelB-angVelA;
+			if ((cp.m_combinedRollingFriction>0.f) && (rollingFriction>0))
+			{
+				//only a single rollingFriction per manifold
+				rollingFriction--;
+				if (relAngVel.length()>infoGlobal.m_singleAxisRollingFrictionThreshold)
+				{
+					relAngVel.normalize();
+					applyAnisotropicFriction(colObj0,relAngVel,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					applyAnisotropicFriction(colObj1,relAngVel,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					if (relAngVel.length()>0.001)
+						addRollingFrictionConstraint(relAngVel,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+				} else
+				{
+					addRollingFrictionConstraint(cp.m_normalWorldOnB,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+					btVector3 axis0,axis1;
+					btPlaneSpace1(cp.m_normalWorldOnB,axis0,axis1);
+					applyAnisotropicFriction(colObj0,axis0,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					applyAnisotropicFriction(colObj1,axis0,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					applyAnisotropicFriction(colObj0,axis1,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					applyAnisotropicFriction(colObj1,axis1,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					if (axis0.length()>0.001)
+						addRollingFrictionConstraint(axis0,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+					if (axis1.length()>0.001)
+						addRollingFrictionConstraint(axis1,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+				}
+			}
+			///Bullet has several options to set the friction directions
+			///By default, each contact has only a single friction direction that is recomputed automatically every frame
+			///based on the relative linear velocity.
+			///If the relative velocity is zero, it will automatically compute a friction direction.
+			///You can also enable two friction directions, using the SOLVER_USE_2_FRICTION_DIRECTIONS.
+			///In that case, the second friction direction will be orthogonal to both contact normal and first friction direction.
+			///
+			///If you choose SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION, then the friction will be independent from the relative projected velocity.
+			///
+			///The user can manually override the friction directions for certain contacts using a contact callback,
+			///and set the cp.m_lateralFrictionInitialized to true
+			///In that case, you can set the target relative motion in each friction direction (cp.m_contactMotion1 and cp.m_contactMotion2)
+			///this will give a conveyor belt effect
+			///
 				cp.m_lateralFrictionDir1 = vel - cp.m_normalWorldOnB * rel_vel;
 				btScalar lat_rel_vel = cp.m_lateralFrictionDir1.length2();
 				if (!(infoGlobal.m_solverMode & SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION) && lat_rel_vel > SIMD_EPSILON)
-					cp.m_lateralFrictionDir1 /= btSqrt(lat_rel_vel);
+					cp.m_lateralFrictionDir1 *= 1.f/btSqrt(lat_rel_vel);
+					applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir1,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+					applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir1,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+					addFrictionConstraint(cp.m_lateralFrictionDir1,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
 					if((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
 						cp.m_lateralFrictionDir2 = cp.m_lateralFrictionDir1.cross(cp.m_normalWorldOnB);
-						applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir2);
-						applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir2);
-						addFrictionConstraint(cp.m_lateralFrictionDir2,solverBodyA,solverBodyB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+						applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir2,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+						applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir2,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+						addFrictionConstraint(cp.m_lateralFrictionDir2,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
-					applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir1);
-					applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir1);
-					addFrictionConstraint(cp.m_lateralFrictionDir1,solverBodyA,solverBodyB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
-					cp.m_lateralFrictionInitialized = true;
 				} else
-					//re-calculate friction direction every frame, todo: check if this is really needed
+					applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir1,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+					applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir1,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+					addFrictionConstraint(cp.m_lateralFrictionDir1,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
 					if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
-						applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir2);
-						applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir2);
-						addFrictionConstraint(cp.m_lateralFrictionDir2,solverBodyA,solverBodyB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+						applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir2,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+						applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir2,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+						addFrictionConstraint(cp.m_lateralFrictionDir2,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
-					applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir1);
-					applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir1);
-					addFrictionConstraint(cp.m_lateralFrictionDir1,solverBodyA,solverBodyB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
-					cp.m_lateralFrictionInitialized = true;
+					{
+					}
 			} else
-				addFrictionConstraint(cp.m_lateralFrictionDir1,solverBodyA,solverBodyB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation,cp.m_contactMotion1, cp.m_contactCFM1);
+				addFrictionConstraint(cp.m_lateralFrictionDir1,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation,cp.m_contactMotion1, cp.m_frictionCFM);
 				if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
-					addFrictionConstraint(cp.m_lateralFrictionDir2,solverBodyA,solverBodyB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation, cp.m_contactMotion2, cp.m_contactCFM2);
+					addFrictionConstraint(cp.m_lateralFrictionDir2,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation, cp.m_contactMotion2, cp.m_frictionCFM);
-			setFrictionConstraintImpulse( solverConstraint, rb0, rb1, cp, infoGlobal);
+			setFrictionConstraintImpulse( solverConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal);
-btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc)
+void btSequentialImpulseConstraintSolver::convertContacts(btPersistentManifold** manifoldPtr,int numManifolds, const btContactSolverInfo& infoGlobal)
-	BT_PROFILE("solveGroupCacheFriendlySetup");
-	(void)stackAlloc;
-	(void)debugDrawer;
+	int i;
+	btPersistentManifold* manifold = 0;
+//			btCollisionObject* colObj0=0,*colObj1=0;
-	m_maxOverrideNumSolverIterations = 0;
-	if (!(numConstraints + numManifolds))
+	for (i=0;i<numManifolds;i++)
-		//		printf("empty\n");
-		return 0.f;
+		manifold = manifoldPtr[i];
+		convertContact(manifold,infoGlobal);
-	if (infoGlobal.m_splitImpulse)
+btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
+	m_fixedBodyId = -1;
+	BT_PROFILE("solveGroupCacheFriendlySetup");
+	(void)debugDrawer;
+	m_maxOverrideNumSolverIterations = 0;
+	 //make sure that dynamic bodies exist for all (enabled) constraints
+	for (int i=0;i<numConstraints;i++)
-		for (int i = 0; i < numBodies; i++)
+		btTypedConstraint* constraint = constraints[i];
+		if (constraint->isEnabled())
-			btRigidBody* body = btRigidBody::upcast(bodies[i]);
-			if (body)
-			{	
-				body->internalGetDeltaLinearVelocity().setZero();
-				body->internalGetDeltaAngularVelocity().setZero();
-				body->internalGetPushVelocity().setZero();
-				body->internalGetTurnVelocity().setZero();
+			if (!constraint->getRigidBodyA().isStaticOrKinematicObject())
+			{
+				bool found=false;
+				for (int b=0;b<numBodies;b++)
+				{
+					if (&constraint->getRigidBodyA()==bodies[b])
+					{
+						found = true;
+						break;
+					}
+				}
+				btAssert(found);
+			}
+			if (!constraint->getRigidBodyB().isStaticOrKinematicObject())
+			{
+				bool found=false;
+				for (int b=0;b<numBodies;b++)
+				{
+					if (&constraint->getRigidBodyB()==bodies[b])
+					{
+						found = true;
+						break;
+					}
+				}
+				btAssert(found);
-	else
+    //make sure that dynamic bodies exist for all contact manifolds
+    for (int i=0;i<numManifolds;i++)
+    {
+        if (!manifoldPtr[i]->getBody0()->isStaticOrKinematicObject())
+        {
+            bool found=false;
+            for (int b=0;b<numBodies;b++)
+            {
+                if (manifoldPtr[i]->getBody0()==bodies[b])
+                {
+                    found = true;
+                    break;
+                }
+            }
+            btAssert(found);
+        }
+        if (!manifoldPtr[i]->getBody1()->isStaticOrKinematicObject())
+        {
+            bool found=false;
+            for (int b=0;b<numBodies;b++)
+            {
+                if (manifoldPtr[i]->getBody1()==bodies[b])
+                {
+                    found = true;
+                    break;
+                }
+            }
+            btAssert(found);
+        }
+    }
+	for (int i = 0; i < numBodies; i++)
+	{
+		bodies[i]->setCompanionId(-1);
+	}
+	m_tmpSolverBodyPool.reserve(numBodies+1);
+	m_tmpSolverBodyPool.resize(0);
+	//btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
+    //initSolverBody(&fixedBody,0);
+	//convert all bodies
+	for (int i=0;i<numBodies;i++)
-		for (int i = 0; i < numBodies; i++)
+		int bodyId = getOrInitSolverBody(*bodies[i],infoGlobal.m_timeStep);
+		btRigidBody* body = btRigidBody::upcast(bodies[i]);
+		if (body && body->getInvMass())
-			btRigidBody* body = btRigidBody::upcast(bodies[i]);
-			if (body)
-			{	
-				body->internalGetDeltaLinearVelocity().setZero();
-				body->internalGetDeltaAngularVelocity().setZero();
+			btSolverBody& solverBody = m_tmpSolverBodyPool[bodyId];
+			btVector3 gyroForce (0,0,0);
+			{
+				gyroForce = body->computeGyroscopicForceExplicit(infoGlobal.m_maxGyroscopicForce);
+				solverBody.m_externalTorqueImpulse -= gyroForce*body->getInvInertiaTensorWorld()*infoGlobal.m_timeStep;
+			}
+			{
+				gyroForce = body->computeGyroscopicImpulseImplicit_World(infoGlobal.m_timeStep);
+				solverBody.m_externalTorqueImpulse += gyroForce;
+			}
+			{
+				gyroForce = body->computeGyroscopicImpulseImplicit_Body(infoGlobal.m_timeStep);
+				solverBody.m_externalTorqueImpulse += gyroForce;
@@ -791,6 +1301,7 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 	//btRigidBody* rb0=0,*rb1=0;
 	//if (1)
@@ -799,12 +1310,24 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 			int totalNumRows = 0;
 			int i;
-			m_tmpConstraintSizesPool.resize(numConstraints);
+			m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
 			//calculate the total number of contraint rows
 			for (i=0;i<numConstraints;i++)
 				btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+				btJointFeedback* fb = constraints[i]->getJointFeedback();
+				if (fb)
+				{
+					fb->m_appliedForceBodyA.setZero();
+					fb->m_appliedTorqueBodyA.setZero();
+					fb->m_appliedForceBodyB.setZero();
+					fb->m_appliedTorqueBodyB.setZero();
+				}
+				if (constraints[i]->isEnabled())
+				{
+				}
 				if (constraints[i]->isEnabled())
@@ -815,16 +1338,16 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 				totalNumRows += info1.m_numConstraintRows;
-			m_tmpSolverNonContactConstraintPool.resize(totalNumRows);
+			m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
 			///setup the btSolverConstraints
 			int currentRow = 0;
 			for (i=0;i<numConstraints;i++)
 				const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
 				if (info1.m_numConstraintRows)
@@ -834,6 +1357,14 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 					btRigidBody& rbA = constraint->getRigidBodyA();
 					btRigidBody& rbB = constraint->getRigidBodyB();
+					int solverBodyIdA = getOrInitSolverBody(rbA,infoGlobal.m_timeStep);
+                    int solverBodyIdB = getOrInitSolverBody(rbB,infoGlobal.m_timeStep);
+                    btSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];
+                    btSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
 					int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
 					if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)
@@ -848,28 +1379,31 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 						currentConstraintRow[j].m_upperLimit = SIMD_INFINITY;
 						currentConstraintRow[j].m_appliedImpulse = 0.f;
 						currentConstraintRow[j].m_appliedPushImpulse = 0.f;
-						currentConstraintRow[j].m_solverBodyA = &rbA;
-						currentConstraintRow[j].m_solverBodyB = &rbB;
+						currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
+						currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
 						currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
-					rbA.internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
-					rbA.internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
-					rbB.internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
-					rbB.internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+					bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+					bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+					bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+					bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+					bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+					bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+					bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+					bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
 					btTypedConstraint::btConstraintInfo2 info2;
 					info2.fps = 1.f/infoGlobal.m_timeStep;
 					info2.erp = infoGlobal.m_erp;
-					info2.m_J1linearAxis = currentConstraintRow->m_contactNormal;
+					info2.m_J1linearAxis = currentConstraintRow->m_contactNormal1;
 					info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
-					info2.m_J2linearAxis = 0;
+					info2.m_J2linearAxis = currentConstraintRow->m_contactNormal2;
 					info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
 					info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this
 					///the size of btSolverConstraint needs be a multiple of btScalar
-					btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint));
+		            btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint));
 					info2.m_constraintError = &currentConstraintRow->m_rhs;
 					currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
 					info2.m_damping = infoGlobal.m_damping;
@@ -906,29 +1440,37 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
-							btVector3 iMJlA = solverConstraint.m_contactNormal*rbA.getInvMass();
+							btVector3 iMJlA = solverConstraint.m_contactNormal1*rbA.getInvMass();
 							btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal;
-							btVector3 iMJlB = solverConstraint.m_contactNormal*rbB.getInvMass();//sign of normal?
+							btVector3 iMJlB = solverConstraint.m_contactNormal2*rbB.getInvMass();//sign of normal?
 							btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal;
-							btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal);
+							btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal1);
 							sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
-							sum += iMJlB.dot(solverConstraint.m_contactNormal);
+							sum += iMJlB.dot(solverConstraint.m_contactNormal2);
 							sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
-							solverConstraint.m_jacDiagABInv = btScalar(1.)/sum;
+							btScalar fsum = btFabs(sum);
+							btAssert(fsum > SIMD_EPSILON);
+							solverConstraint.m_jacDiagABInv = fsum>SIMD_EPSILON?btScalar(1.)/sum : 0.f;
-						///fix rhs
-						///todo: add force/torque accelerators
 							btScalar rel_vel;
-							btScalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.getLinearVelocity()) + solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity());
-							btScalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.getLinearVelocity()) + solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity());
+							btVector3 externalForceImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalForceImpulse : btVector3(0,0,0);
+							btVector3 externalTorqueImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalTorqueImpulse : btVector3(0,0,0);
-							rel_vel = vel1Dotn+vel2Dotn;
+							btVector3 externalForceImpulseB = bodyBPtr->m_originalBody ? bodyBPtr->m_externalForceImpulse : btVector3(0,0,0);
+							btVector3 externalTorqueImpulseB = bodyBPtr->m_originalBody ?bodyBPtr->m_externalTorqueImpulse : btVector3(0,0,0);
+							btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(rbA.getLinearVelocity()+externalForceImpulseA)
+												+ solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity()+externalTorqueImpulseA);
+							btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(rbB.getLinearVelocity()+externalForceImpulseB)
+																+ solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity()+externalTorqueImpulseB);
+							rel_vel = vel1Dotn+vel2Dotn;
 							btScalar restitution = 0.f;
 							btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
 							btScalar	velocityError = restitution - rel_vel * info2.m_damping;
@@ -937,6 +1479,7 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 							solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
 							solverConstraint.m_appliedImpulse = 0.f;
@@ -944,21 +1487,11 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
-		{
-			int i;
-			btPersistentManifold* manifold = 0;
-//			btCollisionObject* colObj0=0,*colObj1=0;
+		convertContacts(manifoldPtr,numManifolds,infoGlobal);
-			for (i=0;i<numManifolds;i++)
-			{
-				manifold = manifoldPtr[i];
-				convertContact(manifold,infoGlobal);
-			}
-		}
-	btContactSolverInfo info = infoGlobal;
+//	btContactSolverInfo info = infoGlobal;
 	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
@@ -966,9 +1499,13 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 	int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
 	///@todo: use stack allocator for such temporarily memory, same for solver bodies/constraints
-	m_orderNonContactConstraintPool.resize(numNonContactPool);
-	m_orderTmpConstraintPool.resize(numConstraintPool);
-	m_orderFrictionConstraintPool.resize(numFrictionPool);
+	m_orderNonContactConstraintPool.resizeNoInitialize(numNonContactPool);
+	if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
+		m_orderTmpConstraintPool.resizeNoInitialize(numConstraintPool*2);
+	else
+		m_orderTmpConstraintPool.resizeNoInitialize(numConstraintPool);
+	m_orderFrictionConstraintPool.resizeNoInitialize(numFrictionPool);
 		int i;
 		for (i=0;i<numNonContactPool;i++)
@@ -989,36 +1526,37 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
-btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration, btCollisionObject** /*bodies */,int /*numBodies*/,btPersistentManifold** /*manifoldPtr*/, int /*numManifolds*/,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* /*debugDrawer*/,btStackAlloc* /*stackAlloc*/)
+btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration, btCollisionObject** /*bodies */,int /*numBodies*/,btPersistentManifold** /*manifoldPtr*/, int /*numManifolds*/,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* /*debugDrawer*/)
 	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
 	int numConstraintPool = m_tmpSolverContactConstraintPool.size();
 	int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
-	int j;
 	if (infoGlobal.m_solverMode & SOLVER_RANDMIZE_ORDER)
-		if ((iteration & 7) == 0) {
-			for (j=0; j<numNonContactPool; ++j) {
+		if (1)			// uncomment this for a bit less random ((iteration & 7) == 0)
+		{
+			for (int j=0; j<numNonContactPool; ++j) {
 				int tmp = m_orderNonContactConstraintPool[j];
 				int swapi = btRandInt2(j+1);
 				m_orderNonContactConstraintPool[j] = m_orderNonContactConstraintPool[swapi];
 				m_orderNonContactConstraintPool[swapi] = tmp;
-			//contact/friction constraints are not solved more than 
+			//contact/friction constraints are not solved more than
 			if (iteration< infoGlobal.m_numIterations)
-				for (j=0; j<numConstraintPool; ++j) {
+				for (int j=0; j<numConstraintPool; ++j) {
 					int tmp = m_orderTmpConstraintPool[j];
 					int swapi = btRandInt2(j+1);
 					m_orderTmpConstraintPool[j] = m_orderTmpConstraintPool[swapi];
 					m_orderTmpConstraintPool[swapi] = tmp;
-				for (j=0; j<numFrictionPool; ++j) {
+				for (int j=0; j<numFrictionPool; ++j) {
 					int tmp = m_orderFrictionConstraintPool[j];
 					int swapi = btRandInt2(j+1);
 					m_orderFrictionConstraintPool[j] = m_orderFrictionConstraintPool[swapi];
@@ -1031,72 +1569,164 @@ btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration
 	if (infoGlobal.m_solverMode & SOLVER_SIMD)
 		///solve all joint constraints, using SIMD, if available
-		for (j=0;j<m_tmpSolverNonContactConstraintPool.size();j++)
+		for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++)
 			btSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[m_orderNonContactConstraintPool[j]];
 			if (iteration < constraint.m_overrideNumSolverIterations)
-				resolveSingleConstraintRowGenericSIMD(*constraint.m_solverBodyA,*constraint.m_solverBodyB,constraint);
+				resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
 		if (iteration< infoGlobal.m_numIterations)
-			for (j=0;j<numConstraints;j++)
+			for (int j=0;j<numConstraints;j++)
-				constraints[j]->solveConstraintObsolete(constraints[j]->getRigidBodyA(),constraints[j]->getRigidBodyB(),infoGlobal.m_timeStep);
+				if (constraints[j]->isEnabled())
+				{
+					int bodyAid = getOrInitSolverBody(constraints[j]->getRigidBodyA(),infoGlobal.m_timeStep);
+					int bodyBid = getOrInitSolverBody(constraints[j]->getRigidBodyB(),infoGlobal.m_timeStep);
+					btSolverBody& bodyA = m_tmpSolverBodyPool[bodyAid];
+					btSolverBody& bodyB = m_tmpSolverBodyPool[bodyBid];
+					constraints[j]->solveConstraintObsolete(bodyA,bodyB,infoGlobal.m_timeStep);
+				}
 			///solve all contact constraints using SIMD, if available
-			int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
-			for (j=0;j<numPoolConstraints;j++)
-				const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
-				resolveSingleConstraintRowLowerLimitSIMD(*solveManifold.m_solverBodyA,*solveManifold.m_solverBodyB,solveManifold);
+				int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+				int multiplier = (infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS)? 2 : 1;
+				for (int c=0;c<numPoolConstraints;c++)
+				{
+					btScalar totalImpulse =0;
+					{
+						const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[c]];
+						resolveSingleConstraintRowLowerLimitSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+						totalImpulse = solveManifold.m_appliedImpulse;
+					}
+					bool applyFriction = true;
+					if (applyFriction)
+					{
+						{
+							btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[c*multiplier]];
+							if (totalImpulse>btScalar(0))
+							{
+								solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+								solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+								resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+							}
+						}
+						if (infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS)
+						{
+							btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[c*multiplier+1]];
+							if (totalImpulse>btScalar(0))
+							{
+								solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+								solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+								resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+							}
+						}
+					}
+				}
-			///solve all friction constraints, using SIMD, if available
-			int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
-			for (j=0;j<numFrictionPoolConstraints;j++)
-				btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
-				btScalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
+				//solve the friction constraints after all contact constraints, don't interleave them
+				int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+				int j;
-				if (totalImpulse>btScalar(0))
+				for (j=0;j<numPoolConstraints;j++)
-					solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
-					solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+					const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
+					resolveSingleConstraintRowLowerLimitSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+				}
+				///solve all friction constraints, using SIMD, if available
+				int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
+				for (j=0;j<numFrictionPoolConstraints;j++)
+				{
+					btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
+					btScalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
+					if (totalImpulse>btScalar(0))
+					{
+						solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
+						solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
-					resolveSingleConstraintRowGenericSIMD(*solveManifold.m_solverBodyA,	*solveManifold.m_solverBodyB,solveManifold);
+						resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+					}
+				int numRollingFrictionPoolConstraints = m_tmpSolverContactRollingFrictionConstraintPool.size();
+				for (j=0;j<numRollingFrictionPoolConstraints;j++)
+				{
+					btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[j];
+					btScalar totalImpulse = m_tmpSolverContactConstraintPool[rollingFrictionConstraint.m_frictionIndex].m_appliedImpulse;
+					if (totalImpulse>btScalar(0))
+					{
+						btScalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse;
+						if (rollingFrictionMagnitude>rollingFrictionConstraint.m_friction)
+							rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+						rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+						rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+						resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdA],m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdB],rollingFrictionConstraint);
+					}
+				}
 	} else
+		//non-SIMD version
 		///solve all joint constraints
-		for (j=0;j<m_tmpSolverNonContactConstraintPool.size();j++)
+		for (int j=0;j<m_tmpSolverNonContactConstraintPool.size();j++)
 			btSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[m_orderNonContactConstraintPool[j]];
 			if (iteration < constraint.m_overrideNumSolverIterations)
-				resolveSingleConstraintRowGeneric(*constraint.m_solverBodyA,*constraint.m_solverBodyB,constraint);
+				resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
 		if (iteration< infoGlobal.m_numIterations)
-			for (j=0;j<numConstraints;j++)
+			for (int j=0;j<numConstraints;j++)
-				constraints[j]->solveConstraintObsolete(constraints[j]->getRigidBodyA(),constraints[j]->getRigidBodyB(),infoGlobal.m_timeStep);
+				if (constraints[j]->isEnabled())
+				{
+					int bodyAid = getOrInitSolverBody(constraints[j]->getRigidBodyA(),infoGlobal.m_timeStep);
+					int bodyBid = getOrInitSolverBody(constraints[j]->getRigidBodyB(),infoGlobal.m_timeStep);
+					btSolverBody& bodyA = m_tmpSolverBodyPool[bodyAid];
+					btSolverBody& bodyB = m_tmpSolverBodyPool[bodyBid];
+					constraints[j]->solveConstraintObsolete(bodyA,bodyB,infoGlobal.m_timeStep);
+				}
 			///solve all contact constraints
 			int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
-			for (j=0;j<numPoolConstraints;j++)
+			for (int j=0;j<numPoolConstraints;j++)
 				const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
-				resolveSingleConstraintRowLowerLimit(*solveManifold.m_solverBodyA,*solveManifold.m_solverBodyB,solveManifold);
+				resolveSingleConstraintRowLowerLimit(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
 			///solve all friction constraints
 			int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
-			for (j=0;j<numFrictionPoolConstraints;j++)
+			for (int j=0;j<numFrictionPoolConstraints;j++)
 				btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
 				btScalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
@@ -1106,7 +1736,25 @@ btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration
 					solveManifold.m_lowerLimit = -(solveManifold.m_friction*totalImpulse);
 					solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
-					resolveSingleConstraintRowGeneric(*solveManifold.m_solverBodyA,*solveManifold.m_solverBodyB,solveManifold);
+					resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
+				}
+			}
+			int numRollingFrictionPoolConstraints = m_tmpSolverContactRollingFrictionConstraintPool.size();
+			for (int j=0;j<numRollingFrictionPoolConstraints;j++)
+			{
+				btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[j];
+				btScalar totalImpulse = m_tmpSolverContactConstraintPool[rollingFrictionConstraint.m_frictionIndex].m_appliedImpulse;
+				if (totalImpulse>btScalar(0))
+				{
+					btScalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse;
+					if (rollingFrictionMagnitude>rollingFrictionConstraint.m_friction)
+						rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+					rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+					rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+					resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdA],m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdB],rollingFrictionConstraint);
@@ -1115,7 +1763,7 @@ btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration
-void btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc)
+void btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
 	int iteration;
 	if (infoGlobal.m_splitImpulse)
@@ -1131,7 +1779,7 @@ void btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySplitImpulseIte
 						const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
-						resolveSplitPenetrationSIMD(*solveManifold.m_solverBodyA,*solveManifold.m_solverBodyB,solveManifold);
+						resolveSplitPenetrationSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
@@ -1147,7 +1795,7 @@ void btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySplitImpulseIte
 						const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
-						resolveSplitPenetrationImpulseCacheFriendly(*solveManifold.m_solverBodyA,*solveManifold.m_solverBodyB,solveManifold);
+						resolveSplitPenetrationImpulseCacheFriendly(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA],m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB],solveManifold);
@@ -1155,45 +1803,49 @@ void btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySplitImpulseIte
-btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyIterations(btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc)
+btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyIterations(btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
 		///this is a special step to resolve penetrations (just for contacts)
-		solveGroupCacheFriendlySplitImpulseIterations(bodies ,numBodies,manifoldPtr, numManifolds,constraints,numConstraints,infoGlobal,debugDrawer,stackAlloc);
+		solveGroupCacheFriendlySplitImpulseIterations(bodies ,numBodies,manifoldPtr, numManifolds,constraints,numConstraints,infoGlobal,debugDrawer);
 		int maxIterations = m_maxOverrideNumSolverIterations > infoGlobal.m_numIterations? m_maxOverrideNumSolverIterations : infoGlobal.m_numIterations;
 		for ( int iteration = 0 ; iteration< maxIterations ; iteration++)
 		//for ( int iteration = maxIterations-1  ; iteration >= 0;iteration--)
-		{			
-			solveSingleIteration(iteration, bodies ,numBodies,manifoldPtr, numManifolds,constraints,numConstraints,infoGlobal,debugDrawer,stackAlloc);
+		{
+			solveSingleIteration(iteration, bodies ,numBodies,manifoldPtr, numManifolds,constraints,numConstraints,infoGlobal,debugDrawer);
 	return 0.f;
-btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies ,int numBodies,btPersistentManifold** /*manifoldPtr*/, int /*numManifolds*/,btTypedConstraint** /*constraints*/,int /* numConstraints*/,const btContactSolverInfo& infoGlobal,btIDebugDraw* /*debugDrawer*/,btStackAlloc* /*stackAlloc*/)
+btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal)
 	int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
 	int i,j;
-	for (j=0;j<numPoolConstraints;j++)
+	if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
-		const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
-		btManifoldPoint* pt = (btManifoldPoint*) solveManifold.m_originalContactPoint;
-		btAssert(pt);
-		pt->m_appliedImpulse = solveManifold.m_appliedImpulse;
-		if (infoGlobal.m_solverMode & SOLVER_USE_FRICTION_WARMSTARTING)
+		for (j=0;j<numPoolConstraints;j++)
+			const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
+			btManifoldPoint* pt = (btManifoldPoint*) solveManifold.m_originalContactPoint;
+			btAssert(pt);
+			pt->m_appliedImpulse = solveManifold.m_appliedImpulse;
+		//	float f = m_tmpSolverContactFrictionConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
+			//	printf("pt->m_appliedImpulseLateral1 = %f\n", f);
 			pt->m_appliedImpulseLateral1 = m_tmpSolverContactFrictionConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
-			pt->m_appliedImpulseLateral2 = m_tmpSolverContactFrictionConstraintPool[solveManifold.m_frictionIndex+1].m_appliedImpulse;
+			//printf("pt->m_appliedImpulseLateral1 = %f\n", pt->m_appliedImpulseLateral1);
+			if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
+			{
+				pt->m_appliedImpulseLateral2 = m_tmpSolverContactFrictionConstraintPool[solveManifold.m_frictionIndex+1].m_appliedImpulse;
+			}
+			//do a callback here?
-		//do a callback here?
 	numPoolConstraints = m_tmpSolverNonContactConstraintPool.size();
@@ -1201,6 +1853,16 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 		const btSolverConstraint& solverConstr = m_tmpSolverNonContactConstraintPool[j];
 		btTypedConstraint* constr = (btTypedConstraint*)solverConstr.m_originalContactPoint;
+		btJointFeedback* fb = constr->getJointFeedback();
+		if (fb)
+		{
+			fb->m_appliedForceBodyA += solverConstr.m_contactNormal1*solverConstr.m_appliedImpulse*constr->getRigidBodyA().getLinearFactor()/infoGlobal.m_timeStep;
+			fb->m_appliedForceBodyB += solverConstr.m_contactNormal2*solverConstr.m_appliedImpulse*constr->getRigidBodyB().getLinearFactor()/infoGlobal.m_timeStep;
+			fb->m_appliedTorqueBodyA += solverConstr.m_relpos1CrossNormal* constr->getRigidBodyA().getAngularFactor()*solverConstr.m_appliedImpulse/infoGlobal.m_timeStep;
+			fb->m_appliedTorqueBodyB += solverConstr.m_relpos2CrossNormal* constr->getRigidBodyB().getAngularFactor()*solverConstr.m_appliedImpulse/infoGlobal.m_timeStep; /*RGM ???? */
+		}
 		if (btFabs(solverConstr.m_appliedImpulse)>=constr->getBreakingImpulseThreshold())
@@ -1209,49 +1871,56 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
-	if (infoGlobal.m_splitImpulse)
-	{		
-		for ( i=0;i<numBodies;i++)
-		{
-			btRigidBody* body = btRigidBody::upcast(bodies[i]);
-			if (body)
-				body->internalWritebackVelocity(infoGlobal.m_timeStep);
-		}
-	} else
+	for ( i=0;i<m_tmpSolverBodyPool.size();i++)
-		for ( i=0;i<numBodies;i++)
+		btRigidBody* body = m_tmpSolverBodyPool[i].m_originalBody;
+		if (body)
-			btRigidBody* body = btRigidBody::upcast(bodies[i]);
-			if (body)
-				body->internalWritebackVelocity();
+			if (infoGlobal.m_splitImpulse)
+				m_tmpSolverBodyPool[i].writebackVelocityAndTransform(infoGlobal.m_timeStep, infoGlobal.m_splitImpulseTurnErp);
+			else
+				m_tmpSolverBodyPool[i].writebackVelocity();
+			m_tmpSolverBodyPool[i].m_originalBody->setLinearVelocity(
+				m_tmpSolverBodyPool[i].m_linearVelocity+
+				m_tmpSolverBodyPool[i].m_externalForceImpulse);
+			m_tmpSolverBodyPool[i].m_originalBody->setAngularVelocity(
+				m_tmpSolverBodyPool[i].m_angularVelocity+
+				m_tmpSolverBodyPool[i].m_externalTorqueImpulse);
+			if (infoGlobal.m_splitImpulse)
+				m_tmpSolverBodyPool[i].m_originalBody->setWorldTransform(m_tmpSolverBodyPool[i].m_worldTransform);
+			m_tmpSolverBodyPool[i].m_originalBody->setCompanionId(-1);
+	m_tmpSolverContactConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(0);
-	m_tmpSolverContactConstraintPool.resize(0);
-	m_tmpSolverNonContactConstraintPool.resize(0);
-	m_tmpSolverContactFrictionConstraintPool.resize(0);
+	m_tmpSolverBodyPool.resizeNoInitialize(0);
 	return 0.f;
 /// btSequentialImpulseConstraintSolver Sequentially applies impulses
-btScalar btSequentialImpulseConstraintSolver::solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc,btDispatcher* /*dispatcher*/)
+btScalar btSequentialImpulseConstraintSolver::solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btDispatcher* /*dispatcher*/)
 	//you need to provide at least some bodies
-	btAssert(bodies);
-	btAssert(numBodies);
-	solveGroupCacheFriendlySetup( bodies, numBodies, manifoldPtr,  numManifolds,constraints, numConstraints,infoGlobal,debugDrawer, stackAlloc);
+	solveGroupCacheFriendlySetup( bodies, numBodies, manifoldPtr,  numManifolds,constraints, numConstraints,infoGlobal,debugDrawer);
-	solveGroupCacheFriendlyIterations(bodies, numBodies, manifoldPtr,  numManifolds,constraints, numConstraints,infoGlobal,debugDrawer, stackAlloc);
+	solveGroupCacheFriendlyIterations(bodies, numBodies, manifoldPtr,  numManifolds,constraints, numConstraints,infoGlobal,debugDrawer);
+	solveGroupCacheFriendlyFinish(bodies, numBodies, infoGlobal);
-	solveGroupCacheFriendlyFinish(bodies, numBodies, manifoldPtr,  numManifolds,constraints, numConstraints,infoGlobal,debugDrawer, stackAlloc);
 	return 0.f;
@@ -1259,11 +1928,3 @@ void	btSequentialImpulseConstraintSolver::reset()
 	m_btSeed2 = 0;
-btRigidBody& btSequentialImpulseConstraintSolver::getFixedBody()
-	static btRigidBody s_fixed(0, 0,0);
-	s_fixed.setMassProps(btScalar(0.),btVector3(btScalar(0.),btScalar(0.),btScalar(0.)));
-	return s_fixed;
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h b/src/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
index bb377db8..a6029180 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
@@ -16,93 +16,109 @@ subject to the following restrictions:
-#include "btConstraintSolver.h"
 class btIDebugDraw;
-#include "btContactConstraint.h"
-#include "btSolverBody.h"
-#include "btSolverConstraint.h"
-#include "btTypedConstraint.h"
+class btPersistentManifold;
+class btDispatcher;
+class btCollisionObject;
+#include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
+#include "BulletDynamics/ConstraintSolver/btContactSolverInfo.h"
+#include "BulletDynamics/ConstraintSolver/btSolverBody.h"
+#include "BulletDynamics/ConstraintSolver/btSolverConstraint.h"
 #include "BulletCollision/NarrowPhaseCollision/btManifoldPoint.h"
+#include "BulletDynamics/ConstraintSolver/btConstraintSolver.h"
+typedef btSimdScalar(*btSingleConstraintRowSolver)(btSolverBody&, btSolverBody&, const btSolverConstraint&);
 ///The btSequentialImpulseConstraintSolver is a fast SIMD implementation of the Projected Gauss Seidel (iterative LCP) method.
-class btSequentialImpulseConstraintSolver : public btConstraintSolver
+ATTRIBUTE_ALIGNED16(class) btSequentialImpulseConstraintSolver : public btConstraintSolver
+	btAlignedObjectArray<btSolverBody>      m_tmpSolverBodyPool;
 	btConstraintArray			m_tmpSolverContactConstraintPool;
 	btConstraintArray			m_tmpSolverNonContactConstraintPool;
 	btConstraintArray			m_tmpSolverContactFrictionConstraintPool;
+	btConstraintArray			m_tmpSolverContactRollingFrictionConstraintPool;
 	btAlignedObjectArray<int>	m_orderTmpConstraintPool;
 	btAlignedObjectArray<int>	m_orderNonContactConstraintPool;
 	btAlignedObjectArray<int>	m_orderFrictionConstraintPool;
 	btAlignedObjectArray<btTypedConstraint::btConstraintInfo1> m_tmpConstraintSizesPool;
 	int							m_maxOverrideNumSolverIterations;
+	int m_fixedBodyId;
-	void setupFrictionConstraint(	btSolverConstraint& solverConstraint, const btVector3& normalAxis,btRigidBody* solverBodyA,btRigidBody* solverBodyIdB,
+	btSingleConstraintRowSolver m_resolveSingleConstraintRowGeneric;
+	btSingleConstraintRowSolver m_resolveSingleConstraintRowLowerLimit;
+	void setupFrictionConstraint(	btSolverConstraint& solverConstraint, const btVector3& normalAxis,int solverBodyIdA,int  solverBodyIdB,
 									btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,
 									btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, 
 									btScalar desiredVelocity=0., btScalar cfmSlip=0.);
-	btSolverConstraint&	addFrictionConstraint(const btVector3& normalAxis,btRigidBody* solverBodyA,btRigidBody* solverBodyB,int frictionIndex,btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, btScalar desiredVelocity=0., btScalar cfmSlip=0.);
+	void setupRollingFrictionConstraint(	btSolverConstraint& solverConstraint, const btVector3& normalAxis,int solverBodyIdA,int  solverBodyIdB,
+									btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,
+									btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, 
+									btScalar desiredVelocity=0., btScalar cfmSlip=0.);
+	btSolverConstraint&	addFrictionConstraint(const btVector3& normalAxis,int solverBodyIdA,int solverBodyIdB,int frictionIndex,btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, btScalar desiredVelocity=0., btScalar cfmSlip=0.);
+	btSolverConstraint&	addRollingFrictionConstraint(const btVector3& normalAxis,int solverBodyIdA,int solverBodyIdB,int frictionIndex,btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, btScalar desiredVelocity=0, btScalar cfmSlip=0.f);
-	void setupContactConstraint(btSolverConstraint& solverConstraint, btCollisionObject* colObj0, btCollisionObject* colObj1, btManifoldPoint& cp, 
-								const btContactSolverInfo& infoGlobal, btVector3& vel, btScalar& rel_vel, btScalar& relaxation, 
-								btVector3& rel_pos1, btVector3& rel_pos2);
+	void setupContactConstraint(btSolverConstraint& solverConstraint, int solverBodyIdA, int solverBodyIdB, btManifoldPoint& cp, 
+								const btContactSolverInfo& infoGlobal,btScalar& relaxation, const btVector3& rel_pos1, const btVector3& rel_pos2);
-	void setFrictionConstraintImpulse( btSolverConstraint& solverConstraint, btRigidBody* rb0, btRigidBody* rb1, 
+	static void	applyAnisotropicFriction(btCollisionObject* colObj,btVector3& frictionDirection, int frictionMode);
+	void setFrictionConstraintImpulse( btSolverConstraint& solverConstraint, int solverBodyIdA,int solverBodyIdB, 
 										 btManifoldPoint& cp, const btContactSolverInfo& infoGlobal);
 	///m_btSeed2 is used for re-arranging the constraint rows. improves convergence/quality of friction
 	unsigned long	m_btSeed2;
-//	void	initSolverBody(btSolverBody* solverBody, btCollisionObject* collisionObject);
 	btScalar restitutionCurve(btScalar rel_vel, btScalar restitution);
+	virtual void convertContacts(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal);
 	void	convertContact(btPersistentManifold* manifold,const btContactSolverInfo& infoGlobal);
 	void	resolveSplitPenetrationSIMD(
-        btRigidBody& body1,
-        btRigidBody& body2,
+     btSolverBody& bodyA,btSolverBody& bodyB,
         const btSolverConstraint& contactConstraint);
 	void	resolveSplitPenetrationImpulseCacheFriendly(
-        btRigidBody& body1,
-        btRigidBody& body2,
+       btSolverBody& bodyA,btSolverBody& bodyB,
         const btSolverConstraint& contactConstraint);
 	//internal method
-	int	getOrInitSolverBody(btCollisionObject& body);
+	int		getOrInitSolverBody(btCollisionObject& body,btScalar timeStep);
+	void	initSolverBody(btSolverBody* solverBody, btCollisionObject* collisionObject, btScalar timeStep);
-	void	resolveSingleConstraintRowGeneric(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& contactConstraint);
-	void	resolveSingleConstraintRowGenericSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& contactConstraint);
-	void	resolveSingleConstraintRowLowerLimit(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& contactConstraint);
-	void	resolveSingleConstraintRowLowerLimitSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& contactConstraint);
+	btSimdScalar	resolveSingleConstraintRowGeneric(btSolverBody& bodyA,btSolverBody& bodyB,const btSolverConstraint& contactConstraint);
+	btSimdScalar	resolveSingleConstraintRowGenericSIMD(btSolverBody& bodyA,btSolverBody& bodyB,const btSolverConstraint& contactConstraint);
+	btSimdScalar	resolveSingleConstraintRowLowerLimit(btSolverBody& bodyA,btSolverBody& bodyB,const btSolverConstraint& contactConstraint);
+	btSimdScalar	resolveSingleConstraintRowLowerLimitSIMD(btSolverBody& bodyA,btSolverBody& bodyB,const btSolverConstraint& contactConstraint);
-	static btRigidBody& getFixedBody();
-	virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc);
-	virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc);
-	btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc);
+	virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
+	virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal);
+	virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
-	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc);
-	virtual btScalar solveGroupCacheFriendlyIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc);
+	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
+	virtual btScalar solveGroupCacheFriendlyIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
 	virtual ~btSequentialImpulseConstraintSolver();
-	virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher);
+	virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer,btDispatcher* dispatcher);
 	///clear internal cached data and reset random seed
 	virtual	void	reset();
@@ -119,11 +135,41 @@ public:
 		return m_btSeed2;
+	virtual btConstraintSolverType	getSolverType() const
+	{
+	}
+	btSingleConstraintRowSolver	getActiveConstraintRowSolverGeneric()
+	{
+		return m_resolveSingleConstraintRowGeneric;
+	}
+	void setConstraintRowSolverGeneric(btSingleConstraintRowSolver rowSolver)
+	{
+		m_resolveSingleConstraintRowGeneric = rowSolver;
+	}
+	btSingleConstraintRowSolver	getActiveConstraintRowSolverLowerLimit()
+	{
+		return m_resolveSingleConstraintRowLowerLimit;
+	}
+	void setConstraintRowSolverLowerLimit(btSingleConstraintRowSolver rowSolver)
+	{
+		m_resolveSingleConstraintRowLowerLimit = rowSolver;
+	}
+	///Various implementations of solving a single constraint row using a generic equality constraint, using scalar reference, SSE2 or SSE4
+	btSingleConstraintRowSolver	getScalarConstraintRowSolverGeneric();
+	btSingleConstraintRowSolver	getSSE2ConstraintRowSolverGeneric();
+	btSingleConstraintRowSolver	getSSE4_1ConstraintRowSolverGeneric();
+	///Various implementations of solving a single constraint row using an inequality (lower limit) constraint, using scalar reference, SSE2 or SSE4
+	btSingleConstraintRowSolver	getScalarConstraintRowSolverLowerLimit();
+	btSingleConstraintRowSolver	getSSE2ConstraintRowSolverLowerLimit();
+	btSingleConstraintRowSolver	getSSE4_1ConstraintRowSolverLowerLimit();
-typedef btSequentialImpulseConstraintSolver btSequentialImpulseConstraintSolverPrefered;
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btSliderConstraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btSliderConstraint.cpp
old mode 100644
new mode 100755
index b69f46da..f8f81bfe
--- a/src/bullet/BulletDynamics/ConstraintSolver/btSliderConstraint.cpp
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btSliderConstraint.cpp
@@ -426,6 +426,8 @@ void btSliderConstraint::getInfo2NonVirtual(btConstraintInfo2* info, const btTra
 		for (i=0; i<3; i++) info->m_J2angularAxis[s3+i] = -tmpB[i];
 		for (i=0; i<3; i++) info->m_J1linearAxis[s2+i] = p[i];
 		for (i=0; i<3; i++) info->m_J1linearAxis[s3+i] = q[i];
+		for (i=0; i<3; i++) info->m_J2linearAxis[s2+i] = -p[i];
+		for (i=0; i<3; i++) info->m_J2linearAxis[s3+i] = -q[i];
 	{	// old way - maybe incorrect if bodies are not on the slider axis
@@ -440,6 +442,8 @@ void btSliderConstraint::getInfo2NonVirtual(btConstraintInfo2* info, const btTra
 		for (i=0; i<3; i++) info->m_J1linearAxis[s2+i] = p[i];
 		for (i=0; i<3; i++) info->m_J1linearAxis[s3+i] = q[i];
+		for (i=0; i<3; i++) info->m_J2linearAxis[s2+i] = -p[i];
+		for (i=0; i<3; i++) info->m_J2linearAxis[s3+i] = -q[i];
 	// compute two elements of right hand side
@@ -479,6 +483,9 @@ void btSliderConstraint::getInfo2NonVirtual(btConstraintInfo2* info, const btTra
 		info->m_J1linearAxis[srow+0] = ax1[0];
 		info->m_J1linearAxis[srow+1] = ax1[1];
 		info->m_J1linearAxis[srow+2] = ax1[2];
+		info->m_J2linearAxis[srow+0] = -ax1[0];
+		info->m_J2linearAxis[srow+1] = -ax1[1];
+		info->m_J2linearAxis[srow+2] = -ax1[2];
 		// linear torque decoupling step:
 		// we have to be careful that the linear constraint forces (+/- ax1) applied to the two bodies
@@ -532,8 +539,8 @@ void btSliderConstraint::getInfo2NonVirtual(btConstraintInfo2* info, const btTra
 			btScalar tag_vel = getTargetLinMotorVelocity();
 			btScalar mot_fact = getMotorFactor(m_linPos, m_lowerLinLimit, m_upperLinLimit, tag_vel, info->fps * currERP);
 			info->m_constraintError[srow] -= signFact * mot_fact * getTargetLinMotorVelocity();
-			info->m_lowerLimit[srow] += -getMaxLinMotorForce() * info->fps;
-			info->m_upperLimit[srow] += getMaxLinMotorForce() * info->fps;
+			info->m_lowerLimit[srow] += -getMaxLinMotorForce() / info->fps;
+			info->m_upperLimit[srow] += getMaxLinMotorForce() / info->fps;
@@ -634,8 +641,8 @@ void btSliderConstraint::getInfo2NonVirtual(btConstraintInfo2* info, const btTra
 			btScalar mot_fact = getMotorFactor(m_angPos, m_lowerAngLimit, m_upperAngLimit, getTargetAngMotorVelocity(), info->fps * currERP);
 			info->m_constraintError[srow] = mot_fact * getTargetAngMotorVelocity();
-			info->m_lowerLimit[srow] = -getMaxAngMotorForce() * info->fps;
-			info->m_upperLimit[srow] = getMaxAngMotorForce() * info->fps;
+			info->m_lowerLimit[srow] = -getMaxAngMotorForce() / info->fps;
+			info->m_upperLimit[srow] = getMaxAngMotorForce() / info->fps;
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btSliderConstraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btSliderConstraint.h
old mode 100644
new mode 100755
index 2edc8d2b..1957f08a
--- a/src/bullet/BulletDynamics/ConstraintSolver/btSliderConstraint.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btSliderConstraint.h
@@ -25,7 +25,15 @@ TODO:
+#include "LinearMath/btScalar.h"//for BT_USE_DOUBLE_PRECISION
+#define btSliderConstraintData2		btSliderConstraintDoubleData
+#define btSliderConstraintDataName  "btSliderConstraintDoubleData"
+#define btSliderConstraintData2		btSliderConstraintData 
+#define btSliderConstraintDataName	"btSliderConstraintData"
 #include "LinearMath/btVector3.h"
 #include "btJacobianEntry.h"
@@ -60,7 +68,7 @@ enum btSliderFlags
-class btSliderConstraint : public btTypedConstraint
+ATTRIBUTE_ALIGNED16(class) btSliderConstraint : public btTypedConstraint
 	///for backwards compatibility during the transition to 'getInfo/getInfo2'
@@ -155,6 +163,8 @@ protected:
 	void initParams();
 	// constructors
     btSliderConstraint(btRigidBody& rbA, btRigidBody& rbB, const btTransform& frameInA, const btTransform& frameInB ,bool useLinearReferenceFrameA);
     btSliderConstraint(btRigidBody& rbB, const btTransform& frameInB, bool useLinearReferenceFrameA);
@@ -272,6 +282,11 @@ public:
 	virtual	void	setParam(int num, btScalar value, int axis = -1);
 	///return the local value of parameter
 	virtual	btScalar getParam(int num, int axis = -1) const;
+	virtual	int getFlags() const
+    	{
+		return m_flags;
+	}
 	virtual	int	calculateSerializeBufferSize() const;
@@ -281,7 +296,10 @@ public:
 ///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
 struct btSliderConstraintData
 	btTypedConstraintData	m_typeConstraintData;
@@ -300,31 +318,48 @@ struct btSliderConstraintData
+struct btSliderConstraintDoubleData
+	btTypedConstraintDoubleData	m_typeConstraintData;
+	btTransformDoubleData m_rbAFrame; // constraint axii. Assumes z is hinge axis.
+	btTransformDoubleData m_rbBFrame;
+	double	m_linearUpperLimit;
+	double	m_linearLowerLimit;
+	double	m_angularUpperLimit;
+	double	m_angularLowerLimit;
+	int	m_useLinearReferenceFrameA;
+	int m_useOffsetForConstraintFrame;
 SIMD_FORCE_INLINE		int	btSliderConstraint::calculateSerializeBufferSize() const
-	return sizeof(btSliderConstraintData);
+	return sizeof(btSliderConstraintData2);
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
 SIMD_FORCE_INLINE	const char*	btSliderConstraint::serialize(void* dataBuffer, btSerializer* serializer) const
-	btSliderConstraintData* sliderData = (btSliderConstraintData*) dataBuffer;
+	btSliderConstraintData2* sliderData = (btSliderConstraintData2*) dataBuffer;
-	m_frameInA.serializeFloat(sliderData->m_rbAFrame);
-	m_frameInB.serializeFloat(sliderData->m_rbBFrame);
+	m_frameInA.serialize(sliderData->m_rbAFrame);
+	m_frameInB.serialize(sliderData->m_rbBFrame);
-	sliderData->m_linearUpperLimit = float(m_upperLinLimit);
-	sliderData->m_linearLowerLimit = float(m_lowerLinLimit);
+	sliderData->m_linearUpperLimit = m_upperLinLimit;
+	sliderData->m_linearLowerLimit = m_lowerLinLimit;
-	sliderData->m_angularUpperLimit = float(m_upperAngLimit);
-	sliderData->m_angularLowerLimit = float(m_lowerAngLimit);
+	sliderData->m_angularUpperLimit = m_upperAngLimit;
+	sliderData->m_angularLowerLimit = m_lowerAngLimit;
 	sliderData->m_useLinearReferenceFrameA = m_useLinearReferenceFrameA;
 	sliderData->m_useOffsetForConstraintFrame = m_useOffsetForConstraintFrame;
-	return "btSliderConstraintData";
+	return btSliderConstraintDataName;
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btSolverBody.h b/src/bullet/BulletDynamics/ConstraintSolver/btSolverBody.h
index 8de51581..27ccefe4 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btSolverBody.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btSolverBody.h
@@ -19,7 +19,7 @@ subject to the following restrictions:
 class	btRigidBody;
 #include "LinearMath/btVector3.h"
 #include "LinearMath/btMatrix3x3.h"
-#include "BulletDynamics/Dynamics/btRigidBody.h"
 #include "LinearMath/btAlignedAllocator.h"
 #include "LinearMath/btTransformUtil.h"
@@ -105,22 +105,48 @@ operator+(const btSimdScalar& v1, const btSimdScalar& v2)
 ///The btSolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
-ATTRIBUTE_ALIGNED64 (struct)	btSolverBodyObsolete
+ATTRIBUTE_ALIGNED16 (struct)	btSolverBody
+	btTransform		m_worldTransform;
 	btVector3		m_deltaLinearVelocity;
 	btVector3		m_deltaAngularVelocity;
 	btVector3		m_angularFactor;
+	btVector3		m_linearFactor;
 	btVector3		m_invMass;
-	btRigidBody*	m_originalBody;
 	btVector3		m_pushVelocity;
 	btVector3		m_turnVelocity;
+	btVector3		m_linearVelocity;
+	btVector3		m_angularVelocity;
+	btVector3		m_externalForceImpulse;
+	btVector3		m_externalTorqueImpulse;
+	btRigidBody*	m_originalBody;
+	void	setWorldTransform(const btTransform& worldTransform)
+	{
+		m_worldTransform = worldTransform;
+	}
+	const btTransform& getWorldTransform() const
+	{
+		return m_worldTransform;
+	}
+	SIMD_FORCE_INLINE void	getVelocityInLocalPointNoDelta(const btVector3& rel_pos, btVector3& velocity ) const
+	{
+		if (m_originalBody)
+			velocity = m_linearVelocity + m_externalForceImpulse + (m_angularVelocity+m_externalTorqueImpulse).cross(rel_pos);
+		else
+			velocity.setValue(0,0,0);
+	}
 	SIMD_FORCE_INLINE void	getVelocityInLocalPointObsolete(const btVector3& rel_pos, btVector3& velocity ) const
 		if (m_originalBody)
-			velocity = m_originalBody->getLinearVelocity()+m_deltaLinearVelocity + (m_originalBody->getAngularVelocity()+m_deltaAngularVelocity).cross(rel_pos);
+			velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos);
@@ -128,7 +154,7 @@ ATTRIBUTE_ALIGNED64 (struct)	btSolverBodyObsolete
 	SIMD_FORCE_INLINE void	getAngularVelocity(btVector3& angVel) const
 		if (m_originalBody)
-			angVel = m_originalBody->getAngularVelocity()+m_deltaAngularVelocity;
+			angVel =m_angularVelocity+m_deltaAngularVelocity;
@@ -137,9 +163,9 @@ ATTRIBUTE_ALIGNED64 (struct)	btSolverBodyObsolete
 	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
 	SIMD_FORCE_INLINE void applyImpulse(const btVector3& linearComponent, const btVector3& angularComponent,const btScalar impulseMagnitude)
-		//if (m_invMass)
+		if (m_originalBody)
-			m_deltaLinearVelocity += linearComponent*impulseMagnitude;
+			m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor;
 			m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
@@ -148,36 +174,125 @@ ATTRIBUTE_ALIGNED64 (struct)	btSolverBodyObsolete
 		if (m_originalBody)
-			m_pushVelocity += linearComponent*impulseMagnitude;
+			m_pushVelocity += linearComponent*impulseMagnitude*m_linearFactor;
 			m_turnVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
+	const btVector3& getDeltaLinearVelocity() const
+	{
+		return m_deltaLinearVelocity;
+	}
+	const btVector3& getDeltaAngularVelocity() const
+	{
+		return m_deltaAngularVelocity;
+	}
+	const btVector3& getPushVelocity() const 
+	{
+		return m_pushVelocity;
+	}
+	const btVector3& getTurnVelocity() const 
+	{
+		return m_turnVelocity;
+	}
+	////////////////////////////////////////////////
+	///some internal methods, don't use them
+	btVector3& internalGetDeltaLinearVelocity()
+	{
+		return m_deltaLinearVelocity;
+	}
+	btVector3& internalGetDeltaAngularVelocity()
+	{
+		return m_deltaAngularVelocity;
+	}
+	const btVector3& internalGetAngularFactor() const
+	{
+		return m_angularFactor;
+	}
+	const btVector3& internalGetInvMass() const
+	{
+		return m_invMass;
+	}
+	void internalSetInvMass(const btVector3& invMass)
+	{
+		m_invMass = invMass;
+	}
+	btVector3& internalGetPushVelocity()
+	{
+		return m_pushVelocity;
+	}
+	btVector3& internalGetTurnVelocity()
+	{
+		return m_turnVelocity;
+	}
+	SIMD_FORCE_INLINE void	internalGetVelocityInLocalPointObsolete(const btVector3& rel_pos, btVector3& velocity ) const
+	{
+		velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos);
+	}
+	SIMD_FORCE_INLINE void	internalGetAngularVelocity(btVector3& angVel) const
+	{
+		angVel = m_angularVelocity+m_deltaAngularVelocity;
+	}
+	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
+	SIMD_FORCE_INLINE void internalApplyImpulse(const btVector3& linearComponent, const btVector3& angularComponent,const btScalar impulseMagnitude)
+	{
+		if (m_originalBody)
+		{
+			m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor;
+			m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
+		}
+	}
 	void	writebackVelocity()
 		if (m_originalBody)
-			m_originalBody->setLinearVelocity(m_originalBody->getLinearVelocity()+ m_deltaLinearVelocity);
-			m_originalBody->setAngularVelocity(m_originalBody->getAngularVelocity()+m_deltaAngularVelocity);
+			m_linearVelocity +=m_deltaLinearVelocity;
+			m_angularVelocity += m_deltaAngularVelocity;
-	void	writebackVelocity(btScalar timeStep)
+	void	writebackVelocityAndTransform(btScalar timeStep, btScalar splitImpulseTurnErp)
         (void) timeStep;
 		if (m_originalBody)
-			m_originalBody->setLinearVelocity(m_originalBody->getLinearVelocity()+ m_deltaLinearVelocity);
-			m_originalBody->setAngularVelocity(m_originalBody->getAngularVelocity()+m_deltaAngularVelocity);
+			m_linearVelocity += m_deltaLinearVelocity;
+			m_angularVelocity += m_deltaAngularVelocity;
 			//correct the position/orientation based on push/turn recovery
 			btTransform newTransform;
-			btTransformUtil::integrateTransform(m_originalBody->getWorldTransform(),m_pushVelocity,m_turnVelocity,timeStep,newTransform);
-			m_originalBody->setWorldTransform(newTransform);
+			if (m_pushVelocity[0]!=0.f || m_pushVelocity[1]!=0 || m_pushVelocity[2]!=0 || m_turnVelocity[0]!=0.f || m_turnVelocity[1]!=0 || m_turnVelocity[2]!=0)
+			{
+			//	btQuaternion orn = m_worldTransform.getRotation();
+				btTransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform);
+				m_worldTransform = newTransform;
+			}
+			//m_worldTransform.setRotation(orn);
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btSolverConstraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btSolverConstraint.h
index 179e79d7..5515e6b3 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btSolverConstraint.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btSolverConstraint.h
@@ -20,68 +20,50 @@ class	btRigidBody;
 #include "LinearMath/btVector3.h"
 #include "LinearMath/btMatrix3x3.h"
 #include "btJacobianEntry.h"
+#include "LinearMath/btAlignedObjectArray.h"
 #include "btSolverBody.h"
 ///1D constraint along a normal axis between bodyA and bodyB. It can be combined to solve contact and friction constraints.
-ATTRIBUTE_ALIGNED64 (struct)	btSolverConstraint
+ATTRIBUTE_ALIGNED16 (struct)	btSolverConstraint
 	btVector3		m_relpos1CrossNormal;
-	btVector3		m_contactNormal;
+	btVector3		m_contactNormal1;
 	btVector3		m_relpos2CrossNormal;
-	//btVector3		m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal
+	btVector3		m_contactNormal2; //usually m_contactNormal2 == -m_contactNormal1, but not always
 	btVector3		m_angularComponentA;
 	btVector3		m_angularComponentB;
 	mutable btSimdScalar	m_appliedPushImpulse;
 	mutable btSimdScalar	m_appliedImpulse;
 	btScalar	m_friction;
 	btScalar	m_jacDiagABInv;
-	union
-	{
-		int	m_numConsecutiveRowsPerKernel;
-		btScalar	m_unusedPadding0;
-	};
-	int	m_overrideNumSolverIterations;
-	union
-	{
-		int			m_frictionIndex;
-		btScalar	m_unusedPadding1;
-	};
-	union
-	{
-		btRigidBody*	m_solverBodyA;
-		int				m_companionIdA;
-	};
-	union
-	{
-		btRigidBody*	m_solverBodyB;
-		int				m_companionIdB;
-	};
+	btScalar		m_rhs;
+	btScalar		m_cfm;
-	union
+    btScalar		m_lowerLimit;
+	btScalar		m_upperLimit;
+	btScalar		m_rhsPenetration;
+    union
 		void*		m_originalContactPoint;
 		btScalar	m_unusedPadding4;
+		int			m_numRowsForNonContactConstraint;
-	btScalar		m_rhs;
-	btScalar		m_cfm;
-	btScalar		m_lowerLimit;
-	btScalar		m_upperLimit;
-	btScalar		m_rhsPenetration;
+	int	m_overrideNumSolverIterations;
+    int			m_frictionIndex;
+	int m_solverBodyIdA;
+	int m_solverBodyIdB;
 	enum		btSolverConstraintType
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btTypedConstraint.cpp b/src/bullet/BulletDynamics/ConstraintSolver/btTypedConstraint.cpp
index 06bde5e7..736a64a1 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btTypedConstraint.cpp
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btTypedConstraint.cpp
@@ -24,7 +24,7 @@ subject to the following restrictions:
 btTypedConstraint::btTypedConstraint(btTypedConstraintType type, btRigidBody& rbA)
@@ -32,7 +32,8 @@ m_overrideNumSolverIterations(-1),
@@ -40,7 +41,7 @@ m_dbgDrawSize(DEFAULT_DEBUGDRAW_SIZE)
 btTypedConstraint::btTypedConstraint(btTypedConstraintType type, btRigidBody& rbA,btRigidBody& rbB)
@@ -48,7 +49,8 @@ m_overrideNumSolverIterations(-1),
@@ -107,7 +109,7 @@ btScalar btTypedConstraint::getMotorFactor(btScalar pos, btScalar lowLim, btScal
 ///fills the dataBuffer and returns the struct name (and 0 on failure)
 const char*	btTypedConstraint::serialize(void* dataBuffer, btSerializer* serializer) const
-	btTypedConstraintData* tcd = (btTypedConstraintData*) dataBuffer;
+	btTypedConstraintData2* tcd = (btTypedConstraintData2*) dataBuffer;
 	tcd->m_rbA = (btRigidBodyData*)serializer->getUniquePointer(&m_rbA);
 	tcd->m_rbB = (btRigidBodyData*)serializer->getUniquePointer(&m_rbB);
@@ -121,14 +123,14 @@ const char*	btTypedConstraint::serialize(void* dataBuffer, btSerializer* seriali
 	tcd->m_objectType = m_objectType;
 	tcd->m_needsFeedback = m_needsFeedback;
 	tcd->m_overrideNumSolverIterations = m_overrideNumSolverIterations;
-	tcd->m_breakingImpulseThreshold = float(m_breakingImpulseThreshold);
+	tcd->m_breakingImpulseThreshold = m_breakingImpulseThreshold;
 	tcd->m_isEnabled = m_isEnabled? 1: 0;
 	tcd->m_userConstraintId =m_userConstraintId;
 	tcd->m_userConstraintType =m_userConstraintType;
-	tcd->m_appliedImpulse = float(m_appliedImpulse);
-	tcd->m_dbgDrawSize = float(m_dbgDrawSize );
+	tcd->m_appliedImpulse = m_appliedImpulse;
+	tcd->m_dbgDrawSize = m_dbgDrawSize;
 	tcd->m_disableCollisionsBetweenLinkedBodies = false;
@@ -140,7 +142,7 @@ const char*	btTypedConstraint::serialize(void* dataBuffer, btSerializer* seriali
 		if (m_rbB.getConstraintRef(i) == this)
 			tcd->m_disableCollisionsBetweenLinkedBodies = true;
-	return "btTypedConstraintData";
+	return btTypedConstraintDataName;
 btRigidBody& btTypedConstraint::getFixedBody()
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btTypedConstraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btTypedConstraint.h
index a16e869a..8a2a2d1a 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btTypedConstraint.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btTypedConstraint.h
@@ -16,9 +16,19 @@ subject to the following restrictions:
-class btRigidBody;
 #include "LinearMath/btScalar.h"
 #include "btSolverConstraint.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+#define btTypedConstraintData2		btTypedConstraintDoubleData
+#define btTypedConstraintDataName	"btTypedConstraintDoubleData"
+#define btTypedConstraintData2 		btTypedConstraintFloatData
+#define btTypedConstraintDataName  "btTypedConstraintFloatData" 
 class btSerializer;
@@ -32,6 +42,9 @@ enum btTypedConstraintType
@@ -51,8 +64,18 @@ enum btConstraintParams
+ATTRIBUTE_ALIGNED16(struct)	btJointFeedback
+	btVector3	m_appliedForceBodyA;
+	btVector3	m_appliedTorqueBodyA;
+	btVector3	m_appliedForceBodyB;
+	btVector3	m_appliedTorqueBodyB;
 ///TypedConstraint is the baseclass for Bullet constraints and vehicles
-class btTypedConstraint : public btTypedObject
+ATTRIBUTE_ALIGNED16(class) btTypedConstraint : public btTypedObject
 	int	m_userConstraintType;
@@ -80,6 +103,7 @@ protected:
 	btRigidBody&	m_rbB;
 	btScalar	m_appliedImpulse;
 	btScalar	m_dbgDrawSize;
+	btJointFeedback*	m_jointFeedback;
 	///internal method used by the constraint solver, don't use them directly
 	btScalar getMotorFactor(btScalar pos, btScalar lowLim, btScalar uppLim, btScalar vel, btScalar timeFact);
@@ -87,6 +111,8 @@ protected:
 	virtual ~btTypedConstraint() {};
 	btTypedConstraint(btTypedConstraintType type, btRigidBody& rbA);
 	btTypedConstraint(btTypedConstraintType type, btRigidBody& rbA,btRigidBody& rbB);
@@ -119,11 +145,6 @@ public:
 		// lo and hi limits for variables (set to -/+ infinity on entry).
 		btScalar *m_lowerLimit,*m_upperLimit;
-		// findex vector for variables. see the LCP solver interface for a
-		// description of what this does. this is set to -1 on entry.
-		// note that the returned indexes are relative to the first index of
-		// the constraint.
-		int *findex;
 		// number of solver iterations
 		int m_numIterations;
@@ -195,7 +216,7 @@ public:
 	///internal method used by the constraint solver, don't use them directly
-	virtual	void	solveConstraintObsolete(btRigidBody& /*bodyA*/,btRigidBody& /*bodyB*/,btScalar	/*timeStep*/) {};
+	virtual	void	solveConstraintObsolete(btSolverBody& /*bodyA*/,btSolverBody& /*bodyB*/,btScalar	/*timeStep*/) {};
 	const btRigidBody& getRigidBodyA() const
@@ -246,6 +267,22 @@ public:
 		return m_userConstraintPtr;
+	void	setJointFeedback(btJointFeedback* jointFeedback)
+	{
+		m_jointFeedback = jointFeedback;
+	}
+	const btJointFeedback* getJointFeedback() const
+	{
+		return m_jointFeedback;
+	}
+	btJointFeedback* getJointFeedback()
+	{
+		return m_jointFeedback;
+	}
 	int getUid() const
 		return m_userConstraintId;   
@@ -326,6 +363,33 @@ SIMD_FORCE_INLINE btScalar btAdjustAngleToLimits(btScalar angleInRadians, btScal
 ///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	btTypedConstraintFloatData
+	btRigidBodyFloatData		*m_rbA;
+	btRigidBodyFloatData		*m_rbB;
+	char	*m_name;
+	int	m_objectType;
+	int	m_userConstraintType;
+	int	m_userConstraintId;
+	int	m_needsFeedback;
+	float	m_appliedImpulse;
+	float	m_dbgDrawSize;
+	int	m_disableCollisionsBetweenLinkedBodies;
+	int	m_overrideNumSolverIterations;
+	float	m_breakingImpulseThreshold;
+	int		m_isEnabled;
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+///this structure is not used, except for loading pre-2.82 .bullet files
 struct	btTypedConstraintData
 	btRigidBodyData		*m_rbA;
@@ -347,10 +411,35 @@ struct	btTypedConstraintData
 	int		m_isEnabled;
+struct	btTypedConstraintDoubleData
+	btRigidBodyDoubleData		*m_rbA;
+	btRigidBodyDoubleData		*m_rbB;
+	char	*m_name;
+	int	m_objectType;
+	int	m_userConstraintType;
+	int	m_userConstraintId;
+	int	m_needsFeedback;
+	double	m_appliedImpulse;
+	double	m_dbgDrawSize;
+	int	m_disableCollisionsBetweenLinkedBodies;
+	int	m_overrideNumSolverIterations;
+	double	m_breakingImpulseThreshold;
+	int		m_isEnabled;
+	char	padding[4];
 SIMD_FORCE_INLINE	int	btTypedConstraint::calculateSerializeBufferSize() const
-	return sizeof(btTypedConstraintData);
+	return sizeof(btTypedConstraintData2);
diff --git a/src/bullet/BulletDynamics/ConstraintSolver/btUniversalConstraint.h b/src/bullet/BulletDynamics/ConstraintSolver/btUniversalConstraint.h
index a8693916..9e708410 100644
--- a/src/bullet/BulletDynamics/ConstraintSolver/btUniversalConstraint.h
+++ b/src/bullet/BulletDynamics/ConstraintSolver/btUniversalConstraint.h
@@ -31,13 +31,16 @@ subject to the following restrictions:
 /// "Given axis 1 on body 1, and axis 2 on body 2 that is perpendicular to axis 1, it keeps them perpendicular. 
 /// In other words, rotation of the two bodies about the direction perpendicular to the two axes will be equal."
-class btUniversalConstraint : public btGeneric6DofConstraint
+ATTRIBUTE_ALIGNED16(class) btUniversalConstraint : public btGeneric6DofConstraint
 	btVector3	m_anchor;
 	btVector3	m_axis1;
 	btVector3	m_axis2;
 	// constructor
 	// anchor, axis1 and axis2 are in world coordinate system
 	// axis1 must be orthogonal to axis2
diff --git a/src/bullet/BulletDynamics/Dynamics/Bullet-C-API.cpp b/src/bullet/BulletDynamics/Dynamics/Bullet-C-API.cpp
deleted file mode 100644
index bd8e2748..00000000
--- a/src/bullet/BulletDynamics/Dynamics/Bullet-C-API.cpp
+++ /dev/null
@@ -1,405 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-	Draft high-level generic physics C-API. For low-level access, use the physics SDK native API's.
-	Work in progress, functionality will be added on demand.
-	If possible, use the richer Bullet C++ API, by including <src/btBulletDynamicsCommon.h>
-#include "Bullet-C-Api.h"
-#include "btBulletDynamicsCommon.h"
-#include "LinearMath/btAlignedAllocator.h"
-#include "LinearMath/btVector3.h"
-#include "LinearMath/btScalar.h"	
-#include "LinearMath/btMatrix3x3.h"
-#include "LinearMath/btTransform.h"
-#include "BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h"
-#include "BulletCollision/CollisionShapes/btTriangleShape.h"
-#include "BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h"
-#include "BulletCollision/NarrowPhaseCollision/btPointCollector.h"
-#include "BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h"
-#include "BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.h"
-#include "BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h"
-#include "BulletCollision/NarrowPhaseCollision/btGjkEpa2.h"
-#include "BulletCollision/CollisionShapes/btMinkowskiSumShape.h"
-#include "BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h"
-#include "BulletCollision/NarrowPhaseCollision/btSimplexSolverInterface.h"
-#include "BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.h"
-	Create and Delete a Physics SDK	
-struct	btPhysicsSdk
-//	btDispatcher*				m_dispatcher;
-//	btOverlappingPairCache*		m_pairCache;
-//	btConstraintSolver*			m_constraintSolver
-	btVector3	m_worldAabbMin;
-	btVector3	m_worldAabbMax;
-	//todo: version, hardware/optimization settings etc?
-	btPhysicsSdk()
-		:m_worldAabbMin(-1000,-1000,-1000),
-		m_worldAabbMax(1000,1000,1000)
-	{
-	}
-plPhysicsSdkHandle	plNewBulletSdk()
-	void* mem = btAlignedAlloc(sizeof(btPhysicsSdk),16);
-	return (plPhysicsSdkHandle)new (mem)btPhysicsSdk;
-void		plDeletePhysicsSdk(plPhysicsSdkHandle	physicsSdk)
-	btPhysicsSdk* phys = reinterpret_cast<btPhysicsSdk*>(physicsSdk);
-	btAlignedFree(phys);	
-/* Dynamics World */
-plDynamicsWorldHandle plCreateDynamicsWorld(plPhysicsSdkHandle physicsSdkHandle)
-	btPhysicsSdk* physicsSdk = reinterpret_cast<btPhysicsSdk*>(physicsSdkHandle);
-	void* mem = btAlignedAlloc(sizeof(btDefaultCollisionConfiguration),16);
-	btDefaultCollisionConfiguration* collisionConfiguration = new (mem)btDefaultCollisionConfiguration();
-	mem = btAlignedAlloc(sizeof(btCollisionDispatcher),16);
-	btDispatcher*				dispatcher = new (mem)btCollisionDispatcher(collisionConfiguration);
-	mem = btAlignedAlloc(sizeof(btAxisSweep3),16);
-	btBroadphaseInterface*		pairCache = new (mem)btAxisSweep3(physicsSdk->m_worldAabbMin,physicsSdk->m_worldAabbMax);
-	mem = btAlignedAlloc(sizeof(btSequentialImpulseConstraintSolver),16);
-	btConstraintSolver*			constraintSolver = new(mem) btSequentialImpulseConstraintSolver();
-	mem = btAlignedAlloc(sizeof(btDiscreteDynamicsWorld),16);
-	return (plDynamicsWorldHandle) new (mem)btDiscreteDynamicsWorld(dispatcher,pairCache,constraintSolver,collisionConfiguration);
-void           plDeleteDynamicsWorld(plDynamicsWorldHandle world)
-	//todo: also clean up the other allocations, axisSweep, pairCache,dispatcher,constraintSolver,collisionConfiguration
-	btDynamicsWorld* dynamicsWorld = reinterpret_cast< btDynamicsWorld* >(world);
-	btAlignedFree(dynamicsWorld);
-void	plStepSimulation(plDynamicsWorldHandle world,	plReal	timeStep)
-	btDynamicsWorld* dynamicsWorld = reinterpret_cast< btDynamicsWorld* >(world);
-	btAssert(dynamicsWorld);
-	dynamicsWorld->stepSimulation(timeStep);
-void plAddRigidBody(plDynamicsWorldHandle world, plRigidBodyHandle object)
-	btDynamicsWorld* dynamicsWorld = reinterpret_cast< btDynamicsWorld* >(world);
-	btAssert(dynamicsWorld);
-	btRigidBody* body = reinterpret_cast< btRigidBody* >(object);
-	btAssert(body);
-	dynamicsWorld->addRigidBody(body);
-void plRemoveRigidBody(plDynamicsWorldHandle world, plRigidBodyHandle object)
-	btDynamicsWorld* dynamicsWorld = reinterpret_cast< btDynamicsWorld* >(world);
-	btAssert(dynamicsWorld);
-	btRigidBody* body = reinterpret_cast< btRigidBody* >(object);
-	btAssert(body);
-	dynamicsWorld->removeRigidBody(body);
-/* Rigid Body  */
-plRigidBodyHandle plCreateRigidBody(	void* user_data,  float mass, plCollisionShapeHandle cshape )
-	btTransform trans;
-	trans.setIdentity();
-	btVector3 localInertia(0,0,0);
-	btCollisionShape* shape = reinterpret_cast<btCollisionShape*>( cshape);
-	btAssert(shape);
-	if (mass)
-	{
-		shape->calculateLocalInertia(mass,localInertia);
-	}
-	void* mem = btAlignedAlloc(sizeof(btRigidBody),16);
-	btRigidBody::btRigidBodyConstructionInfo rbci(mass, 0,shape,localInertia);
-	btRigidBody* body = new (mem)btRigidBody(rbci);
-	body->setWorldTransform(trans);
-	body->setUserPointer(user_data);
-	return (plRigidBodyHandle) body;
-void plDeleteRigidBody(plRigidBodyHandle cbody)
-	btRigidBody* body = reinterpret_cast< btRigidBody* >(cbody);
-	btAssert(body);
-	btAlignedFree( body);
-/* Collision Shape definition */
-plCollisionShapeHandle plNewSphereShape(plReal radius)
-	void* mem = btAlignedAlloc(sizeof(btSphereShape),16);
-	return (plCollisionShapeHandle) new (mem)btSphereShape(radius);
-plCollisionShapeHandle plNewBoxShape(plReal x, plReal y, plReal z)
-	void* mem = btAlignedAlloc(sizeof(btBoxShape),16);
-	return (plCollisionShapeHandle) new (mem)btBoxShape(btVector3(x,y,z));
-plCollisionShapeHandle plNewCapsuleShape(plReal radius, plReal height)
-	//capsule is convex hull of 2 spheres, so use btMultiSphereShape
-	const int numSpheres = 2;
-	btVector3 positions[numSpheres] = {btVector3(0,height,0),btVector3(0,-height,0)};
-	btScalar radi[numSpheres] = {radius,radius};
-	void* mem = btAlignedAlloc(sizeof(btMultiSphereShape),16);
-	return (plCollisionShapeHandle) new (mem)btMultiSphereShape(positions,radi,numSpheres);
-plCollisionShapeHandle plNewConeShape(plReal radius, plReal height)
-	void* mem = btAlignedAlloc(sizeof(btConeShape),16);
-	return (plCollisionShapeHandle) new (mem)btConeShape(radius,height);
-plCollisionShapeHandle plNewCylinderShape(plReal radius, plReal height)
-	void* mem = btAlignedAlloc(sizeof(btCylinderShape),16);
-	return (plCollisionShapeHandle) new (mem)btCylinderShape(btVector3(radius,height,radius));
-/* Convex Meshes */
-plCollisionShapeHandle plNewConvexHullShape()
-	void* mem = btAlignedAlloc(sizeof(btConvexHullShape),16);
-	return (plCollisionShapeHandle) new (mem)btConvexHullShape();
-/* Concave static triangle meshes */
-plMeshInterfaceHandle		   plNewMeshInterface()
-	return 0;
-plCollisionShapeHandle plNewCompoundShape()
-	void* mem = btAlignedAlloc(sizeof(btCompoundShape),16);
-	return (plCollisionShapeHandle) new (mem)btCompoundShape();
-void	plAddChildShape(plCollisionShapeHandle compoundShapeHandle,plCollisionShapeHandle childShapeHandle, plVector3 childPos,plQuaternion childOrn)
-	btCollisionShape* colShape = reinterpret_cast<btCollisionShape*>(compoundShapeHandle);
-	btAssert(colShape->getShapeType() == COMPOUND_SHAPE_PROXYTYPE);
-	btCompoundShape* compoundShape = reinterpret_cast<btCompoundShape*>(colShape);
-	btCollisionShape* childShape = reinterpret_cast<btCollisionShape*>(childShapeHandle);
-	btTransform	localTrans;
-	localTrans.setIdentity();
-	localTrans.setOrigin(btVector3(childPos[0],childPos[1],childPos[2]));
-	localTrans.setRotation(btQuaternion(childOrn[0],childOrn[1],childOrn[2],childOrn[3]));
-	compoundShape->addChildShape(localTrans,childShape);
-void plSetEuler(plReal yaw,plReal pitch,plReal roll, plQuaternion orient)
-	btQuaternion orn;
-	orn.setEuler(yaw,pitch,roll);
-	orient[0] = orn.getX();
-	orient[1] = orn.getY();
-	orient[2] = orn.getZ();
-	orient[3] = orn.getW();
-//	extern  void		plAddTriangle(plMeshInterfaceHandle meshHandle, plVector3 v0,plVector3 v1,plVector3 v2);
-//	extern  plCollisionShapeHandle plNewStaticTriangleMeshShape(plMeshInterfaceHandle);
-void		plAddVertex(plCollisionShapeHandle cshape, plReal x,plReal y,plReal z)
-	btCollisionShape* colShape = reinterpret_cast<btCollisionShape*>( cshape);
-	(void)colShape;
-	btAssert(colShape->getShapeType()==CONVEX_HULL_SHAPE_PROXYTYPE);
-	btConvexHullShape* convexHullShape = reinterpret_cast<btConvexHullShape*>( cshape);
-	convexHullShape->addPoint(btVector3(x,y,z));
-void plDeleteShape(plCollisionShapeHandle cshape)
-	btCollisionShape* shape = reinterpret_cast<btCollisionShape*>( cshape);
-	btAssert(shape);
-	btAlignedFree(shape);
-void plSetScaling(plCollisionShapeHandle cshape, plVector3 cscaling)
-	btCollisionShape* shape = reinterpret_cast<btCollisionShape*>( cshape);
-	btAssert(shape);
-	btVector3 scaling(cscaling[0],cscaling[1],cscaling[2]);
-	shape->setLocalScaling(scaling);	
-void plSetPosition(plRigidBodyHandle object, const plVector3 position)
-	btRigidBody* body = reinterpret_cast< btRigidBody* >(object);
-	btAssert(body);
-	btVector3 pos(position[0],position[1],position[2]);
-	btTransform worldTrans = body->getWorldTransform();
-	worldTrans.setOrigin(pos);
-	body->setWorldTransform(worldTrans);
-void plSetOrientation(plRigidBodyHandle object, const plQuaternion orientation)
-	btRigidBody* body = reinterpret_cast< btRigidBody* >(object);
-	btAssert(body);
-	btQuaternion orn(orientation[0],orientation[1],orientation[2],orientation[3]);
-	btTransform worldTrans = body->getWorldTransform();
-	worldTrans.setRotation(orn);
-	body->setWorldTransform(worldTrans);
-void	plSetOpenGLMatrix(plRigidBodyHandle object, plReal* matrix)
-	btRigidBody* body = reinterpret_cast< btRigidBody* >(object);
-	btAssert(body);
-	btTransform& worldTrans = body->getWorldTransform();
-	worldTrans.setFromOpenGLMatrix(matrix);
-void	plGetOpenGLMatrix(plRigidBodyHandle object, plReal* matrix)
-	btRigidBody* body = reinterpret_cast< btRigidBody* >(object);
-	btAssert(body);
-	body->getWorldTransform().getOpenGLMatrix(matrix);
-void	plGetPosition(plRigidBodyHandle object,plVector3 position)
-	btRigidBody* body = reinterpret_cast< btRigidBody* >(object);
-	btAssert(body);
-	const btVector3& pos = body->getWorldTransform().getOrigin();
-	position[0] = pos.getX();
-	position[1] = pos.getY();
-	position[2] = pos.getZ();
-void plGetOrientation(plRigidBodyHandle object,plQuaternion orientation)
-	btRigidBody* body = reinterpret_cast< btRigidBody* >(object);
-	btAssert(body);
-	const btQuaternion& orn = body->getWorldTransform().getRotation();
-	orientation[0] = orn.getX();
-	orientation[1] = orn.getY();
-	orientation[2] = orn.getZ();
-	orientation[3] = orn.getW();
-//plRigidBodyHandle plRayCast(plDynamicsWorldHandle world, const plVector3 rayStart, const plVector3 rayEnd, plVector3 hitpoint, plVector3 normal);
-//	extern  plRigidBodyHandle plObjectCast(plDynamicsWorldHandle world, const plVector3 rayStart, const plVector3 rayEnd, plVector3 hitpoint, plVector3 normal);
-double plNearestPoints(float p1[3], float p2[3], float p3[3], float q1[3], float q2[3], float q3[3], float *pa, float *pb, float normal[3])
-	btVector3 vp(p1[0], p1[1], p1[2]);
-	btTriangleShape trishapeA(vp, 
-				  btVector3(p2[0], p2[1], p2[2]), 
-				  btVector3(p3[0], p3[1], p3[2]));
-	trishapeA.setMargin(0.000001f);
-	btVector3 vq(q1[0], q1[1], q1[2]);
-	btTriangleShape trishapeB(vq, 
-				  btVector3(q2[0], q2[1], q2[2]), 
-				  btVector3(q3[0], q3[1], q3[2]));
-	trishapeB.setMargin(0.000001f);
-	// btVoronoiSimplexSolver sGjkSimplexSolver;
-	// btGjkEpaPenetrationDepthSolver penSolverPtr;	
-	static btSimplexSolverInterface sGjkSimplexSolver;
-	sGjkSimplexSolver.reset();
-	static btGjkEpaPenetrationDepthSolver Solver0;
-	static btMinkowskiPenetrationDepthSolver Solver1;
-	btConvexPenetrationDepthSolver* Solver = NULL;
-	Solver = &Solver1;	
-	btGjkPairDetector convexConvex(&trishapeA ,&trishapeB,&sGjkSimplexSolver,Solver);
-	convexConvex.m_catchDegeneracies = 1;
-	// btGjkPairDetector convexConvex(&trishapeA ,&trishapeB,&sGjkSimplexSolver,0);
-	btPointCollector gjkOutput;
-	btGjkPairDetector::ClosestPointInput input;
-	btTransform tr;
-	tr.setIdentity();
-	input.m_transformA = tr;
-	input.m_transformB = tr;
-	convexConvex.getClosestPoints(input, gjkOutput, 0);
-	if (gjkOutput.m_hasResult)
-	{
-		pb[0] = pa[0] = gjkOutput.m_pointInWorld[0];
-		pb[1] = pa[1] = gjkOutput.m_pointInWorld[1];
-		pb[2] = pa[2] = gjkOutput.m_pointInWorld[2];
-		pb[0]+= gjkOutput.m_normalOnBInWorld[0] * gjkOutput.m_distance;
-		pb[1]+= gjkOutput.m_normalOnBInWorld[1] * gjkOutput.m_distance;
-		pb[2]+= gjkOutput.m_normalOnBInWorld[2] * gjkOutput.m_distance;
-		normal[0] = gjkOutput.m_normalOnBInWorld[0];
-		normal[1] = gjkOutput.m_normalOnBInWorld[1];
-		normal[2] = gjkOutput.m_normalOnBInWorld[2];
-		return gjkOutput.m_distance;
-	}
-	return -1.0f;	
diff --git a/src/bullet/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.cpp b/src/bullet/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.cpp
index 954ef241..361a054e 100644
--- a/src/bullet/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.cpp
+++ b/src/bullet/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.cpp
@@ -4,8 +4,8 @@ Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
@@ -34,6 +34,7 @@ subject to the following restrictions:
 #include "BulletDynamics/ConstraintSolver/btHingeConstraint.h"
 #include "BulletDynamics/ConstraintSolver/btConeTwistConstraint.h"
 #include "BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h"
+#include "BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.h"
 #include "BulletDynamics/ConstraintSolver/btSliderConstraint.h"
 #include "BulletDynamics/ConstraintSolver/btContactConstraint.h"
@@ -58,7 +59,7 @@ int firstHit=startHit;
 SIMD_FORCE_INLINE	int	btGetConstraintIslandId(const btTypedConstraint* lhs)
 	int islandId;
 	const btCollisionObject& rcolObj0 = lhs->getRigidBodyA();
 	const btCollisionObject& rcolObj1 = lhs->getRigidBodyB();
 	islandId= rcolObj0.getIslandTag()>=0?rcolObj0.getIslandTag():rcolObj1.getIslandTag();
@@ -87,9 +88,8 @@ struct InplaceSolverIslandCallback : public btSimulationIslandManager::IslandCal
 	btTypedConstraint**		m_sortedConstraints;
 	int						m_numConstraints;
 	btIDebugDraw*			m_debugDrawer;
-	btStackAlloc*			m_stackAlloc;
 	btDispatcher*			m_dispatcher;
 	btAlignedObjectArray<btCollisionObject*> m_bodies;
 	btAlignedObjectArray<btPersistentManifold*> m_manifolds;
 	btAlignedObjectArray<btTypedConstraint*> m_constraints;
@@ -104,7 +104,6 @@ struct InplaceSolverIslandCallback : public btSimulationIslandManager::IslandCal
-		m_stackAlloc(stackAlloc),
@@ -129,23 +128,20 @@ struct InplaceSolverIslandCallback : public btSimulationIslandManager::IslandCal
 		m_constraints.resize (0);
 	virtual	void	processIsland(btCollisionObject** bodies,int numBodies,btPersistentManifold**	manifolds,int numManifolds, int islandId)
 		if (islandId<0)
-			if (numManifolds + m_numConstraints)
-			{
-				///we don't split islands, so all constraints/contact manifolds/bodies are passed into the solver regardless the island id
-				m_solver->solveGroup( bodies,numBodies,manifolds, numManifolds,&m_sortedConstraints[0],m_numConstraints,*m_solverInfo,m_debugDrawer,m_stackAlloc,m_dispatcher);
-			}
+			///we don't split islands, so all constraints/contact manifolds/bodies are passed into the solver regardless the island id
+			m_solver->solveGroup( bodies,numBodies,manifolds, numManifolds,&m_sortedConstraints[0],m_numConstraints,*m_solverInfo,m_debugDrawer,m_dispatcher);
 		} else
 				//also add all non-contact constraints/joints for this island
 			btTypedConstraint** startConstraint = 0;
 			int numCurConstraints = 0;
 			int i;
 			//find the first constraint for this island
 			for (i=0;i<m_numConstraints;i++)
@@ -166,14 +162,10 @@ struct InplaceSolverIslandCallback : public btSimulationIslandManager::IslandCal
 			if (m_solverInfo->m_minimumSolverBatchSize<=1)
-				///only call solveGroup if there is some work: avoid virtual function call, its overhead can be excessive
-				if (numManifolds + numCurConstraints)
-				{
-					m_solver->solveGroup( bodies,numBodies,manifolds, numManifolds,startConstraint,numCurConstraints,*m_solverInfo,m_debugDrawer,m_stackAlloc,m_dispatcher);
-				}
+				m_solver->solveGroup( bodies,numBodies,manifolds, numManifolds,startConstraint,numCurConstraints,*m_solverInfo,m_debugDrawer,m_dispatcher);
 			} else
 				for (i=0;i<numBodies;i++)
 				for (i=0;i<numManifolds;i++)
@@ -192,15 +184,12 @@ struct InplaceSolverIslandCallback : public btSimulationIslandManager::IslandCal
 	void	processConstraints()
-		if (m_manifolds.size() + m_constraints.size()>0)
-		{
-			btCollisionObject** bodies = m_bodies.size()? &m_bodies[0]:0;
-			btPersistentManifold** manifold = m_manifolds.size()?&m_manifolds[0]:0;
-			btTypedConstraint** constraints = m_constraints.size()?&m_constraints[0]:0;
-			m_solver->solveGroup( bodies,m_bodies.size(),manifold, m_manifolds.size(),constraints, m_constraints.size() ,*m_solverInfo,m_debugDrawer,m_stackAlloc,m_dispatcher);
-		}
+		btCollisionObject** bodies = m_bodies.size()? &m_bodies[0]:0;
+		btPersistentManifold** manifold = m_manifolds.size()?&m_manifolds[0]:0;
+		btTypedConstraint** constraints = m_constraints.size()?&m_constraints[0]:0;
+		m_solver->solveGroup( bodies,m_bodies.size(),manifold, m_manifolds.size(),constraints, m_constraints.size() ,*m_solverInfo,m_debugDrawer,m_dispatcher);
@@ -213,13 +202,17 @@ struct InplaceSolverIslandCallback : public btSimulationIslandManager::IslandCal
 btDiscreteDynamicsWorld::btDiscreteDynamicsWorld(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btConstraintSolver* constraintSolver, btCollisionConfiguration* collisionConfiguration)
+m_sortedConstraints	(),
+m_solverIslandCallback ( NULL ),
-m_sortedConstraints	(),
-m_solverIslandCallback ( NULL )
 	if (!m_constraintSolver)
@@ -240,7 +233,7 @@ m_solverIslandCallback ( NULL )
 		void* mem = btAlignedAlloc(sizeof(InplaceSolverIslandCallback),16);
-		m_solverIslandCallback = new (mem) InplaceSolverIslandCallback (constraintSolver, m_stackAlloc, dispatcher);
+		m_solverIslandCallback = new (mem) InplaceSolverIslandCallback (m_constraintSolver, 0, dispatcher);
@@ -325,6 +318,9 @@ void	btDiscreteDynamicsWorld::debugDrawWorld()
+    if (getDebugDrawer())
+        getDebugDrawer()->flushLines();
 void	btDiscreteDynamicsWorld::clearForces()
@@ -337,7 +333,7 @@ void	btDiscreteDynamicsWorld::clearForces()
 		//it might break backward compatibility (people applying forces on sleeping objects get never cleared and accumulate on wake-up
 ///apply gravity, call this once per timestep
 void	btDiscreteDynamicsWorld::applyGravity()
@@ -367,7 +363,9 @@ void	btDiscreteDynamicsWorld::synchronizeSingleMotionState(btRigidBody* body)
 			btTransform interpolatedTransform;
-				body->getInterpolationLinearVelocity(),body->getInterpolationAngularVelocity(),m_localTime*body->getHitFraction(),interpolatedTransform);
+				body->getInterpolationLinearVelocity(),body->getInterpolationAngularVelocity(),
+				(m_latencyMotionStateInterpolation && m_fixedTimeStep) ? m_localTime - m_fixedTimeStep : m_localTime*body->getHitFraction(),
+				interpolatedTransform);
@@ -411,6 +409,7 @@ int	btDiscreteDynamicsWorld::stepSimulation( btScalar timeStep,int maxSubSteps,
 	if (maxSubSteps)
 		//fixed timestep with interpolation
+		m_fixedTimeStep = fixedTimeStep;
 		m_localTime += timeStep;
 		if (m_localTime >= fixedTimeStep)
@@ -421,7 +420,8 @@ int	btDiscreteDynamicsWorld::stepSimulation( btScalar timeStep,int maxSubSteps,
 		//variable timestep
 		fixedTimeStep = timeStep;
-		m_localTime = timeStep;
+		m_localTime = m_latencyMotionStateInterpolation ? 0 : timeStep;
+		m_fixedTimeStep = 0;
 		if (btFuzzyZero(timeStep))
 			numSimulationSubSteps = 0;
@@ -449,7 +449,7 @@ int	btDiscreteDynamicsWorld::stepSimulation( btScalar timeStep,int maxSubSteps,
 		for (int i=0;i<clampedSimulationSteps;i++)
@@ -467,18 +467,18 @@ int	btDiscreteDynamicsWorld::stepSimulation( btScalar timeStep,int maxSubSteps,
 #ifndef BT_NO_PROFILE
 #endif //BT_NO_PROFILE
 	return numSimulationSubSteps;
 void	btDiscreteDynamicsWorld::internalSingleStepSimulation(btScalar timeStep)
 	if(0 != m_internalPreTickCallback) {
 		(*m_internalPreTickCallback)(this, timeStep);
-	}	
+	}
 	///apply gravity, predict motion
@@ -490,33 +490,35 @@ void	btDiscreteDynamicsWorld::internalSingleStepSimulation(btScalar timeStep)
 	dispatchInfo.m_debugDraw = getDebugDrawer();
+    createPredictiveContacts(timeStep);
 	///perform collision detection
 	getSolverInfo().m_timeStep = timeStep;
 	///solve contact and other joint constraints
 	///integrate transforms
 	///update vehicle simulation
 	updateActivationState( timeStep );
 	if(0 != m_internalTickCallback) {
 		(*m_internalTickCallback)(this, timeStep);
-	}	
+	}
 void	btDiscreteDynamicsWorld::setGravity(const btVector3& gravity)
@@ -608,14 +610,14 @@ void	btDiscreteDynamicsWorld::addRigidBody(btRigidBody* body, short group, short
 void	btDiscreteDynamicsWorld::updateActions(btScalar timeStep)
 	for ( int i=0;i<m_actions.size();i++)
 		m_actions[i]->updateAction( this, timeStep);
 void	btDiscreteDynamicsWorld::updateActivationState(btScalar timeStep)
@@ -636,7 +638,7 @@ void	btDiscreteDynamicsWorld::updateActivationState(btScalar timeStep)
 					if (body->getActivationState() == ACTIVE_TAG)
 						body->setActivationState( WANTS_DEACTIVATION );
-					if (body->getActivationState() == ISLAND_SLEEPING) 
+					if (body->getActivationState() == ISLAND_SLEEPING)
@@ -655,6 +657,9 @@ void	btDiscreteDynamicsWorld::updateActivationState(btScalar timeStep)
 void	btDiscreteDynamicsWorld::addConstraint(btTypedConstraint* constraint,bool disableCollisionsBetweenLinkedBodies)
+    //Make sure the two bodies of a typed constraint are different (possibly add this to the btTypedConstraint constructor?)
+    btAssert(&constraint->getRigidBodyA()!=&constraint->getRigidBodyB());
 	if (disableCollisionsBetweenLinkedBodies)
@@ -706,31 +711,31 @@ void	btDiscreteDynamicsWorld::removeCharacter(btActionInterface* character)
 void	btDiscreteDynamicsWorld::solveConstraints(btContactSolverInfo& solverInfo)
 	m_sortedConstraints.resize( m_constraints.size());
-	int i; 
+	int i;
 	for (i=0;i<getNumConstraints();i++)
 		m_sortedConstraints[i] = m_constraints[i];
 //	btAssert(0);
 	btTypedConstraint** constraintsPtr = getNumConstraints() ? &m_sortedConstraints[0] : 0;
 	m_constraintSolver->prepareSolve(getCollisionWorld()->getNumCollisionObjects(), getCollisionWorld()->getDispatcher()->getNumManifolds());
 	/// solve all the constraints for this island
-	m_constraintSolver->allSolved(solverInfo, m_debugDrawer, m_stackAlloc);
+	m_constraintSolver->allSolved(solverInfo, m_debugDrawer);
@@ -740,6 +745,23 @@ void	btDiscreteDynamicsWorld::calculateSimulationIslands()
+    {
+        //merge islands based on speculative contact manifolds too
+        for (int i=0;i<this->m_predictiveManifolds.size();i++)
+        {
+            btPersistentManifold* manifold = m_predictiveManifolds[i];
+            const btCollisionObject* colObj0 = manifold->getBody0();
+            const btCollisionObject* colObj1 = manifold->getBody1();
+            if (((colObj0) && (!(colObj0)->isStaticOrKinematicObject())) &&
+                ((colObj1) && (!(colObj1)->isStaticOrKinematicObject())))
+            {
+				getSimulationIslandManager()->getUnionFind().unite((colObj0)->getIslandTag(),(colObj1)->getIslandTag());
+            }
+        }
+    }
 		int i;
 		int numConstraints = int(m_constraints.size());
@@ -754,12 +776,7 @@ void	btDiscreteDynamicsWorld::calculateSimulationIslands()
 				if (((colObj0) && (!(colObj0)->isStaticOrKinematicObject())) &&
 					((colObj1) && (!(colObj1)->isStaticOrKinematicObject())))
-					if (colObj0->isActive() || colObj1->isActive())
-					{
-						getSimulationIslandManager()->getUnionFind().unite((colObj0)->getIslandTag(),
-							(colObj1)->getIslandTag());
-					}
+					getSimulationIslandManager()->getUnionFind().unite((colObj0)->getIslandTag(),(colObj1)->getIslandTag());
@@ -768,7 +785,7 @@ void	btDiscreteDynamicsWorld::calculateSimulationIslands()
 	//Store the island id in each body
@@ -784,7 +801,7 @@ public:
 	btDispatcher* m_dispatcher;
-	btClosestNotMeConvexResultCallback (btCollisionObject* me,const btVector3& fromA,const btVector3& toA,btOverlappingPairCache* pairCache,btDispatcher* dispatcher) : 
+	btClosestNotMeConvexResultCallback (btCollisionObject* me,const btVector3& fromA,const btVector3& toA,btOverlappingPairCache* pairCache,btDispatcher* dispatcher) :
@@ -860,6 +877,103 @@ public:
 ///internal debugging variable. this value shouldn't be too high
 int gNumClampedCcdMotions=0;
+void	btDiscreteDynamicsWorld::createPredictiveContacts(btScalar timeStep) //rebuilds m_predictiveManifolds: releases the existing manifolds, then sweep-tests each active body whose predicted motion exceeds its CCD threshold
+	BT_PROFILE("createPredictiveContacts");
+	{
+		BT_PROFILE("release predictive contact manifolds");
+		for (int i=0;i<m_predictiveManifolds.size();i++)
+		{
+			btPersistentManifold* manifold = m_predictiveManifolds[i];
+			this->m_dispatcher1->releaseManifold(manifold); //hand each manifold stored by an earlier call back to the dispatcher
+		}
+		m_predictiveManifolds.clear();
+	}
+	btTransform predictedTrans;
+	for ( int i=0;i<m_nonStaticRigidBodies.size();i++)
+	{
+		btRigidBody* body = m_nonStaticRigidBodies[i];
+		body->setHitFraction(1.f); //reset on every listed body, whether or not it is active
+		if (body->isActive() && (!body->isStaticOrKinematicObject()))
+		{
+			body->predictIntegratedTransform(timeStep, predictedTrans);
+			btScalar squareMotion = (predictedTrans.getOrigin()-body->getWorldTransform().getOrigin()).length2(); //squared length of the predicted translation over timeStep
+			if (getDispatchInfo().m_useContinuous && body->getCcdSquareMotionThreshold() && body->getCcdSquareMotionThreshold() < squareMotion) //needs m_useContinuous, a non-zero per-body threshold, and motion above that threshold
+			{
+				BT_PROFILE("predictive convexSweepTest");
+				if (body->getCollisionShape()->isConvex())
+				{
+					gNumClampedCcdMotions++;
+					class StaticOnlyCallback : public btClosestNotMeConvexResultCallback
+					{
+					public:
+						StaticOnlyCallback (btCollisionObject* me,const btVector3& fromA,const btVector3& toA,btOverlappingPairCache* pairCache,btDispatcher* dispatcher) :
+						  btClosestNotMeConvexResultCallback(me,fromA,toA,pairCache,dispatcher)
+						{
+						}
+					  	virtual bool needsCollision(btBroadphaseProxy* proxy0) const
+						{
+							btCollisionObject* otherObj = (btCollisionObject*) proxy0->m_clientObject;
+							if (!otherObj->isStaticOrKinematicObject()) //reject anything that is not static/kinematic before deferring to the base filter
+								return false;
+							return btClosestNotMeConvexResultCallback::needsCollision(proxy0);
+						}
+					};
+					StaticOnlyCallback sweepResults(body,body->getWorldTransform().getOrigin(),predictedTrans.getOrigin(),getBroadphase()->getOverlappingPairCache(),getDispatcher());
+					btClosestNotMeConvexResultCallback sweepResults(body,body->getWorldTransform().getOrigin(),predictedTrans.getOrigin(),getBroadphase()->getOverlappingPairCache(),getDispatcher()); //NOTE(review): second declaration of sweepResults in this scope; presumably a preprocessor switch selects one of the two - confirm against the full source
+					//btConvexShape* convexShape = static_cast<btConvexShape*>(body->getCollisionShape());
+					btSphereShape tmpSphere(body->getCcdSweptSphereRadius());//the sweep uses a sphere of the body's CCD swept-sphere radius instead of the body's own collision shape
+					sweepResults.m_allowedPenetration=getDispatchInfo().m_allowedCcdPenetration;
+					sweepResults.m_collisionFilterGroup = body->getBroadphaseProxy()->m_collisionFilterGroup;
+					sweepResults.m_collisionFilterMask  = body->getBroadphaseProxy()->m_collisionFilterMask;
+					btTransform modifiedPredictedTrans = predictedTrans;
+					modifiedPredictedTrans.setBasis(body->getWorldTransform().getBasis()); //keep the current orientation so start and end of the sweep differ only in origin
+					convexSweepTest(&tmpSphere,body->getWorldTransform(),modifiedPredictedTrans,sweepResults);
+					if (sweepResults.hasHit() && (sweepResults.m_closestHitFraction < 1.f))
+					{
+						btVector3 distVec = (predictedTrans.getOrigin()-body->getWorldTransform().getOrigin())*sweepResults.m_closestHitFraction; //predicted translation scaled down to the hit fraction
+						btScalar distance = distVec.dot(-sweepResults.m_hitNormalWorld); //component of distVec along the negated hit normal
+						btPersistentManifold* manifold = m_dispatcher1->getNewManifold(body,sweepResults.m_hitCollisionObject);
+						m_predictiveManifolds.push_back(manifold); //stored so the release loop at the top of this function can free it on the next call
+						btVector3 worldPointB = body->getWorldTransform().getOrigin()+distVec;
+						btVector3 localPointB = sweepResults.m_hitCollisionObject->getWorldTransform().inverse()*worldPointB; //worldPointB expressed in the hit object's local frame
+						btManifoldPoint newPoint(btVector3(0,0,0), localPointB,sweepResults.m_hitNormalWorld,distance);
+						bool isPredictive = true;
+						int index = manifold->addManifoldPoint(newPoint, isPredictive);
+						btManifoldPoint& pt = manifold->getContactPoint(index);
+						pt.m_combinedRestitution = 0; //the predictive contact point itself carries zero restitution
+						pt.m_combinedFriction = btManifoldResult::calculateCombinedFriction(body,sweepResults.m_hitCollisionObject);
+						pt.m_positionWorldOnA = body->getWorldTransform().getOrigin();
+						pt.m_positionWorldOnB = worldPointB;
+					}
+				}
+			}
+		}
+	}
 void	btDiscreteDynamicsWorld::integrateTransforms(btScalar timeStep)
@@ -873,10 +987,10 @@ void	btDiscreteDynamicsWorld::integrateTransforms(btScalar timeStep)
 			body->predictIntegratedTransform(timeStep, predictedTrans);
 			btScalar squareMotion = (predictedTrans.getOrigin()-body->getWorldTransform().getOrigin()).length2();
 			if (getDispatchInfo().m_useContinuous && body->getCcdSquareMotionThreshold() && body->getCcdSquareMotionThreshold() < squareMotion)
@@ -889,7 +1003,7 @@ void	btDiscreteDynamicsWorld::integrateTransforms(btScalar timeStep)
-						StaticOnlyCallback (btCollisionObject* me,const btVector3& fromA,const btVector3& toA,btOverlappingPairCache* pairCache,btDispatcher* dispatcher) : 
+						StaticOnlyCallback (btCollisionObject* me,const btVector3& fromA,const btVector3& toA,btOverlappingPairCache* pairCache,btDispatcher* dispatcher) :
@@ -919,7 +1033,7 @@ void	btDiscreteDynamicsWorld::integrateTransforms(btScalar timeStep)
 					if (sweepResults.hasHit() && (sweepResults.m_closestHitFraction < 1.f))
 						//printf("clamped integration to hit fraction = %f\n",fraction);
 						body->predictIntegratedTransform(timeStep*body->getHitFraction(), predictedTrans);
@@ -944,11 +1058,13 @@ void	btDiscreteDynamicsWorld::integrateTransforms(btScalar timeStep)
-						//response  between two dynamic objects without friction, assuming 0 penetration depth
-						btScalar appliedImpulse = 0.f;
-						btScalar depth = 0.f;
-						appliedImpulse = resolveSingleCollision(body,sweepResults.m_hitCollisionObject,sweepResults.m_hitPointWorld,sweepResults.m_hitNormalWorld,getSolverInfo(), depth);
+						//don't apply the collision response right now, it will happen next frame
+						//if you really need to, you can uncomment next 3 lines. Note that it uses zero restitution.
+						//btScalar appliedImpulse = 0.f;
+						//btScalar depth = 0.f;
+						//appliedImpulse = resolveSingleCollision(body,(btCollisionObject*)sweepResults.m_hitCollisionObject,sweepResults.m_hitPointWorld,sweepResults.m_hitNormalWorld,getSolverInfo(), depth);
@@ -956,11 +1072,49 @@ void	btDiscreteDynamicsWorld::integrateTransforms(btScalar timeStep)
 			body->proceedToTransform( predictedTrans);
+		}
+	}
+	///this should probably be switched on by default, but it is not well tested yet
+	if (m_applySpeculativeContactRestitution)
+	{
+		BT_PROFILE("apply speculative contact restitution");
+		for (int i=0;i<m_predictiveManifolds.size();i++)
+		{
+			btPersistentManifold* manifold = m_predictiveManifolds[i];
+			btRigidBody* body0 = btRigidBody::upcast((btCollisionObject*)manifold->getBody0());
+			btRigidBody* body1 = btRigidBody::upcast((btCollisionObject*)manifold->getBody1());
+			// For every contact point of this predictive manifold that received a solver impulse,
+			// apply an extra impulse along the contact normal, scaled by the combined restitution.
+			for (int p=0;p<manifold->getNumContacts();p++)
+			{
+				const btManifoldPoint& pt = manifold->getContactPoint(p);
+				// NOTE(review): body0/body1 are null-checked further down, yet they are passed here unchecked;
+				// confirm calculateCombinedRestitution tolerates a body for which btRigidBody::upcast() returned null.
+				btScalar combinedRestitution = btManifoldResult::calculateCombinedRestitution(body0, body1);
+				if (combinedRestitution>0 && pt.m_appliedImpulse != 0.f)
+				//if (pt.getDistance()>0 && combinedRestitution>0 && pt.m_appliedImpulse != 0.f)
+				{
+					btVector3 imp = -pt.m_normalWorldOnB * pt.m_appliedImpulse* combinedRestitution;
+					const btVector3& pos1 = pt.getPositionWorldOnA();
+					const btVector3& pos2 = pt.getPositionWorldOnB();
+					// The lever arm (contact position relative to the body origin) is computed only
+					// after the null check, so a body that is not a btRigidBody is never dereferenced.
+					if (body0)
+					{
+						btVector3 rel_pos0 = pos1 - body0->getWorldTransform().getOrigin();
+						body0->applyImpulse(imp,rel_pos0);
+					}
+					if (body1)
+					{
+						btVector3 rel_pos1 = pos2 - body1->getWorldTransform().getOrigin();
+						body1->applyImpulse(-imp,rel_pos1);
+					}
+				}
+			}
@@ -976,8 +1130,8 @@ void	btDiscreteDynamicsWorld::predictUnconstraintMotion(btScalar timeStep)
 		btRigidBody* body = m_nonStaticRigidBodies[i];
 		if (!body->isStaticOrKinematicObject())
-			body->integrateVelocities( timeStep);
-			//damping
+			//don't integrate/update velocities here, it happens in the constraint solver
@@ -999,7 +1153,7 @@ void	btDiscreteDynamicsWorld::startProfiling(btScalar timeStep)
 void btDiscreteDynamicsWorld::debugDrawConstraint(btTypedConstraint* constraint)
@@ -1019,12 +1173,12 @@ void btDiscreteDynamicsWorld::debugDrawConstraint(btTypedConstraint* constraint)
 				btTransform tr;
 				btVector3 pivot = p2pC->getPivotInA();
-				pivot = p2pC->getRigidBodyA().getCenterOfMassTransform() * pivot; 
+				pivot = p2pC->getRigidBodyA().getCenterOfMassTransform() * pivot;
 				getDebugDrawer()->drawTransform(tr, dbgDrawSize);
-				// that ideally should draw the same frame	
+				// that ideally should draw the same frame
 				pivot = p2pC->getPivotInB();
-				pivot = p2pC->getRigidBodyB().getCenterOfMassTransform() * pivot; 
+				pivot = p2pC->getRigidBodyB().getCenterOfMassTransform() * pivot;
 				if(drawFrames) getDebugDrawer()->drawTransform(tr, dbgDrawSize);
@@ -1043,13 +1197,13 @@ void btDiscreteDynamicsWorld::debugDrawConstraint(btTypedConstraint* constraint)
 				bool drawSect = true;
-				if(minAng > maxAng)
+				if(!pHinge->hasLimit())
 					minAng = btScalar(0.f);
 					maxAng = SIMD_2_PI;
 					drawSect = false;
-				if(drawLimits) 
+				if(drawLimits)
 					btVector3& center = tr.getOrigin();
 					btVector3 normal = tr.getBasis().getColumn(2);
@@ -1084,7 +1238,7 @@ void btDiscreteDynamicsWorld::debugDrawConstraint(btTypedConstraint* constraint)
 							getDebugDrawer()->drawLine(tr.getOrigin(), pCur, btVector3(0,0,0));
 						pPrev = pCur;
-					}						
+					}
 					btScalar tws = pCT->getTwistSpan();
 					btScalar twa = pCT->getTwistAngle();
 					bool useFrameB = (pCT->getRigidBodyB().getInvMass() > btScalar(0.f));
@@ -1112,7 +1266,7 @@ void btDiscreteDynamicsWorld::debugDrawConstraint(btTypedConstraint* constraint)
 				if(drawFrames) getDebugDrawer()->drawTransform(tr, dbgDrawSize);
 				tr = p6DOF->getCalculatedTransformB();
 				if(drawFrames) getDebugDrawer()->drawTransform(tr, dbgDrawSize);
-				if(drawLimits) 
+				if(drawLimits)
 					tr = p6DOF->getCalculatedTransformA();
 					const btVector3& center = p6DOF->getCalculatedTransformB().getOrigin();
@@ -1153,6 +1307,57 @@ void btDiscreteDynamicsWorld::debugDrawConstraint(btTypedConstraint* constraint)
+		///note: the code for D6_SPRING_2_CONSTRAINT_TYPE is identical to D6_CONSTRAINT_TYPE, the D6_CONSTRAINT_TYPE+D6_SPRING_CONSTRAINT_TYPE will likely become obsolete/deprecated at some stage
+		{
+			{
+				btGeneric6DofSpring2Constraint* p6DOF = (btGeneric6DofSpring2Constraint*)constraint;
+				btTransform tr = p6DOF->getCalculatedTransformA();
+				if (drawFrames) getDebugDrawer()->drawTransform(tr, dbgDrawSize);
+				tr = p6DOF->getCalculatedTransformB();
+				if (drawFrames) getDebugDrawer()->drawTransform(tr, dbgDrawSize);
+				if (drawLimits)
+				{
+					tr = p6DOF->getCalculatedTransformA();
+					const btVector3& center = p6DOF->getCalculatedTransformB().getOrigin();
+					btVector3 up = tr.getBasis().getColumn(2);
+					btVector3 axis = tr.getBasis().getColumn(0);
+					btScalar minTh = p6DOF->getRotationalLimitMotor(1)->m_loLimit;
+					btScalar maxTh = p6DOF->getRotationalLimitMotor(1)->m_hiLimit;
+					btScalar minPs = p6DOF->getRotationalLimitMotor(2)->m_loLimit;
+					btScalar maxPs = p6DOF->getRotationalLimitMotor(2)->m_hiLimit;
+					getDebugDrawer()->drawSpherePatch(center, up, axis, dbgDrawSize * btScalar(.9f), minTh, maxTh, minPs, maxPs, btVector3(0, 0, 0));
+					axis = tr.getBasis().getColumn(1);
+					btScalar ay = p6DOF->getAngle(1);
+					btScalar az = p6DOF->getAngle(2);
+					btScalar cy = btCos(ay);
+					btScalar sy = btSin(ay);
+					btScalar cz = btCos(az);
+					btScalar sz = btSin(az);
+					btVector3 ref;
+					ref[0] = cy*cz*axis[0] + cy*sz*axis[1] - sy*axis[2];
+					ref[1] = -sz*axis[0] + cz*axis[1];
+					ref[2] = cz*sy*axis[0] + sz*sy*axis[1] + cy*axis[2];
+					tr = p6DOF->getCalculatedTransformB();
+					btVector3 normal = -tr.getBasis().getColumn(0);
+					btScalar minFi = p6DOF->getRotationalLimitMotor(0)->m_loLimit;
+					btScalar maxFi = p6DOF->getRotationalLimitMotor(0)->m_hiLimit;
+					if (minFi > maxFi)
+					{
+						getDebugDrawer()->drawArc(center, normal, ref, dbgDrawSize, dbgDrawSize, -SIMD_PI, SIMD_PI, btVector3(0, 0, 0), false);
+					}
+					else if (minFi < maxFi)
+					{
+						getDebugDrawer()->drawArc(center, normal, ref, dbgDrawSize, dbgDrawSize, minFi, maxFi, btVector3(0, 0, 0), true);
+					}
+					tr = p6DOF->getCalculatedTransformA();
+					btVector3 bbMin = p6DOF->getTranslationalLimitMotor()->m_lowerLimit;
+					btVector3 bbMax = p6DOF->getTranslationalLimitMotor()->m_upperLimit;
+					getDebugDrawer()->drawBox(bbMin, bbMax, tr, btVector3(0, 0, 0));
+				}
+			}
+			break;
+		}
 				btSliderConstraint* pSlider = (btSliderConstraint*)constraint;
@@ -1175,7 +1380,7 @@ void btDiscreteDynamicsWorld::debugDrawConstraint(btTypedConstraint* constraint)
-		default : 
+		default :
@@ -1193,6 +1398,7 @@ void	btDiscreteDynamicsWorld::setConstraintSolver(btConstraintSolver* solver)
 	m_ownsConstraintSolver = false;
 	m_constraintSolver = solver;
+	m_solverIslandCallback->m_solver = solver;
 btConstraintSolver* btDiscreteDynamicsWorld::getConstraintSolver()
@@ -1243,15 +1449,69 @@ void	btDiscreteDynamicsWorld::serializeRigidBodies(btSerializer* serializer)
+void	btDiscreteDynamicsWorld::serializeDynamicsWorldInfo(btSerializer* serializer)
+		int len = sizeof(btDynamicsWorldDoubleData);
+		btChunk* chunk = serializer->allocate(len,1);
+		btDynamicsWorldDoubleData* worldInfo = (btDynamicsWorldDoubleData*)chunk->m_oldPtr;
+		int len = sizeof(btDynamicsWorldFloatData);
+		btChunk* chunk = serializer->allocate(len,1);
+		btDynamicsWorldFloatData* worldInfo = (btDynamicsWorldFloatData*)chunk->m_oldPtr;
+		memset(worldInfo ,0x00,len);
+		m_gravity.serialize(worldInfo->m_gravity);
+		worldInfo->m_solverInfo.m_tau = getSolverInfo().m_tau;
+		worldInfo->m_solverInfo.m_damping = getSolverInfo().m_damping;
+		worldInfo->m_solverInfo.m_friction = getSolverInfo().m_friction;
+		worldInfo->m_solverInfo.m_timeStep = getSolverInfo().m_timeStep;
+		worldInfo->m_solverInfo.m_restitution = getSolverInfo().m_restitution;
+		worldInfo->m_solverInfo.m_maxErrorReduction = getSolverInfo().m_maxErrorReduction;
+		worldInfo->m_solverInfo.m_sor = getSolverInfo().m_sor;
+		worldInfo->m_solverInfo.m_erp = getSolverInfo().m_erp;
+		worldInfo->m_solverInfo.m_erp2 = getSolverInfo().m_erp2;
+		worldInfo->m_solverInfo.m_globalCfm = getSolverInfo().m_globalCfm;
+		worldInfo->m_solverInfo.m_splitImpulsePenetrationThreshold = getSolverInfo().m_splitImpulsePenetrationThreshold;
+		worldInfo->m_solverInfo.m_splitImpulseTurnErp = getSolverInfo().m_splitImpulseTurnErp;
+		worldInfo->m_solverInfo.m_linearSlop = getSolverInfo().m_linearSlop;
+		worldInfo->m_solverInfo.m_warmstartingFactor = getSolverInfo().m_warmstartingFactor;
+		worldInfo->m_solverInfo.m_maxGyroscopicForce = getSolverInfo().m_maxGyroscopicForce;
+		worldInfo->m_solverInfo.m_singleAxisRollingFrictionThreshold = getSolverInfo().m_singleAxisRollingFrictionThreshold;
+		worldInfo->m_solverInfo.m_numIterations = getSolverInfo().m_numIterations;
+		worldInfo->m_solverInfo.m_solverMode = getSolverInfo().m_solverMode;
+		worldInfo->m_solverInfo.m_restingContactRestitutionThreshold = getSolverInfo().m_restingContactRestitutionThreshold;
+		worldInfo->m_solverInfo.m_minimumSolverBatchSize = getSolverInfo().m_minimumSolverBatchSize;
+		worldInfo->m_solverInfo.m_splitImpulse = getSolverInfo().m_splitImpulse;
+		const char* structType = "btDynamicsWorldDoubleData";
+		const char* structType = "btDynamicsWorldFloatData";
+		serializer->finalizeChunk(chunk,structType,BT_DYNAMICSWORLD_CODE,worldInfo);
 void	btDiscreteDynamicsWorld::serialize(btSerializer* serializer)
-	serializeRigidBodies(serializer);
+	serializeDynamicsWorldInfo(serializer);
+	serializeRigidBodies(serializer);
diff --git a/src/bullet/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h b/src/bullet/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h
index 23a38dd2..dd3d1c36 100644
--- a/src/bullet/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h
+++ b/src/bullet/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h
@@ -25,7 +25,7 @@ class btConstraintSolver;
 class btSimulationIslandManager;
 class btTypedConstraint;
 class btActionInterface;
+class btPersistentManifold;
 class btIDebugDraw;
 struct InplaceSolverIslandCallback;
@@ -34,7 +34,7 @@ struct InplaceSolverIslandCallback;
 ///btDiscreteDynamicsWorld provides discrete rigid body simulation
 ///those classes replace the obsolete CcdPhysicsEnvironment/CcdPhysicsController
-class btDiscreteDynamicsWorld : public btDynamicsWorld
+ATTRIBUTE_ALIGNED16(class) btDiscreteDynamicsWorld : public btDynamicsWorld
@@ -53,16 +53,22 @@ protected:
 	//for variable timesteps
 	btScalar	m_localTime;
+	btScalar	m_fixedTimeStep;
 	//for variable timesteps
 	bool	m_ownsIslandManager;
 	bool	m_ownsConstraintSolver;
 	bool	m_synchronizeAllMotionStates;
+	bool	m_applySpeculativeContactRestitution;
 	btAlignedObjectArray<btActionInterface*>	m_actions;
 	int	m_profileTimings;
+	bool	m_latencyMotionStateInterpolation;
+	btAlignedObjectArray<btPersistentManifold*>	m_predictiveManifolds;
 	virtual void	predictUnconstraintMotion(btScalar timeStep);
 	virtual void	integrateTransforms(btScalar timeStep);
@@ -71,7 +77,7 @@ protected:
 	virtual void	solveConstraints(btContactSolverInfo& solverInfo);
-	void	updateActivationState(btScalar timeStep);
+	virtual void	updateActivationState(btScalar timeStep);
 	void	updateActions(btScalar timeStep);
@@ -79,14 +85,19 @@ protected:
 	virtual void	internalSingleStepSimulation( btScalar timeStep);
+	void	createPredictiveContacts(btScalar timeStep);
 	virtual void	saveKinematicState(btScalar timeStep);
 	void	serializeRigidBodies(btSerializer* serializer);
+	void	serializeDynamicsWorldInfo(btSerializer* serializer);
 	///this btDiscreteDynamicsWorld constructor gets created objects from the user, and will not delete those
 	btDiscreteDynamicsWorld(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btConstraintSolver* constraintSolver,btCollisionConfiguration* collisionConfiguration);
@@ -140,7 +151,7 @@ public:
 	virtual void	removeCollisionObject(btCollisionObject* collisionObject);
-	void	debugDrawConstraint(btTypedConstraint* constraint);
+	virtual void	debugDrawConstraint(btTypedConstraint* constraint);
 	virtual void	debugDrawWorld();
@@ -195,9 +206,29 @@ public:
 		return m_synchronizeAllMotionStates;
+	void setApplySpeculativeContactRestitution(bool enable)
+	{
+		m_applySpeculativeContactRestitution = enable;
+	}
+	bool getApplySpeculativeContactRestitution() const
+	{
+		return m_applySpeculativeContactRestitution;
+	}
 	///Preliminary serialization test for Bullet 2.76. Loading those files requires a separate parser (see Bullet/Demos/SerializeDemo)
 	virtual	void	serialize(btSerializer* serializer);
+	///Interpolate motion state between previous and current transform, instead of current and next transform.
+	///This can relieve discontinuities in the rendering, due to penetrations
+	void setLatencyMotionStateInterpolation(bool latencyInterpolation )
+	{
+		m_latencyMotionStateInterpolation = latencyInterpolation;
+	}
+	bool getLatencyMotionStateInterpolation() const
+	{
+		return m_latencyMotionStateInterpolation;
+	}
diff --git a/src/bullet/BulletDynamics/Dynamics/btDynamicsWorld.h b/src/bullet/BulletDynamics/Dynamics/btDynamicsWorld.h
index 6b009337..35dd1400 100644
--- a/src/bullet/BulletDynamics/Dynamics/btDynamicsWorld.h
+++ b/src/bullet/BulletDynamics/Dynamics/btDynamicsWorld.h
@@ -33,7 +33,8 @@ enum btDynamicsWorldType
 ///The btDynamicsWorld is the interface class for several dynamics implementation, basic, discrete, parallel, and continuous etc.
@@ -146,6 +147,21 @@ public:
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct btDynamicsWorldDoubleData
+	btContactSolverInfoDoubleData	m_solverInfo;
+	btVector3DoubleData	m_gravity;
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct btDynamicsWorldFloatData
+	btContactSolverInfoFloatData	m_solverInfo;
+	btVector3FloatData	m_gravity;
diff --git a/src/bullet/BulletDynamics/Dynamics/btRigidBody.cpp b/src/bullet/BulletDynamics/Dynamics/btRigidBody.cpp
index 911b5072..e0e8bc70 100644
--- a/src/bullet/BulletDynamics/Dynamics/btRigidBody.cpp
+++ b/src/bullet/BulletDynamics/Dynamics/btRigidBody.cpp
@@ -78,6 +78,7 @@ void	btRigidBody::setupRigidBody(const btRigidBody::btRigidBodyConstructionInfo&
 	//moved to btCollisionObject
 	m_friction = constructionInfo.m_friction;
+	m_rollingFriction = constructionInfo.m_rollingFriction;
 	m_restitution = constructionInfo.m_restitution;
 	setCollisionShape( constructionInfo.m_collisionShape );
@@ -86,7 +87,7 @@ void	btRigidBody::setupRigidBody(const btRigidBody::btRigidBodyConstructionInfo&
 	setMassProps(constructionInfo.m_mass, constructionInfo.m_localInertia);
-	m_rigidbodyFlags = 0;
@@ -250,13 +251,137 @@ void btRigidBody::setMassProps(btScalar mass, const btVector3& inertia)
 void btRigidBody::updateInertiaTensor() 
 	m_invInertiaTensorWorld = m_worldTransform.getBasis().scaled(m_invInertiaLocal) * m_worldTransform.getBasis().transpose();
+btVector3 btRigidBody::getLocalInertia() const
+	btVector3 inertiaLocal;
+	const btVector3 inertia = m_invInertiaLocal;
+	inertiaLocal.setValue(inertia.x() != btScalar(0.0) ? btScalar(1.0) / inertia.x() : btScalar(0.0),
+		inertia.y() != btScalar(0.0) ? btScalar(1.0) / inertia.y() : btScalar(0.0),
+		inertia.z() != btScalar(0.0) ? btScalar(1.0) / inertia.z() : btScalar(0.0));
+	return inertiaLocal;
+inline btVector3 evalEulerEqn(const btVector3& w1, const btVector3& w0, const btVector3& T, const btScalar dt,
+	const btMatrix3x3 &I)
+	const btVector3 w2 = I*w1 + w1.cross(I*w1)*dt - (T*dt + I*w0);
+	return w2;
+inline btMatrix3x3 evalEulerEqnDeriv(const btVector3& w1, const btVector3& w0, const btScalar dt,
+	const btMatrix3x3 &I)
+	btMatrix3x3 w1x, Iw1x;
+	const btVector3 Iwi = (I*w1);
+	w1.getSkewSymmetricMatrix(&w1x[0], &w1x[1], &w1x[2]);
+	Iwi.getSkewSymmetricMatrix(&Iw1x[0], &Iw1x[1], &Iw1x[2]);
+	const btMatrix3x3 dfw1 = I + (w1x*I - Iw1x)*dt;
+	return dfw1;
+btVector3 btRigidBody::computeGyroscopicForceExplicit(btScalar maxGyroscopicForce) const
+	btVector3 inertiaLocal = getLocalInertia();
+	btMatrix3x3 inertiaTensorWorld = getWorldTransform().getBasis().scaled(inertiaLocal) * getWorldTransform().getBasis().transpose();
+	btVector3 tmp = inertiaTensorWorld*getAngularVelocity();
+	btVector3 gf = getAngularVelocity().cross(tmp);
+	btScalar l2 = gf.length2();
+	if (l2>maxGyroscopicForce*maxGyroscopicForce)
+	{
+		gf *= btScalar(1.)/btSqrt(l2)*maxGyroscopicForce;
+	}
+	return gf;
+btVector3 btRigidBody::computeGyroscopicImpulseImplicit_Body(btScalar step) const
+	btVector3 idl = getLocalInertia();
+	btVector3 omega1 = getAngularVelocity();
+	btQuaternion q = getWorldTransform().getRotation();
+	// Convert to body coordinates
+	btVector3 omegab = quatRotate(q.inverse(), omega1);
+	btMatrix3x3 Ib;
+	Ib.setValue(idl.x(),0,0,
+				0,idl.y(),0,
+				0,0,idl.z());
+	btVector3 ibo = Ib*omegab;
+	// Residual vector
+	btVector3 f = step * omegab.cross(ibo);
+	btMatrix3x3 skew0;
+	omegab.getSkewSymmetricMatrix(&skew0[0], &skew0[1], &skew0[2]);
+	btVector3 om = Ib*omegab;
+	btMatrix3x3 skew1;
+	om.getSkewSymmetricMatrix(&skew1[0],&skew1[1],&skew1[2]);
+	// Jacobian
+	btMatrix3x3 J = Ib +  (skew0*Ib - skew1)*step;
+//	btMatrix3x3 Jinv = J.inverse();
+//	btVector3 omega_div = Jinv*f;
+	btVector3 omega_div = J.solve33(f);
+	// Single Newton-Raphson update
+	omegab = omegab - omega_div;//Solve33(J, f);
+	// Back to world coordinates
+	btVector3 omega2 = quatRotate(q,omegab);
+	btVector3 gf = omega2-omega1;
+	return gf;
+btVector3 btRigidBody::computeGyroscopicImpulseImplicit_World(btScalar step) const
+	// use full newton-euler equations.  common practice to drop the wxIw term. want it for better tumbling behavior.
+	// calculate using implicit euler step so it's stable.
+	const btVector3 inertiaLocal = getLocalInertia();
+	const btVector3 w0 = getAngularVelocity();
+	btMatrix3x3 I;
+	I = m_worldTransform.getBasis().scaled(inertiaLocal) *
+		m_worldTransform.getBasis().transpose();
+	// use newtons method to find implicit solution for new angular velocity (w')
+	// f(w') = Iw' + w'xIw'*step - (T*step + Iw) = 0
+	// df/dw' = I + ([w']x*I - [Iw']x)*step, where [v]x is the skew-symmetric cross-product matrix of v
+	btVector3 w1 = w0;
+	// one step of newton's method
+	{
+		const btVector3 fw = evalEulerEqn(w1, w0, btVector3(0, 0, 0), step, I);
+		const btMatrix3x3 dfw = evalEulerEqnDeriv(w1, w0, step, I);
+		btVector3 dw;
+		dw = dfw.solve33(fw);
+		//const btMatrix3x3 dfw_inv = dfw.inverse();
+		//dw = dfw_inv*fw;
+		w1 -= dw;
+	}
+	btVector3 gf = (w1 - w0);
+	return gf;
 void btRigidBody::integrateVelocities(btScalar step) 
 	if (isStaticOrKinematicObject())
@@ -300,58 +425,50 @@ void btRigidBody::setCenterOfMassTransform(const btTransform& xform)
-bool btRigidBody::checkCollideWithOverride(btCollisionObject* co)
-	btRigidBody* otherRb = btRigidBody::upcast(co);
-	if (!otherRb)
-		return true;
-	for (int i = 0; i < m_constraintRefs.size(); ++i)
-	{
-		btTypedConstraint* c = m_constraintRefs[i];
-		if (c->isEnabled())
-			if (&c->getRigidBodyA() == otherRb || &c->getRigidBodyB() == otherRb)
-				return false;
-	}
-	return true;
-void	btRigidBody::internalWritebackVelocity(btScalar timeStep)
-    (void) timeStep;
-	if (m_inverseMass)
-	{
-		setLinearVelocity(getLinearVelocity()+ m_deltaLinearVelocity);
-		setAngularVelocity(getAngularVelocity()+m_deltaAngularVelocity);
-		//correct the position/orientation based on push/turn recovery
-		btTransform newTransform;
-		btTransformUtil::integrateTransform(getWorldTransform(),m_pushVelocity,m_turnVelocity,timeStep,newTransform);
-		setWorldTransform(newTransform);
-		//m_originalBody->setCompanionId(-1);
-	}
-//	m_deltaLinearVelocity.setZero();
-//	m_deltaAngularVelocity .setZero();
-//	m_pushVelocity.setZero();
-//	m_turnVelocity.setZero();
 void btRigidBody::addConstraintRef(btTypedConstraint* c)
+	///disable collision with the 'other' body
 	int index = m_constraintRefs.findLinearSearch(c);
+	//don't add constraints that are already referenced
+	//btAssert(index == m_constraintRefs.size());
 	if (index == m_constraintRefs.size())
-		m_constraintRefs.push_back(c); 
-	m_checkCollideWith = true;
+	{
+		m_constraintRefs.push_back(c);
+		btCollisionObject* colObjA = &c->getRigidBodyA();
+		btCollisionObject* colObjB = &c->getRigidBodyB();
+		if (colObjA == this)
+		{
+			colObjA->setIgnoreCollisionCheck(colObjB, true);
+		}
+		else
+		{
+			colObjB->setIgnoreCollisionCheck(colObjA, true);
+		}
+	} 
 void btRigidBody::removeConstraintRef(btTypedConstraint* c)
-	m_constraintRefs.remove(c);
-	m_checkCollideWith = m_constraintRefs.size() > 0;
+	int index = m_constraintRefs.findLinearSearch(c);
+	//don't remove constraints that are not referenced
+	if(index < m_constraintRefs.size())
+    {
+        m_constraintRefs.remove(c);
+        btCollisionObject* colObjA = &c->getRigidBodyA();
+        btCollisionObject* colObjB = &c->getRigidBodyB();
+        if (colObjA == this)
+        {
+            colObjA->setIgnoreCollisionCheck(colObjB, false);
+        }
+        else
+        {
+            colObjB->setIgnoreCollisionCheck(colObjA, false);
+        }
+    }
 int	btRigidBody::calculateSerializeBufferSize()	const
diff --git a/src/bullet/BulletDynamics/Dynamics/btRigidBody.h b/src/bullet/BulletDynamics/Dynamics/btRigidBody.h
index 7c121e6d..1d177db8 100644
--- a/src/bullet/BulletDynamics/Dynamics/btRigidBody.h
+++ b/src/bullet/BulletDynamics/Dynamics/btRigidBody.h
@@ -40,7 +40,14 @@ extern bool gDisableDeactivation;
 enum	btRigidBodyFlags
+	///The BT_ENABLE_GYROPSCOPIC_FORCE flag is enabled by default in Bullet 2.83 and onwards.
+	///See Demos/GyroscopicDemo and computeGyroscopicImpulseImplicit
@@ -83,7 +90,7 @@ class btRigidBody  : public btCollisionObject
 	//m_optionalMotionState allows to automatic synchronize the world transform for active objects
 	btMotionState*	m_optionalMotionState;
-	//keep track of typed constraints referencing this rigid body
+	//keep track of typed constraints referencing this rigid body, to disable collision between linked bodies
 	btAlignedObjectArray<btTypedConstraint*> m_constraintRefs;
 	int				m_rigidbodyFlags;
@@ -93,7 +100,7 @@ class btRigidBody  : public btCollisionObject
-	ATTRIBUTE_ALIGNED64(btVector3		m_deltaLinearVelocity);
+	ATTRIBUTE_ALIGNED16(btVector3		m_deltaLinearVelocity);
 	btVector3		m_deltaAngularVelocity;
 	btVector3		m_angularFactor;
 	btVector3		m_invMass;
@@ -125,6 +132,9 @@ public:
 		///best simulation results when friction is non-zero
 		btScalar			m_friction;
+		///the m_rollingFriction prevents rounded shapes, such as spheres, cylinders and capsules from rolling forever.
+		///See Bullet/Demos/RollingFrictionDemo for usage
+		btScalar			m_rollingFriction;
 		///best simulation results using zero restitution.
 		btScalar			m_restitution;
@@ -147,6 +157,7 @@ public:
+			m_rollingFriction(btScalar(0)),
@@ -355,11 +366,13 @@ public:
 	inline void setLinearVelocity(const btVector3& lin_vel)
+		m_updateRevision++;
 		m_linearVelocity = lin_vel; 
 	inline void setAngularVelocity(const btVector3& ang_vel) 
+		m_updateRevision++;
 		m_angularVelocity = ang_vel; 
@@ -476,11 +489,13 @@ public:
 	void	setAngularFactor(const btVector3& angFac)
+		m_updateRevision++;
 		m_angularFactor = angFac;
 	void	setAngularFactor(btScalar angFac)
+		m_updateRevision++;
 	const btVector3&	getAngularFactor() const
@@ -494,8 +509,6 @@ public:
 		return (getBroadphaseProxy() != 0);
-	virtual bool checkCollideWithOverride(btCollisionObject* co);
 	void addConstraintRef(btTypedConstraint* c);
 	void removeConstraintRef(btTypedConstraint* c);
@@ -519,106 +532,18 @@ public:
 		return m_rigidbodyFlags;
-	const btVector3& getDeltaLinearVelocity() const
-	{
-		return m_deltaLinearVelocity;
-	}
-	const btVector3& getDeltaAngularVelocity() const
-	{
-		return m_deltaAngularVelocity;
-	}
-	const btVector3& getPushVelocity() const 
-	{
-		return m_pushVelocity;
-	}
-	const btVector3& getTurnVelocity() const 
-	{
-		return m_turnVelocity;
-	}
-	////////////////////////////////////////////////
-	///some internal methods, don't use them
-	btVector3& internalGetDeltaLinearVelocity()
-	{
-		return m_deltaLinearVelocity;
-	}
-	btVector3& internalGetDeltaAngularVelocity()
-	{
-		return m_deltaAngularVelocity;
-	}
-	const btVector3& internalGetAngularFactor() const
-	{
-		return m_angularFactor;
-	}
-	const btVector3& internalGetInvMass() const
-	{
-		return m_invMass;
-	}
-	btVector3& internalGetPushVelocity()
-	{
-		return m_pushVelocity;
-	}
-	btVector3& internalGetTurnVelocity()
-	{
-		return m_turnVelocity;
-	}
-	SIMD_FORCE_INLINE void	internalGetVelocityInLocalPointObsolete(const btVector3& rel_pos, btVector3& velocity ) const
-	{
-		velocity = getLinearVelocity()+m_deltaLinearVelocity + (getAngularVelocity()+m_deltaAngularVelocity).cross(rel_pos);
-	}
-	SIMD_FORCE_INLINE void	internalGetAngularVelocity(btVector3& angVel) const
-	{
-		angVel = getAngularVelocity()+m_deltaAngularVelocity;
-	}
-	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
-	SIMD_FORCE_INLINE void internalApplyImpulse(const btVector3& linearComponent, const btVector3& angularComponent,const btScalar impulseMagnitude)
-	{
-		if (m_inverseMass)
-		{
-			m_deltaLinearVelocity += linearComponent*impulseMagnitude;
-			m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
-		}
-	}
-	SIMD_FORCE_INLINE void internalApplyPushImpulse(const btVector3& linearComponent, const btVector3& angularComponent,btScalar impulseMagnitude)
-	{
-		if (m_inverseMass)
-		{
-			m_pushVelocity += linearComponent*impulseMagnitude;
-			m_turnVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
-		}
-	}
+	///perform implicit force computation in world space
+	btVector3 computeGyroscopicImpulseImplicit_World(btScalar dt) const;
-	void	internalWritebackVelocity()
-	{
-		if (m_inverseMass)
-		{
-			setLinearVelocity(getLinearVelocity()+ m_deltaLinearVelocity);
-			setAngularVelocity(getAngularVelocity()+m_deltaAngularVelocity);
-			//m_deltaLinearVelocity.setZero();
-			//m_deltaAngularVelocity .setZero();
-			//m_originalBody->setCompanionId(-1);
-		}
-	}
+	///perform implicit force computation in body space (inertial frame)
+	btVector3 computeGyroscopicImpulseImplicit_Body(btScalar step) const;
-	void	internalWritebackVelocity(btScalar timeStep);
+	///explicit version is best avoided, it gains energy
+	btVector3 computeGyroscopicForceExplicit(btScalar maxGyroscopicForce) const;
+	btVector3 getLocalInertia() const;
diff --git a/src/bullet/BulletDynamics/Dynamics/btSimpleDynamicsWorld.cpp b/src/bullet/BulletDynamics/Dynamics/btSimpleDynamicsWorld.cpp
index 5fc2f3cf..35dd3884 100644
--- a/src/bullet/BulletDynamics/Dynamics/btSimpleDynamicsWorld.cpp
+++ b/src/bullet/BulletDynamics/Dynamics/btSimpleDynamicsWorld.cpp
@@ -78,8 +78,8 @@ int		btSimpleDynamicsWorld::stepSimulation( btScalar timeStep,int maxSubSteps, b
 		btContactSolverInfo infoGlobal;
 		infoGlobal.m_timeStep = timeStep;
-		m_constraintSolver->solveGroup(&getCollisionObjectArray()[0],getNumCollisionObjects(),manifoldPtr, numManifolds,0,0,infoGlobal,m_debugDrawer, m_stackAlloc,m_dispatcher1);
-		m_constraintSolver->allSolved(infoGlobal,m_debugDrawer, m_stackAlloc);
+		m_constraintSolver->solveGroup(&getCollisionObjectArray()[0],getNumCollisionObjects(),manifoldPtr, numManifolds,0,0,infoGlobal,m_debugDrawer, m_dispatcher1);
+		m_constraintSolver->allSolved(infoGlobal,m_debugDrawer);
 	///integrate transforms
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBody.cpp b/src/bullet/BulletDynamics/Featherstone/btMultiBody.cpp
new file mode 100644
index 00000000..662d2bf8
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBody.cpp
@@ -0,0 +1,1959 @@
+ *   Class representing an articulated rigid body. Stores the body's
+ *   current state, allows forces and torques to be set, handles
+ *   timestepping and implements Featherstone's algorithm.
+ *   
+ *   Copyright (C) Stephen Thompson, <stephen@solarflare.org.uk>, 2011-2013
+ *   Portions written By Erwin Coumans: connection to LCP solver, various multibody constraints, replacing Eigen math library by Bullet LinearMath and a dedicated 6x6 matrix inverse (solveImatrix)
+ *   Portions written By Jakub Stepien: support for multi-DOF constraints, introduction of spatial algebra and several other improvements
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from the use of this software.
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+ 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+ */
+#include "btMultiBody.h"
+#include "btMultiBodyLink.h"
+#include "btMultiBodyLinkCollider.h"
+#include "btMultiBodyJointFeedback.h"
+#include "LinearMath/btTransformUtil.h"
+#include "LinearMath/btSerializer.h"
+#include "Bullet3Common/b3Logging.h"
+// #define INCLUDE_GYRO_TERM 
+///todo: determine if we need these options. If so, make a proper API, otherwise delete those globals
+bool gJointFeedbackInWorldSpace = false;
+bool gJointFeedbackInJointFrame = false;
+namespace {
+    const btScalar SLEEP_EPSILON = btScalar(0.05);  // this is a squared velocity (m^2 s^-2)
+    const btScalar SLEEP_TIMEOUT = btScalar(2);     // in seconds
+namespace {
+    // Re-expresses a spatial vector (a top/bottom pair of 3-vectors) in another frame.
+    //   rotation_matrix    : rotates vectors in the 'from' frame to vectors in the 'to' frame
+    //   displacement       : vector from the 'from' origin to the 'to' origin, in 'to' coordinates
+    //   top_in, bottom_in  : the two halves of the input vector
+    //   top_out, bottom_out: receive the two halves of the transformed vector
+    void SpatialTransform(const btMatrix3x3 &rotation_matrix,
+                          const btVector3 &displacement,
+                          const btVector3 &top_in,
+                          const btVector3 &bottom_in,
+                          btVector3 &top_out,
+                          btVector3 &bottom_out)
+    {
+        // The top half only rotates. It is written first because the bottom half depends on it.
+        top_out = rotation_matrix * top_in;
+        const btVector3 shift_term = -displacement.cross(top_out);
+        const btVector3 rotated_bottom = rotation_matrix * bottom_in;
+        bottom_out = shift_term + rotated_bottom;
+    }
+    // Counterpart of SpatialTransform: applies the transposed rotation, and adds
+    // displacement x top_in to the bottom half before rotating it.
+    void InverseSpatialTransform(const btMatrix3x3 &rotation_matrix,
+                                 const btVector3 &displacement,
+                                 const btVector3 &top_in,
+                                 const btVector3 &bottom_in,
+                                 btVector3 &top_out,
+                                 btVector3 &bottom_out)
+    {
+        const btMatrix3x3 rotation_transposed = rotation_matrix.transpose();
+        top_out = rotation_transposed * top_in;
+        const btVector3 shifted_bottom = bottom_in + displacement.cross(top_in);
+        bottom_out = rotation_transposed * shifted_bottom;
+    }
+    // Spatial scalar product of two top/bottom pairs: each operand's top half is
+    // paired with the other operand's bottom half and the two dot products are summed.
+    btScalar SpatialDotProduct(const btVector3 &a_top,
+                            const btVector3 &a_bottom,
+                            const btVector3 &b_top,
+                            const btVector3 &b_bottom)
+    {
+        const btScalar bottom_a_with_top_b = a_bottom.dot(b_top);
+        const btScalar top_a_with_bottom_b = a_top.dot(b_bottom);
+        return bottom_a_with_top_b + top_a_with_bottom_b;
+    }
+	// Spatial cross product of two top/bottom pairs:
+	//   top_out    = a_top x b_top
+	//   bottom_out = a_bottom x b_top + a_top x b_bottom
+	void SpatialCrossProduct(const btVector3 &a_top,
+                            const btVector3 &a_bottom,
+                            const btVector3 &b_top,
+                            const btVector3 &b_bottom,
+							btVector3 &top_out,
+							btVector3 &bottom_out)
+	{
+		top_out = a_top.cross(b_top);
+		const btVector3 bottom_a_cross_top_b = a_bottom.cross(b_top);
+		const btVector3 top_a_cross_bottom_b = a_top.cross(b_bottom);
+		bottom_out = bottom_a_cross_top_b + top_a_cross_bottom_b;
+	}
+// Implementation of class btMultiBody
+btMultiBody::btMultiBody(int n_links,  // number of links; the base is stored separately from m_links
+                     btScalar mass,  // mass of the base
+                     const btVector3 &inertia,  // stored in m_baseInertia
+                     bool fixedBase,
+                     bool canSleep,
+		     bool /*deprecatedUseMultiDof*/)  // unnamed parameter, so its value is ignored
+    : 
+    	m_baseCollider(0),
+		m_baseName(0),
+    	m_basePos(0,0,0),
+    	m_baseQuat(0, 0, 0, 1),  // presumably the identity rotation (x, y, z, w) -- TODO confirm component order
+      m_baseMass(mass),
+      m_baseInertia(inertia),
+		m_fixedBase(fixedBase),
+		m_awake(true),
+		m_canSleep(canSleep),
+		m_sleepTimer(0),
+		m_linearDamping(0.04f),
+		m_angularDamping(0.04f),
+		m_useGyroTerm(true),
+			m_maxAppliedImpulse(1000.f),
+		m_maxCoordinateVelocity(100.f),
+			m_hasSelfCollision(true),		
+		__posUpdated(false),
+			m_dofCount(0),  // incremented by the setup* calls that add joint DOFs
+		m_posVarCnt(0),  // incremented alongside m_dofCount by the setup* calls
+		m_useRK4(false), 	
+		m_useGlobalVelocities(false),
+		m_internalNeedsJointFeedback(false)
+	m_links.resize(n_links);
+	m_matrixBuf.resize(n_links + 1);  // one matrix for the base (slot 0) plus one per link
+    m_baseForce.setValue(0, 0, 0);
+    m_baseTorque.setValue(0, 0, 0);
+void btMultiBody::setupFixed(int i,  // index into m_links of the link being configured
+						   btScalar mass,
+						   const btVector3 &inertia,
+						   int parent,  // index of the parent link
+						   const btQuaternion &rotParentToThis,
+						   const btVector3 &parentComToThisPivotOffset,
+                           const btVector3 &thisPivotToThisComOffset, bool /*deprecatedDisableParentCollision*/)  // trailing bool is unnamed, so it is ignored
+	m_links[i].m_mass = mass;
+    m_links[i].m_inertiaLocal = inertia;
+    m_links[i].m_parent = parent;
+    m_links[i].m_zeroRotParentToThis = rotParentToThis;
+	m_links[i].m_dVector = thisPivotToThisComOffset;
+    m_links[i].m_eVector = parentComToThisPivotOffset;    
+	m_links[i].m_jointType = btMultibodyLink::eFixed;
+	m_links[i].m_dofCount = 0;  // a fixed joint adds no DOFs, so m_dofCount / m_posVarCnt are left untouched
+	m_links[i].m_posVarCount = 0;
+	m_links[i].updateCacheMultiDof();
+	updateLinksDofOffsets();
+void btMultiBody::setupPrismatic(int i,  // index into m_links of the link being configured
+                               btScalar mass,
+                               const btVector3 &inertia,
+                               int parent,  // index of the parent link
+                               const btQuaternion &rotParentToThis,
+                               const btVector3 &jointAxis,  // direction of sliding; becomes the bottom (linear) part of the joint axis
+                               const btVector3 &parentComToThisPivotOffset,
+							   const btVector3 &thisPivotToThisComOffset,
+							   bool disableParentCollision)
+	m_dofCount += 1;  // body-wide totals: one velocity DOF ...
+	m_posVarCnt += 1;  // ... and one position variable
+    m_links[i].m_mass = mass;
+    m_links[i].m_inertiaLocal = inertia;
+    m_links[i].m_parent = parent;
+    m_links[i].m_zeroRotParentToThis = rotParentToThis;
+    m_links[i].setAxisTop(0, 0., 0., 0.);  // top (angular) part is zero: sliding adds no rotation
+    m_links[i].setAxisBottom(0, jointAxis);
+    m_links[i].m_eVector = parentComToThisPivotOffset;
+	m_links[i].m_dVector = thisPivotToThisComOffset;
+    m_links[i].m_cachedRotParentToThis = rotParentToThis;
+	m_links[i].m_jointType = btMultibodyLink::ePrismatic;
+	m_links[i].m_dofCount = 1;
+	m_links[i].m_posVarCount = 1;	
+	m_links[i].m_jointPos[0] = 0.f;  // start at the zero joint position with no applied joint force
+	m_links[i].m_jointTorque[0] = 0.f;
+	if (disableParentCollision)
+	//
+	m_links[i].updateCacheMultiDof();
+	updateLinksDofOffsets();
+void btMultiBody::setupRevolute(int i,  // index into m_links of the link being configured
+                              btScalar mass,
+                              const btVector3 &inertia,
+                              int parent,  // index of the parent link
+                              const btQuaternion &rotParentToThis,
+                              const btVector3 &jointAxis,  // rotation axis; becomes the top (angular) part of the joint axis
+                              const btVector3 &parentComToThisPivotOffset,
+                              const btVector3 &thisPivotToThisComOffset,
+							  bool disableParentCollision)
+	m_dofCount += 1;  // body-wide totals: one velocity DOF ...
+	m_posVarCnt += 1;  // ... and one position variable
+    m_links[i].m_mass = mass;
+    m_links[i].m_inertiaLocal = inertia;
+    m_links[i].m_parent = parent;
+    m_links[i].m_zeroRotParentToThis = rotParentToThis;
+    m_links[i].setAxisTop(0, jointAxis);
+    m_links[i].setAxisBottom(0, jointAxis.cross(thisPivotToThisComOffset));  // bottom (linear) part: axis x pivot-to-COM offset
+    m_links[i].m_dVector = thisPivotToThisComOffset;
+    m_links[i].m_eVector = parentComToThisPivotOffset;
+	m_links[i].m_jointType = btMultibodyLink::eRevolute;
+	m_links[i].m_dofCount = 1;
+	m_links[i].m_posVarCount = 1;	
+	m_links[i].m_jointPos[0] = 0.f;  // start at the zero joint angle with no applied joint torque
+	m_links[i].m_jointTorque[0] = 0.f;
+	if (disableParentCollision)
+    //
+	m_links[i].updateCacheMultiDof();
+	//
+	updateLinksDofOffsets();
+void btMultiBody::setupSpherical(int i,  // index into m_links of the link being configured
+						   btScalar mass,
+						   const btVector3 &inertia,
+						   int parent,  // index of the parent link
+						   const btQuaternion &rotParentToThis,
+						   const btVector3 &parentComToThisPivotOffset,
+						   const btVector3 &thisPivotToThisComOffset,
+						   bool disableParentCollision)
+	m_dofCount += 3;  // three velocity DOFs ...
+	m_posVarCnt += 4;  // ... but four position variables (see the m_jointPos initialisation below)
+	m_links[i].m_mass = mass;
+    m_links[i].m_inertiaLocal = inertia;
+    m_links[i].m_parent = parent;
+    m_links[i].m_zeroRotParentToThis = rotParentToThis;    
+    m_links[i].m_dVector = thisPivotToThisComOffset;
+    m_links[i].m_eVector = parentComToThisPivotOffset;    
+	m_links[i].m_jointType = btMultibodyLink::eSpherical;
+	m_links[i].m_dofCount = 3;
+	m_links[i].m_posVarCount = 4;
+	m_links[i].setAxisTop(0, 1.f, 0.f, 0.f);  // the three angular axes are the unit X, Y and Z directions
+	m_links[i].setAxisTop(1, 0.f, 1.f, 0.f);
+	m_links[i].setAxisTop(2, 0.f, 0.f, 1.f);
+	m_links[i].setAxisBottom(0, m_links[i].getAxisTop(0).cross(thisPivotToThisComOffset));  // linear part of each axis: axis x pivot-to-COM offset
+	m_links[i].setAxisBottom(1, m_links[i].getAxisTop(1).cross(thisPivotToThisComOffset));
+	m_links[i].setAxisBottom(2, m_links[i].getAxisTop(2).cross(thisPivotToThisComOffset));
+	m_links[i].m_jointPos[0] = m_links[i].m_jointPos[1] = m_links[i].m_jointPos[2] = 0.f; m_links[i].m_jointPos[3] = 1.f;  // presumably the identity quaternion (x, y, z, w) -- TODO confirm
+	m_links[i].m_jointTorque[0] = m_links[i].m_jointTorque[1] = m_links[i].m_jointTorque[2] = 0.f;
+	if (disableParentCollision)
+	//
+	m_links[i].updateCacheMultiDof();	
+	//
+	updateLinksDofOffsets();
+// Configures link i as a planar joint: one rotational DOF about rotationAxis (axis 0) and
+// two translational DOFs (axes 1 and 2) along directions perpendicular to that axis.
+// Adds 3 to both the body-wide DOF count and the position-variable count.
+//   i                        : index into m_links of the link being configured
+//   mass, inertia            : stored as the link's mass and local inertia
+//   parent                   : index of the parent link
+//   rotParentToThis          : stored as the link's zero-position parent-to-link rotation
+//   rotationAxis             : axis of the rotational DOF; normalized here, so it must be non-zero
+//   parentComToThisComOffset : stored as m_eVector (m_dVector is set to zero)
+void btMultiBody::setupPlanar(int i,
+						   btScalar mass,
+						   const btVector3 &inertia,
+						   int parent,
+						   const btQuaternion &rotParentToThis,
+						   const btVector3 &rotationAxis,
+						   const btVector3 &parentComToThisComOffset,
+						   bool disableParentCollision)
+	m_dofCount += 3;
+	m_posVarCnt += 3;
+	m_links[i].m_mass = mass;
+    m_links[i].m_inertiaLocal = inertia;
+    m_links[i].m_parent = parent;
+    m_links[i].m_zeroRotParentToThis = rotParentToThis;
+	m_links[i].m_dVector.setZero();
+    m_links[i].m_eVector = parentComToThisComOffset;
+	//
+	// Pick a helper vector that is neither parallel nor anti-parallel to the rotation axis;
+	// the two translation axes are built from cross products with it. Testing the absolute
+	// value of the dot product also rejects an axis along -X, for which the cross product
+	// below would vanish and both translation axes would end up zero.
+	const btVector3 n = rotationAxis.normalized();
+	btVector3 vecNonParallelToRotAxis(1, 0, 0);
+	if(btFabs(n.dot(vecNonParallelToRotAxis)) > 0.999)
+		vecNonParallelToRotAxis.setValue(0, 1, 0);
+	//
+	m_links[i].m_jointType = btMultibodyLink::ePlanar;
+	m_links[i].m_dofCount = 3;
+	m_links[i].m_posVarCount = 3;
+	// Axis 0 is purely rotational; axes 1 and 2 are purely translational.
+	m_links[i].setAxisTop(0, n[0],n[1],n[2]);
+	m_links[i].setAxisTop(1,0,0,0);
+	m_links[i].setAxisTop(2,0,0,0);
+	m_links[i].setAxisBottom(0,0,0,0);
+	btVector3 cr = m_links[i].getAxisTop(0).cross(vecNonParallelToRotAxis);
+	m_links[i].setAxisBottom(1,cr[0],cr[1],cr[2]);
+	cr = m_links[i].getAxisBottom(1).cross(m_links[i].getAxisTop(0));
+	m_links[i].setAxisBottom(2,cr[0],cr[1],cr[2]);
+	m_links[i].m_jointPos[0] = m_links[i].m_jointPos[1] = m_links[i].m_jointPos[2] = 0.f;
+	m_links[i].m_jointTorque[0] = m_links[i].m_jointTorque[1] = m_links[i].m_jointTorque[2] = 0.f;
+	if (disableParentCollision)
+    //
+	m_links[i].updateCacheMultiDof();
+	//
+	updateLinksDofOffsets();
+void btMultiBody::finalizeMultiDof()  // sizes the internal work buffers from the current m_dofCount
+	m_deltaV.resize(0);  // presumably emptied first so the resize below re-initialises every entry -- TODO confirm
+	m_deltaV.resize(6 + m_dofCount);  // 6 base entries + one per joint DOF
+	m_realBuf.resize(6 + m_dofCount + m_dofCount*m_dofCount + 6 + m_dofCount);			//6 base vels + m_dofCount joint-space vels + m_dofCount^2 for "D" matrices + delta-pos vector (6 base "vels" + joint "vels")
+	m_vectorBuf.resize(2 * m_dofCount);													//two 3-vectors (i.e. one six-vector) for each system dof	("h" matrices)
+	updateLinksDofOffsets();
+int btMultiBody::getParent(int i) const  // parent index of link i; the frame-conversion helpers treat -1 as the base
+    return m_links[i].m_parent;
+btScalar btMultiBody::getLinkMass(int i) const  // mass of link i
+    return m_links[i].m_mass;
+const btVector3 & btMultiBody::getLinkInertia(int i) const  // local inertia vector of link i
+    return m_links[i].m_inertiaLocal;
+btScalar btMultiBody::getJointPos(int i) const  // first position variable only; see getJointPosMultiDof for the rest
+    return m_links[i].m_jointPos[0];
+btScalar btMultiBody::getJointVel(int i) const  // first velocity DOF only; see getJointVelMultiDof for the rest
+    return m_realBuf[6 + m_links[i].m_dofOffset];  // joint velocities follow the 6 base entries in m_realBuf
+btScalar * btMultiBody::getJointPosMultiDof(int i)  // pointer to link i's position variables
+	return &m_links[i].m_jointPos[0];
+btScalar * btMultiBody::getJointVelMultiDof(int i)  // pointer to link i's velocity DOFs inside m_realBuf
+	return &m_realBuf[6 + m_links[i].m_dofOffset];
+const btScalar * btMultiBody::getJointPosMultiDof(int i) const  // read-only overload
+	return &m_links[i].m_jointPos[0];
+const btScalar * btMultiBody::getJointVelMultiDof(int i) const  // read-only overload
+	return &m_realBuf[6 + m_links[i].m_dofOffset];
+void btMultiBody::setJointPos(int i, btScalar q)  // sets the first position variable of link i
+    m_links[i].m_jointPos[0] = q;
+    m_links[i].updateCacheMultiDof();  // refresh the link's cached data after the position change
+void btMultiBody::setJointPosMultiDof(int i, btScalar *q)  // q must hold m_posVarCount values for link i
+	for(int pos = 0; pos < m_links[i].m_posVarCount; ++pos)
+		m_links[i].m_jointPos[pos] = q[pos];
+    m_links[i].updateCacheMultiDof();  // refresh the link's cached data after the position change
+void btMultiBody::setJointVel(int i, btScalar qdot)  // sets the first velocity DOF of link i
+    m_realBuf[6 + m_links[i].m_dofOffset] = qdot;
+void btMultiBody::setJointVelMultiDof(int i, btScalar *qdot)  // qdot must hold m_dofCount values for link i
+	for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		m_realBuf[6 + m_links[i].m_dofOffset + dof] = qdot[dof];
+const btVector3 & btMultiBody::getRVector(int i) const  // cached offset used as the parent-to-link displacement in the frame transforms
+    return m_links[i].m_cachedRVector;
+const btQuaternion & btMultiBody::getParentToLocalRot(int i) const  // cached rotation from the parent frame into link i's frame
+    return m_links[i].m_cachedRotParentToThis;
+btVector3 btMultiBody::localPosToWorld(int i, const btVector3 &local_pos) const
+    btVector3 result = local_pos;
+    while (i != -1) {
+        // 'result' is in frame i. transform it to frame parent(i)
+        result += getRVector(i);
+        result = quatRotate(getParentToLocalRot(i).inverse(),result);
+        i = getParent(i);
+    }
+    // 'result' is now in the base frame. transform it to world frame
+    result = quatRotate(getWorldToBaseRot().inverse() ,result);
+    result += getBasePos();
+    return result;
+// Converts a world-space point into link i's frame (i == -1 means the base frame).
+// Recurses up the parent chain: the point is first expressed in the parent's frame.
+btVector3 btMultiBody::worldPosToLocal(int i, const btVector3 &world_pos) const
+    if (i != -1) {
+        // position in the parent frame, then rotated and shifted into this link's frame
+        const btVector3 posInParent = worldPosToLocal(getParent(i), world_pos);
+        return quatRotate(getParentToLocalRot(i), posInParent) - getRVector(i);
+    }
+    // world to base
+    return quatRotate(getWorldToBaseRot(), world_pos - getBasePos());
+btVector3 btMultiBody::localDirToWorld(int i, const btVector3 &local_dir) const
+    btVector3 result = local_dir;
+    while (i != -1) {
+        result = quatRotate(getParentToLocalRot(i).inverse() , result);
+        i = getParent(i);
+    }
+    result = quatRotate(getWorldToBaseRot().inverse() , result);
+    return result;
+// Rotates a world-space direction into link i's frame (i == -1 means the base frame).
+// Recurses up the parent chain; only rotations are applied.
+btVector3 btMultiBody::worldDirToLocal(int i, const btVector3 &world_dir) const
+    if (i != -1) {
+        const btVector3 dirInParent = worldDirToLocal(getParent(i), world_dir);
+        return quatRotate(getParentToLocalRot(i), dirInParent);
+    }
+    return quatRotate(getWorldToBaseRot(), world_dir);
+// Rotates each column of local_frame from link i's frame into the world frame and
+// returns the matrix whose columns are the rotated vectors.
+btMatrix3x3 btMultiBody::localFrameToWorld(int i, const btMatrix3x3 &local_frame) const
+    const btVector3 col0 = localDirToWorld(i, local_frame.getColumn(0));
+    const btVector3 col1 = localDirToWorld(i, local_frame.getColumn(1));
+    const btVector3 col2 = localDirToWorld(i, local_frame.getColumn(2));
+    // The constructor takes its entries row by row, so the columns are interleaved here.
+    return btMatrix3x3(col0.x(), col1.x(), col2.x(),
+                       col0.y(), col1.y(), col2.y(),
+                       col0.z(), col1.z(), col2.z());
+// Computes the angular (omega) and linear (vel) velocity of the base and of every link,
+// each expressed in that body's own local frame.
+//   omega, vel : output arrays with getNumLinks() + 1 entries each; entry 0 is the base,
+//                entry i + 1 is link i.
+// Links are processed in index order and read their parent's entry, so a parent's entry
+// must already be filled when its child is reached.
+void btMultiBody::compTreeLinkVelocities(btVector3 *omega, btVector3 *vel) const
+	const int num_links = getNumLinks();
+    // Base velocities rotated into the base frame
+    omega[0] = quatRotate(m_baseQuat ,getBaseOmega());
+    vel[0] = quatRotate(m_baseQuat ,getBaseVel());
+    for (int i = 0; i < num_links; ++i) {
+        const int parent = m_links[i].m_parent;
+        // transform parent vel into this frame, store in omega[i+1], vel[i+1]
+        SpatialTransform(btMatrix3x3(m_links[i].m_cachedRotParentToThis), m_links[i].m_cachedRVector,
+                         omega[parent+1], vel[parent+1],
+                         omega[i+1], vel[i+1]);
+        // now add qdot * shat for every degree of freedom of the joint. Summing over all
+        // DOFs (rather than only DOF 0) is required for spherical and planar joints, and
+        // leaves fixed joints (0 DOFs) without any joint contribution.
+        const btScalar *jointVel = getJointVelMultiDof(i);
+        for (int dof = 0; dof < m_links[i].m_dofCount; ++dof) {
+            omega[i+1] += jointVel[dof] * m_links[i].getAxisTop(dof);
+            vel[i+1] += jointVel[dof] * m_links[i].getAxisBottom(dof);
+        }
+    }
+btScalar btMultiBody::getKineticEnergy() const
+	int num_links = getNumLinks();
+    // TODO: would be better not to allocate memory here
+    btAlignedObjectArray<btVector3> omega;omega.resize(num_links+1);
+	btAlignedObjectArray<btVector3> vel;vel.resize(num_links+1);
+    compTreeLinkVelocities(&omega[0], &vel[0]);
+    // we will do the factor of 0.5 at the end
+    btScalar result = m_baseMass * vel[0].dot(vel[0]);
+    result += omega[0].dot(m_baseInertia * omega[0]);
+    for (int i = 0; i < num_links; ++i) {
+        result += m_links[i].m_mass * vel[i+1].dot(vel[i+1]);
+        result += omega[i+1].dot(m_links[i].m_inertiaLocal * omega[i+1]);
+    }
+    return 0.5f * result;
+btVector3 btMultiBody::getAngularMomentum() const
+	int num_links = getNumLinks();
+    // TODO: would be better not to allocate memory here
+    btAlignedObjectArray<btVector3> omega;omega.resize(num_links+1);
+	btAlignedObjectArray<btVector3> vel;vel.resize(num_links+1);
+    btAlignedObjectArray<btQuaternion> rot_from_world;rot_from_world.resize(num_links+1);
+    compTreeLinkVelocities(&omega[0], &vel[0]);
+    rot_from_world[0] = m_baseQuat;
+    btVector3 result = quatRotate(rot_from_world[0].inverse() , (m_baseInertia * omega[0]));
+    for (int i = 0; i < num_links; ++i) {
+        rot_from_world[i+1] = m_links[i].m_cachedRotParentToThis * rot_from_world[m_links[i].m_parent+1];
+        result += (quatRotate(rot_from_world[i+1].inverse() , (m_links[i].m_inertiaLocal * omega[i+1])));
+    }
+    return result;
+void btMultiBody::clearConstraintForces()
+	m_baseConstraintForce.setValue(0, 0, 0);
+	m_baseConstraintTorque.setValue(0, 0, 0);
+    for (int i = 0; i < getNumLinks(); ++i) {
+        m_links[i].m_appliedConstraintForce.setValue(0, 0, 0);
+        m_links[i].m_appliedConstraintTorque.setValue(0, 0, 0);
+    }
+void btMultiBody::clearForcesAndTorques()
+    m_baseForce.setValue(0, 0, 0);
+    m_baseTorque.setValue(0, 0, 0);
+    for (int i = 0; i < getNumLinks(); ++i) {
+        m_links[i].m_appliedForce.setValue(0, 0, 0);
+        m_links[i].m_appliedTorque.setValue(0, 0, 0);
+		m_links[i].m_jointTorque[0] = m_links[i].m_jointTorque[1] = m_links[i].m_jointTorque[2] = m_links[i].m_jointTorque[3] = m_links[i].m_jointTorque[4] = m_links[i].m_jointTorque[5] = 0.f;
+    }
+// Zeroes the 6 base velocity entries and every joint velocity stored at the front of m_realBuf.
+void btMultiBody::clearVelocities()
+	// Joint velocities occupy one slot per degree of freedom, not one per link, so the
+	// range is 6 + m_dofCount. Counting links would leave velocities of multi-DOF joints
+	// untouched, and would overrun into the data after the velocities when fixed joints
+	// make the link count exceed the DOF count.
+	for (int i = 0; i < 6 + m_dofCount; ++i) 
+	{
+		m_realBuf[i] = 0.f;
+	}
+void btMultiBody::addLinkForce(int i, const btVector3 &f)  // accumulates an applied force on link i
+    m_links[i].m_appliedForce += f;
+void btMultiBody::addLinkTorque(int i, const btVector3 &t)  // accumulates an applied torque on link i
+    m_links[i].m_appliedTorque += t;
+void btMultiBody::addLinkConstraintForce(int i, const btVector3 &f)  // accumulates into the separate constraint-force accumulator of link i
+    m_links[i].m_appliedConstraintForce += f;
+void btMultiBody::addLinkConstraintTorque(int i, const btVector3 &t)  // accumulates into the separate constraint-torque accumulator of link i
+    m_links[i].m_appliedConstraintTorque += t;
+void btMultiBody::addJointTorque(int i, btScalar Q)  // accumulates onto the first joint DOF of link i only
+    m_links[i].m_jointTorque[0] += Q;
+void btMultiBody::addJointTorqueMultiDof(int i, int dof, btScalar Q)  // accumulates onto one DOF of link i; dof is not range-checked here
+	m_links[i].m_jointTorque[dof] += Q;
+// Accumulates one joint torque/force value per degree of freedom of link i's joint.
+//   i : index into m_links
+//   Q : array holding at least m_links[i].m_dofCount values
+void btMultiBody::addJointTorqueMultiDof(int i, const btScalar *Q)
+	for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		m_links[i].m_jointTorque[dof] += Q[dof];	// accumulate like the other add* methods; plain assignment discarded torque added earlier
+const btVector3 & btMultiBody::getLinkForce(int i) const  // applied force accumulated on link i
+    return m_links[i].m_appliedForce;
+const btVector3 & btMultiBody::getLinkTorque(int i) const  // applied torque accumulated on link i
+    return m_links[i].m_appliedTorque;
+btScalar btMultiBody::getJointTorque(int i) const  // first joint DOF only; see getJointTorqueMultiDof for the rest
+    return m_links[i].m_jointTorque[0];
+btScalar * btMultiBody::getJointTorqueMultiDof(int i)  // pointer to link i's joint-torque array (writable)
+    return &m_links[i].m_jointTorque[0];
+// Outer product v0 * v1^T: entry (r, c) of the result is v0[r] * v1[c].
+// Renamed from vecMulVecTranspose (http://en.wikipedia.org/wiki/Outer_product);
+// maybe it should be moved to btVector3 like dot and cross?
+inline btMatrix3x3 outerProduct(const btVector3& v0, const btVector3& v1)
+		return btMatrix3x3(
+			v0.x() * v1.x(), v0.x() * v1.y(), v0.x() * v1.z(),
+			v0.y() * v1.x(), v0.y() * v1.y(), v0.y() * v1.z(),
+			v0.z() * v1.x(), v0.z() * v1.y(), v0.z() * v1.z());
+#define vecMulVecTranspose(v0, v1Transposed) outerProduct(v0, v1Transposed)
+void btMultiBody::computeAccelerationsArticulatedBodyAlgorithmMultiDof(btScalar dt,
+                               btAlignedObjectArray<btScalar> &scratch_r,
+                               btAlignedObjectArray<btVector3> &scratch_v,
+                               btAlignedObjectArray<btMatrix3x3> &scratch_m,
+				bool isConstraintPass)
+    // Implement Featherstone's algorithm to calculate joint accelerations (q_double_dot)
+    // and the base linear & angular accelerations.
+    // We apply damping forces in this routine as well as any external forces specified by the 
+    // caller (via addBaseForce etc).
+    // output should point to an array of 6 + num_links reals.
+    // Format is: 3 angular accelerations (in world frame), 3 linear accelerations (in world frame),
+    // num_links joint acceleration values.
+	// We added support for multi degree of freedom (multi dof) joints.
+	// In addition we also can compute the joint reaction forces. This is performed in a second pass,
+	// so that we can include the effect of the constraint solver forces (computed in the PGS LCP solver)
+	m_internalNeedsJointFeedback = false;
+	int num_links = getNumLinks();
+    const btScalar DAMPING_K1_LINEAR = m_linearDamping;
+	const btScalar DAMPING_K2_LINEAR = m_linearDamping;
+	const btScalar DAMPING_K1_ANGULAR = m_angularDamping;
+	const btScalar DAMPING_K2_ANGULAR= m_angularDamping;
+    btVector3 base_vel = getBaseVel();
+    btVector3 base_omega = getBaseOmega();
+    // Temporary matrices/vectors -- use scratch space from caller
+    // so that we don't have to keep reallocating every frame
+    scratch_r.resize(2*m_dofCount + 6);				//multidof? ("Y"s use it and it is used to store qdd) => 2 x m_dofCount
+    scratch_v.resize(8*num_links + 6);
+    scratch_m.resize(4*num_links + 4);
+	//btScalar * r_ptr = &scratch_r[0];
+    btScalar * output = &scratch_r[m_dofCount];  // "output" holds the q_double_dot results
+    btVector3 * v_ptr = &scratch_v[0];
+    // vhat_i  (top = angular, bottom = linear part)	
+	btSpatialMotionVector *spatVel = (btSpatialMotionVector *)v_ptr;
+	v_ptr += num_links * 2 + 2;
+	//
+    // zhat_i^A    
+	btSpatialForceVector * zeroAccSpatFrc = (btSpatialForceVector *)v_ptr;
+	v_ptr += num_links * 2 + 2;
+	//
+    // chat_i  (note NOT defined for the base)    
+	btSpatialMotionVector * spatCoriolisAcc = (btSpatialMotionVector *)v_ptr;
+	v_ptr += num_links * 2;
+	//
+    // Ihat_i^A.    
+	btSymmetricSpatialDyad * spatInertia = (btSymmetricSpatialDyad *)&scratch_m[num_links + 1];
+    // Cached 3x3 rotation matrices from parent frame to this frame.
+    btMatrix3x3 * rot_from_parent = &m_matrixBuf[0];
+    btMatrix3x3 * rot_from_world = &scratch_m[0];
+    // hhat_i, ahat_i
+    // hhat is NOT stored for the base (but ahat is)    
+	btSpatialForceVector * h = (btSpatialForceVector *)(m_dofCount > 0 ? &m_vectorBuf[0] : 0);
+	btSpatialMotionVector * spatAcc = (btSpatialMotionVector *)v_ptr;
+	v_ptr += num_links * 2 + 2;
+	//
+    // Y_i, invD_i
+    btScalar * invD = m_dofCount > 0 ? &m_realBuf[6 + m_dofCount] : 0;
+	btScalar * Y = &scratch_r[0];
+	//
+	//aux variables	
+	btSpatialMotionVector spatJointVel;					//spatial velocity due to the joint motion (i.e. without predecessors' influence)
+	btScalar D[36];										//"D" matrix; it's dofxdof for each body so asingle 6x6 D matrix will do	
+	btScalar invD_times_Y[6];							//D^{-1} * Y [dofxdof x dofx1 = dofx1] <=> D^{-1} * u; better moved to buffers since it is recalced in calcAccelerationDeltasMultiDof; num_dof of btScalar would cover all bodies	
+	btSpatialMotionVector result;							//holds results of the SolveImatrix op; it is a spatial motion vector (accel)
+	btScalar Y_minus_hT_a[6];							//Y - h^{T} * a; it's dofx1 for each body so a single 6x1 temp is enough	
+	btSpatialForceVector spatForceVecTemps[6];				//6 temporary spatial force vectors
+	btSpatialTransformationMatrix fromParent;				//spatial transform from parent to child
+	btSymmetricSpatialDyad dyadTemp;						//inertia matrix temp
+	btSpatialTransformationMatrix fromWorld;
+	fromWorld.m_trnVec.setZero();
+	/////////////////
+    // ptr to the joint accel part of the output
+    btScalar * joint_accel = output + 6;
+    // Start of the algorithm proper.
+    // First 'upward' loop.
+    // Combines CompTreeLinkVelocities and InitTreeLinks from Mirtich.
+    rot_from_parent[0] = btMatrix3x3(m_baseQuat);				//m_baseQuat assumed to be alias!?
+	//create the vector of spatial velocity of the base by transforming global-coor linear and angular velocities into base-local coordinates
+	spatVel[0].setVector(rot_from_parent[0] * base_omega, rot_from_parent[0] * base_vel);
+    if (m_fixedBase) 
+	{        
+		zeroAccSpatFrc[0].setZero();
+    }
+	else 
+	{
+		btVector3 baseForce = isConstraintPass? m_baseConstraintForce : m_baseForce;
+		btVector3 baseTorque = isConstraintPass? m_baseConstraintTorque : m_baseTorque;
+		//external forces		
+		zeroAccSpatFrc[0].setVector(-(rot_from_parent[0] * baseTorque), -(rot_from_parent[0] * baseForce));	
+		//adding damping terms (only)
+		btScalar linDampMult = 1., angDampMult = 1.;
+		zeroAccSpatFrc[0].addVector(angDampMult * m_baseInertia * spatVel[0].getAngular() * (DAMPING_K1_ANGULAR + DAMPING_K2_ANGULAR * spatVel[0].getAngular().norm()),
+								   linDampMult * m_baseMass * spatVel[0].getLinear() * (DAMPING_K1_LINEAR + DAMPING_K2_LINEAR * spatVel[0].getLinear().norm()));
+		//
+		//p += vhat x Ihat vhat - done in a simpler way
+		if (m_useGyroTerm)
+			zeroAccSpatFrc[0].addAngular(spatVel[0].getAngular().cross(m_baseInertia * spatVel[0].getAngular()));
+		//
+		zeroAccSpatFrc[0].addLinear(m_baseMass * spatVel[0].getAngular().cross(spatVel[0].getLinear()));
+    }
+	//init the spatial AB inertia (it has the simple form thanks to choosing local body frames origins at their COMs)
+	spatInertia[0].setMatrix(	btMatrix3x3(0,0,0,0,0,0,0,0,0),
+								//
+								btMatrix3x3(m_baseMass, 0, 0,
+											0, m_baseMass, 0,
+											0, 0, m_baseMass),
+								//
+								btMatrix3x3(m_baseInertia[0], 0, 0,
+											0, m_baseInertia[1], 0,
+											0, 0, m_baseInertia[2])
+							);
+    rot_from_world[0] = rot_from_parent[0];
+	//
+    for (int i = 0; i < num_links; ++i) {		
+        const int parent = m_links[i].m_parent;
+        rot_from_parent[i+1] = btMatrix3x3(m_links[i].m_cachedRotParentToThis);
+        rot_from_world[i+1] = rot_from_parent[i+1] * rot_from_world[parent+1];
+		fromParent.m_rotMat = rot_from_parent[i+1]; fromParent.m_trnVec = m_links[i].m_cachedRVector;
+		fromWorld.m_rotMat = rot_from_world[i+1];
+		fromParent.transform(spatVel[parent+1], spatVel[i+1]);
+		// now set vhat_i to its true value by doing
+        // vhat_i += qidot * shat_i			
+		if(!m_useGlobalVelocities)
+		{
+			spatJointVel.setZero();
+			for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)		
+				spatJointVel += m_links[i].m_axes[dof] * getJointVelMultiDof(i)[dof];
+			// remember vhat_i is really vhat_p(i) (but in current frame) at this point	=> we need to add velocity across the inboard joint
+			spatVel[i+1] += spatJointVel;
+			//
+			// vhat_i is vhat_p(i) transformed to local coors + the velocity across the i-th inboard joint
+			//spatVel[i+1] = fromParent * spatVel[parent+1] + spatJointVel;
+		}
+		else
+		{
+			fromWorld.transformRotationOnly(m_links[i].m_absFrameTotVelocity, spatVel[i+1]);
+			fromWorld.transformRotationOnly(m_links[i].m_absFrameLocVelocity, spatJointVel);
+		}
+		// we can now calculate chat_i 		
+		spatVel[i+1].cross(spatJointVel, spatCoriolisAcc[i]);		
+        // calculate zhat_i^A
+		//
+		//external forces		
+		btVector3 linkAppliedForce = isConstraintPass? m_links[i].m_appliedConstraintForce : m_links[i].m_appliedForce;
+		btVector3 linkAppliedTorque =isConstraintPass ? m_links[i].m_appliedConstraintTorque : m_links[i].m_appliedTorque;
+		zeroAccSpatFrc[i+1].setVector(-(rot_from_world[i+1] * linkAppliedTorque), -(rot_from_world[i+1] * linkAppliedForce ));
+#if 0	
+		{
+			b3Printf("stepVelocitiesMultiDof zeroAccSpatFrc[%d] linear:%f,%f,%f, angular:%f,%f,%f",
+			i+1,
+			zeroAccSpatFrc[i+1].m_topVec[0],
+			zeroAccSpatFrc[i+1].m_topVec[1],
+			zeroAccSpatFrc[i+1].m_topVec[2],
+			zeroAccSpatFrc[i+1].m_bottomVec[0],
+			zeroAccSpatFrc[i+1].m_bottomVec[1],
+			zeroAccSpatFrc[i+1].m_bottomVec[2]);
+		}
+		//
+		//adding damping terms (only)
+		btScalar linDampMult = 1., angDampMult = 1.;
+		zeroAccSpatFrc[i+1].addVector(angDampMult * m_links[i].m_inertiaLocal * spatVel[i+1].getAngular() * (DAMPING_K1_ANGULAR + DAMPING_K2_ANGULAR * spatVel[i+1].getAngular().norm()),
+									 linDampMult * m_links[i].m_mass * spatVel[i+1].getLinear() * (DAMPING_K1_LINEAR + DAMPING_K2_LINEAR * spatVel[i+1].getLinear().norm()));
+        // calculate Ihat_i^A
+		//init the spatial AB inertia (it has the simple form thanks to choosing local body frames origins at their COMs)
+		spatInertia[i+1].setMatrix(	btMatrix3x3(0,0,0,0,0,0,0,0,0),
+									//
+									btMatrix3x3(m_links[i].m_mass, 0, 0,
+												0, m_links[i].m_mass, 0,
+												0, 0, m_links[i].m_mass),
+									//
+									btMatrix3x3(m_links[i].m_inertiaLocal[0], 0, 0,
+												0, m_links[i].m_inertiaLocal[1], 0,
+												0, 0, m_links[i].m_inertiaLocal[2])
+								);
+		//
+		//p += vhat x Ihat vhat - done in a simpler way
+		if(m_useGyroTerm)
+			zeroAccSpatFrc[i+1].addAngular(spatVel[i+1].getAngular().cross(m_links[i].m_inertiaLocal * spatVel[i+1].getAngular()));			
+		//		
+		zeroAccSpatFrc[i+1].addLinear(m_links[i].m_mass * spatVel[i+1].getAngular().cross(spatVel[i+1].getLinear()));
+		//btVector3 temp = m_links[i].m_mass * spatVel[i+1].getAngular().cross(spatVel[i+1].getLinear());
+		////clamp parent's omega
+		//btScalar parOmegaMod = temp.length();
+		//btScalar parOmegaModMax = 1000;
+		//if(parOmegaMod > parOmegaModMax)
+		//	temp *= parOmegaModMax / parOmegaMod;
+		//zeroAccSpatFrc[i+1].addLinear(temp);
+		//printf("|zeroAccSpatFrc[%d]| = %.4f\n", i+1, temp.length());
+		//temp = spatCoriolisAcc[i].getLinear();
+		//printf("|spatCoriolisAcc[%d]| = %.4f\n", i+1, temp.length());
+		//printf("w[%d] = [%.4f %.4f %.4f]\n", i, vel_top_angular[i+1].x(), vel_top_angular[i+1].y(), vel_top_angular[i+1].z());
+		//printf("v[%d] = [%.4f %.4f %.4f]\n", i, vel_bottom_linear[i+1].x(), vel_bottom_linear[i+1].y(), vel_bottom_linear[i+1].z());		
+		//printf("c[%d] = [%.4f %.4f %.4f]\n", i, coriolis_bottom_linear[i].x(), coriolis_bottom_linear[i].y(), coriolis_bottom_linear[i].z());
+    }
+    // 'Downward' loop.
+    // (part of TreeForwardDynamics in Mirtich.)
+    for (int i = num_links - 1; i >= 0; --i)
+	{
+		const int parent = m_links[i].m_parent;
+		fromParent.m_rotMat = rot_from_parent[i+1]; fromParent.m_trnVec = m_links[i].m_cachedRVector;
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{
+			btSpatialForceVector &hDof = h[m_links[i].m_dofOffset + dof];
+			//
+			hDof = spatInertia[i+1] * m_links[i].m_axes[dof];
+			//
+			Y[m_links[i].m_dofOffset + dof] = m_links[i].m_jointTorque[dof]
+			- m_links[i].m_axes[dof].dot(zeroAccSpatFrc[i+1])
+			- spatCoriolisAcc[i].dot(hDof)
+			;
+		}
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{
+			btScalar *D_row = &D[dof * m_links[i].m_dofCount];			
+			for(int dof2 = 0; dof2 < m_links[i].m_dofCount; ++dof2)
+			{
+				btSpatialForceVector &hDof2 = h[m_links[i].m_dofOffset + dof2];
+				D_row[dof2] = m_links[i].m_axes[dof].dot(hDof2);
+			}
+		}
+        btScalar *invDi = &invD[m_links[i].m_dofOffset*m_links[i].m_dofOffset];
+		switch(m_links[i].m_jointType)
+		{
+			case btMultibodyLink::ePrismatic:
+			case btMultibodyLink::eRevolute:
+			{
+				invDi[0] = 1.0f / D[0];
+				break;
+			}
+			case btMultibodyLink::eSpherical:
+			case btMultibodyLink::ePlanar:
+			{
+				btMatrix3x3 D3x3; D3x3.setValue(D[0], D[1], D[2], D[3], D[4], D[5], D[6], D[7], D[8]);
+				btMatrix3x3 invD3x3; invD3x3 = D3x3.inverse();
+				//unroll the loop?
+				for(int row = 0; row < 3; ++row)
+				{
+					for(int col = 0; col < 3; ++col)
+					{						
+						invDi[row * 3 + col] = invD3x3[row][col];
+					}
+				}
+				break;
+			}
+			default:
+			{
+			}
+		}
+		//determine h*D^{-1}
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{
+			spatForceVecTemps[dof].setZero();
+			for(int dof2 = 0; dof2 < m_links[i].m_dofCount; ++dof2)
+			{				
+				btSpatialForceVector &hDof2 = h[m_links[i].m_dofOffset + dof2];
+				//				
+				spatForceVecTemps[dof] += hDof2 * invDi[dof2 * m_links[i].m_dofCount + dof];
+			}
+		}
+		dyadTemp = spatInertia[i+1];
+		//determine (h*D^{-1}) * h^{T}
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{			
+			btSpatialForceVector &hDof = h[m_links[i].m_dofOffset + dof];
+			//
+			dyadTemp -= symmetricSpatialOuterProduct(hDof, spatForceVecTemps[dof]);
+		}
+		fromParent.transformInverse(dyadTemp, spatInertia[parent+1], btSpatialTransformationMatrix::Add);
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{
+			invD_times_Y[dof] = 0.f;
+			for(int dof2 = 0; dof2 < m_links[i].m_dofCount; ++dof2)
+			{
+				invD_times_Y[dof] += invDi[dof * m_links[i].m_dofCount + dof2] * Y[m_links[i].m_dofOffset + dof2];				
+			}	
+		}
+		spatForceVecTemps[0] = zeroAccSpatFrc[i+1] + spatInertia[i+1] * spatCoriolisAcc[i];		
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{				
+			btSpatialForceVector &hDof = h[m_links[i].m_dofOffset + dof];
+			//
+			spatForceVecTemps[0] += hDof * invD_times_Y[dof];			
+		}
+		fromParent.transformInverse(spatForceVecTemps[0], spatForceVecTemps[1]);
+		zeroAccSpatFrc[parent+1] += spatForceVecTemps[1];
+    }
+    // Second 'upward' loop
+    // (part of TreeForwardDynamics in Mirtich)
+    if (m_fixedBase) 
+	{
+        spatAcc[0].setZero();
+    } 
+	else 
+	{
+        if (num_links > 0) 
+		{
+			m_cachedInertiaTopLeft = spatInertia[0].m_topLeftMat;
+			m_cachedInertiaTopRight = spatInertia[0].m_topRightMat;
+			m_cachedInertiaLowerLeft = spatInertia[0].m_bottomLeftMat;
+			m_cachedInertiaLowerRight= spatInertia[0].m_topLeftMat.transpose();
+        }		
+		solveImatrix(zeroAccSpatFrc[0], result);
+		spatAcc[0] = -result;
+    }
+    // now do the loop over the m_links
+    for (int i = 0; i < num_links; ++i) 
+	{
+		//	qdd = D^{-1} * (Y - h^{T}*apar) = (S^{T}*I*S)^{-1} * (tau - S^{T}*I*cor - S^{T}*zeroAccFrc - S^{T}*I*apar)
+		//	a = apar + cor + Sqdd
+		//or
+		//	qdd = D^{-1} * (Y - h^{T}*(apar+cor))
+		//	a = apar + Sqdd
+        const int parent = m_links[i].m_parent;
+		fromParent.m_rotMat = rot_from_parent[i+1]; fromParent.m_trnVec = m_links[i].m_cachedRVector;
+		fromParent.transform(spatAcc[parent+1], spatAcc[i+1]);
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{
+			btSpatialForceVector &hDof = h[m_links[i].m_dofOffset + dof];
+			//			
+			Y_minus_hT_a[dof] = Y[m_links[i].m_dofOffset + dof] - spatAcc[i+1].dot(hDof);			
+		}
+		btScalar *invDi = &invD[m_links[i].m_dofOffset*m_links[i].m_dofOffset];
+		//D^{-1} * (Y - h^{T}*apar)
+		mulMatrix(invDi, Y_minus_hT_a, m_links[i].m_dofCount, m_links[i].m_dofCount, m_links[i].m_dofCount, 1, &joint_accel[m_links[i].m_dofOffset]);
+		spatAcc[i+1] += spatCoriolisAcc[i];		
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)		
+			spatAcc[i+1] += m_links[i].m_axes[dof] * joint_accel[m_links[i].m_dofOffset + dof];
+		if (m_links[i].m_jointFeedback)
+		{
+			m_internalNeedsJointFeedback = true;
+			btVector3 angularBotVec = (spatInertia[i+1]*spatAcc[i+1]+zeroAccSpatFrc[i+1]).m_bottomVec;
+			btVector3 linearTopVec = (spatInertia[i+1]*spatAcc[i+1]+zeroAccSpatFrc[i+1]).m_topVec;
+			if (gJointFeedbackInJointFrame)
+			{
+				//shift the reaction forces to the joint frame
+				//linear (force) component is the same
+				//shift the angular (torque, moment) component using the relative position,  m_links[i].m_dVector
+				 angularBotVec = angularBotVec - linearTopVec.cross(m_links[i].m_dVector);
+			}
+			if (gJointFeedbackInWorldSpace)
+			{
+				if (isConstraintPass)
+				{
+ m_links[i].m_jointFeedback->m_reactionForces.m_bottomVec += m_links[i].m_cachedWorldTransform.getBasis()*angularBotVec;
+                                        m_links[i].m_jointFeedback->m_reactionForces.m_topVec += m_links[i].m_cachedWorldTransform.getBasis()*linearTopVec;
+				} else
+				{
+					m_links[i].m_jointFeedback->m_reactionForces.m_bottomVec = m_links[i].m_cachedWorldTransform.getBasis()*angularBotVec;
+					m_links[i].m_jointFeedback->m_reactionForces.m_topVec = m_links[i].m_cachedWorldTransform.getBasis()*linearTopVec;
+				}
+			} else
+			{
+				if (isConstraintPass)
+				{
+					  m_links[i].m_jointFeedback->m_reactionForces.m_bottomVec += angularBotVec;                        
+                                m_links[i].m_jointFeedback->m_reactionForces.m_topVec += linearTopVec;
+				}
+				else
+				{
+				m_links[i].m_jointFeedback->m_reactionForces.m_bottomVec = angularBotVec;
+				m_links[i].m_jointFeedback->m_reactionForces.m_topVec = linearTopVec;
+				}		
+			}	
+	}
+    }
+    // transform base accelerations back to the world frame.
+    btVector3 omegadot_out = rot_from_parent[0].transpose() * spatAcc[0].getAngular();
+	output[0] = omegadot_out[0];
+	output[1] = omegadot_out[1];
+	output[2] = omegadot_out[2];
+    btVector3 vdot_out = rot_from_parent[0].transpose() * (spatAcc[0].getLinear() + spatVel[0].getAngular().cross(spatVel[0].getLinear()));
+	output[3] = vdot_out[0];
+	output[4] = vdot_out[1];
+	output[5] = vdot_out[2];
+	/////////////////
+	//printf("q = [");
+	//printf("%.6f, %.6f, %.6f, %.6f, %.6f, %.6f, %.6f ", m_baseQuat.x(), m_baseQuat.y(), m_baseQuat.z(), m_baseQuat.w(), m_basePos.x(), m_basePos.y(), m_basePos.z());
+	//for(int link = 0; link < getNumLinks(); ++link)
+	//	for(int dof = 0; dof < m_links[link].m_dofCount; ++dof)
+	//		printf("%.6f ", m_links[link].m_jointPos[dof]);
+	//printf("]\n");
+	////
+	//printf("qd = [");
+	//for(int dof = 0; dof < getNumDofs() + 6; ++dof)
+	//	printf("%.6f ", m_realBuf[dof]);
+	//printf("]\n");
+	//printf("qdd = [");
+	//for(int dof = 0; dof < getNumDofs() + 6; ++dof)
+	//	printf("%.6f ", output[dof]);
+	//printf("]\n");
+	/////////////////
+    // Final step: add the accelerations (times dt) to the velocities.
+	if (!isConstraintPass)
+	{
+	if(dt > 0.)
+		applyDeltaVeeMultiDof(output, dt);
+	}
+	/////
+	//btScalar angularThres = 1;
+	//btScalar maxAngVel = 0.;		
+	//bool scaleDown = 1.;
+	//for(int link = 0; link < m_links.size(); ++link)
+	//{		
+	//	if(spatVel[link+1].getAngular().length() > maxAngVel)
+	//	{
+	//		maxAngVel = spatVel[link+1].getAngular().length();
+	//		scaleDown = angularThres / spatVel[link+1].getAngular().length();
+	//		break;
+	//	}		
+	//}
+	//if(scaleDown != 1.)
+	//{
+	//	for(int link = 0; link < m_links.size(); ++link)
+	//	{
+	//		if(m_links[link].m_jointType == btMultibodyLink::eRevolute || m_links[link].m_jointType == btMultibodyLink::eSpherical)
+	//		{
+	//			for(int dof = 0; dof < m_links[link].m_dofCount; ++dof)
+	//				getJointVelMultiDof(link)[dof] *= scaleDown;
+	//		}
+	//	}
+	//}
+	/////
+	/////////////////////
+	if(m_useGlobalVelocities)
+	{
+		for (int i = 0; i < num_links; ++i) 
+		{
+			const int parent = m_links[i].m_parent;
+			//rot_from_parent[i+1] = btMatrix3x3(m_links[i].m_cachedRotParentToThis);    /// <- done
+			//rot_from_world[i+1] = rot_from_parent[i+1] * rot_from_world[parent+1];		/// <- done
+			fromParent.m_rotMat = rot_from_parent[i+1]; fromParent.m_trnVec = m_links[i].m_cachedRVector;
+			fromWorld.m_rotMat = rot_from_world[i+1];			
+			// vhat_i = i_xhat_p(i) * vhat_p(i)		
+			fromParent.transform(spatVel[parent+1], spatVel[i+1]);
+			//nice alternative below (using operator *) but it generates temps
+			/////////////////////////////////////////////////////////////
+			// now set vhat_i to its true value by doing
+			// vhat_i += qidot * shat_i			
+			spatJointVel.setZero();
+			for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)		
+				spatJointVel += m_links[i].m_axes[dof] * getJointVelMultiDof(i)[dof];
+			// remember vhat_i is really vhat_p(i) (but in current frame) at this point	=> we need to add velocity across the inboard joint
+			spatVel[i+1] += spatJointVel;
+			fromWorld.transformInverseRotationOnly(spatVel[i+1], m_links[i].m_absFrameTotVelocity);
+			fromWorld.transformInverseRotationOnly(spatJointVel, m_links[i].m_absFrameLocVelocity);
+		}
+	}
+void btMultiBody::solveImatrix(const btVector3& rhs_top, const btVector3& rhs_bot, float result[6]) const	// Solves I * x = rhs for the base's 6x6 spatial inertia; the six components of x are written to result.
+	int num_links = getNumLinks();
+	///solve I * x = rhs, so the result = invI * rhs (rhs is supplied as the two 3-vectors rhs_top and rhs_bot)
+    if (num_links == 0) 
+	{
+		// in the case of 0 m_links (i.e. a plain rigid body, not a multibody) rhs * invI is easier: rhs_bot is divided per component by m_baseInertia, rhs_top by m_baseMass
+        result[0] = rhs_bot[0] / m_baseInertia[0];
+        result[1] = rhs_bot[1] / m_baseInertia[1];
+        result[2] = rhs_bot[2] / m_baseInertia[2];
+        result[3] = rhs_top[0] / m_baseMass;
+        result[4] = rhs_top[1] / m_baseMass;
+        result[5] = rhs_top[2] / m_baseMass;
+    } else 
+	{
+		/// Special routine for calculating the inverse of a spatial inertia matrix
+		///the 6x6 matrix is stored as 4 blocks of 3x3 matrices (the m_cachedInertia* members read below)
+		btMatrix3x3 Binv = m_cachedInertiaTopRight.inverse()*-1.f;	// Binv = -(top-right block)^-1
+		btMatrix3x3 tmp = m_cachedInertiaLowerRight * Binv;
+		btMatrix3x3 invIupper_right = (tmp * m_cachedInertiaTopLeft + m_cachedInertiaLowerLeft).inverse();
+		tmp = invIupper_right * m_cachedInertiaLowerRight;
+		btMatrix3x3 invI_upper_left = (tmp * Binv);
+		btMatrix3x3 invI_lower_right = (invI_upper_left).transpose();	// the lower-right block of the inverse is taken as the transpose of the upper-left block
+		tmp = m_cachedInertiaTopLeft  * invI_upper_left;
+		tmp[0][0]-= 1.0;	// this and the next two lines subtract the identity from tmp
+		tmp[1][1]-= 1.0;
+		tmp[2][2]-= 1.0;
+		btMatrix3x3 invI_lower_left = (Binv * tmp);
+		//multiply result = invI * rhs, block by block
+		{
+		  btVector3 vtop = invI_upper_left*rhs_top;
+		  btVector3 tmp;	// note: this btVector3 hides the btMatrix3x3 'tmp' of the enclosing scope
+		  tmp = invIupper_right * rhs_bot;
+		  vtop += tmp;
+		  btVector3 vbot = invI_lower_left*rhs_top;
+		  tmp = invI_lower_right * rhs_bot;
+		  vbot += tmp;
+		  result[0] = vtop[0];	// result[0..2] receive vtop, result[3..5] receive vbot
+		  result[1] = vtop[1];
+		  result[2] = vtop[2];
+		  result[3] = vbot[0];
+		  result[4] = vbot[1];
+		  result[5] = vbot[2];
+		}
+    }
+void btMultiBody::solveImatrix(const btSpatialForceVector &rhs, btSpatialMotionVector &result) const	// Spatial-vector overload: solves I * x = rhs for the base's 6x6 spatial inertia and stores x in result.
+	int num_links = getNumLinks();
+	///solve I * x = rhs, so the result = invI * rhs
+    if (num_links == 0) 
+	{
+		// in the case of 0 m_links (i.e. a plain rigid body, not a multibody) rhs * invI is easier: angular part is divided by m_baseInertia, linear part by m_baseMass
+		result.setAngular(rhs.getAngular() / m_baseInertia);	// presumably a per-component vector division - confirm against btVector3's operator/
+		result.setLinear(rhs.getLinear() / m_baseMass);		
+    } else 
+	{
+		/// Special routine for calculating the inverse of a spatial inertia matrix
+		///the 6x6 matrix is stored as 4 blocks of 3x3 matrices (the m_cachedInertia* members read below)
+		btMatrix3x3 Binv = m_cachedInertiaTopRight.inverse()*-1.f;	// Binv = -(top-right block)^-1
+		btMatrix3x3 tmp = m_cachedInertiaLowerRight * Binv;
+		btMatrix3x3 invIupper_right = (tmp * m_cachedInertiaTopLeft + m_cachedInertiaLowerLeft).inverse();
+		tmp = invIupper_right * m_cachedInertiaLowerRight;
+		btMatrix3x3 invI_upper_left = (tmp * Binv);
+		btMatrix3x3 invI_lower_right = (invI_upper_left).transpose();	// the lower-right block of the inverse is taken as the transpose of the upper-left block
+		tmp = m_cachedInertiaTopLeft  * invI_upper_left;
+		tmp[0][0]-= 1.0;	// this and the next two lines subtract the identity from tmp
+		tmp[1][1]-= 1.0;
+		tmp[2][2]-= 1.0;
+		btMatrix3x3 invI_lower_left = (Binv * tmp);
+		//multiply result = invI * rhs; rhs.getLinear() takes the place of rhs_top and rhs.getAngular() the place of rhs_bot from the btVector3 overload
+		{
+		  btVector3 vtop = invI_upper_left*rhs.getLinear();
+		  btVector3 tmp;	// note: this btVector3 hides the btMatrix3x3 'tmp' of the enclosing scope
+		  tmp = invIupper_right * rhs.getAngular();
+		  vtop += tmp;
+		  btVector3 vbot = invI_lower_left*rhs.getLinear();
+		  tmp = invI_lower_right * rhs.getAngular();
+		  vbot += tmp;
+		  result.setVector(vtop, vbot);		  
+		}
+    }
+void btMultiBody::mulMatrix(btScalar *pA, btScalar *pB, int rowsA, int colsA, int rowsB, int colsB, btScalar *pC) const	// Writes the product pA (rowsA x colsA) * pB (rowsB x colsB) into pC (rowsA x colsB); all three arrays are indexed row-major.
+	for (int row = 0; row < rowsA; row++)
+	{
+		for (int col = 0; col < colsB; col++)
+		{
+			pC[row * colsB + col] = 0.f;	// each output entry is cleared before accumulation, so pC does not need to be initialised by the caller
+			for (int inner = 0; inner < rowsB; inner++)	// the sum runs over rowsB; colsA is only used as the row stride of pA and is not checked against rowsB
+			{
+				pC[row * colsB + col] += pA[row * colsA + inner] * pB[col + inner * colsB];
+			}
+		}
+	}
+void btMultiBody::calcAccelerationDeltasMultiDof(const btScalar *force, btScalar *output,	// Computes the accelerations produced by the generalized force vector 'force': force[0..5] act on the base, force[6..] on the joint dofs; output uses the same 6 + dof layout.
+                                       btAlignedObjectArray<btScalar> &scratch_r, btAlignedObjectArray<btVector3> &scratch_v) const	// scratch_r and scratch_v are resized below and used purely as temporary storage
+    // Temporary matrices/vectors -- use scratch space from caller
+    // so that we don't have to keep reallocating every frame
+	int num_links = getNumLinks();	
+    scratch_r.resize(m_dofCount);
+    scratch_v.resize(4*num_links + 4);	    // two arrays of (num_links + 1) spatial vectors; v_ptr is advanced by 2 btVector3 per entry below
+    btScalar * r_ptr = m_dofCount ? &scratch_r[0] : 0;
+    btVector3 * v_ptr = &scratch_v[0];
+    // zhat_i^A (scratch space)
+    btSpatialForceVector * zeroAccSpatFrc = (btSpatialForceVector *)v_ptr;
+	v_ptr += num_links * 2 + 2;
+    // rot_from_parent (cached from calcAccelerations); read from m_matrixBuf, not recomputed here
+    const btMatrix3x3 * rot_from_parent = &m_matrixBuf[0];
+    // hhat (cached), accel (scratch)
+    // hhat is NOT stored for the base (but ahat is) 
+	const btSpatialForceVector * h = (btSpatialForceVector *)(m_dofCount > 0 ? &m_vectorBuf[0] : 0);
+	btSpatialMotionVector * spatAcc = (btSpatialMotionVector *)v_ptr;
+	v_ptr += num_links * 2 + 2;
+    // Y_i (scratch), invD_i (cached); invD is read from m_realBuf starting at index 6 + m_dofCount
+    const btScalar * invD = m_dofCount > 0 ? &m_realBuf[6 + m_dofCount] : 0;
+	btScalar * Y = r_ptr; 
+	////////////////
+	//aux variables
+	btScalar invD_times_Y[6];							//D^{-1} * Y [dofxdof x dofx1 = dofx1] <=> D^{-1} * u; better moved to buffers since it is recalced in calcAccelerationDeltasMultiDof; num_dof of btScalar would cover all bodies
+	btSpatialMotionVector result;							//holds results of the SolveImatrix op; it is a spatial motion vector (accel)
+	btScalar Y_minus_hT_a[6];							//Y - h^{T} * a; it's dofx1 for each body so a single 6x1 temp is enough	
+	btSpatialForceVector spatForceVecTemps[6];				//6 temporary spatial force vectors (only entries 0 and 1 are used in this function)
+	btSpatialTransformationMatrix fromParent;	
+	/////////////////
+    // First 'upward' loop.
+    // Combines CompTreeLinkVelocities and InitTreeLinks from Mirtich.
+	// Fill in zero_acc
+    // -- set to force/torque on the base, zero otherwise
+    if (m_fixedBase) 
+	{
+        zeroAccSpatFrc[0].setZero();
+    } else 
+	{	
+		//test forces: the six base entries of 'force' are negated and passed through the base rotation rot_from_parent[0]
+		fromParent.m_rotMat = rot_from_parent[0];
+		fromParent.transformRotationOnly(btSpatialForceVector(-force[0],-force[1],-force[2], -force[3],-force[4],-force[5]), zeroAccSpatFrc[0]);
+    }
+    for (int i = 0; i < num_links; ++i) 
+	{
+		zeroAccSpatFrc[i+1].setZero();
+    }    
+	// 'Downward' loop: links are visited from the last index to the first, each adding its contribution to zeroAccSpatFrc[parent+1].
+    // (part of TreeForwardDynamics in Mirtich.)
+    for (int i = num_links - 1; i >= 0; --i)
+	{
+		const int parent = m_links[i].m_parent;
+		fromParent.m_rotMat = rot_from_parent[i+1]; fromParent.m_trnVec = m_links[i].m_cachedRVector;
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{
+			Y[m_links[i].m_dofOffset + dof] = force[6 + m_links[i].m_dofOffset + dof]
+											- m_links[i].m_axes[dof].dot(zeroAccSpatFrc[i+1])
+											;
+		}
+		btVector3 in_top, in_bottom, out_top, out_bottom;	// NOTE(review): none of these four locals is referenced anywhere in this function
+		const btScalar *invDi = &invD[m_links[i].m_dofOffset*m_links[i].m_dofOffset];	// NOTE(review): the offset into invD is m_dofOffset squared; the same expression is used elsewhere in this file where invD is written - confirm the layout is intended
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{
+			invD_times_Y[dof] = 0.f;
+			for(int dof2 = 0; dof2 < m_links[i].m_dofCount; ++dof2)
+			{
+				invD_times_Y[dof] += invDi[dof * m_links[i].m_dofCount + dof2] * Y[m_links[i].m_dofOffset + dof2];				
+			}	
+		}
+		 // Zp += pXi * (Zi + hi*Yi/Di)
+		spatForceVecTemps[0] = zeroAccSpatFrc[i+1];
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{
+			const btSpatialForceVector &hDof = h[m_links[i].m_dofOffset + dof];
+			//
+			spatForceVecTemps[0] += hDof * invD_times_Y[dof];		
+		}
+		fromParent.transformInverse(spatForceVecTemps[0], spatForceVecTemps[1]);
+		zeroAccSpatFrc[parent+1] += spatForceVecTemps[1];
+    }
+	// ptr to the joint accel part of the output (everything after the six base entries)
+    btScalar * joint_accel = output + 6;
+    // Second 'upward' loop
+    // (part of TreeForwardDynamics in Mirtich)
+    if (m_fixedBase) 
+	{
+        spatAcc[0].setZero();
+    } 
+	else 
+	{
+		solveImatrix(zeroAccSpatFrc[0], result);
+		spatAcc[0] = -result;
+    }
+    // now do the loop over the m_links: each link starts from its parent's acceleration and adds its own joint contribution
+    for (int i = 0; i < num_links; ++i)
+	{
+        const int parent = m_links[i].m_parent;
+		fromParent.m_rotMat = rot_from_parent[i+1]; fromParent.m_trnVec = m_links[i].m_cachedRVector;
+		fromParent.transform(spatAcc[parent+1], spatAcc[i+1]);
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		{
+			const btSpatialForceVector &hDof = h[m_links[i].m_dofOffset + dof];
+			//			
+			Y_minus_hT_a[dof] = Y[m_links[i].m_dofOffset + dof] - spatAcc[i+1].dot(hDof);
+		}
+		const btScalar *invDi = &invD[m_links[i].m_dofOffset*m_links[i].m_dofOffset];
+		mulMatrix(const_cast<btScalar*>(invDi), Y_minus_hT_a, m_links[i].m_dofCount, m_links[i].m_dofCount, m_links[i].m_dofCount, 1, &joint_accel[m_links[i].m_dofOffset]);	// joint_accel = invDi * Y_minus_hT_a; the const_cast is needed only because mulMatrix takes non-const pointers
+		for(int dof = 0; dof < m_links[i].m_dofCount; ++dof)		
+			spatAcc[i+1] += m_links[i].m_axes[dof] * joint_accel[m_links[i].m_dofOffset + dof];      
+    }
+    // transform base accelerations back to the world frame: output[0..2] take the angular part, output[3..5] the linear part
+    btVector3 omegadot_out;
+    omegadot_out = rot_from_parent[0].transpose() * spatAcc[0].getAngular();
+	output[0] = omegadot_out[0];
+	output[1] = omegadot_out[1];
+	output[2] = omegadot_out[2];
+    btVector3 vdot_out;
+    vdot_out = rot_from_parent[0].transpose() * spatAcc[0].getLinear();
+	output[3] = vdot_out[0];
+	output[4] = vdot_out[1];
+	output[5] = vdot_out[2];
+	/////////////////
+	//printf("delta = [");
+	//for(int dof = 0; dof < getNumDofs() + 6; ++dof)
+	//	printf("%.2f ", output[dof]);
+	//printf("]\n");
+	/////////////////
+void btMultiBody::stepPositionsMultiDof(btScalar dt, btScalar *pq, btScalar *pqd)	// Advances base pose and joint positions by dt; when pq / pqd are non-null they are used as the position / velocity arrays instead of the body's own state.
+	int num_links = getNumLinks();
+    // step position by adding dt * velocity
+	//btVector3 v = getBaseVel();	
+    //m_basePos += dt * v;
+	//
+	btScalar *pBasePos = (pq ? &pq[4] : m_basePos);	// with pq, the base position is read and written at pq[4..6]
+	btScalar *pBaseVel = (pqd ? &pqd[3] : &m_realBuf[3]);			//note: the !pqd case assumes m_realBuf holds the base velocity at 3,4,5 (should be wrapped for safety)
+	//	
+	pBasePos[0] += dt * pBaseVel[0];
+	pBasePos[1] += dt * pBaseVel[1];
+	pBasePos[2] += dt * pBaseVel[2];
+	///////////////////////////////
+	//local functor for quaternion integration (to avoid error prone redundancy); used for the base and for spherical joints
+	struct
+	{
+		//"exponential map" based on btTransformUtil::integrateTransform(..)
+		void operator() (const btVector3 &omega, btQuaternion &quat, bool baseBody, btScalar dt)
+		{
+			//baseBody	=>	quat is alias and omega is global coor
+			//!baseBody	=>	quat is alibi and omega is local coor	
+			btVector3 axis;
+			btVector3 angvel;
+			if(!baseBody)			
+				angvel = quatRotate(quat, omega);				//if quat is not m_baseQuat, it is alibi => ok			
+			else
+				angvel = omega;
+			btScalar fAngle = angvel.length(); 		
+			//limit the angular motion: only fAngle is replaced here, angvel itself is left unscaled
+			if (fAngle * dt > ANGULAR_MOTION_THRESHOLD)
+			{
+				fAngle = btScalar(0.5)*SIMD_HALF_PI / dt;
+			}
+			if ( fAngle < btScalar(0.001) )
+			{
+				// use Taylor's expansions of sinc function (avoids dividing by a very small fAngle)
+				axis   = angvel*( btScalar(0.5)*dt-(dt*dt*dt)*(btScalar(0.020833333333))*fAngle*fAngle );
+			}
+			else
+			{
+				// scale factor is sin(0.5*fAngle*dt)/fAngle
+				axis   = angvel*( btSin(btScalar(0.5)*fAngle*dt)/fAngle );
+			}
+			if(!baseBody)				
+				quat = btQuaternion(axis.x(),axis.y(),axis.z(),btCos( fAngle*dt*btScalar(0.5) )) * quat;			
+			else			
+				quat = quat * btQuaternion(-axis.x(),-axis.y(),-axis.z(),btCos( fAngle*dt*btScalar(0.5) ));
+				//equivalent to: quat = (btQuaternion(axis.x(),axis.y(),axis.z(),btCos( fAngle*dt*btScalar(0.5) )) * quat.inverse()).inverse();			
+			quat.normalize();
+		}
+	} pQuatUpdateFun;
+	///////////////////////////////
+	//pQuatUpdateFun(getBaseOmega(), m_baseQuat, true, dt);
+	//	
+	btScalar *pBaseQuat = pq ? pq : m_baseQuat;	// with pq, the base quaternion occupies pq[0..3]
+	btScalar *pBaseOmega = pqd ? pqd : &m_realBuf[0];		//note: the !pqd case assumes m_realBuf starts with base omega (should be wrapped for safety)
+	//
+	btQuaternion baseQuat; baseQuat.setValue(pBaseQuat[0], pBaseQuat[1], pBaseQuat[2], pBaseQuat[3]);
+	btVector3 baseOmega; baseOmega.setValue(pBaseOmega[0], pBaseOmega[1], pBaseOmega[2]);
+	pQuatUpdateFun(baseOmega, baseQuat, true, dt);
+	pBaseQuat[0] = baseQuat.x();
+	pBaseQuat[1] = baseQuat.y();
+	pBaseQuat[2] = baseQuat.z();
+	pBaseQuat[3] = baseQuat.w();
+	//printf("pBaseOmega = %.4f %.4f %.4f\n", pBaseOmega->x(), pBaseOmega->y(), pBaseOmega->z());
+	//printf("pBaseVel = %.4f %.4f %.4f\n", pBaseVel->x(), pBaseVel->y(), pBaseVel->z());
+	//printf("baseQuat = %.4f %.4f %.4f %.4f\n", pBaseQuat->x(), pBaseQuat->y(), pBaseQuat->z(), pBaseQuat->w());
+	if(pq)		
+		pq += 7;	// skip the 7 base position entries (4 quaternion + 3 position)
+	if(pqd)
+		pqd += 6;	// skip the 6 base velocity entries (3 omega + 3 linear)
+	// Finally we can update m_jointPos for each of the m_links; pq and pqd (when given) are advanced link by link at the end of each iteration
+    for (int i = 0; i < num_links; ++i) 
+	{
+		btScalar *pJointPos = (pq ? pq : &m_links[i].m_jointPos[0]);		
+		btScalar *pJointVel = (pqd ? pqd : getJointVelMultiDof(i));
+		switch(m_links[i].m_jointType)
+		{
+			case btMultibodyLink::ePrismatic:
+			case btMultibodyLink::eRevolute:
+			{
+				btScalar jointVel = pJointVel[0];	
+				pJointPos[0] += dt * jointVel;
+				break;
+			}
+			case btMultibodyLink::eSpherical:
+			{
+				btVector3 jointVel; jointVel.setValue(pJointVel[0], pJointVel[1], pJointVel[2]);
+				btQuaternion jointOri; jointOri.setValue(pJointPos[0], pJointPos[1], pJointPos[2], pJointPos[3]);
+				pQuatUpdateFun(jointVel, jointOri, false, dt);
+				pJointPos[0] = jointOri.x(); pJointPos[1] = jointOri.y(); pJointPos[2] = jointOri.z(); pJointPos[3] = jointOri.w();
+				break;
+			}
+			case btMultibodyLink::ePlanar:
+			{
+				pJointPos[0] += dt * getJointVelMultiDof(i)[0];	// NOTE(review): this case reads getJointVelMultiDof(i) rather than pJointVel, so a caller-supplied pqd is not used for planar joints - confirm this is intended
+				btVector3 q0_coors_qd1qd2 = getJointVelMultiDof(i)[1] * m_links[i].getAxisBottom(1) + getJointVelMultiDof(i)[2] * m_links[i].getAxisBottom(2);
+				btVector3 no_q0_coors_qd1qd2 = quatRotate(btQuaternion(m_links[i].getAxisTop(0), pJointPos[0]), q0_coors_qd1qd2);	// rotates by the already-updated angle pJointPos[0] about getAxisTop(0)
+				pJointPos[1] += m_links[i].getAxisBottom(1).dot(no_q0_coors_qd1qd2) * dt;
+				pJointPos[2] += m_links[i].getAxisBottom(2).dot(no_q0_coors_qd1qd2) * dt;
+				break;
+			}
+			default:
+			{
+			}
+		}
+		m_links[i].updateCacheMultiDof(pq);	// pq may be null here when the caller passed no position array
+		if(pq)		
+			pq += m_links[i].m_posVarCount;
+		if(pqd)
+			pqd += m_links[i].m_dofCount;
+    }
+void btMultiBody::fillConstraintJacobianMultiDof(int link,	// Fills jac (6 base entries followed by one entry per dof) for a constraint at contact_point; 'link' selects the chain whose joint entries are filled, -1 leaves them zero.
+                                    const btVector3 &contact_point,
+									const btVector3 &normal_ang,
+                                    const btVector3 &normal_lin,
+                                    btScalar *jac,
+                                    btAlignedObjectArray<btScalar> &scratch_r,
+                                    btAlignedObjectArray<btVector3> &scratch_v,
+                                    btAlignedObjectArray<btMatrix3x3> &scratch_m) const	// the three scratch arrays are resized below and used as temporary storage
+    // temporary space
+	int num_links = getNumLinks();
+	int m_dofCount = getNumDofs();	// NOTE(review): local variable named like a member (other functions in this file use m_dofCount without declaring it) - presumably shadows it
+    scratch_v.resize(3*num_links + 3);			//(num_links + base) offsets + (num_links + base) normals_lin + (num_links + base) normals_ang
+    scratch_m.resize(num_links + 1);
+    btVector3 * v_ptr = &scratch_v[0];
+    btVector3 * p_minus_com_local = v_ptr; v_ptr += num_links + 1;
+    btVector3 * n_local_lin = v_ptr; v_ptr += num_links + 1;
+	btVector3 * n_local_ang = v_ptr; v_ptr += num_links + 1;
+    btAssert(v_ptr - &scratch_v[0] == scratch_v.size());
+    scratch_r.resize(m_dofCount);
+    btScalar * results = m_dofCount > 0 ? &scratch_r[0] : 0;
+    btMatrix3x3 * rot_from_world = &scratch_m[0];
+    const btVector3 p_minus_com_world = contact_point - m_basePos;
+	const btVector3 &normal_lin_world = normal_lin;							//convenience
+	const btVector3 &normal_ang_world = normal_ang;
+    rot_from_world[0] = btMatrix3x3(m_baseQuat);    
+    // omega coefficients first: jac[0..2] = (contact_point - m_basePos) x normal_lin + normal_ang
+    btVector3 omega_coeffs_world;
+    omega_coeffs_world = p_minus_com_world.cross(normal_lin_world);
+	jac[0] = omega_coeffs_world[0] + normal_ang_world[0];
+	jac[1] = omega_coeffs_world[1] + normal_ang_world[1];
+	jac[2] = omega_coeffs_world[2] + normal_ang_world[2];
+    // then v coefficients: jac[3..5] = normal_lin
+    jac[3] = normal_lin_world[0];
+    jac[4] = normal_lin_world[1];
+    jac[5] = normal_lin_world[2];
+	//create link-local versions of p_minus_com and normal (index 0 is the base)
+	p_minus_com_local[0] = rot_from_world[0] * p_minus_com_world;
+    n_local_lin[0] = rot_from_world[0] * normal_lin_world;
+	n_local_ang[0] = rot_from_world[0] * normal_ang_world;
+    // Set remaining jac values to zero for now.
+    for (int i = 6; i < 6 + m_dofCount; ++i) 
+	{
+        jac[i] = 0;
+    }
+    // Qdot coefficients, if necessary.
+    if (num_links > 0 && link > -1) {
+        // TODO: speed this up -- don't calculate for m_links we don't need.
+        // (Also, we are making 3 separate calls to this function, for the normal & the 2 friction directions,
+        // which is resulting in repeated work being done...)
+        // calculate required normals & positions in the local frames.
+        for (int i = 0; i < num_links; ++i) {
+            // transform to local frame, starting from the parent's entry (index parent+1)
+            const int parent = m_links[i].m_parent;
+            const btMatrix3x3 mtx(m_links[i].m_cachedRotParentToThis);
+            rot_from_world[i+1] = mtx * rot_from_world[parent+1];
+            n_local_lin[i+1] = mtx * n_local_lin[parent+1];
+			n_local_ang[i+1] = mtx * n_local_ang[parent+1];
+            p_minus_com_local[i+1] = mtx * p_minus_com_local[parent+1] - m_links[i].m_cachedRVector;
+			// calculate the jacobian entry (one value per dof, stored in results at the link's m_dofOffset)
+			switch(m_links[i].m_jointType)
+			{
+				case btMultibodyLink::eRevolute:
+				{
+					results[m_links[i].m_dofOffset] = n_local_lin[i+1].dot(m_links[i].getAxisTop(0).cross(p_minus_com_local[i+1]) + m_links[i].getAxisBottom(0));
+					results[m_links[i].m_dofOffset] += n_local_ang[i+1].dot(m_links[i].getAxisTop(0));
+					break;
+				}
+				case btMultibodyLink::ePrismatic:
+				{
+					results[m_links[i].m_dofOffset] = n_local_lin[i+1].dot(m_links[i].getAxisBottom(0));	// no n_local_ang term is added for prismatic joints
+					break;
+				}
+				case btMultibodyLink::eSpherical:
+				{
+					results[m_links[i].m_dofOffset + 0] = n_local_lin[i+1].dot(m_links[i].getAxisTop(0).cross(p_minus_com_local[i+1]) + m_links[i].getAxisBottom(0));
+					results[m_links[i].m_dofOffset + 1] = n_local_lin[i+1].dot(m_links[i].getAxisTop(1).cross(p_minus_com_local[i+1]) + m_links[i].getAxisBottom(1));
+					results[m_links[i].m_dofOffset + 2] = n_local_lin[i+1].dot(m_links[i].getAxisTop(2).cross(p_minus_com_local[i+1]) + m_links[i].getAxisBottom(2));
+					results[m_links[i].m_dofOffset + 0] += n_local_ang[i+1].dot(m_links[i].getAxisTop(0));
+					results[m_links[i].m_dofOffset + 1] += n_local_ang[i+1].dot(m_links[i].getAxisTop(1));
+					results[m_links[i].m_dofOffset + 2] += n_local_ang[i+1].dot(m_links[i].getAxisTop(2));
+					break;
+				}
+				case btMultibodyLink::ePlanar:
+				{
+					results[m_links[i].m_dofOffset + 0] = n_local_lin[i+1].dot(m_links[i].getAxisTop(0).cross(p_minus_com_local[i+1]));// + m_links[i].getAxisBottom(0));
+					results[m_links[i].m_dofOffset + 1] = n_local_lin[i+1].dot(m_links[i].getAxisBottom(1));
+					results[m_links[i].m_dofOffset + 2] = n_local_lin[i+1].dot(m_links[i].getAxisBottom(2));
+					break;
+				}
+				default:
+				{
+				}
+			}
+        }
+        // Now copy through to output: walk from 'link' up through its parents until the base (-1), copying only those links' entries.
+		//printf("jac[%d] = ", link);
+        while (link != -1) 
+		{
+			for(int dof = 0; dof < m_links[link].m_dofCount; ++dof)
+			{
+				jac[6 + m_links[link].m_dofOffset + dof] = results[m_links[link].m_dofOffset + dof];
+				//printf("%.2f\t", jac[6 + m_links[link].m_dofOffset + dof]);
+			}
+			link = m_links[link].m_parent;
+        }
+		//printf("]\n");
+    }
+void btMultiBody::wakeUp()	// Marks the multibody as awake.
+    m_awake = true;	// only the flag is changed; m_sleepTimer is not touched here
+void btMultiBody::goToSleep()	// Marks the multibody as asleep.
+    m_awake = false;	// only the flag is changed; velocities are not modified here
+void btMultiBody::checkMotionAndSleepIfRequired(btScalar timestep)	// Accumulates low-motion time in m_sleepTimer and puts the body to sleep once it exceeds SLEEP_TIMEOUT; wakes it when motion is detected.
+	int num_links = getNumLinks();	// NOTE(review): num_links is not used anywhere else in this function
+	extern bool gDisableDeactivation;
+    if (!m_canSleep || gDisableDeactivation) 
+	{
+		m_awake = true;
+		m_sleepTimer = 0;
+		return;
+	}
+    // motion is computed as omega^2 + v^2 + (sum of squares of joint velocities), i.e. the sum of squares of the first 6 + m_dofCount entries of m_realBuf
+    btScalar motion = 0;
+	{
+		for (int i = 0; i < 6 + m_dofCount; ++i) 		
+			motion += m_realBuf[i] * m_realBuf[i];
+	}
+    if (motion < SLEEP_EPSILON) {
+        m_sleepTimer += timestep;
+        if (m_sleepTimer > SLEEP_TIMEOUT) {
+            goToSleep();
+        }
+    } else {
+        m_sleepTimer = 0;	// any step at or above the threshold restarts the countdown
+		if (!m_awake)
+			wakeUp();
+    }
+void	btMultiBody::forwardKinematics(btAlignedObjectArray<btQuaternion>& world_to_local,btAlignedObjectArray<btVector3>& local_origin)	// Fills world_to_local / local_origin (index 0 = base, k+1 = link k) and stores each link's resulting transform in m_cachedWorldTransform.
+	int num_links = getNumLinks();
+	// Cached 3x3 rotation matrices from parent frame to this frame (stored in m_matrixBuf and refreshed here).
+	btMatrix3x3* rot_from_parent =(btMatrix3x3 *) &m_matrixBuf[0];
+	rot_from_parent[0] = btMatrix3x3(m_baseQuat);				//m_baseQuat assumed to be alias!?
+	for (int i = 0; i < num_links; ++i) 
+	{
+		rot_from_parent[i+1] = btMatrix3x3(m_links[i].m_cachedRotParentToThis);
+	}
+	int nLinks = getNumLinks();	// same value as num_links above
+	///base + num m_links
+	world_to_local.resize(nLinks+1);
+	local_origin.resize(nLinks+1);
+	world_to_local[0] = getWorldToBaseRot();
+	local_origin[0] = getBasePos();
+	for (int k=0;k<getNumLinks();k++)
+	{
+		const int parent = getParent(k);
+		world_to_local[k+1] = getParentToLocalRot(k) * world_to_local[parent+1];	// reads the parent's entry at index parent+1 (index 0 when parent is -1)
+		local_origin[k+1] = local_origin[parent+1] + (quatRotate(world_to_local[k+1].inverse() , getRVector(k)));
+	}
+	for (int link=0;link<getNumLinks();link++)
+	{
+		int index = link+1;
+		btVector3 posr = local_origin[index];
+		btScalar quat[4]={-world_to_local[index].x(),-world_to_local[index].y(),-world_to_local[index].z(),world_to_local[index].w()};	// x, y, z negated and w kept: the conjugate of world_to_local[index]
+		btTransform tr;
+		tr.setIdentity();
+		tr.setOrigin(posr);
+		tr.setRotation(btQuaternion(quat[0],quat[1],quat[2],quat[3]));
+		getLink(link).m_cachedWorldTransform = tr;
+	}
+void	btMultiBody::updateCollisionObjectWorldTransforms(btAlignedObjectArray<btQuaternion>& world_to_local,btAlignedObjectArray<btVector3>& local_origin)	// Recomputes per-link rotations/origins (index 0 = base, k+1 = link k) and pushes the resulting transforms to the base collider and to every link collider that exists.
+	world_to_local.resize(getNumLinks()+1);
+	local_origin.resize(getNumLinks()+1);
+	world_to_local[0] = getWorldToBaseRot();
+	local_origin[0] = getBasePos();
+	if (getBaseCollider())
+	{
+		btVector3 posr = local_origin[0];
+		//	float pos[4]={posr.x(),posr.y(),posr.z(),1};
+		btScalar quat[4]={-world_to_local[0].x(),-world_to_local[0].y(),-world_to_local[0].z(),world_to_local[0].w()};	// x, y, z negated and w kept: the conjugate of world_to_local[0]
+		btTransform tr;
+		tr.setIdentity();
+		tr.setOrigin(posr);
+		tr.setRotation(btQuaternion(quat[0],quat[1],quat[2],quat[3]));
+		getBaseCollider()->setWorldTransform(tr);
+	}
+	for (int k=0;k<getNumLinks();k++)
+	{
+		const int parent = getParent(k);
+		world_to_local[k+1] = getParentToLocalRot(k) * world_to_local[parent+1];	// reads the parent's entry at index parent+1 (index 0 when parent is -1)
+		local_origin[k+1] = local_origin[parent+1] + (quatRotate(world_to_local[k+1].inverse() , getRVector(k)));
+	}
+	for (int m=0;m<getNumLinks();m++)
+	{
+		btMultiBodyLinkCollider* col = getLink(m).m_collider;
+		if (col)	// links without a collider are skipped
+		{
+			int link = col->m_link;
+			btAssert(link == m);	// the collider is expected to refer back to the link that owns it
+			int index = link+1;
+			btVector3 posr = local_origin[index];
+			//			float pos[4]={posr.x(),posr.y(),posr.z(),1};
+			btScalar quat[4]={-world_to_local[index].x(),-world_to_local[index].y(),-world_to_local[index].z(),world_to_local[index].w()};
+			btTransform tr;
+			tr.setIdentity();
+			tr.setOrigin(posr);
+			tr.setRotation(btQuaternion(quat[0],quat[1],quat[2],quat[3]));
+			col->setWorldTransform(tr);
+		}
+	}
+int	btMultiBody::calculateSerializeBufferSize()	const	// Returns the size in bytes of the btMultiBodyData struct.
+	int sz = sizeof(btMultiBodyData);	// fixed size: per-link data is not included in this value
+	return sz;
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+const char*	btMultiBody::serialize(void* dataBuffer, class btSerializer* serializer) const	// Writes the base state into dataBuffer (treated as btMultiBodyData), writes one btMultiBodyLinkData per link into a chunk obtained from serializer, and returns btMultiBodyDataName.
+		btMultiBodyData* mbd = (btMultiBodyData*) dataBuffer;
+		getBaseWorldTransform().serialize(mbd->m_baseWorldTransform);
+		mbd->m_baseMass = this->getBaseMass();
+		getBaseInertia().serialize(mbd->m_baseInertia);
+		{
+			char* name = (char*) serializer->findNameForPointer(m_baseName);
+			mbd->m_baseName = (char*)serializer->getUniquePointer(name);
+			if (mbd->m_baseName)	// the name string itself is only written when a unique pointer was obtained for it
+			{
+				serializer->serializeName(name);
+			}
+		}
+		mbd->m_numLinks = this->getNumLinks();
+		if (mbd->m_numLinks)
+		{
+			int sz = sizeof(btMultiBodyLinkData);
+			int numElem = mbd->m_numLinks;
+			btChunk* chunk = serializer->allocate(sz,numElem);
+			btMultiBodyLinkData* memPtr = (btMultiBodyLinkData*)chunk->m_oldPtr;
+			for (int i=0;i<numElem;i++,memPtr++)	// memPtr advances to the next link record on every iteration
+			{
+				memPtr->m_jointType = getLink(i).m_jointType;
+				memPtr->m_dofCount = getLink(i).m_dofCount;
+				memPtr->m_posVarCount = getLink(i).m_posVarCount;
+				getLink(i).m_inertiaLocal.serialize(memPtr->m_linkInertia);
+				memPtr->m_linkMass = getLink(i).m_mass;
+				memPtr->m_parentIndex = getLink(i).m_parent;
+				memPtr->m_jointDamping = getLink(i).m_jointDamping;
+				memPtr->m_jointFriction = getLink(i).m_jointFriction;
+				getLink(i).m_eVector.serialize(memPtr->m_parentComToThisComOffset);
+				getLink(i).m_dVector.serialize(memPtr->m_thisPivotToThisComOffset);
+				getLink(i).m_zeroRotParentToThis.serialize(memPtr->m_zeroRotParentToThis);
+				btAssert(memPtr->m_dofCount<=3);	// presumably the per-dof arrays in btMultiBodyLinkData hold at most 3 entries - confirm against the struct definition
+				for (int dof = 0;dof<getLink(i).m_dofCount;dof++)
+				{
+					getLink(i).getAxisBottom(dof).serialize(memPtr->m_jointAxisBottom[dof]);
+					getLink(i).getAxisTop(dof).serialize(memPtr->m_jointAxisTop[dof]);
+					memPtr->m_jointTorque[dof] = getLink(i).m_jointTorque[dof];
+					memPtr->m_jointVel[dof] = getJointVelMultiDof(i)[dof];
+				}
+				int numPosVar = getLink(i).m_posVarCount;
+				for (int posvar = 0; posvar < numPosVar;posvar++)
+				{
+					memPtr->m_jointPos[posvar] = getLink(i).m_jointPos[posvar];
+				}
+				{
+					char* name = (char*) serializer->findNameForPointer(m_links[i].m_linkName);
+					memPtr->m_linkName = (char*)serializer->getUniquePointer(name);
+					if (memPtr->m_linkName)
+					{
+						serializer->serializeName(name);
+					}
+				}
+				{
+					char* name = (char*) serializer->findNameForPointer(m_links[i].m_jointName);
+					memPtr->m_jointName = (char*)serializer->getUniquePointer(name);
+					if (memPtr->m_jointName)
+					{
+						serializer->serializeName(name);
+					}
+				}
+				memPtr->m_linkCollider = (btCollisionObjectData*)serializer->getUniquePointer(getLink(i).m_collider);
+			}
+			serializer->finalizeChunk(chunk,btMultiBodyLinkDataName,BT_ARRAY_CODE,(void*) &m_links[0]);
+		}
+		mbd->m_links = mbd->m_numLinks? (btMultiBodyLinkData*) serializer->getUniquePointer((void*)&m_links[0]):0;	// null when there are no links
+		return btMultiBodyDataName;	// NOTE(review): no path in the visible body returns 0, although the comment above this function mentions a failure return
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBody.h b/src/bullet/BulletDynamics/Featherstone/btMultiBody.h
new file mode 100644
index 00000000..e7ba8723
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBody.h
@@ -0,0 +1,760 @@
+ *   Class representing an articulated rigid body. Stores the body's
+ *   current state, allows forces and torques to be set, handles
+ *   timestepping and implements Featherstone's algorithm.
+ *   
+ *   Copyright (C) Stephen Thompson, <stephen@solarflare.org.uk>, 2011-2013
+ *   Portions written By Erwin Coumans: connection to LCP solver, various multibody constraints, replacing Eigen math library by Bullet LinearMath and a dedicated 6x6 matrix inverse (solveImatrix)
+ *   Portions written By Jakub Stepien: support for multi-DOF constraints, introduction of spatial algebra and several other improvements
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from the use of this software.
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+ 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+ */
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btQuaternion.h"
+#include "LinearMath/btMatrix3x3.h"
+#include "LinearMath/btAlignedObjectArray.h"
+///serialization data: do not change these definitions unless you are familiar with the details of the serialization mechanism
+	#define btMultiBodyData	btMultiBodyDoubleData
+	#define btMultiBodyDataName	"btMultiBodyDoubleData"
+	#define btMultiBodyLinkData btMultiBodyLinkDoubleData
+	#define btMultiBodyLinkDataName	"btMultiBodyLinkDoubleData"
+	#define btMultiBodyData	btMultiBodyFloatData
+	#define btMultiBodyDataName	"btMultiBodyFloatData"
+	#define btMultiBodyLinkData btMultiBodyLinkFloatData
+	#define btMultiBodyLinkDataName	"btMultiBodyLinkFloatData"
+#include "btMultiBodyLink.h"
+class btMultiBodyLinkCollider;
+ATTRIBUTE_ALIGNED16(class) btMultiBody 
+    //
+    // initialization
+    //
+	btMultiBody(int n_links,             // NOT including the base
+		btScalar mass,                // mass of base
+		const btVector3 &inertia,    // inertia of base, in base frame; assumed diagonal
+		bool fixedBase,           // whether the base is fixed (true) or can move (false)
+		bool canSleep, bool deprecatedMultiDof=true);
+	virtual ~btMultiBody();
+	//note: fixed link collision with parent is always disabled
+	void setupFixed(int linkIndex,
+						   btScalar mass,
+						   const btVector3 &inertia,
+						   int parent,
+						   const btQuaternion &rotParentToThis,
+						   const btVector3 &parentComToThisPivotOffset,
+                           const btVector3 &thisPivotToThisComOffset, bool deprecatedDisableParentCollision=true);
+	void setupPrismatic(int i,
+                               btScalar mass,
+                               const btVector3 &inertia,
+                               int parent,
+                               const btQuaternion &rotParentToThis,
+                               const btVector3 &jointAxis,
+                               const btVector3 &parentComToThisPivotOffset,
+							   const btVector3 &thisPivotToThisComOffset,
+							   bool disableParentCollision);
+    void setupRevolute(int linkIndex,            // 0 to num_links-1
+                       btScalar mass,
+                       const btVector3 &inertia,
+                       int parentIndex,
+                       const btQuaternion &rotParentToThis,  // rotate points in parent frame to this frame, when q = 0
+                       const btVector3 &jointAxis,    // in my frame
+                       const btVector3 &parentComToThisPivotOffset,    // vector from parent COM to joint axis, in PARENT frame
+                       const btVector3 &thisPivotToThisComOffset,       // vector from joint axis to my COM, in MY frame
+					   bool disableParentCollision=false);
+	void setupSpherical(int linkIndex,											// 0 to num_links-1
+                       btScalar mass,
+                       const btVector3 &inertia,
+                       int parent,
+                       const btQuaternion &rotParentToThis,		// rotate points in parent frame to this frame, when q = 0                       
+                       const btVector3 &parentComToThisPivotOffset,			// vector from parent COM to joint axis, in PARENT frame
+                       const btVector3 &thisPivotToThisComOffset,				// vector from joint axis to my COM, in MY frame
+					   bool disableParentCollision=false);		
+	void setupPlanar(int i,											// 0 to num_links-1
+                       btScalar mass,
+                       const btVector3 &inertia,
+                       int parent,
+                       const btQuaternion &rotParentToThis,		// rotate points in parent frame to this frame, when q = 0                       
+					   const btVector3 &rotationAxis,
+                       const btVector3 &parentComToThisComOffset,			// vector from parent COM to this COM, in PARENT frame                       
+					   bool disableParentCollision=false);		
+	/// Read-only access to a child link; `index` is used directly as the position in m_links.
+	const btMultibodyLink& getLink(int index) const
+	{
+		const btMultibodyLink& link = m_links[index];
+		return link;
+	}
+	/// Mutable access to a child link; `index` is used directly as the position in m_links.
+	btMultibodyLink& getLink(int index)
+	{
+		btMultibodyLink& link = m_links[index];
+		return link;
+	}
+	/// Stores the collider used for the base.
+	/// `collider` can be NULL to disable collision for the base.
+	void setBaseCollider(btMultiBodyLinkCollider* collider)
+	{
+		this->m_baseCollider = collider;
+	}
+	/// Read-only pointer to the base collider (m_baseCollider); the stored pointer can be NULL.
+	const btMultiBodyLinkCollider* getBaseCollider() const { return m_baseCollider; }
+	/// Mutable pointer to the base collider (m_baseCollider); the stored pointer can be NULL.
+	btMultiBodyLinkCollider* getBaseCollider() { return m_baseCollider; }
+    //
+    // get parent
+    // input: link num from 0 to num_links-1
+    // output: link num from 0 to num_links-1, OR -1 to mean the base.
+    //
+    int getParent(int link_num) const;
+    //
+    // get number of m_links, masses, moments of inertia
+    //
+    /// Number of child links, i.e. the size of m_links.
+    int getNumLinks() const
+    {
+        return m_links.size();
+    }
+    /// Current value of m_dofCount.
+    int getNumDofs() const
+    {
+        return m_dofCount;
+    }
+    /// Current value of m_posVarCnt.
+    int getNumPosVars() const
+    {
+        return m_posVarCnt;
+    }
+    /// Mass of the base.
+    btScalar getBaseMass() const
+    {
+        return m_baseMass;
+    }
+    /// Inertia of the base (reference to m_baseInertia).
+    const btVector3 & getBaseInertia() const
+    {
+        return m_baseInertia;
+    }
+    btScalar getLinkMass(int i) const;
+    const btVector3 & getLinkInertia(int i) const;
+    //
+    // change mass (incomplete: can only change base mass and inertia at present)
+    //
+    /// Overwrites the mass of the base.
+    void setBaseMass(btScalar mass)
+    {
+        m_baseMass = mass;
+    }
+    /// Overwrites the inertia of the base.
+    void setBaseInertia(const btVector3 &inertia)
+    {
+        m_baseInertia = inertia;
+    }
+    //
+    // get/set pos/vel/rot/omega for the base link
+    //
+    /// Base position (m_basePos), in world frame.
+    const btVector3 & getBasePos() const
+    {
+        return m_basePos;
+    }
+    /// Base linear velocity, in world frame; read from entries 3..5 of m_realBuf.
+    const btVector3 getBaseVel() const
+    {
+        const btScalar vx = m_realBuf[3];
+        const btScalar vy = m_realBuf[4];
+        const btScalar vz = m_realBuf[5];
+        return btVector3(vx, vy, vz);
+    }
+    /// Rotation that takes world vectors into the base frame (m_baseQuat).
+    const btQuaternion & getWorldToBaseRot() const { return m_baseQuat; }
+    /// Base angular velocity, in world frame; read from entries 0..2 of m_realBuf.
+    btVector3 getBaseOmega() const
+    {
+        const btScalar wx = m_realBuf[0];
+        const btScalar wy = m_realBuf[1];
+        const btScalar wz = m_realBuf[2];
+        return btVector3(wx, wy, wz);
+    }
+    /// Overwrites the stored base position (m_basePos).
+    void setBasePos(const btVector3 &pos) { m_basePos = pos; }
+	/// Sets the base pose from a transform: its origin becomes the base position and the
+	/// inverse of its rotation is stored as the world-to-base rotation.
+	void setBaseWorldTransform(const btTransform& tr)
+	{
+		const btQuaternion worldToBase = tr.getRotation().inverse();
+		setBasePos(tr.getOrigin());
+		setWorldToBaseRot(worldToBase);
+	}
+	/// Builds the base pose as a transform: origin is the base position, rotation is the
+	/// inverse of the stored world-to-base rotation.
+	btTransform getBaseWorldTransform() const
+	{
+		const btQuaternion baseToWorld = getWorldToBaseRot().inverse();
+		btTransform result;
+		result.setOrigin(getBasePos());
+		result.setRotation(baseToWorld);
+		return result;
+	}
+    /// Writes the base linear velocity into entries 3..5 of m_realBuf.
+    void setBaseVel(const btVector3 &vel)
+    {
+        for (int axis = 0; axis < 3; ++axis)
+        {
+            m_realBuf[3 + axis] = vel[axis];
+        }
+    }
+    /// Overwrites the stored world-to-base rotation (m_baseQuat).
+    /// Open question carried over from the original author: m_baseQuat assumed to be alias!?
+    void setWorldToBaseRot(const btQuaternion &rot) { m_baseQuat = rot; }
+    /// Writes the base angular velocity into entries 0..2 of m_realBuf.
+    void setBaseOmega(const btVector3 &omega)
+    {
+        for (int axis = 0; axis < 3; ++axis)
+        {
+            m_realBuf[axis] = omega[axis];
+        }
+    }
+    //
+    // get/set pos/vel for child m_links (i = 0 to num_links-1)
+    //
+    btScalar getJointPos(int i) const;
+    btScalar getJointVel(int i) const;
+	btScalar * getJointVelMultiDof(int i);
+	btScalar * getJointPosMultiDof(int i);
+	const btScalar * getJointVelMultiDof(int i) const ;
+	const btScalar * getJointPosMultiDof(int i) const ;
+    void setJointPos(int i, btScalar q);
+    void setJointVel(int i, btScalar qdot);
+	void setJointPosMultiDof(int i, btScalar *q);
+    void setJointVelMultiDof(int i, btScalar *qdot);	
+    //
+    // direct read-only access to the velocities, which start at the beginning of m_realBuf.
+    // (omega first [0..2], then v [3..5], then joint velocities.)
+    //
+    const btScalar * getVelocityVector() const
+    {
+        const btScalar &first = m_realBuf[0];
+        return &first;
+    }
+/*    btScalar * getVelocityVector() 
+	{ 
+		return &real_buf[0]; 
+	}
+  */  
+    //
+    // get the frames of reference (positions and orientations) of the child m_links
+    // (i = 0 to num_links-1)
+    //
+    const btVector3 & getRVector(int i) const;   // vector from COM(parent(i)) to COM(i), in frame i's coords
+    const btQuaternion & getParentToLocalRot(int i) const;   // rotates vectors in frame parent(i) to vectors in frame i.
+    //
+    // transform vectors in local frame of link i to world frame (or vice versa)
+    //
+    btVector3 localPosToWorld(int i, const btVector3 &vec) const;
+    btVector3 localDirToWorld(int i, const btVector3 &vec) const;
+    btVector3 worldPosToLocal(int i, const btVector3 &vec) const;
+    btVector3 worldDirToLocal(int i, const btVector3 &vec) const;
+    //
+    // transform a frame in local coordinate to a frame in world coordinate
+    //
+    btMatrix3x3 localFrameToWorld(int i, const btMatrix3x3 &mat) const;
+    //
+    // calculate kinetic energy and angular momentum
+    // useful for debugging.
+    //
+    btScalar getKineticEnergy() const;
+    btVector3 getAngularMomentum() const;
+    //
+    // set external forces and torques. Note all external forces/torques are given in the WORLD frame.
+    //
+    void clearForcesAndTorques();
+   void clearConstraintForces();
+	void clearVelocities();
+    /// Accumulates `f` into m_baseForce.
+    void addBaseForce(const btVector3 &f) { m_baseForce += f; }
+    /// Accumulates `t` into m_baseTorque.
+    void addBaseTorque(const btVector3 &t)
+    {
+        m_baseTorque += t;
+    }
+    void addLinkForce(int i, const btVector3 &f);
+    void addLinkTorque(int i, const btVector3 &t);
+ /// Accumulates `f` into m_baseConstraintForce.
+ void addBaseConstraintForce(const btVector3 &f) { m_baseConstraintForce += f; }
+    /// Accumulates `t` into m_baseConstraintTorque.
+    void addBaseConstraintTorque(const btVector3 &t)
+    {
+        m_baseConstraintTorque += t;
+    }
+    void addLinkConstraintForce(int i, const btVector3 &f);
+    void addLinkConstraintTorque(int i, const btVector3 &t);
+void addJointTorque(int i, btScalar Q);
+	void addJointTorqueMultiDof(int i, int dof, btScalar Q);
+	void addJointTorqueMultiDof(int i, const btScalar *Q);
+    const btVector3 & getBaseForce() const { return m_baseForce; }
+    const btVector3 & getBaseTorque() const { return m_baseTorque; }
+    const btVector3 & getLinkForce(int i) const;
+    const btVector3 & getLinkTorque(int i) const;
+    btScalar getJointTorque(int i) const;
+	btScalar * getJointTorqueMultiDof(int i);
+    //
+    // dynamics routines.
+    //
+    // timestep the velocities (given the external forces/torques set using addBaseForce etc).
+    // also sets up caches for calcAccelerationDeltas.
+    //
+    // Note: the caller must provide three vectors which are used as
+    // temporary scratch space. The idea here is to reduce dynamic
+    // memory allocation: the same scratch vectors can be re-used
+    // again and again for different Multibodies, instead of each
+    // btMultiBody allocating (and then deallocating) their own
+    // individual scratch buffers. This gives a considerable speed
+    // improvement, at least on Windows (where dynamic memory
+    // allocation appears to be fairly slow).
+    //
+	void computeAccelerationsArticulatedBodyAlgorithmMultiDof(btScalar dt,
+                        btAlignedObjectArray<btScalar> &scratch_r,
+                        btAlignedObjectArray<btVector3> &scratch_v,
+                        btAlignedObjectArray<btMatrix3x3> &scratch_m,
+			bool isConstraintPass=false
+		);
+/// Deprecated: call computeAccelerationsArticulatedBodyAlgorithmMultiDof instead.
+/// This wrapper forwards all of its arguments, unchanged, to that routine.
+        void stepVelocitiesMultiDof(btScalar dt,
+                        btAlignedObjectArray<btScalar> &scratch_r,
+                        btAlignedObjectArray<btVector3> &scratch_v,
+                        btAlignedObjectArray<btMatrix3x3> &scratch_m,
+                        bool isConstraintPass=false)
+        {
+                computeAccelerationsArticulatedBodyAlgorithmMultiDof(
+                        dt,
+                        scratch_r,
+                        scratch_v,
+                        scratch_m,
+                        isConstraintPass);
+        }
+    // calcAccelerationDeltasMultiDof
+    // input: force vector (in same format as jacobian, i.e.:
+    //                      3 torque values, 3 force values, num_links joint torque values)
+    // output: 3 omegadot values, 3 vdot values, num_links q_double_dot values
+    // (existing contents of output array are replaced)
+    // computeAccelerationsArticulatedBodyAlgorithmMultiDof (or the deprecated stepVelocitiesMultiDof) must have been called first: it sets up the caches used by this routine.
+	void calcAccelerationDeltasMultiDof(const btScalar *force, btScalar *output,
+                                btAlignedObjectArray<btScalar> &scratch_r,
+                                btAlignedObjectArray<btVector3> &scratch_v) const;
+	/// Accumulates `delta_vee * multiplier` into the first 6 + getNumDofs() entries of m_deltaV.
+	/// The accumulated values are applied and cleared by processDeltaVeeMultiDof2().
+	void applyDeltaVeeMultiDof2(const btScalar * delta_vee, btScalar multiplier)
+	{
+		const int entryCount = 6 + getNumDofs();
+		for (int k = 0; k < entryCount; ++k)
+		{
+			m_deltaV[k] += delta_vee[k] * multiplier;
+		}
+	}
+	/// Applies the deltas accumulated in m_deltaV (with multiplier 1) through
+	/// applyDeltaVeeMultiDof, then zeroes the first 6 + getNumDofs() accumulator entries.
+	void processDeltaVeeMultiDof2()
+	{
+		applyDeltaVeeMultiDof(&m_deltaV[0],1);
+		const int entryCount = 6 + getNumDofs();
+		int k = 0;
+		while (k < entryCount)
+		{
+			m_deltaV[k] = 0.f;
+			++k;
+		}
+	}
+	/// Adds `delta_vee * multiplier` to the first 6 + getNumDofs() entries of m_realBuf and
+	/// passes each updated entry through btClamp with bounds -/+ m_maxCoordinateVelocity.
+	void applyDeltaVeeMultiDof(const btScalar * delta_vee, btScalar multiplier) 
+	{
+		// Disabled code kept for reference (debug print, then limiting by m_maxAppliedImpulse):
+		//for (int dof = 0; dof < 6 + getNumDofs(); ++dof)
+		//	printf("%.4f ", delta_vee[dof]*multiplier);
+		//printf("\n");
+		//btScalar sum = 0;
+		//for (int dof = 0; dof < 6 + getNumDofs(); ++dof)
+		//{
+		//	sum += delta_vee[dof]*multiplier*delta_vee[dof]*multiplier;
+		//}
+		//btScalar l = btSqrt(sum);
+		//if (l>m_maxAppliedImpulse)
+		//{
+		//	multiplier *= m_maxAppliedImpulse/l;
+		//}
+		const int entryCount = 6 + getNumDofs();
+		for (int k = 0; k < entryCount; ++k)
+		{
+			btScalar& velocity = m_realBuf[k];
+			velocity += delta_vee[k] * multiplier;
+			btClamp(velocity,-m_maxCoordinateVelocity,m_maxCoordinateVelocity);
+		}
+	}
+    // timestep the positions (given current velocities).
+	void stepPositionsMultiDof(btScalar dt, btScalar *pq = 0, btScalar *pqd = 0);
+    //
+    // contacts
+    //
+    // Fills out a contact constraint jacobian for this body by forwarding to
+    // fillConstraintJacobianMultiDof with an all-zero angular normal.
+    // The 'normal' supplied must be -n for body1 or +n for body2 of the contact.
+    // 'normal' & 'contact_point' are both given in world coordinates.
+	void fillContactJacobianMultiDof(int link,
+                             const btVector3 &contact_point,
+                             const btVector3 &normal,
+                             btScalar *jac,
+                             btAlignedObjectArray<btScalar> &scratch_r,
+                             btAlignedObjectArray<btVector3> &scratch_v,
+							 btAlignedObjectArray<btMatrix3x3> &scratch_m) const
+	{
+		const btVector3 zeroAngularNormal(0, 0, 0);
+		fillConstraintJacobianMultiDof(link, contact_point, zeroAngularNormal, normal, jac, scratch_r, scratch_v, scratch_m);
+	}
+	//a more general version of fillContactJacobianMultiDof: it does not assume that the constraint
+	//in question is a contact or, to be more precise, that it constrains linear velocity only
+	void fillConstraintJacobianMultiDof(int link,
+                             const btVector3 &contact_point,
+							 const btVector3 &normal_ang,
+                             const btVector3 &normal_lin,
+                             btScalar *jac,
+                             btAlignedObjectArray<btScalar> &scratch_r,
+                             btAlignedObjectArray<btVector3> &scratch_v,
+                             btAlignedObjectArray<btMatrix3x3> &scratch_m) const;
+    //
+    // sleeping
+    //
+	/// Sets the m_canSleep flag.
+	void	setCanSleep(bool canSleep) { m_canSleep = canSleep; }
+	/// Returns the m_canSleep flag.
+	bool getCanSleep()const { return m_canSleep; }
+    /// Returns the m_awake flag.
+    bool isAwake() const
+    {
+        return m_awake;
+    }
+    void wakeUp();
+    void goToSleep();
+    void checkMotionAndSleepIfRequired(btScalar timestep);
+	/// Returns m_fixedBase: true when the base is fixed, false when it can move.
+	bool hasFixedBase() const { return m_fixedBase; }
+	/// Returns the stored companion id (m_companionId).
+	int getCompanionId() const { return m_companionId; }
+	/// Stores `id` as the companion id (m_companionId).
+	void setCompanionId(int id)
+	{
+		// debug trace, disabled:
+		//printf("for %p setCompanionId(%d)\n",this, id);
+		this->m_companionId = id;
+	}
+	/// Resizes m_links to `numLinks` entries.
+	/// Careful: when changing the number of m_links, make sure to re-initialize or update
+	/// the existing m_links.
+	void setNumLinks(int numLinks) { m_links.resize(numLinks); }
+	/// Returns m_linearDamping.
+	btScalar getLinearDamping() const { return m_linearDamping; }
+	/// Sets m_linearDamping to `damp`.
+	void setLinearDamping( btScalar damp) { m_linearDamping = damp; }
+	/// Returns m_angularDamping.
+	btScalar getAngularDamping() const { return m_angularDamping; }
+	/// Sets m_angularDamping to `damp`.
+	void setAngularDamping( btScalar damp) { m_angularDamping = damp; }
+	/// Returns the m_useGyroTerm flag.
+	bool getUseGyroTerm() const { return m_useGyroTerm; }
+	/// Sets the m_useGyroTerm flag.
+	void setUseGyroTerm(bool useGyro) { m_useGyroTerm = useGyro; }
+	/// Returns m_maxCoordinateVelocity, the bound applyDeltaVeeMultiDof clamps each velocity entry to.
+	btScalar	getMaxCoordinateVelocity() const { return m_maxCoordinateVelocity; }
+	/// Sets m_maxCoordinateVelocity, the bound applyDeltaVeeMultiDof clamps each velocity entry to.
+	void	setMaxCoordinateVelocity(btScalar maxVel) { m_maxCoordinateVelocity = maxVel; }
+	/// Returns m_maxAppliedImpulse.
+	btScalar	getMaxAppliedImpulse() const { return m_maxAppliedImpulse; }
+	/// Sets m_maxAppliedImpulse to `maxImp`.
+	void	setMaxAppliedImpulse(btScalar maxImp) { m_maxAppliedImpulse = maxImp; }
+	/// Sets the m_hasSelfCollision flag.
+	void	setHasSelfCollision(bool hasSelfCollision) { m_hasSelfCollision = hasSelfCollision; }
+	/// Returns the m_hasSelfCollision flag.
+	bool hasSelfCollision() const { return m_hasSelfCollision; }
+	void finalizeMultiDof();
+	/// Sets the m_useRK4 flag.
+	void useRK4Integration(bool use)
+	{
+		m_useRK4 = use;
+	}
+	/// Returns the m_useRK4 flag.
+	bool isUsingRK4Integration() const
+	{
+		return m_useRK4;
+	}
+	/// Sets the m_useGlobalVelocities flag.
+	void useGlobalVelocities(bool use)
+	{
+		m_useGlobalVelocities = use;
+	}
+	/// Returns the m_useGlobalVelocities flag.
+	bool isUsingGlobalVelocities() const
+	{
+		return m_useGlobalVelocities;
+	}
+	/// Returns the __posUpdated flag.
+	bool isPosUpdated() const { return __posUpdated; }
+	/// Sets the __posUpdated flag.
+	void setPosUpdated(bool updated) { __posUpdated = updated; }
+	/// For internal use only: returns the m_internalNeedsJointFeedback flag.
+	bool internalNeedsJointFeedback() const { return m_internalNeedsJointFeedback; }
+	void	forwardKinematics(btAlignedObjectArray<btQuaternion>& scratch_q,btAlignedObjectArray<btVector3>& scratch_m);
+	void	updateCollisionObjectWorldTransforms(btAlignedObjectArray<btQuaternion>& scratch_q,btAlignedObjectArray<btVector3>& scratch_m);
+	virtual	int	calculateSerializeBufferSize()	const;
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+	virtual	const char*	serialize(void* dataBuffer,  class btSerializer* serializer) const;
+	/// Returns the stored base-name pointer (m_baseName); the string memory is managed by the user.
+	const char*				getBaseName() const { return m_baseName; }
+	/// Stores the `name` pointer as the base name. Only the pointer is kept (no copy is made),
+	/// so the memory passed to setBaseName needs to be managed by the user.
+	void	setBaseName(const char* name) { m_baseName = name; }
+    btMultiBody(const btMultiBody &);  // not implemented
+    void operator=(const btMultiBody &);  // not implemented
+    void compTreeLinkVelocities(btVector3 *omega, btVector3 *vel) const;
+	void solveImatrix(const btVector3& rhs_top, const btVector3& rhs_bot, float result[6]) const;
+	void solveImatrix(const btSpatialForceVector &rhs, btSpatialMotionVector &result) const;
+	/// Walks m_links in order and gives each link its m_dofOffset / m_cfgOffset: the running
+	/// totals of m_dofCount and m_posVarCount over all links that precede it.
+	void updateLinksDofOffsets()
+	{
+		int nextDofOffset = 0;
+		int nextCfgOffset = 0;
+		for (int li = 0; li < m_links.size(); ++li)
+		{
+			btMultibodyLink& link = m_links[li];
+			link.m_dofOffset = nextDofOffset;
+			link.m_cfgOffset = nextCfgOffset;
+			nextDofOffset += link.m_dofCount;
+			nextCfgOffset += link.m_posVarCount;
+		}
+	}
+	void mulMatrix(btScalar *pA, btScalar *pB, int rowsA, int colsA, int rowsB, int colsB, btScalar *pC) const;
+	btMultiBodyLinkCollider* m_baseCollider;//can be NULL
+	const char*				m_baseName;//memory needs to be managed by user!
+    btVector3 m_basePos;       // position of COM of base (world frame)
+    btQuaternion m_baseQuat;   // rotates world points into base frame
+    btScalar m_baseMass;         // mass of the base
+    btVector3 m_baseInertia;   // inertia of the base (in local frame; diagonal)
+    btVector3 m_baseForce;     // external force applied to base. World frame.
+    btVector3 m_baseTorque;    // external torque applied to base. World frame.
+    btVector3 m_baseConstraintForce;     // constraint force applied to base (accumulated by addBaseConstraintForce). World frame.
+    btVector3 m_baseConstraintTorque;    // constraint torque applied to base (accumulated by addBaseConstraintTorque). World frame.
+    btAlignedObjectArray<btMultibodyLink> m_links;    // array of m_links, excluding the base. index from 0 to num_links-1.
+	btAlignedObjectArray<btMultiBodyLinkCollider*> m_colliders;
+    //
+    // realBuf:
+    //  offset         size            array
+    //   0              6 + num_links   v (base_omega; base_vel; joint_vels)					MULTIDOF [sysdof x sysdof for D matrices (TOO MUCH!) + pos_delta which is sys-cfg sized]
+    //   6+num_links    num_links       D
+    //
+    // vectorBuf:
+    //  offset         size         array
+    //   0              num_links    h_top
+    //   num_links      num_links    h_bottom
+    //
+    // matrixBuf:
+    //  offset         size         array
+    //   0              num_links+1  rot_from_parent
+    //
+   btAlignedObjectArray<btScalar> m_deltaV; // velocity deltas accumulated by applyDeltaVeeMultiDof2 and applied/zeroed by processDeltaVeeMultiDof2
+    btAlignedObjectArray<btScalar> m_realBuf;
+    btAlignedObjectArray<btVector3> m_vectorBuf;
+    btAlignedObjectArray<btMatrix3x3> m_matrixBuf;
+	btMatrix3x3 m_cachedInertiaTopLeft;
+	btMatrix3x3 m_cachedInertiaTopRight;
+	btMatrix3x3 m_cachedInertiaLowerLeft;
+	btMatrix3x3 m_cachedInertiaLowerRight;
+    bool m_fixedBase;
+    // Sleep parameters.
+    bool m_awake;
+    bool m_canSleep;
+    btScalar m_sleepTimer;
+	int	m_companionId;
+	btScalar	m_linearDamping;
+	btScalar	m_angularDamping;
+	bool	m_useGyroTerm;
+	btScalar	m_maxAppliedImpulse;
+	btScalar	m_maxCoordinateVelocity;	// applyDeltaVeeMultiDof clamps every velocity entry to +/- this value
+	bool		m_hasSelfCollision;
+		bool __posUpdated;	// NOTE(review): identifiers containing a double underscore are reserved in C++; consider renaming
+		int m_dofCount, m_posVarCnt;
+	bool m_useRK4, m_useGlobalVelocities;
+	///m_internalNeedsJointFeedback gets updated/computed during stepVelocitiesMultiDof and is for internal usage only
+	bool m_internalNeedsJointFeedback;
+struct btMultiBodyLinkDoubleData
+	btQuaternionDoubleData	m_zeroRotParentToThis;
+	btVector3DoubleData		m_parentComToThisComOffset;	// written from the link's m_eVector when serializing
+	btVector3DoubleData		m_thisPivotToThisComOffset;	// written from the link's m_dVector when serializing
+	btVector3DoubleData		m_jointAxisTop[6];
+	btVector3DoubleData		m_jointAxisBottom[6];
+	btVector3DoubleData		m_linkInertia;   // inertia of the link (in local frame; diagonal)
+	double					m_linkMass;
+	int						m_parentIndex;
+	int						m_jointType;
+	int						m_dofCount;
+	int						m_posVarCount;
+	double					m_jointPos[7];	// serialization fills one entry per position variable of the link
+	double					m_jointVel[6];	// serialization fills one entry per dof of the link
+	double					m_jointTorque[6];	// serialization fills one entry per dof of the link
+	double					m_jointDamping;
+	double					m_jointFriction;
+	char					*m_linkName;
+	char					*m_jointName;
+	btCollisionObjectDoubleData	*m_linkCollider;
+	char					*m_paddingPtr;
+struct btMultiBodyLinkFloatData
+	btQuaternionFloatData	m_zeroRotParentToThis;
+	btVector3FloatData		m_parentComToThisComOffset;	// written from the link's m_eVector when serializing
+	btVector3FloatData		m_thisPivotToThisComOffset;	// written from the link's m_dVector when serializing
+	btVector3FloatData		m_jointAxisTop[6];
+	btVector3FloatData		m_jointAxisBottom[6];
+	btVector3FloatData	m_linkInertia;   // inertia of the link (in local frame; diagonal)
+	int						m_dofCount;
+	float				m_linkMass;
+	int					m_parentIndex;
+	int					m_jointType;
+	float					m_jointPos[7];	// serialization fills one entry per position variable of the link
+	float					m_jointVel[6];	// serialization fills one entry per dof of the link
+	float					m_jointTorque[6];	// serialization fills one entry per dof of the link
+	int						m_posVarCount;
+	float					m_jointDamping;
+	float					m_jointFriction;
+	char				*m_linkName;
+	char				*m_jointName;
+	btCollisionObjectFloatData	*m_linkCollider;
+	char				*m_paddingPtr;
+///do not change these serialization structures: doing so requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	btMultiBodyDoubleData
+	btTransformDoubleData m_baseWorldTransform;
+	btVector3DoubleData m_baseInertia;   // inertia of the base (in local frame; diagonal)
+	double	m_baseMass;
+	char	*m_baseName;
+	btMultiBodyLinkDoubleData	*m_links;	// set to 0 by serialization when m_numLinks is 0
+	btCollisionObjectDoubleData	*m_baseCollider;
+	char	*m_paddingPtr;
+	int		m_numLinks;
+	char	m_padding[4];
+///do not change these serialization structures: doing so requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	btMultiBodyFloatData
+	char	*m_baseName;
+	btMultiBodyLinkFloatData	*m_links;	// set to 0 by serialization when m_numLinks is 0
+	btCollisionObjectFloatData	*m_baseCollider;
+	btTransformFloatData m_baseWorldTransform;
+	btVector3FloatData m_baseInertia;   // inertia of the base (in local frame; diagonal)
+	float	m_baseMass;
+	int		m_numLinks;
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.cpp b/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.cpp
new file mode 100644
index 00000000..119a24c6
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.cpp
@@ -0,0 +1,411 @@
+#include "btMultiBodyConstraint.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+#include "btMultiBodyPoint2Point.h"				//for testing (BTMBP2PCONSTRAINT_BLOCK_ANGULAR_MOTION_TEST macro)
+btMultiBodyConstraint::btMultiBodyConstraint(btMultiBody* bodyA,btMultiBody* bodyB,int linkA, int linkB, int numRows, bool isUnilateral) // stores the two bodies, their link indices and the row count; Jacobian sizes stay 0 until updateJacobianSizes() runs
+	:m_bodyA(bodyA),
+	m_bodyB(bodyB),
+	m_linkA(linkA),
+	m_linkB(linkB),
+	m_numRows(numRows),
+	m_jacSizeA(0),
+	m_jacSizeBoth(0),
+	m_posOffset(0), // was left indeterminate until allocateJacobiansMultiDof(); getPosition()/setPosition() index m_data with it, so give it a defined value (kept in declaration order)
+	m_isUnilateral(isUnilateral),
+	m_numDofsFinalized(-1), // -1 until a derived class records a finalized dof count (presumably in finalizeMultiDof -- confirm)
+	m_maxAppliedImpulse(100)
+void btMultiBodyConstraint::updateJacobianSizes() // recomputes m_jacSizeA and m_jacSizeBoth from the current dof counts of the attached bodies
+    if(m_bodyA)
+	{
+		m_jacSizeA = (6 + m_bodyA->getNumDofs()); // 6 extra entries on top of the body's dof count; without body A, m_jacSizeA keeps its previous value
+	}
+	if(m_bodyB)
+	{
+		m_jacSizeBoth = m_jacSizeA + 6 + m_bodyB->getNumDofs(); // one row holds A's block followed by B's block
+	}
+	else
+		m_jacSizeBoth = m_jacSizeA; // no body B: a row consists of A's block only
+void btMultiBodyConstraint::allocateJacobiansMultiDof() // refreshes the Jacobian sizes, then sizes m_data and records where the per-row positions start
+	updateJacobianSizes();
+	m_posOffset = ((1 + m_jacSizeBoth)*m_numRows); // positions follow m_numRows entries plus m_numRows*m_jacSizeBoth Jacobian entries
+	m_data.resize((2 + m_jacSizeBoth) * m_numRows); // per row: 1 entry before the Jacobians + m_jacSizeBoth Jacobian entries + 1 position entry
+void	btMultiBodyConstraint::applyDeltaVee(btMultiBodyJacobianData& data, btScalar* delta_vee, btScalar impulse, int velocityIndex, int ndof) // adds impulse*delta_vee[0..ndof) into data.m_deltaVelocities starting at velocityIndex
+	for (int i = 0; i < ndof; ++i)
+		data.m_deltaVelocities[velocityIndex+i] += delta_vee[i] * impulse; // caller must ensure velocityIndex+ndof fits in m_deltaVelocities; no bounds check here
+btScalar btMultiBodyConstraint::fillMultiBodyConstraint(	btMultiBodySolverConstraint& solverConstraint, // fills one solver row (Jacobians, unit-impulse responses, m_jacDiagABInv, rhs, limits) and returns the row's relative velocity
+                                                        btMultiBodyJacobianData& data, // shared arrays that this call appends Jacobian / unit-impulse / delta-velocity storage to
+                                                        btScalar* jacOrgA, btScalar* jacOrgB, // optional precomputed Jacobian rows; when null the row is computed with fillConstraintJacobianMultiDof
+                                                        const btVector3& constraintNormalAng,
+                                                        const btVector3& constraintNormalLin,
+                                                        const btVector3& posAworld, const btVector3& posBworld,
+                                                        btScalar posError, // combined with infoGlobal.m_linearSlop into the penetration term (ignored when isFriction)
+                                                        const btContactSolverInfo& infoGlobal,
+                                                        btScalar lowerLimit, btScalar upperLimit, // copied unchanged into the row's impulse limits
+                                                        bool angConstraint, // true: constraintNormalAng is used as torque axis instead of rel_pos x constraintNormalLin
+                                                        btScalar relaxation, // numerator of m_jacDiagABInv
+                                                        bool isFriction, btScalar desiredVelocity, btScalar cfmSlip) // NOTE(review): cfmSlip is not read anywhere in this body
+    solverConstraint.m_multiBodyA = m_bodyA;
+    solverConstraint.m_multiBodyB = m_bodyB;
+    solverConstraint.m_linkA = m_linkA;
+    solverConstraint.m_linkB = m_linkB;
+    btMultiBody* multiBodyA = solverConstraint.m_multiBodyA;
+    btMultiBody* multiBodyB = solverConstraint.m_multiBodyB;
+    btSolverBody* bodyA = multiBodyA ? 0 : &data.m_solverBodyPool->at(solverConstraint.m_solverBodyIdA); // the solver body is only looked up when side A is not a multibody
+    btSolverBody* bodyB = multiBodyB ? 0 : &data.m_solverBodyPool->at(solverConstraint.m_solverBodyIdB); // same for side B
+    btRigidBody* rb0 = multiBodyA ? 0 : bodyA->m_originalBody;
+    btRigidBody* rb1 = multiBodyB ? 0 : bodyB->m_originalBody;
+    btVector3 rel_pos1, rel_pos2;				//these two used to be inited to posAworld and posBworld (respectively) but it does not seem necessary
+    if (bodyA)
+        rel_pos1 = posAworld - bodyA->getWorldTransform().getOrigin();
+    if (bodyB)
+        rel_pos2 = posBworld - bodyB->getWorldTransform().getOrigin();
+    if (multiBodyA)
+    {
+        if (solverConstraint.m_linkA<0) // a negative link index selects the base of the multibody
+        {
+            rel_pos1 = posAworld - multiBodyA->getBasePos();
+        } else
+        {
+            rel_pos1 = posAworld - multiBodyA->getLink(solverConstraint.m_linkA).m_cachedWorldTransform.getOrigin();
+        }
+        const int ndofA  = multiBodyA->getNumDofs() + 6;
+        solverConstraint.m_deltaVelAindex = multiBodyA->getCompanionId();
+        if (solverConstraint.m_deltaVelAindex <0) // negative companion id: this multibody has no slot in m_deltaVelocities yet, so append one
+        {
+            solverConstraint.m_deltaVelAindex = data.m_deltaVelocities.size();
+            multiBodyA->setCompanionId(solverConstraint.m_deltaVelAindex);
+            data.m_deltaVelocities.resize(data.m_deltaVelocities.size()+ndofA);
+        } else
+        {
+            btAssert(data.m_deltaVelocities.size() >= solverConstraint.m_deltaVelAindex+ndofA);
+        }
+        //determine jacobian of this 1D constraint in terms of multibodyA's degrees of freedom
+        //resize..
+        solverConstraint.m_jacAindex = data.m_jacobians.size();
+        data.m_jacobians.resize(data.m_jacobians.size()+ndofA);
+        //copy/determine
+        if(jacOrgA)
+        {
+            for (int i=0;i<ndofA;i++)
+                data.m_jacobians[solverConstraint.m_jacAindex+i] = jacOrgA[i];
+        }
+        else
+        {
+            btScalar* jac1=&data.m_jacobians[solverConstraint.m_jacAindex];
+            //multiBodyA->fillContactJacobianMultiDof(solverConstraint.m_linkA, posAworld, constraintNormalLin, jac1, data.scratch_r, data.scratch_v, data.scratch_m);
+            multiBodyA->fillConstraintJacobianMultiDof(solverConstraint.m_linkA, posAworld, constraintNormalAng, constraintNormalLin, jac1, data.scratch_r, data.scratch_v, data.scratch_m);
+        }
+        //determine the velocity response of multibodyA to reaction impulses of this constraint (i.e. A[i,i] for i=1,...n_con: multibody's inverse inertia with respect to this 1D constraint)
+        //resize..
+        data.m_deltaVelocitiesUnitImpulse.resize(data.m_deltaVelocitiesUnitImpulse.size()+ndofA);		//=> each constraint row has the constrained tree dofs allocated in m_deltaVelocitiesUnitImpulse
+        btAssert(data.m_jacobians.size() == data.m_deltaVelocitiesUnitImpulse.size()); // both arrays grow in lock-step, so m_jacAindex indexes either one
+        btScalar* delta = &data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacAindex];
+        //determine..
+        multiBodyA->calcAccelerationDeltasMultiDof(&data.m_jacobians[solverConstraint.m_jacAindex],delta,data.scratch_r, data.scratch_v);
+        btVector3 torqueAxis0;
+        if (angConstraint) {
+            torqueAxis0 = constraintNormalAng;
+        }
+        else {
+            torqueAxis0 = rel_pos1.cross(constraintNormalLin);
+        }
+        solverConstraint.m_relpos1CrossNormal = torqueAxis0;
+        solverConstraint.m_contactNormal1 = constraintNormalLin;
+    }
+    else //if(rb0)
+    {
+        btVector3 torqueAxis0;
+        if (angConstraint) {
+            torqueAxis0 = constraintNormalAng;
+        }
+        else {
+            torqueAxis0 = rel_pos1.cross(constraintNormalLin);
+        }
+        solverConstraint.m_angularComponentA = rb0 ? rb0->getInvInertiaTensorWorld()*torqueAxis0*rb0->getAngularFactor() : btVector3(0,0,0); // zero when the solver body has no original rigid body
+        solverConstraint.m_relpos1CrossNormal = torqueAxis0;
+        solverConstraint.m_contactNormal1 = constraintNormalLin;
+    }
+    if (multiBodyB) // side B mirrors side A with negated normals
+    {
+        if (solverConstraint.m_linkB<0)
+        {
+            rel_pos2 = posBworld - multiBodyB->getBasePos();
+        } else
+        {
+            rel_pos2 = posBworld - multiBodyB->getLink(solverConstraint.m_linkB).m_cachedWorldTransform.getOrigin();
+        }
+        const int ndofB  = multiBodyB->getNumDofs() + 6;
+        solverConstraint.m_deltaVelBindex = multiBodyB->getCompanionId();
+        if (solverConstraint.m_deltaVelBindex <0)
+        {
+            solverConstraint.m_deltaVelBindex = data.m_deltaVelocities.size();
+            multiBodyB->setCompanionId(solverConstraint.m_deltaVelBindex);
+            data.m_deltaVelocities.resize(data.m_deltaVelocities.size()+ndofB);
+        } // NOTE(review): unlike side A there is no else-branch btAssert on the existing slot size here
+        //determine jacobian of this 1D constraint in terms of multibodyB's degrees of freedom
+        //resize..
+        solverConstraint.m_jacBindex = data.m_jacobians.size();
+        data.m_jacobians.resize(data.m_jacobians.size()+ndofB);
+        //copy/determine..
+        if(jacOrgB)
+        {
+            for (int i=0;i<ndofB;i++)
+                data.m_jacobians[solverConstraint.m_jacBindex+i] = jacOrgB[i];
+        }
+        else
+        {
+            //multiBodyB->fillContactJacobianMultiDof(solverConstraint.m_linkB, posBworld, -constraintNormalLin, &data.m_jacobians[solverConstraint.m_jacBindex], data.scratch_r, data.scratch_v, data.scratch_m);
+            multiBodyB->fillConstraintJacobianMultiDof(solverConstraint.m_linkB, posBworld, -constraintNormalAng, -constraintNormalLin, &data.m_jacobians[solverConstraint.m_jacBindex], data.scratch_r, data.scratch_v, data.scratch_m);
+        }
+        //determine velocity response of multibodyB to reaction impulses of this constraint (i.e. A[i,i] for i=1,...n_con: multibody's inverse inertia with respect to this 1D constraint)
+        //resize..
+        data.m_deltaVelocitiesUnitImpulse.resize(data.m_deltaVelocitiesUnitImpulse.size()+ndofB);
+        btAssert(data.m_jacobians.size() == data.m_deltaVelocitiesUnitImpulse.size());
+        btScalar* delta = &data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacBindex];
+        //determine..
+        multiBodyB->calcAccelerationDeltasMultiDof(&data.m_jacobians[solverConstraint.m_jacBindex],delta,data.scratch_r, data.scratch_v);
+        btVector3 torqueAxis1;
+        if (angConstraint) {
+            torqueAxis1 = constraintNormalAng;
+        }
+        else {
+            torqueAxis1 = rel_pos2.cross(constraintNormalLin);
+        }
+        solverConstraint.m_relpos2CrossNormal = -torqueAxis1;
+        solverConstraint.m_contactNormal2 = -constraintNormalLin;
+    }
+    else //if(rb1)
+    {
+        btVector3 torqueAxis1;
+        if (angConstraint) {
+            torqueAxis1 = constraintNormalAng;
+        }
+        else {
+            torqueAxis1 = rel_pos2.cross(constraintNormalLin);
+        }
+        solverConstraint.m_angularComponentB = rb1 ? rb1->getInvInertiaTensorWorld()*-torqueAxis1*rb1->getAngularFactor() : btVector3(0,0,0);
+        solverConstraint.m_relpos2CrossNormal = -torqueAxis1;
+        solverConstraint.m_contactNormal2 = -constraintNormalLin;
+    }
+    {
+        btVector3 vec;
+        btScalar denom0 = 0.f; // side A contribution to the row's diagonal term
+        btScalar denom1 = 0.f; // side B contribution to the row's diagonal term
+        btScalar* jacB = 0;
+        btScalar* jacA = 0;
+        btScalar* deltaVelA = 0;
+        btScalar* deltaVelB = 0;
+        int ndofA  = 0;
+        //determine the "effective mass" of the constrained multibodyA with respect to this 1D constraint (i.e. 1/A[i,i])
+        if (multiBodyA)
+        {
+            ndofA = multiBodyA->getNumDofs() + 6;
+            jacA = &data.m_jacobians[solverConstraint.m_jacAindex];
+            deltaVelA = &data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacAindex];
+            for (int i = 0; i < ndofA; ++i) // denom0 = dot(Jacobian row, unit-impulse velocity response)
+            {
+                btScalar j = jacA[i] ;
+                btScalar l = deltaVelA[i];
+                denom0 += j*l;
+            }
+        }
+        else if(rb0)
+        {
+            vec = ( solverConstraint.m_angularComponentA).cross(rel_pos1);
+            if (angConstraint) {
+                denom0 = rb0->getInvMass() + constraintNormalAng.dot(vec);
+            }
+            else {
+                denom0 = rb0->getInvMass() + constraintNormalLin.dot(vec);
+            }
+        }
+        //
+        if (multiBodyB)
+        {
+            const int ndofB = multiBodyB->getNumDofs() + 6;
+            jacB = &data.m_jacobians[solverConstraint.m_jacBindex];
+            deltaVelB = &data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacBindex];
+            for (int i = 0; i < ndofB; ++i)
+            {
+                btScalar j = jacB[i] ;
+                btScalar l = deltaVelB[i];
+                denom1 += j*l;
+            }
+        }
+        else if(rb1)
+        {
+            vec = ( -solverConstraint.m_angularComponentB).cross(rel_pos2);
+            if (angConstraint) {
+                denom1 = rb1->getInvMass() + constraintNormalAng.dot(vec);
+            }
+            else {
+                denom1 = rb1->getInvMass() + constraintNormalLin.dot(vec);
+            }
+        }
+        //
+        btScalar d = denom0+denom1;
+        if (d>SIMD_EPSILON) // guard against dividing by a (near-)zero diagonal
+        {
+            solverConstraint.m_jacDiagABInv = relaxation/(d);
+        }
+        else
+        {
+            //disable the constraint row to handle singularity/redundant constraint
+            solverConstraint.m_jacDiagABInv  = 0.f;
+        }
+    }
+    //compute rhs and remaining solverConstraint fields
+    btScalar penetration = isFriction? 0 : posError+infoGlobal.m_linearSlop; // friction rows carry no positional term
+    btScalar rel_vel = 0.f; // accumulates Jacobian-projected velocity of both sides; this is the function's return value
+    int ndofA  = 0;
+    int ndofB  = 0;
+    {
+        btVector3 vel1,vel2; // NOTE(review): vel1/vel2 are not referenced again in this function
+        if (multiBodyA)
+        {
+            ndofA = multiBodyA->getNumDofs() + 6;
+            btScalar* jacA = &data.m_jacobians[solverConstraint.m_jacAindex];
+            for (int i = 0; i < ndofA ; ++i)
+                rel_vel += multiBodyA->getVelocityVector()[i] * jacA[i];
+        }
+        else if(rb0)
+        {
+            rel_vel += rb0->getVelocityInLocalPoint(rel_pos1).dot(solverConstraint.m_contactNormal1);
+        }
+        if (multiBodyB)
+        {
+            ndofB = multiBodyB->getNumDofs() + 6;
+            btScalar* jacB = &data.m_jacobians[solverConstraint.m_jacBindex];
+            for (int i = 0; i < ndofB ; ++i)
+                rel_vel += multiBodyB->getVelocityVector()[i] * jacB[i];
+        }
+        else if(rb1)
+        {
+            rel_vel += rb1->getVelocityInLocalPoint(rel_pos2).dot(solverConstraint.m_contactNormal2);
+        }
+        solverConstraint.m_friction = 0.f;//cp.m_combinedFriction;
+    }
+    ///warm starting (or zero if disabled)
+    /*
+     if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
+     {
+     solverConstraint.m_appliedImpulse = isFriction ? 0 : cp.m_appliedImpulse * infoGlobal.m_warmstartingFactor;
+     if (solverConstraint.m_appliedImpulse)
+     {
+     if (multiBodyA)
+     {
+     btScalar impulse = solverConstraint.m_appliedImpulse;
+     btScalar* deltaV = &data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacAindex];
+     multiBodyA->applyDeltaVee(deltaV,impulse);
+     applyDeltaVee(data,deltaV,impulse,solverConstraint.m_deltaVelAindex,ndofA);
+     } else
+     {
+     if (rb0)
+					bodyA->internalApplyImpulse(solverConstraint.m_contactNormal1*bodyA->internalGetInvMass()*rb0->getLinearFactor(),solverConstraint.m_angularComponentA,solverConstraint.m_appliedImpulse);
+     }
+     if (multiBodyB)
+     {
+     btScalar impulse = solverConstraint.m_appliedImpulse;
+     btScalar* deltaV = &data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacBindex];
+     multiBodyB->applyDeltaVee(deltaV,impulse);
+     applyDeltaVee(data,deltaV,impulse,solverConstraint.m_deltaVelBindex,ndofB);
+     } else
+     {
+     if (rb1)
+					bodyB->internalApplyImpulse(-solverConstraint.m_contactNormal2*bodyB->internalGetInvMass()*rb1->getLinearFactor(),-solverConstraint.m_angularComponentB,-(btScalar)solverConstraint.m_appliedImpulse);
+     }
+     }
+     } else
+     */
+    solverConstraint.m_appliedImpulse = 0.f; // warm starting above is commented out, so every row starts from zero impulse
+    solverConstraint.m_appliedPushImpulse = 0.f;
+    {
+        btScalar positionalError = 0.f;
+        btScalar	velocityError = desiredVelocity - rel_vel;// * damping;
+        btScalar erp = infoGlobal.m_erp2;
+        if (!infoGlobal.m_splitImpulse || (penetration > infoGlobal.m_splitImpulsePenetrationThreshold))
+        {
+            erp = infoGlobal.m_erp; // same condition as the rhs-combining branch below
+        }
+        positionalError = -penetration * erp/infoGlobal.m_timeStep;
+        btScalar  penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
+        btScalar velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
+        if (!infoGlobal.m_splitImpulse || (penetration > infoGlobal.m_splitImpulsePenetrationThreshold))
+        {
+            //combine position and velocity into rhs
+            solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
+            solverConstraint.m_rhsPenetration = 0.f;
+        } else
+        {
+            //split position and velocity into rhs and m_rhsPenetration
+            solverConstraint.m_rhs = velocityImpulse;
+            solverConstraint.m_rhsPenetration = penetrationImpulse;
+        }
+        solverConstraint.m_cfm = 0.f; // always zero; the cfmSlip argument is not applied
+        solverConstraint.m_lowerLimit = lowerLimit;
+        solverConstraint.m_upperLimit = upperLimit;
+    }
+    return rel_vel;
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.h
new file mode 100644
index 00000000..74c6f5a8
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.h
@@ -0,0 +1,183 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "btMultiBody.h"
+class btMultiBody;
+struct btSolverInfo;
+#include "btMultiBodySolverConstraint.h"
+struct btMultiBodyJacobianData // arrays shared between constraint-row setup and the solver iterations
+	btAlignedObjectArray<btScalar>		m_jacobians; // concatenated per-row Jacobian blocks; rows index it through m_jacAindex / m_jacBindex
+	btAlignedObjectArray<btScalar>		m_deltaVelocitiesUnitImpulse;	//holds the joint-space response of the corresp. tree to the test impulse in each constraint space dimension
+	btAlignedObjectArray<btScalar>		m_deltaVelocities;				//holds joint-space vectors of all the constrained trees accumulating the effect of corrective impulses applied in SI
+	btAlignedObjectArray<btScalar>		scratch_r; // scratch buffers handed to the btMultiBody Jacobian / acceleration-delta helpers
+	btAlignedObjectArray<btVector3>		scratch_v;
+	btAlignedObjectArray<btMatrix3x3>	scratch_m;
+	btAlignedObjectArray<btSolverBody>*	m_solverBodyPool; // non-owning as far as this struct shows; used to look up solver bodies for non-multibody sides
+	int									m_fixedBodyId;
+class btMultiBodyConstraint // abstract base for constraints between (links of) one or two btMultiBody objects; derived classes create the solver rows
+	btMultiBody*	m_bodyA;
+    btMultiBody*	m_bodyB;
+    int				m_linkA;
+    int				m_linkB;
+    int				m_numRows;
+    int				m_jacSizeA; // entries in body A's Jacobian block: 6 + getNumDofs() (see updateJacobianSizes)
+    int				m_jacSizeBoth; // entries in one full Jacobian row: A's block plus B's block when body B exists
+    int				m_posOffset; // index in m_data where the per-row positions start; assigned in allocateJacobiansMultiDof
+	bool			m_isUnilateral;
+	int				m_numDofsFinalized;
+	btScalar		m_maxAppliedImpulse;
+    // warning: the data block layout is not consistent for all constraints
+    // data block laid out as follows:
+    // cached impulses. (one per row.)
+    // jacobians. (interleaved, row1 body1 then row1 body2 then row2 body 1 etc)
+    // positions. (one per row.)
+    btAlignedObjectArray<btScalar> m_data;
+	void	applyDeltaVee(btMultiBodyJacobianData& data, btScalar* delta_vee, btScalar impulse, int velocityIndex, int ndof);
+	btScalar fillMultiBodyConstraint(btMultiBodySolverConstraint& solverConstraint,
+																btMultiBodyJacobianData& data,
+                                     btScalar* jacOrgA, btScalar* jacOrgB,
+                                     const btVector3& constraintNormalAng,
+																const btVector3& constraintNormalLin,
+																const btVector3& posAworld, const btVector3& posBworld,
+																btScalar posError,
+																const btContactSolverInfo& infoGlobal,
+                                     btScalar lowerLimit, btScalar upperLimit,
+                                     bool angConstraint = false,
+																btScalar relaxation = 1.f,
+																bool isFriction = false, btScalar desiredVelocity=0, btScalar cfmSlip=0);
+	btMultiBodyConstraint(btMultiBody* bodyA,btMultiBody* bodyB,int linkA, int linkB, int numRows, bool isUnilateral);
+	virtual ~btMultiBodyConstraint();
+	void updateJacobianSizes();
+	void allocateJacobiansMultiDof();
+	virtual void finalizeMultiDof()=0;
+	virtual int getIslandIdA() const =0;
+	virtual int getIslandIdB() const =0;
+	virtual void createConstraintRows(btMultiBodyConstraintArray& constraintRows,
+		btMultiBodyJacobianData& data,
+		const btContactSolverInfo& infoGlobal)=0;
+	int	getNumRows() const
+	{
+		return m_numRows;
+	}
+	btMultiBody*	getMultiBodyA()
+	{
+		return m_bodyA;
+	}
+    btMultiBody*	getMultiBodyB()
+	{
+		return m_bodyB;
+	}
+	void	internalSetAppliedImpulse(int dof, btScalar appliedImpulse) // despite the name, 'dof' is a row index: it is range-checked against getNumRows()
+	{
+		btAssert(dof>=0);
+		btAssert(dof < getNumRows());
+		m_data[dof] = appliedImpulse; // cached impulses occupy the first m_numRows entries of m_data
+	}
+	btScalar	getAppliedImpulse(int dof) // 'dof' is a row index here as well
+	{
+		btAssert(dof>=0);
+		btAssert(dof < getNumRows());
+		return m_data[dof];
+	}
+	// current constraint position
+    // constraint is pos >= 0 for unilateral, or pos = 0 for bilateral
+    // NOTE: position is ignored for friction rows.
+    btScalar getPosition(int row) const
+	{
+		return m_data[m_posOffset + row]; // no range check on row; m_posOffset is only meaningful after allocateJacobiansMultiDof()
+	}
+    void setPosition(int row, btScalar pos)
+	{
+		m_data[m_posOffset + row] = pos;
+	}
+	bool isUnilateral() const
+	{
+		return m_isUnilateral;
+	}
+	// jacobian blocks.
+    // each of size 6 + the body's getNumDofs() (see updateJacobianSizes). NOTE(review): the accessors below never return null; with no body B, jacobianB() points just past this row's A block.
+    // format: 3 'omega' coefficients, 3 'v' coefficients, then the 'qdot' coefficients.
+    btScalar* jacobianA(int row)
+	{
+		return &m_data[m_numRows + row * m_jacSizeBoth]; // skip the m_numRows cached impulses, then 'row' full Jacobian rows
+	}
+    const btScalar* jacobianA(int row) const
+	{
+		return &m_data[m_numRows + (row * m_jacSizeBoth)];
+	}
+    btScalar* jacobianB(int row)
+	{
+		return &m_data[m_numRows + (row * m_jacSizeBoth) + m_jacSizeA]; // B's block follows A's block inside the same row
+	}
+    const btScalar* jacobianB(int row) const
+	{
+		return &m_data[m_numRows + (row * m_jacSizeBoth) + m_jacSizeA];
+	}
+	btScalar	getMaxAppliedImpulse() const
+	{
+		return m_maxAppliedImpulse;
+	}
+	void	setMaxAppliedImpulse(btScalar maxImp)
+	{
+		m_maxAppliedImpulse = maxImp;
+	}
+	virtual void debugDraw(class btIDebugDraw* drawer)=0;
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraintSolver.cpp b/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraintSolver.cpp
new file mode 100644
index 00000000..08411f40
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraintSolver.cpp
@@ -0,0 +1,1067 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "btMultiBodyConstraintSolver.h"
+#include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
+#include "btMultiBodyLinkCollider.h"
+#include "BulletDynamics/ConstraintSolver/btSolverBody.h"
+#include "btMultiBodyConstraint.h"
+#include "BulletDynamics/ConstraintSolver/btContactSolverInfo.h"
+#include "LinearMath/btQuickprof.h"
+btScalar btMultiBodyConstraintSolver::solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) // runs the base-class iteration, then one pass over the multibody non-contact, normal-contact and friction rows; returns the base-class result unchanged
+	btScalar val = btSequentialImpulseConstraintSolver::solveSingleIteration(iteration, bodies ,numBodies,manifoldPtr, numManifolds,constraints,numConstraints,infoGlobal,debugDrawer);
+	//solve featherstone non-contact constraints
+	//printf("m_multiBodyNonContactConstraints = %d\n",m_multiBodyNonContactConstraints.size());
+	for (int j=0;j<m_multiBodyNonContactConstraints.size();j++)
+	{
+		btMultiBodySolverConstraint& constraint = m_multiBodyNonContactConstraints[j];
+		resolveSingleConstraintRowGeneric(constraint); // non-contact rows are solved on every call, with no iteration-count check
+		if(constraint.m_multiBodyA) 
+			constraint.m_multiBodyA->setPosUpdated(false);
+		if(constraint.m_multiBodyB) 
+			constraint.m_multiBodyB->setPosUpdated(false);
+	}
+	//solve featherstone normal contact
+	for (int j=0;j<m_multiBodyNormalContactConstraints.size();j++)
+	{
+		btMultiBodySolverConstraint& constraint = m_multiBodyNormalContactConstraints[j];
+		if (iteration < infoGlobal.m_numIterations) // contact rows are skipped once the iteration index reaches m_numIterations
+			resolveSingleConstraintRowGeneric(constraint);
+		if(constraint.m_multiBodyA) 
+			constraint.m_multiBodyA->setPosUpdated(false); // flag is cleared even when the row was skipped above
+		if(constraint.m_multiBodyB) 
+			constraint.m_multiBodyB->setPosUpdated(false);
+	}
+	//solve featherstone frictional contact
+	for (int j=0;j<this->m_multiBodyFrictionContactConstraints.size();j++)
+	{
+		if (iteration < infoGlobal.m_numIterations)
+		{
+			btMultiBodySolverConstraint& frictionConstraint = m_multiBodyFrictionContactConstraints[j];
+			btScalar totalImpulse = m_multiBodyNormalContactConstraints[frictionConstraint.m_frictionIndex].m_appliedImpulse; // impulse of the normal row this friction row refers to
+			//adjust friction limits here
+			if (totalImpulse>btScalar(0)) // friction rows are only solved while their normal row carries a positive impulse
+			{
+				frictionConstraint.m_lowerLimit = -(frictionConstraint.m_friction*totalImpulse);
+				frictionConstraint.m_upperLimit = frictionConstraint.m_friction*totalImpulse;
+				resolveSingleConstraintRowGeneric(frictionConstraint);
+				if(frictionConstraint.m_multiBodyA) 
+					frictionConstraint.m_multiBodyA->setPosUpdated(false);
+				if(frictionConstraint.m_multiBodyB) 
+					frictionConstraint.m_multiBodyB->setPosUpdated(false);
+			}
+		}
+	}
+	return val;
+btScalar btMultiBodyConstraintSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) // empties the multibody row arrays and shared Jacobian data, resets companion ids, then delegates to the base-class setup and returns its result
+	m_multiBodyNonContactConstraints.resize(0);
+	m_multiBodyNormalContactConstraints.resize(0);
+	m_multiBodyFrictionContactConstraints.resize(0);
+	m_data.m_jacobians.resize(0);
+	m_data.m_deltaVelocitiesUnitImpulse.resize(0);
+	m_data.m_deltaVelocities.resize(0);
+	for (int i=0;i<numBodies;i++)
+	{
+		const btMultiBodyLinkCollider* fcA = btMultiBodyLinkCollider::upcast(bodies[i]); // only bodies for which upcast() yields a link collider are touched
+		if (fcA)
+		{
+			fcA->m_multiBody->setCompanionId(-1); // negative id = no slot in m_deltaVelocities yet; row setup tests for <0 before allocating one
+		}
+	}
+	btScalar val = btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup( bodies,numBodies,manifoldPtr, numManifolds, constraints,numConstraints,infoGlobal,debugDrawer);
+	return val;
+void	btMultiBodyConstraintSolver::applyDeltaVee(btScalar* delta_vee, btScalar impulse, int velocityIndex, int ndof) // adds impulse*delta_vee[0..ndof) into m_data.m_deltaVelocities starting at velocityIndex
+    for (int i = 0; i < ndof; ++i) 
+		m_data.m_deltaVelocities[velocityIndex+i] += delta_vee[i] * impulse; // caller must ensure velocityIndex+ndof fits in m_deltaVelocities; no bounds check here
+void btMultiBodyConstraintSolver::resolveSingleConstraintRowGeneric(const btMultiBodySolverConstraint& c) // one impulse update for row c: compute the impulse change, clamp the accumulated impulse to [m_lowerLimit, m_upperLimit], apply the change to both sides. NOTE(review): c is const& yet c.m_appliedImpulse is assigned, so the member is presumably declared mutable -- confirm
+	btScalar deltaImpulse = c.m_rhs-btScalar(c.m_appliedImpulse)*c.m_cfm;
+	btScalar deltaVelADotn=0; // Jacobian-projected accumulated delta velocity of side A
+	btScalar deltaVelBDotn=0; // same for side B
+	btSolverBody* bodyA = 0;
+	btSolverBody* bodyB = 0;
+	int ndofA=0;
+	int ndofB=0;
+	if (c.m_multiBodyA)
+	{
+		ndofA  = c.m_multiBodyA->getNumDofs() + 6;
+		for (int i = 0; i < ndofA; ++i) 
+			deltaVelADotn += m_data.m_jacobians[c.m_jacAindex+i] * m_data.m_deltaVelocities[c.m_deltaVelAindex+i];
+	} else if(c.m_solverBodyIdA >= 0) // a negative solver-body id means side A contributes nothing
+	{
+		bodyA = &m_tmpSolverBodyPool[c.m_solverBodyIdA];
+		deltaVelADotn += c.m_contactNormal1.dot(bodyA->internalGetDeltaLinearVelocity()) 	+ c.m_relpos1CrossNormal.dot(bodyA->internalGetDeltaAngularVelocity());
+	}
+	if (c.m_multiBodyB)
+	{
+		ndofB  = c.m_multiBodyB->getNumDofs() + 6;
+		for (int i = 0; i < ndofB; ++i) 
+			deltaVelBDotn += m_data.m_jacobians[c.m_jacBindex+i] * m_data.m_deltaVelocities[c.m_deltaVelBindex+i];
+	} else if(c.m_solverBodyIdB >= 0)
+	{
+		bodyB = &m_tmpSolverBodyPool[c.m_solverBodyIdB];
+		deltaVelBDotn += c.m_contactNormal2.dot(bodyB->internalGetDeltaLinearVelocity())  + c.m_relpos2CrossNormal.dot(bodyB->internalGetDeltaAngularVelocity());
+	}
+	deltaImpulse	-=	deltaVelADotn*c.m_jacDiagABInv;//m_jacDiagABInv = 1./denom
+	deltaImpulse	-=	deltaVelBDotn*c.m_jacDiagABInv;
+	const btScalar sum = btScalar(c.m_appliedImpulse) + deltaImpulse; // candidate accumulated impulse before clamping
+	if (sum < c.m_lowerLimit)
+	{
+		deltaImpulse = c.m_lowerLimit-c.m_appliedImpulse; // shrink the step so the accumulated impulse lands exactly on the limit
+		c.m_appliedImpulse = c.m_lowerLimit;
+	}
+	else if (sum > c.m_upperLimit) 
+	{
+		deltaImpulse = c.m_upperLimit-c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_upperLimit;
+	}
+	else
+	{
+		c.m_appliedImpulse = sum;
+	}
+	if (c.m_multiBodyA)
+	{
+		applyDeltaVee(&m_data.m_deltaVelocitiesUnitImpulse[c.m_jacAindex],deltaImpulse,c.m_deltaVelAindex,ndofA);
+		//note: update of the actual velocities (below) in the multibody does not have to happen now since m_deltaVelocities can be applied after all iterations
+		//it would make the multibody solver more like the regular one with m_deltaVelocities being equivalent to btSolverBody::m_deltaLinearVelocity/m_deltaAngularVelocity
+		c.m_multiBodyA->applyDeltaVeeMultiDof2(&m_data.m_deltaVelocitiesUnitImpulse[c.m_jacAindex],deltaImpulse);
+	} else if(c.m_solverBodyIdA >= 0)
+	{
+		bodyA->internalApplyImpulse(c.m_contactNormal1*bodyA->internalGetInvMass(),c.m_angularComponentA,deltaImpulse); // bodyA was set in the matching branch above
+	}
+	if (c.m_multiBodyB)
+	{
+		applyDeltaVee(&m_data.m_deltaVelocitiesUnitImpulse[c.m_jacBindex],deltaImpulse,c.m_deltaVelBindex,ndofB);
+		//note: update of the actual velocities (below) in the multibody does not have to happen now since m_deltaVelocities can be applied after all iterations
+		//it would make the multibody solver more like the regular one with m_deltaVelocities being equivalent to btSolverBody::m_deltaLinearVelocity/m_deltaAngularVelocity
+		c.m_multiBodyB->applyDeltaVeeMultiDof2(&m_data.m_deltaVelocitiesUnitImpulse[c.m_jacBindex],deltaImpulse);
+	} else if(c.m_solverBodyIdB >= 0)
+	{
+		bodyB->internalApplyImpulse(c.m_contactNormal2*bodyB->internalGetInvMass(),c.m_angularComponentB,deltaImpulse); // bodyB was set in the matching branch above
+	}
+void btMultiBodyConstraintSolver::setupMultiBodyContactConstraint(btMultiBodySolverConstraint& solverConstraint,   // Fills one solver row for contact point cp along contactNormal: a normal-contact row, or a friction row when isFriction is true.
+																 const btVector3& contactNormal,  // row direction; stored as m_contactNormal1 and, negated, as m_contactNormal2
+																 btManifoldPoint& cp, const btContactSolverInfo& infoGlobal,  // cp: contact positions, distance, CFM/ERP, friction, restitution; infoGlobal: time step and solver settings
+																 btScalar& relaxation,  // output: set to infoGlobal.m_sor
+																 bool isFriction, btScalar desiredVelocity, btScalar cfmSlip)  // NOTE(review): desiredVelocity and cfmSlip are not referenced in the visible body
+	BT_PROFILE("setupMultiBodyContactConstraint");
+	btVector3 rel_pos1;
+	btVector3 rel_pos2;
+	btMultiBody* multiBodyA = solverConstraint.m_multiBodyA;
+	btMultiBody* multiBodyB = solverConstraint.m_multiBodyB;
+	const btVector3& pos1 = cp.getPositionWorldOnA();
+	const btVector3& pos2 = cp.getPositionWorldOnB();
+	btSolverBody* bodyA = multiBodyA ? 0 : &m_tmpSolverBodyPool[solverConstraint.m_solverBodyIdA];  // solver-body pointer stays null for a side that is a multibody
+	btSolverBody* bodyB = multiBodyB ? 0 : &m_tmpSolverBodyPool[solverConstraint.m_solverBodyIdB];
+	btRigidBody* rb0 = multiBodyA ? 0 : bodyA->m_originalBody;
+	btRigidBody* rb1 = multiBodyB ? 0 : bodyB->m_originalBody;
+	if (bodyA)
+		rel_pos1 = pos1 - bodyA->getWorldTransform().getOrigin(); // contact position relative to the solver body's origin
+	if (bodyB)
+		rel_pos2 = pos2 - bodyB->getWorldTransform().getOrigin();
+	relaxation = infoGlobal.m_sor;
+	btScalar invTimeStep = btScalar(1)/infoGlobal.m_timeStep;
+	btScalar cfm = (cp.m_contactPointFlags&BT_CONTACT_FLAG_HAS_CONTACT_CFM)?cp.m_contactCFM:infoGlobal.m_globalCfm;  // per-contact CFM when flagged on cp, otherwise the global CFM
+	cfm *= invTimeStep;  // CFM is scaled by 1/timeStep
+	btScalar erp = (cp.m_contactPointFlags&BT_CONTACT_FLAG_HAS_CONTACT_ERP)?cp.m_contactERP:infoGlobal.m_erp2;  // per-contact ERP when flagged on cp, otherwise infoGlobal.m_erp2
+	if (multiBodyA)
+	{
+		if (solverConstraint.m_linkA<0)  // negative link index: measure from the multibody base position
+		{
+			rel_pos1 = pos1 - multiBodyA->getBasePos();
+		} else
+		{
+			rel_pos1 = pos1 - multiBodyA->getLink(solverConstraint.m_linkA).m_cachedWorldTransform.getOrigin();
+		}
+		const int ndofA  = multiBodyA->getNumDofs() + 6;  // row width; NOTE(review): the extra 6 is presumably the base degrees of freedom - confirm
+		solverConstraint.m_deltaVelAindex = multiBodyA->getCompanionId();  // companion id doubles as this multibody's offset into m_deltaVelocities
+		if (solverConstraint.m_deltaVelAindex <0)  // no slot assigned to this multibody yet
+		{
+			solverConstraint.m_deltaVelAindex = m_data.m_deltaVelocities.size();
+			multiBodyA->setCompanionId(solverConstraint.m_deltaVelAindex);
+			m_data.m_deltaVelocities.resize(m_data.m_deltaVelocities.size()+ndofA);  // append ndofA entries for this multibody
+		} else
+		{
+			btAssert(m_data.m_deltaVelocities.size() >= solverConstraint.m_deltaVelAindex+ndofA);
+		}
+		solverConstraint.m_jacAindex = m_data.m_jacobians.size();  // the Jacobian and the unit-impulse deltas of this row share the same offset
+		m_data.m_jacobians.resize(m_data.m_jacobians.size()+ndofA);
+		m_data.m_deltaVelocitiesUnitImpulse.resize(m_data.m_deltaVelocitiesUnitImpulse.size()+ndofA);
+		btAssert(m_data.m_jacobians.size() == m_data.m_deltaVelocitiesUnitImpulse.size());
+		btScalar* jac1=&m_data.m_jacobians[solverConstraint.m_jacAindex];
+		multiBodyA->fillContactJacobianMultiDof(solverConstraint.m_linkA, cp.getPositionWorldOnA(), contactNormal, jac1, m_data.scratch_r, m_data.scratch_v, m_data.scratch_m);
+		btScalar* delta = &m_data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacAindex];
+		multiBodyA->calcAccelerationDeltasMultiDof(&m_data.m_jacobians[solverConstraint.m_jacAindex],delta,m_data.scratch_r, m_data.scratch_v);  // result is written into m_deltaVelocitiesUnitImpulse at the row's offset
+		btVector3 torqueAxis0 = rel_pos1.cross(contactNormal);
+		solverConstraint.m_relpos1CrossNormal = torqueAxis0;
+		solverConstraint.m_contactNormal1 = contactNormal;
+	} else
+	{
+		btVector3 torqueAxis0 = rel_pos1.cross(contactNormal);
+		solverConstraint.m_relpos1CrossNormal = torqueAxis0;
+		solverConstraint.m_contactNormal1 = contactNormal;
+		solverConstraint.m_angularComponentA = rb0 ? rb0->getInvInertiaTensorWorld()*torqueAxis0*rb0->getAngularFactor() : btVector3(0,0,0);  // zero angular response when there is no rigid body
+	}
+	if (multiBodyB)
+	{
+		if (solverConstraint.m_linkB<0)  // negative link index: measure from the multibody base position
+		{
+			rel_pos2 = pos2 - multiBodyB->getBasePos();
+		} else
+		{
+			rel_pos2 = pos2 - multiBodyB->getLink(solverConstraint.m_linkB).m_cachedWorldTransform.getOrigin();
+		}
+		const int ndofB  = multiBodyB->getNumDofs() + 6;
+		solverConstraint.m_deltaVelBindex = multiBodyB->getCompanionId();
+		if (solverConstraint.m_deltaVelBindex <0)  // no slot assigned to this multibody yet
+		{
+			solverConstraint.m_deltaVelBindex = m_data.m_deltaVelocities.size();
+			multiBodyB->setCompanionId(solverConstraint.m_deltaVelBindex);
+			m_data.m_deltaVelocities.resize(m_data.m_deltaVelocities.size()+ndofB);
+		}
+		solverConstraint.m_jacBindex = m_data.m_jacobians.size();
+		m_data.m_jacobians.resize(m_data.m_jacobians.size()+ndofB);
+		m_data.m_deltaVelocitiesUnitImpulse.resize(m_data.m_deltaVelocitiesUnitImpulse.size()+ndofB);
+		btAssert(m_data.m_jacobians.size() == m_data.m_deltaVelocitiesUnitImpulse.size());
+		multiBodyB->fillContactJacobianMultiDof(solverConstraint.m_linkB, cp.getPositionWorldOnB(), -contactNormal, &m_data.m_jacobians[solverConstraint.m_jacBindex], m_data.scratch_r, m_data.scratch_v, m_data.scratch_m);  // side B uses the negated normal
+		multiBodyB->calcAccelerationDeltasMultiDof(&m_data.m_jacobians[solverConstraint.m_jacBindex],&m_data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacBindex],m_data.scratch_r, m_data.scratch_v);
+		btVector3 torqueAxis1 = rel_pos2.cross(contactNormal);		
+		solverConstraint.m_relpos2CrossNormal = -torqueAxis1;
+		solverConstraint.m_contactNormal2 = -contactNormal;
+	} else
+	{
+		btVector3 torqueAxis1 = rel_pos2.cross(contactNormal);		
+		solverConstraint.m_relpos2CrossNormal = -torqueAxis1;
+		solverConstraint.m_contactNormal2 = -contactNormal;
+		solverConstraint.m_angularComponentB = rb1 ? rb1->getInvInertiaTensorWorld()*-torqueAxis1*rb1->getAngularFactor() : btVector3(0,0,0);  // zero angular response when there is no rigid body
+	}
+	{
+		btVector3 vec;
+		btScalar denom0 = 0.f;  // side A contribution to the denominator of m_jacDiagABInv
+		btScalar denom1 = 0.f;  // side B contribution to the denominator of m_jacDiagABInv
+		btScalar* jacB = 0;
+		btScalar* jacA = 0;
+		btScalar* lambdaA =0;
+		btScalar* lambdaB =0;
+		int ndofA  = 0;
+		if (multiBodyA)
+		{
+			ndofA  = multiBodyA->getNumDofs() + 6;
+			jacA = &m_data.m_jacobians[solverConstraint.m_jacAindex];
+			lambdaA = &m_data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacAindex];
+			for (int i = 0; i < ndofA; ++i)
+			{
+				btScalar j = jacA[i] ;
+				btScalar l =lambdaA[i];
+				denom0 += j*l;  // dot product of the Jacobian with the unit-impulse deltas
+			}
+		} else
+		{
+			if (rb0)
+			{
+				vec = ( solverConstraint.m_angularComponentA).cross(rel_pos1);
+				denom0 = rb0->getInvMass() + contactNormal.dot(vec);
+			}
+		}
+		if (multiBodyB)
+		{
+			const int ndofB  = multiBodyB->getNumDofs() + 6;
+			jacB = &m_data.m_jacobians[solverConstraint.m_jacBindex];
+			lambdaB = &m_data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacBindex];
+			for (int i = 0; i < ndofB; ++i)
+			{
+				btScalar j = jacB[i] ;
+				btScalar l =lambdaB[i];
+				denom1 += j*l;  // dot product of the Jacobian with the unit-impulse deltas
+			}
+		} else
+		{
+			if (rb1)
+			{
+				vec = ( -solverConstraint.m_angularComponentB).cross(rel_pos2);
+				denom1 = rb1->getInvMass() + contactNormal.dot(vec);
+			}
+		}
+		 btScalar d = denom0+denom1+cfm;  // the time-step-scaled CFM is added to the denominator
+		 if (d>SIMD_EPSILON)
+		 {
+			solverConstraint.m_jacDiagABInv = relaxation/(d);
+		 } else
+		 {
+			//disable the constraint row to handle singularity/redundant constraint
+			solverConstraint.m_jacDiagABInv  = 0.f;
+		 }
+	}
+	//compute rhs and remaining solverConstraint fields
+	btScalar restitution = 0.f;
+	btScalar penetration = isFriction? 0 : cp.getDistance()+infoGlobal.m_linearSlop;  // friction rows use zero penetration
+	btScalar rel_vel = 0.f;  // accumulates both sides' velocity along this row
+	int ndofA  = 0;
+	int ndofB  = 0;
+	{
+		btVector3 vel1,vel2;
+		if (multiBodyA)
+		{
+			ndofA  = multiBodyA->getNumDofs() + 6;
+			btScalar* jacA = &m_data.m_jacobians[solverConstraint.m_jacAindex];
+			for (int i = 0; i < ndofA ; ++i) 
+				rel_vel += multiBodyA->getVelocityVector()[i] * jacA[i];  // multibody velocity vector dotted with the row's Jacobian
+		} else
+		{
+			if (rb0)
+			{
+				rel_vel += rb0->getVelocityInLocalPoint(rel_pos1).dot(solverConstraint.m_contactNormal1);
+			}
+		}
+		if (multiBodyB)
+		{
+			ndofB  = multiBodyB->getNumDofs() + 6;
+			btScalar* jacB = &m_data.m_jacobians[solverConstraint.m_jacBindex];
+			for (int i = 0; i < ndofB ; ++i) 
+				rel_vel += multiBodyB->getVelocityVector()[i] * jacB[i];
+		} else
+		{
+			if (rb1)
+			{
+				rel_vel += rb1->getVelocityInLocalPoint(rel_pos2).dot(solverConstraint.m_contactNormal2);
+			}
+		}
+		solverConstraint.m_friction = cp.m_combinedFriction;
+		if(!isFriction)
+		{
+			restitution =  restitutionCurve(rel_vel, cp.m_combinedRestitution);	
+			if (restitution <= btScalar(0.))  // non-positive restitution is clamped to zero
+			{
+				restitution = 0.f;
+			}
+		}
+	}
+	///warm starting (or zero if disabled)
+	//disable warmstarting for btMultiBody, it has issues gaining energy (==explosion)
+	if (0)//infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
+	{
+		solverConstraint.m_appliedImpulse = isFriction ? 0 : cp.m_appliedImpulse * infoGlobal.m_warmstartingFactor;
+		if (solverConstraint.m_appliedImpulse)
+		{
+			if (multiBodyA)
+			{
+				btScalar impulse = solverConstraint.m_appliedImpulse;
+				btScalar* deltaV = &m_data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacAindex];
+				multiBodyA->applyDeltaVeeMultiDof(deltaV,impulse);
+				applyDeltaVee(deltaV,impulse,solverConstraint.m_deltaVelAindex,ndofA);
+			} else
+			{
+				if (rb0)
+					bodyA->internalApplyImpulse(solverConstraint.m_contactNormal1*bodyA->internalGetInvMass()*rb0->getLinearFactor(),solverConstraint.m_angularComponentA,solverConstraint.m_appliedImpulse);
+			}
+			if (multiBodyB)
+			{
+				btScalar impulse = solverConstraint.m_appliedImpulse;
+				btScalar* deltaV = &m_data.m_deltaVelocitiesUnitImpulse[solverConstraint.m_jacBindex];
+				multiBodyB->applyDeltaVeeMultiDof(deltaV,impulse);
+				applyDeltaVee(deltaV,impulse,solverConstraint.m_deltaVelBindex,ndofB);
+			} else
+			{
+				if (rb1)
+					bodyB->internalApplyImpulse(-solverConstraint.m_contactNormal2*bodyB->internalGetInvMass()*rb1->getLinearFactor(),-solverConstraint.m_angularComponentB,-(btScalar)solverConstraint.m_appliedImpulse);
+			}
+		}
+	} else
+	{
+		solverConstraint.m_appliedImpulse = 0.f;  // always taken, because the condition above is the constant 0
+	}
+	solverConstraint.m_appliedPushImpulse = 0.f;
+	{
+		btScalar positionalError = 0.f;
+		btScalar velocityError = restitution - rel_vel;// * damping;	//note for friction restitution is always set to 0 (check above) so it is actually velocityError = -rel_vel for friction
+		if (penetration>0)  // positive value: no positional term, the velocity error is reduced by penetration/timeStep
+		{
+			positionalError = 0;
+			velocityError -= penetration / infoGlobal.m_timeStep;
+		} else
+		{
+			positionalError = -penetration * erp/infoGlobal.m_timeStep;  // zero or negative value: positional term scaled by erp/timeStep
+		}
+		btScalar  penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
+		btScalar velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
+		if(!isFriction)
+		{
+			if (!infoGlobal.m_splitImpulse || (penetration > infoGlobal.m_splitImpulsePenetrationThreshold))
+			{
+				//combine position and velocity into rhs
+				solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
+				solverConstraint.m_rhsPenetration = 0.f;
+			} else
+			{
+				//split position and velocity into rhs and m_rhsPenetration
+				solverConstraint.m_rhs = velocityImpulse;
+				solverConstraint.m_rhsPenetration = penetrationImpulse;
+			}
+			solverConstraint.m_lowerLimit = 0;  // normal-contact row limits: [0, 1e10]
+			solverConstraint.m_upperLimit = 1e10f;
+		}
+		else
+		{
+			solverConstraint.m_rhs = velocityImpulse;
+			solverConstraint.m_rhsPenetration = 0.f;
+			solverConstraint.m_lowerLimit = -solverConstraint.m_friction;  // friction row limits: -m_friction .. +m_friction
+			solverConstraint.m_upperLimit = solverConstraint.m_friction;
+		}
+		solverConstraint.m_cfm = cfm*solverConstraint.m_jacDiagABInv;  // stored CFM is the time-step-scaled CFM times m_jacDiagABInv
+	}
+btMultiBodySolverConstraint&	btMultiBodyConstraintSolver::addMultiBodyFrictionConstraint(const btVector3& normalAxis,btPersistentManifold* manifold,int frictionIndex,btManifoldPoint& cp,btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, const btContactSolverInfo& infoGlobal, btScalar desiredVelocity, btScalar cfmSlip)  // Appends a friction row along normalAxis to m_multiBodyFrictionContactConstraints, fills it via setupMultiBodyContactConstraint and returns a reference to it.
+	BT_PROFILE("addMultiBodyFrictionConstraint");
+	btMultiBodySolverConstraint& solverConstraint = m_multiBodyFrictionContactConstraints.expandNonInitializing();  // new row obtained from expandNonInitializing(); its fields are assigned explicitly below
+    solverConstraint.m_orgConstraint = 0;
+    solverConstraint.m_orgDofIndex = -1;
+	solverConstraint.m_frictionIndex = frictionIndex;  // caller-supplied index, stored unchanged
+	bool isFriction = true;
+	const btMultiBodyLinkCollider* fcA = btMultiBodyLinkCollider::upcast(manifold->getBody0());
+	const btMultiBodyLinkCollider* fcB = btMultiBodyLinkCollider::upcast(manifold->getBody1());
+	btMultiBody* mbA = fcA? fcA->m_multiBody : 0;  // null when body 0 is not a multibody link collider
+	btMultiBody* mbB = fcB? fcB->m_multiBody : 0;  // null when body 1 is not a multibody link collider
+	int solverBodyIdA = mbA? -1 : getOrInitSolverBody(*colObj0,infoGlobal.m_timeStep);  // -1 for a multibody side, otherwise the solver-body index of the collision object
+	int solverBodyIdB = mbB ? -1 : getOrInitSolverBody(*colObj1,infoGlobal.m_timeStep);
+	solverConstraint.m_solverBodyIdA = solverBodyIdA;
+	solverConstraint.m_solverBodyIdB = solverBodyIdB;
+	solverConstraint.m_multiBodyA = mbA;
+	if (mbA)
+		solverConstraint.m_linkA = fcA->m_link;  // m_linkA is only written when side A is a multibody
+	solverConstraint.m_multiBodyB = mbB;
+	if (mbB)
+		solverConstraint.m_linkB = fcB->m_link;  // m_linkB is only written when side B is a multibody
+	solverConstraint.m_originalContactPoint = &cp;
+	setupMultiBodyContactConstraint(solverConstraint, normalAxis, cp, infoGlobal,relaxation,isFriction, desiredVelocity, cfmSlip);  // relaxation is passed by reference to a local copy here, so the caller's value is unaffected
+	return solverConstraint;
+void	btMultiBodyConstraintSolver::convertMultiBodyContact(btPersistentManifold* manifold,const btContactSolverInfo& infoGlobal)  // For each contact point of manifold within the contact processing threshold, appends a normal-contact row and its friction row(s).
+	const btMultiBodyLinkCollider* fcA = btMultiBodyLinkCollider::upcast(manifold->getBody0());
+	const btMultiBodyLinkCollider* fcB = btMultiBodyLinkCollider::upcast(manifold->getBody1());
+	btMultiBody* mbA = fcA? fcA->m_multiBody : 0;  // null when body 0 is not a multibody link collider
+	btMultiBody* mbB = fcB? fcB->m_multiBody : 0;  // null when body 1 is not a multibody link collider
+	btCollisionObject* colObj0=0,*colObj1=0;
+	colObj0 = (btCollisionObject*)manifold->getBody0();
+	colObj1 = (btCollisionObject*)manifold->getBody1();
+	int solverBodyIdA = mbA? -1 : getOrInitSolverBody(*colObj0,infoGlobal.m_timeStep);  // -1 for a multibody side, otherwise the solver-body index of the collision object
+	int solverBodyIdB = mbB ? -1 : getOrInitSolverBody(*colObj1,infoGlobal.m_timeStep);
+//	btSolverBody* solverBodyA = mbA ? 0 : &m_tmpSolverBodyPool[solverBodyIdA];
+//	btSolverBody* solverBodyB = mbB ? 0 : &m_tmpSolverBodyPool[solverBodyIdB];
+	///avoid collision response between two static objects
+//	if (!solverBodyA || (solverBodyA->m_invMass.isZero() && (!solverBodyB || solverBodyB->m_invMass.isZero())))
+	//	return;
+	for (int j=0;j<manifold->getNumContacts();j++)
+	{
+		btManifoldPoint& cp = manifold->getContactPoint(j);
+		if (cp.getDistance() <= manifold->getContactProcessingThreshold())  // points farther apart than the threshold produce no rows
+		{
+			btScalar relaxation;  // assigned by setupMultiBodyContactConstraint below
+			int frictionIndex = m_multiBodyNormalContactConstraints.size();  // index the normal row appended on the next line will have
+			btMultiBodySolverConstraint& solverConstraint = m_multiBodyNormalContactConstraints.expandNonInitializing();
+	//		btRigidBody* rb0 = btRigidBody::upcast(colObj0);
+	//		btRigidBody* rb1 = btRigidBody::upcast(colObj1);
+            solverConstraint.m_orgConstraint = 0;
+            solverConstraint.m_orgDofIndex = -1;
+			solverConstraint.m_solverBodyIdA = solverBodyIdA;
+			solverConstraint.m_solverBodyIdB = solverBodyIdB;
+			solverConstraint.m_multiBodyA = mbA;
+			if (mbA)
+				solverConstraint.m_linkA = fcA->m_link;
+			solverConstraint.m_multiBodyB = mbB;
+			if (mbB)
+				solverConstraint.m_linkB = fcB->m_link;
+			solverConstraint.m_originalContactPoint = &cp;
+			bool isFriction = false;
+			setupMultiBodyContactConstraint(solverConstraint, cp.m_normalWorldOnB,cp, infoGlobal, relaxation, isFriction);  // normal row along the contact normal on B
+//			const btVector3& pos1 = cp.getPositionWorldOnA();
+//			const btVector3& pos2 = cp.getPositionWorldOnB();
+			/////setup the friction constraints
+			solverConstraint.m_frictionIndex = frictionIndex;
+	int rollingFriction=1;  // NOTE(review): the rolling-friction section from here to the end of the cp.m_combinedRollingFriction block uses rel_pos1/rel_pos2, which are not declared in this function as shown; it looks like code that is compiled out in the original file - confirm
+			btVector3 angVelA(0,0,0),angVelB(0,0,0);
+			if (mbA)
+				angVelA = mbA->getVelocityVector()>getLink(fcA->m_link).l>getAngularVelocity();  // NOTE(review): this expression looks malformed ('>' where '->' would be expected) - verify against the upstream file
+			if (mbB)
+				angVelB = mbB->getAngularVelocity();
+			btVector3 relAngVel = angVelB-angVelA;
+			if ((cp.m_combinedRollingFriction>0.f) && (rollingFriction>0))
+			{
+				//only a single rollingFriction per manifold
+				rollingFriction--;
+				if (relAngVel.length()>infoGlobal.m_singleAxisRollingFrictionThreshold)
+				{
+					relAngVel.normalize();
+					applyAnisotropicFriction(colObj0,relAngVel,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					applyAnisotropicFriction(colObj1,relAngVel,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					if (relAngVel.length()>0.001)
+						addRollingFrictionConstraint(relAngVel,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+				} else
+				{
+					addRollingFrictionConstraint(cp.m_normalWorldOnB,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+					btVector3 axis0,axis1;
+					btPlaneSpace1(cp.m_normalWorldOnB,axis0,axis1);
+					applyAnisotropicFriction(colObj0,axis0,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					applyAnisotropicFriction(colObj1,axis0,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					applyAnisotropicFriction(colObj0,axis1,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					applyAnisotropicFriction(colObj1,axis1,btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION);
+					if (axis0.length()>0.001)
+						addRollingFrictionConstraint(axis0,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+					if (axis1.length()>0.001)
+						addRollingFrictionConstraint(axis1,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+				}
+			}
+			///Bullet has several options to set the friction directions
+			///By default, each contact has only a single friction direction that is recomputed automatically every frame 
+			///based on the relative linear velocity.
+			///If the relative velocity is zero, it will automatically compute a friction direction.
+			///You can also enable two friction directions, using the SOLVER_USE_2_FRICTION_DIRECTIONS.
+			///In that case, the second friction direction will be orthogonal to both contact normal and first friction direction.
+			///
+			///If you choose SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION, then the friction will be independent from the relative projected velocity.
+			///
+			///The user can manually override the friction directions for certain contacts using a contact callback, 
+			///and set the cp.m_lateralFrictionInitialized to true
+			///In that case, you can set the target relative motion in each friction direction (cp.m_contactMotion1 and cp.m_contactMotion2)
+			///this will give a conveyor belt effect
+			///
+			{/*
+				cp.m_lateralFrictionDir1 = vel - cp.m_normalWorldOnB * rel_vel;
+				btScalar lat_rel_vel = cp.m_lateralFrictionDir1.length2();
+				if (!(infoGlobal.m_solverMode & SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION) && lat_rel_vel > SIMD_EPSILON)
+				{
+					cp.m_lateralFrictionDir1 *= 1.f/btSqrt(lat_rel_vel);
+					if((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
+					{
+						cp.m_lateralFrictionDir2 = cp.m_lateralFrictionDir1.cross(cp.m_normalWorldOnB);
+						cp.m_lateralFrictionDir2.normalize();//??
+						applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir2,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+						applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir2,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+						addMultiBodyFrictionConstraint(cp.m_lateralFrictionDir2,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+					}
+					applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir1,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+					applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir1,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+					addMultiBodyFrictionConstraint(cp.m_lateralFrictionDir1,solverBodyIdA,solverBodyIdB,frictionIndex,cp,rel_pos1,rel_pos2,colObj0,colObj1, relaxation);
+				} else
+				*/
+				{
+					btPlaneSpace1(cp.m_normalWorldOnB,cp.m_lateralFrictionDir1,cp.m_lateralFrictionDir2);  // writes both lateral friction directions of cp from the contact normal
+					applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir1,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+					applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir1,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+					addMultiBodyFrictionConstraint(cp.m_lateralFrictionDir1,manifold,frictionIndex,cp,colObj0,colObj1, relaxation,infoGlobal);  // first friction row
+					if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
+					{
+						applyAnisotropicFriction(colObj0,cp.m_lateralFrictionDir2,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+						applyAnisotropicFriction(colObj1,cp.m_lateralFrictionDir2,btCollisionObject::CF_ANISOTROPIC_FRICTION);
+						addMultiBodyFrictionConstraint(cp.m_lateralFrictionDir2,manifold,frictionIndex,cp,colObj0,colObj1, relaxation,infoGlobal);  // optional second friction row
+					}
+					{
+					}
+				}
+			} else  // NOTE(review): the 'if' this else pairs with does not appear in the lines shown here - check the upstream file for the condition
+			{
+				addMultiBodyFrictionConstraint(cp.m_lateralFrictionDir1,manifold,frictionIndex,cp,colObj0,colObj1, relaxation,infoGlobal,cp.m_contactMotion1, cp.m_frictionCFM);  // reuses the friction directions already stored on cp and passes the contact motion and friction CFM through
+				if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
+					addMultiBodyFrictionConstraint(cp.m_lateralFrictionDir2,manifold,frictionIndex,cp,colObj0,colObj1, relaxation, infoGlobal,cp.m_contactMotion2, cp.m_frictionCFM);
+				//setMultiBodyFrictionConstraintImpulse( solverConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal);
+				//todo:
+				solverConstraint.m_appliedImpulse = 0.f;  // resets the impulses of the normal row declared above
+				solverConstraint.m_appliedPushImpulse = 0.f;
+			}
+		}
+	}
+void btMultiBodyConstraintSolver::convertContacts(btPersistentManifold** manifoldPtr,int numManifolds, const btContactSolverInfo& infoGlobal)  // Converts each of the numManifolds manifolds into solver rows, then creates rows for the multibody constraints held in m_tmpMultiBodyConstraints.
+	//btPersistentManifold* manifold = 0;
+	for (int i=0;i<numManifolds;i++)
+	{
+		btPersistentManifold* manifold= manifoldPtr[i];
+		const btMultiBodyLinkCollider* fcA = btMultiBodyLinkCollider::upcast(manifold->getBody0());
+		const btMultiBodyLinkCollider* fcB = btMultiBodyLinkCollider::upcast(manifold->getBody1());
+		if (!fcA && !fcB)  // neither body is a multibody link collider
+		{
+			//the contact doesn't involve any Featherstone btMultiBody, so deal with the regular btRigidBody/btCollisionObject case
+			convertContact(manifold,infoGlobal);
+		} else
+		{
+			convertMultiBodyContact(manifold,infoGlobal);  // at least one side is a multibody link collider
+		}
+	}
+	//also convert the multibody constraints, if any
+	for (int i=0;i<m_tmpNumMultiBodyConstraints;i++)
+	{
+		btMultiBodyConstraint* c = m_tmpMultiBodyConstraints[i];
+		m_data.m_solverBodyPool = &m_tmpSolverBodyPool;  // refreshed on every iteration before the constraint creates its rows
+		m_data.m_fixedBodyId = m_fixedBodyId;
+		c->createConstraintRows(m_multiBodyNonContactConstraints,m_data,	infoGlobal);  // rows are added to m_multiBodyNonContactConstraints
+	}
+btScalar btMultiBodyConstraintSolver::solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer,btDispatcher* dispatcher)  // Forwards all arguments unchanged to the base-class solveGroup and returns its result.
+	return btSequentialImpulseConstraintSolver::solveGroup(bodies,numBodies,manifold,numManifolds,constraints,numConstraints,info,debugDrawer,dispatcher);
+#if 0
+static void applyJointFeedback(btMultiBodyJacobianData& data, const btMultiBodySolverConstraint& solverConstraint, int jacIndex, btMultiBody* mb, btScalar appliedImpulse)  // Scales the Jacobian row starting at jacIndex by appliedImpulse and passes it to mb with the joint-feedback flag set; preceded by '#if 0' in this file. solverConstraint is not read in the visible body.
+	if (appliedImpulse!=0 && mb->internalNeedsJointFeedback())  // nothing to do for a zero impulse or when mb reports it needs no joint feedback
+	{
+		//todo: get rid of those temporary memory allocations for the joint feedback
+		btAlignedObjectArray<btScalar> forceVector;
+		int numDofsPlusBase = 6+mb->getNumDofs();
+		forceVector.resize(numDofsPlusBase);
+		for (int i=0;i<numDofsPlusBase;i++)
+		{
+			forceVector[i] = data.m_jacobians[jacIndex+i]*appliedImpulse;  // Jacobian entry scaled by the applied impulse
+		}
+		btAlignedObjectArray<btScalar> output;  // receives the result of the call below; not read afterwards
+		output.resize(numDofsPlusBase);
+		bool applyJointFeedback = true;
+		mb->calcAccelerationDeltasMultiDof(&forceVector[0],&output[0],data.scratch_r,data.scratch_v,applyJointFeedback);
+	}
+#include "Bullet3Common/b3Logging.h"
+void btMultiBodyConstraintSolver::writeBackSolverBodyToMultiBody(btMultiBodySolverConstraint& c, btScalar deltaTime)  // Transfers row c's applied impulse to its multibodies, both as constraint force/torque (impulse divided by deltaTime) and as a velocity change, and resets their companion ids to -1.
+#if 1 
+	//bod->addBaseForce(m_gravity * bod->getBaseMass());
+	//bod->addLinkForce(j, m_gravity * bod->getLinkMass(j));
+	if (c.m_orgConstraint)
+	{
+		c.m_orgConstraint->internalSetAppliedImpulse(c.m_orgDofIndex,c.m_appliedImpulse);  // report the impulse back to the constraint this row came from
+	}
+	if (c.m_multiBodyA)
+	{
+		c.m_multiBodyA->setCompanionId(-1);  // clear the companion id (used as the m_deltaVelocities offset during setup)
+		btVector3 force = c.m_contactNormal1*(c.m_appliedImpulse/deltaTime);  // impulse converted to a force along the row direction
+		btVector3 torque = c.m_relpos1CrossNormal*(c.m_appliedImpulse/deltaTime);
+		if (c.m_linkA<0)  // negative link index: apply to the base
+		{
+			c.m_multiBodyA->addBaseConstraintForce(force);
+			c.m_multiBodyA->addBaseConstraintTorque(torque);
+		} else
+		{
+			c.m_multiBodyA->addLinkConstraintForce(c.m_linkA,force);
+				//b3Printf("force = %f,%f,%f\n",force[0],force[1],force[2]);//[0],torque[1],torque[2]);
+			c.m_multiBodyA->addLinkConstraintTorque(c.m_linkA,torque);
+		}
+	}
+	if (c.m_multiBodyB)
+	{
+		{
+			c.m_multiBodyB->setCompanionId(-1);  // clear the companion id (used as the m_deltaVelocities offset during setup)
+			btVector3 force = c.m_contactNormal2*(c.m_appliedImpulse/deltaTime);
+			btVector3 torque = c.m_relpos2CrossNormal*(c.m_appliedImpulse/deltaTime);
+			if (c.m_linkB<0)  // negative link index: apply to the base
+			{
+				c.m_multiBodyB->addBaseConstraintForce(force);
+				c.m_multiBodyB->addBaseConstraintTorque(torque);
+			} else
+			{
+				{
+					c.m_multiBodyB->addLinkConstraintForce(c.m_linkB,force);
+					//b3Printf("t = %f,%f,%f\n",force[0],force[1],force[2]);//[0],torque[1],torque[2]);
+					c.m_multiBodyB->addLinkConstraintTorque(c.m_linkB,torque);
+				}
+			}
+		}
+	}
+	if (c.m_multiBodyA)  // NOTE(review): the lines dividing this section from the force/torque section above are not visible here; upstream may select only one of the two via the preprocessor - confirm
+	{
+		if(c.m_multiBodyA->isMultiDof())
+		{
+			c.m_multiBodyA->applyDeltaVeeMultiDof(&m_data.m_deltaVelocitiesUnitImpulse[c.m_jacAindex],c.m_appliedImpulse);  // unit-impulse deltas for this row, scaled by the applied impulse
+		}
+		else
+		{
+			c.m_multiBodyA->applyDeltaVee(&m_data.m_deltaVelocitiesUnitImpulse[c.m_jacAindex],c.m_appliedImpulse);
+		}
+	}
+	if (c.m_multiBodyB)
+	{
+		if(c.m_multiBodyB->isMultiDof())
+		{
+			c.m_multiBodyB->applyDeltaVeeMultiDof(&m_data.m_deltaVelocitiesUnitImpulse[c.m_jacBindex],c.m_appliedImpulse);
+		}
+		else
+		{
+			c.m_multiBodyB->applyDeltaVee(&m_data.m_deltaVelocitiesUnitImpulse[c.m_jacBindex],c.m_appliedImpulse);
+		}
+	}
+btScalar btMultiBodyConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal)  // Writes all multibody rows back to their multibodies, stores applied impulses on the manifold points when warm starting is enabled, then returns the base-class finish result.
+	BT_PROFILE("btMultiBodyConstraintSolver::solveGroupCacheFriendlyFinish");
+	int numPoolConstraints = m_multiBodyNormalContactConstraints.size();
+	//write back the delta v to the multi bodies, either as applied impulse (direct velocity change) 
+	//or as applied force, so we can measure the joint reaction forces easier
+	for (int i=0;i<numPoolConstraints;i++)
+	{
+		btMultiBodySolverConstraint& solverConstraint = m_multiBodyNormalContactConstraints[i];
+		writeBackSolverBodyToMultiBody(solverConstraint,infoGlobal.m_timeStep);
+		writeBackSolverBodyToMultiBody(m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex],infoGlobal.m_timeStep);  // friction row is looked up through the normal row's m_frictionIndex
+		if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
+		{
+			writeBackSolverBodyToMultiBody(m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1],infoGlobal.m_timeStep);  // second friction row is taken from the next slot; NOTE(review): verify this indexing matches how friction rows are appended when two directions are enabled
+		}
+	}
+	for (int i=0;i<m_multiBodyNonContactConstraints.size();i++)
+	{
+		btMultiBodySolverConstraint& solverConstraint = m_multiBodyNonContactConstraints[i];
+		writeBackSolverBodyToMultiBody(solverConstraint,infoGlobal.m_timeStep);
+	}
+	if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
+	{
+		BT_PROFILE("warm starting write back");
+		for (int j=0;j<numPoolConstraints;j++)
+		{
+			const btMultiBodySolverConstraint& solverConstraint = m_multiBodyNormalContactConstraints[j];
+			btManifoldPoint* pt = (btManifoldPoint*) solverConstraint.m_originalContactPoint;
+			btAssert(pt);
+			pt->m_appliedImpulse = solverConstraint.m_appliedImpulse;  // store the solved impulses on the manifold point
+			pt->m_appliedImpulseLateral1 = m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_appliedImpulse;
+			//printf("pt->m_appliedImpulseLateral1 = %f\n", pt->m_appliedImpulseLateral1);
+			if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
+			{
+				pt->m_appliedImpulseLateral2 = m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_appliedImpulse;
+			}
+			//do a callback here?
+		}
+	}
+#if 0
+	//multibody joint feedback
+	{
+		BT_PROFILE("multi body joint feedback");
+		for (int j=0;j<numPoolConstraints;j++)
+		{
+			const btMultiBodySolverConstraint& solverConstraint = m_multiBodyNormalContactConstraints[j];
+			//apply the joint feedback into all links of the btMultiBody
+			//todo: double-check the signs of the applied impulse
+			if(solverConstraint.m_multiBodyA && solverConstraint.m_multiBodyA->isMultiDof())
+			{
+				applyJointFeedback(m_data,solverConstraint, solverConstraint.m_jacAindex,solverConstraint.m_multiBodyA, solverConstraint.m_appliedImpulse*btSimdScalar(1./infoGlobal.m_timeStep));
+			}
+			if(solverConstraint.m_multiBodyB && solverConstraint.m_multiBodyB->isMultiDof())
+			{
+				applyJointFeedback(m_data,solverConstraint, solverConstraint.m_jacBindex,solverConstraint.m_multiBodyB,solverConstraint.m_appliedImpulse*btSimdScalar(-1./infoGlobal.m_timeStep));
+			}
+#if 0
+			if (m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_multiBodyA && m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_multiBodyA->isMultiDof())
+			{
+				applyJointFeedback(m_data,m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex],
+					m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_jacAindex,
+					m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_multiBodyA,
+					m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_appliedImpulse*btSimdScalar(1./infoGlobal.m_timeStep));
+			}
+			if (m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_multiBodyB && m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_multiBodyB->isMultiDof())
+			{
+				applyJointFeedback(m_data,m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex],
+					m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_jacBindex,
+					m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_multiBodyB,
+					m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex].m_appliedImpulse*btSimdScalar(-1./infoGlobal.m_timeStep));
+			}
+			if ((infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS))
+			{
+				if (m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_multiBodyA && m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_multiBodyA->isMultiDof())
+				{
+					applyJointFeedback(m_data,m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1],
+						m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_jacAindex,
+						m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_multiBodyA,
+						m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_appliedImpulse*btSimdScalar(1./infoGlobal.m_timeStep));
+				}
+				if (m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_multiBodyB && m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_multiBodyB->isMultiDof())
+				{
+					applyJointFeedback(m_data,m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1],
+						m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_jacBindex,
+						m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_multiBodyB,
+						m_multiBodyFrictionContactConstraints[solverConstraint.m_frictionIndex+1].m_appliedImpulse*btSimdScalar(-1./infoGlobal.m_timeStep));
+				}
+			}
+		}
+		for (int i=0;i<m_multiBodyNonContactConstraints.size();i++)
+		{
+			const btMultiBodySolverConstraint& solverConstraint = m_multiBodyNonContactConstraints[i];
+			if(solverConstraint.m_multiBodyA && solverConstraint.m_multiBodyA->isMultiDof())
+			{
+				applyJointFeedback(m_data,solverConstraint, solverConstraint.m_jacAindex,solverConstraint.m_multiBodyA, solverConstraint.m_appliedImpulse*btSimdScalar(1./infoGlobal.m_timeStep));
+			}
+			if(solverConstraint.m_multiBodyB && solverConstraint.m_multiBodyB->isMultiDof())
+			{
+				applyJointFeedback(m_data,solverConstraint, solverConstraint.m_jacBindex,solverConstraint.m_multiBodyB,solverConstraint.m_appliedImpulse*btSimdScalar(1./infoGlobal.m_timeStep));
+			}
+		}
+	}
+	numPoolConstraints = m_multiBodyNonContactConstraints.size();  // in the lines shown, this value is only read by the loop behind the '#if 0' below
+#if 0
+	//@todo: m_originalContactPoint is not initialized for btMultiBodySolverConstraint
+	for (int i=0;i<numPoolConstraints;i++)
+	{
+		const btMultiBodySolverConstraint& c = m_multiBodyNonContactConstraints[i];
+		btTypedConstraint* constr = (btTypedConstraint*)c.m_originalContactPoint;
+		btJointFeedback* fb = constr->getJointFeedback();
+		if (fb)
+		{
+			fb->m_appliedForceBodyA += c.m_contactNormal1*c.m_appliedImpulse*constr->getRigidBodyA().getLinearFactor()/infoGlobal.m_timeStep;
+			fb->m_appliedForceBodyB += c.m_contactNormal2*c.m_appliedImpulse*constr->getRigidBodyB().getLinearFactor()/infoGlobal.m_timeStep;
+			fb->m_appliedTorqueBodyA += c.m_relpos1CrossNormal* constr->getRigidBodyA().getAngularFactor()*c.m_appliedImpulse/infoGlobal.m_timeStep;
+			fb->m_appliedTorqueBodyB += c.m_relpos2CrossNormal* constr->getRigidBodyB().getAngularFactor()*c.m_appliedImpulse/infoGlobal.m_timeStep; /*RGM ???? */
+		}
+		constr->internalSetAppliedImpulse(c.m_appliedImpulse);
+		if (btFabs(c.m_appliedImpulse)>=constr->getBreakingImpulseThreshold())
+		{
+			constr->setEnabled(false);
+		}
+	}
+	return btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(bodies,numBodies,infoGlobal);  // finish with the base-class implementation and return its result
+void  btMultiBodyConstraintSolver::solveMultiBodyGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,btMultiBodyConstraint** multiBodyConstraints, int numMultiBodyConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer,btDispatcher* dispatcher)  // Stores the multibody constraints in the m_tmp* members for the duration of the base-class solveGroup call, then clears them; the base-class return value is discarded.
+	//printf("solveMultiBodyGroup start\n");
+	m_tmpMultiBodyConstraints = multiBodyConstraints;  // read by convertContacts
+	m_tmpNumMultiBodyConstraints = numMultiBodyConstraints;
+	btSequentialImpulseConstraintSolver::solveGroup(bodies,numBodies,manifold,numManifolds,constraints,numConstraints,info,debugDrawer,dispatcher);
+	m_tmpMultiBodyConstraints = 0;  // reset so no stale pointer is kept after the call
+	m_tmpNumMultiBodyConstraints = 0;
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraintSolver.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraintSolver.h
new file mode 100644
index 00000000..321ee423
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyConstraintSolver.h
@@ -0,0 +1,87 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
+#include "btMultiBodySolverConstraint.h"
+class btMultiBody;
+#include "btMultiBodyConstraint.h"
+ATTRIBUTE_ALIGNED16(class) btMultiBodyConstraintSolver : public btSequentialImpulseConstraintSolver // constraint solver derived from the sequential impulse solver; additionally receives btMultiBodyConstraint arrays through solveMultiBodyGroup()
+	btMultiBodyConstraintArray			m_multiBodyNonContactConstraints; // solver rows for multi-body constraints that are not contacts (presumably joint limits/motors - confirm in the .cpp)
+	btMultiBodyConstraintArray			m_multiBodyNormalContactConstraints; // solver rows for multi-body contact normals (by name - confirm in the .cpp)
+	btMultiBodyConstraintArray			m_multiBodyFrictionContactConstraints; // solver rows for multi-body contact friction (by name - confirm in the .cpp)
+	btMultiBodyJacobianData				m_data; // jacobian scratch data for the rows above (layout is defined by btMultiBodyJacobianData)
+	//temp storage for multi body constraints for a specific island/group called by 'solveGroup'
+	btMultiBodyConstraint**					m_tmpMultiBodyConstraints; // set by solveMultiBodyGroup() before the base solveGroup() call and reset to 0 afterwards; not owned
+	int										m_tmpNumMultiBodyConstraints; // number of entries in m_tmpMultiBodyConstraints; reset to 0 together with the pointer
+	void resolveSingleConstraintRowGeneric(const btMultiBodySolverConstraint& c); // processes a single multi-body solver row
+	void convertContacts(btPersistentManifold** manifoldPtr,int numManifolds, const btContactSolverInfo& infoGlobal); // takes numManifolds contact manifolds to be turned into solver rows
+	btMultiBodySolverConstraint&	addMultiBodyFrictionConstraint(const btVector3& normalAxis,btPersistentManifold* manifold,int frictionIndex,btManifoldPoint& cp,btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, const btContactSolverInfo& infoGlobal, btScalar desiredVelocity=0, btScalar cfmSlip=0); // returns a reference to the friction row it adds for contact point cp along normalAxis
+	void setupMultiBodyJointLimitConstraint(btMultiBodySolverConstraint& constraintRow, 
+																 btScalar* jacA,btScalar* jacB,
+																 btScalar penetration,btScalar combinedFrictionCoeff, btScalar combinedRestitutionCoeff,
+																 const btContactSolverInfo& infoGlobal); // fills constraintRow from the two jacobians jacA/jacB and the given penetration/friction/restitution values
+	void setupMultiBodyContactConstraint(btMultiBodySolverConstraint& solverConstraint, 
+																 const btVector3& contactNormal,
+																 btManifoldPoint& cp, const btContactSolverInfo& infoGlobal,
+																 btScalar& relaxation,
+																 bool isFriction, btScalar desiredVelocity=0, btScalar cfmSlip=0); // fills solverConstraint for contact point cp; isFriction selects a friction row instead of a normal row; relaxation is passed by non-const reference
+	void convertMultiBodyContact(btPersistentManifold* manifold,const btContactSolverInfo& infoGlobal); // single-manifold counterpart of convertContacts
+	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer); // setup-stage hook; virtual, presumably overriding the btSequentialImpulseConstraintSolver version
+//	virtual btScalar solveGroupCacheFriendlyIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
+	virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer); // one solver iteration over the group
+	void	applyDeltaVee(btScalar* deltaV, btScalar impulse, int velocityIndex, int ndof); // applies impulse along deltaV (ndof entries) to the velocities starting at velocityIndex - presumably inside m_data, confirm in the .cpp
+	void writeBackSolverBodyToMultiBody(btMultiBodySolverConstraint& constraint, btScalar deltaTime); // called per row after solving; exact write-back semantics live in the .cpp
+	///this method should not be called, it was just used during porting/integration of Featherstone btMultiBody, providing backwards compatibility but no support for btMultiBodyConstraint (only contact constraints)
+	virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer,btDispatcher* dispatcher);
+	virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal); // finish stage; ends by calling the btSequentialImpulseConstraintSolver implementation
+	virtual void solveMultiBodyGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,btMultiBodyConstraint** multiBodyConstraints, int numMultiBodyConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer,btDispatcher* dispatcher); // preferred entry point: like solveGroup() but also takes the multi-body constraints of the group
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.cpp b/src/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.cpp
new file mode 100644
index 00000000..d94d1d4e
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.cpp
@@ -0,0 +1,989 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "btMultiBodyDynamicsWorld.h"
+#include "btMultiBodyConstraintSolver.h"
+#include "btMultiBody.h"
+#include "btMultiBodyLinkCollider.h"
+#include "BulletCollision/CollisionDispatch/btSimulationIslandManager.h"
+#include "LinearMath/btQuickprof.h"
+#include "btMultiBodyConstraint.h"
+#include "LinearMath/btIDebugDraw.h"
+#include "LinearMath/btSerializer.h"
+void	btMultiBodyDynamicsWorld::addMultiBody(btMultiBody* body, short group, short mask)
+	// Registers a multi-body with this world. The collision group/mask arguments
+	// are accepted for interface symmetry but are not used here.
+	(void)group;
+	(void)mask;
+	m_multiBodies.push_back(body);
+void	btMultiBodyDynamicsWorld::removeMultiBody(btMultiBody* body)
+	// Unregisters a multi-body from this world; the object itself is not deleted here.
+	m_multiBodies.remove(body);
+void	btMultiBodyDynamicsWorld::calculateSimulationIslands()
+	BT_PROFILE("calculateSimulationIslands");
+	getSimulationIslandManager()->updateActivationState(getCollisionWorld(),getCollisionWorld()->getDispatcher());
+    {
+        //merge islands based on speculative contact manifolds too
+        for (int i=0;i<this->m_predictiveManifolds.size();i++)
+        {
+            btPersistentManifold* manifold = m_predictiveManifolds[i];
+            const btCollisionObject* colObj0 = manifold->getBody0();
+            const btCollisionObject* colObj1 = manifold->getBody1();
+            if (((colObj0) && (!(colObj0)->isStaticOrKinematicObject())) &&
+                ((colObj1) && (!(colObj1)->isStaticOrKinematicObject())))
+            {
+				getSimulationIslandManager()->getUnionFind().unite((colObj0)->getIslandTag(),(colObj1)->getIslandTag());
+            }
+        }
+    }
+	{
+		int i;
+		int numConstraints = int(m_constraints.size());
+		for (i=0;i< numConstraints ; i++ )
+		{
+			btTypedConstraint* constraint = m_constraints[i];
+			if (constraint->isEnabled())
+			{
+				const btRigidBody* colObj0 = &constraint->getRigidBodyA();
+				const btRigidBody* colObj1 = &constraint->getRigidBodyB();
+				if (((colObj0) && (!(colObj0)->isStaticOrKinematicObject())) &&
+					((colObj1) && (!(colObj1)->isStaticOrKinematicObject())))
+				{
+					getSimulationIslandManager()->getUnionFind().unite((colObj0)->getIslandTag(),(colObj1)->getIslandTag());
+				}
+			}
+		}
+	}
+	//merge islands linked by Featherstone link colliders
+	for (int i=0;i<m_multiBodies.size();i++)
+	{
+		btMultiBody* body = m_multiBodies[i];
+		{
+			btMultiBodyLinkCollider* prev = body->getBaseCollider();
+			for (int b=0;b<body->getNumLinks();b++)
+			{
+				btMultiBodyLinkCollider* cur = body->getLink(b).m_collider;
+				if (((cur) && (!(cur)->isStaticOrKinematicObject())) &&
+					((prev) && (!(prev)->isStaticOrKinematicObject())))
+				{
+					int tagPrev = prev->getIslandTag();
+					int tagCur = cur->getIslandTag();
+					getSimulationIslandManager()->getUnionFind().unite(tagPrev, tagCur);
+				}
+				if (cur && !cur->isStaticOrKinematicObject())
+					prev = cur;
+			}
+		}
+	}
+	//merge islands linked by multibody constraints
+	{
+		for (int i=0;i<this->m_multiBodyConstraints.size();i++)
+		{
+			btMultiBodyConstraint* c = m_multiBodyConstraints[i];
+			int tagA = c->getIslandIdA();
+			int tagB = c->getIslandIdB();
+			if (tagA>=0 && tagB>=0)
+				getSimulationIslandManager()->getUnionFind().unite(tagA, tagB);
+		}
+	}
+	//Store the island id in each body
+	getSimulationIslandManager()->storeIslandActivationState(getCollisionWorld());
+void	btMultiBodyDynamicsWorld::updateActivationState(btScalar timeStep)
+	BT_PROFILE("btMultiBodyDynamicsWorld::updateActivationState");
+	for ( int i=0;i<m_multiBodies.size();i++)
+	{
+		btMultiBody* body = m_multiBodies[i];
+		if (body)
+		{
+			body->checkMotionAndSleepIfRequired(timeStep);
+			if (!body->isAwake())
+			{
+				btMultiBodyLinkCollider* col = body->getBaseCollider();
+				if (col && col->getActivationState() == ACTIVE_TAG)
+				{
+					col->setActivationState( WANTS_DEACTIVATION);
+					col->setDeactivationTime(0.f);
+				}
+				for (int b=0;b<body->getNumLinks();b++)
+				{
+					btMultiBodyLinkCollider* col = body->getLink(b).m_collider;
+					if (col && col->getActivationState() == ACTIVE_TAG)
+					{
+						col->setActivationState( WANTS_DEACTIVATION);
+						col->setDeactivationTime(0.f);
+					}
+				}
+			} else
+			{
+				btMultiBodyLinkCollider* col = body->getBaseCollider();
+				if (col && col->getActivationState() != DISABLE_DEACTIVATION)
+					col->setActivationState( ACTIVE_TAG );
+				for (int b=0;b<body->getNumLinks();b++)
+				{
+					btMultiBodyLinkCollider* col = body->getLink(b).m_collider;
+					if (col && col->getActivationState() != DISABLE_DEACTIVATION)
+						col->setActivationState( ACTIVE_TAG );
+				}
+			}
+		}
+	}
+	btDiscreteDynamicsWorld::updateActivationState(timeStep);
+SIMD_FORCE_INLINE	int	btGetConstraintIslandId2(const btTypedConstraint* lhs)
+	// Island id of a typed constraint: the island tag of rigid body A when it is
+	// non-negative, otherwise the island tag of rigid body B.
+	const btCollisionObject& bodyA = lhs->getRigidBodyA();
+	const btCollisionObject& bodyB = lhs->getRigidBodyB();
+	const int tagA = bodyA.getIslandTag();
+	if (tagA>=0)
+		return tagA;
+	return bodyB.getIslandTag();
+class btSortConstraintOnIslandPredicate2
+	public:
+		// Strict ordering of typed constraints by ascending island id (see btGetConstraintIslandId2).
+		bool operator() ( const btTypedConstraint* lhs, const btTypedConstraint* rhs ) const
+		{
+			return btGetConstraintIslandId2(lhs) < btGetConstraintIslandId2(rhs);
+		}
+SIMD_FORCE_INLINE	int	btGetMultiBodyConstraintIslandId(const btMultiBodyConstraint* lhs)
+	// Island id of a multi-body constraint: island id A when it is non-negative,
+	// otherwise island id B.
+	const int idA = lhs->getIslandIdA();
+	const int idB = lhs->getIslandIdB();
+	if (idA>=0)
+		return idA;
+	return idB;
+class btSortMultiBodyConstraintOnIslandPredicate
+	public:
+		// Strict ordering of multi-body constraints by ascending island id (see btGetMultiBodyConstraintIslandId).
+		bool operator() ( const btMultiBodyConstraint* lhs, const btMultiBodyConstraint* rhs ) const
+		{
+			return btGetMultiBodyConstraintIslandId(lhs) < btGetMultiBodyConstraintIslandId(rhs);
+		}
+struct MultiBodyInplaceSolverIslandCallback : public btSimulationIslandManager::IslandCallback
+	// Island callback that forwards each simulation island to the multi-body
+	// constraint solver. setup() must be called with the island-sorted constraint
+	// arrays before the islands are processed. Small islands are accumulated in
+	// the m_bodies/m_manifolds/m_constraints/m_multiBodyConstraints batches and
+	// solved together by processConstraints(); the owner has to call
+	// processConstraints() once more at the end to flush a pending batch.
+	btContactSolverInfo*	m_solverInfo;
+	btMultiBodyConstraintSolver*		m_solver;
+	//island-sorted input arrays (not owned) and their sizes, provided by setup()
+	btMultiBodyConstraint**		m_multiBodySortedConstraints;
+	int							m_numMultiBodyConstraints;
+	btTypedConstraint**		m_sortedConstraints;
+	int						m_numConstraints;
+	btIDebugDraw*			m_debugDrawer;
+	btDispatcher*			m_dispatcher;
+	//batch of islands that have been deferred until enough work accumulated
+	btAlignedObjectArray<btCollisionObject*> m_bodies;
+	btAlignedObjectArray<btPersistentManifold*> m_manifolds;
+	btAlignedObjectArray<btTypedConstraint*> m_constraints;
+	btAlignedObjectArray<btMultiBodyConstraint*> m_multiBodyConstraints;
+	MultiBodyInplaceSolverIslandCallback(	btMultiBodyConstraintSolver*	solver,
+									btDispatcher* dispatcher)
+		:m_solverInfo(NULL),
+		m_solver(solver),
+		m_multiBodySortedConstraints(NULL),
+		m_numMultiBodyConstraints(0),	//previously left uninitialized until setup()
+		m_sortedConstraints(NULL),		//previously left uninitialized until setup()
+		m_numConstraints(0),
+		m_debugDrawer(NULL),
+		m_dispatcher(dispatcher)
+	{
+	}
+	//assignment is not supported; asserts when used
+	MultiBodyInplaceSolverIslandCallback& operator=(MultiBodyInplaceSolverIslandCallback& other)
+	{
+		btAssert(0);
+		(void)other;
+		return *this;
+	}
+	//stores the solver inputs for the upcoming island pass and empties the batch
+	SIMD_FORCE_INLINE void setup ( btContactSolverInfo* solverInfo, btTypedConstraint** sortedConstraints, int numConstraints, btMultiBodyConstraint** sortedMultiBodyConstraints,	int	numMultiBodyConstraints,	btIDebugDraw* debugDrawer)
+	{
+		btAssert(solverInfo);
+		m_solverInfo = solverInfo;
+		m_multiBodySortedConstraints = sortedMultiBodyConstraints;
+		m_numMultiBodyConstraints = numMultiBodyConstraints;
+		m_sortedConstraints = sortedConstraints;
+		m_numConstraints = numConstraints;
+		m_debugDrawer = debugDrawer;
+		m_bodies.resize (0);
+		m_manifolds.resize (0);
+		m_constraints.resize (0);
+		m_multiBodyConstraints.resize(0);
+	}
+	//called by the island manager for each island; islandId<0 means islands are not split
+	virtual	void	processIsland(btCollisionObject** bodies,int numBodies,btPersistentManifold**	manifolds,int numManifolds, int islandId)
+	{
+		if (islandId<0)
+		{
+			///we don't split islands, so all constraints/contact manifolds/bodies are passed into the solver regardless of the island id
+			//fix: the multi-body array has to be paired with its own count (was m_numConstraints)
+			m_solver->solveMultiBodyGroup( bodies,numBodies,manifolds, numManifolds,m_sortedConstraints, m_numConstraints, m_multiBodySortedConstraints,m_numMultiBodyConstraints,*m_solverInfo,m_debugDrawer,m_dispatcher);
+		} else
+		{
+				//also add all non-contact constraints/joints for this island
+			btTypedConstraint** startConstraint = 0;
+			btMultiBodyConstraint** startMultiBodyConstraint = 0;
+			int numCurConstraints = 0;
+			int numCurMultiBodyConstraints = 0;
+			int i;
+			//find the first constraint for this island
+			for (i=0;i<m_numConstraints;i++)
+			{
+				if (btGetConstraintIslandId2(m_sortedConstraints[i]) == islandId)
+				{
+					startConstraint = &m_sortedConstraints[i];
+					break;
+				}
+			}
+			//count the number of constraints in this island
+			for (;i<m_numConstraints;i++)
+			{
+				if (btGetConstraintIslandId2(m_sortedConstraints[i]) == islandId)
+				{
+					numCurConstraints++;
+				}
+			}
+			//find the first multi body constraint for this island
+			for (i=0;i<m_numMultiBodyConstraints;i++)
+			{
+				if (btGetMultiBodyConstraintIslandId(m_multiBodySortedConstraints[i]) == islandId)
+				{
+					startMultiBodyConstraint = &m_multiBodySortedConstraints[i];
+					break;
+				}
+			}
+			//count the number of multi body constraints in this island
+			for (;i<m_numMultiBodyConstraints;i++)
+			{
+				if (btGetMultiBodyConstraintIslandId(m_multiBodySortedConstraints[i]) == islandId)
+				{
+					numCurMultiBodyConstraints++;
+				}
+			}
+			if (m_solverInfo->m_minimumSolverBatchSize<=1)
+			{
+				//fix: solve through solveMultiBodyGroup so the multi body constraints of
+				//this island are not dropped (solveGroup only receives the typed constraints)
+				m_solver->solveMultiBodyGroup( bodies,numBodies,manifolds, numManifolds,startConstraint,numCurConstraints,startMultiBodyConstraint,numCurMultiBodyConstraints,*m_solverInfo,m_debugDrawer,m_dispatcher);
+			} else
+			{
+				//append this island to the batch
+				for (i=0;i<numBodies;i++)
+					m_bodies.push_back(bodies[i]);
+				for (i=0;i<numManifolds;i++)
+					m_manifolds.push_back(manifolds[i]);
+				for (i=0;i<numCurConstraints;i++)
+					m_constraints.push_back(startConstraint[i]);
+				for (i=0;i<numCurMultiBodyConstraints;i++)
+					m_multiBodyConstraints.push_back(startMultiBodyConstraint[i]);
+				//solve the batch as soon as it is large enough, otherwise keep deferring
+				if ((m_constraints.size()+m_manifolds.size())>m_solverInfo->m_minimumSolverBatchSize)
+				{
+					processConstraints();
+				} else
+				{
+					//printf("deferred\n");
+				}
+			}
+		}
+	}
+	//solves everything accumulated in the batch and empties it (also called by the owner to flush)
+	void	processConstraints()
+	{
+		btCollisionObject** bodies = m_bodies.size()? &m_bodies[0]:0;
+		btPersistentManifold** manifold = m_manifolds.size()?&m_manifolds[0]:0;
+		btTypedConstraint** constraints = m_constraints.size()?&m_constraints[0]:0;
+		btMultiBodyConstraint** multiBodyConstraints = m_multiBodyConstraints.size() ? &m_multiBodyConstraints[0] : 0;
+		//printf("mb contacts = %d, mb constraints = %d\n", mbContacts, m_multiBodyConstraints.size());
+		m_solver->solveMultiBodyGroup( bodies,m_bodies.size(),manifold, m_manifolds.size(),constraints, m_constraints.size() ,multiBodyConstraints, m_multiBodyConstraints.size(), *m_solverInfo,m_debugDrawer,m_dispatcher);
+		m_bodies.resize(0);
+		m_manifolds.resize(0);
+		m_constraints.resize(0);
+		m_multiBodyConstraints.resize(0);
+	}
+btMultiBodyDynamicsWorld::btMultiBodyDynamicsWorld(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btMultiBodyConstraintSolver* constraintSolver,btCollisionConfiguration* collisionConfiguration)
+	:btDiscreteDynamicsWorld(dispatcher,pairCache,constraintSolver,collisionConfiguration),
+	m_multiBodyConstraintSolver(constraintSolver)
+	// Creates the island callback that feeds islands to the multi-body solver
+	// (released in the destructor) and adjusts the default solver settings.
+	m_solverMultiBodyIslandCallback = new MultiBodyInplaceSolverIslandCallback(constraintSolver,dispatcher);
+	//two friction directions are always enabled for this world
+	getSolverInfo().m_solverMode |=SOLVER_USE_2_FRICTION_DIRECTIONS;
+	//split impulse is not yet supported for Featherstone hierarchies
+	getSolverInfo().m_splitImpulse = false;
+btMultiBodyDynamicsWorld::~btMultiBodyDynamicsWorld ()
+	// Releases the island callback allocated in the constructor.
+	delete m_solverMultiBodyIslandCallback;
+void	btMultiBodyDynamicsWorld::forwardKinematics()
+	for (int b=0;b<m_multiBodies.size();b++)
+	{
+		btMultiBody* bod = m_multiBodies[b];
+		bod->forwardKinematics(m_scratch_world_to_local,m_scratch_local_origin);
+	}
+void	btMultiBodyDynamicsWorld::solveConstraints(btContactSolverInfo& solverInfo)
+	// Constraint solving step for rigid bodies and multi-bodies. In order:
+	//  1. forward kinematics for all multi-bodies
+	//  2. sort typed and multi-body constraints by island id and hand them to the island callback
+	//  3. build the islands (islands may be solved immediately or batched by the callback)
+	//  4. add gravity to all multi-bodies that are not sleeping
+	//  5. compute the accelerations of the awake multi-bodies (articulated body algorithm, or RK4)
+	//  6. clear the multi-body constraint forces, flush the batched islands and notify the solver
+	//  7. second articulated body pass with isConstraintPass=true (non-RK4 bodies only)
+	//  8. processDeltaVeeMultiDof2() on every multi-body
+	forwardKinematics();
+	BT_PROFILE("solveConstraints");
+	//sort the typed constraints by island id
+	m_sortedConstraints.resize( m_constraints.size());
+	int i; 
+	for (i=0;i<getNumConstraints();i++)
+	{
+		m_sortedConstraints[i] = m_constraints[i];
+	}
+	m_sortedConstraints.quickSort(btSortConstraintOnIslandPredicate2());
+	btTypedConstraint** constraintsPtr = getNumConstraints() ? &m_sortedConstraints[0] : 0;
+	//sort the multi-body constraints by island id
+	m_sortedMultiBodyConstraints.resize(m_multiBodyConstraints.size());
+	for (i=0;i<m_multiBodyConstraints.size();i++)
+	{
+		m_sortedMultiBodyConstraints[i] = m_multiBodyConstraints[i];
+	}
+	m_sortedMultiBodyConstraints.quickSort(btSortMultiBodyConstraintOnIslandPredicate());
+	btMultiBodyConstraint** sortedMultiBodyConstraints = m_sortedMultiBodyConstraints.size() ?  &m_sortedMultiBodyConstraints[0] : 0;
+	m_solverMultiBodyIslandCallback->setup(&solverInfo,constraintsPtr,m_sortedConstraints.size(),sortedMultiBodyConstraints,m_sortedMultiBodyConstraints.size(), getDebugDrawer());
+	m_constraintSolver->prepareSolve(getCollisionWorld()->getNumCollisionObjects(), getCollisionWorld()->getDispatcher()->getNumManifolds());
+	/// solve all the constraints for this island
+	m_islandManager->buildAndProcessIslands(getCollisionWorld()->getDispatcher(),getCollisionWorld(),m_solverMultiBodyIslandCallback);
+	{
+		BT_PROFILE("btMultiBody addForce");
+		for (int i=0;i<this->m_multiBodies.size();i++)
+		{
+			btMultiBody* bod = m_multiBodies[i];
+			//a multi-body counts as sleeping as soon as its base or any link collider sleeps
+			bool isSleeping = false;
+			if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
+			{
+				isSleeping = true;
+			} 
+			for (int b=0;b<bod->getNumLinks();b++)
+			{
+				if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState()==ISLAND_SLEEPING)
+					isSleeping = true;
+			} 
+			if (!isSleeping)
+			{
+				//useless? they get resized in stepVelocities once again (AND DIFFERENTLY)
+				m_scratch_r.resize(bod->getNumLinks()+1);			//multidof? ("Y"s use it and it is used to store qdd)
+				m_scratch_v.resize(bod->getNumLinks()+1);
+				m_scratch_m.resize(bod->getNumLinks()+1);
+				//gravity acts on the base and on every link
+				bod->addBaseForce(m_gravity * bod->getBaseMass());
+				for (int j = 0; j < bod->getNumLinks(); ++j) 
+				{
+					bod->addLinkForce(j, m_gravity * bod->getLinkMass(j));
+				}
+			}//if (!isSleeping)
+		}
+	}
+	{
+		BT_PROFILE("btMultiBody stepVelocities");
+		for (int i=0;i<this->m_multiBodies.size();i++)
+		{
+			btMultiBody* bod = m_multiBodies[i];
+			bool isSleeping = false;
+			if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
+			{
+				isSleeping = true;
+			} 
+			for (int b=0;b<bod->getNumLinks();b++)
+			{
+				if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState()==ISLAND_SLEEPING)
+					isSleeping = true;
+			} 
+			if (!isSleeping)
+			{
+				//useless? they get resized in stepVelocities once again (AND DIFFERENTLY)
+				m_scratch_r.resize(bod->getNumLinks()+1);			//multidof? ("Y"s use it and it is used to store qdd)
+				m_scratch_v.resize(bod->getNumLinks()+1);
+				m_scratch_m.resize(bod->getNumLinks()+1);
+				bool doNotUpdatePos = false;
+				{
+					if(!bod->isUsingRK4Integration())
+					{
+						bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(solverInfo.m_timeStep, m_scratch_r, m_scratch_v, m_scratch_m);
+					}
+					else
+					{						
+						//RK4 integration: four acceleration evaluations (qdd0..qdd3) at intermediate states
+						int numDofs = bod->getNumDofs() + 6;
+						int numPosVars = bod->getNumPosVars() + 7;
+						//one allocation holding q0, qx, qd0..qd3 and qdd0..qdd3 back to back
+						btAlignedObjectArray<btScalar> scratch_r2; scratch_r2.resize(2*numPosVars + 8*numDofs);
+						//convenience
+						btScalar *pMem = &scratch_r2[0];
+						btScalar *scratch_q0 = pMem; pMem += numPosVars;
+						btScalar *scratch_qx = pMem; pMem += numPosVars;
+						btScalar *scratch_qd0 = pMem; pMem += numDofs;
+						btScalar *scratch_qd1 = pMem; pMem += numDofs;
+						btScalar *scratch_qd2 = pMem; pMem += numDofs;
+						btScalar *scratch_qd3 = pMem; pMem += numDofs;
+						btScalar *scratch_qdd0 = pMem; pMem += numDofs;
+						btScalar *scratch_qdd1 = pMem; pMem += numDofs;
+						btScalar *scratch_qdd2 = pMem; pMem += numDofs;
+						btScalar *scratch_qdd3 = pMem; pMem += numDofs;
+						btAssert((pMem - (2*numPosVars + 8*numDofs)) == &scratch_r2[0]);
+						/////						
+						//copy q0 to scratch_q0 and qd0 to scratch_qd0
+						scratch_q0[0] = bod->getWorldToBaseRot().x();
+						scratch_q0[1] = bod->getWorldToBaseRot().y();
+						scratch_q0[2] = bod->getWorldToBaseRot().z();
+						scratch_q0[3] = bod->getWorldToBaseRot().w();
+						scratch_q0[4] = bod->getBasePos().x();
+						scratch_q0[5] = bod->getBasePos().y();
+						scratch_q0[6] = bod->getBasePos().z();
+						//joint positions follow the 7 base entries
+						for(int link = 0; link < bod->getNumLinks(); ++link)
+						{
+							for(int dof = 0; dof < bod->getLink(link).m_posVarCount; ++dof)
+								scratch_q0[7 + bod->getLink(link).m_cfgOffset + dof] = bod->getLink(link).m_jointPos[dof];							
+						}
+						//
+						for(int dof = 0; dof < numDofs; ++dof)								
+							scratch_qd0[dof] = bod->getVelocityVector()[dof];
+						////
+						//local functor: resets scratch_qx to the initial configuration q0
+						struct
+						{
+							btMultiBody *bod;
+							btScalar *scratch_qx, *scratch_q0;
+							void operator()()
+							{
+								for(int dof = 0; dof < bod->getNumPosVars() + 7; ++dof)
+									scratch_qx[dof] = scratch_q0[dof];
+							}
+						} pResetQx = {bod, scratch_qx, scratch_q0};
+						//local functor: explicit Euler step pVal = pCurVal + dt * pDer
+						struct
+						{
+							void operator()(btScalar dt, const btScalar *pDer, const btScalar *pCurVal, btScalar *pVal, int size)
+							{
+								for(int i = 0; i < size; ++i)
+									pVal[i] = pCurVal[i] + dt * pDer[i];
+							}
+						} pEulerIntegrate;
+						//local functor: overwrites the velocity vector of the multi-body with pData
+						struct
+						{
+							void operator()(btMultiBody *pBody, const btScalar *pData)
+							{
+								btScalar *pVel = const_cast<btScalar*>(pBody->getVelocityVector());
+								for(int i = 0; i < pBody->getNumDofs() + 6; ++i)
+									pVel[i] = pData[i];
+							}
+						} pCopyToVelocityVector;
+						//local functor: copies size entries from pSrc[start..] to pDst
+						struct
+						{
+							void operator()(const btScalar *pSrc, btScalar *pDst, int start, int size)
+							{
+								for(int i = 0; i < size; ++i)
+									pDst[i] = pSrc[start + i];
+							}
+						} pCopy;
+						//
+						btScalar h = solverInfo.m_timeStep;
+						//location of the computed accelerations inside m_scratch_r; a macro so the
+						//address is re-evaluated at every use (presumably because the scratch array
+						//is resized by the acceleration computation, see the note above - confirm)
+						#define output &m_scratch_r[bod->getNumDofs()]
+						//calc qdd0 from: q0 & qd0	
+						bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(0., m_scratch_r, m_scratch_v, m_scratch_m);
+						pCopy(output, scratch_qdd0, 0, numDofs);
+						//calc q1 = q0 + h/2 * qd0
+						pResetQx();
+						bod->stepPositionsMultiDof(btScalar(.5)*h, scratch_qx, scratch_qd0);
+						//calc qd1 = qd0 + h/2 * qdd0
+						pEulerIntegrate(btScalar(.5)*h, scratch_qdd0, scratch_qd0, scratch_qd1, numDofs);
+						//
+						//calc qdd1 from: q1 & qd1
+						pCopyToVelocityVector(bod, scratch_qd1);
+						bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(0., m_scratch_r, m_scratch_v, m_scratch_m);
+						pCopy(output, scratch_qdd1, 0, numDofs);
+						//calc q2 = q0 + h/2 * qd1
+						pResetQx();
+						bod->stepPositionsMultiDof(btScalar(.5)*h, scratch_qx, scratch_qd1);
+						//calc qd2 = qd0 + h/2 * qdd1
+						pEulerIntegrate(btScalar(.5)*h, scratch_qdd1, scratch_qd0, scratch_qd2, numDofs);
+						//
+						//calc qdd2 from: q2 & qd2
+						pCopyToVelocityVector(bod, scratch_qd2);
+						bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(0., m_scratch_r, m_scratch_v, m_scratch_m);
+						pCopy(output, scratch_qdd2, 0, numDofs);
+						//calc q3 = q0 + h * qd2
+						pResetQx();
+						bod->stepPositionsMultiDof(h, scratch_qx, scratch_qd2);
+						//calc qd3 = qd0 + h * qdd2
+						pEulerIntegrate(h, scratch_qdd2, scratch_qd0, scratch_qd3, numDofs);
+						//
+						//calc qdd3 from: q3 & qd3
+						pCopyToVelocityVector(bod, scratch_qd3);
+						bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(0., m_scratch_r, m_scratch_v, m_scratch_m);
+						pCopy(output, scratch_qdd3, 0, numDofs);
+						//the helper macro is only meaningful inside this block; do not leak the
+						//very common identifier 'output' into the rest of the translation unit
+						#undef output
+						//
+						//calc q = q0 + h/6(qd0 + 2*(qd1 + qd2) + qd3)
+						//calc qd = qd0 + h/6(qdd0 + 2*(qdd1 + qdd2) + qdd3)						
+						btAlignedObjectArray<btScalar> delta_q; delta_q.resize(numDofs);
+						btAlignedObjectArray<btScalar> delta_qd; delta_qd.resize(numDofs);
+						for(int i = 0; i < numDofs; ++i)
+						{
+							delta_q[i] = h/btScalar(6.)*(scratch_qd0[i] + 2*scratch_qd1[i] + 2*scratch_qd2[i] + scratch_qd3[i]);
+							delta_qd[i] = h/btScalar(6.)*(scratch_qdd0[i] + 2*scratch_qdd1[i] + 2*scratch_qdd2[i] + scratch_qdd3[i]);							
+							//delta_q[i] = h*scratch_qd0[i];
+							//delta_qd[i] = h*scratch_qdd0[i];
+						}
+						//restore the initial velocities, then apply the RK4 velocity change
+						pCopyToVelocityVector(bod, scratch_qd0);
+						bod->applyDeltaVeeMultiDof(&delta_qd[0], 1);						
+						//
+						if(!doNotUpdatePos)
+						{
+							//park delta_q behind the velocity data of the multi-body; integrateTransforms()
+							//reads it from the same offset when isPosUpdated() is set
+							btScalar *pRealBuf = const_cast<btScalar *>(bod->getVelocityVector());
+							pRealBuf += 6 + bod->getNumDofs() + bod->getNumDofs()*bod->getNumDofs();
+							for(int i = 0; i < numDofs; ++i)
+								pRealBuf[i] = delta_q[i];
+							//bod->stepPositionsMultiDof(1, 0, &delta_q[0]);
+							bod->setPosUpdated(true);							
+						}
+						//ugly hack which resets the cached data to t0 (needed for constraint solver)
+						{
+							for(int link = 0; link < bod->getNumLinks(); ++link)
+								bod->getLink(link).updateCacheMultiDof();
+							bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(0, m_scratch_r, m_scratch_v, m_scratch_m);
+						}
+					}
+				}
+				bod->clearForcesAndTorques();
+			}//if (!isSleeping)
+		}
+	}
+	clearMultiBodyConstraintForces();
+	//solve whatever the island callback still holds in its batch
+	m_solverMultiBodyIslandCallback->processConstraints();
+	m_constraintSolver->allSolved(solverInfo, m_debugDrawer);
+	{
+		//second pass over the awake, non-RK4 multi-bodies with isConstraintPass=true
+		BT_PROFILE("btMultiBody stepVelocities");
+		for (int i=0;i<this->m_multiBodies.size();i++)
+		{
+			btMultiBody* bod = m_multiBodies[i];
+			bool isSleeping = false;
+			if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
+			{
+				isSleeping = true;
+			}
+			for (int b=0;b<bod->getNumLinks();b++)
+			{
+				if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState()==ISLAND_SLEEPING)
+					isSleeping = true;
+			}
+			if (!isSleeping)
+			{
+				//useless? they get resized in stepVelocities once again (AND DIFFERENTLY)
+				m_scratch_r.resize(bod->getNumLinks()+1);                 //multidof? ("Y"s use it and it is used to store qdd)
+				m_scratch_v.resize(bod->getNumLinks()+1);
+				m_scratch_m.resize(bod->getNumLinks()+1);
+				{
+					if(!bod->isUsingRK4Integration())
+					{
+						bool isConstraintPass = true;
+						bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(solverInfo.m_timeStep, m_scratch_r, m_scratch_v, m_scratch_m, isConstraintPass);
+					}
+				}
+			}
+		}
+	}
+	for (int i=0;i<this->m_multiBodies.size();i++)
+	{
+		btMultiBody* bod = m_multiBodies[i];
+		bod->processDeltaVeeMultiDof2();
+	}
+void	btMultiBodyDynamicsWorld::integrateTransforms(btScalar timeStep)
+	// Integrates the rigid bodies through the base class, then steps the positions
+	// of every awake multi-body and refreshes the world transforms of its
+	// colliders. Sleeping multi-bodies only get their velocities cleared.
+	btDiscreteDynamicsWorld::integrateTransforms(timeStep);
+	{
+		BT_PROFILE("btMultiBody stepPositions");
+		//integrate and update the Featherstone hierarchies
+		for (int mb=0;mb<m_multiBodies.size();mb++)
+		{
+			btMultiBody* multiBody = m_multiBodies[mb];
+			//sleeping as soon as the base collider or any link collider sleeps
+			bool asleep = false;
+			if (multiBody->getBaseCollider() && multiBody->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
+				asleep = true;
+			for (int link=0;link<multiBody->getNumLinks();link++)
+			{
+				btMultiBodyLinkCollider* linkCollider = multiBody->getLink(link).m_collider;
+				if (linkCollider && linkCollider->getActivationState()==ISLAND_SLEEPING)
+					asleep = true;
+			}
+			if (asleep)
+			{
+				multiBody->clearVelocities();
+				continue;
+			}
+			const int numLinks = multiBody->getNumLinks();
+			if (multiBody->isPosUpdated())
+			{
+				//a position delta was stored behind the velocity data (RK4 path of solveConstraints);
+				//apply it with a unit time step and clear the flag
+				btScalar *storedDelta = const_cast<btScalar *>(multiBody->getVelocityVector());
+				storedDelta += 6 + multiBody->getNumDofs() + multiBody->getNumDofs()*multiBody->getNumDofs();
+				multiBody->stepPositionsMultiDof(1, 0, storedDelta);
+				multiBody->setPosUpdated(false);
+			}
+			else
+			{
+				multiBody->stepPositionsMultiDof(timeStep);
+			}
+			///base + num m_links
+			m_scratch_world_to_local.resize(numLinks+1);
+			m_scratch_local_origin.resize(numLinks+1);
+			multiBody->updateCollisionObjectWorldTransforms(m_scratch_world_to_local,m_scratch_local_origin);
+		}
+	}
+void	btMultiBodyDynamicsWorld::addMultiBodyConstraint( btMultiBodyConstraint* constraint)
+	// Registers a multi-body constraint with this world (the pointer is stored as-is).
+	m_multiBodyConstraints.push_back(constraint);
+void	btMultiBodyDynamicsWorld::removeMultiBodyConstraint( btMultiBodyConstraint* constraint)
+	// Unregisters a multi-body constraint; the constraint object is not deleted here.
+	m_multiBodyConstraints.remove(constraint);
+void btMultiBodyDynamicsWorld::debugDrawMultiBodyConstraint(btMultiBodyConstraint* constraint)
+	// Lets the constraint draw itself with the debug drawer of this world.
+	constraint->debugDraw(getDebugDrawer());
+void	btMultiBodyDynamicsWorld::debugDrawWorld()
+	// When a debug drawer is installed and asks for constraints or constraint
+	// limits, draws every multibody constraint, every multibody base/link frame
+	// and a short line along each revolute, fixed or prismatic joint axis.
+	// The base world's debugDrawWorld() always runs afterwards.
+	BT_PROFILE("btMultiBodyDynamicsWorld debugDrawWorld");
+	if (getDebugDrawer())
+	{
+		const int constraintFlags = btIDebugDraw::DBG_DrawConstraints | btIDebugDraw::DBG_DrawConstraintLimits;
+		const bool wantConstraints = (getDebugDrawer()->getDebugMode() & constraintFlags) != 0;
+		if (wantConstraints)
+		{
+			BT_PROFILE("btMultiBody debugDrawWorld");
+			for (int constraintIdx = 0; constraintIdx < m_multiBodyConstraints.size(); ++constraintIdx)
+			{
+				debugDrawMultiBodyConstraint(m_multiBodyConstraints[constraintIdx]);
+			}
+			for (int bodyIdx = 0; bodyIdx < m_multiBodies.size(); ++bodyIdx)
+			{
+				btMultiBody* body = m_multiBodies[bodyIdx];
+				body->forwardKinematics(m_scratch_world_to_local1, m_scratch_local_origin1);
+				getDebugDrawer()->drawTransform(body->getBaseWorldTransform(), 0.1);
+				for (int linkIdx = 0; linkIdx < body->getNumLinks(); ++linkIdx)
+				{
+					const btTransform& linkTr = body->getLink(linkIdx).m_cachedWorldTransform;
+					getDebugDrawer()->drawTransform(linkTr, 0.1);
+					// choose the joint axis to draw: revolute joints use the top vector of
+					// axis 0, fixed and prismatic joints the bottom vector; other types draw nothing
+					const btVector3* localAxis = 0;
+					switch (body->getLink(linkIdx).m_jointType)
+					{
+						case btMultibodyLink::eRevolute:
+							localAxis = &body->getLink(linkIdx).m_axes[0].m_topVec;
+							break;
+						case btMultibodyLink::eFixed:
+						case btMultibodyLink::ePrismatic:
+							localAxis = &body->getLink(linkIdx).m_axes[0].m_bottomVec;
+							break;
+						default:
+							break;
+					}
+					if (localAxis)
+					{
+						btVector3 axisWorld = quatRotate(linkTr.getRotation(), *localAxis);
+						btVector4 lineColor(0,0,0,1);
+						btVector3 lineStart = axisWorld+linkTr.getOrigin()-quatRotate(linkTr.getRotation(), body->getLink(linkIdx).m_dVector);
+						btVector3 lineEnd = linkTr.getOrigin()-quatRotate(linkTr.getRotation(), body->getLink(linkIdx).m_dVector);
+						getDebugDrawer()->drawLine(lineStart, lineEnd, lineColor);
+					}
+				}
+			}
+		}
+	}
+	btDiscreteDynamicsWorld::debugDrawWorld();
+void btMultiBodyDynamicsWorld::applyGravity()
+        btDiscreteDynamicsWorld::applyGravity();
+        BT_PROFILE("btMultiBody addGravity");
+        for (int i=0;i<this->m_multiBodies.size();i++)
+        {
+                btMultiBody* bod = m_multiBodies[i];
+                bool isSleeping = false;
+                if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
+                {
+                        isSleeping = true;
+                }
+                for (int b=0;b<bod->getNumLinks();b++)
+                {
+                        if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState()==ISLAND_SLEEPING)
+                                isSleeping = true;
+                }
+                if (!isSleeping)
+                {
+                        bod->addBaseForce(m_gravity * bod->getBaseMass());
+                        for (int j = 0; j < bod->getNumLinks(); ++j)
+                        {
+                                bod->addLinkForce(j, m_gravity * bod->getLinkMass(j));
+                        }
+                }//if (!isSleeping)
+        }
+void btMultiBodyDynamicsWorld::clearMultiBodyConstraintForces()
+  for (int i=0;i<this->m_multiBodies.size();i++)
+                {       
+                        btMultiBody* bod = m_multiBodies[i];
+			bod->clearConstraintForces();
+                  } 
+void btMultiBodyDynamicsWorld::clearMultiBodyForces()
+	{
+		// Calls clearForcesAndTorques() on every multibody that is not asleep.
+		// A multibody is treated as asleep when its base collider or any of its
+		// link colliders reports ISLAND_SLEEPING; such multibodies are skipped.
+		BT_PROFILE("clearMultiBodyForces");
+		for (int i=0;i<this->m_multiBodies.size();i++)
+		{
+			btMultiBody* bod = m_multiBodies[i];
+			bool isSleeping = false;
+			if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
+			{
+				isSleeping = true;
+			}
+			for (int b=0;b<bod->getNumLinks();b++)
+			{
+				if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState()==ISLAND_SLEEPING)
+					isSleeping = true;
+			}
+			if (!isSleeping)
+			{
+				bod->clearForcesAndTorques();
+			}
+		}
+	}
+void btMultiBodyDynamicsWorld::clearForces()	// Clears forces through the base world, then on the multibodies.
+        btDiscreteDynamicsWorld::clearForces();	// base world's pass runs first
+	clearMultiBodyForces();	// multibody pass; that routine leaves sleeping multibodies untouched
+void	btMultiBodyDynamicsWorld::serialize(btSerializer* serializer)	// Writes the world to 'serializer'; all output is bracketed by startSerialization()/finishSerialization().
+	serializer->startSerialization();
+	serializeDynamicsWorldInfo( serializer);	// world info is written first
+	serializeMultiBodies(serializer);	// multibodies are emitted before rigid bodies and collision objects
+	serializeRigidBodies(serializer);
+	serializeCollisionObjects(serializer);
+	serializer->finishSerialization();
+void	btMultiBodyDynamicsWorld::serializeMultiBodies(btSerializer* serializer)
+	int i;
+	//serialize all collision objects
+	for (i=0;i<m_multiBodies.size();i++)
+	{
+		btMultiBody* mb = m_multiBodies[i];
+		{
+			int len = mb->calculateSerializeBufferSize();
+			btChunk* chunk = serializer->allocate(len,1);
+			const char* structType = mb->serialize(chunk->m_oldPtr, serializer);
+			serializer->finalizeChunk(chunk,structType,BT_MULTIBODY_CODE,mb);
+		}
+	}
\ No newline at end of file
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.h
new file mode 100644
index 00000000..2c912da5
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.h
@@ -0,0 +1,109 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h"
+class btMultiBody;
+class btMultiBodyConstraint;
+class btMultiBodyConstraintSolver;
+struct MultiBodyInplaceSolverIslandCallback;
+///The btMultiBodyDynamicsWorld adds Featherstone multi body dynamics to Bullet
+///This implementation is still preliminary/experimental.
+class btMultiBodyDynamicsWorld : public btDiscreteDynamicsWorld	// Discrete dynamics world extended with multibodies and multibody constraints.
+	btAlignedObjectArray<btMultiBody*> m_multiBodies;	// multibodies in this world; walked by applyGravity(), clearMultiBodyForces(), debugDrawWorld() and serializeMultiBodies()
+	btAlignedObjectArray<btMultiBodyConstraint*> m_multiBodyConstraints;	// grown by addMultiBodyConstraint(), shrunk by removeMultiBodyConstraint()
+	btAlignedObjectArray<btMultiBodyConstraint*> m_sortedMultiBodyConstraints;	// presumably a sorted working copy used while solving - TODO confirm against the .cpp
+	btMultiBodyConstraintSolver*	m_multiBodyConstraintSolver;
+	MultiBodyInplaceSolverIslandCallback*	m_solverMultiBodyIslandCallback;
+	//cached data to avoid memory allocations
+	btAlignedObjectArray<btQuaternion> m_scratch_world_to_local;	// resized to numLinks+1 and handed to updateCollisionObjectWorldTransforms() when positions are stepped
+	btAlignedObjectArray<btVector3> m_scratch_local_origin;	// companion buffer to m_scratch_world_to_local
+	btAlignedObjectArray<btQuaternion> m_scratch_world_to_local1;	// second scratch pair, passed to forwardKinematics() by debugDrawWorld()
+	btAlignedObjectArray<btVector3> m_scratch_local_origin1;
+	btAlignedObjectArray<btScalar> m_scratch_r;
+	btAlignedObjectArray<btVector3> m_scratch_v;
+	btAlignedObjectArray<btMatrix3x3> m_scratch_m;
+	virtual void	calculateSimulationIslands();
+	virtual void	updateActivationState(btScalar timeStep);
+	virtual void	solveConstraints(btContactSolverInfo& solverInfo);
+	virtual void	serializeMultiBodies(btSerializer* serializer);	// writes one BT_MULTIBODY_CODE chunk per multibody
+	btMultiBodyDynamicsWorld(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btMultiBodyConstraintSolver* constraintSolver,btCollisionConfiguration* collisionConfiguration);
+	virtual ~btMultiBodyDynamicsWorld ();
+	virtual void	addMultiBody(btMultiBody* body, short group= btBroadphaseProxy::DefaultFilter, short mask=btBroadphaseProxy::AllFilter);
+	virtual void	removeMultiBody(btMultiBody* body);
+	virtual int		getNumMultibodies() const	// number of entries in m_multiBodies
+	{
+		return m_multiBodies.size();
+	}
+	btMultiBody*	getMultiBody(int mbIndex)	// mbIndex is passed straight to the array's operator[]; no range check is done here
+	{
+		return m_multiBodies[mbIndex];
+	}
+	virtual void	addMultiBodyConstraint( btMultiBodyConstraint* constraint);	// stores the pointer; the constraint is not copied
+	virtual int     getNumMultiBodyConstraints() const	// number of entries in m_multiBodyConstraints
+	{
+        return m_multiBodyConstraints.size();
+	}
+	virtual btMultiBodyConstraint*	getMultiBodyConstraint( int constraintIndex)	// no range check is done here on constraintIndex
+	{
+        return m_multiBodyConstraints[constraintIndex];
+	}
+	virtual const btMultiBodyConstraint*	getMultiBodyConstraint( int constraintIndex) const	// const overload of the accessor above
+	{
+        return m_multiBodyConstraints[constraintIndex];
+	}
+	virtual void	removeMultiBodyConstraint( btMultiBodyConstraint* constraint);	// removes the pointer from the list; does not delete the constraint
+	virtual void	integrateTransforms(btScalar timeStep);
+	virtual void	debugDrawWorld();	// draws multibody constraints, frames and joint axes, then calls the base world's debugDrawWorld()
+	virtual void	debugDrawMultiBodyConstraint(btMultiBodyConstraint* constraint);
+	void	forwardKinematics();
+	virtual void clearForces();	// base world's clearForces() followed by clearMultiBodyForces()
+	virtual void clearMultiBodyConstraintForces();	// applies to every multibody, sleeping or not
+	virtual void clearMultiBodyForces();	// skips multibodies whose base or any link collider is ISLAND_SLEEPING
+	virtual void applyGravity();	// base world's applyGravity(), then m_gravity * mass on the base and links of awake multibodies
+	virtual	void	serialize(btSerializer* serializer);
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyFixedConstraint.cpp b/src/bullet/BulletDynamics/Featherstone/btMultiBodyFixedConstraint.cpp
new file mode 100644
index 00000000..6ca5b8b0
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyFixedConstraint.cpp
@@ -0,0 +1,211 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///This file was written by Erwin Coumans
+#include "btMultiBodyFixedConstraint.h"
+#include "btMultiBodyLinkCollider.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+#include "BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.h"
+#include "LinearMath/btIDebugDraw.h"
+btMultiBodyFixedConstraint::btMultiBodyFixedConstraint(btMultiBody* body, int link, btRigidBody* bodyB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB)	// Fixed constraint between a multibody link (side A) and a rigid body (side B).
+	:btMultiBodyConstraint(body,0,link,-1,BTMBFIXEDCONSTRAINT_DIM,false),	// base gets no multibody on side B (0) and link B = -1; last two args are presumably row count and a unilateral flag - TODO confirm
+	m_rigidBodyA(0),	// this overload never has a rigid body on side A
+	m_rigidBodyB(bodyB),
+	m_pivotInA(pivotInA),
+	m_pivotInB(pivotInB),
+    m_frameInA(frameInA),
+    m_frameInB(frameInB)
+    m_data.resize(BTMBFIXEDCONSTRAINT_DIM);//at least store the applied impulses
+btMultiBodyFixedConstraint::btMultiBodyFixedConstraint(btMultiBody* bodyA, int linkA, btMultiBody* bodyB, int linkB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB)	// Fixed constraint between two multibody links.
+	:btMultiBodyConstraint(bodyA,bodyB,linkA,linkB,BTMBFIXEDCONSTRAINT_DIM,false),	// last two args are presumably row count and a unilateral flag - TODO confirm
+	m_rigidBodyA(0),	// neither side is a rigid body with this overload
+	m_rigidBodyB(0),
+	m_pivotInA(pivotInA),
+	m_pivotInB(pivotInB),
+    m_frameInA(frameInA),
+    m_frameInB(frameInB)
+    m_data.resize(BTMBFIXEDCONSTRAINT_DIM);//at least store the applied impulses
+void btMultiBodyFixedConstraint::finalizeMultiDof()	// Placeholder for the fixed constraint; it does no setup work.
+	//not implemented yet
+	btAssert(0);	// trips the assertion whenever this is called
+int btMultiBodyFixedConstraint::getIslandIdA() const
+	if (m_rigidBodyA)
+		return m_rigidBodyA->getIslandTag();
+	if (m_bodyA)
+	{
+		btMultiBodyLinkCollider* col = m_bodyA->getBaseCollider();
+		if (col)
+			return col->getIslandTag();
+		for (int i=0;i<m_bodyA->getNumLinks();i++)
+		{
+			if (m_bodyA->getLink(i).m_collider)
+				return m_bodyA->getLink(i).m_collider->getIslandTag();
+		}
+	}
+	return -1;
+int btMultiBodyFixedConstraint::getIslandIdB() const
+	if (m_rigidBodyB)
+		return m_rigidBodyB->getIslandTag();
+	if (m_bodyB)
+	{
+		btMultiBodyLinkCollider* col = m_bodyB->getBaseCollider();
+		if (col)
+			return col->getIslandTag();
+		for (int i=0;i<m_bodyB->getNumLinks();i++)
+		{
+			col = m_bodyB->getLink(i).m_collider;
+			if (col)
+				return col->getIslandTag();
+		}
+	}
+	return -1;
+void btMultiBodyFixedConstraint::createConstraintRows(btMultiBodyConstraintArray& constraintRows, btMultiBodyJacobianData& data, const btContactSolverInfo& infoGlobal)	// Appends one solver row per constrained dimension: rows 0-2 act on the pivot offset, rows 3 and up on the relative frame orientation.
+    for (int i=0;i<numDim;i++)	// NOTE(review): numDim is not declared in the visible part of this function - confirm where it comes from
+	{
+        btMultiBodySolverConstraint& constraintRow = constraintRows.expandNonInitializing();	// the new row is not initialized by the array, so the fields below are set explicitly
+        constraintRow.m_orgConstraint = this;
+        constraintRow.m_orgDofIndex = i;
+        constraintRow.m_relpos1CrossNormal.setValue(0,0,0);
+        constraintRow.m_contactNormal1.setValue(0,0,0);
+        constraintRow.m_relpos2CrossNormal.setValue(0,0,0);
+        constraintRow.m_contactNormal2.setValue(0,0,0);
+        constraintRow.m_angularComponentA.setValue(0,0,0);
+        constraintRow.m_angularComponentB.setValue(0,0,0);
+        constraintRow.m_solverBodyIdA = data.m_fixedBodyId;	// default for both sides; replaced below when that side is a rigid body
+        constraintRow.m_solverBodyIdB = data.m_fixedBodyId;
+        // Convert local points back to world
+        btVector3 pivotAworld = m_pivotInA;	// stays as the local value when side A has neither a rigid body nor a multibody
+        btMatrix3x3 frameAworld = m_frameInA;
+        if (m_rigidBodyA)
+        {
+            constraintRow.m_solverBodyIdA = m_rigidBodyA->getCompanionId();
+            pivotAworld = m_rigidBodyA->getCenterOfMassTransform()*m_pivotInA;
+            frameAworld = frameAworld.transpose()*btMatrix3x3(m_rigidBodyA->getOrientation());
+        } else
+        {
+            if (m_bodyA) {
+                pivotAworld = m_bodyA->localPosToWorld(m_linkA, m_pivotInA);
+                frameAworld = m_bodyA->localFrameToWorld(m_linkA, frameAworld);
+            }
+        }
+        btVector3 pivotBworld = m_pivotInB;	// side B mirrors the handling of side A above
+        btMatrix3x3 frameBworld = m_frameInB;
+        if (m_rigidBodyB)
+        {
+            constraintRow.m_solverBodyIdB = m_rigidBodyB->getCompanionId();
+            pivotBworld = m_rigidBodyB->getCenterOfMassTransform()*m_pivotInB;
+            frameBworld = frameBworld.transpose()*btMatrix3x3(m_rigidBodyB->getOrientation());
+        } else
+        {
+            if (m_bodyB) {
+                pivotBworld = m_bodyB->localPosToWorld(m_linkB, m_pivotInB);
+                frameBworld = m_bodyB->localFrameToWorld(m_linkB, frameBworld);
+            }
+        }
+        btMatrix3x3 relRot = frameAworld.inverse()*frameBworld;	// rotation of frame B relative to frame A
+        btVector3 angleDiff;
+        btGeneric6DofSpring2Constraint::matrixToEulerXYZ(relRot,angleDiff);	// XYZ Euler angles of relRot; used as the angular error for rows 3 and up
+        btVector3 constraintNormalLin(0,0,0);
+        btVector3 constraintNormalAng(0,0,0);
+        btScalar posError = 0.0;
+        if (i < 3) {
+            constraintNormalLin[i] = -1;	// negative unit vector along world axis i
+            posError = (pivotAworld-pivotBworld).dot(constraintNormalLin);	// pivot separation measured along that axis
+            fillMultiBodyConstraint(constraintRow, data, 0, 0, constraintNormalAng,	// the two 0 arguments sit where the joint-limit constraint passes jacobianA(row)/jacobianB(row)
+                                    constraintNormalLin, pivotAworld, pivotBworld,
+                                    posError,
+                                    infoGlobal,
+                                    -m_maxAppliedImpulse, m_maxAppliedImpulse
+                                    );
+        }
+        else { //i>=3
+            constraintNormalAng = frameAworld.getColumn(i%3);	// angular rows use the columns of frame A in world space
+            posError = angleDiff[i%3];
+            fillMultiBodyConstraint(constraintRow, data, 0, 0, constraintNormalAng,
+                                    constraintNormalLin, pivotAworld, pivotBworld,
+                                    posError,
+                                    infoGlobal,
+                                    -m_maxAppliedImpulse, m_maxAppliedImpulse, true	// extra trailing 'true' presumably marks the row as angular - TODO confirm against fillMultiBodyConstraint
+                                    );
+        }
+	}
+void btMultiBodyFixedConstraint::debugDraw(class btIDebugDraw* drawer)
+	// Draws an identity-rotation frame of size 0.1 at the world position of each
+	// pivot that exists: rigid body A, multibody A, rigid body B, multibody B,
+	// in that order. The drawer pointer is used without a null check.
+	btTransform pivotFrame;
+	pivotFrame.setIdentity();
+	if (m_rigidBodyA)
+	{
+		pivotFrame.setOrigin(m_rigidBodyA->getCenterOfMassTransform() * m_pivotInA);
+		drawer->drawTransform(pivotFrame, 0.1);
+	}
+	if (m_bodyA)
+	{
+		pivotFrame.setOrigin(m_bodyA->localPosToWorld(m_linkA, m_pivotInA));
+		drawer->drawTransform(pivotFrame, 0.1);
+	}
+	if (m_rigidBodyB)
+	{
+		// when the constraint is satisfied this should coincide with the frame drawn for side A
+		pivotFrame.setOrigin(m_rigidBodyB->getCenterOfMassTransform() * m_pivotInB);
+		drawer->drawTransform(pivotFrame, 0.1);
+	}
+	if (m_bodyB)
+	{
+		pivotFrame.setOrigin(m_bodyB->localPosToWorld(m_linkB, m_pivotInB));
+		drawer->drawTransform(pivotFrame, 0.1);
+	}
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyFixedConstraint.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodyFixedConstraint.h
new file mode 100644
index 00000000..26e28a74
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyFixedConstraint.h
@@ -0,0 +1,94 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///This file was written by Erwin Coumans
+#include "btMultiBodyConstraint.h"
+class btMultiBodyFixedConstraint : public btMultiBodyConstraint	// Constraint holding a multibody link at a fixed pivot and frame relative to a rigid body or another multibody link.
+	btRigidBody*	m_rigidBodyA;	// both constructors set this to 0
+	btRigidBody*	m_rigidBodyB;	// set only by the multibody-vs-rigid-body constructor; 0 otherwise
+	btVector3		m_pivotInA;	// pivot in side A's local coordinates (converted to world space when rows are created)
+	btVector3		m_pivotInB;	// pivot in side B's local coordinates
+    btMatrix3x3     m_frameInA;	// constraint frame for side A, in local coordinates
+    btMatrix3x3     m_frameInB;	// constraint frame for side B, in local coordinates
+	btMultiBodyFixedConstraint(btMultiBody* body, int link, btRigidBody* bodyB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB);	// multibody link vs. rigid body
+	btMultiBodyFixedConstraint(btMultiBody* bodyA, int linkA, btMultiBody* bodyB, int linkB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB);	// multibody link vs. multibody link
+	virtual ~btMultiBodyFixedConstraint();
+	virtual void finalizeMultiDof();	// not implemented for this constraint; the definition asserts
+	virtual int getIslandIdA() const;	// island tag of side A, or -1 if no collider is found
+	virtual int getIslandIdB() const;	// island tag of side B, or -1 if no collider is found
+	virtual void createConstraintRows(btMultiBodyConstraintArray& constraintRows,
+		btMultiBodyJacobianData& data,
+		const btContactSolverInfo& infoGlobal);
+    const btVector3& getPivotInA() const
+    {
+        return m_pivotInA;
+    }
+    void setPivotInA(const btVector3& pivotInA)
+    {
+        m_pivotInA = pivotInA;
+    }
+	const btVector3& getPivotInB() const
+	{
+		return m_pivotInB;
+	}
+	void setPivotInB(const btVector3& pivotInB)
+	{
+		m_pivotInB = pivotInB;
+	}
+    const btMatrix3x3& getFrameInA() const
+    {
+        return m_frameInA;
+    }
+    void setFrameInA(const btMatrix3x3& frameInA)
+    {
+        m_frameInA = frameInA;
+    }
+    const btMatrix3x3& getFrameInB() const
+    {
+        return m_frameInB;
+    }
+    void setFrameInB(const btMatrix3x3& frameInB)
+    {
+        m_frameInB = frameInB;
+    }
+	virtual void debugDraw(class btIDebugDraw* drawer);	// draws a small frame at each pivot that exists
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointFeedback.h
similarity index 75%
rename from src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h
rename to src/bullet/BulletDynamics/Featherstone/btMultiBodyJointFeedback.h
index 8b89de03..5c2fa8ed 100644
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointFeedback.h
@@ -1,6 +1,5 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+Copyright (c) 2015 Google Inc.
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
@@ -15,5 +14,14 @@ subject to the following restrictions:
+#include "LinearMath/btSpatialAlgebra.h"
+struct btMultiBodyJointFeedback
+	btSpatialForceVector	m_reactionForces;
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.cpp b/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.cpp
new file mode 100644
index 00000000..70781767
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.cpp
@@ -0,0 +1,199 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///This file was written by Erwin Coumans
+#include "btMultiBodyJointLimitConstraint.h"
+#include "btMultiBody.h"
+#include "btMultiBodyLinkCollider.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+btMultiBodyJointLimitConstraint::btMultiBodyJointLimitConstraint(btMultiBody* body, int link, btScalar lower, btScalar upper)	// Joint limit on 'link' of 'body' with a lower and an upper bound on the joint position.
+	//:btMultiBodyConstraint(body,0,link,-1,2,true),
+	:btMultiBodyConstraint(body,body,link,body->getLink(link).m_parent,2,true),	// both sides are the same multibody: the link itself and its parent link; 2 = one row per bound
+	m_lowerBound(lower),
+	m_upperBound(upper)
+void btMultiBodyJointLimitConstraint::finalizeMultiDof()	// Allocates the jacobians and writes the constant entries of the two limit rows.
+	// the data.m_jacobians never change, so may as well
+    // initialize them here
+	allocateJacobiansMultiDof();
+	unsigned int offset = 6 + m_bodyA->getLink(m_linkA).m_dofOffset;	// column of this link's first dof; presumably the 6 skips the base's own dofs - TODO confirm
+	// row 0: the lower bound
+	jacobianA(0)[offset] = 1;
+	// row 1: the upper bound
+	//jacobianA(1)[offset] = -1;
+	jacobianB(1)[offset] = -1;	// the upper-bound row is written to side B's jacobian rather than side A's (see the disabled line above)
+	m_numDofsFinalized = m_jacSizeBoth;	// createConstraintRows() compares these two to decide whether finalizing is still needed
+int btMultiBodyJointLimitConstraint::getIslandIdA() const
+	if(m_bodyA)
+	{
+		btMultiBodyLinkCollider* col = m_bodyA->getBaseCollider();
+		if (col)
+			return col->getIslandTag();
+		for (int i=0;i<m_bodyA->getNumLinks();i++)
+		{
+			if (m_bodyA->getLink(i).m_collider)
+				return m_bodyA->getLink(i).m_collider->getIslandTag();
+		}
+	}
+	return -1;
+int btMultiBodyJointLimitConstraint::getIslandIdB() const
+	if(m_bodyB)
+	{
+		btMultiBodyLinkCollider* col = m_bodyB->getBaseCollider();
+		if (col)
+			return col->getIslandTag();
+		for (int i=0;i<m_bodyB->getNumLinks();i++)
+		{
+			col = m_bodyB->getLink(i).m_collider;
+			if (col)
+				return col->getIslandTag();
+		}
+	}
+	return -1;
+void btMultiBodyJointLimitConstraint::createConstraintRows(btMultiBodyConstraintArray& constraintRows,	// Appends one solver row per bound (row 0 = lower, row 1 = upper) and fills in its right-hand side.
+		btMultiBodyJacobianData& data,
+		const btContactSolverInfo& infoGlobal)
+    // only positions need to be updated -- data.m_jacobians and force
+    // directions were set in the ctor and never change.
+	if (m_numDofsFinalized != m_jacSizeBoth)	// finalize lazily if it has not been done yet
+	{
+        finalizeMultiDof();
+	}
+    // row 0: the lower bound
+    setPosition(0, m_bodyA->getJointPos(m_linkA) - m_lowerBound);			//multidof: this is joint-type dependent
+    // row 1: the upper bound
+    setPosition(1, m_upperBound - m_bodyA->getJointPos(m_linkA));	// both rows store the remaining distance to their bound; negative once the bound is exceeded
+	for (int row=0;row<getNumRows();row++)
+	{
+		btScalar direction = row? -1 : 1;	// +1 for the lower-bound row, -1 for the upper-bound row
+		btMultiBodySolverConstraint& constraintRow = constraintRows.expandNonInitializing();
+        constraintRow.m_orgConstraint = this;
+        constraintRow.m_orgDofIndex = row;
+		constraintRow.m_multiBodyA = m_bodyA;
+		constraintRow.m_multiBodyB = m_bodyB;
+		const btScalar posError = 0;						//why assume it's zero?
+		const btVector3 dummy(0, 0, 0);
+		btScalar rel_vel = fillMultiBodyConstraint(constraintRow,data,jacobianA(row),jacobianB(row),dummy,dummy,dummy,dummy,posError,infoGlobal,0,m_maxAppliedImpulse);	// impulse range is [0, m_maxAppliedImpulse]; the returned value is used as the relative velocity below
+		{
+			//expect either prismatic or revolute joint type for now
+			btAssert((m_bodyA->getLink(m_linkA).m_jointType == btMultibodyLink::eRevolute)||(m_bodyA->getLink(m_linkA).m_jointType == btMultibodyLink::ePrismatic));
+			switch (m_bodyA->getLink(m_linkA).m_jointType)
+			{
+				case btMultibodyLink::eRevolute:
+				{
+					constraintRow.m_contactNormal1.setZero();
+					constraintRow.m_contactNormal2.setZero();
+					btVector3 revoluteAxisInWorld = direction*quatRotate(m_bodyA->getLink(m_linkA).m_cachedWorldTransform.getRotation(),m_bodyA->getLink(m_linkA).m_axes[0].m_topVec);
+					constraintRow.m_relpos1CrossNormal=revoluteAxisInWorld;	// revolute: only the angular parts of the row are set
+					constraintRow.m_relpos2CrossNormal=-revoluteAxisInWorld;
+					break;
+				}
+				case btMultibodyLink::ePrismatic:
+				{
+					btVector3 prismaticAxisInWorld = direction* quatRotate(m_bodyA->getLink(m_linkA).m_cachedWorldTransform.getRotation(),m_bodyA->getLink(m_linkA).m_axes[0].m_bottomVec);
+					constraintRow.m_contactNormal1=prismaticAxisInWorld;	// prismatic: only the linear parts of the row are set
+					constraintRow.m_contactNormal2=-prismaticAxisInWorld;
+					constraintRow.m_relpos1CrossNormal.setZero();
+					constraintRow.m_relpos2CrossNormal.setZero();
+					break;
+				}
+				default:
+				{
+					btAssert(0);
+				}
+			};
+		}
+		{
+			btScalar penetration = getPosition(row);	// the distance stored by setPosition() above for this row
+			btScalar positionalError = 0.f;
+			btScalar	velocityError =  - rel_vel;// * damping;
+			btScalar erp = infoGlobal.m_erp2;
+			if (!infoGlobal.m_splitImpulse || (penetration > infoGlobal.m_splitImpulsePenetrationThreshold))
+			{
+				erp = infoGlobal.m_erp;
+			}
+			if (penetration>0)
+			{
+				positionalError = 0;
+				velocityError = -penetration / infoGlobal.m_timeStep;	// bound not reached yet: replaces the -rel_vel value set above
+			} else
+			{
+				positionalError = -penetration * erp/infoGlobal.m_timeStep;	// bound reached or exceeded: correction scaled by erp
+			}
+			btScalar  penetrationImpulse = positionalError*constraintRow.m_jacDiagABInv;
+			btScalar velocityImpulse = velocityError *constraintRow.m_jacDiagABInv;
+			if (!infoGlobal.m_splitImpulse || (penetration > infoGlobal.m_splitImpulsePenetrationThreshold))
+			{
+				//combine position and velocity into rhs
+				constraintRow.m_rhs = penetrationImpulse+velocityImpulse;
+				constraintRow.m_rhsPenetration = 0.f;
+			} else
+			{
+				//split position and velocity into rhs and m_rhsPenetration
+				constraintRow.m_rhs = velocityImpulse;
+				constraintRow.m_rhsPenetration = penetrationImpulse;
+			}
+		}
+	}
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.h
new file mode 100644
index 00000000..55b8d122
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.h
@@ -0,0 +1,50 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "btMultiBodyConstraint.h"
+struct btSolverInfo;
+class btMultiBodyJointLimitConstraint : public btMultiBodyConstraint	// Two-row constraint keeping one joint's position between a lower and an upper bound.
+	btScalar	m_lowerBound;	// compared against getJointPos() for row 0
+	btScalar	m_upperBound;	// compared against getJointPos() for row 1
+	btMultiBodyJointLimitConstraint(btMultiBody* body, int link, btScalar lower, btScalar upper);
+	virtual ~btMultiBodyJointLimitConstraint();
+	virtual void finalizeMultiDof();	// allocates the jacobians and writes their constant entries
+	virtual int getIslandIdA() const;	// island tag from multibody A's colliders, or -1
+	virtual int getIslandIdB() const;	// island tag from multibody B's colliders, or -1
+	virtual void createConstraintRows(btMultiBodyConstraintArray& constraintRows,
+		btMultiBodyJacobianData& data,
+		const btContactSolverInfo& infoGlobal);
+	virtual void debugDraw(class btIDebugDraw* drawer)	// intentionally empty for now: joint limits draw nothing
+	{
+		//todo(erwincoumans)
+	}
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp b/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp
new file mode 100644
index 00000000..a055e725
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp
@@ -0,0 +1,170 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///This file was written by Erwin Coumans
+#include "btMultiBodyJointMotor.h"
+#include "btMultiBody.h"
+#include "btMultiBodyLinkCollider.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+btMultiBodyJointMotor::btMultiBodyJointMotor(btMultiBody* body, int link, btScalar desiredVelocity, btScalar maxMotorImpulse) // motor on `link` of `body`, driving toward desiredVelocity
+	:btMultiBodyConstraint(body,body,link,body->getLink(link).m_parent,1,true), // presumably (bodyA, bodyB, linkA, linkB, numRows, isUnilateral) — confirm against btMultiBodyConstraint
+	m_desiredVelocity(desiredVelocity),
+	m_desiredPosition(0),
+	m_kd(1.), // gain applied to the velocity error in createConstraintRows()
+	m_kp(0) // gain applied to the position term; 0 means the position target contributes nothing
+	m_maxAppliedImpulse = maxMotorImpulse;
+	// The jacobian entries are not filled in here; that is done by
+	// finalizeMultiDof(), which createConstraintRows() calls on demand.
+void btMultiBodyJointMotor::finalizeMultiDof() // allocates the jacobian storage and marks the driven dof in row 0
+	allocateJacobiansMultiDof();
+	// note: we rely on the fact that data.m_jacobians are
+	// always initialized to zero by the Constraint ctor
+	int linkDoF = 0; // NOTE(review): hard-coded; the linkDoF given to the 5-argument ctor is not used here — confirm intended
+	unsigned int offset = 6 + (m_bodyA->getLink(m_linkA).m_dofOffset + linkDoF); // presumably the 6 skips the base dofs — TODO confirm
+	// row 0 (the motor row): set the jacobian entry of the
+	// driven dof to 1; every other entry is left as allocated
+	jacobianA(0)[offset] = 1;
+	m_numDofsFinalized = m_jacSizeBoth; // createConstraintRows() compares these two to decide whether to finalize again
+btMultiBodyJointMotor::btMultiBodyJointMotor(btMultiBody* body, int link, int linkDoF, btScalar desiredVelocity, btScalar maxMotorImpulse) // same set-up as the 4-argument ctor, plus a dof index that is only range-checked
+	//:btMultiBodyConstraint(body,0,link,-1,1,true),
+	:btMultiBodyConstraint(body,body,link,body->getLink(link).m_parent,1,true),
+	m_desiredVelocity(desiredVelocity),
+	m_desiredPosition(0),
+	m_kd(1.),
+	m_kp(0)
+	btAssert(linkDoF < body->getLink(link).m_dofCount); // NOTE(review): linkDoF is validated but not stored; finalizeMultiDof() always uses dof 0 — confirm intended
+	m_maxAppliedImpulse = maxMotorImpulse;
+int btMultiBodyJointMotor::getIslandIdA() const // island tag of the first collider found on body A, or -1 when it has none
+	btMultiBodyLinkCollider* col = m_bodyA->getBaseCollider();
+	if (col) // the base collider is preferred when present
+		return col->getIslandTag();
+	for (int i=0;i<m_bodyA->getNumLinks();i++) // otherwise scan the links in index order
+	{
+		if (m_bodyA->getLink(i).m_collider)
+			return m_bodyA->getLink(i).m_collider->getIslandTag();
+	}
+	return -1; // no collider on the base or on any link
+int btMultiBodyJointMotor::getIslandIdB() const // island tag of the first collider found on body B, or -1 when it has none
+	btMultiBodyLinkCollider* col = m_bodyB->getBaseCollider();
+	if (col) // the base collider is preferred when present
+		return col->getIslandTag();
+	for (int i=0;i<m_bodyB->getNumLinks();i++) // otherwise scan the links in index order
+	{
+		col = m_bodyB->getLink(i).m_collider;
+		if (col)
+			return col->getIslandTag();
+	}
+	return -1; // no collider on the base or on any link
+void btMultiBodyJointMotor::createConstraintRows(btMultiBodyConstraintArray& constraintRows,
+		btMultiBodyJacobianData& data,
+		const btContactSolverInfo& infoGlobal)
+    // Appends one solver row per constraint row. The jacobian is built lazily by
+    // finalizeMultiDof(); the right-hand side and the joint axis are recomputed on every call.
+	if (m_numDofsFinalized != m_jacSizeBoth)
+	{
+        finalizeMultiDof();
+	}
+	//don't crash: bail out without adding rows if finalizing did not bring the sizes into agreement
+	if (m_numDofsFinalized != m_jacSizeBoth)
+		return;
+	const btScalar posError = 0; // the position term is folded into rhs below instead
+	const btVector3 dummy(0, 0, 0);
+	for (int row=0;row<getNumRows();row++)
+	{
+		btMultiBodySolverConstraint& constraintRow = constraintRows.expandNonInitializing();
+        int dof = 0; // NOTE(review): always the first dof of the link — confirm for multi-dof joints
+        btScalar currentPosition = m_bodyA->getJointPosMultiDof(m_linkA)[dof];
+        btScalar currentVelocity = m_bodyA->getJointVelMultiDof(m_linkA)[dof];
+        btScalar positionStabiliationTerm = (m_desiredPosition-currentPosition)/infoGlobal.m_timeStep; // velocity that would remove the position error in one time step
+        btScalar velocityError = (m_desiredVelocity - currentVelocity);
+        btScalar rhs =   m_kp * positionStabiliationTerm + currentVelocity+m_kd * velocityError; // with m_kp == 0 and m_kd == 1 this reduces to m_desiredVelocity
+		fillMultiBodyConstraint(constraintRow,data,jacobianA(row),jacobianB(row),dummy,dummy,dummy,dummy,posError,infoGlobal,-m_maxAppliedImpulse,m_maxAppliedImpulse,false,1,false,rhs);
+		constraintRow.m_orgConstraint = this;
+		constraintRow.m_orgDofIndex = row;
+		{
+			//expect either prismatic or revolute joint type for now
+			btAssert((m_bodyA->getLink(m_linkA).m_jointType == btMultibodyLink::eRevolute)||(m_bodyA->getLink(m_linkA).m_jointType == btMultibodyLink::ePrismatic));
+			switch (m_bodyA->getLink(m_linkA).m_jointType)
+			{
+				case btMultibodyLink::eRevolute:
+				{
+					// revolute: the world-space rotation axis goes into the angular slots, linear normals are zeroed
+					constraintRow.m_contactNormal1.setZero();
+					constraintRow.m_contactNormal2.setZero();
+					btVector3 revoluteAxisInWorld = quatRotate(m_bodyA->getLink(m_linkA).m_cachedWorldTransform.getRotation(),m_bodyA->getLink(m_linkA).m_axes[0].m_topVec);
+					constraintRow.m_relpos1CrossNormal=revoluteAxisInWorld;
+					constraintRow.m_relpos2CrossNormal=-revoluteAxisInWorld;
+					break;
+				}
+				case btMultibodyLink::ePrismatic:
+				{
+					// prismatic: the world-space slide axis goes into the linear normals, angular slots are zeroed
+					btVector3 prismaticAxisInWorld = quatRotate(m_bodyA->getLink(m_linkA).m_cachedWorldTransform.getRotation(),m_bodyA->getLink(m_linkA).m_axes[0].m_bottomVec);
+					constraintRow.m_contactNormal1=prismaticAxisInWorld;
+					constraintRow.m_contactNormal2=-prismaticAxisInWorld;
+					constraintRow.m_relpos1CrossNormal.setZero();
+					constraintRow.m_relpos2CrossNormal.setZero();
+					break;
+				}
+				default:
+				{
+					btAssert(0); // any other joint type is unsupported; the row's axis vectors are not set here
+				}
+			};
+		}
+	}
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.h
new file mode 100644
index 00000000..96b5c440
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.h
@@ -0,0 +1,68 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///This file was written by Erwin Coumans
+#include "btMultiBodyConstraint.h"
+struct btSolverInfo;
+class btMultiBodyJointMotor : public btMultiBodyConstraint
+	btScalar	m_desiredVelocity;
+	btScalar	m_desiredPosition;
+	btScalar    m_kd;
+	btScalar    m_kp;
+	btMultiBodyJointMotor(btMultiBody* body, int link, btScalar desiredVelocity, btScalar maxMotorImpulse);
+	btMultiBodyJointMotor(btMultiBody* body, int link, int linkDoF, btScalar desiredVelocity, btScalar maxMotorImpulse);
+	virtual ~btMultiBodyJointMotor();
+    virtual void finalizeMultiDof();
+	virtual int getIslandIdA() const;
+	virtual int getIslandIdB() const;
+	virtual void createConstraintRows(btMultiBodyConstraintArray& constraintRows,
+		btMultiBodyJacobianData& data,
+		const btContactSolverInfo& infoGlobal);
+    virtual void setVelocityTarget(btScalar velTarget, btScalar kd = 1.f)
+    {
+        // Record the velocity gain together with the velocity the motor should reach.
+        this->m_kd = kd;
+        this->m_desiredVelocity = velTarget;
+    }
+    virtual void setPositionTarget(btScalar posTarget, btScalar kp = 1.f)
+    {
+        // Record the position gain together with the joint position the motor should reach.
+        this->m_kp = kp;
+        this->m_desiredPosition = posTarget;
+    }
+	virtual void debugDraw(class btIDebugDraw* drawer)
+	{
+		// Intentionally empty for now: the motor emits no debug geometry. todo(erwincoumans)
+	}
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyLink.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodyLink.h
new file mode 100644
index 00000000..a2596111
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyLink.h
@@ -0,0 +1,226 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "LinearMath/btQuaternion.h"
+#include "LinearMath/btVector3.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+enum	btMultiBodyLinkFlags
+//both defines are now permanently enabled
+// Various spatial helper functions
+//namespace {
+#include "LinearMath/btSpatialAlgebra.h"
+// Link struct
+struct btMultibodyLink 
+    btScalar m_mass;         // mass of link
+    btVector3 m_inertiaLocal;   // inertia of link (local frame; diagonal)
+    int m_parent;         // index of the parent link (assumed to be < index of this link), or -1 if parent is the base link.
+    btQuaternion m_zeroRotParentToThis;    // rotates vectors in parent-frame to vectors in local-frame (when q=0). constant.
+    btVector3 m_dVector;   // vector from the inboard joint pos to this link's COM. (local frame.) constant.
+	//this is set to zero for planar joint (see also m_eVector comment)
+    // m_eVector is constant, but depends on the joint type:
+    // revolute, fixed, prismatic, spherical: vector from parent's COM to the pivot point, in PARENT's frame.
+	// planar: vector from COM of parent to COM of this link, WHEN Q = 0. (local frame.)
+	// todo: fix the planar so it is consistent with the other joints
+    btVector3 m_eVector;
+	btSpatialMotionVector m_absFrameTotVelocity, m_absFrameLocVelocity;
+	enum eFeatherstoneJointType // joint kinds distinguished by updateCacheMultiDof()
+	{
+		eRevolute = 0,
+		ePrismatic = 1,
+		eSpherical = 2,
+		ePlanar = 3,
+		eFixed = 4,
+		eInvalid // value assigned by the btMultibodyLink ctor; updateCacheMultiDof() asserts on it
+	};
+	// "axis" = spatial joint axis (Mirtich Defn 9 p104). (expressed in local frame.) constant.
+    // for prismatic: m_axesTop[0] = zero;
+    //                m_axesBottom[0] = unit vector along the joint axis.
+    // for revolute: m_axesTop[0] = unit vector along the rotation axis (u);
+    //               m_axesBottom[0] = u cross m_dVector (i.e. COM linear motion due to the rotation at the joint)
+	//
+	// for spherical: m_axesTop[0][1][2] (u1,u2,u3) form a 3x3 identity matrix (3 rotation axes)
+	//				  m_axesBottom[0][1][2] cross u1,u2,u3 (i.e. COM linear motion due to the rotation at the joint)
+	//
+	// for planar: m_axesTop[0] = unit vector along the rotation axis (u); defines the plane of motion
+	//			   m_axesTop[1][2] = zero
+	//			   m_axesBottom[0] = zero
+	//			   m_axesBottom[1][2] = unit vectors along the translational axes on that plane		
+	btSpatialMotionVector m_axes[6];
+	void setAxisTop(int dof, const btVector3 &axis) { m_axes[dof].m_topVec = axis; }
+	void setAxisBottom(int dof, const btVector3 &axis) { m_axes[dof].m_bottomVec = axis; }
+	void setAxisTop(int dof, const btScalar &x, const btScalar &y, const btScalar &z) { m_axes[dof].m_topVec.setValue(x, y, z); }
+	void setAxisBottom(int dof, const btScalar &x, const btScalar &y, const btScalar &z) { m_axes[dof].m_bottomVec.setValue(x, y, z); }
+	const btVector3 & getAxisTop(int dof) const { return m_axes[dof].m_topVec; }
+	const btVector3 & getAxisBottom(int dof) const { return m_axes[dof].m_bottomVec; }
+	int m_dofOffset, m_cfgOffset;
+    btQuaternion m_cachedRotParentToThis;   // rotates vectors in parent frame to vectors in local frame
+    btVector3 m_cachedRVector;                // vector from COM of parent to COM of this link, in local frame.
+    btVector3 m_appliedForce;    // In WORLD frame
+    btVector3 m_appliedTorque;   // In WORLD frame
+btVector3 m_appliedConstraintForce;    // In WORLD frame
+    btVector3 m_appliedConstraintTorque;   // In WORLD frame
+	btScalar m_jointPos[7];
+    //m_jointTorque is the joint torque applied by the user using 'addJointTorque'.
+    //It gets set to zero after each internal stepSimulation call
+	btScalar m_jointTorque[6];
+	class btMultiBodyLinkCollider* m_collider;
+	int m_flags;
+	int m_dofCount, m_posVarCount;				//redundant but handy
+	eFeatherstoneJointType m_jointType;
+	struct btMultiBodyJointFeedback*	m_jointFeedback;
+	btTransform	m_cachedWorldTransform;//this cache is updated when calling btMultiBody::forwardKinematics
+	const char* m_linkName;//m_linkName memory needs to be managed by the developer/user!
+	const char* m_jointName;//m_jointName memory needs to be managed by the developer/user!
+    const void* m_userPtr;//m_userPtr ptr needs to be managed by the developer/user!
+	btScalar m_jointDamping; //todo: implement this internally. It is unused for now, it is set by a URDF loader. User can apply manual damping.
+	btScalar m_jointFriction; //todo: implement this internally. It is unused for now, it is set by a URDF loader. User can apply manual friction using a velocity motor.
+	// ctor: set some sensible defaults.
+	// The link starts as a unit-mass body with no parent (-1), identity rotations, an
+	// invalid joint type and no collider/feedback/names. All force and torque accumulators,
+	// including the constraint ones, and the dof/cfg offsets are zeroed so that no member
+	// read later holds an indeterminate value.
+	btMultibodyLink()
+		: 	m_mass(1),
+			m_parent(-1),
+			m_zeroRotParentToThis(0, 0, 0, 1),
+			m_dofOffset(0),
+			m_cfgOffset(0),
+			m_cachedRotParentToThis(0, 0, 0, 1),
+			m_collider(0),
+			m_flags(0),
+			m_dofCount(0),
+			m_posVarCount(0),
+			m_jointType(btMultibodyLink::eInvalid),
+			m_jointFeedback(0),
+			m_linkName(0),
+			m_jointName(0),
+            m_userPtr(0),
+			m_jointDamping(0),
+			m_jointFriction(0)
+	{
+		m_inertiaLocal.setValue(1, 1, 1);
+		// only axis 0 is given a default: zero top vector, unit-x bottom vector
+		setAxisTop(0, 0., 0., 0.);
+		setAxisBottom(0, 1., 0., 0.);
+		m_dVector.setValue(0, 0, 0);
+		m_eVector.setValue(0, 0, 0);
+		m_cachedRVector.setValue(0, 0, 0);
+		m_appliedForce.setValue( 0, 0, 0);
+		m_appliedTorque.setValue(0, 0, 0);
+		// previously left uninitialized
+		m_appliedConstraintForce.setValue(0, 0, 0);
+		m_appliedConstraintTorque.setValue(0, 0, 0);
+		//
+		m_jointPos[0] = m_jointPos[1] = m_jointPos[2] = m_jointPos[4] = m_jointPos[5] = m_jointPos[6] = 0.f;
+		m_jointPos[3] = 1.f;			//"quat.w"
+		m_jointTorque[0] = m_jointTorque[1] = m_jointTorque[2] = m_jointTorque[3] = m_jointTorque[4] = m_jointTorque[5] = 0.f;
+		m_cachedWorldTransform.setIdentity();
+	}
+    // Refreshes m_cachedRotParentToThis and m_cachedRVector from the joint coordinates in pq, or from m_jointPos when pq is null.
+	void updateCacheMultiDof(btScalar *pq = 0)
+	{
+		btScalar *pJointPos = (pq ? pq : &m_jointPos[0]); // caller-supplied coordinates take precedence over the stored ones
+		switch(m_jointType)
+		{
+			case eRevolute:
+			{
+				m_cachedRotParentToThis = btQuaternion(getAxisTop(0),-pJointPos[0]) * m_zeroRotParentToThis; // rotation by -q about the top axis, composed with the q=0 rotation
+				m_cachedRVector = m_dVector + quatRotate(m_cachedRotParentToThis,m_eVector);
+				break;
+			}
+			case ePrismatic:
+			{
+				// m_cachedRotParentToThis never changes, so no need to update
+				m_cachedRVector = m_dVector + quatRotate(m_cachedRotParentToThis,m_eVector) + pJointPos[0] * getAxisBottom(0); // slide by q along the bottom axis
+				break;
+			}
+			case eSpherical:
+			{
+				m_cachedRotParentToThis = btQuaternion(pJointPos[0], pJointPos[1], pJointPos[2], -pJointPos[3]) * m_zeroRotParentToThis; // coordinates 0..3 are read as a quaternion (x,y,z,w) with w negated
+				m_cachedRVector = m_dVector + quatRotate(m_cachedRotParentToThis,m_eVector);
+				break;
+			}
+			case ePlanar:
+			{
+				m_cachedRotParentToThis = btQuaternion(getAxisTop(0),-pJointPos[0]) * m_zeroRotParentToThis;				// coordinate 0 is the rotation; coordinates 1 and 2 are translations along bottom axes 1 and 2; m_dVector is not added here
+				m_cachedRVector = quatRotate(btQuaternion(getAxisTop(0),-pJointPos[0]), pJointPos[1] * getAxisBottom(1) + pJointPos[2] * getAxisBottom(2)) + quatRotate(m_cachedRotParentToThis,m_eVector);				
+				break;
+			}
+			case eFixed:
+			{
+				m_cachedRotParentToThis = m_zeroRotParentToThis; // no joint coordinates are read
+				m_cachedRVector = m_dVector + quatRotate(m_cachedRotParentToThis,m_eVector);
+				break;
+			}
+			default:
+			{
+				//invalid type: the caches are left unchanged
+				btAssert(0);
+			}
+		}
+	}
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyLinkCollider.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodyLinkCollider.h
new file mode 100644
index 00000000..5080ea87
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyLinkCollider.h
@@ -0,0 +1,92 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "btMultiBody.h"
+class btMultiBodyLinkCollider : public btCollisionObject
+	btMultiBody* m_multiBody;
+	int m_link;
+	btMultiBodyLinkCollider (btMultiBody* multiBody,int link) // collider for `link` of `multiBody`; checkCollideWithOverride() treats a negative link as having no parent-collision flags to check
+		:m_multiBody(multiBody),
+		m_link(link)
+	{
+		m_checkCollideWith =  true; // presumably makes the collision pipeline consult checkCollideWithOverride() — confirm in btCollisionObject
+		//we need to remove the 'CF_STATIC_OBJECT' flag, otherwise links/base doesn't merge islands
+		//this means that some constraints might point to bodies that are not in the islands, causing crashes
+		//if (link>=0 || (multiBody && !multiBody->hasFixedBase()))
+		{
+			m_collisionFlags &= (~btCollisionObject::CF_STATIC_OBJECT); // cleared unconditionally; the condition above is disabled
+		}
+		// else
+		//{
+		//	m_collisionFlags |= (btCollisionObject::CF_STATIC_OBJECT);
+		//}
+		m_internalType = CO_FEATHERSTONE_LINK; // the tag that upcast() tests for
+	}
+	static btMultiBodyLinkCollider* upcast(btCollisionObject* colObj)
+	{
+		// Only objects tagged as Featherstone links may be viewed as link colliders; anything else yields null.
+		const bool isFeatherstoneLink = (colObj->getInternalType()&btCollisionObject::CO_FEATHERSTONE_LINK) != 0;
+		return isFeatherstoneLink ? static_cast<btMultiBodyLinkCollider*>(colObj) : 0;
+	}
+	static const btMultiBodyLinkCollider* upcast(const btCollisionObject* colObj)
+	{
+		// Const overload: returns colObj viewed as a link collider when it carries the
+		// CO_FEATHERSTONE_LINK tag, otherwise null. The cast keeps the const qualifier
+		// instead of silently dropping it as the former C-style cast did.
+		if (colObj->getInternalType()&btCollisionObject::CO_FEATHERSTONE_LINK)
+			return static_cast<const btMultiBodyLinkCollider*>(colObj);
+		return 0;
+	}
+	virtual bool checkCollideWithOverride(const  btCollisionObject* co) const
+	{
+		// Objects that are not link colliders, or that belong to another multibody, always collide.
+		const btMultiBodyLinkCollider* const otherCollider = btMultiBodyLinkCollider::upcast(co);
+		if (!otherCollider || otherCollider->m_multiBody != m_multiBody)
+			return true;
+
+		// Same multibody from here on: self collision must be enabled at all.
+		if (!m_multiBody->hasSelfCollision())
+			return false;
+
+		// This link may have collision with its parent switched off.
+		if (m_link >= 0)
+		{
+			const btMultibodyLink& ownLink = m_multiBody->getLink(m_link);
+			const bool skipsParent = (ownLink.m_flags & BT_MULTIBODYLINKFLAGS_DISABLE_PARENT_COLLISION) != 0;
+			if (skipsParent && ownLink.m_parent == otherCollider->m_link)
+				return false;
+		}
+
+		// The same check from the other collider's point of view.
+		if (otherCollider->m_link >= 0)
+		{
+			const btMultibodyLink& theirLink = otherCollider->m_multiBody->getLink(otherCollider->m_link);
+			const bool skipsParent = (theirLink.m_flags & BT_MULTIBODYLINKFLAGS_DISABLE_PARENT_COLLISION) != 0;
+			if (skipsParent && theirLink.m_parent == m_link)
+				return false;
+		}
+
+		return true;
+	}
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyPoint2Point.cpp b/src/bullet/BulletDynamics/Featherstone/btMultiBodyPoint2Point.cpp
new file mode 100644
index 00000000..125d52ad
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyPoint2Point.cpp
@@ -0,0 +1,221 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///This file was written by Erwin Coumans
+#include "btMultiBodyPoint2Point.h"
+#include "btMultiBodyLinkCollider.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+#include "LinearMath/btIDebugDraw.h"
+btMultiBodyPoint2Point::btMultiBodyPoint2Point(btMultiBody* body, int link, btRigidBody* bodyB, const btVector3& pivotInA, const btVector3& pivotInB) // multibody link versus rigid body
+	:btMultiBodyConstraint(body,0,link,-1,BTMBP2PCONSTRAINT_DIM,false), // no second multibody: 0 and -1 are passed for it
+	m_rigidBodyA(0),
+	m_rigidBodyB(bodyB),
+	m_pivotInA(pivotInA),
+	m_pivotInB(pivotInB)
+    m_data.resize(BTMBP2PCONSTRAINT_DIM);//at least store the applied impulses
+btMultiBodyPoint2Point::btMultiBodyPoint2Point(btMultiBody* bodyA, int linkA, btMultiBody* bodyB, int linkB, const btVector3& pivotInA, const btVector3& pivotInB) // multibody link versus multibody link
+	:btMultiBodyConstraint(bodyA,bodyB,linkA,linkB,BTMBP2PCONSTRAINT_DIM,false),
+	m_rigidBodyA(0), // neither side is a rigid body in this overload
+	m_rigidBodyB(0),
+	m_pivotInA(pivotInA),
+	m_pivotInB(pivotInB)
+    m_data.resize(BTMBP2PCONSTRAINT_DIM);//at least store the applied impulses
+void btMultiBodyPoint2Point::finalizeMultiDof()
+	//not implemented yet: asserts if it is ever called
+	btAssert(0);
+int btMultiBodyPoint2Point::getIslandIdA() const // island tag for side A: rigid body first, then multibody colliders, else -1
+	if (m_rigidBodyA)
+		return m_rigidBodyA->getIslandTag();
+	if (m_bodyA)
+	{
+		btMultiBodyLinkCollider* col = m_bodyA->getBaseCollider();
+		if (col) // the base collider is preferred when present
+			return col->getIslandTag();
+		for (int i=0;i<m_bodyA->getNumLinks();i++) // otherwise the first link that has a collider
+		{
+			if (m_bodyA->getLink(i).m_collider)
+				return m_bodyA->getLink(i).m_collider->getIslandTag();
+		}
+	}
+	return -1; // nothing on side A carries an island tag
+int btMultiBodyPoint2Point::getIslandIdB() const // island tag for side B: rigid body first, then multibody colliders, else -1
+	if (m_rigidBodyB)
+		return m_rigidBodyB->getIslandTag();
+	if (m_bodyB)
+	{
+		btMultiBodyLinkCollider* col = m_bodyB->getBaseCollider();
+		if (col) // the base collider is preferred when present
+			return col->getIslandTag();
+		for (int i=0;i<m_bodyB->getNumLinks();i++) // otherwise the first link that has a collider
+		{
+			col = m_bodyB->getLink(i).m_collider;
+			if (col)
+				return col->getIslandTag();
+		}
+	}
+	return -1; // nothing on side B carries an island tag
+void btMultiBodyPoint2Point::createConstraintRows(btMultiBodyConstraintArray& constraintRows,
+		btMultiBodyJacobianData& data,
+		const btContactSolverInfo& infoGlobal)
+//	int i=1;
+	for (int i=0;i<numDim;i++) // NOTE(review): numDim has no visible declaration in this view — confirm it is defined just above
+	{
+		btMultiBodySolverConstraint& constraintRow = constraintRows.expandNonInitializing();
+        //memset(&constraintRow,0xffffffff,sizeof(btMultiBodySolverConstraint));
+	constraintRow.m_orgConstraint = this;
+	constraintRow.m_orgDofIndex = i;
+        constraintRow.m_relpos1CrossNormal.setValue(0,0,0);
+        constraintRow.m_contactNormal1.setValue(0,0,0);
+        constraintRow.m_relpos2CrossNormal.setValue(0,0,0);
+        constraintRow.m_contactNormal2.setValue(0,0,0);
+        constraintRow.m_angularComponentA.setValue(0,0,0);
+        constraintRow.m_angularComponentB.setValue(0,0,0);
+		constraintRow.m_solverBodyIdA = data.m_fixedBodyId; // both sides default to the fixed body; overridden below for rigid bodies
+		constraintRow.m_solverBodyIdB = data.m_fixedBodyId;
+		btVector3 contactNormalOnB(0,0,0);
+		contactNormalOnB[i] = -1; // NOTE(review): this line and the next look like the two arms of a preprocessor conditional not visible here — confirm
+		contactNormalOnB[i%3] = -1;
+		 // Convert the local pivots to world space
+		btVector3 pivotAworld = m_pivotInA;
+		if (m_rigidBodyA)
+		{
+			constraintRow.m_solverBodyIdA = m_rigidBodyA->getCompanionId();
+			pivotAworld = m_rigidBodyA->getCenterOfMassTransform()*m_pivotInA;
+		} else
+		{
+			if (m_bodyA)
+				pivotAworld = m_bodyA->localPosToWorld(m_linkA, m_pivotInA);
+		}
+		btVector3 pivotBworld = m_pivotInB;
+		if (m_rigidBodyB)
+		{
+			constraintRow.m_solverBodyIdB = m_rigidBodyB->getCompanionId();
+			pivotBworld = m_rigidBodyB->getCenterOfMassTransform()*m_pivotInB;
+		} else
+		{
+			if (m_bodyB)
+				pivotBworld = m_bodyB->localPosToWorld(m_linkB, m_pivotInB);
+		}
+		btScalar posError = i < 3 ? (pivotAworld-pivotBworld).dot(contactNormalOnB) : 0; // pivot separation along this row's axis; rows 3 and up carry no position error
+		fillMultiBodyConstraint(constraintRow, data, 0, 0, btVector3(0,0,0),
+															contactNormalOnB, pivotAworld, pivotBworld,						//sucks but let it be this way "for the time being"
+															posError,
+															infoGlobal,
+															-m_maxAppliedImpulse, m_maxAppliedImpulse
+															);
+    //@todo: support the case of btMultiBody versus btRigidBody,
+    //see btPoint2PointConstraint::getInfo2NonVirtual
+		const btVector3 dummy(0, 0, 0);
+		btAssert(m_bodyA->isMultiDof());
+		btScalar* jac1 = jacobianA(i);
+		const btVector3 &normalAng = i >= 3 ? contactNormalOnB : dummy; // rows 0..2 are linear, rows 3 and up angular
+		const btVector3 &normalLin = i < 3 ? contactNormalOnB : dummy;
+		m_bodyA->filConstraintJacobianMultiDof(m_linkA, pivotAworld, normalAng, normalLin, jac1, data.scratch_r, data.scratch_v, data.scratch_m);
+		fillMultiBodyConstraint(constraintRow, data, jac1, 0,
+													dummy, dummy, dummy,						//sucks but let it be this way "for the time being"
+													posError,
+													infoGlobal,
+													-m_maxAppliedImpulse, m_maxAppliedImpulse
+													);
+	}
+void btMultiBodyPoint2Point::debugDraw(class btIDebugDraw* drawer) // draws a small identity-oriented frame at each world-space pivot that exists
+	btTransform tr;
+	tr.setIdentity(); // only the origin is changed below, so every frame is axis-aligned
+	if (m_rigidBodyA) // NOTE(review): both visible ctors set m_rigidBodyA to 0 — confirm whether this branch is reachable
+	{
+		btVector3 pivot = m_rigidBodyA->getCenterOfMassTransform() * m_pivotInA;
+		tr.setOrigin(pivot);
+		drawer->drawTransform(tr, 0.1);
+	}
+	if (m_bodyA)
+	{
+		btVector3 pivotAworld = m_bodyA->localPosToWorld(m_linkA, m_pivotInA);
+		tr.setOrigin(pivotAworld);
+		drawer->drawTransform(tr, 0.1);
+	}
+	if (m_rigidBodyB)
+	{
+		// that ideally should draw the same frame
+		btVector3 pivot = m_rigidBodyB->getCenterOfMassTransform() * m_pivotInB;
+		tr.setOrigin(pivot);
+		drawer->drawTransform(tr, 0.1);
+	}
+	if (m_bodyB)
+	{
+		btVector3 pivotBworld = m_bodyB->localPosToWorld(m_linkB, m_pivotInB);
+		tr.setOrigin(pivotBworld);
+		drawer->drawTransform(tr, 0.1);
+	}
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodyPoint2Point.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodyPoint2Point.h
new file mode 100644
index 00000000..b2e219ac
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodyPoint2Point.h
@@ -0,0 +1,65 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///This file was written by Erwin Coumans
+#include "btMultiBodyConstraint.h"
+class btMultiBodyPoint2Point : public btMultiBodyConstraint
+	btRigidBody*	m_rigidBodyA;
+	btRigidBody*	m_rigidBodyB;
+	btVector3		m_pivotInA;
+	btVector3		m_pivotInB;
+	btMultiBodyPoint2Point(btMultiBody* body, int link, btRigidBody* bodyB, const btVector3& pivotInA, const btVector3& pivotInB);
+	btMultiBodyPoint2Point(btMultiBody* bodyA, int linkA, btMultiBody* bodyB, int linkB, const btVector3& pivotInA, const btVector3& pivotInB);
+	virtual ~btMultiBodyPoint2Point();
+	virtual void finalizeMultiDof();
+	virtual int getIslandIdA() const;
+	virtual int getIslandIdB() const;
+	virtual void createConstraintRows(btMultiBodyConstraintArray& constraintRows,
+		btMultiBodyJacobianData& data,
+		const btContactSolverInfo& infoGlobal);
+	const btVector3& getPivotInB() const
+	{
+		// Read-only access to the stored pivot for side B.
+		return this->m_pivotInB;
+	}
+	void setPivotInB(const btVector3& pivotInB)
+	{
+		// Replace the stored pivot for side B.
+		this->m_pivotInB = pivotInB;
+	}
+	virtual void debugDraw(class btIDebugDraw* drawer);
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodySliderConstraint.cpp b/src/bullet/BulletDynamics/Featherstone/btMultiBodySliderConstraint.cpp
new file mode 100644
index 00000000..19804389
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodySliderConstraint.cpp
@@ -0,0 +1,230 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///This file was written by Erwin Coumans
+#include "btMultiBodySliderConstraint.h"
+#include "btMultiBodyLinkCollider.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+#include "BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.h"
+#include "LinearMath/btIDebugDraw.h"
+#define EPSILON 0.000001
+btMultiBodySliderConstraint::btMultiBodySliderConstraint(btMultiBody* body, int link, btRigidBody* bodyB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB, const btVector3& jointAxis) // Overload where side B is a rigid body: copies the pivots, frames and joint axis and keeps bodyB in m_rigidBodyB.
+	:btMultiBodyConstraint(body,0,link,-1,BTMBSLIDERCONSTRAINT_DIM,false), // compared with the two-multibody overload, the base receives 0 in the bodyB slot and -1 in the linkB slot
+	m_rigidBodyA(0), // side A is never a rigid body in this overload
+	m_rigidBodyB(bodyB),
+	m_pivotInA(pivotInA),
+	m_pivotInB(pivotInB),
+    m_frameInA(frameInA),
+    m_frameInB(frameInB),
+    m_jointAxis(jointAxis)
+    m_data.resize(BTMBSLIDERCONSTRAINT_DIM);//at least store the applied impulses (one m_data slot per constraint dimension)
+btMultiBodySliderConstraint::btMultiBodySliderConstraint(btMultiBody* bodyA, int linkA, btMultiBody* bodyB, int linkB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB, const btVector3& jointAxis) // Overload where both sides are multibody links: both rigid-body pointers are cleared and the bodies/links go to the base class.
+	:btMultiBodyConstraint(bodyA,bodyB,linkA,linkB,BTMBSLIDERCONSTRAINT_DIM,false),
+	m_rigidBodyA(0), // no rigid body on either side in this overload
+	m_rigidBodyB(0),
+	m_pivotInA(pivotInA),
+	m_pivotInB(pivotInB),
+    m_frameInA(frameInA),
+    m_frameInB(frameInB),
+    m_jointAxis(jointAxis)
+    m_data.resize(BTMBSLIDERCONSTRAINT_DIM);//at least store the applied impulses (one m_data slot per constraint dimension)
+void btMultiBodySliderConstraint::finalizeMultiDof() // Placeholder override: the body does nothing except assert.
+	//not implemented yet, so any call reaches the unconditional btAssert(0) below
+	btAssert(0);
+int btMultiBodySliderConstraint::getIslandIdA() const // Island tag for side A: the rigid body if set, else the multibody's base collider, else the first link that has a collider; -1 when none is found.
+	if (m_rigidBodyA)
+		return m_rigidBodyA->getIslandTag();
+	if (m_bodyA)
+	{
+		btMultiBodyLinkCollider* col = m_bodyA->getBaseCollider();
+		if (col)
+			return col->getIslandTag();
+		for (int i=0;i<m_bodyA->getNumLinks();i++) // no base collider: scan the links in index order
+		{
+			if (m_bodyA->getLink(i).m_collider)
+				return m_bodyA->getLink(i).m_collider->getIslandTag();
+		}
+	}
+	return -1; // neither a rigid body nor any collider was available
+int btMultiBodySliderConstraint::getIslandIdB() const // Island tag for side B, using the same lookup order as getIslandIdA: rigid body, base collider, first link collider; -1 when none is found.
+	if (m_rigidBodyB)
+		return m_rigidBodyB->getIslandTag();
+	if (m_bodyB)
+	{
+		btMultiBodyLinkCollider* col = m_bodyB->getBaseCollider();
+		if (col)
+			return col->getIslandTag();
+		for (int i=0;i<m_bodyB->getNumLinks();i++) // no base collider: scan the links in index order
+		{
+			col = m_bodyB->getLink(i).m_collider;
+			if (col)
+				return col->getIslandTag();
+		}
+	}
+	return -1; // neither a rigid body nor any collider was available
+void btMultiBodySliderConstraint::createConstraintRows(btMultiBodyConstraintArray& constraintRows, btMultiBodyJacobianData& data, const btContactSolverInfo& infoGlobal) // Appends one solver row per constraint dimension to constraintRows: rows 0 and 1 are linear along two axes perpendicular to the joint axis, the remaining rows are angular.
+    // Convert local points back to world (pivot, frame and joint axis of side A; pivot and frame of side B)
+    btVector3 pivotAworld = m_pivotInA; // used unchanged when side A has neither a rigid body nor a multibody
+    btMatrix3x3 frameAworld = m_frameInA;
+    btVector3 jointAxis = m_jointAxis;
+    if (m_rigidBodyA)
+    {
+        pivotAworld = m_rigidBodyA->getCenterOfMassTransform()*m_pivotInA;
+        frameAworld = m_frameInA.transpose()*btMatrix3x3(m_rigidBodyA->getOrientation());
+        jointAxis = quatRotate(m_rigidBodyA->getOrientation(),m_jointAxis);
+    } else if (m_bodyA) {
+        pivotAworld = m_bodyA->localPosToWorld(m_linkA, m_pivotInA);
+        frameAworld = m_bodyA->localFrameToWorld(m_linkA, m_frameInA);
+        jointAxis = m_bodyA->localDirToWorld(m_linkA, m_jointAxis);
+    }
+    btVector3 pivotBworld = m_pivotInB; // used unchanged when side B has neither a rigid body nor a multibody
+    btMatrix3x3 frameBworld = m_frameInB;
+    if (m_rigidBodyB)
+    {
+        pivotBworld = m_rigidBodyB->getCenterOfMassTransform()*m_pivotInB;
+        frameBworld = m_frameInB.transpose()*btMatrix3x3(m_rigidBodyB->getOrientation());
+    } else if (m_bodyB) {
+        pivotBworld = m_bodyB->localPosToWorld(m_linkB, m_pivotInB);
+        frameBworld = m_bodyB->localFrameToWorld(m_linkB, m_frameInB);
+    }
+    btVector3 constraintAxis[2]; // NOTE(review): stays unset if no frame column passes the EPSILON test below (e.g. a zero joint axis), yet is read in the row loop
+    for (int i = 0; i < 3; ++i) // take the first column of frame A whose cross product with the joint axis is longer than EPSILON
+    {
+        constraintAxis[0] = frameAworld.getColumn(i).cross(jointAxis);
+        if (constraintAxis[0].norm() > EPSILON)
+        {
+            constraintAxis[0] = constraintAxis[0].normalized();
+            constraintAxis[1] = jointAxis.cross(constraintAxis[0]); // second axis is perpendicular to both the joint axis and the first axis
+            constraintAxis[1] = constraintAxis[1].normalized();
+            break;
+        }
+    }
+    btMatrix3x3 relRot = frameAworld.inverse()*frameBworld; // frame B expressed relative to frame A
+    btVector3 angleDiff;
+    btGeneric6DofSpring2Constraint::matrixToEulerXYZ(relRot,angleDiff); // angleDiff receives the decomposition of relRot; its components are the angular errors used below
+    for (int i=0;i<numDim;i++) // NOTE(review): numDim is not declared in the lines visible here - confirm where it is defined
+	{
+        btMultiBodySolverConstraint& constraintRow = constraintRows.expandNonInitializing();
+        constraintRow.m_orgConstraint = this; // back-reference so results can be attributed to this constraint and dof index
+        constraintRow.m_orgDofIndex = i;
+        constraintRow.m_relpos1CrossNormal.setValue(0,0,0); // the row came from expandNonInitializing(), so its vectors are zeroed explicitly
+        constraintRow.m_contactNormal1.setValue(0,0,0);
+        constraintRow.m_relpos2CrossNormal.setValue(0,0,0);
+        constraintRow.m_contactNormal2.setValue(0,0,0);
+        constraintRow.m_angularComponentA.setValue(0,0,0);
+        constraintRow.m_angularComponentB.setValue(0,0,0);
+        constraintRow.m_solverBodyIdA = data.m_fixedBodyId; // default both sides to the fixed body id; overridden below when a rigid body is attached
+        constraintRow.m_solverBodyIdB = data.m_fixedBodyId;
+        if (m_rigidBodyA)
+        {
+            constraintRow.m_solverBodyIdA = m_rigidBodyA->getCompanionId();
+        }
+        if (m_rigidBodyB)
+        {
+            constraintRow.m_solverBodyIdB = m_rigidBodyB->getCompanionId();
+        }
+        btVector3 constraintNormalLin(0,0,0);
+        btVector3 constraintNormalAng(0,0,0);
+        btScalar posError = 0.0;
+        if (i < 2) { // linear rows: pivot separation projected on one of the two perpendicular axes
+            constraintNormalLin = constraintAxis[i];
+            posError = (pivotAworld-pivotBworld).dot(constraintNormalLin);
+            fillMultiBodyConstraint(constraintRow, data, 0, 0, constraintNormalAng,
+                                    constraintNormalLin, pivotAworld, pivotBworld,
+                                    posError,
+                                    infoGlobal,
+                                    -m_maxAppliedImpulse, m_maxAppliedImpulse
+                                    );
+        }
+        else { //i>=2: angular rows; i%3 maps i=2,3,4 to frame-A columns / angleDiff components 2,0,1
+            constraintNormalAng = frameAworld.getColumn(i%3);
+            posError = angleDiff[i%3];
+            fillMultiBodyConstraint(constraintRow, data, 0, 0, constraintNormalAng,
+                                    constraintNormalLin, pivotAworld, pivotBworld,
+                                    posError,
+                                    infoGlobal,
+                                    -m_maxAppliedImpulse, m_maxAppliedImpulse, true // extra trailing argument only on the angular call; presumably flags an angular row - confirm against fillMultiBodyConstraint
+                                    );
+        }
+	}
+void btMultiBodySliderConstraint::debugDraw(class btIDebugDraw* drawer) // Draws a transform with identity rotation at the world position of each pivot, once for every body pointer that is set.
+	btTransform tr;
+	tr.setIdentity(); // only the origin is changed below, so the drawn rotation stays identity
+	if (m_rigidBodyA)
+	{
+		btVector3 pivot = m_rigidBodyA->getCenterOfMassTransform() * m_pivotInA;
+		tr.setOrigin(pivot);
+		drawer->drawTransform(tr, 0.1); // second argument is a fixed 0.1; presumably the drawn axis length - confirm in btIDebugDraw
+	}
+	if (m_bodyA) // independent of the rigid-body check above (plain if, not else-if)
+	{
+		btVector3 pivotAworld = m_bodyA->localPosToWorld(m_linkA, m_pivotInA);
+		tr.setOrigin(pivotAworld);
+		drawer->drawTransform(tr, 0.1);
+	}
+	if (m_rigidBodyB)
+	{
+		// that ideally should draw the same frame (as the side-A pivot drawn above)
+		btVector3 pivot = m_rigidBodyB->getCenterOfMassTransform() * m_pivotInB;
+		tr.setOrigin(pivot);
+		drawer->drawTransform(tr, 0.1);
+	}
+	if (m_bodyB)
+	{
+		btVector3 pivotBworld = m_bodyB->localPosToWorld(m_linkB, m_pivotInB);
+		tr.setOrigin(pivotBworld);
+		drawer->drawTransform(tr, 0.1);
+	}
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodySliderConstraint.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodySliderConstraint.h
new file mode 100644
index 00000000..571dcd53
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodySliderConstraint.h
@@ -0,0 +1,105 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///This file was written by Erwin Coumans
+#include "btMultiBodyConstraint.h"
+class btMultiBodySliderConstraint : public btMultiBodyConstraint // Constraint type derived from btMultiBodyConstraint that stores a pivot and a frame per side plus one joint axis, with inline get/set accessors for each.
+	btRigidBody*	m_rigidBodyA; // optional rigid body for side A; the constructors defined for this class always set it to 0
+	btRigidBody*	m_rigidBodyB; // rigid body for side B; 0 when side B is a multibody link
+	btVector3		m_pivotInA; // pivot given in side A's local coordinates (converted to world in createConstraintRows)
+	btVector3		m_pivotInB; // pivot given in side B's local coordinates
+    btMatrix3x3     m_frameInA; // constraint frame given relative to side A
+    btMatrix3x3     m_frameInB; // constraint frame given relative to side B
+    btVector3       m_jointAxis; // joint axis given relative to side A (it is rotated with side A in createConstraintRows)
+	btMultiBodySliderConstraint(btMultiBody* body, int link, btRigidBody* bodyB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB, const btVector3& jointAxis); // overload: multibody link paired with a rigid body
+	btMultiBodySliderConstraint(btMultiBody* bodyA, int linkA, btMultiBody* bodyB, int linkB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB, const btVector3& jointAxis); // overload: two multibody links
+	virtual ~btMultiBodySliderConstraint();
+	virtual void finalizeMultiDof();
+	virtual int getIslandIdA() const;
+	virtual int getIslandIdB() const;
+	virtual void createConstraintRows(btMultiBodyConstraintArray& constraintRows,
+		btMultiBodyJacobianData& data,
+		const btContactSolverInfo& infoGlobal);
+    const btVector3& getPivotInA() const // read access to the stored side-A pivot
+    {
+        return m_pivotInA;
+    }
+    void setPivotInA(const btVector3& pivotInA) // overwrites the stored side-A pivot
+    {
+        m_pivotInA = pivotInA;
+    }
+	const btVector3& getPivotInB() const // read access to the stored side-B pivot
+	{
+		return m_pivotInB;
+	}
+	void setPivotInB(const btVector3& pivotInB) // overwrites the stored side-B pivot
+	{
+		m_pivotInB = pivotInB;
+	}
+    const btMatrix3x3& getFrameInA() const // read access to the stored side-A frame
+    {
+        return m_frameInA;
+    }
+    void setFrameInA(const btMatrix3x3& frameInA) // overwrites the stored side-A frame
+    {
+        m_frameInA = frameInA;
+    }
+    const btMatrix3x3& getFrameInB() const // read access to the stored side-B frame
+    {
+        return m_frameInB;
+    }
+    void setFrameInB(const btMatrix3x3& frameInB) // overwrites the stored side-B frame
+    {
+        m_frameInB = frameInB;
+    }
+    const btVector3& getJointAxis() const // read access to the stored joint axis
+    {
+        return m_jointAxis;
+    }
+    void setJointAxis(const btVector3& jointAxis) // overwrites the stored joint axis; no normalization is applied here
+    {
+        m_jointAxis = jointAxis;
+    }
+	virtual void debugDraw(class btIDebugDraw* drawer);
diff --git a/src/bullet/BulletDynamics/Featherstone/btMultiBodySolverConstraint.h b/src/bullet/BulletDynamics/Featherstone/btMultiBodySolverConstraint.h
new file mode 100644
index 00000000..6fa1550e
--- /dev/null
+++ b/src/bullet/BulletDynamics/Featherstone/btMultiBodySolverConstraint.h
@@ -0,0 +1,90 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btAlignedObjectArray.h"
+class btMultiBody;
+class btMultiBodyConstraint;
+#include "BulletDynamics/ConstraintSolver/btSolverBody.h"
+#include "BulletDynamics/ConstraintSolver/btContactSolverInfo.h"
+///1D constraint along a normal axis between bodyA and bodyB. It can be combined to solve contact and friction constraints.
+ATTRIBUTE_ALIGNED16 (struct)	btMultiBodySolverConstraint // Plain data record for one solver row; declared through the ATTRIBUTE_ALIGNED16 macro.
+	btMultiBodySolverConstraint() : m_solverBodyIdA(-1), m_multiBodyA(0), m_linkA(-1), m_solverBodyIdB(-1), m_multiBodyB(0), m_linkB(-1),m_orgConstraint(0), m_orgDofIndex(-1) // sets only the body ids, multibody pointers, link indices and origin back-references; every other member is not assigned here
+	{}
+	int				m_deltaVelAindex;//more generic version of m_relpos1CrossNormal/m_contactNormal1
+	int				m_jacAindex;
+	int				m_deltaVelBindex;
+	int				m_jacBindex;
+	btVector3		m_relpos1CrossNormal;
+	btVector3		m_contactNormal1;	
+	btVector3		m_relpos2CrossNormal;
+	btVector3		m_contactNormal2; //usually m_contactNormal2 == -m_contactNormal1, but not always
+	btVector3		m_angularComponentA;
+	btVector3		m_angularComponentB;
+	mutable btSimdScalar	m_appliedPushImpulse; // mutable: can be updated through a const reference to the row
+	mutable btSimdScalar	m_appliedImpulse; // mutable: can be updated through a const reference to the row
+	btScalar	m_friction;
+	btScalar	m_jacDiagABInv;
+	btScalar		m_rhs;
+	btScalar		m_cfm;
+    btScalar		m_lowerLimit;
+	btScalar		m_upperLimit;
+	btScalar		m_rhsPenetration;
+    union // anonymous union: the pointer and the padding scalar share the same storage
+	{
+		void*		m_originalContactPoint;
+		btScalar	m_unusedPadding4;
+	};
+	int	m_overrideNumSolverIterations;
+    int			m_frictionIndex;
+	int m_solverBodyIdA; // -1 after default construction
+	btMultiBody* m_multiBodyA; // 0 after default construction
+	int			m_linkA; // -1 after default construction
+	int m_solverBodyIdB; // -1 after default construction
+	btMultiBody* m_multiBodyB; // 0 after default construction
+	int			m_linkB; // -1 after default construction
+	//for writing back applied impulses (to the originating constraint and its dof index)
+	btMultiBodyConstraint*	m_orgConstraint;
+	int m_orgDofIndex;
+	enum		btSolverConstraintType // NOTE(review): no enumerators appear in the lines visible here - check the full header
+	{
+	};
+typedef btAlignedObjectArray<btMultiBodySolverConstraint>	btMultiBodyConstraintArray; // array type used for the constraintRows parameters of createConstraintRows
diff --git a/src/bullet/BulletDynamics/MLCPSolvers/btDantzigLCP.cpp b/src/bullet/BulletDynamics/MLCPSolvers/btDantzigLCP.cpp
new file mode 100644
index 00000000..986f2148
--- /dev/null
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btDantzigLCP.cpp
@@ -0,0 +1,2080 @@
+*                                                                       *
+* Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
+* All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
+*                                                                       *
+* This library is free software; you can redistribute it and/or         *
+* modify it under the terms of EITHER:                                  *
+*   (1) The GNU Lesser General Public License as published by the Free  *
+*       Software Foundation; either version 2.1 of the License, or (at  *
+*       your option) any later version. The text of the GNU Lesser      *
+*       General Public License is included with this library in the     *
+*       file LICENSE.TXT.                                               *
+*   (2) The BSD-style license that is included with this library in     *
+*       the file LICENSE-BSD.TXT.                                       *
+*                                                                       *
+* This library is distributed in the hope that it will be useful,       *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
+* LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
+*                                                                       *
+solve A*x = b+w, with x and w subject to certain LCP conditions.
+each x(i),w(i) must lie on one of the three line segments in the following
+diagram. each line segment corresponds to one index set :
+     w(i)
+     /|\      |           :
+      |       |           :
+      |       |i in N     :
+  w>0 |       |state[i]=0 :
+      |       |           :
+      |       |           :  i in C
+  w=0 +       +-----------------------+
+      |                   :           |
+      |                   :           |
+  w<0 |                   :           |i in N
+      |                   :           |state[i]=1
+      |                   :           |
+      |                   :           |
+      +-------|-----------|-----------|----------> x(i)
+             lo           0           hi
+the Dantzig algorithm proceeds as follows:
+  for i=1:n
+    * if (x(i),w(i)) is not on the line, push x(i) and w(i) positive or
+      negative towards the line. as this is done, the other (x(j),w(j))
+      for j<i are constrained to be on the line. if any (x,w) reaches the
+      end of a line segment then it is switched between index sets.
+    * i is added to the appropriate index set depending on what line segment
+      it hits.
+we restrict lo(i) <= 0 and hi(i) >= 0. this makes the algorithm a bit
+simpler, because the starting point for x(i),w(i) is always on the dotted
+line x=0 and x will only ever increase in one direction, so it can only hit
+two out of the three line segments.
+this is an implementation of "lcp_dantzig2_ldlt.m" and "lcp_dantzig_lohi.m".
+the implementation is split into an LCP problem object (btLCP) and an LCP
+driver function. most optimization occurs in the btLCP object.
+a naive implementation of the algorithm requires either a lot of data motion
+or a lot of permutation-array lookup, because we are constantly re-ordering
+rows and columns. to avoid this and make a more optimized algorithm, a
+non-trivial data structure is used to represent the matrix A (this is
+implemented in the fast version of the btLCP object).
+during execution of this algorithm, some indexes in A are clamped (set C),
+some are non-clamped (set N), and some are "don't care" (where x=0).
+A,x,b,w (and other problem vectors) are permuted such that the clamped
+indexes are first, the unclamped indexes are next, and the don't-care
+indexes are last. this permutation is recorded in the array `p'.
+initially p = 0..n-1, and as the rows and columns of A,x,b,w are swapped,
+the corresponding elements of p are swapped.
+because the C and N elements are grouped together in the rows of A, we can do
+lots of work with a fast dot product function. if A,x,etc were not permuted
+and we only had a permutation array, then those dot products would be much
+slower as we would have a permutation array lookup in some inner loops.
+A is accessed through an array of row pointers, so that element (i,j) of the
+permuted matrix is A[i][j]. this makes row swapping fast. for column swapping
+we still have to actually move the data.
+during execution of this algorithm we maintain an L*D*L' factorization of
+the clamped submatrix of A (call it `AC') which is the top left nC*nC
+submatrix of A. there are two ways we could arrange the rows/columns in AC.
+(1) AC is always permuted such that L*D*L' = AC. this causes a problem
+when a row/column is removed from C, because then all the rows/columns of A
+between the deleted index and the end of C need to be rotated downward.
+this results in a lot of data motion and slows things down.
+(2) L*D*L' is actually a factorization of a *permutation* of AC (which is
+itself a permutation of the underlying A). this is what we do - the
+permutation is recorded in the vector C. call this permutation A[C,C].
+when a row/column is removed from C, all we have to do is swap two
+rows/columns and manipulate C.
+#include "btDantzigLCP.h"
+#include <string.h>//memcpy
+bool s_error = false; // NOTE(review): file-scope flag with external linkage; its readers and writers are outside the lines visible here
+// code generation parameters
+#define btLCP_FAST		// use fast btLCP object
+// option 1 : matrix row pointers (less data copying) -- this is the active option: BTAROW(i) is simply m_A[i]
+#define BTROWPTRS
+#define BTATYPE btScalar **
+#define BTAROW(i) (m_A[i])
+// option 2 : no matrix row pointers (slightly faster inner loops) -- disabled: the three defines below are kept commented out
+//#define NOROWPTRS
+//#define BTATYPE btScalar *
+//#define BTAROW(i) (m_A+(i)*m_nskip)
+/* solve L*X=B, with B containing 1 right hand sides.
+ * L is an n*n lower triangular matrix with ones on the diagonal.
+ * L is stored by rows and its leading dimension is lskip.
+ * B is an n*1 matrix that contains the right hand sides.
+ * B is stored by columns and its leading dimension is also lskip.
+ * B is overwritten with X.
+ * this processes blocks of 2*2.
+ * if this is in the factorizer source file, n must be a multiple of 2.
+ * (every outer pass writes both ex[0] and ex[1], i.e. rows i and i+1 of B,
+ * so an odd n would read and write one row past the end.)
+ * the method is forward substitution: X(i) = B(i) - sum over k<i of L(i,k)*X(k).
+ * the diagonal entries of L are never read; they are taken to be 1.
+ */
+static void btSolveL1_1 (const btScalar *L, btScalar *B, int n, int lskip1)
+  /* declare variables - Z matrix, p and q vectors, etc */
+  btScalar Z11,m11,Z21,m21,p1,q1,p2,*ex;
+  const btScalar *ell;
+  int i,j;
+  /* compute all 2 x 1 blocks of X */
+  for (i=0; i < n; i+=2) {
+    /* compute all 2 x 1 block of X, from rows i..i+2-1 */
+    /* set the Z matrix to 0 */
+    Z11=0;
+    Z21=0;
+    ell = L + i*lskip1; /* start of row i of L; row i+1 is lskip1 elements further */
+    ex = B; /* already-solved entries X(0..i-1) are read from the front of B */
+    /* the inner loop that computes outer products and adds them to Z */
+    for (j=i-2; j >= 0; j -= 2) {
+      /* compute outer product and add it to the Z matrix */
+      p1=ell[0];
+      q1=ex[0];
+      m11 = p1 * q1;
+      p2=ell[lskip1];
+      m21 = p2 * q1;
+      Z11 += m11;
+      Z21 += m21;
+      /* compute outer product and add it to the Z matrix */
+      p1=ell[1];
+      q1=ex[1];
+      m11 = p1 * q1;
+      p2=ell[1+lskip1];
+      m21 = p2 * q1;
+      /* advance pointers */
+      ell += 2;
+      ex += 2;
+      Z11 += m11;
+      Z21 += m21;
+      /* end of inner loop */
+    }
+    /* compute left-over iterations (i is always even here, so j becomes 0 and this loop body never runs) */
+    j += 2;
+    for (; j > 0; j--) {
+      /* compute outer product and add it to the Z matrix */
+      p1=ell[0];
+      q1=ex[0];
+      m11 = p1 * q1;
+      p2=ell[lskip1];
+      m21 = p2 * q1;
+      /* advance pointers */
+      ell += 1;
+      ex += 1;
+      Z11 += m11;
+      Z21 += m21;
+    }
+    /* finish computing the X(i) block (ell now points at L(i,i) and ex at B(i)) */
+    Z11 = ex[0] - Z11;
+    ex[0] = Z11;
+    p1 = ell[lskip1]; /* L(i+1,i): couples the freshly solved X(i) into row i+1 */
+    Z21 = ex[1] - Z21 - p1*Z11;
+    ex[1] = Z21;
+    /* end of outer loop */
+  }
+/* solve L*X=B, with B containing 2 right hand sides.
+ * L is an n*n lower triangular matrix with ones on the diagonal.
+ * L is stored by rows and its leading dimension is lskip.
+ * B is an n*2 matrix that contains the right hand sides.
+ * B is stored by columns and its leading dimension is also lskip.
+ * B is overwritten with X.
+ * this processes blocks of 2*2.
+ * if this is in the factorizer source file, n must be a multiple of 2.
+ * (every outer pass writes rows i and i+1 of both right hand sides,
+ * so an odd n would read and write one row past the end.)
+ * the second right hand side starts lskip1 elements after the first
+ * (it is addressed as ex[lskip1] / ex[1+lskip1]).
+ * the diagonal entries of L are never read; they are taken to be 1.
+ */
+static void btSolveL1_2 (const btScalar *L, btScalar *B, int n, int lskip1)
+  /* declare variables - Z matrix, p and q vectors, etc */
+  btScalar Z11,m11,Z12,m12,Z21,m21,Z22,m22,p1,q1,p2,q2,*ex;
+  const btScalar *ell;
+  int i,j;
+  /* compute all 2 x 2 blocks of X */
+  for (i=0; i < n; i+=2) {
+    /* compute all 2 x 2 block of X, from rows i..i+2-1 */
+    /* set the Z matrix to 0 */
+    Z11=0;
+    Z12=0;
+    Z21=0;
+    Z22=0;
+    ell = L + i*lskip1; /* start of row i of L; row i+1 is lskip1 elements further */
+    ex = B; /* already-solved entries of both right hand sides are read from the front of B */
+    /* the inner loop that computes outer products and adds them to Z */
+    for (j=i-2; j >= 0; j -= 2) {
+      /* compute outer product and add it to the Z matrix */
+      p1=ell[0];
+      q1=ex[0];
+      m11 = p1 * q1;
+      q2=ex[lskip1];
+      m12 = p1 * q2;
+      p2=ell[lskip1];
+      m21 = p2 * q1;
+      m22 = p2 * q2;
+      Z11 += m11;
+      Z12 += m12;
+      Z21 += m21;
+      Z22 += m22;
+      /* compute outer product and add it to the Z matrix */
+      p1=ell[1];
+      q1=ex[1];
+      m11 = p1 * q1;
+      q2=ex[1+lskip1];
+      m12 = p1 * q2;
+      p2=ell[1+lskip1];
+      m21 = p2 * q1;
+      m22 = p2 * q2;
+      /* advance pointers */
+      ell += 2;
+      ex += 2;
+      Z11 += m11;
+      Z12 += m12;
+      Z21 += m21;
+      Z22 += m22;
+      /* end of inner loop */
+    }
+    /* compute left-over iterations (i is always even here, so j becomes 0 and this loop body never runs) */
+    j += 2;
+    for (; j > 0; j--) {
+      /* compute outer product and add it to the Z matrix */
+      p1=ell[0];
+      q1=ex[0];
+      m11 = p1 * q1;
+      q2=ex[lskip1];
+      m12 = p1 * q2;
+      p2=ell[lskip1];
+      m21 = p2 * q1;
+      m22 = p2 * q2;
+      /* advance pointers */
+      ell += 1;
+      ex += 1;
+      Z11 += m11;
+      Z12 += m12;
+      Z21 += m21;
+      Z22 += m22;
+    }
+    /* finish computing the X(i) block (ell now points at L(i,i) and ex at row i of the first right hand side) */
+    Z11 = ex[0] - Z11;
+    ex[0] = Z11;
+    Z12 = ex[lskip1] - Z12;
+    ex[lskip1] = Z12;
+    p1 = ell[lskip1]; /* L(i+1,i): couples the freshly solved row i into row i+1 */
+    Z21 = ex[1] - Z21 - p1*Z11;
+    ex[1] = Z21;
+    Z22 = ex[1+lskip1] - Z22 - p1*Z12;
+    ex[1+lskip1] = Z22;
+    /* end of outer loop */
+  }
+void btFactorLDLT (btScalar *A, btScalar *d, int n, int nskip1) /* in-place L*D*L' style factorization of the n*n matrix A (row stride nskip1), two rows per pass: the below-diagonal part of each row is overwritten with L and d[i] receives btRecip() of the i-th pivot. */
+  int i,j;
+  btScalar sum,*ell,*dee,dd,p1,p2,q1,q2,Z11,m11,Z21,m21,Z22,m22;
+  if (n < 1) return; /* nothing to factor */
+  for (i=0; i<=n-2; i += 2) {
+    /* solve L*(D*l)=a, l is scaled elements in 2 x i block at A(i,0) */
+    btSolveL1_2 (A,A+i*nskip1,i,nskip1); /* rows i and i+1 of A act as the two right hand sides; i is even, as that routine requires */
+    /* scale the elements in a 2 x i block at A(i,0), and also */
+    /* compute Z = the outer product matrix that we'll need. */
+    Z11 = 0;
+    Z21 = 0;
+    Z22 = 0;
+    ell = A+i*nskip1;
+    dee = d;
+    for (j=i-6; j >= 0; j -= 6) { /* manually unrolled: six columns per pass, each multiplied by the matching d entry */
+      p1 = ell[0];
+      p2 = ell[nskip1];
+      dd = dee[0];
+      q1 = p1*dd;
+      q2 = p2*dd;
+      ell[0] = q1;
+      ell[nskip1] = q2;
+      m11 = p1*q1;
+      m21 = p2*q1;
+      m22 = p2*q2;
+      Z11 += m11;
+      Z21 += m21;
+      Z22 += m22;
+      p1 = ell[1];
+      p2 = ell[1+nskip1];
+      dd = dee[1];
+      q1 = p1*dd;
+      q2 = p2*dd;
+      ell[1] = q1;
+      ell[1+nskip1] = q2;
+      m11 = p1*q1;
+      m21 = p2*q1;
+      m22 = p2*q2;
+      Z11 += m11;
+      Z21 += m21;
+      Z22 += m22;
+      p1 = ell[2];
+      p2 = ell[2+nskip1];
+      dd = dee[2];
+      q1 = p1*dd;
+      q2 = p2*dd;
+      ell[2] = q1;
+      ell[2+nskip1] = q2;
+      m11 = p1*q1;
+      m21 = p2*q1;
+      m22 = p2*q2;
+      Z11 += m11;
+      Z21 += m21;
+      Z22 += m22;
+      p1 = ell[3];
+      p2 = ell[3+nskip1];
+      dd = dee[3];
+      q1 = p1*dd;
+      q2 = p2*dd;
+      ell[3] = q1;
+      ell[3+nskip1] = q2;
+      m11 = p1*q1;
+      m21 = p2*q1;
+      m22 = p2*q2;
+      Z11 += m11;
+      Z21 += m21;
+      Z22 += m22;
+      p1 = ell[4];
+      p2 = ell[4+nskip1];
+      dd = dee[4];
+      q1 = p1*dd;
+      q2 = p2*dd;
+      ell[4] = q1;
+      ell[4+nskip1] = q2;
+      m11 = p1*q1;
+      m21 = p2*q1;
+      m22 = p2*q2;
+      Z11 += m11;
+      Z21 += m21;
+      Z22 += m22;
+      p1 = ell[5];
+      p2 = ell[5+nskip1];
+      dd = dee[5];
+      q1 = p1*dd;
+      q2 = p2*dd;
+      ell[5] = q1;
+      ell[5+nskip1] = q2;
+      m11 = p1*q1;
+      m21 = p2*q1;
+      m22 = p2*q2;
+      Z11 += m11;
+      Z21 += m21;
+      Z22 += m22;
+      ell += 6;
+      dee += 6;
+    }
+    /* compute left-over iterations (the i mod 6 columns the unrolled loop did not reach) */
+    j += 6;
+    for (; j > 0; j--) {
+      p1 = ell[0];
+      p2 = ell[nskip1];
+      dd = dee[0];
+      q1 = p1*dd;
+      q2 = p2*dd;
+      ell[0] = q1;
+      ell[nskip1] = q2;
+      m11 = p1*q1;
+      m21 = p2*q1;
+      m22 = p2*q2;
+      Z11 += m11;
+      Z21 += m21;
+      Z22 += m22;
+      ell++;
+      dee++;
+    }
+    /* solve for diagonal 2 x 2 block at A(i,i) (ell now points at A(i,i)) */
+    Z11 = ell[0] - Z11;
+    Z21 = ell[nskip1] - Z21;
+    Z22 = ell[1+nskip1] - Z22;
+    dee = d + i;
+    /* factorize 2 x 2 block Z,dee */
+    /* factorize row 1 */
+    dee[0] = btRecip(Z11); /* NOTE(review): no guard against a zero pivot is present here */
+    /* factorize row 2 */
+    sum = 0;
+    q1 = Z21;
+    q2 = q1 * dee[0];
+    Z21 = q2;
+    sum += q1*q2;
+    dee[1] = btRecip(Z22 - sum);
+    /* done factorizing 2 x 2 block */
+    ell[nskip1] = Z21; /* store the scaled entry at A(i+1,i) */
+  }
+  /* compute the (less than 2) rows at the bottom */
+  switch (n-i) {
+    case 0:
+    break;
+    case 1:
+    btSolveL1_1 (A,A+i*nskip1,i,nskip1); /* single remaining row i is the one right hand side; i is even here */
+    /* scale the elements in a 1 x i block at A(i,0), and also */
+    /* compute Z = the outer product matrix that we'll need. */
+    Z11 = 0;
+    ell = A+i*nskip1;
+    dee = d;
+    for (j=i-6; j >= 0; j -= 6) { /* manually unrolled: six columns per pass */
+      p1 = ell[0];
+      dd = dee[0];
+      q1 = p1*dd;
+      ell[0] = q1;
+      m11 = p1*q1;
+      Z11 += m11;
+      p1 = ell[1];
+      dd = dee[1];
+      q1 = p1*dd;
+      ell[1] = q1;
+      m11 = p1*q1;
+      Z11 += m11;
+      p1 = ell[2];
+      dd = dee[2];
+      q1 = p1*dd;
+      ell[2] = q1;
+      m11 = p1*q1;
+      Z11 += m11;
+      p1 = ell[3];
+      dd = dee[3];
+      q1 = p1*dd;
+      ell[3] = q1;
+      m11 = p1*q1;
+      Z11 += m11;
+      p1 = ell[4];
+      dd = dee[4];
+      q1 = p1*dd;
+      ell[4] = q1;
+      m11 = p1*q1;
+      Z11 += m11;
+      p1 = ell[5];
+      dd = dee[5];
+      q1 = p1*dd;
+      ell[5] = q1;
+      m11 = p1*q1;
+      Z11 += m11;
+      ell += 6;
+      dee += 6;
+    }
+    /* compute left-over iterations (the i mod 6 columns the unrolled loop did not reach) */
+    j += 6;
+    for (; j > 0; j--) {
+      p1 = ell[0];
+      dd = dee[0];
+      q1 = p1*dd;
+      ell[0] = q1;
+      m11 = p1*q1;
+      Z11 += m11;
+      ell++;
+      dee++;
+    }
+    /* solve for diagonal 1 x 1 block at A(i,i) */
+    Z11 = ell[0] - Z11;
+    dee = d + i;
+    /* factorize 1 x 1 block Z,dee */
+    /* factorize row 1 */
+    dee[0] = btRecip(Z11); /* NOTE(review): no guard against a zero pivot is present here */
+    /* done factorizing 1 x 1 block */
+    break;
+    //default: *((char*)0)=0;  /* this should never happen! */
+  }
+/* solve L*X=B, with B containing 1 right hand sides.
+ * L is an n*n lower triangular matrix with ones on the diagonal.
+ * L is stored by rows and its leading dimension is lskip.
+ * B is an n*1 matrix that contains the right hand sides.
+ * B is stored by columns and its leading dimension is also lskip.
+ * B is overwritten with X.
+ * this processes blocks of 4*4.
+ * if this is in the factorizer source file, n must be a multiple of 4.
+ */
+void btSolveL1 (const btScalar *L, btScalar *B, int n, int lskip1) /* Forward substitution for L*X=B, performed in place on B.
+ * L      - lower triangular matrix stored by rows with lskip1 scalars between row starts; the diagonal is never read (it is taken as 1).
+ * B      - right hand side on entry, overwritten with X.
+ * n      - number of rows to solve.
+ * lskip1 - row stride of L.
+ * Rows are solved four at a time; the loop at the end handles the remaining n%4 rows one at a time. */
+  /* declare variables - Z matrix, p and q vectors, etc */
+  btScalar Z11,Z21,Z31,Z41,p1,q1,p2,p3,p4,*ex;
+  const btScalar *ell;
+  int lskip2,lskip3,i,j;
+  /* compute lskip values */
+  lskip2 = 2*lskip1;
+  lskip3 = 3*lskip1;
+  /* compute all 4 x 1 blocks of X */
+  for (i=0; i <= n-4; i+=4) {
+    /* compute all 4 x 1 block of X, from rows i..i+4-1 */
+    /* set the Z matrix to 0 */
+    Z11=0;
+    Z21=0;
+    Z31=0;
+    Z41=0;
+    ell = L + i*lskip1; // first element of row i of L
+    ex = B; // first element of B; entries 0..i-1 already hold X
+    /* the inner loop that computes outer products and adds them to Z */
+    for (j=i-12; j >= 0; j -= 12) { // 12 columns are accumulated per pass
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[0];
+      p2=ell[lskip1];
+      p3=ell[lskip2];
+      p4=ell[lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[1];
+      q1=ex[1];
+      p2=ell[1+lskip1];
+      p3=ell[1+lskip2];
+      p4=ell[1+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[2];
+      q1=ex[2];
+      p2=ell[2+lskip1];
+      p3=ell[2+lskip2];
+      p4=ell[2+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[3];
+      q1=ex[3];
+      p2=ell[3+lskip1];
+      p3=ell[3+lskip2];
+      p4=ell[3+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[4];
+      q1=ex[4];
+      p2=ell[4+lskip1];
+      p3=ell[4+lskip2];
+      p4=ell[4+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[5];
+      q1=ex[5];
+      p2=ell[5+lskip1];
+      p3=ell[5+lskip2];
+      p4=ell[5+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[6];
+      q1=ex[6];
+      p2=ell[6+lskip1];
+      p3=ell[6+lskip2];
+      p4=ell[6+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[7];
+      q1=ex[7];
+      p2=ell[7+lskip1];
+      p3=ell[7+lskip2];
+      p4=ell[7+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[8];
+      q1=ex[8];
+      p2=ell[8+lskip1];
+      p3=ell[8+lskip2];
+      p4=ell[8+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[9];
+      q1=ex[9];
+      p2=ell[9+lskip1];
+      p3=ell[9+lskip2];
+      p4=ell[9+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[10];
+      q1=ex[10];
+      p2=ell[10+lskip1];
+      p3=ell[10+lskip2];
+      p4=ell[10+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* load p and q values */
+      p1=ell[11];
+      q1=ex[11];
+      p2=ell[11+lskip1];
+      p3=ell[11+lskip2];
+      p4=ell[11+lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* advance pointers */
+      ell += 12;
+      ex += 12;
+      /* end of inner loop */
+    }
+    /* compute left-over iterations */
+    j += 12; // j is now i%12, the number of columns not yet accumulated
+    for (; j > 0; j--) {
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[0];
+      p2=ell[lskip1];
+      p3=ell[lskip2];
+      p4=ell[lskip3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      Z21 += p2 * q1;
+      Z31 += p3 * q1;
+      Z41 += p4 * q1;
+      /* advance pointers */
+      ell += 1;
+      ex += 1;
+    }
+    /* finish computing the X(i) block: ell and ex have advanced i columns, so the offsets below address the sub-diagonal entries of the 4x4 diagonal block */
+    Z11 = ex[0] - Z11;
+    ex[0] = Z11;
+    p1 = ell[lskip1];
+    Z21 = ex[1] - Z21 - p1*Z11;
+    ex[1] = Z21;
+    p1 = ell[lskip2];
+    p2 = ell[1+lskip2];
+    Z31 = ex[2] - Z31 - p1*Z11 - p2*Z21;
+    ex[2] = Z31;
+    p1 = ell[lskip3];
+    p2 = ell[1+lskip3];
+    p3 = ell[2+lskip3];
+    Z41 = ex[3] - Z41 - p1*Z11 - p2*Z21 - p3*Z31;
+    ex[3] = Z41;
+    /* end of outer loop */
+  }
+  /* compute rows at end that are not a multiple of block size */
+  for (; i < n; i++) {
+    /* compute all 1 x 1 block of X, from rows i..i+1-1 */
+    /* set the Z matrix to 0 */
+    Z11=0;
+    ell = L + i*lskip1; // first element of row i of L
+    ex = B; // first element of B; entries 0..i-1 already hold X
+    /* the inner loop that computes outer products and adds them to Z */
+    for (j=i-12; j >= 0; j -= 12) { // 12 columns are accumulated per pass
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[0];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[1];
+      q1=ex[1];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[2];
+      q1=ex[2];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[3];
+      q1=ex[3];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[4];
+      q1=ex[4];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[5];
+      q1=ex[5];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[6];
+      q1=ex[6];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[7];
+      q1=ex[7];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[8];
+      q1=ex[8];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[9];
+      q1=ex[9];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[10];
+      q1=ex[10];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* load p and q values */
+      p1=ell[11];
+      q1=ex[11];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* advance pointers */
+      ell += 12;
+      ex += 12;
+      /* end of inner loop */
+    }
+    /* compute left-over iterations */
+    j += 12; // j is now i%12, the number of columns not yet accumulated
+    for (; j > 0; j--) {
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[0];
+      /* compute outer product and add it to the Z matrix */
+      Z11 += p1 * q1;
+      /* advance pointers */
+      ell += 1;
+      ex += 1;
+    }
+    /* finish computing the X(i) block: ex now points at entry i of B */
+    Z11 = ex[0] - Z11;
+    ex[0] = Z11;
+  }
+/* solve L^T * x=b, with b containing 1 right hand side.
+ * L is an n*n lower triangular matrix with ones on the diagonal.
+ * L is stored by rows and its leading dimension is lskip.
+ * b is an n*1 matrix that contains the right hand side.
+ * b is overwritten with x.
+ * this processes blocks of 4, walking L and b backwards from the last row/entry.
+ */
+void btSolveL1T (const btScalar *L, btScalar *B, int n, int lskip1) // B is both the input b and the output x; lskip1 is the row stride of L
+  /* declare variables - Z matrix, p and q vectors, etc */
+  btScalar Z11,m11,Z21,m21,Z31,m31,Z41,m41,p1,q1,p2,p3,p4,*ex;
+  const btScalar *ell;
+  int lskip2,i,j;
+//  int lskip3;
+  /* special handling for L and B because we're solving L1 *transpose* */
+  L = L + (n-1)*(lskip1+1); // L now points at the last diagonal element, L(n-1,n-1)
+  B = B + n-1; // B now points at the last entry of the right hand side
+  lskip1 = -lskip1; // negative stride: adding lskip1 moves up one row of L
+  /* compute lskip values */
+  lskip2 = 2*lskip1;
+  //lskip3 = 3*lskip1;
+  /* compute all 4 x 1 blocks of X */
+  for (i=0; i <= n-4; i+=4) {
+    /* compute all 4 x 1 block of X, from rows i..i+4-1 */
+    /* set the Z matrix to 0 */
+    Z11=0;
+    Z21=0;
+    Z31=0;
+    Z41=0;
+    ell = L - i; // last row of L, column n-1-i
+    ex = B; // last entry of B; the i entries from here backwards already hold x
+    /* the inner loop that computes outer products and adds them to Z */
+    for (j=i-4; j >= 0; j -= 4) { // 4 rows of L are accumulated per pass
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[0];
+      p2=ell[-1];
+      p3=ell[-2];
+      p4=ell[-3];
+      /* compute outer product and add it to the Z matrix */
+      m11 = p1 * q1;
+      m21 = p2 * q1;
+      m31 = p3 * q1;
+      m41 = p4 * q1;
+      ell += lskip1;
+      Z11 += m11;
+      Z21 += m21;
+      Z31 += m31;
+      Z41 += m41;
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[-1];
+      p2=ell[-1];
+      p3=ell[-2];
+      p4=ell[-3];
+      /* compute outer product and add it to the Z matrix */
+      m11 = p1 * q1;
+      m21 = p2 * q1;
+      m31 = p3 * q1;
+      m41 = p4 * q1;
+      ell += lskip1;
+      Z11 += m11;
+      Z21 += m21;
+      Z31 += m31;
+      Z41 += m41;
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[-2];
+      p2=ell[-1];
+      p3=ell[-2];
+      p4=ell[-3];
+      /* compute outer product and add it to the Z matrix */
+      m11 = p1 * q1;
+      m21 = p2 * q1;
+      m31 = p3 * q1;
+      m41 = p4 * q1;
+      ell += lskip1;
+      Z11 += m11;
+      Z21 += m21;
+      Z31 += m31;
+      Z41 += m41;
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[-3];
+      p2=ell[-1];
+      p3=ell[-2];
+      p4=ell[-3];
+      /* compute outer product and add it to the Z matrix */
+      m11 = p1 * q1;
+      m21 = p2 * q1;
+      m31 = p3 * q1;
+      m41 = p4 * q1;
+      ell += lskip1;
+      ex -= 4;
+      Z11 += m11;
+      Z21 += m21;
+      Z31 += m31;
+      Z41 += m41;
+      /* end of inner loop */
+    }
+    /* compute left-over iterations */
+    j += 4; // i is a multiple of 4 in this loop, so j becomes 0 and the loop below does not execute
+    for (; j > 0; j--) {
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[0];
+      p2=ell[-1];
+      p3=ell[-2];
+      p4=ell[-3];
+      /* compute outer product and add it to the Z matrix */
+      m11 = p1 * q1;
+      m21 = p2 * q1;
+      m31 = p3 * q1;
+      m41 = p4 * q1;
+      ell += lskip1;
+      ex -= 1;
+      Z11 += m11;
+      Z21 += m21;
+      Z31 += m31;
+      Z41 += m41;
+    }
+    /* finish computing the X(i) block: ell has moved up i rows and ex back i entries, so both sit on the diagonal of the current 4x4 block */
+    Z11 = ex[0] - Z11;
+    ex[0] = Z11;
+    p1 = ell[-1];
+    Z21 = ex[-1] - Z21 - p1*Z11;
+    ex[-1] = Z21;
+    p1 = ell[-2];
+    p2 = ell[-2+lskip1];
+    Z31 = ex[-2] - Z31 - p1*Z11 - p2*Z21;
+    ex[-2] = Z31;
+    p1 = ell[-3];
+    p2 = ell[-3+lskip1];
+    p3 = ell[-3+lskip2];
+    Z41 = ex[-3] - Z41 - p1*Z11 - p2*Z21 - p3*Z31;
+    ex[-3] = Z41;
+    /* end of outer loop */
+  }
+  /* compute rows at end that are not a multiple of block size */
+  for (; i < n; i++) {
+    /* compute all 1 x 1 block of X, from rows i..i+1-1 */
+    /* set the Z matrix to 0 */
+    Z11=0;
+    ell = L - i; // last row of L, column n-1-i
+    ex = B; // last entry of B; the i entries from here backwards already hold x
+    /* the inner loop that computes outer products and adds them to Z */
+    for (j=i-4; j >= 0; j -= 4) { // 4 rows of L are accumulated per pass
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[0];
+      /* compute outer product and add it to the Z matrix */
+      m11 = p1 * q1;
+      ell += lskip1;
+      Z11 += m11;
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[-1];
+      /* compute outer product and add it to the Z matrix */
+      m11 = p1 * q1;
+      ell += lskip1;
+      Z11 += m11;
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[-2];
+      /* compute outer product and add it to the Z matrix */
+      m11 = p1 * q1;
+      ell += lskip1;
+      Z11 += m11;
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[-3];
+      /* compute outer product and add it to the Z matrix */
+      m11 = p1 * q1;
+      ell += lskip1;
+      ex -= 4;
+      Z11 += m11;
+      /* end of inner loop */
+    }
+    /* compute left-over iterations */
+    j += 4; // j is now i%4, the number of rows not yet accumulated
+    for (; j > 0; j--) {
+      /* load p and q values */
+      p1=ell[0];
+      q1=ex[0];
+      /* compute outer product and add it to the Z matrix */
+      m11 = p1 * q1;
+      ell += lskip1;
+      ex -= 1;
+      Z11 += m11;
+    }
+    /* finish computing the X(i) block: ex now points i entries before the end of B */
+    Z11 = ex[0] - Z11;
+    ex[0] = Z11;
+  }
+void btVectorScale (btScalar *a, const btScalar *d, int n) /* In-place element-wise scale: a[i] *= d[i] for 0 <= i < n.
+ * a and d must be non-null and n must be >= 0 (checked with btAssert). */
+  btAssert (a && d && n >= 0);
+  for (int i=0; i<n; i++) {
+    a[i] *= d[i];
+  }
+void btSolveLDLT (const btScalar *L, const btScalar *d, btScalar *b, int n, int nskip) /* Solves the factored system in place on b in three steps.
+ * L is n*n with row stride nskip (nskip >= n is asserted); b holds the right hand side on entry and the solution on return.
+ * d is multiplied into b element-wise, so it is expected to hold the reciprocals of D's diagonal; btLCP in this file stores btRecip(...) values in the d it passes here. */
+  btAssert (L && d && b && n > 0 && nskip >= n);
+  btSolveL1 (L,b,n,nskip); // forward substitution with L
+  btVectorScale (b,d,n); // apply the diagonal
+  btSolveL1T (L,b,n,nskip); // back substitution with L^T
+// swap row/column i1 with i2 in the n*n matrix A. the leading dimension of
+// A is nskip. this only references and swaps the lower triangle.
+// if `do_fast_row_swaps' is nonzero and row pointers are being used, then
+// rows will be swapped by exchanging row pointers. otherwise the data will
+// be copied. i1 must be strictly less than i2 (asserted below). the flat-storage variant ignores do_fast_row_swaps.
+static void btSwapRowsAndCols (BTATYPE A, int n, int i1, int i2, int nskip, 
+  int do_fast_row_swaps)
+  btAssert (A && n > 0 && i1 >= 0 && i2 >= 0 && i1 < n && i2 < n &&
+    nskip >= n && i1 < i2);
+# ifdef BTROWPTRS
+  btScalar *A_i1 = A[i1];
+  btScalar *A_i2 = A[i2];
+  for (int i=i1+1; i<i2; ++i) { // rows strictly between i1 and i2: move column i1 of row i into row i1, and row i2's entry into column i1 of row i
+    btScalar *A_i_i1 = A[i] + i1;
+    A_i1[i] = *A_i_i1;
+    *A_i_i1 = A_i2[i];
+  }
+  A_i1[i2] = A_i1[i1]; // rotate the three entries (i1,i1), (i2,i1), (i2,i2) ahead of the row exchange below
+  A_i1[i1] = A_i2[i1];
+  A_i2[i1] = A_i2[i2];
+  // swap rows, by swapping row pointers
+  if (do_fast_row_swaps) {
+    A[i1] = A_i2;
+    A[i2] = A_i1;
+  }
+  else {
+    // Only swap till i2 column to match A plain storage variant.
+    for (int k = 0; k <= i2; ++k) {
+      btScalar tmp = A_i1[k];
+      A_i1[k] = A_i2[k];
+      A_i2[k] = tmp;
+    }
+  }
+  // swap columns the hard way
+  for (int j=i2+1; j<n; ++j) { // rows below i2: exchange their column i1 and column i2 entries
+    btScalar *A_j = A[j];
+    btScalar tmp = A_j[i1];
+    A_j[i1] = A_j[i2];
+    A_j[i2] = tmp;
+  }
+# else
+  btScalar *A_i1 = A+i1*nskip; // start of row i1 in flat storage
+  btScalar *A_i2 = A+i2*nskip; // start of row i2 in flat storage
+  for (int k = 0; k < i1; ++k) { // columns left of i1: exchange the two rows directly
+    btScalar tmp = A_i1[k];
+    A_i1[k] = A_i2[k];
+    A_i2[k] = tmp;
+  }
+  btScalar *A_i = A_i1 + nskip;
+  for (int i=i1+1; i<i2; A_i+=nskip, ++i) { // rows strictly between: exchange row i2's entry in column i with row i's entry in column i1
+    btScalar tmp = A_i2[i];
+    A_i2[i] = A_i[i1];
+    A_i[i1] = tmp;
+  }
+  {
+    btScalar tmp = A_i1[i1]; // exchange the two diagonal entries
+    A_i1[i1] = A_i2[i2];
+    A_i2[i2] = tmp;
+  }
+  btScalar *A_j = A_i2 + nskip;
+  for (int j=i2+1; j<n; A_j+=nskip, ++j) { // rows below i2: exchange their column i1 and column i2 entries
+    btScalar tmp = A_j[i1];
+    A_j[i1] = A_j[i2];
+    A_j[i2] = tmp;
+  }
+# endif
+// swap two indexes in the n*n LCP problem. i1 must be <= i2. A, x, b, w, lo, hi, p, state and (when non-null) findex are all swapped; nothing is done when i1 == i2.
+static void btSwapProblem (BTATYPE A, btScalar *x, btScalar *b, btScalar *w, btScalar *lo,
+                         btScalar *hi, int *p, bool *state, int *findex,
+                         int n, int i1, int i2, int nskip,
+                         int do_fast_row_swaps)
+  btScalar tmpr;
+  int tmpi;
+  bool tmpb;
+  btAssert (n>0 && i1 >=0 && i2 >= 0 && i1 < n && i2 < n && nskip >= n && i1 <= i2);
+  if (i1==i2) return;
+  btSwapRowsAndCols (A,n,i1,i2,nskip,do_fast_row_swaps); // asserts i1 < i2, guaranteed by the early return above
+  tmpr = x[i1];
+  x[i1] = x[i2];
+  x[i2] = tmpr;
+  tmpr = b[i1];
+  b[i1] = b[i2];
+  b[i2] = tmpr;
+  tmpr = w[i1];
+  w[i1] = w[i2];
+  w[i2] = tmpr;
+  tmpr = lo[i1];
+  lo[i1] = lo[i2];
+  lo[i2] = tmpr;
+  tmpr = hi[i1];
+  hi[i1] = hi[i2];
+  hi[i2] = tmpr;
+  tmpi = p[i1];
+  p[i1] = p[i2];
+  p[i2] = tmpi;
+  tmpb = state[i1];
+  state[i1] = state[i2];
+  state[i2] = tmpb;
+  if (findex) { // findex is optional
+    tmpi = findex[i1];
+    findex[i1] = findex[i2];
+    findex[i2] = tmpi;
+  }
+// btLCP manipulator object. this represents an n*n LCP problem.
+// two index sets C and N are kept. each set holds a subset of
+// the variable indexes 0..n-1. an index can only be in one set.
+// initially both sets are empty.
+// the index set C is special: solutions to A(C,C)\A(C,i) can be generated.
+// fast implementation of btLCP. see the above definition of btLCP for
+// interface comments.
+// `p' records the permutation of A,x,b,w,etc. p is initially 1:n and is
+// permuted as the other vectors/matrices are permuted.
+// A,x,b,w,lo,hi,state,findex,p,c are permuted such that sets C,N have
+// contiguous indexes. the don't-care indexes follow N.
+// an L*D*L' factorization is maintained of A(C,C), and whenever indexes are
+// added or removed from the set C the factorization is updated.
+// thus L*D*L'=A[C,C], i.e. a permuted top left nC*nC submatrix of A.
+// the leading dimension of the matrix L is always `nskip'.
+// at the start there may be other indexes that are unbounded but are not
+// included in `nub'. btLCP will permute the problem so that absolutely all
+// unbounded variables are at the start. thus there may be some initial
+// permutation.
+// the algorithms here assume certain patterns, particularly with respect to
+// index transfer.
+#ifdef btLCP_FAST
+struct btLCP // state of one n*n LCP problem; it stores the caller's buffers as pointers and never allocates or frees them here
+	const int m_n; // problem size
+	const int m_nskip; // leading dimension used for A and L
+	int m_nub; // number of leading unbounded variables; may be increased by the constructor
+	int m_nC, m_nN;				// size of each index set
+	BTATYPE const m_A;				// A rows
+	btScalar *const m_x, * const m_b, *const m_w, *const m_lo,* const m_hi;	// permuted LCP problem data
+	btScalar *const m_L, *const m_d;				// L*D*L' factorization of set C
+	btScalar *const m_Dell, *const m_ell, *const m_tmp; // work vectors; Dell and ell are filled by solve1() and transfer_i_from_N_to_C()
+	bool *const m_state;
+	int *const m_findex, *const m_p, *const m_C; // m_p records the permutation applied so far; m_C lists the indexes in set C
+	btLCP (int _n, int _nskip, int _nub, btScalar *_Adata, btScalar *_x, btScalar *_b, btScalar *_w,
+		btScalar *_lo, btScalar *_hi, btScalar *l, btScalar *_d,
+		btScalar *_Dell, btScalar *_ell, btScalar *_tmp,
+		bool *_state, int *_findex, int *p, int *c, btScalar **Arows);
+	int getNub() const { return m_nub; }
+	void transfer_i_to_C (int i);
+	void transfer_i_to_N (int i) { m_nN++; }			// because we can assume C and N span 1:i-1
+	void transfer_i_from_N_to_C (int i);
+	void transfer_i_from_C_to_N (int i, btAlignedObjectArray<btScalar>& scratch);
+	int numC() const { return m_nC; }
+	int numN() const { return m_nN; }
+	int indexC (int i) const { return i; } // C occupies positions 0..m_nC-1
+	int indexN (int i) const { return i+m_nC; } // N follows C directly
+	btScalar Aii (int i) const  { return BTAROW(i)[i]; } // diagonal entry of row i
+	btScalar AiC_times_qC (int i, btScalar *q) const { return btLargeDot (BTAROW(i), q, m_nC); } // dot product over the first m_nC entries of row i and q
+	btScalar AiN_times_qN (int i, btScalar *q) const { return btLargeDot (BTAROW(i)+m_nC, q+m_nC, m_nN); } // dot product over the m_nN entries that follow
+	void pN_equals_ANC_times_qC (btScalar *p, btScalar *q);
+	void pN_plusequals_ANi (btScalar *p, int i, int sign=1);
+	void pC_plusequals_s_times_qC (btScalar *p, btScalar s, btScalar *q);
+	void pN_plusequals_s_times_qN (btScalar *p, btScalar s, btScalar *q);
+	void solve1 (btScalar *a, int i, int dir=1, int only_transfer=0);
+	void unpermute();
+btLCP::btLCP (int _n, int _nskip, int _nub, btScalar *_Adata, btScalar *_x, btScalar *_b, btScalar *_w,
+            btScalar *_lo, btScalar *_hi, btScalar *l, btScalar *_d,
+            btScalar *_Dell, btScalar *_ell, btScalar *_tmp,
+            bool *_state, int *_findex, int *p, int *c, btScalar **Arows): /* Stores the caller's buffers, zeroes x, builds the identity permutation, moves unbounded variables to the front, factors and solves for them, then moves findex variables to the end. */
+  m_n(_n), m_nskip(_nskip), m_nub(_nub), m_nC(0), m_nN(0),
+# ifdef BTROWPTRS
+  m_A(Arows),
+  m_A(_Adata),
+  m_x(_x), m_b(_b), m_w(_w), m_lo(_lo), m_hi(_hi),
+  m_L(l), m_d(_d), m_Dell(_Dell), m_ell(_ell), m_tmp(_tmp),
+  m_state(_state), m_findex(_findex), m_p(p), m_C(c)
+  {
+    btSetZero (m_x,m_n); // x starts at zero
+  }
+  {
+# ifdef BTROWPTRS
+    // make matrix row pointers
+    btScalar *aptr = _Adata;
+    BTATYPE A = m_A;
+    const int n = m_n, nskip = m_nskip;
+    for (int k=0; k<n; aptr+=nskip, ++k) A[k] = aptr;
+# endif
+  }
+  {
+    int *p = m_p;
+    const int n = m_n;
+    for (int k=0; k<n; ++k) p[k]=k;		// initially unpermuted
+  }
+  /*
+  // for testing, we can do some random swaps in the area i > nub
+  {
+    const int n = m_n;
+    const int nub = m_nub;
+    if (nub < n) {
+    for (int k=0; k<100; k++) {
+      int i1,i2;
+      do {
+        i1 = dRandInt(n-nub)+nub;
+        i2 = dRandInt(n-nub)+nub;
+      }
+      while (i1 > i2); 
+      //printf ("--> %d %d\n",i1,i2);
+      btSwapProblem (m_A,m_x,m_b,m_w,m_lo,m_hi,m_p,m_state,m_findex,n,i1,i2,m_nskip,0);
+    }
+  }
+  */
+  // permute the problem so that *all* the unbounded variables are at the
+  // start, i.e. look for unbounded variables not included in `nub'. we can
+  // potentially push up `nub' this way and get a bigger initial factorization.
+  // note that when we swap rows/cols here we must not just swap row pointers,
+  // as the initial factorization relies on the data being all in one chunk.
+  // variables that have findex >= 0 are *not* considered to be unbounded even
+  // if lo=-inf and hi=inf - this is because these limits may change during the
+  // solution process.
+  {
+    int *findex = m_findex;
+    btScalar *lo = m_lo, *hi = m_hi;
+    const int n = m_n;
+    for (int k = m_nub; k<n; ++k) {
+      if (findex && findex[k] >= 0) continue;
+      if (lo[k]==-BT_INFINITY && hi[k]==BT_INFINITY) {
+        btSwapProblem (m_A,m_x,m_b,m_w,lo,hi,m_p,m_state,findex,n,m_nub,k,m_nskip,0); // last argument 0: copy the data rather than exchange row pointers
+        m_nub++;
+      }
+    }
+  }
+  // if there are unbounded variables at the start, factorize A up to that
+  // point and solve for x. this puts all indexes 0..nub-1 into C.
+  if (m_nub > 0) {
+    const int nub = m_nub;
+    {
+      btScalar *Lrow = m_L;
+      const int nskip = m_nskip;
+      for (int j=0; j<nub; Lrow+=nskip, ++j) memcpy(Lrow,BTAROW(j),(j+1)*sizeof(btScalar)); // copy row j of A up to and including the diagonal
+    }
+    btFactorLDLT (m_L,m_d,nub,m_nskip);
+    memcpy (m_x,m_b,nub*sizeof(btScalar)); // btSolveLDLT works in place, so start from a copy of b
+    btSolveLDLT (m_L,m_d,m_x,nub,m_nskip);
+    btSetZero (m_w,nub);
+    {
+      int *C = m_C;
+      for (int k=0; k<nub; ++k) C[k] = k;
+    }
+    m_nC = nub;
+  }
+  // permute the indexes > nub such that all findex variables are at the end
+  if (m_findex) {
+    const int nub = m_nub;
+    int *findex = m_findex;
+    int num_at_end = 0;
+    for (int k=m_n-1; k >= nub; k--) { // scan backwards so that the swap target m_n-1-num_at_end is never below k
+      if (findex[k] >= 0) {
+        btSwapProblem (m_A,m_x,m_b,m_w,m_lo,m_hi,m_p,m_state,findex,m_n,k,m_n-1-num_at_end,m_nskip,1);
+        num_at_end++;
+      }
+    }
+  }
+  // print info about indexes
+  /*
+  {
+    const int n = m_n;
+    const int nub = m_nub;
+    for (int k=0; k<n; k++) {
+      if (k<nub) printf ("C");
+      else if (m_lo[k]==-BT_INFINITY && m_hi[k]==BT_INFINITY) printf ("c");
+      else printf (".");
+    }
+    printf ("\n");
+  }
+  */
+void btLCP::transfer_i_to_C (int i) /* Appends index i to set C: adds a row to L and an entry to d, swaps i into position m_nC, then increments m_nC.
+ * Uses m_ell and m_Dell as they currently stand; they are not recomputed here. */
+  {
+    if (m_nC > 0) {
+      // ell,Dell were computed by solve1(). note, ell = D \ L1solve (L,A(i,C))
+      {
+        const int nC = m_nC;
+        btScalar *const Ltgt = m_L + nC*m_nskip, *ell = m_ell; // Ltgt is the new row nC of L
+        for (int j=0; j<nC; ++j) Ltgt[j] = ell[j];
+      }
+      const int nC = m_nC;
+      m_d[nC] = btRecip (BTAROW(i)[i] - btLargeDot(m_ell,m_Dell,nC)); // d stores the reciprocal of the new diagonal term
+    }
+    else {
+      m_d[0] = btRecip (BTAROW(i)[i]); // C is empty, so only the first entry of d is set
+    }
+    btSwapProblem (m_A,m_x,m_b,m_w,m_lo,m_hi,m_p,m_state,m_findex,m_n,m_nC,i,m_nskip,1);
+    const int nC = m_nC;
+    m_C[nC] = nC;
+    m_nC = nC + 1; // nC value is outdated after this line
+  }
+void btLCP::transfer_i_from_N_to_C (int i) /* Moves index i from set N to set C. Unlike transfer_i_to_C, this recomputes m_Dell and m_ell for row i
+ * before adding the new row of L and entry of d; it then swaps i into position m_nC, decrements m_nN and increments m_nC. */
+  {
+    if (m_nC > 0) {
+      {
+        btScalar *const aptr = BTAROW(i);
+        btScalar *Dell = m_Dell;
+        const int *C = m_C;
+        // if nub>0, initial part of aptr unpermuted
+        const int nub = m_nub;
+        int j=0;
+        for ( ; j<nub; ++j) Dell[j] = aptr[j];
+        const int nC = m_nC;
+        for ( ; j<nC; ++j) Dell[j] = aptr[C[j]];
+#   else
+        const int nC = m_nC;
+        for (int j=0; j<nC; ++j) Dell[j] = aptr[C[j]];
+#   endif
+      }
+      btSolveL1 (m_L,m_Dell,m_nC,m_nskip); // in place: Dell is overwritten with the solution
+      {
+        const int nC = m_nC;
+        btScalar *const Ltgt = m_L + nC*m_nskip; // the new row nC of L
+        btScalar *ell = m_ell, *Dell = m_Dell, *d = m_d;
+        for (int j=0; j<nC; ++j) Ltgt[j] = ell[j] = Dell[j] * d[j];
+      }
+      const int nC = m_nC;
+      m_d[nC] = btRecip (BTAROW(i)[i] - btLargeDot(m_ell,m_Dell,nC)); // d stores the reciprocal of the new diagonal term
+    }
+    else {
+      m_d[0] = btRecip (BTAROW(i)[i]); // C is empty, so only the first entry of d is set
+    }
+    btSwapProblem (m_A,m_x,m_b,m_w,m_lo,m_hi,m_p,m_state,m_findex,m_n,m_nC,i,m_nskip,1);
+    const int nC = m_nC;
+    m_C[nC] = nC;
+    m_nN--;
+    m_nC = nC + 1; // nC value is outdated after this line
+  }
+  // @@@ TO DO LATER
+  // if we just finish here then we'll go back and re-solve for
+  // delta_x. but actually we can be more efficient and incrementally
+  // update delta_x here. but if we do this, we won't have ell and Dell
+  // to use in updating the factorization later.
+void btRemoveRowCol (btScalar *A, int n, int nskip, int r) /* Removes row r and column r from the n*n matrix A (row stride nskip) by shifting the later rows up and the later columns left, in place.
+ * Nothing is moved when r is the last row/column. */
+  btAssert(A && n > 0 && nskip >= n && r >= 0 && r < n);
+  if (r >= n-1) return;
+  if (r > 0) {
+    {
+      const size_t move_size = (n-r-1)*sizeof(btScalar);
+      btScalar *Adst = A + r;
+      for (int i=0; i<r; Adst+=nskip,++i) { // rows above r: shift the entries right of column r one place left
+        btScalar *Asrc = Adst + 1;
+        memmove (Adst,Asrc,move_size); // source and destination overlap within the row, hence memmove
+      }
+    }
+    {
+      const size_t cpy_size = r*sizeof(btScalar);
+      btScalar *Adst = A + r * nskip;
+      for (int i=r; i<(n-1); ++i) { // rows from r down: pull the first r entries of the next row up
+        btScalar *Asrc = Adst + nskip;
+        memcpy (Adst,Asrc,cpy_size);
+        Adst = Asrc;
+      }
+    }
+  }
+  {
+    const size_t cpy_size = (n-r-1)*sizeof(btScalar);
+    btScalar *Adst = A + r * (nskip + 1); // element (r,r)
+    for (int i=r; i<(n-1); ++i) { // rows from r down: pull the entries right of column r in the next row up and one place left
+      btScalar *Asrc = Adst + (nskip + 1);
+      memcpy (Adst,Asrc,cpy_size);
+      Adst = Asrc - 1;
+    }
+  }
+void btLDLTAddTL (btScalar *L, btScalar *d, const btScalar *a, int n, int nskip, btAlignedObjectArray<btScalar>& scratch) /* Updates the factorization held in L (row stride nskip) and d in place from the n-vector a.
+ * Returns without touching anything when n < 2. scratch is resized to 2*nskip and used for the two work vectors W1 and W2.
+ * d[0] and column 0 of L are read but never written; d[j] and the sub-diagonal of column j are rewritten for j >= 1.
+ * NOTE(review): btLDLTRemove passes an `a' that points into this same scratch array past index 2*nskip, so this relies on
+ * resize() to a smaller size not moving or clearing that storage - confirm against btAlignedObjectArray. */
+  btAssert (L && d && a && n > 0 && nskip >= n);
+  if (n < 2) return;
+  scratch.resize(2*nskip);
+  btScalar *W1 = &scratch[0];
+  btScalar *W2 = W1 + nskip;
+  W1[0] = btScalar(0.0);
+  W2[0] = btScalar(0.0);
+  for (int j=1; j<n; ++j) {
+    W1[j] = W2[j] = (btScalar) (a[j] * SIMDSQRT12);
+  }
+  btScalar W11 = (btScalar) ((btScalar(0.5)*a[0]+1)*SIMDSQRT12);
+  btScalar W21 = (btScalar) ((btScalar(0.5)*a[0]-1)*SIMDSQRT12);
+  btScalar alpha1 = btScalar(1.0);
+  btScalar alpha2 = btScalar(1.0);
+  {
+    btScalar dee = d[0]; // local copy; d[0] itself is not written back
+    btScalar alphanew = alpha1 + (W11*W11)*dee;
+    btAssert(alphanew != btScalar(0.0));
+    dee /= alphanew;
+    btScalar gamma1 = W11 * dee;
+    dee *= alpha1;
+    alpha1 = alphanew;
+    alphanew = alpha2 - (W21*W21)*dee;
+    dee /= alphanew;
+    //btScalar gamma2 = W21 * dee;
+    alpha2 = alphanew;
+    btScalar k1 = btScalar(1.0) - W21*gamma1;
+    btScalar k2 = W21*gamma1*W11 - W21;
+    btScalar *ll = L + nskip; // element (1,0): walks down column 0
+    for (int p=1; p<n; ll+=nskip, ++p) {
+      btScalar Wp = W1[p];
+      btScalar ell = *ll;
+      W1[p] =    Wp - W11*ell;
+      W2[p] = k1*Wp +  k2*ell;
+    }
+  }
+  btScalar *ll = L + (nskip + 1); // element (1,1): walks down the diagonal
+  for (int j=1; j<n; ll+=nskip+1, ++j) {
+    btScalar k1 = W1[j];
+    btScalar k2 = W2[j];
+    btScalar dee = d[j];
+    btScalar alphanew = alpha1 + (k1*k1)*dee;
+    btAssert(alphanew != btScalar(0.0));
+    dee /= alphanew;
+    btScalar gamma1 = k1 * dee;
+    dee *= alpha1;
+    alpha1 = alphanew;
+    alphanew = alpha2 - (k2*k2)*dee;
+    dee /= alphanew;
+    btScalar gamma2 = k2 * dee;
+    dee *= alpha2;
+    d[j] = dee;
+    alpha2 = alphanew;
+    btScalar *l = ll + nskip; // element (j+1,j): walks down column j below the diagonal
+    for (int p=j+1; p<n; l+=nskip, ++p) {
+      btScalar ell = *l;
+      btScalar Wp = W1[p] - k1 * ell;
+      ell += gamma1 * Wp;
+      W1[p] = Wp;
+      Wp = W2[p] - k2 * ell;
+      ell -= gamma2 * Wp;
+      W2[p] = Wp;
+      *l = ell;
+    }
+  }
+#define _BTGETA(i,j) (A[i][j]) // raw element access through row pointers; expects a variable named A in scope
+//#define _GETA(i,j) (A[(i)*nskip+(j)])
+#define BTGETA(i,j) ((i > j) ? _BTGETA(i,j) : _BTGETA(j,i)) // always reads A with the larger index first; NOTE(review): i and j are not parenthesised and are evaluated twice, so pass simple side-effect-free expressions
+inline size_t btEstimateLDLTAddTLTmpbufSize(int nskip) // size in bytes of 2*nskip scalars, matching the scratch.resize(2*nskip) done in btLDLTAddTL
+  return nskip * 2 * sizeof(btScalar);
+void btLDLTRemove (btScalar **A, const int *p, btScalar *L, btScalar *d,
+    int n1, int n2, int r, int nskip, btAlignedObjectArray<btScalar>& scratch) /* Removes row/column r from the n2*n2 factorization held in L (row stride nskip) and d.
+ * A is read through row pointers via BTGETA, indexed by the entries of p; every p[i] for i < n2 must lie in [0, n1).
+ * scratch is resized to 2*nskip+n2: the first 2*nskip scalars are left for btLDLTAddTL, the last n2 hold the vector built here. */
+  btAssert(A && p && L && d && n1 > 0 && n2 > 0 && r >= 0 && r < n2 &&
+	   n1 >= n2 && nskip >= n1);
+  #ifdef BT_DEBUG
+	for (int i=0; i<n2; ++i) 
+		btAssert(p[i] >= 0 && p[i] < n1);
+  #endif
+  if (r==n2-1) {
+    return;		// deleting last row/col is easy
+  }
+  else {
+    size_t LDLTAddTL_size = btEstimateLDLTAddTLTmpbufSize(nskip);
+    btAssert(LDLTAddTL_size % sizeof(btScalar) == 0);
+	scratch.resize(nskip * 2+n2);
+    btScalar *tmp = &scratch[0];
+    if (r==0) {
+      btScalar *a = (btScalar *)((char *)tmp + LDLTAddTL_size); // a starts right after the 2*nskip scalars left for btLDLTAddTL
+      const int p_0 = p[0];
+      for (int i=0; i<n2; ++i) {
+        a[i] = -BTGETA(p[i],p_0);
+      }
+      a[0] += btScalar(1.0);
+      btLDLTAddTL (L,d,a,n2,nskip,scratch);
+    }
+    else {
+      btScalar *t = (btScalar *)((char *)tmp + LDLTAddTL_size); // t (r entries) and then a (n2-r entries) share the last n2 scalars of scratch
+      {
+        btScalar *Lcurr = L + r*nskip;
+        for (int i=0; i<r; ++Lcurr, ++i) {
+          btAssert(d[i] != btScalar(0.0));
+          t[i] = *Lcurr / d[i];
+        }
+      }
+      btScalar *a = t + r;
+      {
+        btScalar *Lcurr = L + r*nskip;
+        const int *pp_r = p + r, p_r = *pp_r;
+        const int n2_minus_r = n2-r;
+        for (int i=0; i<n2_minus_r; Lcurr+=nskip,++i) {
+          a[i] = btLargeDot(Lcurr,t,r) - BTGETA(pp_r[i],p_r);
+        }
+      }
+      a[0] += btScalar(1.0);
+      btLDLTAddTL (L + r*nskip+r, d+r, a, n2-r, nskip, scratch); // update only the trailing (n2-r)*(n2-r) block, which starts at element (r,r)
+    }
+  }
+  // snip out row/column r from L and d
+  btRemoveRowCol (L,n2,nskip,r);
+  if (r < (n2-1)) memmove (d+r,d+r+1,(n2-r-1)*sizeof(btScalar));
+void btLCP::transfer_i_from_C_to_N (int i, btAlignedObjectArray<btScalar>& scratch) /* Moves index i from set C to set N: removes its row/column from the factorization, fixes up m_C,
+ * swaps i with position m_nC-1, then increments m_nN and decrements m_nC. scratch is handed to btLDLTRemove as work space. */
+  {
+    int *C = m_C;
+    // remove a row/column from the factorization, and adjust the
+    // indexes (black magic!)
+    int last_idx = -1; // position in C of the value nC-1, if it was seen before i was found
+    const int nC = m_nC;
+    int j = 0;
+    for ( ; j<nC; ++j) {
+      if (C[j]==nC-1) {
+        last_idx = j;
+      }
+      if (C[j]==i) {
+        btLDLTRemove (m_A,C,m_L,m_d,m_n,nC,j,m_nskip,scratch);
+        int k;
+        if (last_idx == -1) { // not seen yet, so search the rest of C for the value nC-1
+          for (k=j+1 ; k<nC; ++k) {
+            if (C[k]==nC-1) {
+              break;
+            }
+          }
+          btAssert (k < nC);
+        }
+        else {
+          k = last_idx;
+        }
+        C[k] = C[j]; // the entry that held nC-1 now refers to i, matching the swap of i and nC-1 below
+        if (j < (nC-1)) memmove (C+j,C+j+1,(nC-j-1)*sizeof(int)); // close the gap left at position j
+        break;
+      }
+    }
+    btAssert (j < nC); // i must have been found in C
+    btSwapProblem (m_A,m_x,m_b,m_w,m_lo,m_hi,m_p,m_state,m_findex,m_n,i,nC-1,m_nskip,1);
+    m_nN++;
+    m_nC = nC - 1; // nC value is outdated after this line
+  }
+void btLCP::pN_equals_ANC_times_qC (btScalar *p, btScalar *q) // for each of the m_nN rows following C, stores the dot product of the row's first m_nC entries with q into p at the same index
+  // we could try to make this matrix-vector multiplication faster using
+  // outer product matrix tricks, e.g. with the dMultidotX() functions.
+  // but i tried it and it actually made things slower on random 100x100
+  // problems because of the overhead involved. so we'll stick with the
+  // simple method for now.
+  const int nC = m_nC;
+  btScalar *ptgt = p + nC; // the N part of p starts after the C part
+  const int nN = m_nN;
+  for (int i=0; i<nN; ++i) {
+    ptgt[i] = btLargeDot (BTAROW(i+nC),q,nC);
+  }
+void btLCP::pN_plusequals_ANi (btScalar *p, int i, int sign) // adds (sign > 0) or subtracts (otherwise) the N part of row i of A to/from the N part of p
+  const int nC = m_nC;
+  btScalar *aptr = BTAROW(i) + nC; // entries of row i at the N positions
+  btScalar *ptgt = p + nC;
+  if (sign > 0) {
+    const int nN = m_nN;
+    for (int j=0; j<nN; ++j) ptgt[j] += aptr[j];
+  }
+  else {
+    const int nN = m_nN;
+    for (int j=0; j<nN; ++j) ptgt[j] -= aptr[j];
+  }
+void btLCP::pC_plusequals_s_times_qC (btScalar *p, btScalar s, btScalar *q) // p[i] += s*q[i] over the C positions 0..m_nC-1
+  const int nC = m_nC;
+  for (int i=0; i<nC; ++i) {
+    p[i] += s*q[i];
+  }
+void btLCP::pN_plusequals_s_times_qN (btScalar *p, btScalar s, btScalar *q) // p[i] += s*q[i] over the N positions m_nC..m_nC+m_nN-1
+  const int nC = m_nC;
+  btScalar *ptgt = p + nC, *qsrc = q + nC;
+  const int nN = m_nN;
+  for (int i=0; i<nN; ++i) {
+    ptgt[i] += s*qsrc[i];
+  }
+void btLCP::solve1 (btScalar *a, int i, int dir, int only_transfer) /* Fills m_Dell and m_ell from row i of A and, unless only_transfer is nonzero, writes a solution into a.
+ * a is written only at the indexes listed in m_C: the negated solution when dir > 0, the solution itself otherwise.
+ * Does nothing when set C is empty. */
+  // the `Dell' and `ell' that are computed here are saved. if index i is
+  // later added to the factorization then they can be reused.
+  //
+  // @@@ question: do we need to solve for entire delta_x??? yes, but
+  //     only if an x goes below 0 during the step.
+  if (m_nC > 0) {
+    {
+      btScalar *Dell = m_Dell;
+      int *C = m_C;
+      btScalar *aptr = BTAROW(i);
+      // if nub>0, initial part of aptr[] is guaranteed unpermuted
+      const int nub = m_nub;
+      int j=0;
+      for ( ; j<nub; ++j) Dell[j] = aptr[j];
+      const int nC = m_nC;
+      for ( ; j<nC; ++j) Dell[j] = aptr[C[j]];
+#   else
+      const int nC = m_nC;
+      for (int j=0; j<nC; ++j) Dell[j] = aptr[C[j]];
+#   endif
+    }
+    btSolveL1 (m_L,m_Dell,m_nC,m_nskip); // in place: Dell is overwritten with the solution
+    {
+      btScalar *ell = m_ell, *Dell = m_Dell, *d = m_d;
+      const int nC = m_nC;
+      for (int j=0; j<nC; ++j) ell[j] = Dell[j] * d[j];
+    }
+    if (!only_transfer) {
+      btScalar *tmp = m_tmp, *ell = m_ell;
+      {
+        const int nC = m_nC;
+        for (int j=0; j<nC; ++j) tmp[j] = ell[j]; // work on a copy so that ell survives the in-place solve below
+      }
+      btSolveL1T (m_L,tmp,m_nC,m_nskip);
+      if (dir > 0) {
+        int *C = m_C;
+        btScalar *tmp = m_tmp;
+        const int nC = m_nC;
+        for (int j=0; j<nC; ++j) a[C[j]] = -tmp[j];
+      } else {
+        int *C = m_C;
+        btScalar *tmp = m_tmp;
+        const int nC = m_nC;
+        for (int j=0; j<nC; ++j) a[C[j]] = tmp[j];
+      }
+    }
+  }
+void btLCP::unpermute()
+  // now we have to un-permute x and w
+  {
+    memcpy (m_tmp,m_x,m_n*sizeof(btScalar));
+    btScalar *x = m_x, *tmp = m_tmp;
+    const int *p = m_p;
+    const int n = m_n;
+    for (int j=0; j<n; ++j) x[p[j]] = tmp[j];
+  }
+  {
+    memcpy (m_tmp,m_w,m_n*sizeof(btScalar));
+    btScalar *w = m_w, *tmp = m_tmp;
+    const int *p = m_p;
+    const int n = m_n;
+    for (int j=0; j<n; ++j) w[p[j]] = tmp[j];
+  }
+#endif // btLCP_FAST
+// an optimized Dantzig LCP driver routine for the lo-hi LCP problem. results are written to x and outer_w; returns false when the solve was flagged as failed (s_error).
+bool btSolveDantzigLCP (int n, btScalar *A, btScalar *x, btScalar *b,
+                btScalar* outer_w, int nub, btScalar *lo, btScalar *hi, int *findex, btDantzigScratchMemory& scratchMem)
+	s_error = false;  // reset the error flag that determines the return value
+//	printf("btSolveDantzigLCP n=%d\n",n);
+  btAssert (n>0 && A && x && b && lo && hi && nub >= 0 && nub <= n);
+  btAssert(outer_w);
+#ifdef BT_DEBUG
+  {
+    // check restrictions on lo and hi
+    for (int k=0; k<n; ++k) 
+		btAssert (lo[k] <= 0 && hi[k] >= 0);
+  }
+# endif
+  // if all the variables are unbounded then we can just factor, solve,
+  // and return
+  if (nub >= n) 
+  {
+    int nskip = (n);
+    btFactorLDLT (A, outer_w, n, nskip);
+    btSolveLDLT (A, outer_w, b, n, nskip);
+    memcpy (x, b, n*sizeof(btScalar));  // the solution was produced in b; hand it back through x
+    return !s_error;
+  }
+  const int nskip = (n);
+  scratchMem.L.resize(n*nskip);
+  scratchMem.d.resize(n);
+  btScalar *w = outer_w;
+  scratchMem.delta_w.resize(n);
+  scratchMem.delta_x.resize(n);
+  scratchMem.Dell.resize(n);
+  scratchMem.ell.resize(n);
+  scratchMem.Arows.resize(n);
+  scratchMem.p.resize(n);
+  scratchMem.C.resize(n);
+  // for i in N, state[i] is 0 if x(i)==lo(i) or 1 if x(i)==hi(i)
+  scratchMem.state.resize(n);
+  // create LCP object. note that tmp is set to delta_w to save space, this
+  // optimization relies on knowledge of how tmp is used, so be careful!
+  btLCP lcp(n,nskip,nub,A,x,b,w,lo,hi,&scratchMem.L[0],&scratchMem.d[0],&scratchMem.Dell[0],&scratchMem.ell[0],&scratchMem.delta_w[0],&scratchMem.state[0],findex,&scratchMem.p[0],&scratchMem.C[0],&scratchMem.Arows[0]);
+  int adj_nub = lcp.getNub();  // may differ from the nub passed in; the btLCP constructor is outside this view
+  // loop over all indexes adj_nub..n-1. for index i, if x(i),w(i) satisfy the
+  // LCP conditions then i is added to the appropriate index set. otherwise
+  // x(i),w(i) is driven either +ve or -ve to force it to the valid region.
+  // as we drive x(i), x(C) is also adjusted to keep w(C) at zero.
+  // while driving x(i) we maintain the LCP conditions on the other variables
+  // 0..i-1. we do this by watching out for other x(i),w(i) values going
+  // outside the valid region, and then switching them between index sets
+  // when that happens.
+  bool hit_first_friction_index = false;
+  for (int i=adj_nub; i<n; ++i) 
+  {
+    s_error = false;
+    // the index i is the driving index and indexes i+1..n-1 are "dont care",
+    // i.e. when we make changes to the system those x's will be zero and we
+    // don't care what happens to those w's. in other words, we only consider
+    // an (i+1)*(i+1) sub-problem of A*x=b+w.
+    // if we've hit the first friction index, we have to compute the lo and
+    // hi values based on the values of x already computed. we have been
+    // permuting the indexes, so the values stored in the findex vector are
+    // no longer valid. thus we have to temporarily unpermute the x vector. 
+    // for the purposes of this computation, 0*infinity = 0 ... so if the
+    // contact constraint's normal force is 0, there should be no tangential
+    // force applied.
+    if (!hit_first_friction_index && findex && findex[i] >= 0) {
+      // un-permute x into delta_w, which is not being used at the moment
+      for (int j=0; j<n; ++j) scratchMem.delta_w[scratchMem.p[j]] = x[j];
+      // set lo and hi values
+      for (int k=i; k<n; ++k) {
+        btScalar wfk = scratchMem.delta_w[findex[k]];  // NOTE(review): relies on findex[k] >= 0 for every k >= i -- confirm the ordering is established by btLCP
+        if (wfk == 0) {
+          hi[k] = 0;
+          lo[k] = 0;
+        }
+        else {
+          hi[k] = btFabs (hi[k] * wfk);
+          lo[k] = -hi[k];
+        }
+      }
+      hit_first_friction_index = true;  // the bounds are rewritten only once per call
+    }
+    // thus far we have not even been computing the w values for indexes
+    // greater than i, so compute w[i] now.
+    w[i] = lcp.AiC_times_qC (i,x) + lcp.AiN_times_qN (i,x) - b[i];
+    // if lo=hi=0 (which can happen for tangential friction when normals are
+    // 0) then the index will be assigned to set N with some state. however,
+    // set C's line has zero size, so the index will always remain in set N.
+    // with the "normal" switching logic, if w changed sign then the index
+    // would have to switch to set C and then back to set N with an inverted
+    // state. this is pointless, and also computationally expensive. to
+    // prevent this from happening, we use the rule that indexes with lo=hi=0
+    // will never be checked for set changes. this means that the state for
+    // these indexes may be incorrect, but that doesn't matter.
+    // see if x(i),w(i) is in a valid region
+    if (lo[i]==0 && w[i] >= 0) {
+      lcp.transfer_i_to_N (i);
+      scratchMem.state[i] = false;
+    }
+    else if (hi[i]==0 && w[i] <= 0) {
+      lcp.transfer_i_to_N (i);
+      scratchMem.state[i] = true;
+    }
+    else if (w[i]==0) {
+      // this is a degenerate case. by the time we get to this test we know
+      // that lo != 0, which means that lo < 0 as lo is not allowed to be +ve,
+      // and similarly that hi > 0. this means that the line segment
+      // corresponding to set C is at least finite in extent, and we are on it.
+      // NOTE: we must call lcp.solve1() before lcp.transfer_i_to_C()
+      lcp.solve1 (&scratchMem.delta_x[0],i,0,1);
+      lcp.transfer_i_to_C (i);
+    }
+    else {
+      // we must push x(i) and w(i)
+      for (;;) {
+        int dir;
+        btScalar dirf;
+        // find direction to push on x(i)
+        if (w[i] <= 0) {
+          dir = 1;
+          dirf = btScalar(1.0);
+        }
+        else {
+          dir = -1;
+          dirf = btScalar(-1.0);
+        }
+        // compute: delta_x(C) = -dir*A(C,C)\A(C,i)
+        lcp.solve1 (&scratchMem.delta_x[0],i,dir);
+        // note that delta_x[i] = dirf, but we won't bother to set it
+        // compute: delta_w = A*delta_x ... note we only care about
+        // delta_w(N) and delta_w(i), the rest is ignored
+        lcp.pN_equals_ANC_times_qC (&scratchMem.delta_w[0],&scratchMem.delta_x[0]);
+        lcp.pN_plusequals_ANi (&scratchMem.delta_w[0],i,dir);
+        scratchMem.delta_w[i] = lcp.AiC_times_qC (i,&scratchMem.delta_x[0]) + lcp.Aii(i)*dirf;
+        // find largest step we can take (size=s), either to drive x(i),w(i)
+        // to the valid LCP region or to drive an already-valid variable
+        // outside the valid region.
+        int cmd = 1;		// index switching command (1..3 finish index i, 4..6 move index si and keep going)
+        int si = 0;		// si = index to switch if cmd>3
+        btScalar s = -w[i]/scratchMem.delta_w[i];  // step that brings w(i) to zero; NOTE(review): no guard against delta_w[i]==0
+        if (dir > 0) {
+          if (hi[i] < BT_INFINITY) {
+            btScalar s2 = (hi[i]-x[i])*dirf;	// was (hi[i]-x[i])/dirf	// step to x(i)=hi(i)
+            if (s2 < s) {
+              s = s2;
+              cmd = 3;
+            }
+          }
+        }
+        else {
+          if (lo[i] > -BT_INFINITY) {
+            btScalar s2 = (lo[i]-x[i])*dirf;	// was (lo[i]-x[i])/dirf	// step to x(i)=lo(i)
+            if (s2 < s) {
+              s = s2;
+              cmd = 2;
+            }
+          }
+        }
+        {
+          const int numN = lcp.numN();
+          for (int k=0; k < numN; ++k) {
+            const int indexN_k = lcp.indexN(k);
+            if (!scratchMem.state[indexN_k] ? scratchMem.delta_w[indexN_k] < 0 : scratchMem.delta_w[indexN_k] > 0) {
+                // don't bother checking if lo=hi=0
+                if (lo[indexN_k] == 0 && hi[indexN_k] == 0) continue;
+                btScalar s2 = -w[indexN_k] / scratchMem.delta_w[indexN_k];  // step at which w of this N index reaches zero
+                if (s2 < s) {
+                  s = s2;
+                  cmd = 4;
+                  si = indexN_k;
+                }
+            }
+          }
+        }
+        {
+          const int numC = lcp.numC();
+          for (int k=adj_nub; k < numC; ++k) {  // starts at adj_nub: the leading unbounded entries of C are not checked
+            const int indexC_k = lcp.indexC(k);
+            if (scratchMem.delta_x[indexC_k] < 0 && lo[indexC_k] > -BT_INFINITY) {
+              btScalar s2 = (lo[indexC_k]-x[indexC_k]) / scratchMem.delta_x[indexC_k];  // step at which x of this C index hits lo
+              if (s2 < s) {
+                s = s2;
+                cmd = 5;
+                si = indexC_k;
+              }
+            }
+            if (scratchMem.delta_x[indexC_k] > 0 && hi[indexC_k] < BT_INFINITY) {
+              btScalar s2 = (hi[indexC_k]-x[indexC_k]) / scratchMem.delta_x[indexC_k];  // step at which x of this C index hits hi
+              if (s2 < s) {
+                s = s2;
+                cmd = 6;
+                si = indexC_k;
+              }
+            }
+          }
+        }
+        //static char* cmdstring[8] = {0,"->C","->NL","->NH","N->C",
+        //			     "C->NL","C->NH"};
+        //printf ("cmd=%d (%s), si=%d\n",cmd,cmdstring[cmd],(cmd>3) ? si : i);
+        // if s <= 0 then we've got a problem. if we just keep going then
+        // we're going to get stuck in an infinite loop. instead, just cross
+        // our fingers and exit with the current solution.
+        if (s <= btScalar(0.0)) 
+		{
+//          printf("LCP internal error, s <= 0 (s=%.4e)",(double)s);
+          if (i < n) {  // always true here: the enclosing loop only runs while i < n
+            btSetZero (x+i,n-i);
+            btSetZero (w+i,n-i);
+          }
+          s_error = true;
+          break;
+        }
+        // apply x = x + s * delta_x
+        lcp.pC_plusequals_s_times_qC (x, s, &scratchMem.delta_x[0]);
+        x[i] += s * dirf;
+        // apply w = w + s * delta_w
+        lcp.pN_plusequals_s_times_qN (w, s, &scratchMem.delta_w[0]);
+        w[i] += s * scratchMem.delta_w[i];
+//        void *tmpbuf;
+        // switch indexes between sets if necessary
+        switch (cmd) {
+        case 1:		// done
+          w[i] = 0;
+          lcp.transfer_i_to_C (i);
+          break;
+        case 2:		// done
+          x[i] = lo[i];
+          scratchMem.state[i] = false;
+          lcp.transfer_i_to_N (i);
+          break;
+        case 3:		// done
+          x[i] = hi[i];
+          scratchMem.state[i] = true;
+          lcp.transfer_i_to_N (i);
+          break;
+        case 4:		// keep going
+          w[si] = 0;
+          lcp.transfer_i_from_N_to_C (si);
+          break;
+        case 5:		// keep going
+          x[si] = lo[si];
+          scratchMem.state[si] = false;
+		  lcp.transfer_i_from_C_to_N (si, scratchMem.m_scratch);
+          break;
+        case 6:		// keep going
+          x[si] = hi[si];
+          scratchMem.state[si] = true;
+          lcp.transfer_i_from_C_to_N (si, scratchMem.m_scratch);
+          break;
+        }
+        if (cmd <= 3) break;  // commands 1..3 place index i itself, which ends the push loop
+      } // for (;;)
+    } // else
+    if (s_error) 
+	{
+      break;
+    }
+  } // for (int i=adj_nub; i<n; ++i)
+  lcp.unpermute();  // runs on the failure path as well
+  return !s_error;
diff --git a/src/bullet/BulletDynamics/MLCPSolvers/btDantzigLCP.h b/src/bullet/BulletDynamics/MLCPSolvers/btDantzigLCP.h
new file mode 100644
index 00000000..90383277
--- /dev/null
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btDantzigLCP.h
@@ -0,0 +1,77 @@
+ *                                                                       *
+ * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
+ * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
+ *                                                                       *
+ * This library is free software; you can redistribute it and/or         *
+ * modify it under the terms of                                          * 
+ *   The BSD-style license that is included with this library in         *
+ *   the file LICENSE-BSD.TXT.                                           *
+ *                                                                       *
+ * This library is distributed in the hope that it will be useful,       *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
+ *                                                                       *
+ *************************************************************************/
+given (A,b,lo,hi), solve the LCP problem: A*x = b+w, where each x(i),w(i)
+satisfies one of
+	(1) x = lo, w >= 0
+	(2) x = hi, w <= 0
+	(3) lo < x < hi, w = 0
+A is a matrix of dimension n*n, everything else is a vector of size n*1.
+lo and hi can be +/- BT_INFINITY as needed. the first `nub' variables are
+unbounded, i.e. hi and lo are assumed to be +/- BT_INFINITY.
+we restrict lo(i) <= 0 and hi(i) >= 0.
+the original data (A,b) may be modified by this function.
+if the `findex' (friction index) parameter is non-null, it points to an array
+of index values. in this case constraints that have findex[i] >= 0 are
+special. all non-special constraints are solved for, then the lo and hi values
+for the special constraints are set:
+  hi[i] = abs( hi[i] * x[findex[i]] )
+  lo[i] = -hi[i]
+and the solution continues. this mechanism allows a friction approximation
+to be implemented. the first `nub' variables are assumed to have findex < 0.
+#ifndef _BT_LCP_H_
+#define _BT_LCP_H_
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btAlignedObjectArray.h"
+struct btDantzigScratchMemory  // work buffers that btSolveDantzigLCP resizes and uses on each call
+	btAlignedObjectArray<btScalar> m_scratch;  // handed to btLCP::transfer_i_from_C_to_N by the solver
+	btAlignedObjectArray<btScalar> L;  // resized to n*nskip by the solver
+	btAlignedObjectArray<btScalar> d;  // resized to n
+	btAlignedObjectArray<btScalar> delta_w;  // resized to n; also passed to btLCP as its tmp buffer
+	btAlignedObjectArray<btScalar> delta_x;  // resized to n
+	btAlignedObjectArray<btScalar> Dell;  // resized to n
+	btAlignedObjectArray<btScalar> ell;  // resized to n
+	btAlignedObjectArray<btScalar*> Arows;  // resized to n; presumably one row pointer into A per index -- confirm in btLCP
+	btAlignedObjectArray<int> p;  // resized to n; index permutation used when un-permuting x
+	btAlignedObjectArray<int> C;  // resized to n
+	btAlignedObjectArray<bool> state;  // for i in N: false if x(i)==lo(i), true if x(i)==hi(i)
+//solves the lo-hi LCP A*x = b+w (n unknowns, the first nub of them unbounded) for x and w; returns false if solving failed
+bool btSolveDantzigLCP (int n, btScalar *A, btScalar *x, btScalar *b, btScalar *w,
+	int nub, btScalar *lo, btScalar *hi, int *findex,btDantzigScratchMemory& scratch);
+#endif //_BT_LCP_H_
diff --git a/src/bullet/BulletDynamics/MLCPSolvers/btDantzigSolver.h b/src/bullet/BulletDynamics/MLCPSolvers/btDantzigSolver.h
new file mode 100644
index 00000000..2a2f2d3d
--- /dev/null
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btDantzigSolver.h
@@ -0,0 +1,112 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///original version written by Erwin Coumans, October 2013
+#include "btMLCPSolverInterface.h"
+#include "btDantzigLCP.h"
+///btDantzigSolver adapts btSolveDantzigLCP to the btMLCPSolverInterface: the inputs are copied
+///into member buffers, solved there, and copied back into x only when the result is accepted.
+class btDantzigSolver : public btMLCPSolverInterface
+	///a solution entry whose magnitude reaches this bound makes solveMLCP report failure
+	btScalar m_acceptableUpperLimitSolution;
+	btAlignedObjectArray<char>	m_tempBuffer;
+	///working copies handed to btSolveDantzigLCP (its header notes that A and b may be modified)
+	btAlignedObjectArray<btScalar> m_A;
+	btAlignedObjectArray<btScalar> m_b;
+	btAlignedObjectArray<btScalar> m_x;
+	btAlignedObjectArray<btScalar> m_lo;
+	btAlignedObjectArray<btScalar> m_hi;
+	btAlignedObjectArray<int>	m_dependencies;
+	btDantzigScratchMemory m_scratchMemory;
+	btDantzigSolver()
+		:m_acceptableUpperLimitSolution(btScalar(1000))
+	{
+	}
+	///Solves the MLCP given by (A, b, lo, hi, limitDependency) with the Dantzig solver.
+	///x supplies the initial values and receives the solution on success; it is left
+	///untouched when false is returned. An empty problem (b.rows() == 0) returns true.
+	///numIterations and useSparsity are not used by this implementation.
+	virtual bool solveMLCP(const btMatrixXu & A, const btVectorXu & b, btVectorXu& x, const btVectorXu & lo,const btVectorXu & hi,const btAlignedObjectArray<int>& limitDependency, int numIterations, bool useSparsity = true)
+	{
+		bool result = true;
+		int n = b.rows();
+		if (n)
+		{
+			int nub = 0;
+			btAlignedObjectArray<btScalar> ww;
+			ww.resize(n);
+			const btScalar* Aptr = A.getBufferPointer();
+			m_A.resize(n*n);
+			for (int i=0;i<n*n;i++)
+			{
+				m_A[i] = Aptr[i];
+			}
+			m_b.resize(n);
+			m_x.resize(n);
+			m_lo.resize(n);
+			m_hi.resize(n);
+			m_dependencies.resize(n);
+			for (int i=0;i<n;i++)
+			{
+				m_lo[i] = lo[i];
+				m_hi[i] = hi[i];
+				m_b[i] = b[i];
+				m_x[i] = x[i];
+				m_dependencies[i] = limitDependency[i];
+			}
+			result = btSolveDantzigLCP (n,&m_A[0],&m_x[0],&m_b[0],&ww[0],nub,&m_lo[0],&m_hi[0],&m_dependencies[0],m_scratchMemory);
+			if (!result)
+				return result;
+//			printf("numAllocas = %d\n",numAllocas);
+			//validate the computed solution before publishing it
+			for (int i=0;i<n;i++)
+			{
+				//a value that compares unequal to itself is NaN; presumably volatile keeps the
+				//compiler from folding the self-comparison away
+				volatile btScalar xx = m_x[i];
+				if (xx != m_x[i])
+					return false;
+				//range-check the solver output m_x; x still holds the caller's input at this point
+				if (m_x[i] >= m_acceptableUpperLimitSolution)
+				{
+					return false;
+				}
+				if (m_x[i] <= -m_acceptableUpperLimitSolution)
+				{
+					return false;
+				}
+			}
+			for (int i=0;i<n;i++)
+			{
+				x[i] = m_x[i];
+			}
+		}
+		return result;
+	}
diff --git a/src/bullet/BulletDynamics/MLCPSolvers/btLemkeAlgorithm.cpp b/src/bullet/BulletDynamics/MLCPSolvers/btLemkeAlgorithm.cpp
new file mode 100644
index 00000000..1f4015c7
--- /dev/null
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btLemkeAlgorithm.cpp
@@ -0,0 +1,371 @@
+/* Copyright (C) 2004-2013 MBSim Development Team
+Code was converted for the Bullet Continuous Collision Detection and Physics Library
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//The original version is here
+//This file is re-distributed under the ZLib license, with permission of the original author
+//The math library was changed from fmatvec to the file src/LinearMath/btMatrixX.h
+//STL/std::vector replaced by btAlignedObjectArray
+#include "btLemkeAlgorithm.h"
+using namespace std;
+btScalar btMachEps()
+	static bool calculated=false;
+	static btScalar machEps = btScalar(1.);
+	if (!calculated)
+	{
+		do {
+			machEps /= btScalar(2.0);
+			// If next epsilon yields 1, then break, because current
+			// epsilon is the machine epsilon.
+		}
+		while ((btScalar)(1.0 + (machEps/btScalar(2.0))) != btScalar(1.0));
+//		printf( "\nCalculated Machine epsilon: %G\n", machEps );
+		calculated=true;
+	}
+	return machEps;
+btScalar btEpsRoot() {
+	// Square root of btMachEps(), evaluated on the first call and cached afterwards.
+	static bool cached = false;
+	static btScalar root = 0.;
+	if (cached)
+		return root;
+	root = btSqrt(btMachEps());
+	cached = true;
+	return root;
+  btVectorXu btLemkeAlgorithm::solve(unsigned int maxloops /* = 0*/)  // returns a 2*dim vector indexed by basis variable; sets info to 0 on success, -1 when the final basis is invalid
+    steps = 0;
+    int dim = m_q.size();
+    if(DEBUGLEVEL >= 1) {
+      cout << "Dimension = " << dim << endl;
+    }
+	btVectorXu solutionVector(2 * dim);
+	solutionVector.setZero();
+	  //, INIT, 0.);
+	btMatrixXu ident(dim, dim);
+	ident.setIdentity();
+	cout << m_M << std::endl;
+	btMatrixXu mNeg = m_M.negative();
+    btMatrixXu A(dim, 2 * dim + 2);
+	// tableau layout: columns [0,dim) = identity (w), [dim,2*dim) = -M (z), column 2*dim = z0 filled with -1, column 2*dim+1 = q
+	A.setSubMatrix(0, 0, dim - 1, dim - 1,ident);
+	A.setSubMatrix(0, dim, dim - 1, 2 * dim - 1,mNeg);
+	A.setSubMatrix(0, 2 * dim, dim - 1, 2 * dim, -1.f);
+	A.setSubMatrix(0, 2 * dim + 1, dim - 1, 2 * dim + 1,m_q);
+	cout << A << std::endl;
+ //   btVectorXu q_;
+ //   q_ >> A(0, 2 * dim + 1, dim - 1, 2 * dim + 1);
+    btAlignedObjectArray<int> basis;
+    //At first, all w-values are in the basis
+    for (int i = 0; i < dim; i++)
+      basis.push_back(i);
+	int pivotRowIndex = -1;  // stays -1 when dim is 0 or no q entry is below the 1e30 sentinel
+	btScalar minValue = 1e30f;
+	bool greaterZero = true;  // local flag; shadows the member function of the same name
+	for (int i=0;i<dim;i++)
+	{
+		btScalar v =A(i,2*dim+1);
+		if (v<minValue)
+		{
+			minValue=v;
+			pivotRowIndex = i;
+		}
+		if (v<0)
+			greaterZero = false;
+	}
+  //  int pivotRowIndex = q_.minIndex();//minIndex(q_);     // first row is that with lowest q-value
+    int z0Row = pivotRowIndex;           // remember the row of z0 for ending algorithm afterwards
+    int pivotColIndex = 2 * dim;         // first col is that of z0
+    if (DEBUGLEVEL >= 3)
+	{
+    //  cout << "A: " << A << endl;
+      cout << "pivotRowIndex " << pivotRowIndex << endl;
+      cout << "pivotColIndex " << pivotColIndex << endl;
+      cout << "Basis: ";
+      for (int i = 0; i < basis.size(); i++)
+        cout << basis[i] << " ";
+      cout << endl;
+    }
+	if (!greaterZero)  // pivoting is only needed when q has a negative entry; otherwise the initial basis is used as is
+	{
+      if (maxloops == 0) {
+		  maxloops = 100;  // default iteration cap when the caller passes 0
+//        maxloops = UINT_MAX; //TODO: not a really nice way, problem is: maxloops should be 2^dim (=1<<dim), but this could exceed UINT_MAX and thus the result would be 0 and therefore the lemke algorithm wouldn't start but probably would find a solution within fewer than UINT_MAX steps. Therefore this constant is used as an upper bound right now...
+      }
+      /*start looping*/
+      for(steps = 0; steps < maxloops; steps++) {
+        GaussJordanEliminationStep(A, pivotRowIndex, pivotColIndex, basis);
+        if (DEBUGLEVEL >= 3) {
+        //  cout << "A: " << A << endl;
+          cout << "pivotRowIndex " << pivotRowIndex << endl;
+          cout << "pivotColIndex " << pivotColIndex << endl;
+          cout << "Basis: ";
+          for (int i = 0; i < basis.size(); i++)
+            cout << basis[i] << " ";
+          cout << endl;
+        }
+        int pivotColIndexOld = pivotColIndex;
+        /*find new column index */
+        if (basis[pivotRowIndex] < dim) //if a w-value left the basis get in the correspondent z-value
+          pivotColIndex = basis[pivotRowIndex] + dim;
+        else
+          //else do it the other way round and get in the corresponding w-value
+          pivotColIndex = basis[pivotRowIndex] - dim;
+        /*the column becomes part of the basis*/
+        basis[pivotRowIndex] = pivotColIndexOld;
+        pivotRowIndex = findLexicographicMinimum(A, pivotColIndex);
+        if(z0Row == pivotRowIndex) { //if z0 leaves the basis the solution is found --> one last elimination step is necessary
+          GaussJordanEliminationStep(A, pivotRowIndex, pivotColIndex, basis);
+          basis[pivotRowIndex] = pivotColIndex; //update basis
+          break;
+      }
+      }
+      if(DEBUGLEVEL >= 1) {
+        cout << "Number of loops: " << steps << endl;
+        cout << "Number of maximal loops: " << maxloops << endl;
+      }
+      if(!validBasis(basis)) {
+        info = -1;
+        if(DEBUGLEVEL >= 1)
+          cerr << "Lemke-Algorithm ended with Ray-Termination (no valid solution)." << endl;
+        return solutionVector;  // still all zeros at this point
+      }
+    }
+    if (DEBUGLEVEL >= 2) {
+     // cout << "A: " << A << endl;
+      cout << "pivotRowIndex " << pivotRowIndex << endl;
+      cout << "pivotColIndex " << pivotColIndex << endl;
+    }
+    for (int i = 0; i < basis.size(); i++)
+	{
+      solutionVector[basis[i]] = A(i,2*dim+1);//q_[i];
+	}
+    info = 0;
+    return solutionVector;
+  }
+  int btLemkeAlgorithm::findLexicographicMinimum(const btMatrixXu& A, const int & pivotColIndex) {
+	  // One ratio vector per row of A. It stays all-zero (norm 0) for rows whose
+	  // pivot-column entry is not positive, which excludes them from the search below.
+	  const int numRows = A.rows();
+	  const int width = numRows + 1;
+	  btAlignedObjectArray<btVectorXu> ratios;
+	  for (int r = 0; r < numRows; r++)
+	  {
+		  btVectorXu zeroRow(width);
+		  zeroRow.setZero();
+		  ratios.push_back(zeroRow);
+		  const btScalar pivotEntry = A(r, pivotColIndex);
+		  if (!(pivotEntry > 0))
+			  continue;
+		  btVectorXu& stored = ratios[r];
+		  stored[0] = A(r, 2 * numRows + 1) / pivotEntry;
+		  stored[1] = A(r, 2 * numRows) / pivotEntry;
+		  for (int c = 2; c < width; c++)
+			  stored[c] = A(r, c - 1) / pivotEntry;
+	  }
+	  // Return the first candidate row i for which (row j - row i) is lexicographically
+	  // positive for every other candidate row j; row 0 is the fallback when none qualifies.
+	  for (int i = 0; i < ratios.size(); i++)
+	  {
+		  if (!(ratios[i].nrm2() > 0.))
+			  continue;
+		  bool isMinimum = true;
+		  for (int j = 0; j < ratios.size() && isMinimum; j++)
+		  {
+			  if (i == j || !(ratios[j].nrm2() > 0.))
+				  continue;
+			  btVectorXu diff(width);
+			  for (int k = 0; k < width; k++)
+				  diff[k] = ratios[j][k] - ratios[i][k];
+			  isMinimum = LexicographicPositive(diff);
+		  }
+		  if (isMinimum)
+			  return i;
+	  }
+	  return 0;
+  }
+  bool btLemkeAlgorithm::LexicographicPositive(const btVectorXu & v)
+    int i = 0;
+  //  if (DEBUGLEVEL)
+    //  cout << "v " << v << endl;
+    while(i < v.size()-1 && fabs(v[i]) < btMachEps())
+      i++;
+    if (v[i] > 0)
+      return true;
+    return false;
+  }
+void btLemkeAlgorithm::GaussJordanEliminationStep(btMatrixXu& A, int pivotRowIndex, int pivotColumnIndex, const btAlignedObjectArray<int>& basis) // one in-place Gauss-Jordan pivot of A on (pivotRowIndex, pivotColumnIndex); basis is not referenced in the visible body
+	btScalar a = -1 / A(pivotRowIndex, pivotColumnIndex);  // negated reciprocal of the pivot; NOTE(review): no guard against a zero pivot
+	cout << A << std::endl;
+    for (int i = 0; i < A.rows(); i++)
+	{
+      if (i != pivotRowIndex)
+	  {
+        for (int j = 0; j < A.cols(); j++)
+		{
+          if (j != pivotColumnIndex)
+		  {
+			  btScalar v = A(i, j);
+			  v += A(pivotRowIndex, j) * A(i, pivotColumnIndex) * a;  // i.e. A(i,j) -= A(pivotRow,j) * A(i,pivotCol) / pivot
+            A.setElem(i, j, v);
+		  }
+		}
+	  }
+	}
+	cout << A << std::endl;
+    for (int i = 0; i < A.cols(); i++) 
+	{
+      A.mulElem(pivotRowIndex, i,-a);  // presumably multiplies the element in place, scaling the pivot row by 1/pivot -- confirm against btMatrixX
+    }
+	cout << A << std::endl;
+#endif //#ifdef BT_DEBUG_OSTREAM
+    for (int i = 0; i < A.rows(); i++)
+	{
+      if (i != pivotRowIndex)
+	  {
+        A.setElem(i, pivotColumnIndex,0);  // clear the pivot column outside the pivot row (skipped by the first pass)
+	  }
+	}
+	cout << A << std::endl;
+#endif //#ifdef BT_DEBUG_OSTREAM
+  }
+  bool btLemkeAlgorithm::greaterZero(const btVectorXu & vector)
+    bool isGreater = true;
+    for (int i = 0; i < vector.size(); i++) {
+      if (vector[i] < 0) {
+        isGreater = false;
+        break;
+      }
+    }
+    return isGreater;
+  }
+  bool btLemkeAlgorithm::validBasis(const btAlignedObjectArray<int>& basis) 
+  {
+    // An entry of 2*basis.size() or more means z0 is still in the basis,
+    // in which case the basis is rejected.
+    for (int idx = 0; idx < basis.size(); idx++) {
+      if (basis[idx] >= basis.size() * 2)
+        return false;
+    }
+    return true;
+  }
diff --git a/src/bullet/BulletDynamics/MLCPSolvers/btLemkeAlgorithm.h b/src/bullet/BulletDynamics/MLCPSolvers/btLemkeAlgorithm.h
new file mode 100644
index 00000000..7555cd9d
--- /dev/null
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btLemkeAlgorithm.h
@@ -0,0 +1,108 @@
+/* Copyright (C) 2004-2013 MBSim Development Team
+Code was converted for the Bullet Continuous Collision Detection and Physics Library
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+//The original version is here
+//This file is re-distributed under the ZLib license, with permission of the original author (Kilian Grundl)
+//The math library was changed from fmatvec to the file src/LinearMath/btMatrixX.h
+//STL/std::vector replaced by btAlignedObjectArray
+#include "LinearMath/btMatrixX.h"
+#include <vector> //todo: replace by btAlignedObjectArray
+class btLemkeAlgorithm  // Lemke pivoting solver for the system (M, q) stored through setSystem()
+  btLemkeAlgorithm(const btMatrixXu& M_, const btVectorXu& q_, const int & DEBUGLEVEL_ = 0) :
+  {
+	setSystem(M_, q_);
+  }
+  /* GETTER / SETTER */
+  /**
+   * \brief return info of solution process (-1 : not successful, 0 : successful)
+   */
+  int getInfo() {
+	return info;
+  }
+  /**
+   * \brief get the number of steps until the solution was found
+   */
+  int getSteps(void) {
+	return steps;  // steps is unsigned int; converted to int on return
+  }
+  /**
+   * \brief set system with Matrix M and vector q (both are copied into the object)
+   */
+  void setSystem(const btMatrixXu & M_, const btVectorXu & q_)
+	{
+		m_M = M_;
+		m_q = q_;
+  }
+  /***************************************************/
+  /**
+   * \brief solve algorithm adapted from : Fast Implementation of Lemke’s Algorithm for Rigid Body Contact Simulation (John E. Lloyd)
+   */
+  btVectorXu solve(unsigned int maxloops = 0);
+  virtual ~btLemkeAlgorithm() {
+  }
+  int findLexicographicMinimum(const btMatrixXu &A, const int & pivotColIndex);
+  bool LexicographicPositive(const btVectorXu & v);
+  void GaussJordanEliminationStep(btMatrixXu &A, int pivotRowIndex, int pivotColumnIndex, const btAlignedObjectArray<int>& basis);
+  bool greaterZero(const btVectorXu & vector);
+  bool validBasis(const btAlignedObjectArray<int>& basis);
+  btMatrixXu m_M;  // system matrix M as passed to setSystem()
+  btVectorXu m_q;  // vector q as passed to setSystem()
+  /**
+   * \brief number of steps until the Lemke algorithm found a solution
+   */
+  unsigned int steps;
+  /**
+   * \brief define level of debug output
+   */
+  /**
+   * \brief did the algorithm find a solution
+   *
+   * -1 : not successful
+   *  0 : successful
+   */
+  int info;
diff --git a/src/bullet/BulletDynamics/MLCPSolvers/btLemkeSolver.h b/src/bullet/BulletDynamics/MLCPSolvers/btLemkeSolver.h
new file mode 100644
index 00000000..98484c37
--- /dev/null
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btLemkeSolver.h
@@ -0,0 +1,350 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///original version written by Erwin Coumans, October 2013
+#include "btMLCPSolverInterface.h"
+#include "btLemkeAlgorithm.h"
+///The btLemkeSolver is based on "Fast Implementation of Lemke's Algorithm for Rigid Body Contact Simulation (John E. Lloyd) "
+///It is a slower but more accurate solver. Increase the m_maxLoops for better convergence, at the cost of more CPU time.
+///The original implementation of the btLemkeAlgorithm was done by Kilian Grundl from the MBSim team
+class btLemkeSolver : public btMLCPSolverInterface
+	///Result entries whose magnitude exceeds m_maxValue make solveMLCP reject the whole solution.
+	btScalar	m_maxValue;
+	///Debug level passed to the btLemkeAlgorithm constructor.
+	int			m_debugLevel;
+	///Loop limit passed to btLemkeAlgorithm::solve.
+	int			m_maxLoops;
+	///true: solveMLCP takes the lo/hi bounds into account; false: it runs Lemke directly on A and -b.
+	bool		m_useLoHighBounds;
+	btLemkeSolver()
+		:m_maxValue(100000),
+		m_debugLevel(0),
+		m_maxLoops(1000),
+		m_useLoHighBounds(true)
+	{
+	}
+	///Solve the (M)LCP described by A, b, lo and hi and store the result in x.
+	///Returns true when a solution was written to x (or when A has no rows).
+	///Returns false, with x reset to zero, when the result contains a NaN or an entry whose
+	///magnitude exceeds m_maxValue.
+	///limitDependency, numIterations and useSparsity are not used by this solver.
+	virtual bool solveMLCP(const btMatrixXu & A, const btVectorXu & b, btVectorXu& x, const btVectorXu & lo,const btVectorXu & hi,const btAlignedObjectArray<int>& limitDependency, int numIterations, bool useSparsity = true)
+	{
+		if (m_useLoHighBounds)
+		{
+			BT_PROFILE("btLemkeSolver::solveMLCP");
+			int n = A.rows();
+			if (0==n)
+				return true;
+			bool fail = false;
+			btVectorXu solution(n);
+			/////////////////////////////////////
+			//slow matrix inversion, replace with LU decomposition
+			//B receives the inverse of A, computed by Gauss-Jordan elimination on the augmented matrix [A | I]
+			btMatrixXu B(n,n);
+			{
+				BT_PROFILE("inverse(slow)");
+				btMatrixXu matrix;
+				matrix.resize(n,2*n);
+				//left half of the augmented matrix: a copy of A
+				for (int row=0;row<n;row++)
+				{
+					for (int col=0;col<n;col++)
+					{
+						matrix.setElem(row,col,A(row,col));
+					}
+				}
+				btScalar ratio,a;
+				int i,j,k;
+				//right half of the augmented matrix: the identity
+				for(i = 0; i < n; i++){
+					for(j = n; j < 2*n; j++){
+						if(i==(j-n))
+							matrix.setElem(i,j,1.0);
+						else
+							matrix.setElem(i,j,0.0);
+					}
+				}
+				//eliminate column i from every other row
+				for(i = 0; i < n; i++){
+					for(j = 0; j < n; j++){
+						if(i!=j)
+						{
+							//Clamp a (near) zero pivot before dividing by it, like the normalisation pass below does.
+							//The clamped value used to be stored in an unused variable, so the division still used the raw pivot.
+							btScalar pivot = matrix(i,i);
+							if (btFuzzyZero(pivot))
+							{
+								pivot = 0.000001f;
+							}
+							ratio = matrix(j,i)/pivot;
+							for(k = 0; k < 2*n; k++){
+								matrix.addElem(j,k,- ratio * matrix(i,k));
+							}
+						}
+					}
+				}
+				//scale every row by the inverse of its (clamped) diagonal entry
+				for(i = 0; i < n; i++){
+					a = matrix(i,i);
+					if (btFuzzyZero(a))
+					{
+						a = 0.000001f;
+					}
+					btScalar invA = 1.f/a;
+					for(j = 0; j < 2*n; j++){
+						matrix.mulElem(i,j,invA);
+					}
+				}
+				//the right half now holds the inverse
+				for (int row=0;row<n;row++)
+				{
+					for (int col=0;col<n;col++)
+					{
+						B.setElem(row,col,matrix(row,n+col));
+					}
+				}
+			}
+			//b1 = -b,  M = [ B -B ; -B B ]
+			btMatrixXu b1(n,1);
+			btMatrixXu M(n*2,n*2);
+			for (int row=0;row<n;row++)
+			{
+				b1.setElem(row,0,-b[row]);
+				for (int col=0;col<n;col++)
+				{
+					btScalar v =B(row,col);
+					M.setElem(row,col,v);
+					M.setElem(n+row,n+col,v);
+					M.setElem(n+row,col,-v);
+					M.setElem(row,n+col,-v);
+				}
+			}
+			btMatrixXu Bb1 = B*b1;
+//		q = [ (-B*b1 - lo)'   (hi + B*b1)' ]'
+			btVectorXu qq;
+			qq.resize(n*2);
+			for (int row=0;row<n;row++)
+			{
+				qq[row] = -Bb1(row,0)-lo[row];
+				qq[n+row] = Bb1(row,0)+hi[row];
+			}
+			btVectorXu z1;
+			btMatrixXu y1;
+			y1.resize(n,1);
+			btLemkeAlgorithm lemke(M,qq,m_debugLevel);
+			{
+				BT_PROFILE("lemke.solve");
+				lemke.setSystem(M,qq);
+				z1  = lemke.solve(m_maxLoops);
+			}
+			//y1 is the difference of the two n-sized parts of z1 that start at offset 2*n
+			for (int row=0;row<n;row++)
+			{
+				y1.setElem(row,0,z1[2*n+row]-z1[3*n+row]);
+			}
+			btMatrixXu y1_b1(n,1);
+			for (int i=0;i<n;i++)
+			{
+				y1_b1.setElem(i,0,y1(i,0)-b1(i,0));
+			}
+			//x = B*(y1 - b1)
+			btMatrixXu x1;
+			x1 = B*(y1_b1);
+			for (int row=0;row<n;row++)
+			{
+				solution[row] = x1(row,0);//n];
+			}
+			int errorIndexMax = -1;
+			int errorIndexMin = -1;
+			//btScalar instead of float: in double precision builds a result beyond the float range cannot be stored in a float
+			btScalar errorValueMax = btScalar(-1e30);
+			btScalar errorValueMin = btScalar(1e30);
+			for (int i=0;i<n;i++)
+			{
+				x[i] = solution[i];
+				//a NaN compares unequal to itself; volatile keeps the comparison from being optimised away
+				volatile btScalar check = x[i];
+				if (x[i] != check)
+				{
+					//printf("Lemke result is #NAN\n");
+					x.setZero();
+					return false;
+				}
+				//this is some hack/safety mechanism, to discard invalid solutions from the Lemke solver 
+				//we need to figure out why it happens, and fix it, or detect it properly)
+				if (x[i]>m_maxValue)
+				{
+					if (x[i]> errorValueMax)
+					{
+						fail = true;
+						errorIndexMax = i;
+						errorValueMax = x[i];
+					}
+					////printf("x[i] = %f,",x[i]);
+				}
+				if (x[i]<-m_maxValue)
+				{
+					if (x[i]<errorValueMin)
+					{
+						errorIndexMin = i;
+						errorValueMin = x[i];
+						fail = true;
+						//printf("x[i] = %f,",x[i]);
+					}
+				}
+			}
+			if (fail)
+			{
+				if (errorIndexMin<0)
+					errorValueMin = 0.f;
+				if (errorIndexMax<0)
+					errorValueMax = 0.f;
+			//	printf("Error (x[%d] = %f, x[%d] = %f), resetting %d times\n", errorIndexMin,errorValueMin, errorIndexMax, errorValueMax, errorCountTimes++);
+				//discard the rejected solution
+				for (int i=0;i<n;i++)
+				{
+					x[i]=0.f;
+				}
+			}
+			return !fail;
+		} else
+		{
+			//no bounds: solve the LCP (A, -b) directly
+			int dimension = A.rows();
+			if (0==dimension)
+				return true;
+//		printf("================ solving using Lemke/Newton/Fixpoint\n");
+			btVectorXu q;
+			q.resize(dimension);
+			for (int row=0;row<dimension;row++)
+			{
+				q[row] = -b[row];
+			}
+			btLemkeAlgorithm lemke(A,q,m_debugLevel);
+			lemke.setSystem(A,q);
+			btVectorXu solution = lemke.solve(m_maxLoops);
+			//check solution
+			bool fail = false;
+			int errorIndexMax = -1;
+			int errorIndexMin = -1;
+			//btScalar instead of float: in double precision builds a result beyond the float range cannot be stored in a float
+			btScalar errorValueMax = btScalar(-1e30);
+			btScalar errorValueMin = btScalar(1e30);
+			for (int i=0;i<dimension;i++)
+			{
+				//x is read from the part of the Lemke result that starts at offset 'dimension'
+				x[i] = solution[i+dimension];
+				//a NaN compares unequal to itself; volatile keeps the comparison from being optimised away
+				volatile btScalar check = x[i];
+				if (x[i] != check)
+				{
+					x.setZero();
+					return false;
+				}
+				//this is some hack/safety mechanism, to discard invalid solutions from the Lemke solver 
+				//we need to figure out why it happens, and fix it, or detect it properly)
+				if (x[i]>m_maxValue)
+				{
+					if (x[i]> errorValueMax)
+					{
+						fail = true;
+						errorIndexMax = i;
+						errorValueMax = x[i];
+					}
+					////printf("x[i] = %f,",x[i]);
+				}
+				if (x[i]<-m_maxValue)
+				{
+					if (x[i]<errorValueMin)
+					{
+						errorIndexMin = i;
+						errorValueMin = x[i];
+						fail = true;
+						//printf("x[i] = %f,",x[i]);
+					}
+				}
+			}
+			if (fail)
+			{
+				static int errorCountTimes = 0;
+				if (errorIndexMin<0)
+					errorValueMin = 0.f;
+				if (errorIndexMax<0)
+					errorValueMax = 0.f;
+				printf("Error (x[%d] = %f, x[%d] = %f), resetting %d times\n", errorIndexMin,errorValueMin, errorIndexMax, errorValueMax, errorCountTimes++);
+				//discard the rejected solution
+				for (int i=0;i<dimension;i++)
+				{
+					x[i]=0.f;
+				}
+			}
+			return !fail;
+		}
+		return true;
+	}
diff --git a/src/bullet/BulletDynamics/MLCPSolvers/btMLCPSolver.cpp b/src/bullet/BulletDynamics/MLCPSolvers/btMLCPSolver.cpp
new file mode 100644
index 00000000..e73f4acc
--- /dev/null
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btMLCPSolver.cpp
@@ -0,0 +1,639 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///original version written by Erwin Coumans, October 2013
+#include "btMLCPSolver.h"
+#include "LinearMath/btMatrixX.h"
+#include "LinearMath/btQuickprof.h"
+#include "btSolveProjectedGaussSeidel.h"
+btMLCPSolver::btMLCPSolver(	 btMLCPSolverInterface* solver)
+bool gUseMatrixMultiply = false; // true: solveGroupCacheFriendlySetup builds the system with createMLCP, false: with createMLCPFast
+bool interleaveContactAndFriction = false; // true: each contact row is followed directly by its friction row(s) instead of all friction rows coming last
+btScalar btMLCPSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies, int numBodiesUnUsed, btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) // runs the base-class setup, gathers all solver constraint rows and builds the MLCP from them; always returns 0.f
+	btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup( bodies, numBodiesUnUsed, manifoldPtr, numManifolds,constraints,numConstraints,infoGlobal,debugDrawer);
+	{
+		BT_PROFILE("gather constraint data");
+		int numFrictionPerContact = m_tmpSolverContactConstraintPool.size()==m_tmpSolverContactFrictionConstraintPool.size()? 1 : 2; // 1 when both pools have the same size, otherwise 2
+	//	int numBodies = m_tmpSolverBodyPool.size();
+		m_allConstraintPtrArray.resize(0);
+		m_limitDependencies.resize(m_tmpSolverNonContactConstraintPool.size()+m_tmpSolverContactConstraintPool.size()+m_tmpSolverContactFrictionConstraintPool.size());
+		btAssert(m_limitDependencies.size() == m_tmpSolverNonContactConstraintPool.size()+m_tmpSolverContactConstraintPool.size()+m_tmpSolverContactFrictionConstraintPool.size());
+	//	printf("m_limitDependencies.size() = %d\n",m_limitDependencies.size());
+		int dindex = 0; // next slot of m_limitDependencies to fill; advances in step with m_allConstraintPtrArray
+		for (int i=0;i<m_tmpSolverNonContactConstraintPool.size();i++)
+		{
+			m_allConstraintPtrArray.push_back(&m_tmpSolverNonContactConstraintPool[i]);
+			m_limitDependencies[dindex++] = -1; // non-contact rows get no limit dependency
+		}
+		///The btSequentialImpulseConstraintSolver moves all friction constraints at the very end, we can also interleave them instead
+		int firstContactConstraintOffset=dindex; // row index of the first contact constraint
+		if (interleaveContactAndFriction)
+		{
+			for (int i=0;i<m_tmpSolverContactConstraintPool.size();i++)
+			{
+				m_allConstraintPtrArray.push_back(&m_tmpSolverContactConstraintPool[i]);
+				m_limitDependencies[dindex++] = -1;
+				m_allConstraintPtrArray.push_back(&m_tmpSolverContactFrictionConstraintPool[i*numFrictionPerContact]);
+				int findex = (m_tmpSolverContactFrictionConstraintPool[i*numFrictionPerContact].m_frictionIndex*(1+numFrictionPerContact)); // scaled by the rows per contact (1 contact row + friction rows) of the interleaved layout
+				m_limitDependencies[dindex++] = findex +firstContactConstraintOffset;
+				if (numFrictionPerContact==2)
+				{
+					m_allConstraintPtrArray.push_back(&m_tmpSolverContactFrictionConstraintPool[i*numFrictionPerContact+1]);
+					m_limitDependencies[dindex++] = findex+firstContactConstraintOffset;
+				}
+			}
+		} else
+		{
+			for (int i=0;i<m_tmpSolverContactConstraintPool.size();i++)
+			{
+				m_allConstraintPtrArray.push_back(&m_tmpSolverContactConstraintPool[i]);
+				m_limitDependencies[dindex++] = -1;
+			}
+			for (int i=0;i<m_tmpSolverContactFrictionConstraintPool.size();i++)
+			{
+				m_allConstraintPtrArray.push_back(&m_tmpSolverContactFrictionConstraintPool[i]);
+				m_limitDependencies[dindex++] = m_tmpSolverContactFrictionConstraintPool[i].m_frictionIndex+firstContactConstraintOffset; // friction row refers to the row given by its m_frictionIndex, counted from the first contact row
+			}
+		}
+		if (!m_allConstraintPtrArray.size()) // nothing to solve: leave an empty system behind
+		{
+			m_A.resize(0,0);
+			m_b.resize(0);
+			m_x.resize(0);
+			m_lo.resize(0);
+			m_hi.resize(0);
+			return 0.f;
+		}
+	}
+	if (gUseMatrixMultiply)
+	{
+		BT_PROFILE("createMLCP");
+		createMLCP(infoGlobal);
+	}
+	else
+	{
+		BT_PROFILE("createMLCPFast");
+		createMLCPFast(infoGlobal);
+	}
+	return 0.f;
+bool btMLCPSolver::solveMLCP(const btContactSolverInfo& infoGlobal) // runs m_solver on the assembled system; returns true when m_A is empty or every solve reported success
+	bool result = true;
+	if (m_A.rows()==0)
+		return true;
+	//if using split impulse, we solve 2 separate (M)LCPs
+	if (infoGlobal.m_splitImpulse)
+	{
+		btMatrixXu Acopy = m_A; // copies taken before the first solve and handed to the second one
+		btAlignedObjectArray<int> limitDependenciesCopy = m_limitDependencies;
+//		printf("solve first LCP\n");
+		result = m_solver->solveMLCP(m_A, m_b, m_x, m_lo,m_hi, m_limitDependencies,infoGlobal.m_numIterations );
+		if (result) // the split system (m_bSplit/m_xSplit) is only solved when the first solve succeeded
+			result = m_solver->solveMLCP(Acopy, m_bSplit, m_xSplit, m_lo,m_hi, limitDependenciesCopy,infoGlobal.m_numIterations );
+	} else
+	{
+		result = m_solver->solveMLCP(m_A, m_b, m_x, m_lo,m_hi, m_limitDependencies,infoGlobal.m_numIterations );
+	}
+	return result;
+struct btJointNode
+	int jointIndex;     // index (not a pointer) of the joint/constraint this node belongs to
+	int otherBodyIndex;       // solver body index of the *other* body this joint is connected to, -1 when that body has no original rigid body
+	int nextJointNodeIndex;// index of the next node in the same body's list, -1 for null
+	int constraintRowIndex; // index into m_allConstraintPtrArray of the joint's first row
+void btMLCPSolver::createMLCPFast(const btContactSolverInfo& infoGlobal) // builds m_A, m_b/m_bSplit, m_lo/m_hi and m_x/m_xSplit from the rows gathered in m_allConstraintPtrArray
+	int numContactRows = interleaveContactAndFriction ? 3 : 1; // rows per contact block: 3 when contact and friction rows are interleaved, otherwise 1
+	int numConstraintRows = m_allConstraintPtrArray.size();
+	int n = numConstraintRows;
+	{
+		BT_PROFILE("init b (rhs)");
+		m_b.resize(numConstraintRows);
+		m_bSplit.resize(numConstraintRows);
+		m_b.setZero();
+		m_bSplit.setZero();
+		for (int i=0;i<numConstraintRows ;i++)
+		{
+			btScalar jacDiag = m_allConstraintPtrArray[i]->m_jacDiagABInv;
+			if (!btFuzzyZero(jacDiag)) // rows with a (near) zero m_jacDiagABInv keep a zero right-hand side
+			{
+				btScalar rhs = m_allConstraintPtrArray[i]->m_rhs;
+				btScalar rhsPenetration = m_allConstraintPtrArray[i]->m_rhsPenetration;
+				m_b[i]=rhs/jacDiag;
+				m_bSplit[i] = rhsPenetration/jacDiag;
+			}
+		}
+	}
+//	btScalar* w = 0;
+//	int nub = 0;
+	m_lo.resize(numConstraintRows);
+	m_hi.resize(numConstraintRows);
+	{
+		BT_PROFILE("init lo/ho");
+		for (int i=0;i<numConstraintRows;i++)
+		{
+			if (0)//m_limitDependencies[i]>=0)
+			{
+				m_lo[i] = -BT_INFINITY;
+				m_hi[i] = BT_INFINITY;
+			} else
+			{
+				m_lo[i] = m_allConstraintPtrArray[i]->m_lowerLimit;
+				m_hi[i] = m_allConstraintPtrArray[i]->m_upperLimit;
+			}
+		}
+	}
+	//
+	int m=m_allConstraintPtrArray.size();
+	int numBodies = m_tmpSolverBodyPool.size();
+	btAlignedObjectArray<int> bodyJointNodeArray; // per solver body: index of the first node of its list in jointNodeArray, -1 when empty
+	{
+		BT_PROFILE("bodyJointNodeArray.resize");
+		bodyJointNodeArray.resize(numBodies,-1);
+	}
+	btAlignedObjectArray<btJointNode> jointNodeArray;
+	{
+		BT_PROFILE("jointNodeArray.reserve");
+		jointNodeArray.reserve(2*m_allConstraintPtrArray.size());
+	}
+	static btMatrixXu J3; // NOTE(review): function-local static, shared by every call of this function
+	{
+		BT_PROFILE("J3.resize");
+		J3.resize(2*m,8); // per constraint block the rows for body A are followed by the rows for body B; columns 0-2 linear part, 4-6 angular part, 3 and 7 zero
+	}
+	static btMatrixXu JinvM3;
+	{
+		BT_PROFILE("JinvM3.resize/setZero");
+		JinvM3.resize(2*m,8);
+		JinvM3.setZero();
+		J3.setZero();
+	}
+	int cur=0; // next row of J3/JinvM3 to write
+	int rowOffset = 0;
+	static btAlignedObjectArray<int> ofs; // ofs[c]: first constraint row of constraint block c
+	{
+		BT_PROFILE("ofs resize");
+		ofs.resize(0);
+		ofs.resizeNoInitialize(m_allConstraintPtrArray.size());
+	}				
+	{
+		BT_PROFILE("Compute J and JinvM");
+		int c=0;
+		int numRows = 0;
+		for (int i=0;i<m_allConstraintPtrArray.size();i+=numRows,c++)
+		{
+			ofs[c] = rowOffset;
+			int sbA = m_allConstraintPtrArray[i]->m_solverBodyIdA;
+			int sbB = m_allConstraintPtrArray[i]->m_solverBodyIdB;
+			btRigidBody* orgBodyA = m_tmpSolverBodyPool[sbA].m_originalBody;
+			btRigidBody* orgBodyB = m_tmpSolverBodyPool[sbB].m_originalBody;
+			numRows = i<m_tmpSolverNonContactConstraintPool.size() ? m_tmpConstraintSizesPool[c].m_numConstraintRows : numContactRows ;
+			if (orgBodyA)
+			{
+				{
+					int slotA=-1;
+					//append a new jointNode for sbA and make it the head of that body's node list
+					slotA =jointNodeArray.size();
+					jointNodeArray.expand();//NonInitializing();
+					int prevSlot = bodyJointNodeArray[sbA];
+					bodyJointNodeArray[sbA] = slotA;
+					jointNodeArray[slotA].nextJointNodeIndex = prevSlot;
+					jointNodeArray[slotA].jointIndex = c;
+					jointNodeArray[slotA].constraintRowIndex = i;
+					jointNodeArray[slotA].otherBodyIndex = orgBodyB ? sbB : -1;
+				}
+				for (int row=0;row<numRows;row++,cur++)
+				{
+					btVector3 normalInvMass =				m_allConstraintPtrArray[i+row]->m_contactNormal1 *		orgBodyA->getInvMass();
+					btVector3 relPosCrossNormalInvInertia = m_allConstraintPtrArray[i+row]->m_relpos1CrossNormal *	orgBodyA->getInvInertiaTensorWorld();
+					for (int r=0;r<3;r++)
+					{
+						J3.setElem(cur,r,m_allConstraintPtrArray[i+row]->m_contactNormal1[r]);
+						J3.setElem(cur,r+4,m_allConstraintPtrArray[i+row]->m_relpos1CrossNormal[r]);
+						JinvM3.setElem(cur,r,normalInvMass[r]);
+						JinvM3.setElem(cur,r+4,relPosCrossNormalInvInertia[r]);
+					}
+					J3.setElem(cur,3,0);
+					JinvM3.setElem(cur,3,0);
+					J3.setElem(cur,7,0);
+					JinvM3.setElem(cur,7,0);
+				}
+			} else
+			{
+				cur += numRows; // no original body A: skip its rows, they stay zero
+			}
+			if (orgBodyB)
+			{
+				{
+					int slotB=-1;
+					//append a new jointNode for sbB and make it the head of that body's node list
+					slotB =jointNodeArray.size();
+					jointNodeArray.expand();//NonInitializing();
+					int prevSlot = bodyJointNodeArray[sbB];
+					bodyJointNodeArray[sbB] = slotB;
+					jointNodeArray[slotB].nextJointNodeIndex = prevSlot;
+					jointNodeArray[slotB].jointIndex = c;
+					jointNodeArray[slotB].otherBodyIndex = orgBodyA ? sbA : -1;
+					jointNodeArray[slotB].constraintRowIndex = i;
+				}
+				for (int row=0;row<numRows;row++,cur++)
+				{
+					btVector3 normalInvMassB = m_allConstraintPtrArray[i+row]->m_contactNormal2*orgBodyB->getInvMass();
+					btVector3 relPosInvInertiaB = m_allConstraintPtrArray[i+row]->m_relpos2CrossNormal * orgBodyB->getInvInertiaTensorWorld();
+					for (int r=0;r<3;r++)
+					{
+						J3.setElem(cur,r,m_allConstraintPtrArray[i+row]->m_contactNormal2[r]);
+						J3.setElem(cur,r+4,m_allConstraintPtrArray[i+row]->m_relpos2CrossNormal[r]);
+						JinvM3.setElem(cur,r,normalInvMassB[r]);
+						JinvM3.setElem(cur,r+4,relPosInvInertiaB[r]);
+					}
+					J3.setElem(cur,3,0);
+					JinvM3.setElem(cur,3,0);
+					J3.setElem(cur,7,0);
+					JinvM3.setElem(cur,7,0);
+				}
+			}
+			else
+			{
+				cur += numRows; // no original body B: skip its rows, they stay zero
+			}
+			rowOffset+=numRows;
+		}
+	}
+	//compute JinvM = J*invM.
+	const btScalar* JinvM = JinvM3.getBufferPointer();
+	const btScalar* Jptr = J3.getBufferPointer();
+	{
+		BT_PROFILE("m_A.resize");
+		m_A.resize(n,n);
+	}
+	{
+		BT_PROFILE("m_A.setZero");
+		m_A.setZero();
+	}
+	int c=0;
+	{
+		int numRows = 0;
+		BT_PROFILE("Compute A");
+		for (int i=0;i<m_allConstraintPtrArray.size();i+= numRows,c++)
+		{
+			int row__ = ofs[c];
+			int sbA = m_allConstraintPtrArray[i]->m_solverBodyIdA;
+			int sbB = m_allConstraintPtrArray[i]->m_solverBodyIdB;
+		//	btRigidBody* orgBodyA = m_tmpSolverBodyPool[sbA].m_originalBody;
+		//	btRigidBody* orgBodyB = m_tmpSolverBodyPool[sbB].m_originalBody;
+			numRows = i<m_tmpSolverNonContactConstraintPool.size() ? m_tmpConstraintSizesPool[c].m_numConstraintRows : numContactRows ;
+			const btScalar *JinvMrow = JinvM + 2*8*(size_t)row__;
+			{
+				int startJointNodeA = bodyJointNodeArray[sbA];
+				while (startJointNodeA>=0) // walk every joint node recorded for body A
+				{
+					int j0 = jointNodeArray[startJointNodeA].jointIndex;
+					int cr0 = jointNodeArray[startJointNodeA].constraintRowIndex;
+					if (j0<c) // only earlier blocks: fills the part below the diagonal
+					{
+						int numRowsOther = cr0 < m_tmpSolverNonContactConstraintPool.size() ? m_tmpConstraintSizesPool[j0].m_numConstraintRows : numContactRows;
+						size_t ofsother = (m_allConstraintPtrArray[cr0]->m_solverBodyIdB == sbA) ? 8*numRowsOther  : 0; // pick the other joint's body-B rows when this body is its body B
+						//printf("%d joint i %d and j0: %d: ",count++,i,j0);
+						m_A.multiplyAdd2_p8r ( JinvMrow, 
+						Jptr + 2*8*(size_t)ofs[j0] + ofsother, numRows, numRowsOther,  row__,ofs[j0]);
+					}
+					startJointNodeA = jointNodeArray[startJointNodeA].nextJointNodeIndex;
+				}
+			}
+			{
+				int startJointNodeB = bodyJointNodeArray[sbB];
+				while (startJointNodeB>=0) // walk every joint node recorded for body B
+				{
+					int j1 = jointNodeArray[startJointNodeB].jointIndex;
+					int cj1 = jointNodeArray[startJointNodeB].constraintRowIndex;
+					if (j1<c)
+					{
+						int numRowsOther =  cj1 < m_tmpSolverNonContactConstraintPool.size() ? m_tmpConstraintSizesPool[j1].m_numConstraintRows : numContactRows;
+						size_t ofsother = (m_allConstraintPtrArray[cj1]->m_solverBodyIdB == sbB) ? 8*numRowsOther  : 0;
+						m_A.multiplyAdd2_p8r ( JinvMrow + 8*(size_t)numRows, 
+						Jptr + 2*8*(size_t)ofs[j1] + ofsother, numRows, numRowsOther, row__,ofs[j1]);
+					}
+					startJointNodeB = jointNodeArray[startJointNodeB].nextJointNodeIndex;
+				}
+			}
+		}
+		{
+			BT_PROFILE("compute diagonal");
+			// compute diagonal blocks of m_A
+			int  row__ = 0;
+			int numJointRows = m_allConstraintPtrArray.size();
+			int jj=0;
+			for (;row__<numJointRows;)
+			{
+				//int sbA = m_allConstraintPtrArray[row__]->m_solverBodyIdA;
+				int sbB = m_allConstraintPtrArray[row__]->m_solverBodyIdB;
+			//	btRigidBody* orgBodyA = m_tmpSolverBodyPool[sbA].m_originalBody;
+				btRigidBody* orgBodyB = m_tmpSolverBodyPool[sbB].m_originalBody;
+				const unsigned int infom =  row__ < m_tmpSolverNonContactConstraintPool.size() ? m_tmpConstraintSizesPool[jj].m_numConstraintRows : numContactRows;
+				const btScalar *JinvMrow = JinvM + 2*8*(size_t)row__;
+				const btScalar *Jrow = Jptr + 2*8*(size_t)row__;
+				m_A.multiply2_p8r (JinvMrow, Jrow, infom, infom, row__,row__);
+				if (orgBodyB) 
+				{
+					m_A.multiplyAdd2_p8r (JinvMrow + 8*(size_t)infom, Jrow + 8*(size_t)infom, infom, infom,  row__,row__);
+				}
+				row__ += infom;
+				jj++;
+			}
+		}
+	}
+	if (1)
+	{
+		// add cfm to the diagonal of m_A
+		for ( int i=0; i<m_A.rows(); ++i) 
+		{
+			m_A.setElem(i,i,m_A(i,i)+ infoGlobal.m_globalCfm/ infoGlobal.m_timeStep);
+		}
+	}
+	///fill the upper triangle of the matrix, to make it symmetric
+	{
+		BT_PROFILE("fill the upper triangle ");
+		m_A.copyLowerToUpperTriangle();
+	}
+	{
+		BT_PROFILE("resize/init x");
+		m_x.resize(numConstraintRows);
+		m_xSplit.resize(numConstraintRows);
+		if (infoGlobal.m_solverMode&SOLVER_USE_WARMSTARTING) // warm start: begin from the impulses already stored on the constraints
+		{
+			for (int i=0;i<m_allConstraintPtrArray.size();i++)
+			{
+				const btSolverConstraint& c = *m_allConstraintPtrArray[i];
+				m_x[i]=c.m_appliedImpulse;
+				m_xSplit[i] = c.m_appliedPushImpulse;
+			}
+		} else
+		{
+			m_x.setZero();
+			m_xSplit.setZero();
+		}
+	}
+void btMLCPSolver::createMLCP(const btContactSolverInfo& infoGlobal) // builds the same system as createMLCPFast using full matrix products: m_A = J*Minv*J^T plus cfm on the diagonal
+	int numBodies = this->m_tmpSolverBodyPool.size();
+	int numConstraintRows = m_allConstraintPtrArray.size();
+	m_b.resize(numConstraintRows);
+	if (infoGlobal.m_splitImpulse)
+		m_bSplit.resize(numConstraintRows);
+	m_bSplit.setZero();
+	m_b.setZero();
+	for (int i=0;i<numConstraintRows ;i++)
+	{
+		if (m_allConstraintPtrArray[i]->m_jacDiagABInv) // NOTE(review): exact non-zero test here, createMLCPFast uses btFuzzyZero
+		{
+			m_b[i]=m_allConstraintPtrArray[i]->m_rhs/m_allConstraintPtrArray[i]->m_jacDiagABInv;
+			if (infoGlobal.m_splitImpulse)
+				m_bSplit[i] = m_allConstraintPtrArray[i]->m_rhsPenetration/m_allConstraintPtrArray[i]->m_jacDiagABInv;
+		}
+	}
+	static btMatrixXu Minv; // NOTE(review): function-local static, shared by every call of this function
+	Minv.resize(6*numBodies,6*numBodies);
+	Minv.setZero();
+	for (int i=0;i<numBodies;i++) // one 6x6 diagonal block per solver body: inverse mass, then inverse world inertia
+	{
+		const btSolverBody& rb = m_tmpSolverBodyPool[i];
+		const btVector3& invMass = rb.m_invMass;
+		setElem(Minv,i*6+0,i*6+0,invMass[0]);
+		setElem(Minv,i*6+1,i*6+1,invMass[1]);
+		setElem(Minv,i*6+2,i*6+2,invMass[2]);
+		btRigidBody* orgBody = m_tmpSolverBodyPool[i].m_originalBody;
+		for (int r=0;r<3;r++)
+			for (int c=0;c<3;c++)
+				setElem(Minv,i*6+3+r,i*6+3+c,orgBody? orgBody->getInvInertiaTensorWorld()[r][c] : 0); // zero inertia block when there is no original body
+	}
+	static btMatrixXu J;
+	J.resize(numConstraintRows,6*numBodies);
+	J.setZero();
+	m_lo.resize(numConstraintRows);
+	m_hi.resize(numConstraintRows);
+	for (int i=0;i<numConstraintRows;i++) // one Jacobian row per constraint row, 6 columns per solver body
+	{
+		m_lo[i] = m_allConstraintPtrArray[i]->m_lowerLimit;
+		m_hi[i] = m_allConstraintPtrArray[i]->m_upperLimit;
+		int bodyIndex0 = m_allConstraintPtrArray[i]->m_solverBodyIdA;
+		int bodyIndex1 = m_allConstraintPtrArray[i]->m_solverBodyIdB;
+		if (m_tmpSolverBodyPool[bodyIndex0].m_originalBody)
+		{
+			setElem(J,i,6*bodyIndex0+0,m_allConstraintPtrArray[i]->m_contactNormal1[0]);
+			setElem(J,i,6*bodyIndex0+1,m_allConstraintPtrArray[i]->m_contactNormal1[1]);
+			setElem(J,i,6*bodyIndex0+2,m_allConstraintPtrArray[i]->m_contactNormal1[2]);
+			setElem(J,i,6*bodyIndex0+3,m_allConstraintPtrArray[i]->m_relpos1CrossNormal[0]);
+			setElem(J,i,6*bodyIndex0+4,m_allConstraintPtrArray[i]->m_relpos1CrossNormal[1]);
+			setElem(J,i,6*bodyIndex0+5,m_allConstraintPtrArray[i]->m_relpos1CrossNormal[2]);
+		}
+		if (m_tmpSolverBodyPool[bodyIndex1].m_originalBody)
+		{
+			setElem(J,i,6*bodyIndex1+0,m_allConstraintPtrArray[i]->m_contactNormal2[0]);
+			setElem(J,i,6*bodyIndex1+1,m_allConstraintPtrArray[i]->m_contactNormal2[1]);
+			setElem(J,i,6*bodyIndex1+2,m_allConstraintPtrArray[i]->m_contactNormal2[2]);
+			setElem(J,i,6*bodyIndex1+3,m_allConstraintPtrArray[i]->m_relpos2CrossNormal[0]);
+			setElem(J,i,6*bodyIndex1+4,m_allConstraintPtrArray[i]->m_relpos2CrossNormal[1]);
+			setElem(J,i,6*bodyIndex1+5,m_allConstraintPtrArray[i]->m_relpos2CrossNormal[2]);
+		}
+	}
+	static btMatrixXu J_transpose;
+	J_transpose= J.transpose();
+	static btMatrixXu tmp;
+	{
+		{
+			BT_PROFILE("J*Minv");
+			tmp = J*Minv;
+		}
+		{
+			BT_PROFILE("J*tmp");
+			m_A = tmp*J_transpose;
+		}
+	}
+	if (1)
+	{
+		// add cfm to the diagonal of m_A
+		for ( int i=0; i<m_A.rows(); ++i) 
+		{
+			m_A.setElem(i,i,m_A(i,i)+ infoGlobal.m_globalCfm / infoGlobal.m_timeStep);
+		}
+	}
+	m_x.resize(numConstraintRows);
+	if (infoGlobal.m_splitImpulse)
+		m_xSplit.resize(numConstraintRows);
+//	m_x.setZero();
+	for (int i=0;i<m_allConstraintPtrArray.size();i++) // always starts from the stored impulses; SOLVER_USE_WARMSTARTING is not checked here
+	{
+		const btSolverConstraint& c = *m_allConstraintPtrArray[i];
+		m_x[i]=c.m_appliedImpulse;
+		if (infoGlobal.m_splitImpulse)
+			m_xSplit[i] = c.m_appliedPushImpulse;
+	}
+btScalar btMLCPSolver::solveGroupCacheFriendlyIterations(btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) // solves the MLCP and applies the resulting impulses, or falls back to the base-class iterations when the solve fails; always returns 0.f
+	bool result = true;
+	{
+		BT_PROFILE("solveMLCP");
+//		printf("m_A(%d,%d)\n", m_A.rows(),m_A.cols());
+		result = solveMLCP(infoGlobal);
+	}
+	//check if solution is valid, and otherwise fallback to btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyIterations
+	if (result)
+	{
+		BT_PROFILE("process MLCP results");
+		for (int i=0;i<m_allConstraintPtrArray.size();i++)
+		{
+			{
+				btSolverConstraint& c = *m_allConstraintPtrArray[i];
+				int sbA = c.m_solverBodyIdA;
+				int sbB = c.m_solverBodyIdB;
+				//btRigidBody* orgBodyA = m_tmpSolverBodyPool[sbA].m_originalBody;
+			//	btRigidBody* orgBodyB = m_tmpSolverBodyPool[sbB].m_originalBody;
+				btSolverBody& solverBodyA = m_tmpSolverBodyPool[sbA];
+				btSolverBody& solverBodyB = m_tmpSolverBodyPool[sbB];
+				{
+					btScalar deltaImpulse = m_x[i]-c.m_appliedImpulse; // only the change relative to the stored impulse is applied to the bodies
+					c.m_appliedImpulse = m_x[i];
+					solverBodyA.internalApplyImpulse(c.m_contactNormal1*solverBodyA.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
+					solverBodyB.internalApplyImpulse(c.m_contactNormal2*solverBodyB.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
+				}
+				if (infoGlobal.m_splitImpulse)
+				{
+					btScalar deltaImpulse = m_xSplit[i] - c.m_appliedPushImpulse; // same for the push impulse of the split-impulse solve
+					solverBodyA.internalApplyPushImpulse(c.m_contactNormal1*solverBodyA.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
+					solverBodyB.internalApplyPushImpulse(c.m_contactNormal2*solverBodyB.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
+					c.m_appliedPushImpulse = m_xSplit[i];
+				}
+			}
+		}
+	}
+	else
+	{
+	//	printf("m_fallback = %d\n",m_fallback);
+		m_fallback++; // counts how often the fallback was needed, see getNumFallbacks()
+		btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyIterations(bodies ,numBodies,manifoldPtr, numManifolds,constraints,numConstraints,infoGlobal,debugDrawer);
+	}
+	return 0.f;
diff --git a/src/bullet/BulletDynamics/MLCPSolvers/btMLCPSolver.h b/src/bullet/BulletDynamics/MLCPSolvers/btMLCPSolver.h
new file mode 100644
index 00000000..88d587c0
--- /dev/null
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btMLCPSolver.h
@@ -0,0 +1,83 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///original version written by Erwin Coumans, October 2013
+#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
+#include "LinearMath/btMatrixX.h"
+#include "BulletDynamics/MLCPSolvers/btMLCPSolverInterface.h"
+class btMLCPSolver : public btSequentialImpulseConstraintSolver
+	btMatrixXu m_A; // system matrix of the MLCP
+	btVectorXu m_b; // right-hand side
+	btVectorXu m_x; // solution vector (impulses), also the starting values passed to the solver
+	btVectorXu m_lo; // lower limits per constraint row
+	btVectorXu m_hi; // upper limits per constraint row
+	///when using 'split impulse' we solve two separate (M)LCPs
+	btVectorXu m_bSplit;
+	btVectorXu m_xSplit;
+	btVectorXu m_bSplit1; // NOTE(review): not referenced by the btMLCPSolver.cpp functions visible in this patch - confirm whether it is still needed
+	btVectorXu m_xSplit2; // NOTE(review): not referenced by the btMLCPSolver.cpp functions visible in this patch - confirm whether it is still needed
+	btAlignedObjectArray<int> m_limitDependencies; // per constraint row: index of the row it depends on, or -1
+	btAlignedObjectArray<btSolverConstraint*>	m_allConstraintPtrArray; // all constraint rows, in the row order of the MLCP
+	btMLCPSolverInterface* m_solver; // solver backend used by solveMLCP
+	int m_fallback; // number of times the sequential impulse fallback was used
+	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
+	virtual btScalar solveGroupCacheFriendlyIterations(btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
+	virtual void createMLCP(const btContactSolverInfo& infoGlobal);
+	virtual void createMLCPFast(const btContactSolverInfo& infoGlobal);
+	//returns true if it solves the problem successfully
+	virtual bool solveMLCP(const btContactSolverInfo& infoGlobal);
+	btMLCPSolver(	 btMLCPSolverInterface* solver);
+	virtual ~btMLCPSolver();
+	void setMLCPSolver(btMLCPSolverInterface* solver)
+	{
+		m_solver = solver;
+	}
+	int getNumFallbacks() const
+	{
+		return m_fallback;
+	}
+	void setNumFallbacks(int num)
+	{
+		m_fallback = num;
+	}
+	virtual btConstraintSolverType	getSolverType() const
+	{
+		return BT_MLCP_SOLVER;
+	}
+#endif //BT_MLCP_SOLVER_H
diff --git a/src/bullet/BulletMultiThreaded/SpuCollisionObjectWrapper.h b/src/bullet/BulletDynamics/MLCPSolvers/btMLCPSolverInterface.h
similarity index 57%
rename from src/bullet/BulletMultiThreaded/SpuCollisionObjectWrapper.h
rename to src/bullet/BulletDynamics/MLCPSolvers/btMLCPSolverInterface.h
index f90da277..25bb3f6d 100644
--- a/src/bullet/BulletMultiThreaded/SpuCollisionObjectWrapper.h
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btMLCPSolverInterface.h
@@ -1,6 +1,6 @@
 Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
@@ -12,29 +12,22 @@ subject to the following restrictions:
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
+///original version written by Erwin Coumans, October 2013
-#include "PlatformDefinitions.h"
-#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "LinearMath/btMatrixX.h"
-ATTRIBUTE_ALIGNED16(class) SpuCollisionObjectWrapper
+class btMLCPSolverInterface
-	int m_shapeType;
-	float m_margin;
-	ppu_address_t m_collisionObjectPtr;
-	SpuCollisionObjectWrapper ();
-	SpuCollisionObjectWrapper (const btCollisionObject* collisionObject);
+	virtual ~btMLCPSolverInterface()
+	{
+	}
-	int           getShapeType () const;
-	float         getCollisionMargin () const;
-	ppu_address_t getCollisionObjectPtr () const;
+	//return true if it solves the problem successfully
+	virtual bool solveMLCP(const btMatrixXu & A, const btVectorXu & b, btVectorXu& x, const btVectorXu & lo,const btVectorXu & hi,const btAlignedObjectArray<int>& limitDependency, int numIterations, bool useSparsity = true)=0;
diff --git a/src/bullet/BulletDynamics/MLCPSolvers/btPATHSolver.h b/src/bullet/BulletDynamics/MLCPSolvers/btPATHSolver.h
new file mode 100644
index 00000000..9ec31a6d
--- /dev/null
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btPATHSolver.h
@@ -0,0 +1,151 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///original version written by Erwin Coumans, October 2013
+//#define BT_USE_PATH
+#ifdef BT_USE_PATH
+extern "C" {
+#include "PATH/SimpleLCP.h"
+#include "PATH/License.h"
+#include "PATH/Error_Interface.h"
+  void __stdcall MyError(Void *data, Char *msg)
+	printf("Path Error: %s\n",msg);
+  void __stdcall MyWarning(Void *data, Char *msg)
+	printf("Path Warning: %s\n",msg);
+Error_Interface e;
+#include "btMLCPSolverInterface.h"
+#include "Dantzig/lcp.h"
+class btPathSolver : public btMLCPSolverInterface
+	btPathSolver()
+	{
+		License_SetString("2069810742&Courtesy_License&&&USR&2013&14_12_2011&1000&PATH&GEN&31_12_2013&0_0_0&0&0_0");
+		e.error_data = 0;
+		e.warning = MyWarning;
+		e.error = MyError;
+		Error_SetInterface(&e);
+	}
+	///Hand the problem to PATH's SimpleLCP and copy the result into x.
+	///A is converted to coordinate form (row index, column index, value) with 1-based indices,
+	///lo/hi are passed as the variable bounds and -b as the right-hand-side vector.
+	///@param x receives the solution when the solve succeeds
+	///@param limitDependency, numIterations, useSparsity are part of the interface signature and are not read here
+	///@return true if PATH reports MCP_Solved and no entry of the solution is NaN, false otherwise
+	virtual bool solveMLCP(const btMatrixXu & A, const btVectorXu & b, btVectorXu& x, const btVectorXu & lo,const btVectorXu & hi,const btAlignedObjectArray<int>& limitDependency, int numIterations, bool useSparsity = true)
+	{
+		MCP_Termination status;
+		int numVariables = b.rows();
+		if (0==numVariables)
+			return true;
+			/*	 - variables - the number of variables in the problem
+			- m_nnz - the number of nonzeros in the M matrix
+			- m_i - a vector of size m_nnz containing the row indices for M
+			- m_j - a vector of size m_nnz containing the column indices for M
+			- m_ij - a vector of size m_nnz containing the data for M
+			- q - a vector of size variables
+			- lb - a vector of size variables containing the lower bounds on x
+			- ub - a vector of size variables containing the upper bounds on x
+			*/
+		btAlignedObjectArray<double> values;
+		btAlignedObjectArray<int> rowIndices;
+		btAlignedObjectArray<int> colIndices;
+		for (int i=0;i<A.rows();i++)
+		{
+			for (int j=0;j<A.cols();j++)
+			{
+				if (A(i,j)!=0.f)
+				{
+					//add 1, because Path starts at 1, instead of 0
+					rowIndices.push_back(i+1);
+					colIndices.push_back(j+1);
+					values.push_back(A(i,j));
+				}
+			}
+		}
+		int numNonZero = rowIndices.size();
+		btAlignedObjectArray<double> zResult;
+		zResult.resize(numVariables);
+		btAlignedObjectArray<double> rhs;
+		btAlignedObjectArray<double> upperBounds;
+		btAlignedObjectArray<double> lowerBounds;
+		for (int i=0;i<numVariables;i++)
+		{
+			upperBounds.push_back(hi[i]);
+			lowerBounds.push_back(lo[i]);
+			rhs.push_back(-b[i]);
+		}
+		SimpleLCP(numVariables,numNonZero,&rowIndices[0],&colIndices[0],&values[0],&rhs[0],&lowerBounds[0],&upperBounds[0], &status, &zResult[0]);
+		if (status != MCP_Solved)
+		{
+			static const char* gReturnMsgs[] = {
+				"Invalid return",
+				"MCP_Solved: The problem was solved",
+				"MCP_NoProgress: A stationary point was found",
+				"MCP_MajorIterationLimit: Major iteration limit met",
+				"MCP_MinorIterationLimit: Cumulative minor iteration limit met",
+				"MCP_TimeLimit: Ran out of time",
+				"MCP_UserInterrupt: Control-C, typically",
+				"MCP_BoundError: Problem has a bound error",
+				"MCP_DomainError: Could not find starting point",
+				"MCP_Infeasible: Problem has no solution",
+				"MCP_Error: An error occurred within the code",
+				"MCP_LicenseError: License could not be found",
+				"MCP_OK"
+			};
+			//never index past the message table: an unexpected status value maps to "Invalid return"
+			const unsigned int numMsgs = (unsigned int)(sizeof(gReturnMsgs)/sizeof(gReturnMsgs[0]));
+			const unsigned int statusIndex = (unsigned int)status;
+			const char* statusMsg = (statusIndex < numMsgs) ? gReturnMsgs[statusIndex] : gReturnMsgs[0];
+			printf("ERROR: The PATH MCP solver failed: %s\n", statusMsg);// << std::endl;
+			printf("using Projected Gauss Seidel fallback\n");
+			return false;
+		} else
+		{
+			for (int i=0;i<numVariables;i++)
+			{
+				x[i] = zResult[i];
+				//check for #NAN: NaN is the only value that compares unequal to itself.
+				//Comparing x[i] against the double zResult[i] instead would also reject every
+				//value that merely gets rounded if x stores single-precision scalars.
+				if (x[i] != x[i])
+					return false;
+			}
+			return true;
+		}
+	}
+#endif //BT_USE_PATH
+#endif //BT_PATH_SOLVER_H
diff --git a/src/bullet/BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h b/src/bullet/BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h
new file mode 100644
index 00000000..77cc57c6
--- /dev/null
+++ b/src/bullet/BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h
@@ -0,0 +1,86 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///original version written by Erwin Coumans, October 2013
+#include "btMLCPSolverInterface.h"
+///This solver is mainly for debug/learning purposes: it is functionally equivalent to the btSequentialImpulseConstraintSolver solver, but much slower (it builds the full LCP matrix)
+class btSolveProjectedGaussSeidel : public btMLCPSolverInterface
+	///Run numIterations Projected Gauss-Seidel sweeps over the rows of A, updating x in place.
+	///For every row i the off-diagonal products A(i,j)*x[j] are summed, x[i] is set to
+	///(b[i] - sum) / A(i,i) and then clamped to [lo[i]*s, hi[i]*s].
+	///@param x starting guess on entry, result on exit
+	///@param limitDependency when limitDependency[i] >= 0, s is x[limitDependency[i]] (or 1 if that value is negative); otherwise s is 1
+	///@param useSparsity when true only the columns listed in A.m_rowNonZeroElements1[i] are visited
+	///@return always true
+	virtual bool solveMLCP(const btMatrixXu & A, const btVectorXu & b, btVectorXu& x, const btVectorXu & lo,const btVectorXu & hi,const btAlignedObjectArray<int>& limitDependency, int numIterations, bool useSparsity = true)
+	{
+		if (!A.rows())
+			return true;
+		//the A matrix is sparse, so compute the non-zero elements
+		A.rowComputeNonZeroElements();
+		//A is a m-n matrix, m rows, n columns
+		btAssert(A.rows() == b.rows());
+		const int numRows = A.rows();
+		for (int k = 0; k <numIterations; k++)
+		{
+			for (int i = 0; i <numRows; i++)
+			{
+				//accumulate in btScalar (not float) so a double-precision build keeps its precision
+				btScalar delta = 0.0f;
+				if (useSparsity)
+				{
+					for (int h=0;h<A.m_rowNonZeroElements1[i].size();h++)
+					{
+						const int col = A.m_rowNonZeroElements1[i][h];
+						if (col != i)//skip main diagonal
+						{
+							delta += A(i,col) * x[col];
+						}
+					}
+				} else
+				{
+					for (int col = 0; col <i; col++)
+						delta += A(i,col) * x[col];
+					for (int col = i+1; col<numRows; col++)
+						delta += A(i,col) * x[col];
+				}
+				const btScalar aDiag = A(i,i);
+				x [i] = (b [i] - delta) / aDiag;
+				btScalar s = 1.f;
+				if (limitDependency[i]>=0)
+				{
+					s = x[limitDependency[i]];
+					if (s<0)
+						s=1;
+				}
+				if (x[i]<lo[i]*s)
+					x[i]=lo[i]*s;
+				if (x[i]>hi[i]*s)
+					x[i]=hi[i]*s;
+			}
+		}
+		return true;
+	}
diff --git a/src/bullet/BulletDynamics/Vehicle/btRaycastVehicle.cpp b/src/bullet/BulletDynamics/Vehicle/btRaycastVehicle.cpp
index 5b467883..a7b16884 100644
--- a/src/bullet/BulletDynamics/Vehicle/btRaycastVehicle.cpp
+++ b/src/bullet/BulletDynamics/Vehicle/btRaycastVehicle.cpp
@@ -296,8 +296,9 @@ void btRaycastVehicle::updateVehicle( btScalar step )
 	int i=0;
 	for (i=0;i<m_wheelInfo.size();i++)
-		btScalar depth; 
-		depth = rayCast( m_wheelInfo[i]);
+		//btScalar depth; 
+		//depth = 
+		rayCast( m_wheelInfo[i]);
@@ -756,14 +757,14 @@ void* btDefaultVehicleRaycaster::castRay(const btVector3& from,const btVector3&
 	if (rayCallback.hasHit())
-		btRigidBody* body = btRigidBody::upcast(rayCallback.m_collisionObject);
+		const btRigidBody* body = btRigidBody::upcast(rayCallback.m_collisionObject);
         if (body && body->hasContactResponse())
 			result.m_hitPointInWorld = rayCallback.m_hitPointWorld;
 			result.m_hitNormalInWorld = rayCallback.m_hitNormalWorld;
 			result.m_distFraction = rayCallback.m_closestHitFraction;
-			return body;
+			return (void*)body;
 	return 0;
diff --git a/src/bullet/BulletDynamics/Vehicle/btRaycastVehicle.h b/src/bullet/BulletDynamics/Vehicle/btRaycastVehicle.h
index f59555f9..82d44c73 100644
--- a/src/bullet/BulletDynamics/Vehicle/btRaycastVehicle.h
+++ b/src/bullet/BulletDynamics/Vehicle/btRaycastVehicle.h
@@ -58,8 +58,6 @@ public:
-	btScalar	m_tau;
-	btScalar	m_damping;
 	btVehicleRaycaster*	m_vehicleRaycaster;
 	btScalar		m_pitchControl;
 	btScalar	m_steeringValue; 
diff --git a/src/bullet/BulletInverseDynamics/IDConfig.hpp b/src/bullet/BulletInverseDynamics/IDConfig.hpp
new file mode 100644
index 00000000..8e657791
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/IDConfig.hpp
@@ -0,0 +1,76 @@
+///@file Configuration for Inverse Dynamics Library,
+///	  such as choice of linear algebra library and underlying scalar type
+#ifndef IDCONFIG_HPP_
+#define IDCONFIG_HPP_
+// If true, enable jacobian calculations.
+// This adds a 3xN matrix to every body, + 2 3-Vectors.
+// so it is not advised for large systems if it is not absolutely necessary.
+// Also, this is not required for standard inverse dynamics calculations.
+// Will only work with vector math libraries that support 3xN matrices.
+// If we have a custom configuration, compile without using other parts of bullet.
+#include <cmath>
+#define BT_ID_WO_BULLET
+#define BT_ID_POW(a,b) std::pow(a,b)
+#define BT_ID_SNPRINTF snprintf
+#define BT_ID_PI M_PI
+#define BT_ID_POW(a,b) btPow(a,b)
+#define BT_ID_PI SIMD_PI
+#ifdef _WIN32
+	#define BT_ID_SNPRINTF _snprintf
+	#define BT_ID_SNPRINTF snprintf
+#endif //
+// error messages
+#include "IDErrorMessages.hpp"
+#define INVDYN_INCLUDE_HELPER_2(x) #x
+#ifndef btInverseDynamics
+#error "custom inverse dynamics config, but no custom namespace defined"
+#define BT_ID_MAX(a,b) std::max(a,b)
+#define BT_ID_MIN(a,b) std::min(a,b)
+#define btInverseDynamics btInverseDynamicsBullet3
+// Use default configuration with bullet's types
+// Use the same scalar type as rest of bullet library
+#include "LinearMath/btScalar.h"
+typedef btScalar idScalar;
+#include "LinearMath/btMinMax.h"
+#define BT_ID_MAX(a,b) btMax(a,b)
+#define BT_ID_MIN(a,b) btMin(a,b)
+// use bullet types for arrays and array indices
+#include "Bullet3Common/b3AlignedObjectArray.h"
+// this is to make it work with C++2003, otherwise we could do this:
+// template <typename T>
+// using idArray = b3AlignedObjectArray<T>;
+template <typename T>
+struct idArray {
+	typedef b3AlignedObjectArray<T> type;
+typedef int idArrayIdx;
+// use bullet's allocator functions
+#define idMalloc btAllocFunc
+#define idFree btFreeFunc
+#include "details/IDLinearMathInterface.hpp"
diff --git a/src/bullet/BulletInverseDynamics/IDConfigBuiltin.hpp b/src/bullet/BulletInverseDynamics/IDConfigBuiltin.hpp
new file mode 100644
index 00000000..130c19c6
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/IDConfigBuiltin.hpp
@@ -0,0 +1,37 @@
+///@file Configuration for Inverse Dynamics Library without external dependencies
+#define btInverseDynamics btInverseDynamicsBuiltin
+// choose double/single precision version
+typedef double idScalar;
+typedef float idScalar;
+// use std::vector for arrays
+#include <vector>
+// this is to make it work with C++2003, otherwise we could do this
+// template <typename T>
+// using idArray = std::vector<T>;
+template <typename T>
+struct idArray {
+	typedef std::vector<T> type;
+typedef std::vector<int>::size_type idArrayIdx;
+// default to standard malloc/free
+#include <cstdlib>
+#define idMalloc ::malloc
+#define idFree ::free
+// currently not aligned at all...
+#define ID_DECLARE_ALIGNED_ALLOCATOR()															 \
+	inline void* operator new(std::size_t sizeInBytes) { return idMalloc(sizeInBytes); }		   \
+	inline void operator delete(void* ptr) { idFree(ptr); }										\
+	inline void* operator new(std::size_t, void* ptr) { return ptr; }							  \
+	inline void operator delete(void*, void*) {}												   \
+	inline void* operator new[](std::size_t sizeInBytes) { return idMalloc(sizeInBytes); }		 \
+	inline void operator delete[](void* ptr) { idFree(ptr); }									  \
+	inline void* operator new[](std::size_t, void* ptr) { return ptr; }							\
+	inline void operator delete[](void*, void*) {}
+#include "details/IDMatVec.hpp"
diff --git a/src/bullet/BulletInverseDynamics/IDConfigEigen.hpp b/src/bullet/BulletInverseDynamics/IDConfigEigen.hpp
new file mode 100644
index 00000000..cbd7e8a9
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/IDConfigEigen.hpp
@@ -0,0 +1,31 @@
+///@file Configuration for Inverse Dynamics Library with Eigen
+#define btInverseDynamics btInverseDynamicsEigen
+// choose double/single precision version
+typedef double idScalar;
+typedef float idScalar;
+// use std::vector for arrays
+#include <vector>
+// this is to make it work with C++2003, otherwise we could do this
+// template <typename T>
+// using idArray = std::vector<T>;
+template <typename T>
+struct idArray {
+	typedef std::vector<T> type;
+typedef std::vector<int>::size_type idArrayIdx;
+// default to standard malloc/free
+#include <cstdlib>
+// Note on interfaces:
+// Eigen::Matrix has data(), to get c-array storage
+// HOWEVER: default storage is column-major!
+#include "Eigen/Eigen"
+#include "details/IDEigenInterface.hpp"
diff --git a/src/bullet/BulletInverseDynamics/IDErrorMessages.hpp b/src/bullet/BulletInverseDynamics/IDErrorMessages.hpp
new file mode 100644
index 00000000..a3866edc
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/IDErrorMessages.hpp
@@ -0,0 +1,34 @@
+///@file error message utility functions
+#ifndef IDUTILS_HPP_
+#define IDUTILS_HPP_
+#include <cstring>
+/// name of file being compiled, without leading path components
+#define __INVDYN_FILE_WO_DIR__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#ifndef BT_ID_WO_BULLET
+#include "Bullet3Common/b3Logging.h"
+#define error_message(...) b3Error(__VA_ARGS__)
+#define warning_message(...) b3Warning(__VA_ARGS__)
+#define id_printf(...) b3Printf(__VA_ARGS__)
+#else  // BT_ID_WO_BULLET
+#include <cstdio>
+/// print error message with file/line information
+#define error_message(...)																		 \
+	do {																						   \
+		fprintf(stderr, "[Error:%s:%d] ", __INVDYN_FILE_WO_DIR__, __LINE__);					   \
+		fprintf(stderr, __VA_ARGS__);															  \
+	} while (0)
+/// print warning message with file/line information
+#define warning_message(...)																	   \
+	do {																						   \
+		fprintf(stderr, "[Warning:%s:%d] ", __INVDYN_FILE_WO_DIR__, __LINE__);					 \
+		fprintf(stderr, __VA_ARGS__);															  \
+	} while (0)
+#define warning_message(...)																		 \
+	do {																						   \
+		fprintf(stderr, "[Warning:%s:%d] ", __INVDYN_FILE_WO_DIR__, __LINE__);					   \
+		fprintf(stderr, __VA_ARGS__);															\
+	} while (0)
+#define id_printf(...) printf(__VA_ARGS__)
+#endif  // BT_ID_WO_BULLET
diff --git a/src/bullet/BulletInverseDynamics/IDMath.cpp b/src/bullet/BulletInverseDynamics/IDMath.cpp
new file mode 100644
index 00000000..03452ca0
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/IDMath.cpp
@@ -0,0 +1,425 @@
+#include "IDMath.hpp"
+#include <cmath>
+#include <limits>
+namespace btInverseDynamics {
+static const idScalar kIsZero = 5 * std::numeric_limits<idScalar>::epsilon();
+// requirements for axis length deviation from 1.0
+// experimentally set from random euler angle rotation matrices
+static const idScalar kAxisLengthEpsilon = 10 * kIsZero;
+void setZero(vec3 &v) {
+	v(0) = 0;
+	v(1) = 0;
+	v(2) = 0;
+void setZero(vecx &v) {
+	for (int i = 0; i < v.size(); i++) {
+		v(i) = 0;
+	}
+void setZero(mat33 &m) {
+	m(0, 0) = 0;
+	m(0, 1) = 0;
+	m(0, 2) = 0;
+	m(1, 0) = 0;
+	m(1, 1) = 0;
+	m(1, 2) = 0;
+	m(2, 0) = 0;
+	m(2, 1) = 0;
+	m(2, 2) = 0;
+/// Largest absolute value found among the elements of v; 0 when v has no elements.
+idScalar maxAbs(const vecx &v) {
+	idScalar largest = 0.0;
+	for (int idx = 0; idx < v.size(); ++idx) {
+		const idScalar magnitude = std::fabs(v(idx));
+		largest = (magnitude > largest) ? magnitude : largest;
+	}
+	return largest;
+/// Largest absolute value among the three components of v.
+idScalar maxAbs(const vec3 &v) {
+	idScalar largest = 0.0;
+	for (int idx = 0; idx < 3; ++idx) {
+		const idScalar magnitude = std::fabs(v(idx));
+		largest = (magnitude > largest) ? magnitude : largest;
+	}
+	return largest;
+#if (defined BT_ID_HAVE_MAT3X)
+idScalar maxAbsMat3x(const mat3x &m) {
+    // only used for tests -- so just loop here for portability
+    idScalar result = 0.0;
+    for (idArrayIdx col = 0; col < m.cols(); col++) {
+        for (idArrayIdx row = 0; row < 3; row++) {
+            result = BT_ID_MAX(result, std::fabs(m(row, col)));
+        }
+    }
+    return result;
+/// Matrix product result = a * b of a 3x3 matrix with a 3xN matrix, written column by column.
+/// Logs an error and calls abort() when b and *result have different column counts.
+/// @param a 3x3 left factor
+/// @param b 3xN right factor
+/// @param result pre-sized 3xN output
+void mul(const mat33 &a, const mat3x &b, mat3x *result) {
+    if (b.cols() != result->cols()) {
+        // the values printed are b.cols() and result->cols(), so label them as such
+        error_message("size mismatch. b.cols()= %d, result->cols()= %d\n",
+                      static_cast<int>(b.cols()), static_cast<int>(result->cols()));
+        abort();
+    }
+    for (idArrayIdx col = 0; col < b.cols(); col++) {
+        const idScalar x = a(0,0)*b(0,col)+a(0,1)*b(1,col)+a(0,2)*b(2,col);
+        const idScalar y = a(1,0)*b(0,col)+a(1,1)*b(1,col)+a(1,2)*b(2,col);
+        const idScalar z = a(2,0)*b(0,col)+a(2,1)*b(1,col)+a(2,2)*b(2,col);
+        setMat3xElem(0, col, x, result);
+        setMat3xElem(1, col, y, result);
+        setMat3xElem(2, col, z, result);
+    }
+/// Element-wise sum of two 3xN matrices: result(row, col) = a(row, col) + b(row, col).
+/// Logs an error and calls abort() when a and b have different column counts.
+/// NOTE(review): result->cols() is not compared against a.cols() here, whereas mul() does check
+/// its output size — confirm callers always pre-size result.
+void add(const mat3x &a, const mat3x &b, mat3x *result) {
+    if (a.cols() != b.cols()) {
+        error_message("size missmatch. a.cols()= %d, b.cols()= %d\n",
+                      static_cast<int>(a.cols()), static_cast<int>(b.cols()));
+        abort();
+    }
+    for (idArrayIdx col = 0; col < b.cols(); col++) {
+        for (idArrayIdx row = 0; row < 3; row++) {
+            setMat3xElem(row, col, a(row, col) + b(row, col), result);
+        }
+    }
+/// Element-wise difference of two 3xN matrices: result(row, col) = a(row, col) - b(row, col).
+/// Logs an error and calls abort() when a and b have different column counts.
+/// NOTE(review): result->cols() is not compared against a.cols() here, whereas mul() does check
+/// its output size — confirm callers always pre-size result.
+void sub(const mat3x &a, const mat3x &b, mat3x *result) {
+    if (a.cols() != b.cols()) {
+        error_message("size missmatch. a.cols()= %d, b.cols()= %d\n",
+                      static_cast<int>(a.cols()), static_cast<int>(b.cols()));
+        abort();
+    }
+    for (idArrayIdx col = 0; col < b.cols(); col++) {
+        for (idArrayIdx row = 0; row < 3; row++) {
+            setMat3xElem(row, col, a(row, col) - b(row, col), result);
+        }
+    }
+mat33 transformX(const idScalar &alpha) {
+	mat33 T;
+	const idScalar cos_alpha = std::cos(alpha);
+	const idScalar sin_alpha = std::sin(alpha);
+	// [1  0 0]
+	// [0  c s]
+	// [0 -s c]
+	T(0, 0) = 1.0;
+	T(0, 1) = 0.0;
+	T(0, 2) = 0.0;
+	T(1, 0) = 0.0;
+	T(1, 1) = cos_alpha;
+	T(1, 2) = sin_alpha;
+	T(2, 0) = 0.0;
+	T(2, 1) = -sin_alpha;
+	T(2, 2) = cos_alpha;
+	return T;
+mat33 transformY(const idScalar &beta) {
+	mat33 T;
+	const idScalar cos_beta = std::cos(beta);
+	const idScalar sin_beta = std::sin(beta);
+	// [c 0 -s]
+	// [0 1  0]
+	// [s 0  c]
+	T(0, 0) = cos_beta;
+	T(0, 1) = 0.0;
+	T(0, 2) = -sin_beta;
+	T(1, 0) = 0.0;
+	T(1, 1) = 1.0;
+	T(1, 2) = 0.0;
+	T(2, 0) = sin_beta;
+	T(2, 1) = 0.0;
+	T(2, 2) = cos_beta;
+	return T;
+mat33 transformZ(const idScalar &gamma) {
+	mat33 T;
+	const idScalar cos_gamma = std::cos(gamma);
+	const idScalar sin_gamma = std::sin(gamma);
+	// [ c s 0]
+	// [-s c 0]
+	// [ 0 0 1]
+	T(0, 0) = cos_gamma;
+	T(0, 1) = sin_gamma;
+	T(0, 2) = 0.0;
+	T(1, 0) = -sin_gamma;
+	T(1, 1) = cos_gamma;
+	T(1, 2) = 0.0;
+	T(2, 0) = 0.0;
+	T(2, 1) = 0.0;
+	T(2, 2) = 1.0;
+	return T;
+/// Build the skew-symmetric 3x3 matrix of v:
+///   [  0   -v2   v1 ]
+///   [  v2   0   -v0 ]
+///   [ -v1   v0   0  ]
+/// Multiplying this matrix with a vector w yields the cross product v x w.
+mat33 tildeOperator(const vec3 &v) {
+	mat33 m;
+	m(0, 0) = 0.0;
+	m(0, 1) = -v(2);
+	m(0, 2) = v(1);
+	m(1, 0) = v(2);
+	m(1, 1) = 0.0;
+	m(1, 2) = -v(0);
+	m(2, 0) = -v(1);
+	m(2, 1) = v(0);
+	m(2, 2) = 0.0;
+	return m;
+void getVecMatFromDH(idScalar theta, idScalar d, idScalar a, idScalar alpha, vec3 *r, mat33 *T) {
+	const idScalar sa = std::sin(alpha);
+	const idScalar ca = std::cos(alpha);
+	const idScalar st = std::sin(theta);
+	const idScalar ct = std::cos(theta);
+	(*r)(0) = a;
+	(*r)(1) = -sa * d;
+	(*r)(2) = ca * d;
+	(*T)(0, 0) = ct;
+	(*T)(0, 1) = -st;
+	(*T)(0, 2) = 0.0;
+	(*T)(1, 0) = st * ca;
+	(*T)(1, 1) = ct * ca;
+	(*T)(1, 2) = -sa;
+	(*T)(2, 0) = st * sa;
+	(*T)(2, 1) = ct * sa;
+	(*T)(2, 2) = ca;
+/// Fill *T with the axis-angle rotation matrix built from axis and angle.
+/// The sine term is negated (s = -sin(angle)) before it enters the usual axis-angle formula.
+/// @param axis rotation axis; it is used as given and is not normalised here
+/// @param angle rotation angle in radians
+/// @param T output 3x3 matrix, every element is overwritten
+void bodyTParentFromAxisAngle(const vec3 &axis, const idScalar &angle, mat33 *T) {
+	// std:: qualified like every other trig call in this file, so the overload matching idScalar is chosen
+	const idScalar c = std::cos(angle);
+	const idScalar s = -std::sin(angle);
+	const idScalar one_m_c = 1.0 - c;
+	const idScalar &x = axis(0);
+	const idScalar &y = axis(1);
+	const idScalar &z = axis(2);
+	(*T)(0, 0) = x * x * one_m_c + c;
+	(*T)(0, 1) = x * y * one_m_c - z * s;
+	(*T)(0, 2) = x * z * one_m_c + y * s;
+	(*T)(1, 0) = x * y * one_m_c + z * s;
+	(*T)(1, 1) = y * y * one_m_c + c;
+	(*T)(1, 2) = y * z * one_m_c - x * s;
+	(*T)(2, 0) = x * z * one_m_c - y * s;
+	(*T)(2, 1) = y * z * one_m_c + x * s;
+	(*T)(2, 2) = z * z * one_m_c + c;
+/// Positive-definiteness test based on the three leading (upper-left) principal minors.
+/// @param m a 3x3 matrix
+/// @return true only if the 1x1, 2x2 and 3x3 leading minors are all strictly positive
+bool isPositiveDefinite(const mat33 &m) {
+	// test if all upper left determinants are strictly positive
+	if (m(0, 0) <= 0) {  // upper 1x1
+		return false;
+	}
+	if (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0) <= 0) {  // upper 2x2
+		return false;
+	}
+	// full 3x3 determinant: zero must be rejected as well, otherwise singular
+	// matrices would be reported as positive definite
+	if ((m(0, 0) * (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1)) -
+		 m(0, 1) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0)) +
+		 m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0))) <= 0) {
+		return false;
+	}
+	return true;
+/// Checks the three leading (upper-left) principal minors of m and returns false as soon as
+/// one of them is negative; returns true otherwise.
+/// NOTE(review): non-negative leading minors are necessary but not sufficient for positive
+/// semi-definiteness (diag(0, 0, -1) passes this test) — confirm whether callers need the
+/// full principal-minor test.
+bool isPositiveSemiDefinite(const mat33 &m) {
+	// test if all upper left determinants are non-negative
+	if (m(0, 0) < 0) {  // upper 1x1
+		return false;
+	}
+	if (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0) < 0) {  // upper 2x2
+		return false;
+	}
+	// full 3x3 determinant, expanded along the first row
+	if ((m(0, 0) * (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1)) -
+		 m(0, 1) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0)) +
+		 m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0))) < 0) {
+		return false;
+	}
+	return true;
+/// Same leading-minor test as isPositiveSemiDefinite, but each minor may be as low as
+/// -kIsZero (5 * machine epsilon of idScalar) before the matrix is rejected.
+bool isPositiveSemiDefiniteFuzzy(const mat33 &m) {
+	// test if all upper left determinants are not below -kIsZero
+	if (m(0, 0) < -kIsZero) {  // upper 1x1
+		return false;
+	}
+	if (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0) < -kIsZero) {  // upper 2x2
+		return false;
+	}
+	// full 3x3 determinant, expanded along the first row
+	if ((m(0, 0) * (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1)) -
+		 m(0, 1) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0)) +
+		 m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0))) < -kIsZero) {
+		return false;
+	}
+	return true;
+/// Determinant of a 3x3 matrix: the three "forward diagonal" products minus the three
+/// "backward diagonal" products (rule of Sarrus).
+idScalar determinant(const mat33 &m) {
+	return m(0, 0) * m(1, 1) * m(2, 2) + m(0, 1) * m(1, 2) * m(2, 0) + m(0, 2) * m(1, 0) * m(2, 1) -
+		   m(0, 2) * m(1, 1) * m(2, 0) - m(0, 0) * m(1, 2) * m(2, 1) - m(0, 1) * m(1, 0) * m(2, 2);
+bool isValidInertiaMatrix(const mat33 &I, const int index, bool has_fixed_joint) {
+	// TODO(Thomas) do we really want this?
+	//			  in cases where the inertia tensor about the center of mass is zero,
+	//			  the determinant of the inertia tensor about the joint axis is almost
+	//			  zero and can have a very small negative value.
+	if (!isPositiveSemiDefiniteFuzzy(I)) {
+		error_message("invalid inertia matrix for body %d, not positive definite "
+					  "(fixed joint)\n",
+					  index);
+		error_message("matrix is:\n"
+					  "[%.20e %.20e %.20e;\n"
+					  "%.20e %.20e %.20e;\n"
+					  "%.20e %.20e %.20e]\n",
+					  I(0, 0), I(0, 1), I(0, 2), I(1, 0), I(1, 1), I(1, 2), I(2, 0), I(2, 1),
+					  I(2, 2));
+		return false;
+	}
+	// check triangle inequality, must have I(i,i)+I(j,j)>=I(k,k)
+	if (!has_fixed_joint) {
+		if (I(0, 0) + I(1, 1) < I(2, 2)) {
+			error_message("invalid inertia tensor for body %d, I(0,0) + I(1,1) < I(2,2)\n", index);
+			error_message("matrix is:\n"
+						  "[%.20e %.20e %.20e;\n"
+						  "%.20e %.20e %.20e;\n"
+						  "%.20e %.20e %.20e]\n",
+						  I(0, 0), I(0, 1), I(0, 2), I(1, 0), I(1, 1), I(1, 2), I(2, 0), I(2, 1),
+						  I(2, 2));
+			return false;
+		}
+		if (I(0, 0) + I(1, 1) < I(2, 2)) {
+			error_message("invalid inertia tensor for body %d, I(0,0) + I(1,1) < I(2,2)\n", index);
+			error_message("matrix is:\n"
+						  "[%.20e %.20e %.20e;\n"
+						  "%.20e %.20e %.20e;\n"
+						  "%.20e %.20e %.20e]\n",
+						  I(0, 0), I(0, 1), I(0, 2), I(1, 0), I(1, 1), I(1, 2), I(2, 0), I(2, 1),
+						  I(2, 2));
+			return false;
+		}
+		if (I(1, 1) + I(2, 2) < I(0, 0)) {
+			error_message("invalid inertia tensor for body %d, I(1,1) + I(2,2) < I(0,0)\n", index);
+			error_message("matrix is:\n"
+						  "[%.20e %.20e %.20e;\n"
+						  "%.20e %.20e %.20e;\n"
+						  "%.20e %.20e %.20e]\n",
+						  I(0, 0), I(0, 1), I(0, 2), I(1, 0), I(1, 1), I(1, 2), I(2, 0), I(2, 1),
+						  I(2, 2));
+			return false;
+		}
+	}
+	// check positive/zero diagonal elements
+	for (int i = 0; i < 3; i++) {
+		if (I(i, i) < 0) {  // accept zero
+			error_message("invalid inertia tensor, I(%d,%d)= %e <0\n", i, i, I(i, i));
+			return false;
+		}
+	}
+	// check symmetry
+	if (std::fabs(I(1, 0) - I(0, 1)) > kIsZero) {
+		error_message("invalid inertia tensor for body %d I(1,0)!=I(0,1). I(1,0)-I(0,1)= "
+					  "%e\n",
+					  index, I(1, 0) - I(0, 1));
+		return false;
+	}
+	if (std::fabs(I(2, 0) - I(0, 2)) > kIsZero) {
+		error_message("invalid inertia tensor for body %d I(2,0)!=I(0,2). I(2,0)-I(0,2)= "
+					  "%e\n",
+					  index, I(2, 0) - I(0, 2));
+		return false;
+	}
+	if (std::fabs(I(1, 2) - I(2, 1)) > kIsZero) {
+		error_message("invalid inertia tensor body %d I(1,2)!=I(2,1). I(1,2)-I(2,1)= %e\n", index,
+					  I(1, 2) - I(2, 1));
+		return false;
+	}
+	return true;
+bool isValidTransformMatrix(const mat33 &m) {
+#define print_mat(x)																			   \
+	error_message("matrix is [%e, %e, %e; %e, %e, %e; %e, %e, %e]\n", x(0, 0), x(0, 1), x(0, 2),   \
+				  x(1, 0), x(1, 1), x(1, 2), x(2, 0), x(2, 1), x(2, 2))
+	// check for unit length column vectors
+	for (int i = 0; i < 3; i++) {
+		const idScalar length_minus_1 =
+			std::fabs(m(0, i) * m(0, i) + m(1, i) * m(1, i) + m(2, i) * m(2, i) - 1.0);
+		if (length_minus_1 > kAxisLengthEpsilon) {
+			error_message("Not a valid rotation matrix (column %d not unit length)\n"
+						  "column = [%.18e %.18e %.18e]\n"
+						  "length-1.0= %.18e\n",
+						  i, m(0, i), m(1, i), m(2, i), length_minus_1);
+			print_mat(m);
+			return false;
+		}
+	}
+	// check for orthogonal column vectors
+	if (std::fabs(m(0, 0) * m(0, 1) + m(1, 0) * m(1, 1) + m(2, 0) * m(2, 1)) > kAxisLengthEpsilon) {
+		error_message("Not a valid rotation matrix (columns 0 and 1 not orthogonal)\n");
+		print_mat(m);
+		return false;
+	}
+	if (std::fabs(m(0, 0) * m(0, 2) + m(1, 0) * m(1, 2) + m(2, 0) * m(2, 2)) > kAxisLengthEpsilon) {
+		error_message("Not a valid rotation matrix (columns 0 and 2 not orthogonal)\n");
+		print_mat(m);
+		return false;
+	}
+	if (std::fabs(m(0, 1) * m(0, 2) + m(1, 1) * m(1, 2) + m(2, 1) * m(2, 2)) > kAxisLengthEpsilon) {
+		error_message("Not a valid rotation matrix (columns 0 and 2 not orthogonal)\n");
+		print_mat(m);
+		return false;
+	}
+	// check determinant (rotation not reflection)
+	if (determinant(m) <= 0) {
+		error_message("Not a valid rotation matrix (determinant <=0)\n");
+		print_mat(m);
+		return false;
+	}
+	return true;
+/// True when the squared length of vector differs from 1 by less than kIsZero.
+/// The comparison is done on the squared length, so no square root is taken.
+bool isUnitVector(const vec3 &vector) {
+	return std::fabs(vector(0) * vector(0) + vector(1) * vector(1) + vector(2) * vector(2) - 1.0) <
+		   kIsZero;
+/// Extract x-y-z Euler (roll, pitch, yaw) angles from a rotation matrix.
+/// The formulas hold for rot = transformZ(yaw) * transformY(pitch) * transformX(roll), where
+///   rot(0,0) =  cos(yaw)*cos(pitch)    rot(1,0) = -sin(yaw)*cos(pitch)    rot(2,0) = sin(pitch)
+///   rot(2,1) = -cos(pitch)*sin(roll)   rot(2,2) =  cos(pitch)*cos(roll)
+/// NOTE(review): the composition order is inferred from the yaw and pitch expressions —
+/// confirm against the callers' convention.
+/// @param rot rotation matrix
+/// @return vector (roll, pitch, yaw) in radians
+vec3 rpyFromMatrix(const mat33 &rot) {
+	vec3 rpy;
+	// yaw first, because the pitch expression below depends on it
+	rpy(2) = std::atan2(-rot(1, 0), rot(0, 0));
+	// cos(yaw)*rot(0,0) - sin(yaw)*rot(1,0) == cos(pitch); this previously read the
+	// not-yet-assigned rpy(0) inside the sine
+	rpy(1) = std::atan2(rot(2, 0), std::cos(rpy(2)) * rot(0, 0) - std::sin(rpy(2)) * rot(1, 0));
+	// roll comes from the last row: -rot(2,1) / rot(2,2) == tan(roll)
+	rpy(0) = std::atan2(-rot(2, 1), rot(2, 2));
+	return rpy;
diff --git a/src/bullet/BulletInverseDynamics/IDMath.hpp b/src/bullet/BulletInverseDynamics/IDMath.hpp
new file mode 100644
index 00000000..63699712
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/IDMath.hpp
@@ -0,0 +1,98 @@
+/// @file Math utility functions used in inverse dynamics library.
+///	   Defined here as they may not be provided by the math library.
+#ifndef IDMATH_HPP_
+#define IDMATH_HPP_
+#include "IDConfig.hpp"
+namespace btInverseDynamics {
+/// set all elements to zero
+void setZero(vec3& v);
+/// set all elements to zero
+void setZero(vecx& v);
+/// set all elements to zero
+void setZero(mat33& m);
+/// return maximum absolute value
+idScalar maxAbs(const vecx& v);
+/// return maximum absolute value
+idScalar maxAbs(const vec3& v);
+#if (defined BT_ID_HAVE_MAT3X)
+idScalar maxAbsMat3x(const mat3x& m);
+void setZero(mat3x&m);
+// define math functions on mat3x here to avoid allocations in operators.
+void mul(const mat33&a, const mat3x&b, mat3x* result);
+void add(const mat3x&a, const mat3x&b, mat3x* result);
+void sub(const mat3x&a, const mat3x&b, mat3x* result);
+/// get offset vector & transform matrix from DH parameters
+/// TODO: add documentation
+void getVecMatFromDH(idScalar theta, idScalar d, idScalar a, idScalar alpha, vec3* r, mat33* T);
+/// Check if a 3x3 matrix is positive definite
+/// @param m a 3x3 matrix
+/// @return true if m>0, false otherwise
+bool isPositiveDefinite(const mat33& m);
+/// Check if a 3x3 matrix is positive semi definite
+/// @param m a 3x3 matrix
+/// @return true if m>=0, false otherwise
+bool isPositiveSemiDefinite(const mat33& m);
+/// Check if a 3x3 matrix is positive semi definite within numeric limits
+/// @param m a 3x3 matrix
+/// @return true if m>=-eps, false otherwise
+bool isPositiveSemiDefiniteFuzzy(const mat33& m);
+/// Determinant of 3x3 matrix
+/// NOTE: implemented here for portability, as determinant operation
+///	   will be implemented differently for various matrix/vector libraries
+/// @param m a 3x3 matrix
+/// @return det(m)
+idScalar determinant(const mat33& m);
+/// Test if a 3x3 matrix satisfies some properties of inertia matrices
+/// @param I a 3x3 matrix
+/// @param index body index (for error messages)
+/// @param has_fixed_joint if true, the triangle-inequality checks on the diagonal elements are skipped
+/// @return true if I satisfies inertia matrix properties, false otherwise.
+bool isValidInertiaMatrix(const mat33& I, int index, bool has_fixed_joint);
+/// Check if a 3x3 matrix is a valid transform (rotation) matrix
+/// @param m a 3x3 matrix
+/// @return true if m is a rotation matrix, false otherwise
+bool isValidTransformMatrix(const mat33& m);
+/// Transform matrix from parent to child frame,
+/// when the child frame is rotated about @param axis by @param angle
+/// (mathematically positive)
+/// @param axis the axis of rotation
+/// @param angle rotation angle
+/// @param T pointer to transform matrix
+void bodyTParentFromAxisAngle(const vec3& axis, const idScalar& angle, mat33* T);
+/// Check if this is a unit vector
+/// @param vector
+/// @return true if |vector|=1 within numeric limits
+bool isUnitVector(const vec3& vector);
+/// @input a vector in R^3
+/// @returns corresponding spin tensor
+mat33 tildeOperator(const vec3& v);
+/// @param alpha angle in radians
+/// @returns transform matrix for rotation with @param alpha about x-axis
+mat33 transformX(const idScalar& alpha);
+/// @param beta angle in radians
+/// @returns transform matrix for rotation with @param beta about y-axis
+mat33 transformY(const idScalar& beta);
+/// @param gamma angle in radians
+/// @returns transform matrix for rotation with @param gamma about z-axis
+mat33 transformZ(const idScalar& gamma);
+///calculate rpy angles (x-y-z Euler angles) from a given rotation matrix
+/// @param rot rotation matrix
+/// @returns x-y-z Euler angles
+vec3 rpyFromMatrix(const mat33&rot);
+#endif  // IDMATH_HPP_
diff --git a/src/bullet/BulletInverseDynamics/MultiBodyTree.cpp b/src/bullet/BulletInverseDynamics/MultiBodyTree.cpp
new file mode 100644
index 00000000..4235f138
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/MultiBodyTree.cpp
@@ -0,0 +1,445 @@
+#include "MultiBodyTree.hpp"
+#include <cmath>
+#include <limits>
+#include <vector>
+#include "IDMath.hpp"
+#include "details/MultiBodyTreeImpl.hpp"
+#include "details/MultiBodyTreeInitCache.hpp"
+namespace btInverseDynamics {
+	: m_is_finalized(false),
+	  m_mass_parameters_are_valid(true),
+	  m_accept_invalid_mass_parameters(false),
+	  m_impl(0x0),
+	  m_init_cache(0x0) {
+	m_init_cache = new InitCache();
+MultiBodyTree::~MultiBodyTree() {
+	delete m_impl;
+	delete m_init_cache;
+void MultiBodyTree::setAcceptInvalidMassParameters(bool flag) {
+	m_accept_invalid_mass_parameters = flag;
+bool MultiBodyTree::getAcceptInvalidMassProperties() const {
+	return m_accept_invalid_mass_parameters;
+int MultiBodyTree::getBodyOrigin(const int body_index, vec3 *world_origin) const {
+	return m_impl->getBodyOrigin(body_index, world_origin);
+int MultiBodyTree::getBodyCoM(const int body_index, vec3 *world_com) const {
+	return m_impl->getBodyCoM(body_index, world_com);
+int MultiBodyTree::getBodyTransform(const int body_index, mat33 *world_T_body) const {
+	return m_impl->getBodyTransform(body_index, world_T_body);
+int MultiBodyTree::getBodyAngularVelocity(const int body_index, vec3 *world_omega) const {
+	return m_impl->getBodyAngularVelocity(body_index, world_omega);
+int MultiBodyTree::getBodyLinearVelocity(const int body_index, vec3 *world_velocity) const {
+	return m_impl->getBodyLinearVelocity(body_index, world_velocity);
+int MultiBodyTree::getBodyLinearVelocityCoM(const int body_index, vec3 *world_velocity) const {
+	return m_impl->getBodyLinearVelocityCoM(body_index, world_velocity);
+int MultiBodyTree::getBodyAngularAcceleration(const int body_index, vec3 *world_dot_omega) const {
+	return m_impl->getBodyAngularAcceleration(body_index, world_dot_omega);
+int MultiBodyTree::getBodyLinearAcceleration(const int body_index, vec3 *world_acceleration) const {
+	return m_impl->getBodyLinearAcceleration(body_index, world_acceleration);
+int MultiBodyTree::getParentRParentBodyRef(const int body_index, vec3* r) const {
+    return m_impl->getParentRParentBodyRef(body_index, r);
+int MultiBodyTree::getBodyTParentRef(const int body_index, mat33* T) const {
+    return m_impl->getBodyTParentRef(body_index, T);
+int MultiBodyTree::getBodyAxisOfMotion(const int body_index, vec3* axis) const {
+    return m_impl->getBodyAxisOfMotion(body_index, axis);
+void MultiBodyTree::printTree() { m_impl->printTree(); }
+void MultiBodyTree::printTreeData() { m_impl->printTreeData(); }
+int MultiBodyTree::numBodies() const { return m_impl->m_num_bodies; }
+int MultiBodyTree::numDoFs() const { return m_impl->m_num_dofs; }
+int MultiBodyTree::calculateInverseDynamics(const vecx &q, const vecx &u, const vecx &dot_u,
+											vecx *joint_forces) {
+	if (false == m_is_finalized) {
+		error_message("system has not been initialized\n");
+		return -1;
+	}
+	if (-1 == m_impl->calculateInverseDynamics(q, u, dot_u, joint_forces)) {
+		error_message("error in inverse dynamics calculation\n");
+		return -1;
+	}
+	return 0;
+int MultiBodyTree::calculateMassMatrix(const vecx &q, const bool update_kinematics,
+									   const bool initialize_matrix,
+									   const bool set_lower_triangular_matrix, matxx *mass_matrix) {
+	if (false == m_is_finalized) {
+		error_message("system has not been initialized\n");
+		return -1;
+	}
+	if (-1 ==
+		m_impl->calculateMassMatrix(q, update_kinematics, initialize_matrix,
+									set_lower_triangular_matrix, mass_matrix)) {
+		error_message("error in mass matrix calculation\n");
+		return -1;
+	}
+	return 0;
+int MultiBodyTree::calculateMassMatrix(const vecx &q, matxx *mass_matrix) {
+	return calculateMassMatrix(q, true, true, true, mass_matrix);
+/// Calculate position, velocity and acceleration kinematics with gravity
+/// temporarily set to zero, so the computed accelerations do not contain
+/// gravitational terms.
+/// @param q generalized coordinates
+/// @param u generalized velocities
+/// @param dot_u time derivative of u
+/// @return 0 on success, -1 if the system is not finalized or the
+///         kinematics calculation fails
+int MultiBodyTree::calculateKinematics(const vecx& q, const vecx& u, const vecx& dot_u) {
+    // Check this first: m_impl is initialized to null and only allocated in
+    // finalize(), so it must not be dereferenced before that.
+    if (false == m_is_finalized) {
+        error_message("system has not been initialized\n");
+        return -1;
+    }
+    // temporarily set gravity to zero, to ensure we get the actual accelerations
+    vec3 world_gravity(m_impl->m_world_gravity);
+    setZero(m_impl->m_world_gravity);
+    const int result = m_impl->calculateKinematics(
+        q, u, dot_u, MultiBodyTree::MultiBodyImpl::POSITION_VELOCITY_ACCELERATION);
+    // restore the user's gravity on every path, including the error path
+    m_impl->m_world_gravity = world_gravity;
+    if (-1 == result) {
+        error_message("error in kinematics calculation\n");
+        return -1;
+    }
+    return 0;
+/// Calculate position kinematics only.
+/// @param q generalized coordinates
+/// @return 0 on success, -1 if the system is not finalized or the
+///         kinematics calculation fails
+int MultiBodyTree::calculatePositionKinematics(const vecx& q) {
+	if (false == m_is_finalized) {
+		error_message("system has not been initialized\n");
+		return -1;
+	}
+	// q is passed in place of u and dot_u only to satisfy the signature.
+	// Request a position-only update (as calculateJacobians(q) does), so that
+	// no velocity terms are derived from the coordinate vector.
+	// NOTE(review): presumably u/dot_u are not evaluated for POSITION_ONLY -
+	// confirm against MultiBodyImpl::calculateKinematics.
+	if (-1 == m_impl->calculateKinematics(q, q, q,
+                                              MultiBodyTree::MultiBodyImpl::POSITION_ONLY)) {
+		error_message("error in kinematics calculation\n");
+		return -1;
+	}
+	return 0;
+int MultiBodyTree::calculatePositionAndVelocityKinematics(const vecx& q, const vecx& u) {
+	if (false == m_is_finalized) {
+		error_message("system has not been initialized\n");
+		return -1;
+	}
+	if (-1 == m_impl->calculateKinematics(q, u, u,
+                                              MultiBodyTree::MultiBodyImpl::POSITION_VELOCITY)) {
+		error_message("error in kinematics calculation\n");
+		return -1;
+	}
+	return 0;
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+int MultiBodyTree::calculateJacobians(const vecx& q, const vecx& u) {
+    if (false == m_is_finalized) {
+        error_message("system has not been initialized\n");
+        return -1;
+    }
+    if (-1 == m_impl->calculateJacobians(q, u,
+                                         MultiBodyTree::MultiBodyImpl::POSITION_VELOCITY)) {
+        error_message("error in jacobian calculation\n");
+        return -1;
+    }
+    return 0;
+int MultiBodyTree::calculateJacobians(const vecx& q){
+    if (false == m_is_finalized) {
+        error_message("system has not been initialized\n");
+        return -1;
+    }
+    if (-1 == m_impl->calculateJacobians(q, q,
+                                         MultiBodyTree::MultiBodyImpl::POSITION_ONLY)) {
+        error_message("error in jacobian calculation\n");
+        return -1;
+    }
+    return 0;
+int MultiBodyTree::getBodyDotJacobianTransU(const int body_index, vec3* world_dot_jac_trans_u) const {
+    return m_impl->getBodyDotJacobianTransU(body_index,world_dot_jac_trans_u);
+int MultiBodyTree::getBodyDotJacobianRotU(const int body_index, vec3* world_dot_jac_rot_u) const {
+    return m_impl->getBodyDotJacobianRotU(body_index,world_dot_jac_rot_u);
+int MultiBodyTree::getBodyJacobianTrans(const int body_index, mat3x* world_jac_trans) const {
+    return m_impl->getBodyJacobianTrans(body_index,world_jac_trans);
+int MultiBodyTree::getBodyJacobianRot(const int body_index, mat3x* world_jac_rot) const  {
+    return m_impl->getBodyJacobianRot(body_index,world_jac_rot);
+/// Validate the description of a body and add it to the initialization cache.
+/// Checks performed here, in this order:
+///  - body_index must not be negative
+///  - for REVOLUTE and PRISMATIC joints the axis of motion is normalized if it
+///    is not a unit vector (a warning is printed); a too short axis is an error
+///  - joint_type must be one of REVOLUTE, PRISMATIC, FIXED, FLOATING
+///  - negative mass or an invalid inertia matrix mark the mass parameters as
+///    invalid; this is an error unless invalid mass parameters are accepted
+///  - body_T_parent_ref must be a valid transform matrix
+/// @return -1 if one of the checks fails, otherwise the return value of
+///         InitCache::addBody
+int MultiBodyTree::addBody(int body_index, int parent_index, JointType joint_type,
+						   const vec3 &parent_r_parent_body_ref, const mat33 &body_T_parent_ref,
+						   const vec3 &body_axis_of_motion_, idScalar mass,
+						   const vec3 &body_r_body_com, const mat33 &body_I_body,
+						   const int user_int, void *user_ptr) {
+	// index 0 is valid, only negative indices are rejected
+	if (body_index < 0) {
+		error_message("body index must be non-negative (got %d)\n", body_index);
+		return -1;
+	}
+	// local copy, since the axis may have to be normalized below
+	vec3 body_axis_of_motion(body_axis_of_motion_);
+	switch (joint_type) {
+		case REVOLUTE:
+		case PRISMATIC:
+			// check if axis is unit vector
+			if (!isUnitVector(body_axis_of_motion)) {
+				warning_message(
+					"axis of motion not a unit axis ([%f %f %f]), will use normalized vector\n",
+					body_axis_of_motion(0), body_axis_of_motion(1), body_axis_of_motion(2));
+				idScalar length = std::sqrt(std::pow(body_axis_of_motion(0), 2) +
+											std::pow(body_axis_of_motion(1), 2) +
+											std::pow(body_axis_of_motion(2), 2));
+				// refuse to normalize vectors that are too short to divide by
+				if (length < std::sqrt(std::numeric_limits<idScalar>::min())) {
+					error_message("axis of motion vector too short (%e)\n", length);
+					return -1;
+				}
+				body_axis_of_motion = (1.0 / length) * body_axis_of_motion;
+			}
+			break;
+		case FIXED:
+			break;
+		case FLOATING:
+			break;
+		default:
+			error_message("unknown joint type %d\n", joint_type);
+			return -1;
+	}
+	// sanity check for mass properties. Zero mass is OK.
+	if (mass < 0) {
+		m_mass_parameters_are_valid = false;
+		error_message("Body %d has invalid mass %e\n", body_index, mass);
+		if (!m_accept_invalid_mass_parameters) {
+			return -1;
+		}
+	}
+	// the last argument tells the check whether the body has a fixed joint
+	if (!isValidInertiaMatrix(body_I_body, body_index, FIXED == joint_type)) {
+		m_mass_parameters_are_valid = false;
+		// error message printed in function call
+		if (!m_accept_invalid_mass_parameters) {
+			return -1;
+		}
+	}
+	if (!isValidTransformMatrix(body_T_parent_ref)) {
+		return -1;
+	}
+	// hand over the (possibly normalized) data to the initialization cache
+	return m_init_cache->addBody(body_index, parent_index, joint_type, parent_r_parent_body_ref,
+								 body_T_parent_ref, body_axis_of_motion, mass, body_r_body_com,
+								 body_I_body, user_int, user_ptr);
+int MultiBodyTree::getParentIndex(const int body_index, int *parent_index) const {
+	return m_impl->getParentIndex(body_index, parent_index);
+int MultiBodyTree::getUserInt(const int body_index, int *user_int) const {
+	return m_impl->getUserInt(body_index, user_int);
+int MultiBodyTree::getUserPtr(const int body_index, void **user_ptr) const {
+	return m_impl->getUserPtr(body_index, user_ptr);
+int MultiBodyTree::setUserInt(const int body_index, const int user_int) {
+	return m_impl->setUserInt(body_index, user_int);
+int MultiBodyTree::setUserPtr(const int body_index, void *const user_ptr) {
+	return m_impl->setUserPtr(body_index, user_ptr);
+/// Build the internal MultiBodyImpl structure from the bodies stored in the
+/// initialization cache. Call this after all bodies have been added via addBody.
+/// The tree is only marked as finalized if all steps succeed. Calling finalize
+/// again releases the previously built structure and rebuilds it.
+/// @return 0 on success, -1 on error
+int MultiBodyTree::finalize() {
+	const int &num_bodies = m_init_cache->numBodies();
+	const int &num_dofs = m_init_cache->numDoFs();
+	if (num_dofs <= 0) {
+		// only reported: the early return is disabled, so processing continues
+		error_message("Need num_dofs>=1, but num_dofs= %d\n", num_dofs);
+		//return -1;
+	}
+	// The tree must not count as finalized while it is being (re)built, so that
+	// a failure below does not leave a half-initialized system marked as usable.
+	m_is_finalized = false;
+	// 1 allocate internal MultiBody structure
+	// Release the structure of a previous finalize() call first (deleting a
+	// null pointer is a no-op); otherwise it would be leaked.
+	delete m_impl;
+	m_impl = 0x0;
+	m_impl = new MultiBodyImpl(num_bodies, num_dofs);
+	// 2 build new index set assuring index(parent) < index(child)
+	if (-1 == m_init_cache->buildIndexSets()) {
+		return -1;
+	}
+	m_init_cache->getParentIndexArray(&m_impl->m_parent_index);
+	// 3 setup internal kinematic and dynamic data
+	for (int index = 0; index < num_bodies; index++) {
+		InertiaData inertia;
+		JointData joint;
+		if (-1 == m_init_cache->getInertiaData(index, &inertia)) {
+			return -1;
+		}
+		if (-1 == m_init_cache->getJointData(index, &joint)) {
+			return -1;
+		}
+		RigidBody &rigid_body = m_impl->m_body_list[index];
+		rigid_body.m_mass = inertia.m_mass;
+		// first moment of mass: mass * center of mass
+		rigid_body.m_body_mass_com = inertia.m_mass * inertia.m_body_pos_body_com;
+		rigid_body.m_body_I_body = inertia.m_body_I_body;
+		rigid_body.m_joint_type = joint.m_type;
+		rigid_body.m_parent_pos_parent_body_ref = joint.m_parent_pos_parent_child_ref;
+		rigid_body.m_body_T_parent_ref = joint.m_child_T_parent_ref;
+		// Set joint Jacobians. Note that the dimension is always 3x1 here to avoid variable sized
+		// matrices.
+		switch (rigid_body.m_joint_type) {
+			case REVOLUTE:
+				// rotational Jacobian is the joint axis, no translational part
+				rigid_body.m_Jac_JR(0) = joint.m_child_axis_of_motion(0);
+				rigid_body.m_Jac_JR(1) = joint.m_child_axis_of_motion(1);
+				rigid_body.m_Jac_JR(2) = joint.m_child_axis_of_motion(2);
+				rigid_body.m_Jac_JT(0) = 0.0;
+				rigid_body.m_Jac_JT(1) = 0.0;
+				rigid_body.m_Jac_JT(2) = 0.0;
+				break;
+			case PRISMATIC:
+				// translational Jacobian is the joint axis, no rotational part
+				rigid_body.m_Jac_JR(0) = 0.0;
+				rigid_body.m_Jac_JR(1) = 0.0;
+				rigid_body.m_Jac_JR(2) = 0.0;
+				rigid_body.m_Jac_JT(0) = joint.m_child_axis_of_motion(0);
+				rigid_body.m_Jac_JT(1) = joint.m_child_axis_of_motion(1);
+				rigid_body.m_Jac_JT(2) = joint.m_child_axis_of_motion(2);
+				break;
+			case FIXED:
+				// NOTE/TODO: dimension really should be zero ..
+				rigid_body.m_Jac_JR(0) = 0.0;
+				rigid_body.m_Jac_JR(1) = 0.0;
+				rigid_body.m_Jac_JR(2) = 0.0;
+				rigid_body.m_Jac_JT(0) = 0.0;
+				rigid_body.m_Jac_JT(1) = 0.0;
+				rigid_body.m_Jac_JT(2) = 0.0;
+				break;
+			case FLOATING:
+				// NOTE/TODO: this is not really correct.
+				// the Jacobians should be 3x3 matrices here !
+				rigid_body.m_Jac_JR(0) = 0.0;
+				rigid_body.m_Jac_JR(1) = 0.0;
+				rigid_body.m_Jac_JR(2) = 0.0;
+				rigid_body.m_Jac_JT(0) = 0.0;
+				rigid_body.m_Jac_JT(1) = 0.0;
+				rigid_body.m_Jac_JT(2) = 0.0;
+				break;
+			default:
+				error_message("unsupported joint type %d\n", rigid_body.m_joint_type);
+				return -1;
+		}
+	}
+	// 4 assign degree of freedom indices & build per-joint-type index arrays
+	if (-1 == m_impl->generateIndexSets()) {
+		error_message("generating index sets\n");
+		return -1;
+	}
+	// 5 do some pre-computations ..
+	m_impl->calculateStaticData();
+	// 6. make sure all user forces are set to zero, as this might not happen
+	//	in the vector ctors.
+	m_impl->clearAllUserForcesAndMoments();
+	m_is_finalized = true;
+	return 0;
+int MultiBodyTree::setGravityInWorldFrame(const vec3 &gravity) {
+	return m_impl->setGravityInWorldFrame(gravity);
+int MultiBodyTree::getJointType(const int body_index, JointType *joint_type) const {
+	return m_impl->getJointType(body_index, joint_type);
+int MultiBodyTree::getJointTypeStr(const int body_index, const char **joint_type) const {
+	return m_impl->getJointTypeStr(body_index, joint_type);
+int MultiBodyTree::getDoFOffset(const int body_index, int *q_offset) const {
+	return m_impl->getDoFOffset(body_index, q_offset);
+int MultiBodyTree::setBodyMass(const int body_index, idScalar mass) {
+	return m_impl->setBodyMass(body_index, mass);
+int MultiBodyTree::setBodyFirstMassMoment(const int body_index, const vec3& first_mass_moment) {
+	return m_impl->setBodyFirstMassMoment(body_index, first_mass_moment);
+int MultiBodyTree::setBodySecondMassMoment(const int body_index, const mat33& second_mass_moment) {
+	return m_impl->setBodySecondMassMoment(body_index, second_mass_moment);
+int MultiBodyTree::getBodyMass(const int body_index, idScalar *mass) const {
+	return m_impl->getBodyMass(body_index, mass);
+int MultiBodyTree::getBodyFirstMassMoment(const int body_index, vec3 *first_mass_moment) const {
+	return m_impl->getBodyFirstMassMoment(body_index, first_mass_moment);
+int MultiBodyTree::getBodySecondMassMoment(const int body_index, mat33 *second_mass_moment) const {
+	return m_impl->getBodySecondMassMoment(body_index, second_mass_moment);
+void MultiBodyTree::clearAllUserForcesAndMoments() { m_impl->clearAllUserForcesAndMoments(); }
+int MultiBodyTree::addUserForce(const int body_index, const vec3 &body_force) {
+	return m_impl->addUserForce(body_index, body_force);
+int MultiBodyTree::addUserMoment(const int body_index, const vec3 &body_moment) {
+	return m_impl->addUserMoment(body_index, body_moment);
diff --git a/src/bullet/BulletInverseDynamics/MultiBodyTree.hpp b/src/bullet/BulletInverseDynamics/MultiBodyTree.hpp
new file mode 100644
index 00000000..d235aa6e
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/MultiBodyTree.hpp
@@ -0,0 +1,363 @@
+#include "IDConfig.hpp"
+#include "IDMath.hpp"
+namespace btInverseDynamics {
+/// Enumeration of supported joint types
+enum JointType {
+	/// no degree of freedom, moves with parent
+	FIXED = 0,
+	/// one rotational degree of freedom relative to parent
+	/// one translational degree of freedom relative to parent
+	/// six degrees of freedom relative to parent
+/// Interface class for calculating inverse dynamics for tree structured
+/// multibody systems
+/// Note on degrees of freedom
+/// The q vector contains the generalized coordinate set defining the tree's configuration.
+/// Every joint adds elements that define the corresponding link's frame pose relative to
+/// its parent. For the joint types that is:
+///	- FIXED:	 none
+///	- REVOLUTE:  angle of rotation [rad]
+///	- PRISMATIC: displacement [m]
+///	- FLOATING:  Euler x-y-z angles [rad] and displacement in body-fixed frame of parent [m]
+///				 (in that order)
+/// The u vector contains the generalized speeds, which are
+///	- FIXED:	 none
+///	- REVOLUTE:  time derivative of angle of rotation [rad/s]
+///	- PRISMATIC: time derivative of displacement [m/s]
+///	- FLOATING:  angular velocity [rad/s] (*not* time derivative of rpy angles)
+///				 and time derivative of displacement in parent frame [m/s]
+/// The q and u vectors are obtained by stacking contributions of all bodies in one
+/// vector in the order of body indices.
+/// Note on generalized forces: analogous to u, i.e.,
+///	 - FIXED:	 none
+///	 - REVOLUTE:  moment [Nm], about joint axis
+///	 - PRISMATIC: force  [N], along joint axis
+///	 - FLOATING:  moment vector [Nm] and force vector [N], both in body-fixed frame
+///				  (in that order)
+/// TODO - force element interface (friction, springs, dampers, etc)
+///	  - gears and motor inertia
+class MultiBodyTree {
+	/// The constructor.
+	/// Initialization & allocation is via addBody and finalize calls.
+	MultiBodyTree();
+	/// the destructor. This also deallocates all memory
+	~MultiBodyTree();
+	/// Add body to the system. This allocates memory and is not real-time safe.
+	/// This only adds the data to an initial cache. After all bodies have been
+	/// added,
+	/// the system is set up using the finalize call
+	/// @param body_index index of the body to be added. Must >=0, <number of bodies,
+	///		and index of parent must be < index of body
+	/// @param parent_index index of the parent body
+	///		The root of the tree has index 0 and its parent (the world frame)
+	///		is assigned index -1
+	///		the rotation and translation relative to the parent are taken as
+	///		pose of the root body relative to the world frame. Other parameters
+	///		are ignored
+	/// @param joint_type type of joint connecting the body to the parent
+	/// @param mass the mass of the body
+	/// @param body_r_body_com the center of mass of the body relative to and
+	/// described in
+	///		the body fixed frame, which is located in the joint axis connecting
+	/// the body to its parent
+	/// @param body_I_body the moment of inertia of the body w.r.t the body-fixed
+	/// frame
+	///		(ie, the reference point is the origin of the body-fixed frame and
+	/// the matrix is written
+	///		 w.r.t. those unit vectors)
+	/// @param parent_r_parent_body_ref position of joint relative to the parent
+	/// body's reference frame
+	///		for q=0, written in the parent bodies reference frame
+	/// @param body_axis_of_motion translation/rotation axis in body-fixed frame.
+	///		Ignored for joints that are not revolute or prismatic.
+	///		must be a unit vector.
+	/// @param body_T_parent_ref transform matrix from parent to body reference
+	/// frame for q=0.
+	///		This is the matrix transforming a vector represented in the
+	/// parent's reference frame into one represented
+	///		in this body's reference frame.
+	///		ie, if parent_vec is a vector in R^3 whose components are w.r.t to
+	/// the parent's reference frame,
+	///		then the same vector written w.r.t. this body's frame (for q=0) is
+	/// given by
+	///		body_vec = body_T_parent_ref * parent_vec
+	/// @param user_ptr pointer to user data
+	/// @param user_int user integer
+	/// @return 0 on success, -1 on error
+	int addBody(int body_index, int parent_index, JointType joint_type,
+				const vec3& parent_r_parent_body_ref, const mat33& body_T_parent_ref,
+				const vec3& body_axis_of_motion, idScalar mass, const vec3& body_r_body_com,
+				const mat33& body_I_body, const int user_int, void* user_ptr);
+	/// set policy for invalid mass properties
+	/// @param flag if true, invalid mass properties are accepted,
+	///		the default is false
+	void setAcceptInvalidMassParameters(bool flag);
+	/// @return the mass properties policy flag
+	bool getAcceptInvalidMassProperties() const;
+	/// build internal data structures
+	/// call this after all bodies have been added via addBody
+	/// @return 0 on success, -1 on error
+	int finalize();
+	/// pretty print ascii description of tree to stdout
+	void printTree();
+	/// print tree data to stdout
+	void printTreeData();
+	/// Calculate joint forces for given generalized state & derivatives.
+        /// This also updates kinematic terms computed in calculateKinematics.
+        /// If gravity is not set to zero, acceleration terms will contain
+        /// gravitational acceleration.
+	/// @param q generalized coordinates
+	/// @param u generalized velocities. In the general case, u=T(q)*dot(q) and dim(q)>=dim(u)
+	/// @param dot_u time derivative of u
+	/// @param joint_forces this is where the resulting joint forces will be
+	///		stored. dim(joint_forces) = dim(u)
+	/// @return 0 on success, -1 on error
+	int calculateInverseDynamics(const vecx& q, const vecx& u, const vecx& dot_u,
+								 vecx* joint_forces);
+	/// Calculate joint space mass matrix
+	/// @param q generalized coordinates
+	/// @param update_kinematics if true, kinematics are updated for the given q
+	/// @param initialize_matrix if true, initialize mass matrix with zero.
+	///		If mass_matrix is initialized to zero externally and only used
+	///		for mass matrix computations for the same system, it is safe to
+	///		set this to false.
+	/// @param set_lower_triangular_matrix if true, the lower triangular section of mass_matrix
+	///		is also populated, otherwise not.
+	/// @param mass_matrix matrix for storing the output (should be dim(q)xdim(q))
+	/// @return -1 on error, 0 on success
+	int calculateMassMatrix(const vecx& q, const bool update_kinematics,
+							const bool initialize_matrix, const bool set_lower_triangular_matrix,
+							matxx* mass_matrix);
+	/// Calculate joint space mass matrix.
+	/// This version will update kinematics, initialize all mass_matrix elements to zero and
+	/// populate all mass matrix entries.
+	/// @param q generalized coordinates
+	/// @param mass_matrix matrix for storing the output (should be dim(q)xdim(q))
+	/// @return -1 on error, 0 on success
+	int calculateMassMatrix(const vecx& q, matxx* mass_matrix);
+        /// Calculates kinematics also calculated in calculateInverseDynamics,
+        /// but not dynamics.
+        /// This function ensures that correct accelerations are computed that do not
+        /// contain gravitational acceleration terms.
+        /// Does not calculate Jacobians, but only vector quantities (positions, velocities & accelerations)
+        int calculateKinematics(const vecx& q, const vecx& u, const vecx& dot_u);
+        /// Calculate position kinematics
+        int calculatePositionKinematics(const vecx& q);
+         /// Calculate position and velocity kinematics
+        int calculatePositionAndVelocityKinematics(const vecx& q, const vecx& u);
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+        /// Calculate Jacobians (dvel/du), as well as velocity-dependent acceleration components
+        /// d(Jacobian)/dt*u
+        /// This function assumes that calculateInverseDynamics was called, or calculateKinematics,
+        /// or calculatePositionAndVelocityKinematics
+        int calculateJacobians(const vecx& q, const vecx& u);
+        /// Calculate Jacobians (dvel/du)
+        /// This function assumes that calculateInverseDynamics was called, or
+        /// one of the calculateKinematics functions
+        int calculateJacobians(const vecx& q);
+#endif // BT_ID_HAVE_MAT3X
+	/// set gravitational acceleration
+	/// the default is [0;0;-9.8] in the world frame
+	/// @param gravity the gravitational acceleration in world frame
+	/// @return 0 on success, -1 on error
+	int setGravityInWorldFrame(const vec3& gravity);
+	/// returns number of bodies in tree
+	int numBodies() const;
+	/// returns number of mechanical degrees of freedom (dimension of q-vector)
+	int numDoFs() const;
+	/// get origin of a body-fixed frame, represented in world frame
+	/// @param body_index index for frame/body
+	/// @param world_origin pointer for return data
+	/// @return 0 on success, -1 on error
+	int getBodyOrigin(const int body_index, vec3* world_origin) const;
+	/// get center of mass of a body, represented in world frame
+	/// @param body_index index for frame/body
+	/// @param world_com pointer for return data
+	/// @return 0 on success, -1 on error
+	int getBodyCoM(const int body_index, vec3* world_com) const;
+	/// get transform from a body-fixed frame to the world frame
+	/// @param body_index index for frame/body
+	/// @param world_T_body pointer for return data
+	/// @return 0 on success, -1 on error
+	int getBodyTransform(const int body_index, mat33* world_T_body) const;
+	/// get absolute angular velocity for a body, represented in the world frame
+	/// @param body_index index for frame/body
+	/// @param world_omega pointer for return data
+	/// @return 0 on success, -1 on error
+	int getBodyAngularVelocity(const int body_index, vec3* world_omega) const;
+	/// get linear velocity of a body, represented in world frame
+	/// @param body_index index for frame/body
+	/// @param world_velocity pointer for return data
+	/// @return 0 on success, -1 on error
+	int getBodyLinearVelocity(const int body_index, vec3* world_velocity) const;
+	/// get linear velocity of a body's CoM, represented in world frame
+	/// (not required for inverse dynamics, provided for convenience)
+	/// @param body_index index for frame/body
+	/// @param world_velocity pointer for return data
+	/// @return 0 on success, -1 on error
+	int getBodyLinearVelocityCoM(const int body_index, vec3* world_velocity) const;
+	/// get angular acceleration of a body, represented in world frame
+	/// @param body_index index for frame/body
+	/// @param world_dot_omega pointer for return data
+	/// @return 0 on success, -1 on error
+	int getBodyAngularAcceleration(const int body_index, vec3* world_dot_omega) const;
+	/// get linear acceleration of a body, represented in world frame
+	/// NOTE: this will include the gravitational acceleration, so the actual acceleration is
+	/// obtained by setting gravitational acceleration to zero, or subtracting it.
+	/// @param body_index index for frame/body
+	/// @param world_acceleration pointer for return data
+	/// @return 0 on success, -1 on error
+	int getBodyLinearAcceleration(const int body_index, vec3* world_acceleration) const;
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+        // get translational jacobian, in world frame (dworld_velocity/du)
+        int getBodyJacobianTrans(const int body_index, mat3x* world_jac_trans) const;
+        // get rotational jacobian, in world frame (dworld_omega/du)
+        int getBodyJacobianRot(const int body_index, mat3x* world_jac_rot) const;
+        // get product of translational jacobian derivative * generalized velocities
+        int getBodyDotJacobianTransU(const int body_index, vec3* world_dot_jac_trans_u) const;
+        // get product of rotational jacobian derivative * generalized velocities
+        int getBodyDotJacobianRotU(const int body_index, vec3* world_dot_jac_rot_u) const;
+#endif // BT_ID_HAVE_MAT3X
+	/// get the (internal) index of the parent of a body
+	/// @param body_index is the index of a body
+	/// @param parent_index pointer to where parent index will be stored
+	/// @return 0 on success, -1 on error
+	int getParentIndex(const int body_index, int* parent_index) const;
+	/// get joint type
+	/// @param body_index index of the body
+	/// @param joint_type the corresponding joint type
+	/// @return 0 on success, -1 on failure
+	int getJointType(const int body_index, JointType* joint_type) const;
+	/// get joint type as string
+	/// @param body_index index of the body
+	/// @param joint_type string naming the corresponding joint type
+	/// @return 0 on success, -1 on failure
+	int getJointTypeStr(const int body_index, const char** joint_type) const;
+    	/// get offset translation to parent body (see addBody)
+	/// @param body_index index of the body
+	/// @param r the offset translation (see above)
+	/// @return 0 on success, -1 on failure
+        int getParentRParentBodyRef(const int body_index, vec3* r) const;
+	/// get offset rotation to parent body (see addBody)
+	/// @param body_index index of the body
+	/// @param T the transform (see above)
+	/// @return 0 on success, -1 on failure
+        int getBodyTParentRef(const int body_index, mat33* T) const;
+	/// get axis of motion (see addBody)
+	/// @param body_index index of the body
+	/// @param axis the axis (see above)
+	/// @return 0 on success, -1 on failure
+        int getBodyAxisOfMotion(const int body_index, vec3* axis) const;
+	/// get offset for degrees of freedom of this body into the q-vector
+	/// @param body_index index of the body
+	/// @param q_offset offset into the q vector
+	/// @return -1 on error, 0 on success
+	int getDoFOffset(const int body_index, int* q_offset) const;
+	/// get user integer. not used by the library.
+	/// @param body_index index of the body
+	/// @param user_int   the user integer
+	/// @return 0 on success, -1 on error
+	int getUserInt(const int body_index, int* user_int) const;
+	/// get user pointer. not used by the library.
+	/// @param body_index index of the body
+	/// @param user_ptr   the user pointer
+	/// @return 0 on success, -1 on error
+	int getUserPtr(const int body_index, void** user_ptr) const;
+	/// set user integer. not used by the library.
+	/// @param body_index index of the body
+	/// @param user_int   the user integer
+	/// @return 0 on success, -1 on error
+	int setUserInt(const int body_index, const int user_int);
+	/// set user pointer. not used by the library.
+	/// @param body_index index of the body
+	/// @param user_ptr   the user pointer
+	/// @return 0 on success, -1 on error
+	int setUserPtr(const int body_index, void* const user_ptr);
+	/// set mass for a body
+	/// @param body_index index of the body
+	/// @param mass the mass to set
+	/// @return 0 on success, -1 on failure
+	int setBodyMass(const int body_index, const idScalar mass);
+	/// set first moment of mass for a body
+	/// (mass * center of mass, in body fixed frame, relative to joint)
+	/// @param body_index index of the body
+	/// @param first_mass_moment the vector to set
+	/// @return 0 on success, -1 on failure
+	int setBodyFirstMassMoment(const int body_index, const vec3& first_mass_moment);
+	/// set second moment of mass for a body
+	/// (moment of inertia, in body fixed frame, relative to joint)
+	/// @param body_index index of the body
+	/// @param second_mass_moment the inertia matrix
+	/// @return 0 on success, -1 on failure
+	int setBodySecondMassMoment(const int body_index, const mat33& second_mass_moment);
+	/// get mass for a body
+	/// @param body_index index of the body
+	/// @param mass the mass
+	/// @return 0 on success, -1 on failure
+	int getBodyMass(const int body_index, idScalar* mass) const;
+	/// get first moment of mass for a body
+	/// (mass * center of mass, in body fixed frame, relative to joint)
+	/// @param body_index index of the body
+	/// @param first_mass_moment the vector
+	/// @return 0 on success, -1 on failure
+	int getBodyFirstMassMoment(const int body_index, vec3* first_mass_moment) const;
+	/// get second moment of mass for a body
+	/// (moment of inertia, in body fixed frame, relative to joint)
+	/// @param body_index index of the body
+	/// @param second_mass_moment the inertia matrix
+	/// @return 0 on success, -1 on failure
+	int getBodySecondMassMoment(const int body_index, mat33* second_mass_moment) const;
+	/// set all user forces and moments to zero
+	void clearAllUserForcesAndMoments();
+	/// Add an external force to a body, acting at the origin of the body-fixed frame.
+	/// Calls to addUserForce are cumulative. Set the user force and moment to zero
+	/// via clearAllUserForcesAndMoments()
+	/// @param body_force the force represented in the body-fixed frame of reference
+	/// @return 0 on success, -1 on error
+	int addUserForce(const int body_index, const vec3& body_force);
+	/// Add an external moment to a body.
+	/// Calls to addUserMoment are cumulative. Set the user force and moment to zero
+	/// via clearAllUserForcesAndMoments()
+	/// @param body_moment the moment represented in the body-fixed frame of reference
+	/// @return 0 on success, -1 on error
+	int addUserMoment(const int body_index, const vec3& body_moment);
+	// flag indicating if system has been initialized
+	bool m_is_finalized;
+	// flag indicating if mass properties are physically valid
+	bool m_mass_parameters_are_valid;
+	// flag defining if unphysical mass parameters are accepted
+	bool m_accept_invalid_mass_parameters;
+	// This struct implements the inverse dynamics calculations
+	class MultiBodyImpl;
+	MultiBodyImpl* m_impl;
+	// cache data structure for initialization
+	class InitCache;
+	InitCache* m_init_cache;
+}  // namespace btInverseDynamics
diff --git a/src/bullet/BulletInverseDynamics/details/IDEigenInterface.hpp b/src/bullet/BulletInverseDynamics/details/IDEigenInterface.hpp
new file mode 100644
index 00000000..836395ce
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/details/IDEigenInterface.hpp
@@ -0,0 +1,36 @@
+#include "../IDConfig.hpp"
+namespace btInverseDynamics {
+#define BT_ID_HAVE_MAT3X
+typedef Eigen::Matrix<double, Eigen::Dynamic, 1, Eigen::DontAlign> vecx;  // double precision: dynamic-size column vector
+typedef Eigen::Matrix<double, 3, 1, Eigen::DontAlign> vec3;  // fixed 3x1 vector
+typedef Eigen::Matrix<double, 3, 3, Eigen::DontAlign> mat33;  // fixed 3x3 matrix
+typedef Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::DontAlign> matxx;  // dynamic-size matrix
+typedef Eigen::Matrix<double, 3, Eigen::Dynamic, Eigen::DontAlign> mat3x;  // matrix with 3 rows and a dynamic column count
+typedef Eigen::Matrix<float, Eigen::Dynamic, 1, Eigen::DontAlign> vecx;  // single precision set with the same names. NOTE(review): presumably an alternative to the double set chosen by a preprocessor switch not visible here -- confirm
+typedef Eigen::Matrix<float, 3, 1, Eigen::DontAlign> vec3;  // fixed 3x1 vector
+typedef Eigen::Matrix<float, 3, 3, Eigen::DontAlign> mat33;  // fixed 3x3 matrix
+typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::DontAlign> matxx;  // dynamic-size matrix
+typedef Eigen::Matrix<float, 3, Eigen::Dynamic, Eigen::DontAlign> mat3x;  // matrix with 3 rows and a dynamic column count
+inline void resize(mat3x &m, Eigen::Index size) {  // give m the dimensions 3 x size
+    m.resize(3, size);
+    m.setZero();  // all entries are zero after the call
+inline void setMatxxElem(const idArrayIdx row, const idArrayIdx col, const idScalar val, matxx*m){  // store val at (row, col) of *m
+    (*m)(row, col) = val;
+inline void setMat3xElem(const idArrayIdx row, const idArrayIdx col, const idScalar val, mat3x*m){  // store val at (row, col) of *m
+    (*m)(row, col) = val;
diff --git a/src/bullet/BulletInverseDynamics/details/IDLinearMathInterface.hpp b/src/bullet/BulletInverseDynamics/details/IDLinearMathInterface.hpp
new file mode 100644
index 00000000..cbe6e5a9
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/details/IDLinearMathInterface.hpp
@@ -0,0 +1,172 @@
+#include <cstdlib>
+#include "../IDConfig.hpp"
+#include "../../LinearMath/btMatrix3x3.h"
+#include "../../LinearMath/btVector3.h"
+#include "../../LinearMath/btMatrixX.h"
+#define BT_ID_HAVE_MAT3X
+namespace btInverseDynamics {
+class vec3;
+class vecx;
+class mat33;
+typedef btMatrixX<idScalar> matxx;
+class vec3 : public btVector3 {  // 3-vector derived from btVector3 that adds (i) element access and size()
+	vec3() : btVector3() {}
+	vec3(const btVector3& btv) { *this = btv; }  // conversion from a plain btVector3, uses operator= below
+	idScalar& operator()(int i) { return (*this)[i]; }  // mutable access to component i
+	const idScalar& operator()(int i) const { return (*this)[i]; }  // read-only access to component i
+	const int size() const { return 3; }  // number of components, always 3
+	const vec3& operator=(const btVector3& rhs) {  // assign through the btVector3 base sub-object
+		*static_cast<btVector3*>(this) = rhs;
+		return *this;
+	}
+class mat33 : public btMatrix3x3 {  // 3x3 matrix derived from btMatrix3x3 that adds (i, j) element access
+	mat33() : btMatrix3x3() {}
+	mat33(const btMatrix3x3& btm) { *this = btm; }  // conversion from a plain btMatrix3x3, uses operator= below
+	idScalar& operator()(int i, int j) { return (*this)[i][j]; }  // mutable access to entry (i, j)
+	const idScalar& operator()(int i, int j) const { return (*this)[i][j]; }  // read-only access to entry (i, j)
+	const mat33& operator=(const btMatrix3x3& rhs) {  // assign through the btMatrix3x3 base sub-object
+		*static_cast<btMatrix3x3*>(this) = rhs;
+		return *this;
+	}
+	friend mat33 operator*(const idScalar& s, const mat33& a);  // scalar * matrix
+	friend mat33 operator/(const mat33& a, const idScalar& s);  // matrix / scalar
+inline mat33 operator/(const mat33& a, const idScalar& s) { return a * (1.0 / s); }  // divides by s by multiplying with its reciprocal
+inline mat33 operator*(const idScalar& s, const mat33& a) { return a * s; }  // scalar * matrix, forwards to a * s
+class vecx : public btVectorX<idScalar> {  // dynamically sized vector derived from btVectorX<idScalar>
+	vecx(int size) : btVectorX(size) {}  // vector with size elements
+	const vecx& operator=(const btVectorX<idScalar>& rhs) {  // assign through the btVectorX base sub-object
+		*static_cast<btVectorX*>(this) = rhs;
+		return *this;
+	}
+	idScalar& operator()(int i) { return (*this)[i]; }  // mutable access to element i
+	const idScalar& operator()(int i) const { return (*this)[i]; }  // read-only access to element i
+	friend vecx operator*(const vecx& a, const idScalar& s);  // vector * scalar
+	friend vecx operator*(const idScalar& s, const vecx& a);  // scalar * vector
+	friend vecx operator+(const vecx& a, const vecx& b);  // element-wise sum
+	friend vecx operator-(const vecx& a, const vecx& b);  // element-wise difference
+	friend vecx operator/(const vecx& a, const idScalar& s);  // vector / scalar
+inline vecx operator*(const vecx& a, const idScalar& s) {  // returns a new vector with every element of a multiplied by s
+	vecx result(a.size());
+	for (int i = 0; i < result.size(); i++) {
+		result(i) = a(i) * s;
+	}
+	return result;
+inline vecx operator*(const idScalar& s, const vecx& a) { return a * s; }  // scalar * vector, forwards to a * s
+inline vecx operator+(const vecx& a, const vecx& b) {  // returns the element-wise sum a + b
+	vecx result(a.size());
+	// a size mismatch is reported through error_message() and ends the program via abort()
+	if (a.size() != b.size()) {
+		error_message("size missmatch. a.size()= %d, b.size()= %d\n", a.size(), b.size());
+		abort();
+	}
+	for (int i = 0; i < a.size(); i++) {
+		result(i) = a(i) + b(i);
+	}
+	return result;
+inline vecx operator-(const vecx& a, const vecx& b) {  // returns the element-wise difference a - b
+	vecx result(a.size());
+	// a size mismatch is reported through error_message() and ends the program via abort()
+	if (a.size() != b.size()) {
+		error_message("size missmatch. a.size()= %d, b.size()= %d\n", a.size(), b.size());
+		abort();
+	}
+	for (int i = 0; i < a.size(); i++) {
+		result(i) = a(i) - b(i);
+	}
+	return result;
+inline vecx operator/(const vecx& a, const idScalar& s) {  // returns a new vector with every element of a divided by s (s is not checked for zero)
+	vecx result(a.size());
+	for (int i = 0; i < result.size(); i++) {
+		result(i) = a(i) / s;
+	}
+	return result;
+// use btMatrixX (via the matxx typedef) to implement a matrix with 3 rows and a variable number of columns
+class mat3x : public matxx {
+    mat3x(){}
+    mat3x(const mat3x&rhs) {  // copy: take over the dimensions of rhs, then copy its entries
+        matxx::resize(rhs.rows(), rhs.cols());
+        *this = rhs;
+    }
+    mat3x(int rows, int cols): matxx(3,cols) {  // the rows argument is ignored, the row count is always 3
+    }
+    void operator=(const mat3x& rhs) {  // entry-wise copy; does not resize, so the column counts must already agree
+	if (m_cols != rhs.m_cols) {
+            error_message("size missmatch, cols= %d but rhs.cols= %d\n", cols(), rhs.cols());
+            abort();
+	}
+        for(int i=0;i<rows();i++) {
+            for(int k=0;k<cols();k++) {
+                setElem(i,k,rhs(i,k));
+            }
+        }
+    }
+    void setZero() {  // forwards to the matxx implementation
+        matxx::setZero();
+    }
+inline vec3 operator*(const mat3x& a, const vecx& b) {  // matrix-vector product of a 3 x N matrix with an N-vector
+    vec3 result;
+    if (a.cols() != b.size()) {  // dimensions must agree, otherwise report and abort
+        error_message("size missmatch. a.cols()= %d, b.size()= %d\n", a.cols(), b.size());
+        abort();
+    }
+    result(0)=0.0;
+    result(1)=0.0;
+    result(2)=0.0;
+    for(int i=0;i<b.size();i++) {  // accumulate column i of a scaled by b(i)
+        for(int k=0;k<3;k++) {
+            result(k)+=a(k,i)*b(i);
+        }
+    }
+    return result;
+inline void resize(mat3x &m, idArrayIdx size) {  // give m the dimensions 3 x size
+    m.resize(3, size);
+    m.setZero();  // all entries are zero after the call
+inline void setMatxxElem(const idArrayIdx row, const idArrayIdx col, const idScalar val, matxx*m){  // store val at (row, col) of *m via setElem
+    m->setElem(row, col, val);
+inline void setMat3xElem(const idArrayIdx row, const idArrayIdx col, const idScalar val, mat3x*m){  // store val at (row, col) of *m via setElem
+    m->setElem(row, col, val);
diff --git a/src/bullet/BulletInverseDynamics/details/IDMatVec.hpp b/src/bullet/BulletInverseDynamics/details/IDMatVec.hpp
new file mode 100644
index 00000000..4d3f6c87
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/details/IDMatVec.hpp
@@ -0,0 +1,415 @@
+/// @file Built-In Matrix-Vector functions
+#ifndef IDMATVEC_HPP_
+#define IDMATVEC_HPP_
+#include <cstdlib>
+#include "../IDConfig.hpp"
+#define BT_ID_HAVE_MAT3X
+namespace btInverseDynamics {
+class vec3;
+class vecx;
+class mat33;
+class matxx;
+class mat3x;
+/// This is a very basic implementation to enable stand-alone use of the library.
+/// The implementation is not really optimized and misses many features that you would
+/// want from a "fully featured" linear math library.
+class vec3 {  // fixed-size 3-vector stored in m_data[3]
+	idScalar& operator()(int i) { return m_data[i]; }  // mutable access to component i, index is not checked
+	const idScalar& operator()(int i) const { return m_data[i]; }  // read-only access to component i, index is not checked
+	const int size() const { return 3; }  // number of components, always 3
+	const vec3& operator=(const vec3& rhs);
+	const vec3& operator+=(const vec3& b);
+	const vec3& operator-=(const vec3& b);
+	vec3 cross(const vec3& b) const;  // cross product (*this) x b
+	idScalar dot(const vec3& b) const;  // scalar product of *this and b
+	friend vec3 operator*(const mat33& a, const vec3& b);  // matrix * vector
+	friend vec3 operator*(const vec3& a, const idScalar& s);  // vector * scalar
+	friend vec3 operator*(const idScalar& s, const vec3& a);  // scalar * vector
+	friend vec3 operator+(const vec3& a, const vec3& b);
+	friend vec3 operator-(const vec3& a, const vec3& b);
+	friend vec3 operator/(const vec3& a, const idScalar& s);  // vector / scalar
+	idScalar m_data[3];  // the three components
+class mat33 {  // fixed-size 3x3 matrix stored in m_data[9]
+	idScalar& operator()(int i, int j) { return m_data[3 * i + j]; }  // mutable access to row i, column j
+	const idScalar& operator()(int i, int j) const { return m_data[3 * i + j]; }  // read-only access to row i, column j
+	const mat33& operator=(const mat33& rhs);
+	mat33 transpose() const;  // returns the transposed matrix, *this is unchanged
+	const mat33& operator+=(const mat33& b);
+	const mat33& operator-=(const mat33& b);
+	friend mat33 operator*(const mat33& a, const mat33& b);  // matrix * matrix
+	friend vec3 operator*(const mat33& a, const vec3& b);  // matrix * vector
+	friend mat33 operator*(const mat33& a, const idScalar& s);  // matrix * scalar
+	friend mat33 operator*(const idScalar& s, const mat33& a);  // scalar * matrix
+	friend mat33 operator+(const mat33& a, const mat33& b);
+	friend mat33 operator-(const mat33& a, const mat33& b);
+	friend mat33 operator/(const mat33& a, const idScalar& s);  // matrix / scalar
+	// row-major layout, indices into m_data are [0,1,2;3,4,5;6,7,8]
+	idScalar m_data[9];
+/// Vector of idScalar whose size is fixed at construction.
+/// The element buffer is obtained from idMalloc() and released with idFree() in the destructor.
+class vecx {
+	/// Allocate storage for size elements; the elements are left uninitialized.
+	vecx(int size) : m_size(size) {
+		m_data = static_cast<idScalar*>(idMalloc(sizeof(idScalar) * size));
+	}
+	/// Deep copy. The implicit copy constructor would only duplicate the m_data pointer, so
+	/// the copy and the original would both idFree() the same buffer. The arithmetic
+	/// operators declared below return vecx by value, which makes such copies possible.
+	vecx(const vecx& rhs) : m_size(rhs.m_size) {
+		m_data = static_cast<idScalar*>(idMalloc(sizeof(idScalar) * rhs.m_size));
+		for (int i = 0; i < rhs.m_size; i++) {
+			m_data[i] = rhs.m_data[i];
+		}
+	}
+	~vecx() { idFree(m_data); }
+	/// Element-wise assignment; defined further down in this file.
+	const vecx& operator=(const vecx& rhs);
+	/// Mutable access to element i, the index is not checked.
+	idScalar& operator()(int i) { return m_data[i]; }
+	/// Read-only access to element i, the index is not checked.
+	const idScalar& operator()(int i) const { return m_data[i]; }
+	/// Number of elements.
+	const int& size() const { return m_size; }
+	friend vecx operator*(const vecx& a, const idScalar& s);
+	friend vecx operator*(const idScalar& s, const vecx& a);
+	friend vecx operator+(const vecx& a, const vecx& b);
+	friend vecx operator-(const vecx& a, const vecx& b);
+	friend vecx operator/(const vecx& a, const idScalar& s);
+	int m_size;         // number of elements in m_data
+	idScalar* m_data;   // buffer owned by this object
+class matxx {  // dynamically sized matrix. NOTE(review): owns m_data but no copy constructor/assignment is visible here -- copying would free the buffer twice, confirm it is never copied
+    matxx() {  // empty matrix without storage
+        m_data = 0x0;
+        m_cols=0;
+        m_rows=0;
+    }
+	matxx(int rows, int cols) : m_rows(rows), m_cols(cols) {  // allocates rows * cols entries, left uninitialized
+		m_data = static_cast<idScalar*>(idMalloc(sizeof(idScalar) * rows * cols));
+	}
+	~matxx() { idFree(m_data); }
+	idScalar& operator()(int row, int col) { return m_data[row * m_cols + col]; }  // mutable access, row-major storage
+	const idScalar& operator()(int row, int col) const { return m_data[row * m_cols + col]; }  // read-only access, row-major storage
+	const int& rows() const { return m_rows; }
+	const int& cols() const { return m_cols; }
+	int m_rows;  // number of rows
+	int m_cols;  // number of columns
+	idScalar* m_data;  // buffer of m_rows * m_cols entries, released in the destructor
+class mat3x {  // matrix with a fixed row count of 3 (m_rows) and a variable column count
+    mat3x() {  // empty matrix without storage
+        m_data = 0x0;
+        m_cols=0;
+    }
+    mat3x(const mat3x&rhs) {  // deep copy: allocate a buffer of the same size, then copy the entries
+        m_cols=rhs.m_cols;
+        allocate();
+        *this = rhs;
+    }
+    mat3x(int rows, int cols): m_cols(cols) {  // the rows argument is ignored, the row count is always 3
+        allocate();
+    };
+    void operator=(const mat3x& rhs) {  // entry-wise copy; does not resize, so the column counts must already agree
+	if (m_cols != rhs.m_cols) {
+            error_message("size missmatch, cols= %d but rhs.cols= %d\n", cols(), rhs.cols());
+            abort();
+	}
+        for(int i=0;i<3*m_cols;i++) {
+            m_data[i] = rhs.m_data[i];
+        }
+    }
+    ~mat3x() {
+        free();
+    }
+    idScalar& operator()(int row, int col) { return m_data[row * m_cols + col]; }  // mutable access, row-major storage
+    const idScalar& operator()(int row, int col) const { return m_data[row * m_cols + col]; }  // read-only access, row-major storage
+    int rows() const { return m_rows; }
+    const int& cols() const { return m_cols; }
+    void resize(int rows, int cols) {  // the rows argument is ignored; old contents are discarded, new entries are uninitialized
+        m_cols=cols;
+        free();
+        allocate();
+    }
+    void setZero() {  // overwrite the whole buffer with zero bytes
+        memset(m_data,0x0,sizeof(idScalar)*m_rows*m_cols);
+    }
+    // avoid operators that would allocate -- use functions sub/add/mul in IDMath.hpp instead
+    void allocate(){m_data = static_cast<idScalar*>(idMalloc(sizeof(idScalar) * m_rows * m_cols));}  // new buffer for 3 * m_cols entries
+    void free() { idFree(m_data);}  // release the buffer; m_data is not reset afterwards
+    enum {m_rows=3};  // compile-time row count
+    int m_cols;  // number of columns
+    idScalar* m_data;  // buffer of m_rows * m_cols entries
+inline void resize(mat3x &m, idArrayIdx size) {  // give m the dimensions 3 x size
+    m.resize(3, size);
+    m.setZero();  // all entries are zero after the call
+// Implementations
+inline const vec3& vec3::operator=(const vec3& rhs) {  // copy the three components of rhs
+	if (&rhs != this) {  // nothing to copy on self-assignment
+		memcpy(m_data, rhs.m_data, 3 * sizeof(idScalar));
+	}
+	return *this;
+inline vec3 vec3::cross(const vec3& b) const {  // returns the cross product (*this) x b
+	vec3 result;
+	result.m_data[0] = m_data[1] * b.m_data[2] - m_data[2] * b.m_data[1];
+	result.m_data[1] = m_data[2] * b.m_data[0] - m_data[0] * b.m_data[2];
+	result.m_data[2] = m_data[0] * b.m_data[1] - m_data[1] * b.m_data[0];
+	return result;
+inline idScalar vec3::dot(const vec3& b) const {  // returns the scalar product of *this and b
+	return m_data[0] * b.m_data[0] + m_data[1] * b.m_data[1] + m_data[2] * b.m_data[2];
+inline const mat33& mat33::operator=(const mat33& rhs) {  // copy the nine entries of rhs
+	if (&rhs != this) {  // nothing to copy on self-assignment
+		memcpy(m_data, rhs.m_data, 9 * sizeof(idScalar));
+	}
+	return *this;
+inline mat33 mat33::transpose() const {  // returns the transpose: result(i, j) = (*this)(j, i)
+	mat33 result;
+	result.m_data[0] = m_data[0];
+	result.m_data[1] = m_data[3];
+	result.m_data[2] = m_data[6];
+	result.m_data[3] = m_data[1];
+	result.m_data[4] = m_data[4];
+	result.m_data[5] = m_data[7];
+	result.m_data[6] = m_data[2];
+	result.m_data[7] = m_data[5];
+	result.m_data[8] = m_data[8];
+	return result;
+inline mat33 operator*(const mat33& a, const mat33& b) {  // 3x3 matrix product a * b, written out entry by entry
+	mat33 result;
+	result.m_data[0] =
+		a.m_data[0] * b.m_data[0] + a.m_data[1] * b.m_data[3] + a.m_data[2] * b.m_data[6];
+	result.m_data[1] =
+		a.m_data[0] * b.m_data[1] + a.m_data[1] * b.m_data[4] + a.m_data[2] * b.m_data[7];
+	result.m_data[2] =
+		a.m_data[0] * b.m_data[2] + a.m_data[1] * b.m_data[5] + a.m_data[2] * b.m_data[8];
+	result.m_data[3] =
+		a.m_data[3] * b.m_data[0] + a.m_data[4] * b.m_data[3] + a.m_data[5] * b.m_data[6];
+	result.m_data[4] =
+		a.m_data[3] * b.m_data[1] + a.m_data[4] * b.m_data[4] + a.m_data[5] * b.m_data[7];
+	result.m_data[5] =
+		a.m_data[3] * b.m_data[2] + a.m_data[4] * b.m_data[5] + a.m_data[5] * b.m_data[8];
+	result.m_data[6] =
+		a.m_data[6] * b.m_data[0] + a.m_data[7] * b.m_data[3] + a.m_data[8] * b.m_data[6];
+	result.m_data[7] =
+		a.m_data[6] * b.m_data[1] + a.m_data[7] * b.m_data[4] + a.m_data[8] * b.m_data[7];
+	result.m_data[8] =
+		a.m_data[6] * b.m_data[2] + a.m_data[7] * b.m_data[5] + a.m_data[8] * b.m_data[8];
+	return result;
+inline const mat33& mat33::operator+=(const mat33& b) {  // add b to *this entry by entry
+	for (int i = 0; i < 9; i++) {
+		m_data[i] += b.m_data[i];
+	}
+	return *this;
+inline const mat33& mat33::operator-=(const mat33& b) {  // subtract b from *this entry by entry
+	for (int i = 0; i < 9; i++) {
+		m_data[i] -= b.m_data[i];
+	}
+	return *this;
+inline vec3 operator*(const mat33& a, const vec3& b) {  // matrix-vector product: component k is row k of a times b
+	vec3 result;
+	result.m_data[0] =
+		a.m_data[0] * b.m_data[0] + a.m_data[1] * b.m_data[1] + a.m_data[2] * b.m_data[2];
+	result.m_data[1] =
+		a.m_data[3] * b.m_data[0] + a.m_data[4] * b.m_data[1] + a.m_data[5] * b.m_data[2];
+	result.m_data[2] =
+		a.m_data[6] * b.m_data[0] + a.m_data[7] * b.m_data[1] + a.m_data[8] * b.m_data[2];
+	return result;
+inline const vec3& vec3::operator+=(const vec3& b) {  // add b to *this component by component
+	for (int i = 0; i < 3; i++) {
+		m_data[i] += b.m_data[i];
+	}
+	return *this;
+inline const vec3& vec3::operator-=(const vec3& b) {  // subtract b from *this component by component
+	for (int i = 0; i < 3; i++) {
+		m_data[i] -= b.m_data[i];
+	}
+	return *this;
+inline mat33 operator*(const mat33& a, const idScalar& s) {  // returns a with every entry multiplied by s
+	mat33 result;
+	for (int i = 0; i < 9; i++) {
+		result.m_data[i] = a.m_data[i] * s;
+	}
+	return result;
+inline mat33 operator*(const idScalar& s, const mat33& a) { return a * s; }  // scalar * matrix, forwards to a * s
+inline vec3 operator*(const vec3& a, const idScalar& s) {  // returns a with every component multiplied by s
+	vec3 result;
+	for (int i = 0; i < 3; i++) {
+		result.m_data[i] = a.m_data[i] * s;
+	}
+	return result;
+inline vec3 operator*(const idScalar& s, const vec3& a) { return a * s; }  // scalar * vector, forwards to a * s
+inline mat33 operator+(const mat33& a, const mat33& b) {  // returns the entry-wise sum a + b
+	mat33 result;
+	for (int i = 0; i < 9; i++) {
+		result.m_data[i] = a.m_data[i] + b.m_data[i];
+	}
+	return result;
+inline vec3 operator+(const vec3& a, const vec3& b) {  // returns the component-wise sum a + b
+	vec3 result;
+	for (int i = 0; i < 3; i++) {
+		result.m_data[i] = a.m_data[i] + b.m_data[i];
+	}
+	return result;
+inline mat33 operator-(const mat33& a, const mat33& b) {  // returns the entry-wise difference a - b
+	mat33 result;
+	for (int i = 0; i < 9; i++) {
+		result.m_data[i] = a.m_data[i] - b.m_data[i];
+	}
+	return result;
+inline vec3 operator-(const vec3& a, const vec3& b) {  // returns the component-wise difference a - b
+	vec3 result;
+	for (int i = 0; i < 3; i++) {
+		result.m_data[i] = a.m_data[i] - b.m_data[i];
+	}
+	return result;
+inline mat33 operator/(const mat33& a, const idScalar& s) {  // returns a with every entry divided by s (s is not checked for zero)
+	mat33 result;
+	for (int i = 0; i < 9; i++) {
+		result.m_data[i] = a.m_data[i] / s;
+	}
+	return result;
+inline vec3 operator/(const vec3& a, const idScalar& s) {  // returns a with every component divided by s (s is not checked for zero)
+	vec3 result;
+	for (int i = 0; i < 3; i++) {
+		result.m_data[i] = a.m_data[i] / s;
+	}
+	return result;
+inline const vecx& vecx::operator=(const vecx& rhs) {  // element-wise copy; does not resize, so both sizes must already agree
+	if (size() != rhs.size()) {  // a size mismatch is reported and ends the program
+		error_message("size missmatch, size()= %d but rhs.size()= %d\n", size(), rhs.size());
+		abort();
+	}
+	if (&rhs != this) {  // nothing to copy on self-assignment
+		memcpy(m_data, rhs.m_data, rhs.size() * sizeof(idScalar));
+	}
+	return *this;
+inline vecx operator*(const vecx& a, const idScalar& s) {  // returns a new vector with every element of a multiplied by s
+	vecx result(a.size());
+	for (int i = 0; i < result.size(); i++) {
+		result.m_data[i] = a.m_data[i] * s;
+	}
+	return result;
+inline vecx operator*(const idScalar& s, const vecx& a) { return a * s; }  // scalar * vector, forwards to a * s
+inline vecx operator+(const vecx& a, const vecx& b) {  // returns the element-wise sum a + b
+	vecx result(a.size());
+	// a size mismatch is reported through error_message() and ends the program via abort()
+	if (a.size() != b.size()) {
+		error_message("size missmatch. a.size()= %d, b.size()= %d\n", a.size(), b.size());
+		abort();
+	}
+	for (int i = 0; i < a.size(); i++) {
+		result.m_data[i] = a.m_data[i] + b.m_data[i];
+	}
+	return result;
+inline vecx operator-(const vecx& a, const vecx& b) {  // returns the element-wise difference a - b
+	vecx result(a.size());
+	// a size mismatch is reported through error_message() and ends the program via abort()
+	if (a.size() != b.size()) {
+		error_message("size missmatch. a.size()= %d, b.size()= %d\n", a.size(), b.size());
+		abort();
+	}
+	for (int i = 0; i < a.size(); i++) {
+		result.m_data[i] = a.m_data[i] - b.m_data[i];
+	}
+	return result;
+inline vecx operator/(const vecx& a, const idScalar& s) {  // returns a new vector with every element of a divided by s (s is not checked for zero)
+	vecx result(a.size());
+	for (int i = 0; i < result.size(); i++) {
+		result.m_data[i] = a.m_data[i] / s;
+	}
+	return result;
+inline vec3 operator*(const mat3x& a, const vecx& b) {  // matrix-vector product of a 3 x N matrix with an N-vector
+    vec3 result;
+    if (a.cols() != b.size()) {  // dimensions must agree, otherwise report and abort
+        error_message("size missmatch. a.cols()= %d, b.size()= %d\n", a.cols(), b.size());
+        abort();
+    }
+    result(0)=0.0;
+    result(1)=0.0;
+    result(2)=0.0;
+    for(int i=0;i<b.size();i++) {  // accumulate column i of a scaled by b(i)
+        for(int k=0;k<3;k++) {
+            result(k)+=a(k,i)*b(i);
+        }
+    }
+    return result;
+inline void setMatxxElem(const idArrayIdx row, const idArrayIdx col, const idScalar val, matxx*m){  // store val at (row, col) of *m
+    (*m)(row, col) = val;
+inline void setMat3xElem(const idArrayIdx row, const idArrayIdx col, const idScalar val, mat3x*m){  // store val at (row, col) of *m
+    (*m)(row, col) = val;
+} // namespace btInverseDynamics
diff --git a/src/bullet/BulletInverseDynamics/details/MultiBodyTreeImpl.cpp b/src/bullet/BulletInverseDynamics/details/MultiBodyTreeImpl.cpp
new file mode 100644
index 00000000..8d9aa77e
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/details/MultiBodyTreeImpl.cpp
@@ -0,0 +1,1027 @@
+#include "MultiBodyTreeImpl.hpp"
+namespace btInverseDynamics {
+MultiBodyTree::MultiBodyImpl::MultiBodyImpl(int num_bodies_, int num_dofs_)  // sizes all per-body containers for num_bodies_ bodies
+	: m_num_bodies(num_bodies_), m_num_dofs(num_dofs_)
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+        ,m_m3x(3,m_num_dofs)
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+        resize(m_m3x,m_num_dofs);  // 3 x m_num_dofs work matrix, only built with Jacobian support
+	m_body_list.resize(num_bodies_);
+	m_parent_index.resize(num_bodies_);
+	m_child_indices.resize(num_bodies_);
+	m_user_int.resize(num_bodies_);
+	m_user_ptr.resize(num_bodies_);
+	m_world_gravity(0) = 0.0;  // default gravity vector is (0, 0, -9.8)
+	m_world_gravity(1) = 0.0;
+	m_world_gravity(2) = -9.8;
+const char *MultiBodyTree::MultiBodyImpl::jointTypeToString(const JointType &type) const {  // printable name of a joint type
+	switch (type) {
+		case FIXED:
+			return "fixed";
+		case REVOLUTE:
+			return "revolute";
+		case PRISMATIC:
+			return "prismatic";
+		case FLOATING:
+			return "floating";
+	}
+	return "error: invalid";  // reached only for a value not handled by the switch
+inline void indent(const int &level) {  // print two spaces per level, without a newline
+	for (int j = 0; j < level; j++)
+		id_printf("  ");  // one indentation step
+void MultiBodyTree::MultiBodyImpl::printTree() {  // print the body tree, starting with body 0 as root
+	id_printf("body %.2d[%s]: root\n", 0, jointTypeToString(m_body_list[0].m_joint_type));
+	printTree(0, 0);  // recursive overload prints the descendants of body 0
+void MultiBodyTree::MultiBodyImpl::printTreeData() {  // dump joint and mass parameters of every body via id_printf
+	for (idArrayIdx i = 0; i < m_body_list.size(); i++) {
+		RigidBody &body = m_body_list[i];
+		id_printf("body: %d\n", static_cast<int>(i));
+		id_printf("type: %s\n", jointTypeToString(body.m_joint_type));
+		id_printf("q_index= %d\n", body.m_q_index);
+		id_printf("Jac_JR= [%f;%f;%f]\n", body.m_Jac_JR(0), body.m_Jac_JR(1), body.m_Jac_JR(2));
+		id_printf("Jac_JT= [%f;%f;%f]\n", body.m_Jac_JT(0), body.m_Jac_JT(1), body.m_Jac_JT(2));
+		id_printf("mass = %f\n", body.m_mass);
+		id_printf("mass * com = [%f %f %f]\n", body.m_body_mass_com(0), body.m_body_mass_com(1),
+				  body.m_body_mass_com(2));
+		id_printf("I_o= [%f %f %f;\n"
+				  "	  %f %f %f;\n"
+				  "	  %f %f %f]\n",
+				  body.m_body_I_body(0, 0), body.m_body_I_body(0, 1), body.m_body_I_body(0, 2),
+				  body.m_body_I_body(1, 0), body.m_body_I_body(1, 1), body.m_body_I_body(1, 2),
+				  body.m_body_I_body(2, 0), body.m_body_I_body(2, 1), body.m_body_I_body(2, 2));
+		id_printf("parent_pos_parent_body_ref= [%f %f %f]\n", body.m_parent_pos_parent_body_ref(0),
+				  body.m_parent_pos_parent_body_ref(1), body.m_parent_pos_parent_body_ref(2));
+	}
+int MultiBodyTree::MultiBodyImpl::bodyNumDoFs(const JointType &type) const {  // number of degrees of freedom a joint of this type contributes
+	switch (type) {
+		case FIXED:
+			return 0;
+		case REVOLUTE:
+		case PRISMATIC:
+			return 1;
+		case FLOATING:
+			return 6;
+	}
+	error_message("unknown joint type %d\n", type);  // reached only for a value not handled by the switch
+	return 0;
+void MultiBodyTree::MultiBodyImpl::printTree(int index, int indentation) {  // recursively print the children of body index
+	// this is adapted from URDF2Bullet.
+	// TODO: fix this and print proper graph (similar to git log --graph)
+	int num_children = m_child_indices[index].size();
+	indentation += 2;  // children are printed two indentation steps deeper than their parent
+	int count = 0;
+	for (int i = 0; i < num_children; i++) {
+		int child_index = m_child_indices[index][i];
+		indent(indentation);
+		id_printf("body %.2d[%s]: %.2d is child no. %d (qi= %d .. %d) \n", index,
+				  jointTypeToString(m_body_list[index].m_joint_type), child_index, (count++) + 1,
+				  m_body_list[index].m_q_index,
+				  m_body_list[index].m_q_index + bodyNumDoFs(m_body_list[index].m_joint_type));
+		// descend into the subtree below this child
+		printTree(child_index, indentation);
+	}
+int MultiBodyTree::MultiBodyImpl::setGravityInWorldFrame(const vec3 &gravity) {  // store gravity in m_world_gravity
+	m_world_gravity = gravity;
+	return 0;  // always reports success
+/// Build the per-joint-type body lists, assign every body its first index into the
+/// generalized coordinate vectors (m_q_index, -1 for fixed joints) and append each
+/// body to the child list of its parent.
+/// @return 0 on success, -1 on an unsupported joint type, if the summed degrees of
+///         freedom differ from m_num_dofs, or on an invalid parent index
+int MultiBodyTree::MultiBodyImpl::generateIndexSets() {
+	// All three joint-type lists are rebuilt from scratch below, so all three start empty.
+	// The floating list used to be left out here, which would duplicate its entries if
+	// this function ran more than once on the same object.
+	m_body_revolute_list.resize(0);
+	m_body_prismatic_list.resize(0);
+	m_body_floating_list.resize(0);
+	int q_index = 0;
+	for (idArrayIdx i = 0; i < m_body_list.size(); i++) {
+		RigidBody &body = m_body_list[i];
+		body.m_q_index = -1;  // stays -1 for bodies without degrees of freedom
+		switch (body.m_joint_type) {
+			case REVOLUTE:
+				m_body_revolute_list.push_back(i);
+				body.m_q_index = q_index;
+				q_index++;
+				break;
+			case PRISMATIC:
+				m_body_prismatic_list.push_back(i);
+				body.m_q_index = q_index;
+				q_index++;
+				break;
+			case FIXED:
+				// do nothing
+				break;
+			case FLOATING:
+				m_body_floating_list.push_back(i);
+				body.m_q_index = q_index;
+				q_index += 6;  // a floating body occupies six consecutive indices
+				break;
+			default:
+				error_message("unsupported joint type %d\n", body.m_joint_type);
+				return -1;
+		}
+	}
+	// sanity check
+	if (q_index != m_num_dofs) {
+		error_message("internal error, q_index= %d but num_dofs %d\n", q_index, m_num_dofs);
+		return -1;
+	}
+	m_child_indices.resize(m_body_list.size());
+	// body 0 is skipped: it is treated as the root and is never registered as a child
+	for (idArrayIdx child = 1; child < m_parent_index.size(); child++) {
+		const int &parent = m_parent_index[child];
+		if (parent >= 0 && parent < (static_cast<int>(m_parent_index.size()) - 1)) {
+			m_child_indices[parent].push_back(child);
+		} else {
+			if (-1 == parent) {
+				// multiple bodies are directly linked to the environment, ie, not a single root
+				error_message("building index sets parent(%zu)= -1 (multiple roots)\n", child);
+			} else {
+				// should never happen
+				error_message(
+					"building index sets. parent_index[%zu]= %d, but m_parent_index.size()= %d\n",
+					child, parent, static_cast<int>(m_parent_index.size()));
+			}
+			return -1;
+		}
+	}
+	return 0;
+void MultiBodyTree::MultiBodyImpl::calculateStaticData() {  // set the per-body quantities that stay constant for the body's joint type
+	// relative kinematics that are not a function of q, u, dot_u
+	for (idArrayIdx i = 0; i < m_body_list.size(); i++) {
+		RigidBody &body = m_body_list[i];
+		switch (body.m_joint_type) {
+			case REVOLUTE:  // relative translation is constant: zero relative velocity/acceleration, reference position
+				body.m_parent_vel_rel(0) = 0;
+				body.m_parent_vel_rel(1) = 0;
+				body.m_parent_vel_rel(2) = 0;
+				body.m_parent_acc_rel(0) = 0;
+				body.m_parent_acc_rel(1) = 0;
+				body.m_parent_acc_rel(2) = 0;
+				body.m_parent_pos_parent_body = body.m_parent_pos_parent_body_ref;
+				break;
+			case PRISMATIC:  // relative rotation is constant: reference transform, zero relative angular velocity/acceleration
+				body.m_body_T_parent = body.m_body_T_parent_ref;
+				body.m_parent_Jac_JT = body.m_body_T_parent_ref.transpose() * body.m_Jac_JT;
+				body.m_body_ang_vel_rel(0) = 0;
+				body.m_body_ang_vel_rel(1) = 0;
+				body.m_body_ang_vel_rel(2) = 0;
+				body.m_body_ang_acc_rel(0) = 0;
+				body.m_body_ang_acc_rel(1) = 0;
+				body.m_body_ang_acc_rel(2) = 0;
+				break;
+			case FIXED:  // everything is constant: reference pose and zero relative rates
+				body.m_parent_pos_parent_body = body.m_parent_pos_parent_body_ref;
+				body.m_body_T_parent = body.m_body_T_parent_ref;
+				body.m_body_ang_vel_rel(0) = 0;
+				body.m_body_ang_vel_rel(1) = 0;
+				body.m_body_ang_vel_rel(2) = 0;
+				body.m_parent_vel_rel(0) = 0;
+				body.m_parent_vel_rel(1) = 0;
+				body.m_parent_vel_rel(2) = 0;
+				body.m_body_ang_acc_rel(0) = 0;
+				body.m_body_ang_acc_rel(1) = 0;
+				body.m_body_ang_acc_rel(2) = 0;
+				body.m_parent_acc_rel(0) = 0;
+				body.m_parent_acc_rel(1) = 0;
+				body.m_parent_acc_rel(2) = 0;
+				break;
+			case FLOATING:
+				// no static data
+				break;
+		}
+       // resize the jacobians to 3 x m_num_dofs & initialize them to zero.
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+        body.m_body_dot_Jac_T_u(0) = 0.0;
+        body.m_body_dot_Jac_T_u(1) = 0.0;
+        body.m_body_dot_Jac_T_u(2) = 0.0;
+        body.m_body_dot_Jac_R_u(0) = 0.0;
+        body.m_body_dot_Jac_R_u(1) = 0.0;
+        body.m_body_dot_Jac_R_u(2) = 0.0;
+        resize(body.m_body_Jac_T,m_num_dofs);
+        resize(body.m_body_Jac_R,m_num_dofs);
+        body.m_body_Jac_T.setZero();
+        body.m_body_Jac_R.setZero();
+#endif // (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+	}
+int MultiBodyTree::MultiBodyImpl::calculateInverseDynamics(const vecx &q, const vecx &u,  // fills *joint_forces for the state (q, u, dot_u); returns 0 on success, -1 on error
+														   const vecx &dot_u, vecx *joint_forces) {
+	if (q.size() != m_num_dofs || u.size() != m_num_dofs || dot_u.size() != m_num_dofs ||
+		joint_forces->size() != m_num_dofs) {
+		error_message("wrong vector dimension. system has %d DOFs,\n"
+					  "but dim(q)= %d, dim(u)= %d, dim(dot_u)= %d, dim(joint_forces)= %d\n",
+					  m_num_dofs, static_cast<int>(q.size()), static_cast<int>(u.size()),
+					  static_cast<int>(dot_u.size()), static_cast<int>(joint_forces->size()));
+		return -1;
+	}
+	// 1. relative kinematics
+        if(-1 == calculateKinematics(q,u,dot_u, POSITION_VELOCITY_ACCELERATION)) {
+            error_message("error in calculateKinematics\n");
+            return -1;
+        }
+        // 2. update contributions to equations of motion for every body.
+	for (idArrayIdx i = 0; i < m_body_list.size(); i++) {
+		RigidBody &body = m_body_list[i];
+		// 3.4 update dynamic terms (rate of change of angular & linear momentum)
+		body.m_eom_lhs_rotational =
+			body.m_body_I_body * body.m_body_ang_acc + body.m_body_mass_com.cross(body.m_body_acc) +
+			body.m_body_ang_vel.cross(body.m_body_I_body * body.m_body_ang_vel) -
+			body.m_body_moment_user;
+		body.m_eom_lhs_translational =
+			body.m_body_ang_acc.cross(body.m_body_mass_com) + body.m_mass * body.m_body_acc +
+			body.m_body_ang_vel.cross(body.m_body_ang_vel.cross(body.m_body_mass_com)) -
+			body.m_body_force_user;
+	}
+	// 3. calculate full set of forces at parent joint
+	// Not directly calculating the joint force along the free direction
+	// simplifies inclusion of fixed joints.
+	// An alternative would be to fuse bodies in a pre-processing step,
+	// but that would make changing masses online harder (eg, payload masses
+	// added with fixed joints to a gripper).
+	// Also, this enables adding zero weight bodies as a way to calculate frame poses
+	// for force elements, etc.
+	for (int body_idx = m_body_list.size() - 1; body_idx >= 0; body_idx--) {  // walks from the last body down to body 0
+		// sum of forces and moments acting on this body from its children
+		vec3 sum_f_children;
+		vec3 sum_m_children;
+		setZero(sum_f_children);
+		setZero(sum_m_children);
+		for (idArrayIdx child_list_idx = 0; child_list_idx < m_child_indices[body_idx].size();
+			 child_list_idx++) {
+			const RigidBody &child = m_body_list[m_child_indices[body_idx][child_list_idx]];
+			vec3 child_joint_force_in_this_frame =
+				child.m_body_T_parent.transpose() * child.m_force_at_joint;
+			sum_f_children -= child_joint_force_in_this_frame;
+			sum_m_children -= child.m_body_T_parent.transpose() * child.m_moment_at_joint +
+							  child.m_parent_pos_parent_body.cross(child_joint_force_in_this_frame);
+		}
+		RigidBody &body = m_body_list[body_idx];
+		body.m_force_at_joint = body.m_eom_lhs_translational - sum_f_children;
+		body.m_moment_at_joint = body.m_eom_lhs_rotational - sum_m_children;
+	}
+	// 4. Calculate Joint forces.
+	// These are the components of force_at_joint/moment_at_joint
+	// in the free directions given by Jac_JT/Jac_JR
+	// 4.1 revolute joints
+	for (idArrayIdx i = 0; i < m_body_revolute_list.size(); i++) {
+		RigidBody &body = m_body_list[m_body_revolute_list[i]];
+		// (*joint_forces)(body.m_q_index) = body.m_Jac_JR.transpose() * body.m_moment_at_joint;
+		(*joint_forces)(body.m_q_index) = body.m_Jac_JR.dot(body.m_moment_at_joint);
+	}
+	// 4.2 for prismatic joints
+	for (idArrayIdx i = 0; i < m_body_prismatic_list.size(); i++) {
+		RigidBody &body = m_body_list[m_body_prismatic_list[i]];
+		// (*joint_forces)(body.m_q_index) = body.m_Jac_JT.transpose() * body.m_force_at_joint;
+		(*joint_forces)(body.m_q_index) = body.m_Jac_JT.dot(body.m_force_at_joint);
+	}
+	// 4.3 floating bodies (6-DoF joints): three moment components followed by three force components
+	for (idArrayIdx i = 0; i < m_body_floating_list.size(); i++) {
+		RigidBody &body = m_body_list[m_body_floating_list[i]];
+		(*joint_forces)(body.m_q_index + 0) = body.m_moment_at_joint(0);
+		(*joint_forces)(body.m_q_index + 1) = body.m_moment_at_joint(1);
+		(*joint_forces)(body.m_q_index + 2) = body.m_moment_at_joint(2);
+		(*joint_forces)(body.m_q_index + 3) = body.m_force_at_joint(0);
+		(*joint_forces)(body.m_q_index + 4) = body.m_force_at_joint(1);
+		(*joint_forces)(body.m_q_index + 5) = body.m_force_at_joint(2);
+	}
+	return 0;
+int MultiBodyTree::MultiBodyImpl::calculateKinematics(const vecx &q, const vecx &u, const vecx& dot_u,
+                                                      const KinUpdateType type) {
+    	if (q.size() != m_num_dofs || u.size() != m_num_dofs || dot_u.size() != m_num_dofs ) {
+		error_message("wrong vector dimension. system has %d DOFs,\n"
+					  "but dim(q)= %d, dim(u)= %d, dim(dot_u)= %d\n",
+					  m_num_dofs, static_cast<int>(q.size()), static_cast<int>(u.size()),
+					  static_cast<int>(dot_u.size()));
+		return -1;
+	}
+            error_message("invalid type %d\n", type);
+            return -1;
+        }
+	// 1. update relative kinematics
+	// 1.1 for revolute
+	for (idArrayIdx i = 0; i < m_body_revolute_list.size(); i++) {
+		RigidBody &body = m_body_list[m_body_revolute_list[i]];
+		mat33 T;
+		bodyTParentFromAxisAngle(body.m_Jac_JR, q(body.m_q_index), &T);
+		body.m_body_T_parent = T * body.m_body_T_parent_ref;
+                if(type >= POSITION_VELOCITY) {
+                    body.m_body_ang_vel_rel = body.m_Jac_JR * u(body.m_q_index);
+                }
+                if(type >= POSITION_VELOCITY_ACCELERATION) {
+                    body.m_body_ang_acc_rel = body.m_Jac_JR * dot_u(body.m_q_index);
+                }
+	}
+	// 1.2 for prismatic
+	for (idArrayIdx i = 0; i < m_body_prismatic_list.size(); i++) {
+		RigidBody &body = m_body_list[m_body_prismatic_list[i]];
+		body.m_parent_pos_parent_body =
+			body.m_parent_pos_parent_body_ref + body.m_parent_Jac_JT * q(body.m_q_index);
+                if(type >= POSITION_VELOCITY) {
+                    body.m_parent_vel_rel =
+			body.m_body_T_parent_ref.transpose() * body.m_Jac_JT * u(body.m_q_index);
+                }
+                if(type >= POSITION_VELOCITY_ACCELERATION) {
+                    body.m_parent_acc_rel = body.m_parent_Jac_JT * dot_u(body.m_q_index);
+                }
+	}
+	// 1.3 fixed joints: nothing to do
+	// 1.4 6dof joints:
+	for (idArrayIdx i = 0; i < m_body_floating_list.size(); i++) {
+		RigidBody &body = m_body_list[m_body_floating_list[i]];
+		body.m_body_T_parent = transformZ(q(body.m_q_index + 2)) *
+							   transformY(q(body.m_q_index + 1)) * transformX(q(body.m_q_index));
+		body.m_parent_pos_parent_body(0) = q(body.m_q_index + 3);
+		body.m_parent_pos_parent_body(1) = q(body.m_q_index + 4);
+		body.m_parent_pos_parent_body(2) = q(body.m_q_index + 5);
+		body.m_parent_pos_parent_body = body.m_body_T_parent * body.m_parent_pos_parent_body;
+                if(type >= POSITION_VELOCITY) {
+                    body.m_body_ang_vel_rel(0) = u(body.m_q_index + 0);
+                    body.m_body_ang_vel_rel(1) = u(body.m_q_index + 1);
+                    body.m_body_ang_vel_rel(2) = u(body.m_q_index + 2);
+                    body.m_parent_vel_rel(0) = u(body.m_q_index + 3);
+                    body.m_parent_vel_rel(1) = u(body.m_q_index + 4);
+                    body.m_parent_vel_rel(2) = u(body.m_q_index + 5);
+                    body.m_parent_vel_rel = body.m_body_T_parent.transpose() * body.m_parent_vel_rel;
+                }
+                if(type >= POSITION_VELOCITY_ACCELERATION) {
+                    body.m_body_ang_acc_rel(0) = dot_u(body.m_q_index + 0);
+                    body.m_body_ang_acc_rel(1) = dot_u(body.m_q_index + 1);
+                    body.m_body_ang_acc_rel(2) = dot_u(body.m_q_index + 2);
+                    body.m_parent_acc_rel(0) = dot_u(body.m_q_index + 3);
+                    body.m_parent_acc_rel(1) = dot_u(body.m_q_index + 4);
+                    body.m_parent_acc_rel(2) = dot_u(body.m_q_index + 5);
+                    body.m_parent_acc_rel = body.m_body_T_parent.transpose() * body.m_parent_acc_rel;
+                }
+	}
+	// 2. absolute kinematic quantities (vector valued)
+	// NOTE: this should be optimized by specializing for different body types
+	// (e.g., relative rotation is always zero for prismatic joints, etc.)
+	// calculations for root body
+	{
+		RigidBody &body = m_body_list[0];
+		// 3.1 update absolute positions and orientations:
+		// will be required if we add force elements (eg springs between bodies,
+		// or contacts)
+		// not required right now, added here for debugging purposes
+		body.m_body_pos = body.m_body_T_parent * body.m_parent_pos_parent_body;
+		body.m_body_T_world = body.m_body_T_parent;
+                if(type >= POSITION_VELOCITY) {
+                    // 3.2 update absolute velocities
+                    body.m_body_ang_vel = body.m_body_ang_vel_rel;
+                    body.m_body_vel = body.m_parent_vel_rel;
+                }
+                if(type >= POSITION_VELOCITY_ACCELERATION) {
+                    // 3.3 update absolute accelerations
+                    // NOTE: assumption: dot(J_JR) = 0; true here, but not for general joints
+                    body.m_body_ang_acc = body.m_body_ang_acc_rel;
+                    body.m_body_acc = body.m_body_T_parent * body.m_parent_acc_rel;
+                    // add gravitational acceleration to root body
+                    // this is an efficient way to add gravitational terms,
+                    // but it does mean that the kinematics are no longer
+                    // correct at the acceleration level
+                    // NOTE: To get correct acceleration kinematics, just set world_gravity to zero
+                    body.m_body_acc = body.m_body_acc - body.m_body_T_parent * m_world_gravity;
+                }
+	}
+	for (idArrayIdx i = 1; i < m_body_list.size(); i++) {
+		RigidBody &body = m_body_list[i];
+		RigidBody &parent = m_body_list[m_parent_index[i]];
+		// 2.1 update absolute positions and orientations:
+		// will be required if we add force elements (eg springs between bodies,
+		// or contacts)  not required right now added here for debugging purposes
+		body.m_body_pos =
+			body.m_body_T_parent * (parent.m_body_pos + body.m_parent_pos_parent_body);
+		body.m_body_T_world = body.m_body_T_parent * parent.m_body_T_world;
+                if(type >= POSITION_VELOCITY) {
+                    // 2.2 update absolute velocities
+                    body.m_body_ang_vel =
+			body.m_body_T_parent * parent.m_body_ang_vel + body.m_body_ang_vel_rel;
+                    body.m_body_vel =
+			body.m_body_T_parent *
+			(parent.m_body_vel + parent.m_body_ang_vel.cross(body.m_parent_pos_parent_body) +
+			 body.m_parent_vel_rel);
+                }
+                if(type >= POSITION_VELOCITY_ACCELERATION) {
+                    // 2.3 update absolute accelerations
+                    // NOTE: assumption: dot(J_JR) = 0; true here, but not for general joints
+                    body.m_body_ang_acc =
+			body.m_body_T_parent * parent.m_body_ang_acc -
+			body.m_body_ang_vel_rel.cross(body.m_body_T_parent * parent.m_body_ang_vel) +
+			body.m_body_ang_acc_rel;
+                    body.m_body_acc =
+			body.m_body_T_parent *
+			(parent.m_body_acc + parent.m_body_ang_acc.cross(body.m_parent_pos_parent_body) +
+			 parent.m_body_ang_vel.cross(parent.m_body_ang_vel.cross(body.m_parent_pos_parent_body)) +
+			 2.0 * parent.m_body_ang_vel.cross(body.m_parent_vel_rel) + body.m_parent_acc_rel);
+                }
+	}
+    return 0;
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+void MultiBodyTree::MultiBodyImpl::addRelativeJacobianComponent(RigidBody&body) {  // writes this body's own joint columns into body.m_body_Jac_R / body.m_body_Jac_T, chosen by joint type
+    const int& idx=body.m_q_index;  // first Jacobian column that belongs to this body's joint
+    switch(body.m_joint_type) {
+    case FIXED:  // fixed joint: no column is written
+        break;
+    case REVOLUTE:  // column idx of m_body_Jac_R receives the three components of m_Jac_JR
+        setMat3xElem(0,idx, body.m_Jac_JR(0), &body.m_body_Jac_R);
+        setMat3xElem(1,idx, body.m_Jac_JR(1), &body.m_body_Jac_R);
+        setMat3xElem(2,idx, body.m_Jac_JR(2), &body.m_body_Jac_R);
+        break;
+    case PRISMATIC:  // column idx of m_body_Jac_T receives m_body_T_parent_ref.transpose() * m_Jac_JT, written out per component
+        setMat3xElem(0,idx, body.m_body_T_parent_ref(0,0)*body.m_Jac_JT(0)
+                     +body.m_body_T_parent_ref(1,0)*body.m_Jac_JT(1)
+                     +body.m_body_T_parent_ref(2,0)*body.m_Jac_JT(2),
+                     &body.m_body_Jac_T);
+        setMat3xElem(1,idx,body.m_body_T_parent_ref(0,1)*body.m_Jac_JT(0)
+                     +body.m_body_T_parent_ref(1,1)*body.m_Jac_JT(1)
+                     +body.m_body_T_parent_ref(2,1)*body.m_Jac_JT(2),
+                     &body.m_body_Jac_T);
+        setMat3xElem(2,idx, body.m_body_T_parent_ref(0,2)*body.m_Jac_JT(0)
+                     +body.m_body_T_parent_ref(1,2)*body.m_Jac_JT(1)
+                     +body.m_body_T_parent_ref(2,2)*body.m_Jac_JT(2),
+                     &body.m_body_Jac_T);
+        break;
+    case FLOATING:  // six columns: idx..idx+2 rotational, idx+3..idx+5 translational
+        setMat3xElem(0,idx+0, 1.0, &body.m_body_Jac_R);  // diagonal entries of the 3x3 rotational sub-block are set to 1.0
+        setMat3xElem(1,idx+1, 1.0, &body.m_body_Jac_R);
+        setMat3xElem(2,idx+2, 1.0, &body.m_body_Jac_R);
+        // columns idx+3..idx+5 of body_Jac_T = body_T_parent.transpose(), element (r,c) <- body_T_parent(c,r)
+        setMat3xElem(0,idx+3, body.m_body_T_parent(0,0), &body.m_body_Jac_T);
+        setMat3xElem(0,idx+4, body.m_body_T_parent(1,0), &body.m_body_Jac_T);
+        setMat3xElem(0,idx+5, body.m_body_T_parent(2,0), &body.m_body_Jac_T);
+        setMat3xElem(1,idx+3, body.m_body_T_parent(0,1), &body.m_body_Jac_T);
+        setMat3xElem(1,idx+4, body.m_body_T_parent(1,1), &body.m_body_Jac_T);
+        setMat3xElem(1,idx+5, body.m_body_T_parent(2,1), &body.m_body_Jac_T);
+        setMat3xElem(2,idx+3, body.m_body_T_parent(0,2), &body.m_body_Jac_T);
+        setMat3xElem(2,idx+4, body.m_body_T_parent(1,2), &body.m_body_Jac_T);
+        setMat3xElem(2,idx+5, body.m_body_T_parent(2,2), &body.m_body_Jac_T);
+        break;
+    }
+int MultiBodyTree::MultiBodyImpl::calculateJacobians(const vecx& q, const vecx& u, const KinUpdateType type) {  // fills m_body_Jac_T / m_body_Jac_R of every body; returns 0 on success, -1 on bad dimensions or type
+    if (q.size() != m_num_dofs || u.size() != m_num_dofs) {  // q and u are only used for this size check; NOTE(review): the stored kinematic state is presumably expected to be up to date already - confirm with callers
+        error_message("wrong vector dimension. system has %d DOFs,\n"
+                      "but dim(q)= %d, dim(u)= %d\n",
+                      m_num_dofs, static_cast<int>(q.size()), static_cast<int>(u.size()));
+        return -1;
+    }
+    if(type != POSITION_ONLY && type != POSITION_VELOCITY) {  // any other update type is rejected here
+        error_message("invalid type %d\n", type);
+        return -1;
+    }
+    addRelativeJacobianComponent(m_body_list[0]);  // body 0: only its own joint columns
+    for (idArrayIdx i = 1; i < m_body_list.size(); i++) {  // reads the parent's Jacobians; NOTE(review): presumably parents precede children in m_body_list - confirm
+        RigidBody &body = m_body_list[i];
+        RigidBody &parent = m_body_list[m_parent_index[i]];
+        mul(body.m_body_T_parent, parent.m_body_Jac_R,& body.m_body_Jac_R);  // body Jac_R = body_T_parent * parent Jac_R
+        body.m_body_Jac_T = parent.m_body_Jac_T;
+        mul(tildeOperator(body.m_parent_pos_parent_body),parent.m_body_Jac_R,&m_m3x);  // m_m3x is scratch storage for tilde(r) * parent Jac_R
+        sub(body.m_body_Jac_T,m_m3x, &body.m_body_Jac_T);  // Jac_T = parent Jac_T - tilde(r) * parent Jac_R
+        addRelativeJacobianComponent(body);  // write this body's own joint columns
+        mul(body.m_body_T_parent, body.m_body_Jac_T,&body.m_body_Jac_T);  // pre-multiply the translational Jacobian by body_T_parent
+        if(type >= POSITION_VELOCITY) {  // velocity-dependent terms, built from the stored angular/linear velocities
+            body.m_body_dot_Jac_R_u = body.m_body_T_parent * parent.m_body_dot_Jac_R_u -
+                body.m_body_ang_vel_rel.cross(body.m_body_T_parent * parent.m_body_ang_vel);
+            body.m_body_dot_Jac_T_u = body.m_body_T_parent *
+                (parent.m_body_dot_Jac_T_u + parent.m_body_dot_Jac_R_u.cross(body.m_parent_pos_parent_body) +
+                 parent.m_body_ang_vel.cross(parent.m_body_ang_vel.cross(body.m_parent_pos_parent_body)) +
+                 2.0 * parent.m_body_ang_vel.cross(body.m_parent_vel_rel));
+        }
+    }
+    return 0;
+static inline void setSixDoFJacobians(const int dof, vec3 &Jac_JR, vec3 &Jac_JT) {  // sets Jac_JR / Jac_JT to the unit axis of DoF 'dof' (0..5); other values of dof leave both untouched
+	switch (dof) {
+		// rotational part: dof 0, 1, 2 select the x, y, z component of Jac_JR; Jac_JT is zeroed
+		case 0:
+			Jac_JR(0) = 1;
+			Jac_JR(1) = 0;
+			Jac_JR(2) = 0;
+			setZero(Jac_JT);
+			break;
+		case 1:
+			Jac_JR(0) = 0;
+			Jac_JR(1) = 1;
+			Jac_JR(2) = 0;
+			setZero(Jac_JT);
+			break;
+		case 2:
+			Jac_JR(0) = 0;
+			Jac_JR(1) = 0;
+			Jac_JR(2) = 1;
+			setZero(Jac_JT);
+			break;
+		// translational part: dof 3, 4, 5 select the x, y, z component of Jac_JT; Jac_JR is zeroed
+		case 3:
+			setZero(Jac_JR);
+			Jac_JT(0) = 1;
+			Jac_JT(1) = 0;
+			Jac_JT(2) = 0;
+			break;
+		case 4:
+			setZero(Jac_JR);
+			Jac_JT(0) = 0;
+			Jac_JT(1) = 1;
+			Jac_JT(2) = 0;
+			break;
+		case 5:
+			setZero(Jac_JR);
+			Jac_JT(0) = 0;
+			Jac_JT(1) = 0;
+			Jac_JT(2) = 1;
+			break;
+	}
+static inline int jointNumDoFs(const JointType &type) {  // number of generalized coordinates used by a joint of the given type
+	switch (type) {
+		case FIXED:
+			return 0;
+		case REVOLUTE:
+		case PRISMATIC:
+			return 1;
+		case FLOATING:
+			return 6;
+	}
+	// reached only when 'type' matches none of the cases above; this should never happen
+	error_message("invalid joint type\n");
+	// TODO add configurable abort/crash function
+	abort();
+int MultiBodyTree::MultiBodyImpl::calculateMassMatrix(const vecx &q, const bool update_kinematics,
+													  const bool initialize_matrix,
+													  const bool set_lower_triangular_matrix,
+													  matxx *mass_matrix) {  // returns 0 on success, -1 on a dimension mismatch or an inconsistent joint type
+// This calculates the joint space mass matrix for the multibody system.
+// The algorithm is essentially an implementation of "method 3"
+// in "Efficient Dynamic Simulation of Robotic Mechanisms" (Walker and Orin, 1982)
+// (Later named "Composite Rigid Body Algorithm" by Featherstone).
+// This implementation, however, handles branched systems and uses a formulation centered
+// on the origin of the body-fixed frame to avoid re-computing various quantities at the com.
+	if (q.size() != m_num_dofs || mass_matrix->rows() != m_num_dofs ||
+		mass_matrix->cols() != m_num_dofs) {  // q and the (square) output matrix must both match m_num_dofs
+		error_message("Dimension error. System has %d DOFs,\n"
+					  "but dim(q)= %d, dim(mass_matrix)= %d x %d\n",
+					  m_num_dofs, static_cast<int>(q.size()), static_cast<int>(mass_matrix->rows()),
+					  static_cast<int>(mass_matrix->cols()));
+		return -1;
+	}
+	// TODO add optimized zeroing function?
+	if (initialize_matrix) {  // optional: set every element to 0.0 before filling
+		for (int i = 0; i < m_num_dofs; i++) {
+			for (int j = 0; j < m_num_dofs; j++) {
+                            setMatxxElem(i, j, 0.0, mass_matrix);
+			}
+		}
+	}
+	if (update_kinematics) {  // optional: refresh the position-level relative kinematics from q (no velocities or accelerations)
+		// 1. update relative kinematics
+		// 1.1 for revolute joints
+		for (idArrayIdx i = 0; i < m_body_revolute_list.size(); i++) {
+			RigidBody &body = m_body_list[m_body_revolute_list[i]];
+			// from reference orientation (q=0) of body-fixed frame to current orientation
+			mat33 body_T_body_ref;
+			bodyTParentFromAxisAngle(body.m_Jac_JR, q(body.m_q_index), &body_T_body_ref);
+			body.m_body_T_parent = body_T_body_ref * body.m_body_T_parent_ref;
+		}
+		// 1.2 for prismatic joints
+		for (idArrayIdx i = 0; i < m_body_prismatic_list.size(); i++) {
+			RigidBody &body = m_body_list[m_body_prismatic_list[i]];
+			// body.m_body_T_parent is left unchanged here; only the offset moves along the joint axis
+			body.m_parent_pos_parent_body =
+				body.m_parent_pos_parent_body_ref + body.m_parent_Jac_JT * q(body.m_q_index);
+		}
+		// 1.3 fixed joints: nothing to do
+		// 1.4 6dof joints:
+		for (idArrayIdx i = 0; i < m_body_floating_list.size(); i++) {
+			RigidBody &body = m_body_list[m_body_floating_list[i]];
+			body.m_body_T_parent = transformZ(q(body.m_q_index + 2)) *
+								   transformY(q(body.m_q_index + 1)) *
+								   transformX(q(body.m_q_index));  // q[idx..idx+2] feed transformX/Y/Z, q[idx+3..idx+5] are the translation
+			body.m_parent_pos_parent_body(0) = q(body.m_q_index + 3);
+			body.m_parent_pos_parent_body(1) = q(body.m_q_index + 4);
+			body.m_parent_pos_parent_body(2) = q(body.m_q_index + 5);
+			body.m_parent_pos_parent_body = body.m_body_T_parent * body.m_parent_pos_parent_body;
+		}
+	}
+	for (int i = m_body_list.size() - 1; i >= 0; i--) {  // pass 1, last body first; NOTE(review): reads children's subtree data, so presumably children have higher indices than parents - confirm
+		RigidBody &body = m_body_list[i];
+		// calculate mass, center of mass and inertia of "composite rigid body",
+		// ie, sub-tree starting at current body
+		body.m_subtree_mass = body.m_mass;
+		body.m_body_subtree_mass_com = body.m_body_mass_com;
+		body.m_body_subtree_I_body = body.m_body_I_body;
+		for (idArrayIdx c = 0; c < m_child_indices[i].size(); c++) {
+			RigidBody &child = m_body_list[m_child_indices[i][c]];
+			mat33 body_T_child = child.m_body_T_parent.transpose();
+			body.m_subtree_mass += child.m_subtree_mass;
+			body.m_body_subtree_mass_com += body_T_child * child.m_body_subtree_mass_com +
+											child.m_parent_pos_parent_body * child.m_subtree_mass;
+			body.m_body_subtree_I_body +=
+				body_T_child * child.m_body_subtree_I_body * child.m_body_T_parent;  // child inertia rotated into this body's frame
+			if (child.m_subtree_mass > 0) {  // guard: the shift below divides by the child's subtree mass
+				// Shift the reference point for the child subtree inertia using the
+				// Huygens-Steiner ("parallel axis") theorem.
+				// (First shift from child origin to child com, then from there to this body's
+				// origin)
+				vec3 r_com = body_T_child * child.m_body_subtree_mass_com / child.m_subtree_mass;
+				mat33 tilde_r_child_com = tildeOperator(r_com);
+				mat33 tilde_r_body_com = tildeOperator(child.m_parent_pos_parent_body + r_com);
+				body.m_body_subtree_I_body +=
+					child.m_subtree_mass *
+					(tilde_r_child_com * tilde_r_child_com - tilde_r_body_com * tilde_r_body_com);
+			}
+		}
+	}
+	for (int i = m_body_list.size() - 1; i >= 0; i--) {  // pass 2: fill the matrix entries belonging to each body's DoFs
+		const RigidBody &body = m_body_list[i];
+		// determine DoF-range for body
+		const int q_index_min = body.m_q_index;
+		const int q_index_max = q_index_min + jointNumDoFs(body.m_joint_type) - 1;  // for a 0-DoF joint this is q_index_min - 1, so the loop below does not run
+		// loop over the DoFs used by this body
+		// local joint jacobians (ok as is for 1-DoF joints)
+		vec3 Jac_JR = body.m_Jac_JR;
+		vec3 Jac_JT = body.m_Jac_JT;
+		for (int col = q_index_max; col >= q_index_min; col--) {
+			// set jacobians for 6-DoF joints
+			if (FLOATING == body.m_joint_type) {
+				setSixDoFJacobians(col - q_index_min, Jac_JR, Jac_JT);
+			}
+			vec3 body_eom_rot =
+				body.m_body_subtree_I_body * Jac_JR + body.m_body_subtree_mass_com.cross(Jac_JT);
+			vec3 body_eom_trans =
+				body.m_subtree_mass * Jac_JT - body.m_body_subtree_mass_com.cross(Jac_JR);
+			setMatxxElem(col, col, Jac_JR.dot(body_eom_rot) + Jac_JT.dot(body_eom_trans), mass_matrix);  // diagonal entry for this DoF
+			// rest of the mass matrix column upwards
+			{
+				// 1. for multi-dof joints, rest of the dofs of this body
+				for (int row = col - 1; row >= q_index_min; row--) {
+					if (FLOATING != body.m_joint_type) {  // this loop only iterates for joints with more than one DoF; anything but FLOATING here is treated as an error
+						error_message("??\n");
+						return -1;
+					}
+					setSixDoFJacobians(row - q_index_min, Jac_JR, Jac_JT);  // overwrites the outer Jac_JR/Jac_JT; they are reset at the top of the next col iteration
+					const double Mrc = Jac_JR.dot(body_eom_rot) + Jac_JT.dot(body_eom_trans);
+					setMatxxElem(col, row, Mrc, mass_matrix);
+				}
+				// 2. ancestor dofs
+				int child_idx = i;
+				int parent_idx = m_parent_index[i];
+				while (parent_idx >= 0) {  // walk towards the root; NOTE(review): presumably the root's m_parent_index entry is negative - confirm
+					const RigidBody &child_body = m_body_list[child_idx];
+					const RigidBody &parent_body = m_body_list[parent_idx];
+					const mat33 parent_T_child = child_body.m_body_T_parent.transpose();
+					body_eom_rot = parent_T_child * body_eom_rot;  // re-express both vectors in the parent's frame ...
+					body_eom_trans = parent_T_child * body_eom_trans;
+					body_eom_rot += child_body.m_parent_pos_parent_body.cross(body_eom_trans);  // ... and move the reference point to the parent's origin
+					const int parent_body_q_index_min = parent_body.m_q_index;
+					const int parent_body_q_index_max =
+						parent_body_q_index_min + jointNumDoFs(parent_body.m_joint_type) - 1;
+					vec3 Jac_JR = parent_body.m_Jac_JR;  // these two locals shadow the outer Jac_JR / Jac_JT inside this loop body
+					vec3 Jac_JT = parent_body.m_Jac_JT;
+					for (int row = parent_body_q_index_max; row >= parent_body_q_index_min; row--) {
+						// set jacobians for 6-DoF joints
+						if (FLOATING == parent_body.m_joint_type) {
+							setSixDoFJacobians(row - parent_body_q_index_min, Jac_JR, Jac_JT);
+						}
+						const double Mrc = Jac_JR.dot(body_eom_rot) + Jac_JT.dot(body_eom_trans);
+						setMatxxElem(col, row, Mrc, mass_matrix);
+					}
+					child_idx = parent_idx;
+					parent_idx = m_parent_index[child_idx];
+				}
+			}
+		}
+	}
+	if (set_lower_triangular_matrix) {  // optional: mirror the computed entries so the matrix is symmetric
+		for (int col = 0; col < m_num_dofs; col++) {
+			for (int row = 0; row < col; row++) {
+                            setMatxxElem(row, col, (*mass_matrix)(col, row), mass_matrix);  // value read at (col, row) is stored via setMatxxElem(row, col, ...)
+			}
+		}
+	}
+	return 0;
+// utility macro: logs an error and makes the enclosing function return -1 unless 0 <= index < m_num_bodies
+#define CHECK_IF_BODY_INDEX_IS_VALID(index)														\
+	do {																						   \
+		if (index < 0 || index >= m_num_bodies) {												  \
+			error_message("invalid index %d (num_bodies= %d)\n", index, m_num_bodies);			 \
+			return -1;																			 \
+		}																						  \
+	} while (0)
+int MultiBodyTree::MultiBodyImpl::getParentIndex(const int body_index, int *p) {  // writes m_parent_index[body_index] to *p
+	*p = m_parent_index[body_index];  // NOTE(review): no range check on body_index is visible here, unlike getBodyTParentRef - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::getUserInt(const int body_index, int *user_int) const {  // writes the integer stored for this body in m_user_int to *user_int
+	*user_int = m_user_int[body_index];  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::getUserPtr(const int body_index, void **user_ptr) const {  // writes the pointer stored for this body in m_user_ptr to *user_ptr
+	*user_ptr = m_user_ptr[body_index];  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::setUserInt(const int body_index, const int user_int) {  // stores user_int in m_user_int for this body
+	m_user_int[body_index] = user_int;  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::setUserPtr(const int body_index, void *const user_ptr) {  // stores user_ptr in m_user_ptr for this body; the pointer is only stored, not dereferenced
+	m_user_ptr[body_index] = user_ptr;  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::getBodyOrigin(int body_index, vec3 *world_origin) const {  // writes the stored body position, rotated by the transpose of m_body_T_world, to *world_origin
+	const RigidBody &body = m_body_list[body_index];  // NOTE(review): no range check on body_index is visible here - confirm
+	*world_origin = body.m_body_T_world.transpose() * body.m_body_pos;  // returns whatever the last kinematics update stored; nothing is recomputed here
+	return 0;
+int MultiBodyTree::MultiBodyImpl::getBodyCoM(int body_index, vec3 *world_com) const {  // writes the body's center of mass, rotated by the transpose of m_body_T_world, to *world_com
+	const RigidBody &body = m_body_list[body_index];  // NOTE(review): no range check on body_index is visible here - confirm
+	if (body.m_mass > 0) {  // m_body_mass_com is mass times com, so divide by the mass to get the offset
+		*world_com = body.m_body_T_world.transpose() *
+					 (body.m_body_pos + body.m_body_mass_com / body.m_mass);
+	} else {  // non-positive mass: fall back to the body origin and avoid the division
+		*world_com = body.m_body_T_world.transpose() * (body.m_body_pos);
+	}
+	return 0;
+int MultiBodyTree::MultiBodyImpl::getBodyTransform(int body_index, mat33 *world_T_body) const {  // writes the transpose of the stored m_body_T_world to *world_T_body
+	const RigidBody &body = m_body_list[body_index];  // NOTE(review): no range check on body_index is visible here - confirm
+	*world_T_body = body.m_body_T_world.transpose();
+	return 0;
+int MultiBodyTree::MultiBodyImpl::getBodyAngularVelocity(int body_index, vec3 *world_omega) const {  // writes the stored angular velocity, rotated by the transpose of m_body_T_world, to *world_omega
+	const RigidBody &body = m_body_list[body_index];  // NOTE(review): no range check on body_index is visible here - confirm
+	*world_omega = body.m_body_T_world.transpose() * body.m_body_ang_vel;
+	return 0;
+int MultiBodyTree::MultiBodyImpl::getBodyLinearVelocity(int body_index,
+														vec3 *world_velocity) const {  // writes the stored velocity of the body-fixed frame origin, rotated by the transpose of m_body_T_world
+	const RigidBody &body = m_body_list[body_index];  // NOTE(review): no range check on body_index is visible here - confirm
+	*world_velocity = body.m_body_T_world.transpose() * body.m_body_vel;
+	return 0;
+int MultiBodyTree::MultiBodyImpl::getBodyLinearVelocityCoM(int body_index,
+														   vec3 *world_velocity) const {  // like getBodyLinearVelocity, but for the center of mass instead of the frame origin
+	const RigidBody &body = m_body_list[body_index];  // NOTE(review): no range check on body_index is visible here - confirm
+	vec3 com;  // offset of the center of mass from the body-fixed frame origin
+	if (body.m_mass > 0) {
+		com = body.m_body_mass_com / body.m_mass;
+	} else {  // non-positive mass: use a zero offset and avoid the division
+		com(0) = 0;
+		com(1) = 0;
+		com(2) = 0;
+	}
+	*world_velocity =
+		body.m_body_T_world.transpose() * (body.m_body_vel + body.m_body_ang_vel.cross(com));  // v_com = v_origin + omega x com, then rotated
+	return 0;
+int MultiBodyTree::MultiBodyImpl::getBodyAngularAcceleration(int body_index,
+															 vec3 *world_dot_omega) const {  // writes the stored angular acceleration, rotated by the transpose of m_body_T_world
+	const RigidBody &body = m_body_list[body_index];  // NOTE(review): no range check on body_index is visible here - confirm
+	*world_dot_omega = body.m_body_T_world.transpose() * body.m_body_ang_acc;
+	return 0;
+int MultiBodyTree::MultiBodyImpl::getBodyLinearAcceleration(int body_index,
+															vec3 *world_acceleration) const {  // writes the stored m_body_acc, rotated by the transpose of m_body_T_world
+	const RigidBody &body = m_body_list[body_index];  // NOTE(review): no range check on body_index is visible here - confirm
+	*world_acceleration = body.m_body_T_world.transpose() * body.m_body_acc;  // m_body_acc is returned as stored; see the gravity remark on RigidBody::m_body_acc
+	return 0;
+int MultiBodyTree::MultiBodyImpl::getJointType(const int body_index, JointType *joint_type) const {  // writes the body's m_joint_type to *joint_type
+	*joint_type = m_body_list[body_index].m_joint_type;  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::getJointTypeStr(const int body_index,
+												  const char **joint_type) const {  // writes the result of jointTypeToString for the body's joint type to *joint_type
+	*joint_type = jointTypeToString(m_body_list[body_index].m_joint_type);  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::getParentRParentBodyRef(const int body_index, vec3* r)  const{  // writes the body's m_parent_pos_parent_body_ref to *r
+    CHECK_IF_BODY_INDEX_IS_VALID(body_index);  // returns -1 from this function when body_index is out of range
+    *r=m_body_list[body_index].m_parent_pos_parent_body_ref;
+    return 0;
+int MultiBodyTree::MultiBodyImpl::getBodyTParentRef(const int body_index, mat33* T)  const{  // writes the body's m_body_T_parent_ref to *T
+    CHECK_IF_BODY_INDEX_IS_VALID(body_index);  // returns -1 from this function when body_index is out of range
+    *T=m_body_list[body_index].m_body_T_parent_ref;
+    return 0;
+int MultiBodyTree::MultiBodyImpl::getBodyAxisOfMotion(const int body_index, vec3* axis)  const{  // writes the joint axis to *axis: m_Jac_JR for revolute, m_Jac_JT for prismatic, zero otherwise
+    CHECK_IF_BODY_INDEX_IS_VALID(body_index);  // returns -1 from this function when body_index is out of range
+    if(m_body_list[body_index].m_joint_type == REVOLUTE) {
+        *axis = m_body_list[body_index].m_Jac_JR;
+        return 0;
+    }
+    if(m_body_list[body_index].m_joint_type == PRISMATIC) {
+        *axis = m_body_list[body_index].m_Jac_JT;
+        return 0;
+    }
+    setZero(*axis);  // every other joint type yields a zero vector and still returns 0
+    return 0;
+int MultiBodyTree::MultiBodyImpl::getDoFOffset(const int body_index, int *q_index) const {  // writes the body's m_q_index (start of its range in the q-vector) to *q_index
+	*q_index = m_body_list[body_index].m_q_index;  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::setBodyMass(const int body_index, const idScalar mass) {  // overwrites the body's m_mass; the value itself is not validated
+	m_body_list[body_index].m_mass = mass;  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::setBodyFirstMassMoment(const int body_index,
+														 const vec3& first_mass_moment) {  // overwrites the body's m_body_mass_com (mass times center of gravity)
+	m_body_list[body_index].m_body_mass_com = first_mass_moment;  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::setBodySecondMassMoment(const int body_index,
+														  const mat33& second_mass_moment) {  // overwrites the body's m_body_I_body (moment of inertia w.r.t. the body-fixed frame)
+	m_body_list[body_index].m_body_I_body = second_mass_moment;  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::getBodyMass(const int body_index, idScalar *mass) const {  // writes the body's m_mass to *mass
+	*mass = m_body_list[body_index].m_mass;  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::getBodyFirstMassMoment(const int body_index,
+														 vec3 *first_mass_moment) const {  // writes the body's m_body_mass_com (mass times center of gravity) to *first_mass_moment
+	*first_mass_moment = m_body_list[body_index].m_body_mass_com;  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::getBodySecondMassMoment(const int body_index,
+														  mat33 *second_mass_moment) const {  // writes the body's m_body_I_body to *second_mass_moment
+	*second_mass_moment = m_body_list[body_index].m_body_I_body;  // NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+void MultiBodyTree::MultiBodyImpl::clearAllUserForcesAndMoments() {  // zeroes the user force and user moment of every body
+	for (int index = 0; index < m_num_bodies; index++) {
+		RigidBody &body = m_body_list[index];
+		setZero(body.m_body_force_user);
+		setZero(body.m_body_moment_user);
+	}
+int MultiBodyTree::MultiBodyImpl::addUserForce(const int body_index, const vec3 &body_force) {  // accumulates body_force into the body's m_body_force_user
+	m_body_list[body_index].m_body_force_user += body_force;  // adds to the stored value rather than replacing it; NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+int MultiBodyTree::MultiBodyImpl::addUserMoment(const int body_index, const vec3 &body_moment) {  // accumulates body_moment into the body's m_body_moment_user
+	m_body_list[body_index].m_body_moment_user += body_moment;  // adds to the stored value rather than replacing it; NOTE(review): no range check on body_index is visible here - confirm
+	return 0;  // always reports success
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+int  MultiBodyTree::MultiBodyImpl::getBodyDotJacobianTransU(const int body_index, vec3* world_dot_jac_trans_u) const {  // writes the stored m_body_dot_Jac_T_u, rotated by the transpose of m_body_T_world
+    CHECK_IF_BODY_INDEX_IS_VALID(body_index);  // returns -1 from this function when body_index is out of range
+    const RigidBody &body = m_body_list[body_index];
+    *world_dot_jac_trans_u = body.m_body_T_world.transpose() * body.m_body_dot_Jac_T_u;
+    return 0;
+int  MultiBodyTree::MultiBodyImpl::getBodyDotJacobianRotU(const int body_index, vec3* world_dot_jac_rot_u) const{  // writes the stored m_body_dot_Jac_R_u, rotated by the transpose of m_body_T_world
+     CHECK_IF_BODY_INDEX_IS_VALID(body_index);  // returns -1 from this function when body_index is out of range
+    const RigidBody &body = m_body_list[body_index];
+    *world_dot_jac_rot_u = body.m_body_T_world.transpose() * body.m_body_dot_Jac_R_u;
+    return 0;
+int  MultiBodyTree::MultiBodyImpl::getBodyJacobianTrans(const int body_index, mat3x* world_jac_trans) const{  // writes m_body_T_world.transpose() * m_body_Jac_T into *world_jac_trans
+    CHECK_IF_BODY_INDEX_IS_VALID(body_index);  // returns -1 from this function when body_index is out of range
+    const RigidBody &body = m_body_list[body_index];
+    mul(body.m_body_T_world.transpose(), body.m_body_Jac_T,world_jac_trans);
+    return 0;
+int  MultiBodyTree::MultiBodyImpl::getBodyJacobianRot(const int body_index, mat3x* world_jac_rot) const{  // writes m_body_T_world.transpose() * m_body_Jac_R into *world_jac_rot
+    CHECK_IF_BODY_INDEX_IS_VALID(body_index);  // returns -1 from this function when body_index is out of range
+    const RigidBody &body = m_body_list[body_index];
+    mul(body.m_body_T_world.transpose(), body.m_body_Jac_R,world_jac_rot);
+    return 0;
diff --git a/src/bullet/BulletInverseDynamics/details/MultiBodyTreeImpl.hpp b/src/bullet/BulletInverseDynamics/details/MultiBodyTreeImpl.hpp
new file mode 100644
index 00000000..3efe9d04
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/details/MultiBodyTreeImpl.hpp
@@ -0,0 +1,283 @@
+// The structs and classes defined here provide a basic inverse dynamics implementation used
+// by MultiBodyTree
+// User interaction should be through MultiBodyTree
+#include "../IDConfig.hpp"
+#include "../MultiBodyTree.hpp"
+namespace btInverseDynamics {
+/// Structure for rigid body mass properties, connectivity and kinematic state.
+/// All vectors and matrices are in the body-fixed frame, if not indicated otherwise.
+/// The body-fixed frame is located in the joint connecting the body to its parent.
+struct RigidBody {
+	// 1 Inertial properties
+	/// Mass
+	idScalar m_mass;
+	/// Mass times center of gravity in body-fixed frame
+	vec3 m_body_mass_com;
+	/// Moment of inertia w.r.t. body-fixed frame
+	mat33 m_body_I_body;
+	// 2 dynamic properties
+	/// Left-hand side of the body equation of motion, translational part
+	vec3 m_eom_lhs_translational;
+	/// Left-hand side of the body equation of motion, rotational part
+	vec3 m_eom_lhs_rotational;
+	/// Force acting at the joint when the body is cut from its parent;
+	/// includes impressed joint force in J_JT direction,
+	/// as well as constraint force,
+	/// in body-fixed frame
+	vec3 m_force_at_joint;
+	/// Moment acting at the joint when the body is cut from its parent;
+	/// includes impressed joint moment in J_JR direction, and constraint moment
+	/// in body-fixed frame
+	vec3 m_moment_at_joint;
+	/// external (user provided) force acting at the body-fixed frame's origin, written in that
+	/// frame
+	vec3 m_body_force_user;
+	/// external (user provided) moment acting at the body-fixed frame's origin, written in that
+	/// frame
+	vec3 m_body_moment_user;
+	// 3 absolute kinematic properties
+	/// Position of body-fixed frame relative to world frame
+	/// this is currently only for debugging purposes
+	vec3 m_body_pos;
+	/// Absolute velocity of body-fixed frame
+	vec3 m_body_vel;
+	/// Absolute acceleration of body-fixed frame
+	/// NOTE: if gravitational acceleration is not zero, this is the acceleration PLUS gravitational
+	/// acceleration!
+	vec3 m_body_acc;
+	/// Absolute angular velocity
+	vec3 m_body_ang_vel;
+	/// Absolute angular acceleration
+	/// NOTE(review): the original gravity remark here looks copied from m_body_acc; the kinematics
+	/// update visible in this patch folds gravity into m_body_acc only - confirm.
+	vec3 m_body_ang_acc;
+	// 4 relative kinematic properties.
+	// these are in the parent body frame
+	/// Transform from world to body-fixed frame;
+	/// this is currently only for debugging purposes
+	mat33 m_body_T_world;
+	/// Transform from parent to body-fixed frame
+	mat33 m_body_T_parent;
+	/// Vector from parent to child frame in parent frame
+	vec3 m_parent_pos_parent_body;
+	/// Relative angular velocity
+	vec3 m_body_ang_vel_rel;
+	/// Relative linear velocity
+	vec3 m_parent_vel_rel;
+	/// Relative angular acceleration
+	vec3 m_body_ang_acc_rel;
+	/// Relative linear acceleration
+	vec3 m_parent_acc_rel;
+	// 5 Data describing the joint type and geometry
+	/// Type of joint
+	JointType m_joint_type;
+	/// Position of joint frame (body-fixed frame at q=0) relative to the parent frame
+	/// Components are in body-fixed frame of the parent
+	vec3 m_parent_pos_parent_body_ref;
+	/// Orientation of joint frame (body-fixed frame at q=0) relative to the parent frame
+	mat33 m_body_T_parent_ref;
+	/// Joint rotational Jacobian, i.e., the partial derivative of the body-fixed frame's absolute
+	/// angular velocity w.r.t. the generalized velocity of this body's relative degree of freedom.
+	/// For revolute joints this is the joint axis, for prismatic joints it is a null vector.
+	/// (NOTE: dimensions will have to be dynamic for additional joint types!)
+	vec3 m_Jac_JR;
+	/// Joint translational Jacobian, i.e., the partial derivative of the body-fixed frame's absolute
+	/// linear velocity w.r.t. the generalized velocity of this body's relative degree of freedom.
+	/// For prismatic joints this is the joint axis, for revolute joints it is a null vector.
+	/// (NOTE: dimensions might have to be dynamic for additional joint types!)
+	vec3 m_Jac_JT;
+	/// m_Jac_JT in the parent frame, i.e., m_body_T_parent_ref.transpose()*m_Jac_JT
+	vec3 m_parent_Jac_JT;
+	/// Start of index range for the position degree(s) of freedom describing this body's motion
+	/// relative to
+	/// its parent. The indices are w.r.t. the multibody system's q-vector of generalized coordinates.
+	int m_q_index;
+	// 6 Scratch data for mass matrix computation using "composite rigid body algorithm"
+	/// mass of the subtree rooted in this body
+	idScalar m_subtree_mass;
+	/// center of mass * mass for subtree rooted in this body, in body-fixed frame
+	vec3 m_body_subtree_mass_com;
+	/// moment of inertia of subtree rooted in this body, w.r.t. body origin, in body-fixed frame
+	mat33 m_body_subtree_I_body;
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+    /// translational jacobian in body-fixed frame d(m_body_vel)/du
+    mat3x m_body_Jac_T;
+    /// rotational jacobian in body-fixed frame d(m_body_ang_vel)/du
+    mat3x m_body_Jac_R;
+    /// components of linear acceleration depending on u
+    /// (same as d(m_body_Jac_T)/dt*u)
+    vec3 m_body_dot_Jac_T_u;
+    /// components of angular acceleration depending on u
+    /// (same as d(m_body_Jac_R)/dt*u)
+    vec3 m_body_dot_Jac_R_u;
+/// The MBS implements a tree structured multibody system
+class MultiBodyTree::MultiBodyImpl {
+	friend class MultiBodyTree;
+        enum KinUpdateType {
+            POSITION_ONLY,
+        };
+	/// constructor
+	/// @param num_bodies the number of bodies in the system
+	/// @param num_dofs number of degrees of freedom in the system
+	MultiBodyImpl(int num_bodies_, int num_dofs_);
+	/// \copydoc MultiBodyTree::calculateInverseDynamics
+	int calculateInverseDynamics(const vecx& q, const vecx& u, const vecx& dot_u,
+								 vecx* joint_forces);
+	///\copydoc MultiBodyTree::calculateMassMatrix
+	int calculateMassMatrix(const vecx& q, const bool update_kinematics,
+							const bool initialize_matrix, const bool set_lower_triangular_matrix,
+							matxx* mass_matrix);
+    	/// calculate kinematics (vector quantities)
+        /// Depending on type, update positions only, positions & velocities, or positions, velocities
+        /// and accelerations.
+        int calculateKinematics(const vecx& q, const vecx& u, const vecx& dot_u, const KinUpdateType type);
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+    	/// calculate jacobians and (if type == POSITION_VELOCITY), also velocity-dependent accelration terms.
+    int calculateJacobians(const vecx& q, const vecx& u, const KinUpdateType type);
+    /// \copydoc MultiBodyTree::getBodyDotJacobianTransU
+    int getBodyDotJacobianTransU(const int body_index, vec3* world_dot_jac_trans_u) const ;
+    /// \copydoc MultiBodyTree::getBodyDotJacobianRotU
+    int getBodyDotJacobianRotU(const int body_index, vec3* world_dot_jac_rot_u) const;
+    /// \copydoc MultiBodyTree::getBodyJacobianTrans
+    int getBodyJacobianTrans(const int body_index, mat3x* world_jac_trans) const ;
+    /// \copydoc MultiBodyTree::getBodyJacobianRot
+    int getBodyJacobianRot(const int body_index, mat3x* world_jac_rot) const;
+    /// Add relative Jacobian component from motion relative to parent body
+    /// @param body the body to add the Jacobian component for
+    void addRelativeJacobianComponent(RigidBody&body);
+	/// generate additional index sets from the parent_index array
+	/// @return -1 on error, 0 on success
+	int generateIndexSets();
+	/// set gravity acceleration in world frame
+	/// @param gravity gravity vector in the world frame
+	/// @return 0 on success, -1 on error
+	int setGravityInWorldFrame(const vec3& gravity);
+	/// pretty print tree
+	void printTree();
+	/// print tree data
+	void printTreeData();
+	/// initialize fixed data
+	void calculateStaticData();
+	/// \copydoc MultiBodyTree::getBodyFrame
+	int getBodyFrame(const int index, vec3* world_origin, mat33* body_T_world) const;
+	/// \copydoc MultiBodyTree::getParentIndex
+	int getParentIndex(const int body_index, int* m_parent_index);
+	/// \copydoc MultiBodyTree::getJointType
+	int getJointType(const int body_index, JointType* joint_type) const;
+	/// \copydoc MultiBodyTree::getJointTypeStr
+	int getJointTypeStr(const int body_index, const char** joint_type) const;
+        /// \copydoc MultiBodyTree::getParentRParentBodyRef
+        int getParentRParentBodyRef(const int body_index, vec3* r) const;
+        /// \copydoc MultiBodyTree::getBodyTParentRef
+        int getBodyTParentRef(const int body_index, mat33* T) const;
+        /// \copydoc MultiBodyTree::getBodyAxisOfMotion
+        int getBodyAxisOfMotion(const int body_index, vec3* axis) const;
+	/// \copydoc MultiBodyTree::getDoFOffset
+	int getDoFOffset(const int body_index, int* q_index) const;
+	/// \copydoc MultiBodyTree::getBodyOrigin
+	int getBodyOrigin(const int body_index, vec3* world_origin) const;
+	/// \copydoc MultiBodyTree::getBodyCoM
+	int getBodyCoM(const int body_index, vec3* world_com) const;
+	/// \copydoc MultiBodyTree::getBodyTransform
+	int getBodyTransform(const int body_index, mat33* world_T_body) const;
+	/// \copydoc MultiBodyTree::getBodyAngularVelocity
+	int getBodyAngularVelocity(const int body_index, vec3* world_omega) const;
+	/// \copydoc MultiBodyTree::getBodyLinearVelocity
+	int getBodyLinearVelocity(const int body_index, vec3* world_velocity) const;
+	/// \copydoc MultiBodyTree::getBodyLinearVelocityCoM
+	int getBodyLinearVelocityCoM(const int body_index, vec3* world_velocity) const;
+	/// \copydoc MultiBodyTree::getBodyAngularAcceleration
+	int getBodyAngularAcceleration(const int body_index, vec3* world_dot_omega) const;
+	/// \copydoc MultiBodyTree::getBodyLinearAcceleration
+	int getBodyLinearAcceleration(const int body_index, vec3* world_acceleration) const;
+	/// \copydoc MultiBodyTree::getUserInt
+	int getUserInt(const int body_index, int* user_int) const;
+	/// \copydoc MultiBodyTree::getUserPtr
+	int getUserPtr(const int body_index, void** user_ptr) const;
+	/// \copydoc MultiBodyTree::setUserInt
+	int setUserInt(const int body_index, const int user_int);
+	/// \copydoc MultiBodyTree::setUserPtr
+	int setUserPtr(const int body_index, void* const user_ptr);
+	///\copydoc MultiBodyTree::setBodyMass
+	int setBodyMass(const int body_index, const idScalar mass);
+	///\copydoc MultiBodyTree::setBodyFirstMassMoment
+	int setBodyFirstMassMoment(const int body_index, const vec3& first_mass_moment);
+	///\copydoc MultiBodyTree::setBodySecondMassMoment
+	int setBodySecondMassMoment(const int body_index, const mat33& second_mass_moment);
+	///\copydoc MultiBodyTree::getBodyMass
+	int getBodyMass(const int body_index, idScalar* mass) const;
+	///\copydoc MultiBodyTree::getBodyFirstMassMoment
+	int getBodyFirstMassMoment(const int body_index, vec3* first_mass_moment) const;
+	///\copydoc MultiBodyTree::getBodySecondMassMoment
+	int getBodySecondMassMoment(const int body_index, mat33* second_mass_moment) const;
+	/// \copydoc MultiBodyTree::clearAllUserForcesAndMoments
+	void clearAllUserForcesAndMoments();
+	/// \copydoc MultiBodyTree::addUserForce
+	int addUserForce(const int body_index, const vec3& body_force);
+	/// \copydoc MultiBodyTree::addUserMoment
+	int addUserMoment(const int body_index, const vec3& body_moment);
+	// debug function. print tree structure to stdout
+	void printTree(int index, int indentation);
+	// get string representation of JointType (for debugging)
+	const char* jointTypeToString(const JointType& type) const;
+	// get number of degrees of freedom from joint type
+	int bodyNumDoFs(const JointType& type) const;
+	// number of bodies in the system
+	int m_num_bodies;
+	// number of degrees of freedom
+	int m_num_dofs;
+	// Gravitational acceleration (in world frame)
+	vec3 m_world_gravity;
+	// vector of bodies in the system
+	// body 0 is used as an environment body and is always fixed.
+	// The bodies are ordered such that a parent body always has an index
+	// smaller than its child.
+	idArray<RigidBody>::type m_body_list;
+	// Parent_index[i] is the index for i's parent body in body_list.
+	// This fully describes the tree.
+	idArray<int>::type m_parent_index;
+	// child_indices[i] contains a vector of indices of
+	// all children of the i-th body
+	idArray<idArray<int>::type>::type m_child_indices;
+	// Indices of rotary joints
+	idArray<int>::type m_body_revolute_list;
+	// Indices of prismatic joints
+	idArray<int>::type m_body_prismatic_list;
+	// Indices of floating joints
+	idArray<int>::type m_body_floating_list;
+	// a user-provided integer
+	idArray<int>::type m_user_int;
+	// a user-provided pointer
+	idArray<void*>::type m_user_ptr;
+#if (defined BT_ID_HAVE_MAT3X) && (defined BT_ID_WITH_JACOBIANS)
+        mat3x m_m3x;
diff --git a/src/bullet/BulletInverseDynamics/details/MultiBodyTreeInitCache.cpp b/src/bullet/BulletInverseDynamics/details/MultiBodyTreeInitCache.cpp
new file mode 100644
index 00000000..47b4ab38
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/details/MultiBodyTreeInitCache.cpp
@@ -0,0 +1,113 @@
+#include "MultiBodyTreeInitCache.hpp"
+namespace btInverseDynamics {
+MultiBodyTree::InitCache::InitCache() {
+	m_inertias.resize(0);  // start with an empty list of body inertia records
+	m_joints.resize(0);  // start with an empty list of joint records
+	m_num_dofs = 0;  // running total of degrees of freedom, increased by addBody()
+	m_root_index=-1;  // -1 means no root body has been registered yet
+int MultiBodyTree::InitCache::addBody(const int body_index, const int parent_index,
+									  const JointType joint_type,
+									  const vec3& parent_r_parent_body_ref,
+									  const mat33& body_T_parent_ref,
+									  const vec3& body_axis_of_motion, const idScalar mass,
+									  const vec3& body_r_body_com, const mat33& body_I_body,
+									  const int user_int, void* user_ptr) {
+	// Record one body together with the joint connecting it to its parent.
+	// Returns 0 on success and -1 if the joint type is unknown or if a second
+	// root body (parent_index == -1) is added.
+	//
+	// The number of degrees of freedom contributed by the joint is determined
+	// first but only committed to m_num_dofs after all checks have passed, so a
+	// rejected body leaves the cache unchanged.
+	int num_new_dofs = 0;
+	switch (joint_type) {
+		case REVOLUTE:
+		case PRISMATIC:
+			num_new_dofs = 1;
+			break;
+		case FIXED:
+			// does not add a degree of freedom
+			break;
+		case FLOATING:
+			num_new_dofs = 6;
+			break;
+		default:
+			error_message("unknown joint type %d\n", joint_type);
+			return -1;
+	}
+	// A parent index of -1 designates the root body; only one root is allowed.
+	if(-1 == parent_index) {
+		if(m_root_index>=0) {
+			error_message("trying to add body %d as root, but already added %d as root body\n",
+						  body_index, m_root_index);
+			return -1;
+		}
+		m_root_index=body_index;
+	}
+	// All checks passed: commit the degrees of freedom and store the data.
+	m_num_dofs += num_new_dofs;
+	JointData joint;
+	joint.m_child = body_index;
+	joint.m_parent = parent_index;
+	joint.m_type = joint_type;
+	joint.m_parent_pos_parent_child_ref = parent_r_parent_body_ref;
+	joint.m_child_T_parent_ref = body_T_parent_ref;
+	joint.m_child_axis_of_motion = body_axis_of_motion;
+	InertiaData body;
+	body.m_mass = mass;
+	body.m_body_pos_body_com = body_r_body_com;
+	body.m_body_I_body = body_I_body;
+	// The four arrays are appended in lock step, so entry i of each array
+	// belongs to the i-th body that was added.
+	m_inertias.push_back(body);
+	m_joints.push_back(joint);
+	m_user_int.push_back(user_int);
+	m_user_ptr.push_back(user_ptr);
+	return 0;
+int MultiBodyTree::InitCache::getInertiaData(const int index, InertiaData* inertia) const {
+	// Copy the inertia record stored at position `index` into *inertia.
+	// Returns 0 on success, -1 if index is out of range.
+	// Valid indices are [0, size): index == size is one past the last element
+	// and must be rejected too, otherwise the read below is out of bounds.
+	if (index < 0 || index >= static_cast<int>(m_inertias.size())) {
+		error_message("index out of range\n");
+		return -1;
+	}
+	*inertia = m_inertias[index];
+	return 0;
+int MultiBodyTree::InitCache::getUserInt(const int index, int* user_int) const {
+	// Copy the user-provided integer stored at position `index` into *user_int.
+	// Returns 0 on success, -1 if index is out of range.
+	// Valid indices are [0, size): index == size is one past the last element
+	// and must be rejected too, otherwise the read below is out of bounds.
+	if (index < 0 || index >= static_cast<int>(m_user_int.size())) {
+		error_message("index out of range\n");
+		return -1;
+	}
+	*user_int = m_user_int[index];
+	return 0;
+int MultiBodyTree::InitCache::getUserPtr(const int index, void** user_ptr) const {
+	// Copy the user-provided pointer stored at position `index` into *user_ptr.
+	// Returns 0 on success, -1 if index is out of range.
+	// Valid indices are [0, size): index == size is one past the last element
+	// and must be rejected too, otherwise the read below is out of bounds.
+	if (index < 0 || index >= static_cast<int>(m_user_ptr.size())) {
+		error_message("index out of range\n");
+		return -1;
+	}
+	*user_ptr = m_user_ptr[index];
+	return 0;
+int MultiBodyTree::InitCache::getJointData(const int index, JointData* joint) const {
+	// Copy the joint record stored at position `index` into *joint.
+	// Returns 0 on success, -1 if index is out of range.
+	// Valid indices are [0, size): index == size is one past the last element
+	// and must be rejected too, otherwise the read below is out of bounds.
+	if (index < 0 || index >= static_cast<int>(m_joints.size())) {
+		error_message("index out of range\n");
+		return -1;
+	}
+	*joint = m_joints[index];
+	return 0;
+int MultiBodyTree::InitCache::buildIndexSets() {
+	// Fill m_parent_index so that m_parent_index[child] is the parent index
+	// recorded for that child's joint. Returns 0 on success, -1 on failure.
+	//
+	// NOTE: This function expects that proper indices were provided
+	//	   (User2InternalIndex from utils can be used to facilitate this).
+	//	   A child index outside [0, numBodies()) is reported as an error
+	//	   instead of being written out of bounds.
+	const int num_bodies = numBodies();
+	m_parent_index.resize(num_bodies);
+	for (idArrayIdx j = 0; j < m_joints.size(); j++) {
+		const JointData& joint = m_joints[j];
+		if (joint.m_child < 0 || joint.m_child >= num_bodies) {
+			error_message("child index %d out of range [0, %d)\n", joint.m_child, num_bodies);
+			return -1;
+		}
+		m_parent_index[joint.m_child] = joint.m_parent;
+	}
+	return 0;
diff --git a/src/bullet/BulletInverseDynamics/details/MultiBodyTreeInitCache.hpp b/src/bullet/BulletInverseDynamics/details/MultiBodyTreeInitCache.hpp
new file mode 100644
index 00000000..0d2aa4a0
--- /dev/null
+++ b/src/bullet/BulletInverseDynamics/details/MultiBodyTreeInitCache.hpp
@@ -0,0 +1,109 @@
+#include "../IDConfig.hpp"
+#include "../IDMath.hpp"
+#include "../MultiBodyTree.hpp"
+namespace btInverseDynamics {
+/// Mass properties of a rigid body
+struct InertiaData {
+	/// mass
+	idScalar m_mass;
+	/// vector from body-fixed frame to center of mass,
+	/// in body-fixed frame, multiplied by the mass
+	vec3 m_body_pos_body_com;
+	/// moment of inertia w.r.t. the origin of the body-fixed
+	/// frame, represented in that frame
+	mat33 m_body_I_body;
+/// Joint properties
+struct JointData {
+	/// type of joint
+	JointType m_type;
+	/// index of parent body
+	int m_parent;
+	/// index of child body
+	int m_child;
+	/// vector from parent's body-fixed frame to child's body-fixed
+	/// frame for q=0, written in the parent's body fixed frame
+	vec3 m_parent_pos_parent_child_ref;
+	/// Transform matrix converting vectors written in the parent's frame
+	/// into vectors written in the child's frame for q=0
+	/// ie, child_vector = child_T_parent_ref * parent_vector;
+	mat33 m_child_T_parent_ref;
+	/// Axis of motion for 1 degree-of-freedom joints,
+	/// written in the child's frame
+	/// For revolute joints, the q-value is positive for a positive
+	/// rotation about this axis.
+	/// For prismatic joints, the q-value is positive for a positive
+	/// translation in this direction.
+	vec3 m_child_axis_of_motion;
+/// Data structure to store data passed by the user.
+/// This is used in MultiBodyTree::finalize to build internal data structures.
+class MultiBodyTree::InitCache {
+	/// constructor
+	InitCache();
+	///\copydoc MultiBodyTree::addBody
+	int addBody(const int body_index, const int parent_index, const JointType joint_type,
+				const vec3 &parent_r_parent_body_ref, const mat33 &body_T_parent_ref,
+				const vec3 &body_axis_of_motion, idScalar mass, const vec3 &body_r_body_com,
+				const mat33 &body_I_body, const int user_int, void *user_ptr);
+	/// build index arrays
+	/// @return 0 on success, -1 on failure
+	int buildIndexSets();
+	/// @return number of degrees of freedom
+	int numDoFs() const { return m_num_dofs; }
+	/// @return number of bodies
+	int numBodies() const { return m_inertias.size(); }
+	/// get inertia data for index
+	/// @param index of the body
+	/// @param inertia pointer for return data
+	/// @return 0 on success, -1 on failure
+	int getInertiaData(const int index, InertiaData *inertia) const;
+	/// get joint data for index
+	/// @param index of the body
+	/// @param joint pointer for return data
+	/// @return 0 on success, -1 on failure
+	int getJointData(const int index, JointData *joint) const;
+	/// get parent index array (parent_index[i] is the index of the parent of i)
+	/// @param parent_index pointer for return data
+	void getParentIndexArray(idArray<int>::type *parent_index) { *parent_index = m_parent_index; }
+	/// get user integer
+	/// @param index body index
+	/// @param user_int user integer
+	/// @return 0 on success, -1 on failure
+	int getUserInt(const int index, int *user_int) const;
+	/// get user pointer
+	/// @param index body index
+	/// @param user_ptr user pointer
+	/// @return 0 on success, -1 on failure
+	int getUserPtr(const int index, void **user_ptr) const;
+	// vector of bodies
+	idArray<InertiaData>::type m_inertias;
+	// vector of joints
+	idArray<JointData>::type m_joints;
+	// number of mechanical degrees of freedom
+	int m_num_dofs;
+	// parent index array
+	idArray<int>::type m_parent_index;
+	// user integers
+	idArray<int>::type m_user_int;
+	// user pointers
+	idArray<void *>::type m_user_ptr;
+	// index of root body (or -1 if not set)
+	int m_root_index;
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverBuffer_DX11.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverBuffer_DX11.h
deleted file mode 100644
index b6a99cc1..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverBuffer_DX11.h
+++ /dev/null
@@ -1,323 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-// DX11 support
-#include <windows.h>
-#include <crtdbg.h>
-#include <d3d11.h>
-#include <d3dx11.h>
-#include <d3dcompiler.h>
-#define SAFE_RELEASE(p)      { if(p) { (p)->Release(); (p)=NULL; } }
- * DX11 Buffer that tracks a host buffer on use to ensure size-correctness.
- */
-template <typename ElementType> class btDX11Buffer
-	ID3D11Device*				m_d3dDevice;
-	ID3D11DeviceContext*		m_d3dDeviceContext;
-	ID3D11Buffer*               m_Buffer;
-	ID3D11ShaderResourceView*   m_SRV;
-	ID3D11UnorderedAccessView*  m_UAV;
-	btAlignedObjectArray< ElementType >*	m_CPUBuffer;
-	// TODO: Separate this from the main class
-	// as read back buffers can be shared between buffers
-	ID3D11Buffer*               m_readBackBuffer;
-	int m_gpuSize;
-	bool m_onGPU;
-	bool m_readOnlyOnGPU;
-	bool createBuffer( ID3D11Buffer *preexistingBuffer = 0)
-	{
-		HRESULT hr = S_OK;
-		// Create all CS buffers
-		if( preexistingBuffer )
-		{
-			m_Buffer = preexistingBuffer;
-		} else {
-			D3D11_BUFFER_DESC buffer_desc;
-			ZeroMemory(&buffer_desc, sizeof(buffer_desc));		
-			buffer_desc.Usage = D3D11_USAGE_DEFAULT;
-			if( m_readOnlyOnGPU )
-				buffer_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
-			else
-				buffer_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
-			buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
-			buffer_desc.ByteWidth = m_CPUBuffer->size() * sizeof(ElementType);
-			// At a minimum the buffer must exist
-			if( buffer_desc.ByteWidth == 0 )
-				buffer_desc.ByteWidth = sizeof(ElementType);
-			buffer_desc.StructureByteStride = sizeof(ElementType);
-			hr = m_d3dDevice->CreateBuffer(&buffer_desc, NULL, &m_Buffer);
-			if( FAILED( hr ) )
-		        return (hr==S_OK);
-		} 
-		if( m_readOnlyOnGPU )
-		{
-			D3D11_SHADER_RESOURCE_VIEW_DESC srvbuffer_desc;
-			ZeroMemory(&srvbuffer_desc, sizeof(srvbuffer_desc));
-			srvbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
-			srvbuffer_desc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
-			srvbuffer_desc.Buffer.ElementWidth = m_CPUBuffer->size();
-			if( srvbuffer_desc.Buffer.ElementWidth == 0 )
-				srvbuffer_desc.Buffer.ElementWidth = 1;
-			hr = m_d3dDevice->CreateShaderResourceView(m_Buffer, &srvbuffer_desc, &m_SRV);
-			if( FAILED( hr ) )
-				return (hr==S_OK);
-		} else {
-			// Create SRV
-			D3D11_SHADER_RESOURCE_VIEW_DESC srvbuffer_desc;
-			ZeroMemory(&srvbuffer_desc, sizeof(srvbuffer_desc));
-			srvbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
-			srvbuffer_desc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
-			srvbuffer_desc.Buffer.ElementWidth = m_CPUBuffer->size();
-			if( srvbuffer_desc.Buffer.ElementWidth == 0 )
-				srvbuffer_desc.Buffer.ElementWidth = 1;
-			hr = m_d3dDevice->CreateShaderResourceView(m_Buffer, &srvbuffer_desc, &m_SRV);
-			if( FAILED( hr ) )
-				return (hr==S_OK);
-			// Create UAV
-			D3D11_UNORDERED_ACCESS_VIEW_DESC uavbuffer_desc;
-			ZeroMemory(&uavbuffer_desc, sizeof(uavbuffer_desc));
-			uavbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
-			uavbuffer_desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
-			uavbuffer_desc.Buffer.NumElements = m_CPUBuffer->size();
-			if( uavbuffer_desc.Buffer.NumElements == 0 )
-				uavbuffer_desc.Buffer.NumElements = 1;
-			hr = m_d3dDevice->CreateUnorderedAccessView(m_Buffer, &uavbuffer_desc, &m_UAV);
-			if( FAILED( hr ) )
-				return (hr==S_OK);
-			// Create read back buffer
-			D3D11_BUFFER_DESC readback_buffer_desc;
-			ZeroMemory(&readback_buffer_desc, sizeof(readback_buffer_desc));
-			readback_buffer_desc.ByteWidth = m_CPUBuffer->size() * sizeof(ElementType);
-			readback_buffer_desc.Usage = D3D11_USAGE_STAGING;
-			readback_buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
-			readback_buffer_desc.StructureByteStride = sizeof(ElementType);
-			hr = m_d3dDevice->CreateBuffer(&readback_buffer_desc, NULL, &m_readBackBuffer);
-			if( FAILED( hr ) )
-				return (hr==S_OK);
-		}
-		m_gpuSize = m_CPUBuffer->size();
-		return true;
-	}
-	btDX11Buffer( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext, btAlignedObjectArray< ElementType > *CPUBuffer, bool readOnly )
-	{
-		m_d3dDevice = d3dDevice;
-		m_d3dDeviceContext = d3dDeviceContext;
-		m_Buffer = 0;
-		m_SRV = 0;
-		m_UAV = 0;
-		m_readBackBuffer = 0;
-		m_CPUBuffer = CPUBuffer;
-		m_gpuSize = 0;
-		m_onGPU = false;
-		m_readOnlyOnGPU = readOnly;
-	}
-	virtual ~btDX11Buffer()
-	{
-		SAFE_RELEASE(m_Buffer);
-		SAFE_RELEASE(m_readBackBuffer);
-	}
-	ID3D11ShaderResourceView* &getSRV()
-	{
-		return m_SRV;
-	}
-	ID3D11UnorderedAccessView* &getUAV()
-	{
-		return m_UAV;
-	}
-	ID3D11Buffer* &getBuffer()
-	{
-		return m_Buffer;
-	}
-	/**
-	 * Move the data to the GPU if it is not there already.
-	 */
-	bool moveToGPU()
-	{
-		// Reallocate if GPU size is too small
-		if( (m_CPUBuffer->size() > m_gpuSize ) )
-			m_onGPU = false;
-		if( !m_onGPU && m_CPUBuffer->size() > 0 )
-		{
-			// If the buffer doesn't exist or the CPU-side buffer has changed size, create
-			// We should really delete the old one, too, but let's leave that for later
-			if( !m_Buffer || (m_CPUBuffer->size() != m_gpuSize) )
-			{
-				SAFE_RELEASE(m_Buffer);
-				SAFE_RELEASE(m_readBackBuffer);
-				if( !createBuffer() )
-				{
-					btAssert("Buffer creation failed.");
-					return false;
-				}
-			}
-			if( m_gpuSize > 0 )
-			{
-				D3D11_BOX destRegion;
-				destRegion.left = 0;
-				destRegion.front = 0;
-				destRegion.top = 0;
-				destRegion.bottom = 1;
-				destRegion.back = 1;
-				destRegion.right = (m_CPUBuffer->size())*sizeof(ElementType);
-				m_d3dDeviceContext->UpdateSubresource(m_Buffer, 0, &destRegion, &((*m_CPUBuffer)[0]), 0, 0);
-				m_onGPU = true;
-			}
-		}
-		return true;
-	}
-	/**
-	 * Move the data back from the GPU if it is on there and isn't read only.
-	 */
-	bool moveFromGPU()
-	{
-		if( m_CPUBuffer->size() > 0 )
-		{
-			if( m_onGPU && !m_readOnlyOnGPU )
-			{
-				// Copy back
-				D3D11_MAPPED_SUBRESOURCE MappedResource = {0}; 
-				//m_pd3dImmediateContext->CopyResource(m_phAngVelReadBackBuffer, m_phAngVel);
-				D3D11_BOX destRegion;	
-				destRegion.left = 0;
-				destRegion.front = 0;
-				destRegion.top = 0;
-				destRegion.bottom = 1;
-				destRegion.back = 1;
-				destRegion.right = (m_CPUBuffer->size())*sizeof(ElementType);
-				m_d3dDeviceContext->CopySubresourceRegion(
-					m_readBackBuffer,
-					0,
-					0,
-					0,
-					0 ,
-					m_Buffer,
-					0,
-					&destRegion
-					);
-				m_d3dDeviceContext->Map(m_readBackBuffer, 0, D3D11_MAP_READ, 0, &MappedResource);   
-				//memcpy(m_hAngVel, MappedResource.pData, (m_maxObjs * sizeof(float) ));
-				memcpy(&((*m_CPUBuffer)[0]), MappedResource.pData, ((m_CPUBuffer->size()) * sizeof(ElementType) ));		
-				m_d3dDeviceContext->Unmap(m_readBackBuffer, 0);
-				m_onGPU = false;
-			}
-		}
-		return true;
-	}
-	/**
-	 * Copy the data back from the GPU without changing its state to be CPU-side.
-	 * Useful if we just want to view it on the host for visualization.
-	 */
-	bool copyFromGPU()
-	{
-		if( m_CPUBuffer->size() > 0 )
-		{
-			if( m_onGPU && !m_readOnlyOnGPU )
-			{
-				// Copy back
-				D3D11_MAPPED_SUBRESOURCE MappedResource = {0}; 
-				D3D11_BOX destRegion;	
-				destRegion.left = 0;
-				destRegion.front = 0;
-				destRegion.top = 0;
-				destRegion.bottom = 1;
-				destRegion.back = 1;
-				destRegion.right = (m_CPUBuffer->size())*sizeof(ElementType);
-				m_d3dDeviceContext->CopySubresourceRegion(
-					m_readBackBuffer,
-					0,
-					0,
-					0,
-					0 ,
-					m_Buffer,
-					0,
-					&destRegion
-					);
-				m_d3dDeviceContext->Map(m_readBackBuffer, 0, D3D11_MAP_READ, 0, &MappedResource);   
-				//memcpy(m_hAngVel, MappedResource.pData, (m_maxObjs * sizeof(float) ));
-				memcpy(&((*m_CPUBuffer)[0]), MappedResource.pData, ((m_CPUBuffer->size()) * sizeof(ElementType) ));		
-				m_d3dDeviceContext->Unmap(m_readBackBuffer, 0);
-			}
-		}
-		return true;
-	}
-	/**
-	 * Call if data has changed on the CPU.
-	 * Can then trigger a move to the GPU as necessary.
-	 */
-	virtual void changedOnCPU()
-	{
-		m_onGPU = false;
-	}
-}; // class btDX11Buffer
-#endif // #ifndef BT_SOFT_BODY_SOLVER_BUFFER_DX11_H
\ No newline at end of file
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11.h
deleted file mode 100644
index 454c3c8c..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11.h
+++ /dev/null
@@ -1,103 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
-#include "btSoftBodySolverBuffer_DX11.h"
-struct ID3D11Device;
-struct ID3D11DeviceContext;
-class btSoftBodyLinkDataDX11 : public btSoftBodyLinkData
-	bool				m_onGPU;
-	ID3D11Device		*m_d3dDevice;
-	ID3D11DeviceContext *m_d3dDeviceContext;
-	btDX11Buffer<LinkNodePair>				m_dx11Links;
-	btDX11Buffer<float>											m_dx11LinkStrength;
-	btDX11Buffer<float>											m_dx11LinksMassLSC;
-	btDX11Buffer<float>											m_dx11LinksRestLengthSquared;
-	btDX11Buffer<Vectormath::Aos::Vector3>						m_dx11LinksCLength;
-	btDX11Buffer<float>											m_dx11LinksLengthRatio;
-	btDX11Buffer<float>											m_dx11LinksRestLength;
-	btDX11Buffer<float>											m_dx11LinksMaterialLinearStiffnessCoefficient;
-	struct BatchPair
-	{
-		int start;
-		int length;
-		BatchPair() :
-			start(0),
-			length(0)
-		{
-		}
-		BatchPair( int s, int l ) : 
-			start( s ),
-			length( l )
-		{
-		}
-	};
-	/**
-	 * Link addressing information for each cloth.
-	 * Allows link locations to be computed independently of data batching.
-	 */
-	btAlignedObjectArray< int >							m_linkAddresses;
-	/**
-	 * Start and length values for computation batches over link data.
-	 */
-	btAlignedObjectArray< BatchPair >		m_batchStartLengths;
-	//ID3D11Buffer*               readBackBuffer;
-	btSoftBodyLinkDataDX11( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext );
-	virtual ~btSoftBodyLinkDataDX11();
-	/** Allocate enough space in all link-related arrays to fit numLinks links */
-	virtual void createLinks( int numLinks );
-	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-	virtual void setLinkAt( const LinkDescription &link, int linkIndex );
-	virtual bool onAccelerator();
-	virtual bool moveToAccelerator();
-	virtual bool moveFromAccelerator();
-	/**
-	 * Generate (and later update) the batching for the entire link set.
-	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
-	 * In theory we could delay it until just before we need the cloth.
-	 * It's a one-off overhead, though, so that is a later optimisation.
-	 */
-	void generateBatches();
-#endif // #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_DX11_H
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h
deleted file mode 100644
index 6eb26c68..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h
+++ /dev/null
@@ -1,173 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
-#include "btSoftBodySolverBuffer_DX11.h"
-struct ID3D11Device;
-struct ID3D11DeviceContext;
-class btSoftBodyLinkDataDX11SIMDAware : public btSoftBodyLinkData
-	bool				m_onGPU;
-	ID3D11Device		*m_d3dDevice;
-	ID3D11DeviceContext *m_d3dDeviceContext;
-	const int m_wavefrontSize;
-	const int m_linksPerWorkItem;
-	const int m_maxLinksPerWavefront;
-	int m_maxBatchesWithinWave;
-	int m_maxVerticesWithinWave;
-	int m_numWavefronts;
-	int m_maxVertex;
-	struct NumBatchesVerticesPair
-	{
-		int numBatches;
-		int numVertices;
-	};
-	// Array storing number of links in each wavefront
-	btAlignedObjectArray<int>									m_linksPerWavefront;
-	btAlignedObjectArray<NumBatchesVerticesPair>				m_numBatchesAndVerticesWithinWaves;
-	btDX11Buffer< NumBatchesVerticesPair >						m_dx11NumBatchesAndVerticesWithinWaves;
-	// All arrays here will contain batches of m_maxLinksPerWavefront links
-	// ordered by wavefront.
-	// with either global vertex pairs or local vertex pairs
-	btAlignedObjectArray< int >									m_wavefrontVerticesGlobalAddresses; // List of global vertices per wavefront
-	btDX11Buffer<int>											m_dx11WavefrontVerticesGlobalAddresses;
-	btAlignedObjectArray< LinkNodePair >						m_linkVerticesLocalAddresses; // Vertex pair for the link
-	btDX11Buffer<LinkNodePair>									m_dx11LinkVerticesLocalAddresses;
-	btDX11Buffer<float>											m_dx11LinkStrength;
-	btDX11Buffer<float>											m_dx11LinksMassLSC;
-	btDX11Buffer<float>											m_dx11LinksRestLengthSquared;
-	btDX11Buffer<float>											m_dx11LinksRestLength;
-	btDX11Buffer<float>											m_dx11LinksMaterialLinearStiffnessCoefficient;
-	struct BatchPair
-	{
-		int start;
-		int length;
-		BatchPair() :
-			start(0),
-			length(0)
-		{
-		}
-		BatchPair( int s, int l ) : 
-			start( s ),
-			length( l )
-		{
-		}
-	};
-	/**
-	 * Link addressing information for each cloth.
-	 * Allows link locations to be computed independently of data batching.
-	 */
-	btAlignedObjectArray< int >							m_linkAddresses;
-	/**
-	 * Start and length values for computation batches over link data.
-	 */
-	btAlignedObjectArray< BatchPair >		m_wavefrontBatchStartLengths;
-	//ID3D11Buffer*               readBackBuffer;
-	btSoftBodyLinkDataDX11SIMDAware( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext );
-	virtual ~btSoftBodyLinkDataDX11SIMDAware();
-	/** Allocate enough space in all link-related arrays to fit numLinks links */
-	virtual void createLinks( int numLinks );
-	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-	virtual void setLinkAt( const LinkDescription &link, int linkIndex );
-	virtual bool onAccelerator();
-	virtual bool moveToAccelerator();
-	virtual bool moveFromAccelerator();
-	/**
-	 * Generate (and later update) the batching for the entire link set.
-	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
-	 * In theory we could delay it until just before we need the cloth.
-	 * It's a one-off overhead, though, so that is a later optimisation.
-	 */
-	void generateBatches();
-	int getMaxVerticesPerWavefront()
-	{
-		return m_maxVerticesWithinWave;
-	}
-	int getWavefrontSize()
-	{
-		return m_wavefrontSize;
-	}
-	int getLinksPerWorkItem()
-	{
-		return m_linksPerWorkItem;
-	}
-	int getMaxLinksPerWavefront()
-	{
-		return m_maxLinksPerWavefront;
-	}
-	int getMaxBatchesPerWavefront()
-	{
-		return m_maxBatchesWithinWave;
-	}
-	int getNumWavefronts()
-	{
-		return m_numWavefronts;
-	}
-	NumBatchesVerticesPair getNumBatchesAndVerticesWithinWavefront( int wavefront )
-	{
-		return m_numBatchesAndVerticesWithinWaves[wavefront];
-	}
-	int getVertexGlobalAddresses( int vertexIndex )
-	{
-		return m_wavefrontVerticesGlobalAddresses[vertexIndex];
-	}
-	/**
-	 * Get post-batching local addresses of the vertex pair for a link assuming all vertices used by a wavefront are loaded locally.
-	 */
-	LinkNodePair getVertexPairLocalAddresses( int linkIndex )
-	{
-		return m_linkVerticesLocalAddresses[linkIndex];
-	}
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverTriangleData_DX11.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverTriangleData_DX11.h
deleted file mode 100644
index 7012fabd..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverTriangleData_DX11.h
+++ /dev/null
@@ -1,96 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
-#include "btSoftBodySolverBuffer_DX11.h"
-struct ID3D11Device;
-struct ID3D11DeviceContext;
-class btSoftBodyTriangleDataDX11 : public btSoftBodyTriangleData
-	bool				m_onGPU;
-	ID3D11Device		*m_d3dDevice;
-	ID3D11DeviceContext *m_d3dDeviceContext;
-	btDX11Buffer<btSoftBodyTriangleData::TriangleNodeSet>							m_dx11VertexIndices;
-	btDX11Buffer<float>									m_dx11Area;
-	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11Normal;
-	struct BatchPair
-	{
-		int start;
-		int length;
-		BatchPair() :
-			start(0),
-			length(0)
-		{
-		}
-		BatchPair( int s, int l ) : 
-			start( s ),
-			length( l )
-		{
-		}
-	};
-	/**
-	 * Link addressing information for each cloth.
-	 * Allows link locations to be computed independently of data batching.
-	 */
-	btAlignedObjectArray< int >							m_triangleAddresses;
-	/**
-	 * Start and length values for computation batches over link data.
-	 */
-	btAlignedObjectArray< BatchPair >		m_batchStartLengths;
-	//ID3D11Buffer*               readBackBuffer;
-	btSoftBodyTriangleDataDX11( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext );
-	virtual ~btSoftBodyTriangleDataDX11();
-	/** Allocate enough space in all link-related arrays to fit numLinks links */
-	virtual void createTriangles( int numTriangles );
-	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-	virtual void setTriangleAt( const btSoftBodyTriangleData::TriangleDescription &triangle, int triangleIndex );
-	virtual bool onAccelerator();
-	virtual bool moveToAccelerator();
-	virtual bool moveFromAccelerator();
-	/**
-	 * Generate (and later update) the batching for the entire triangle set.
-	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
-	 * In theory we could delay it until just before we need the cloth.
-	 * It's a one-off overhead, though, so that is a later optimisation.
-	 */
-	void generateBatches();
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexBuffer_DX11.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexBuffer_DX11.h
deleted file mode 100644
index 66bd90fa..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexBuffer_DX11.h
+++ /dev/null
@@ -1,107 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
-#include <windows.h>
-#include <crtdbg.h>
-#include <d3d11.h>
-#include <d3dx11.h>
-#include <d3dcompiler.h>
-class btDX11VertexBufferDescriptor : public btVertexBufferDescriptor
-	/** Context of the DX11 device on which the vertex buffer is stored. */
-	ID3D11DeviceContext* m_context;
-	/** DX11 vertex buffer */
-	ID3D11Buffer* m_vertexBuffer;
-	/** UAV for DX11 buffer */
-	ID3D11UnorderedAccessView*  m_vertexBufferUAV;
-	/**
-	 * buffer is a pointer to the DX11 buffer to place the vertex data in.
-	 * UAV is a pointer to the UAV representation of the buffer laid out in floats.
-	 * vertexOffset is the offset in floats to the first vertex.
-	 * vertexStride is the stride in floats between vertices.
-	 */
-	btDX11VertexBufferDescriptor( ID3D11DeviceContext* context, ID3D11Buffer* buffer, ID3D11UnorderedAccessView *UAV, int vertexOffset, int vertexStride )
-	{
-		m_context = context;
-		m_vertexBuffer = buffer;
-		m_vertexBufferUAV = UAV;
-		m_vertexOffset = vertexOffset;
-		m_vertexStride = vertexStride;
-		m_hasVertexPositions = true;
-	}
-	/**
-	 * buffer is a pointer to the DX11 buffer to place the vertex data in.
-	 * UAV is a pointer to the UAV representation of the buffer laid out in floats.
-	 * vertexOffset is the offset in floats to the first vertex.
-	 * vertexStride is the stride in floats between vertices.
-	 * normalOffset is the offset in floats to the first normal.
-	 * normalStride is the stride in floats between normals.
-	 */
-	btDX11VertexBufferDescriptor( ID3D11DeviceContext* context, ID3D11Buffer* buffer, ID3D11UnorderedAccessView *UAV, int vertexOffset, int vertexStride, int normalOffset, int normalStride )
-	{
-		m_context = context;
-		m_vertexBuffer = buffer;
-		m_vertexBufferUAV = UAV;
-		m_vertexOffset = vertexOffset;
-		m_vertexStride = vertexStride;
-		m_hasVertexPositions = true;
-		m_normalOffset = normalOffset;
-		m_normalStride = normalStride;
-		m_hasNormals = true;
-	}
-	virtual ~btDX11VertexBufferDescriptor()
-	{
-	}
-	/**
-	 * Return the type of the vertex buffer descriptor.
-	 */
-	virtual BufferTypes getBufferType() const
-	{
-		return DX11_BUFFER;
-	}
-	virtual ID3D11DeviceContext* getContext() const
-	{
-		return m_context;
-	}
-	virtual ID3D11Buffer* getbtDX11Buffer() const
-	{
-		return m_vertexBuffer;
-	}
-	virtual ID3D11UnorderedAccessView* getDX11UAV() const
-	{
-		return m_vertexBufferUAV;
-	}		
\ No newline at end of file
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexData_DX11.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexData_DX11.h
deleted file mode 100644
index dd7cc84c..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexData_DX11.h
+++ /dev/null
@@ -1,63 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
-#include "btSoftBodySolverBuffer_DX11.h"
-class btSoftBodyLinkData;
-class btSoftBodyLinkData::LinkDescription;
-struct ID3D11Device;
-struct ID3D11DeviceContext;
-class btSoftBodyVertexDataDX11 : public btSoftBodyVertexData
-	bool				m_onGPU;
-	ID3D11Device		*m_d3dDevice;
-	ID3D11DeviceContext *m_d3dDeviceContext;
-	btDX11Buffer<int>										m_dx11ClothIdentifier;
-	btDX11Buffer<Vectormath::Aos::Point3>					m_dx11VertexPosition;
-	btDX11Buffer<Vectormath::Aos::Point3>					m_dx11VertexPreviousPosition;
-	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11VertexVelocity;
-	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11VertexForceAccumulator;
-	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11VertexNormal;
-	btDX11Buffer<float>									m_dx11VertexInverseMass;
-	btDX11Buffer<float>									m_dx11VertexArea;
-	btDX11Buffer<int>										m_dx11VertexTriangleCount;
-	//ID3D11Buffer*               readBackBuffer;
-	btSoftBodyVertexDataDX11( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext );
-	virtual ~btSoftBodyVertexDataDX11();
-	virtual bool onAccelerator();
-	virtual bool moveToAccelerator();
-	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true);
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
deleted file mode 100644
index 1f71425c..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
+++ /dev/null
@@ -1,2236 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
-#include "vectormath/vmInclude.h"
-#include "btSoftBodySolver_DX11.h"
-#include "btSoftBodySolverVertexBuffer_DX11.h"
-#include "BulletSoftBody/btSoftBody.h"
-#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
-#include <stdio.h> //printf
-#define MSTRINGIFY(A) #A
-static char* PrepareLinksHLSLString = 
-#include "HLSL/PrepareLinks.hlsl"
-static char* UpdatePositionsFromVelocitiesHLSLString = 
-#include "HLSL/UpdatePositionsFromVelocities.hlsl"
-static char* SolvePositionsHLSLString = 
-#include "HLSL/SolvePositions.hlsl"
-static char* UpdateNodesHLSLString = 
-#include "HLSL/UpdateNodes.hlsl"
-static char* UpdatePositionsHLSLString = 
-#include "HLSL/UpdatePositions.hlsl"
-static char* UpdateConstantsHLSLString = 
-#include "HLSL/UpdateConstants.hlsl"
-static char* IntegrateHLSLString = 
-#include "HLSL/Integrate.hlsl"
-static char* ApplyForcesHLSLString = 
-#include "HLSL/ApplyForces.hlsl"
-static char* UpdateNormalsHLSLString = 
-#include "HLSL/UpdateNormals.hlsl"
-static char* OutputToVertexArrayHLSLString = 
-#include "HLSL/OutputToVertexArray.hlsl"
-static char* VSolveLinksHLSLString = 
-#include "HLSL/VSolveLinks.hlsl"
-static char* ComputeBoundsHLSLString = 
-#include "HLSL/ComputeBounds.hlsl"
-static char* SolveCollisionsAndUpdateVelocitiesHLSLString =
-#include "HLSL/SolveCollisionsAndUpdateVelocities.hlsl"
-btSoftBodyLinkDataDX11::btSoftBodyLinkDataDX11( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext ) : 
-		m_dx11Links( d3dDevice, d3dDeviceContext, &m_links, false ),
-		m_dx11LinkStrength( d3dDevice, d3dDeviceContext, &m_linkStrength, false ),
-		m_dx11LinksMassLSC( d3dDevice, d3dDeviceContext, &m_linksMassLSC, false ),
-		m_dx11LinksRestLengthSquared( d3dDevice, d3dDeviceContext, &m_linksRestLengthSquared, false ),
-		m_dx11LinksCLength( d3dDevice, d3dDeviceContext, &m_linksCLength, false ),
-		m_dx11LinksLengthRatio( d3dDevice, d3dDeviceContext, &m_linksLengthRatio, false ),
-		m_dx11LinksRestLength( d3dDevice, d3dDeviceContext, &m_linksRestLength, false ),
-		m_dx11LinksMaterialLinearStiffnessCoefficient( d3dDevice, d3dDeviceContext, &m_linksMaterialLinearStiffnessCoefficient, false )
-	m_d3dDevice = d3dDevice;
-	m_d3dDeviceContext = d3dDeviceContext;
-static Vectormath::Aos::Vector3 toVector3( const btVector3 &vec )
-	Vectormath::Aos::Vector3 outVec( vec.getX(), vec.getY(), vec.getZ() );
-	return outVec;
-void btSoftBodyLinkDataDX11::createLinks( int numLinks )
-	int previousSize = m_links.size();
-	int newSize = previousSize + numLinks;
-	btSoftBodyLinkData::createLinks( numLinks );
-	// Resize the link addresses array as well
-	m_linkAddresses.resize( newSize );
-void btSoftBodyLinkDataDX11::setLinkAt( const btSoftBodyLinkData::LinkDescription &link, int linkIndex )
-	btSoftBodyLinkData::setLinkAt( link, linkIndex );
-	// Set the link index correctly for initialisation
-	m_linkAddresses[linkIndex] = linkIndex;
-bool btSoftBodyLinkDataDX11::onAccelerator()
-	return m_onGPU;
-bool btSoftBodyLinkDataDX11::moveToAccelerator()
-	bool success = true;
-	success = success && m_dx11Links.moveToGPU();
-	success = success && m_dx11LinkStrength.moveToGPU();
-	success = success && m_dx11LinksMassLSC.moveToGPU();
-	success = success && m_dx11LinksRestLengthSquared.moveToGPU();
-	success = success && m_dx11LinksCLength.moveToGPU();
-	success = success && m_dx11LinksLengthRatio.moveToGPU();
-	success = success && m_dx11LinksRestLength.moveToGPU();
-	success = success && m_dx11LinksMaterialLinearStiffnessCoefficient.moveToGPU();
-	if( success )
-		m_onGPU = true;
-	return success;
-bool btSoftBodyLinkDataDX11::moveFromAccelerator()
-	bool success = true;
-	success = success && m_dx11Links.moveFromGPU();
-	success = success && m_dx11LinkStrength.moveFromGPU();
-	success = success && m_dx11LinksMassLSC.moveFromGPU();
-	success = success && m_dx11LinksRestLengthSquared.moveFromGPU();
-	success = success && m_dx11LinksCLength.moveFromGPU();
-	success = success && m_dx11LinksLengthRatio.moveFromGPU();
-	success = success && m_dx11LinksRestLength.moveFromGPU();
-	success = success && m_dx11LinksMaterialLinearStiffnessCoefficient.moveFromGPU();
-	if( success )
-		m_onGPU = false;
-	return success;
-void btSoftBodyLinkDataDX11::generateBatches()
-	int numLinks = getNumLinks();
-	// Do the graph colouring here temporarily
-	btAlignedObjectArray< int > batchValues;
-	batchValues.resize( numLinks, 0 );
-	// Find the maximum vertex value internally for now
-	int maxVertex = 0;
-	for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
-	{
-		int vertex0 = getVertexPair(linkIndex).vertex0;
-		int vertex1 = getVertexPair(linkIndex).vertex1;
-		if( vertex0 > maxVertex )
-			maxVertex = vertex0;
-		if( vertex1 > maxVertex )
-			maxVertex = vertex1;
-	}
-	int numVertices = maxVertex + 1;
-	// Set of lists, one for each node, specifying which colours are connected
-	// to that node.
-	// No two edges into a node can share a colour.
-	btAlignedObjectArray< btAlignedObjectArray< int > > vertexConnectedColourLists;
-	vertexConnectedColourLists.resize(numVertices);
-	// Simple algorithm that chooses the lowest batch number
-	// that none of the links attached to either of the connected 
-	// nodes is in
-	for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
-	{				
-		int linkLocation = m_linkAddresses[linkIndex];
-		int vertex0 = getVertexPair(linkLocation).vertex0;
-		int vertex1 = getVertexPair(linkLocation).vertex1;
-		// Get the two node colour lists
-		btAlignedObjectArray< int > &colourListVertex0( vertexConnectedColourLists[vertex0] );
-		btAlignedObjectArray< int > &colourListVertex1( vertexConnectedColourLists[vertex1] );
-		// Choose the minimum colour that is in neither list
-		int colour = 0;
-		while( colourListVertex0.findLinearSearch(colour) != colourListVertex0.size() || colourListVertex1.findLinearSearch(colour) != colourListVertex1.size()  )
-			++colour;
-		// i should now be the minimum colour in neither list
-		// Add to the two lists so that future edges don't share
-		// And store the colour against this edge
-		colourListVertex0.push_back(colour);
-		colourListVertex1.push_back(colour);
-		batchValues[linkIndex] = colour;
-	}
-	// Check the colour counts
-	btAlignedObjectArray< int > batchCounts;
-	for( int i = 0; i < numLinks; ++i )
-	{
-		int batch = batchValues[i];
-		if( batch >= batchCounts.size() )
-			batchCounts.push_back(1);
-		else
-			++(batchCounts[batch]);
-	}
-	m_batchStartLengths.resize(batchCounts.size());
-	if( m_batchStartLengths.size() > 0 )
-	{
-		m_batchStartLengths[0] = BatchPair( 0, 0 );
-		int sum = 0;
-		for( int batchIndex = 0; batchIndex < batchCounts.size(); ++batchIndex )
-		{
-			m_batchStartLengths[batchIndex].start = sum;
-			m_batchStartLengths[batchIndex].length = batchCounts[batchIndex];
-			sum += batchCounts[batchIndex];
-		}
-	}
-	/////////////////////////////
-	// Sort data based on batches
-	// Create source arrays by copying originals
-	btAlignedObjectArray<btSoftBodyLinkData::LinkNodePair>				m_links_Backup(m_links);
-	btAlignedObjectArray<float>											m_linkStrength_Backup(m_linkStrength);
-	btAlignedObjectArray<float>											m_linksMassLSC_Backup(m_linksMassLSC);
-	btAlignedObjectArray<float>											m_linksRestLengthSquared_Backup(m_linksRestLengthSquared);
-	btAlignedObjectArray<Vectormath::Aos::Vector3>						m_linksCLength_Backup(m_linksCLength);
-	btAlignedObjectArray<float>											m_linksLengthRatio_Backup(m_linksLengthRatio);
-	btAlignedObjectArray<float>											m_linksRestLength_Backup(m_linksRestLength);
-	btAlignedObjectArray<float>											m_linksMaterialLinearStiffnessCoefficient_Backup(m_linksMaterialLinearStiffnessCoefficient);
-	for( int batch = 0; batch < batchCounts.size(); ++batch )
-		batchCounts[batch] = 0;
-	// Do sort as single pass into destination arrays	
-	for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
-	{
-		// To maintain locations run off the original link locations rather than the current position.
-		// It's not cache efficient, but as we run this rarely that should not matter.
-		// It's faster than searching the link location array for the current location and then updating it.
-		// The other alternative would be to unsort before resorting, but this is equivalent to doing that.
-		int linkLocation = m_linkAddresses[linkIndex];
-		// Obtain batch and calculate target location for the
-		// next element in that batch, incrementing the batch counter
-		// afterwards
-		int batch = batchValues[linkIndex];
-		int newLocation = m_batchStartLengths[batch].start + batchCounts[batch];
-		batchCounts[batch] = batchCounts[batch] + 1;
-		m_links[newLocation] = m_links_Backup[linkLocation];
-#if 1
-		m_linkStrength[newLocation] = m_linkStrength_Backup[linkLocation];
-		m_linksMassLSC[newLocation] = m_linksMassLSC_Backup[linkLocation];
-		m_linksRestLengthSquared[newLocation] = m_linksRestLengthSquared_Backup[linkLocation];
-		m_linksLengthRatio[newLocation] = m_linksLengthRatio_Backup[linkLocation];
-		m_linksRestLength[newLocation] = m_linksRestLength_Backup[linkLocation];
-		m_linksMaterialLinearStiffnessCoefficient[newLocation] = m_linksMaterialLinearStiffnessCoefficient_Backup[linkLocation];
-		// Update the locations array to account for the moved entry
-		m_linkAddresses[linkIndex] = newLocation;
-	}
-} // void btSoftBodyLinkDataDX11::generateBatches()
-btSoftBodyVertexDataDX11::btSoftBodyVertexDataDX11( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext ) : 
-	m_dx11ClothIdentifier( d3dDevice, d3dDeviceContext, &m_clothIdentifier, false ),
-	m_dx11VertexPosition( d3dDevice, d3dDeviceContext, &m_vertexPosition, false ),
-	m_dx11VertexPreviousPosition( d3dDevice, d3dDeviceContext, &m_vertexPreviousPosition, false ),
-	m_dx11VertexVelocity( d3dDevice, d3dDeviceContext, &m_vertexVelocity, false ),
-	m_dx11VertexForceAccumulator( d3dDevice, d3dDeviceContext, &m_vertexForceAccumulator, false ),
-	m_dx11VertexNormal( d3dDevice, d3dDeviceContext, &m_vertexNormal, false ),
-	m_dx11VertexInverseMass( d3dDevice, d3dDeviceContext, &m_vertexInverseMass, false ),
-	m_dx11VertexArea( d3dDevice, d3dDeviceContext, &m_vertexArea, false ),
-	m_dx11VertexTriangleCount( d3dDevice, d3dDeviceContext, &m_vertexTriangleCount, false )
-	m_d3dDevice = d3dDevice;
-	m_d3dDeviceContext = d3dDeviceContext;
-bool btSoftBodyVertexDataDX11::onAccelerator()
-	return m_onGPU;
-bool btSoftBodyVertexDataDX11::moveToAccelerator()
-	bool success = true;
-	success = success && m_dx11ClothIdentifier.moveToGPU();
-	success = success && m_dx11VertexPosition.moveToGPU();
-	success = success && m_dx11VertexPreviousPosition.moveToGPU();
-	success = success && m_dx11VertexVelocity.moveToGPU();
-	success = success && m_dx11VertexForceAccumulator.moveToGPU();
-	success = success && m_dx11VertexNormal.moveToGPU();
-	success = success && m_dx11VertexInverseMass.moveToGPU();
-	success = success && m_dx11VertexArea.moveToGPU();
-	success = success && m_dx11VertexTriangleCount.moveToGPU();
-	if( success )
-		m_onGPU = true;
-	return success;
-bool btSoftBodyVertexDataDX11::moveFromAccelerator(bool bCopy, bool bCopyMinimum)
-	bool success = true;
-	if (!bCopy)
-	{
-		success = success && m_dx11ClothIdentifier.moveFromGPU();
-		success = success && m_dx11VertexPosition.moveFromGPU();
-		success = success && m_dx11VertexPreviousPosition.moveFromGPU();
-		success = success && m_dx11VertexVelocity.moveFromGPU();
-		success = success && m_dx11VertexForceAccumulator.moveFromGPU();
-		success = success && m_dx11VertexNormal.moveFromGPU();
-		success = success && m_dx11VertexInverseMass.moveFromGPU();
-		success = success && m_dx11VertexArea.moveFromGPU();
-		success = success && m_dx11VertexTriangleCount.moveFromGPU();
-	}
-	else
-	{
-		if (bCopyMinimum)
-		{
-			success = success && m_dx11VertexPosition.copyFromGPU();
-			success = success && m_dx11VertexNormal.copyFromGPU();
-		}
-		else
-		{
-			success = success && m_dx11ClothIdentifier.copyFromGPU();
-			success = success && m_dx11VertexPosition.copyFromGPU();
-			success = success && m_dx11VertexPreviousPosition.copyFromGPU();
-			success = success && m_dx11VertexVelocity.copyFromGPU();
-			success = success && m_dx11VertexForceAccumulator.copyFromGPU();
-			success = success && m_dx11VertexNormal.copyFromGPU();
-			success = success && m_dx11VertexInverseMass.copyFromGPU();
-			success = success && m_dx11VertexArea.copyFromGPU();
-			success = success && m_dx11VertexTriangleCount.copyFromGPU();
-		}
-	}
-	if( success )
-		m_onGPU = true;
-	return success;
-btSoftBodyTriangleDataDX11::btSoftBodyTriangleDataDX11( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext ) : 
-	m_dx11VertexIndices( d3dDevice, d3dDeviceContext, &m_vertexIndices, false ),
-	m_dx11Area( d3dDevice, d3dDeviceContext, &m_area, false ),
-	m_dx11Normal( d3dDevice, d3dDeviceContext, &m_normal, false )
-	m_d3dDevice = d3dDevice;
-	m_d3dDeviceContext = d3dDeviceContext;
-/** Allocate enough space in all link-related arrays to fit numLinks links */
-void btSoftBodyTriangleDataDX11::createTriangles( int numTriangles )
-	int previousSize = getNumTriangles();
-	int newSize = previousSize + numTriangles;
-	btSoftBodyTriangleData::createTriangles( numTriangles );
-	// Resize the link addresses array as well
-	m_triangleAddresses.resize( newSize );
-/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-void btSoftBodyTriangleDataDX11::setTriangleAt( const btSoftBodyTriangleData::TriangleDescription &triangle, int triangleIndex )
-	btSoftBodyTriangleData::setTriangleAt( triangle, triangleIndex );
-	m_triangleAddresses[triangleIndex] = triangleIndex;
-bool btSoftBodyTriangleDataDX11::onAccelerator()
-	return m_onGPU;
-bool btSoftBodyTriangleDataDX11::moveToAccelerator()
-	bool success = true;
-	success = success && m_dx11VertexIndices.moveToGPU();
-	success = success && m_dx11Area.moveToGPU();
-	success = success && m_dx11Normal.moveToGPU();
-	if( success )
-		m_onGPU = true;
-	return success;
-bool btSoftBodyTriangleDataDX11::moveFromAccelerator()
-	bool success = true;
-	success = success && m_dx11VertexIndices.moveFromGPU();
-	success = success && m_dx11Area.moveFromGPU();
-	success = success && m_dx11Normal.moveFromGPU();
-	if( success )
-		m_onGPU = true;
-	return success;
- * Generate (and later update) the batching for the entire triangle set.
- * This redoes a lot of work because it batches the entire set when each cloth is inserted.
- * In theory we could delay it until just before we need the cloth.
- * It's a one-off overhead, though, so that is a later optimisation.
- */
-void btSoftBodyTriangleDataDX11::generateBatches()
-	int numTriangles = getNumTriangles();
-	if( numTriangles == 0 )
-		return;
-	// Do the graph colouring here temporarily
-	btAlignedObjectArray< int > batchValues;
-	batchValues.resize( numTriangles );
-	// Find the maximum vertex value internally for now
-	int maxVertex = 0;
-	for( int triangleIndex = 0; triangleIndex < numTriangles; ++triangleIndex )
-	{
-		int vertex0 = getVertexSet(triangleIndex).vertex0;
-		int vertex1 = getVertexSet(triangleIndex).vertex1;
-		int vertex2 = getVertexSet(triangleIndex).vertex2;
-		if( vertex0 > maxVertex )
-			maxVertex = vertex0;
-		if( vertex1 > maxVertex )
-			maxVertex = vertex1;
-		if( vertex2 > maxVertex )
-			maxVertex = vertex2;
-	}
-	int numVertices = maxVertex + 1;
-	// Set of lists, one for each node, specifying which colours are connected
-	// to that node.
-	// No two edges into a node can share a colour.
-	btAlignedObjectArray< btAlignedObjectArray< int > > vertexConnectedColourLists;
-	vertexConnectedColourLists.resize(numVertices);
-	//std::cout << "\n";
-	// Simple algorithm that chooses the lowest batch number
-	// that none of the faces attached to either of the connected 
-	// nodes is in
-	for( int triangleIndex = 0; triangleIndex < numTriangles; ++triangleIndex )
-	{
-		// To maintain locations run off the original link locations rather than the current position.
-		// It's not cache efficient, but as we run this rarely that should not matter.
-		// It's faster than searching the link location array for the current location and then updating it.
-		// The other alternative would be to unsort before resorting, but this is equivalent to doing that.
-		int triangleLocation = m_triangleAddresses[triangleIndex];
-		int vertex0 = getVertexSet(triangleLocation).vertex0;
-		int vertex1 = getVertexSet(triangleLocation).vertex1;
-		int vertex2 = getVertexSet(triangleLocation).vertex2;
-		// Get the three node colour lists
-		btAlignedObjectArray< int > &colourListVertex0( vertexConnectedColourLists[vertex0] );
-		btAlignedObjectArray< int > &colourListVertex1( vertexConnectedColourLists[vertex1] );
-		btAlignedObjectArray< int > &colourListVertex2( vertexConnectedColourLists[vertex2] );
-		// Choose the minimum colour that is in none of the lists
-		int colour = 0;
-		while( 
-			colourListVertex0.findLinearSearch(colour) != colourListVertex0.size() || 
-			colourListVertex1.findLinearSearch(colour) != colourListVertex1.size() ||
-			colourListVertex2.findLinearSearch(colour) != colourListVertex2.size() )
-		{
-			++colour;
-		}
-		// i should now be the minimum colour in neither list
-		// Add to the three lists so that future edges don't share
-		// And store the colour against this face
-		colourListVertex0.push_back(colour);
-		colourListVertex1.push_back(colour);
-		colourListVertex2.push_back(colour);
-		batchValues[triangleIndex] = colour;
-	}
-	// Check the colour counts
-	btAlignedObjectArray< int > batchCounts;
-	for( int i = 0; i < numTriangles; ++i )
-	{
-		int batch = batchValues[i];
-		if( batch >= batchCounts.size() )
-			batchCounts.push_back(1);
-		else
-			++(batchCounts[batch]);
-	}
-	m_batchStartLengths.resize(batchCounts.size());
-	m_batchStartLengths[0] = BatchPair( 0, 0 );
-	int sum = 0;
-	for( int batchIndex = 0; batchIndex < batchCounts.size(); ++batchIndex )
-	{
-		m_batchStartLengths[batchIndex].start = sum;
-		m_batchStartLengths[batchIndex].length = batchCounts[batchIndex];
-		sum += batchCounts[batchIndex];
-	}
-	/////////////////////////////
-	// Sort data based on batches
-	// Create source arrays by copying originals
-	btAlignedObjectArray<btSoftBodyTriangleData::TriangleNodeSet>							m_vertexIndices_Backup(m_vertexIndices);
-	btAlignedObjectArray<float>										m_area_Backup(m_area);
-	btAlignedObjectArray<Vectormath::Aos::Vector3>					m_normal_Backup(m_normal);
-	for( int batch = 0; batch < batchCounts.size(); ++batch )
-		batchCounts[batch] = 0;
-	// Do sort as single pass into destination arrays	
-	for( int triangleIndex = 0; triangleIndex < numTriangles; ++triangleIndex )
-	{
-		// To maintain locations run off the original link locations rather than the current position.
-		// It's not cache efficient, but as we run this rarely that should not matter.
-		// It's faster than searching the link location array for the current location and then updating it.
-		// The other alternative would be to unsort before resorting, but this is equivalent to doing that.
-		int triangleLocation = m_triangleAddresses[triangleIndex];
-		// Obtain batch and calculate target location for the
-		// next element in that batch, incrementing the batch counter
-		// afterwards
-		int batch = batchValues[triangleIndex];
-		int newLocation = m_batchStartLengths[batch].start + batchCounts[batch];
-		batchCounts[batch] = batchCounts[batch] + 1;
-		m_vertexIndices[newLocation] = m_vertexIndices_Backup[triangleLocation];
-		m_area[newLocation] = m_area_Backup[triangleLocation];
-		m_normal[newLocation] = m_normal_Backup[triangleLocation];
-		// Update the locations array to account for the moved entry
-		m_triangleAddresses[triangleIndex] = newLocation;
-	}
-} // btSoftBodyTriangleDataDX11::generateBatches
-btDX11SoftBodySolver::btDX11SoftBodySolver(ID3D11Device * dx11Device, ID3D11DeviceContext* dx11Context, DXFunctions::CompileFromMemoryFunc dx11CompileFromMemory) :
-	m_dx11Device( dx11Device ),
-	m_dx11Context( dx11Context ),
-	dxFunctions( m_dx11Device, m_dx11Context, dx11CompileFromMemory ),
-	m_linkData(m_dx11Device, m_dx11Context),
-	m_vertexData(m_dx11Device, m_dx11Context),
-	m_triangleData(m_dx11Device, m_dx11Context),
-	m_dx11PerClothAcceleration( m_dx11Device, m_dx11Context, &m_perClothAcceleration, true ),
-	m_dx11PerClothWindVelocity( m_dx11Device, m_dx11Context, &m_perClothWindVelocity, true ),
-	m_dx11PerClothDampingFactor( m_dx11Device, m_dx11Context, &m_perClothDampingFactor, true ),
-	m_dx11PerClothVelocityCorrectionCoefficient( m_dx11Device, m_dx11Context, &m_perClothVelocityCorrectionCoefficient, true ),
-	m_dx11PerClothLiftFactor( m_dx11Device, m_dx11Context, &m_perClothLiftFactor, true ),
-	m_dx11PerClothDragFactor( m_dx11Device, m_dx11Context, &m_perClothDragFactor, true ),
-	m_dx11PerClothMediumDensity( m_dx11Device, m_dx11Context, &m_perClothMediumDensity, true ),
-	m_dx11PerClothCollisionObjects( m_dx11Device, m_dx11Context, &m_perClothCollisionObjects, true ),
-	m_dx11CollisionObjectDetails( m_dx11Device, m_dx11Context, &m_collisionObjectDetails, true ),
-	m_dx11PerClothMinBounds( m_dx11Device, m_dx11Context, &m_perClothMinBounds, false ),
-	m_dx11PerClothMaxBounds( m_dx11Device, m_dx11Context, &m_perClothMaxBounds, false ),
-	m_dx11PerClothFriction( m_dx11Device, m_dx11Context, &m_perClothFriction, false ),
-	m_enableUpdateBounds(false)
-	// Initial we will clearly need to update solver constants
-	// For now this is global for the cloths linked with this solver - we should probably make this body specific 
-	// for performance in future once we understand more clearly when constants need to be updated
-	m_updateSolverConstants = true;
-	m_shadersInitialized = false;
-	releaseKernels();
-void btDX11SoftBodySolver::releaseKernels()
-	SAFE_RELEASE( prepareLinksKernel.kernel );
-	SAFE_RELEASE( prepareLinksKernel.constBuffer );
-	SAFE_RELEASE( integrateKernel.kernel );
-	SAFE_RELEASE( integrateKernel.constBuffer );
-	SAFE_RELEASE( integrateKernel.kernel );
-	SAFE_RELEASE( solvePositionsFromLinksKernel.constBuffer );
-	SAFE_RELEASE( solvePositionsFromLinksKernel.kernel );
-	SAFE_RELEASE( updatePositionsFromVelocitiesKernel.constBuffer );
-	SAFE_RELEASE( updatePositionsFromVelocitiesKernel.kernel );
-	SAFE_RELEASE( updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer );
-	SAFE_RELEASE( updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel );
-	SAFE_RELEASE( updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer );
-	SAFE_RELEASE( updateVelocitiesFromPositionsWithVelocitiesKernel.kernel );
-	SAFE_RELEASE( resetNormalsAndAreasKernel.constBuffer );
-	SAFE_RELEASE( resetNormalsAndAreasKernel.kernel );
-	SAFE_RELEASE( normalizeNormalsAndAreasKernel.constBuffer );
-	SAFE_RELEASE( normalizeNormalsAndAreasKernel.kernel );
-	SAFE_RELEASE( updateSoftBodiesKernel.constBuffer );
-	SAFE_RELEASE( updateSoftBodiesKernel.kernel );
-	SAFE_RELEASE( solveCollisionsAndUpdateVelocitiesKernel.kernel );
-	SAFE_RELEASE( solveCollisionsAndUpdateVelocitiesKernel.constBuffer );
-	SAFE_RELEASE( computeBoundsKernel.kernel );
-	SAFE_RELEASE( computeBoundsKernel.constBuffer );
-	SAFE_RELEASE( vSolveLinksKernel.kernel );
-	SAFE_RELEASE( vSolveLinksKernel.constBuffer );
-	SAFE_RELEASE( addVelocityKernel.constBuffer );
-	SAFE_RELEASE( addVelocityKernel.kernel );
-	SAFE_RELEASE( applyForcesKernel.constBuffer );
-	SAFE_RELEASE( applyForcesKernel.kernel );
-	m_shadersInitialized = false;
-void btDX11SoftBodySolver::copyBackToSoftBodies(bool bMove)
-	// Move the vertex data back to the host first
-	m_vertexData.moveFromAccelerator(!bMove);
-	// Loop over soft bodies, copying all the vertex positions back for each body in turn
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		btAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[ softBodyIndex ];
-		btSoftBody *softBody = softBodyInterface->getSoftBody();
-		int firstVertex = softBodyInterface->getFirstVertex();
-		int numVertices = softBodyInterface->getNumVertices();
-		// Copy vertices from solver back into the softbody
-		for( int vertex = 0; vertex < numVertices; ++vertex )
-		{
-			using Vectormath::Aos::Point3;
-			Point3 vertexPosition( getVertexData().getVertexPositions()[firstVertex + vertex] );
-			softBody->m_nodes[vertex].m_x.setX( vertexPosition.getX() );
-			softBody->m_nodes[vertex].m_x.setY( vertexPosition.getY() );
-			softBody->m_nodes[vertex].m_x.setZ( vertexPosition.getZ() );
-			softBody->m_nodes[vertex].m_n.setX( vertexPosition.getX() );
-			softBody->m_nodes[vertex].m_n.setY( vertexPosition.getY() );
-			softBody->m_nodes[vertex].m_n.setZ( vertexPosition.getZ() );
-		}
-	}
-} // btDX11SoftBodySolver::copyBackToSoftBodies
-void btDX11SoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &softBodies, bool forceUpdate )
-	if( forceUpdate || m_softBodySet.size() != softBodies.size() )
-	{
-		// Have a change in the soft body set so update, reloading all the data
-		getVertexData().clear();
-		getTriangleData().clear();
-		getLinkData().clear();
-		m_softBodySet.resize(0);
-		for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
-		{
-			btSoftBody *softBody = softBodies[ softBodyIndex ];
-			using Vectormath::Aos::Matrix3;
-			using Vectormath::Aos::Point3;
-			// Create SoftBody that will store the information within the solver
-			btAcceleratedSoftBodyInterface *newSoftBody = new btAcceleratedSoftBodyInterface( softBody );
-			m_softBodySet.push_back( newSoftBody );
-			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
-			m_perClothDampingFactor.push_back(softBody->m_cfg.kDP);
-			m_perClothVelocityCorrectionCoefficient.push_back( softBody->m_cfg.kVCF );
-			m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );
-			m_perClothDragFactor.push_back( softBody->m_cfg.kDG );
-			m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);
-			// Simple init values. Actually we'll put 0 and -1 into them at the appropriate time
-			m_perClothMinBounds.push_back( UIntVector3( 0, 0, 0 ) );
-			m_perClothMaxBounds.push_back( UIntVector3( UINT_MAX, UINT_MAX, UINT_MAX ) );
-			m_perClothFriction.push_back( softBody->getFriction() );
-			m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) );
-			// Add space for new vertices and triangles in the default solver for now
-			// TODO: Include space here for tearing too later
-			int firstVertex = getVertexData().getNumVertices();
-			int numVertices = softBody->m_nodes.size();
-			int maxVertices = numVertices;
-			// Allocate space for new vertices in all the vertex arrays
-			getVertexData().createVertices( maxVertices, softBodyIndex );
-			int firstTriangle = getTriangleData().getNumTriangles();
-			int numTriangles = softBody->m_faces.size();
-			int maxTriangles = numTriangles;
-			getTriangleData().createTriangles( maxTriangles );
-			// Copy vertices from softbody into the solver
-			for( int vertex = 0; vertex < numVertices; ++vertex )
-			{
-				Point3 multPoint(softBody->m_nodes[vertex].m_x.getX(), softBody->m_nodes[vertex].m_x.getY(), softBody->m_nodes[vertex].m_x.getZ());
-				btSoftBodyVertexData::VertexDescription desc;
-				// TODO: Position in the softbody might be pre-transformed
-				// or we may need to adapt for the pose.
-				//desc.setPosition( cloth.getMeshTransform()*multPoint );
-				desc.setPosition( multPoint );
-				float vertexInverseMass = softBody->m_nodes[vertex].m_im;
-				desc.setInverseMass(vertexInverseMass);
-				getVertexData().setVertexAt( desc, firstVertex + vertex );
-			}
-			// Copy triangles similarly
-			// We're assuming here that vertex indices are based on the firstVertex rather than the entire scene
-			for( int triangle = 0; triangle < numTriangles; ++triangle )
-			{
-				// Note that large array storage is relative to the array not to the cloth
-				// So we need to add firstVertex to each value
-				int vertexIndex0 = (softBody->m_faces[triangle].m_n[0] - &(softBody->m_nodes[0]));
-				int vertexIndex1 = (softBody->m_faces[triangle].m_n[1] - &(softBody->m_nodes[0]));
-				int vertexIndex2 = (softBody->m_faces[triangle].m_n[2] - &(softBody->m_nodes[0]));
-				btSoftBodyTriangleData::TriangleDescription newTriangle(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, vertexIndex2 + firstVertex);
-				getTriangleData().setTriangleAt( newTriangle, firstTriangle + triangle );
-				// Increase vertex triangle counts for this triangle		
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex0)++;
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex1)++;
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex2)++;
-			}
-			int firstLink = getLinkData().getNumLinks();
-			int numLinks = softBody->m_links.size();
-			int maxLinks = numLinks;
-			// Allocate space for the links
-			getLinkData().createLinks( numLinks );
-			// Add the links
-			for( int link = 0; link < numLinks; ++link )
-			{
-				int vertexIndex0 = softBody->m_links[link].m_n[0] - &(softBody->m_nodes[0]);
-				int vertexIndex1 = softBody->m_links[link].m_n[1] - &(softBody->m_nodes[0]);
-				btSoftBodyLinkData::LinkDescription newLink(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, softBody->m_links[link].m_material->m_kLST);
-				newLink.setLinkStrength(1.f);
-				getLinkData().setLinkAt(newLink, firstLink + link);
-			}
-			newSoftBody->setFirstVertex( firstVertex );
-			newSoftBody->setFirstTriangle( firstTriangle );
-			newSoftBody->setNumVertices( numVertices );
-			newSoftBody->setMaxVertices( maxVertices );
-			newSoftBody->setNumTriangles( numTriangles );
-			newSoftBody->setMaxTriangles( maxTriangles );
-			newSoftBody->setFirstLink( firstLink );
-			newSoftBody->setNumLinks( numLinks );
-		}
-		updateConstants(0.f);
-		m_linkData.generateBatches();		
-		m_triangleData.generateBatches();
-	}
-btSoftBodyLinkData &btDX11SoftBodySolver::getLinkData()
-	// TODO: Consider setting link data to "changed" here
-	return m_linkData;
-btSoftBodyVertexData &btDX11SoftBodySolver::getVertexData()
-	// TODO: Consider setting vertex data to "changed" here
-	return m_vertexData;
-btSoftBodyTriangleData &btDX11SoftBodySolver::getTriangleData()
-	// TODO: Consider setting triangle data to "changed" here
-	return m_triangleData;
-bool btDX11SoftBodySolver::checkInitialized()
-	if( !m_shadersInitialized )
-		if( buildShaders() )
-			m_shadersInitialized = true;
-	return m_shadersInitialized;
-void btDX11SoftBodySolver::resetNormalsAndAreas( int numVertices )
-	// No need to batch link solver, it is entirely parallel
-	// Copy kernel parameters to GPU
-	UpdateSoftBodiesCB constBuffer;
-	constBuffer.numNodes = numVertices;
-	constBuffer.epsilon = FLT_EPSILON;
-	// Todo: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( integrateKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(UpdateSoftBodiesCB) );	
-	m_dx11Context->Unmap( integrateKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &integrateKernel.constBuffer );
-	// Set resources and dispatch
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexNormal.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexArea.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( resetNormalsAndAreasKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks, 1, 1 );
-	{
-		// Tidy up 
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::resetNormalsAndAreas
-void btDX11SoftBodySolver::normalizeNormalsAndAreas( int numVertices )
-	// No need to batch link solver, it is entirely parallel
-	// Copy kernel parameters to GPU
-	UpdateSoftBodiesCB constBuffer;
-	constBuffer.numNodes = numVertices;
-	constBuffer.epsilon = FLT_EPSILON;
-	// Todo: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( integrateKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(UpdateSoftBodiesCB) );	
-	m_dx11Context->Unmap( integrateKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &integrateKernel.constBuffer );
-	// Set resources and dispatch	
-	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11VertexTriangleCount.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexNormal.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexArea.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( normalizeNormalsAndAreasKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks, 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::normalizeNormalsAndAreas
-void btDX11SoftBodySolver::executeUpdateSoftBodies( int firstTriangle, int numTriangles )
-	// No need to batch link solver, it is entirely parallel
-	// Copy kernel parameters to GPU
-	UpdateSoftBodiesCB constBuffer;
-	constBuffer.startFace = firstTriangle;
-	constBuffer.numFaces = numTriangles;
-	// Todo: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( updateSoftBodiesKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(UpdateSoftBodiesCB) );	
-	m_dx11Context->Unmap( updateSoftBodiesKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &updateSoftBodiesKernel.constBuffer );
-	// Set resources and dispatch	
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_triangleData.m_dx11VertexIndices.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexPosition.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexNormal.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexArea.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 2, 1, &(m_triangleData.m_dx11Normal.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 3, 1, &(m_triangleData.m_dx11Area.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( updateSoftBodiesKernel.kernel, NULL, 0 );
-	int	numBlocks = (numTriangles + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks, 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 4, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::executeUpdateSoftBodies
-void btDX11SoftBodySolver::updateSoftBodies()
-	using namespace Vectormath::Aos;
-	int numVertices = m_vertexData.getNumVertices();
-	int numTriangles = m_triangleData.getNumTriangles();
-	// Ensure data is on accelerator
-	m_vertexData.moveToAccelerator();
-	m_triangleData.moveToAccelerator();
-	resetNormalsAndAreas( numVertices );
-	// Go through triangle batches so updates occur correctly
-	for( int batchIndex = 0; batchIndex < m_triangleData.m_batchStartLengths.size(); ++batchIndex )
-	{
-		int startTriangle = m_triangleData.m_batchStartLengths[batchIndex].start;
-		int numTriangles = m_triangleData.m_batchStartLengths[batchIndex].length;
-		executeUpdateSoftBodies( startTriangle, numTriangles );
-	}
-	normalizeNormalsAndAreas( numVertices );
-} // btDX11SoftBodySolver::updateSoftBodies
-Vectormath::Aos::Vector3 btDX11SoftBodySolver::ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a )
-	return a*Vectormath::Aos::dot(v, a);
-void btDX11SoftBodySolver::ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce )
-	float dtInverseMass = solverdt*inverseMass;
-	if( Vectormath::Aos::lengthSqr(force * dtInverseMass) > Vectormath::Aos::lengthSqr(vertexVelocity) )
-	{
-		vertexForce -= ProjectOnAxis( vertexVelocity, normalize( force ) )/dtInverseMass;
-	} else {
-		vertexForce += force;
-	}
-void btDX11SoftBodySolver::applyForces( float solverdt )
-	using namespace Vectormath::Aos;
-	// Ensure data is on accelerator
-	m_vertexData.moveToAccelerator();
-	m_dx11PerClothAcceleration.moveToGPU();
-	m_dx11PerClothLiftFactor.moveToGPU();
-	m_dx11PerClothDragFactor.moveToGPU();
-	m_dx11PerClothMediumDensity.moveToGPU();
-	m_dx11PerClothWindVelocity.moveToGPU();
-	// No need to batch link solver, it is entirely parallel
-	// Copy kernel parameters to GPU
-	ApplyForcesCB constBuffer;
-	constBuffer.numNodes = m_vertexData.getNumVertices();
-	constBuffer.solverdt = solverdt;
-	constBuffer.epsilon = FLT_EPSILON;
-	// Todo: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( integrateKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(ApplyForcesCB) );	
-	m_dx11Context->Unmap( integrateKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &integrateKernel.constBuffer );
-	// Set resources and dispatch	
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11ClothIdentifier.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexNormal.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11VertexArea.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 3, 1, &(m_vertexData.m_dx11VertexInverseMass.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 4, 1, &(m_dx11PerClothLiftFactor.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 5, 1, &(m_dx11PerClothDragFactor.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 6, 1, &(m_dx11PerClothWindVelocity.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 7, 1, &(m_dx11PerClothAcceleration.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 8, 1, &(m_dx11PerClothMediumDensity.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexForceAccumulator.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexVelocity.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( applyForcesKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks, 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 3, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 4, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 5, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 6, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 7, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 8, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::applyForces
- * Integrate motion on the solver.
- */
-void btDX11SoftBodySolver::integrate( float solverdt )
-	m_vertexData.moveToAccelerator();
-	// No need to batch link solver, it is entirely parallel
-	// Copy kernel parameters to GPU
-	IntegrateCB constBuffer;
-	constBuffer.numNodes = m_vertexData.getNumVertices();
-	constBuffer.solverdt = solverdt;
-	// Todo: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( integrateKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(IntegrateCB) );	
-	m_dx11Context->Unmap( integrateKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &integrateKernel.constBuffer );
-	// Set resources and dispatch
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11VertexInverseMass.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexPosition.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexVelocity.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 2, 1, &(m_vertexData.m_dx11VertexPreviousPosition.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 3, 1, &(m_vertexData.m_dx11VertexForceAccumulator.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( integrateKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks, 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 2, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 3, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::integrate
-float btDX11SoftBodySolver::computeTriangleArea( 
-	const Vectormath::Aos::Point3 &vertex0,
-	const Vectormath::Aos::Point3 &vertex1,
-	const Vectormath::Aos::Point3 &vertex2 )
-	Vectormath::Aos::Vector3 a = vertex1 - vertex0;
-	Vectormath::Aos::Vector3 b = vertex2 - vertex0;
-	Vectormath::Aos::Vector3 crossProduct = cross(a, b);
-	float area = length( crossProduct );
-	return area;
-} // btDX11SoftBodySolver::computeTriangleArea
-void btDX11SoftBodySolver::updateBounds()
-	using Vectormath::Aos::Point3;
-	// Interpretation structure for float and int
-	struct FPRep {
-		unsigned int mantissa  : 23;
-		unsigned int exponent : 8;
-		unsigned int sign    : 1;
-	};
-	union FloatAsInt
-	{
-		float floatValue;
-		int intValue;
-		unsigned int uintValue;
-		FPRep fpRep;
-	};
-	// Update bounds array to min and max int values to allow easy atomics
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		m_perClothMinBounds[softBodyIndex] = UIntVector3( UINT_MAX, UINT_MAX, UINT_MAX );
-		m_perClothMaxBounds[softBodyIndex] = UIntVector3( 0, 0, 0 );
-	}
-	m_dx11PerClothMinBounds.moveToGPU();
-	m_dx11PerClothMaxBounds.moveToGPU();
-	computeBounds( );
-	m_dx11PerClothMinBounds.moveFromGPU();
-	m_dx11PerClothMaxBounds.moveFromGPU();
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		UIntVector3 minBoundUInt = m_perClothMinBounds[softBodyIndex];
-		UIntVector3 maxBoundUInt = m_perClothMaxBounds[softBodyIndex];
-		// Convert back to float
-		FloatAsInt fai;
-		btVector3 minBound;
-		fai.uintValue = minBoundUInt.x;
-	    fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
-		minBound.setX( fai.floatValue );
-		fai.uintValue = minBoundUInt.y;
-		fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
-		minBound.setY( fai.floatValue );
-		fai.uintValue = minBoundUInt.z;
-		fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
-		minBound.setZ( fai.floatValue );
-		btVector3 maxBound;
-		fai.uintValue = maxBoundUInt.x;
-		fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
-		maxBound.setX( fai.floatValue );
-		fai.uintValue = maxBoundUInt.y;
-		fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
-		maxBound.setY( fai.floatValue );
-		fai.uintValue = maxBoundUInt.z;
-		fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
-		maxBound.setZ( fai.floatValue );
-		// And finally assign to the soft body
-		m_softBodySet[softBodyIndex]->updateBounds( minBound, maxBound );
-	}
-void btDX11SoftBodySolver::updateConstants( float timeStep )
-	using namespace Vectormath::Aos;
-	if( m_updateSolverConstants )
-	{
-		m_updateSolverConstants = false;
-		// Will have to redo this if we change the structure (tear, maybe) or various other possible changes
-		// Initialise link constants
-		const int numLinks = m_linkData.getNumLinks();
-		for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
-		{
-			btSoftBodyLinkData::LinkNodePair &vertices( m_linkData.getVertexPair(linkIndex) );
-			m_linkData.getRestLength(linkIndex) = length((m_vertexData.getPosition( vertices.vertex0 ) - m_vertexData.getPosition( vertices.vertex1 )));
-			float invMass0 = m_vertexData.getInverseMass(vertices.vertex0);
-			float invMass1 = m_vertexData.getInverseMass(vertices.vertex1);
-			float linearStiffness = m_linkData.getLinearStiffnessCoefficient(linkIndex);
-			float massLSC = (invMass0 + invMass1)/linearStiffness;
-			m_linkData.getMassLSC(linkIndex) = massLSC;
-			float restLength = m_linkData.getRestLength(linkIndex);
-			float restLengthSquared = restLength*restLength;
-			m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared;
-		}
-	}
-} // btDX11SoftBodySolver::updateConstants
- * Sort the collision object details array and generate indexing into it for the per-cloth collision object array.
- */
-void btDX11SoftBodySolver::prepareCollisionConstraints()
-	// First do a simple sort on the collision objects
-	btAlignedObjectArray<int> numObjectsPerClothPrefixSum;
-	btAlignedObjectArray<int> numObjectsPerCloth;
-	numObjectsPerCloth.resize( m_softBodySet.size(), 0 );
-	numObjectsPerClothPrefixSum.resize( m_softBodySet.size(), 0 );
-	class QuickSortCompare
-	{
-		public:
-		bool operator() ( const CollisionShapeDescription& a, const CollisionShapeDescription& b ) const
-		{
-			return ( a.softBodyIdentifier < b.softBodyIdentifier );
-		}
-	};
-	QuickSortCompare comparator;
-	m_collisionObjectDetails.quickSort( comparator );
-	// Generating indexing for perClothCollisionObjects
-	// First clear the previous values with the "no collision object for cloth" constant
-	for( int clothIndex = 0; clothIndex < m_perClothCollisionObjects.size(); ++clothIndex )
-	{
-		m_perClothCollisionObjects[clothIndex].firstObject = -1;
-		m_perClothCollisionObjects[clothIndex].endObject = -1;
-	}
-	int currentCloth = 0;
-	int startIndex = 0;
-	for( int collisionObject = 0; collisionObject < m_collisionObjectDetails.size(); ++collisionObject )
-	{
-		int nextCloth = m_collisionObjectDetails[collisionObject].softBodyIdentifier;
-		if( nextCloth != currentCloth )
-		{	
-			// Changed cloth in the array
-			// Set the end index and the range is what we need for currentCloth
-			m_perClothCollisionObjects[currentCloth].firstObject = startIndex;
-			m_perClothCollisionObjects[currentCloth].endObject = collisionObject;
-			currentCloth = nextCloth;
-			startIndex = collisionObject;
-		}
-	}
-	// And update last cloth	
-	m_perClothCollisionObjects[currentCloth].firstObject = startIndex;
-	m_perClothCollisionObjects[currentCloth].endObject =  m_collisionObjectDetails.size();
-} // btDX11SoftBodySolver::prepareCollisionConstraints
-void btDX11SoftBodySolver::solveConstraints( float solverdt )
-	//std::cerr << "'GPU' solve constraints\n";
-	using Vectormath::Aos::Vector3;
-	using Vectormath::Aos::Point3;
-	using Vectormath::Aos::lengthSqr;
-	using Vectormath::Aos::dot;
-	// Prepare links
-	int numLinks = m_linkData.getNumLinks();
-	int numVertices = m_vertexData.getNumVertices();
-	float kst = 1.f;
-	float ti = 0.f;
-	m_dx11PerClothDampingFactor.moveToGPU();
-	m_dx11PerClothVelocityCorrectionCoefficient.moveToGPU();
-	// Ensure data is on accelerator
-	m_linkData.moveToAccelerator();
-	m_vertexData.moveToAccelerator();
-	prepareLinks();	
-	for( int iteration = 0; iteration < m_numberOfVelocityIterations ; ++iteration )
-	{
-		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
-		{
-			int startLink = m_linkData.m_batchStartLengths[i].start;
-			int numLinks = m_linkData.m_batchStartLengths[i].length;
-			solveLinksForVelocity( startLink, numLinks, kst );
-		}
-	}
-	prepareCollisionConstraints();
-	// Compute new positions from velocity
-	// Also update the previous position so that our position computation is now based on the new position from the velocity solution
-	// rather than based directly on the original positions
-	if( m_numberOfVelocityIterations > 0 )
-	{
-		updateVelocitiesFromPositionsWithVelocities( 1.f/solverdt );
-	} else {
-		updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
-	}
-	// Solve drift
-	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
-	{
-		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
-		{
-			int startLink = m_linkData.m_batchStartLengths[i].start;
-			int numLinks = m_linkData.m_batchStartLengths[i].length;
-			solveLinksForPosition( startLink, numLinks, kst, ti );
-		}
-	} // for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
-	// At this point assume that the force array is blank - we will overwrite it
-	solveCollisionsAndUpdateVelocities( 1.f/solverdt );
-} // btDX11SoftBodySolver::solveConstraints
-// Kernel dispatches
-void btDX11SoftBodySolver::prepareLinks()
-	// No need to batch link solver, it is entirely parallel
-	// Copy kernel parameters to GPU
-	PrepareLinksCB constBuffer;
-	constBuffer.numLinks = m_linkData.getNumLinks();
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( prepareLinksKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(PrepareLinksCB) );	
-	m_dx11Context->Unmap( prepareLinksKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &prepareLinksKernel.constBuffer );
-	// Set resources and dispatch
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_linkData.m_dx11Links.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 1, 1, &(m_linkData.m_dx11LinksMassLSC.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11VertexPreviousPosition.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_linkData.m_dx11LinksLengthRatio.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_linkData.m_dx11LinksCLength.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( prepareLinksKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numLinks + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks , 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}
-} // btDX11SoftBodySolver::prepareLinks
-void btDX11SoftBodySolver::updatePositionsFromVelocities( float solverdt )
-	// No need to batch link solver, it is entirely parallel
-	// Copy kernel parameters to GPU
-	UpdatePositionsFromVelocitiesCB constBuffer;
-	constBuffer.numNodes = m_vertexData.getNumVertices();
-	constBuffer.solverSDT = solverdt;
-	// Todo: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( updatePositionsFromVelocitiesKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(UpdatePositionsFromVelocitiesCB) );	
-	m_dx11Context->Unmap( updatePositionsFromVelocitiesKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &updatePositionsFromVelocitiesKernel.constBuffer );
-	// Set resources and dispatch			
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11VertexVelocity.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexPreviousPosition.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexPosition.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( updatePositionsFromVelocitiesKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks, 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::updatePositionsFromVelocities
-void btDX11SoftBodySolver::solveLinksForPosition( int startLink, int numLinks, float kst, float ti )
-	// Copy kernel parameters to GPU
-	SolvePositionsFromLinksKernelCB constBuffer;
-	// Set the first link of the batch
-	// and the batch size
-	constBuffer.startLink = startLink;
-	constBuffer.numLinks = numLinks;
-	constBuffer.kst = kst;
-	constBuffer.ti = ti;
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( solvePositionsFromLinksKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(SolvePositionsFromLinksKernelCB) );	
-	m_dx11Context->Unmap( solvePositionsFromLinksKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &solvePositionsFromLinksKernel.constBuffer );
-	// Set resources and dispatch
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_linkData.m_dx11Links.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 1, 1, &(m_linkData.m_dx11LinksMassLSC.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 2, 1, &(m_linkData.m_dx11LinksRestLengthSquared.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 3, 1, &(m_vertexData.m_dx11VertexInverseMass.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexPosition.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( solvePositionsFromLinksKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numLinks + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks , 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 3, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::solveLinksForPosition
-void btDX11SoftBodySolver::solveLinksForVelocity( int startLink, int numLinks, float kst )
-	// Copy kernel parameters to GPU
-	VSolveLinksCB constBuffer;
-	// Set the first link of the batch
-	// and the batch size
-	constBuffer.startLink = startLink;
-	constBuffer.numLinks = numLinks;
-	constBuffer.kst = kst;
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( vSolveLinksKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(VSolveLinksCB) );	
-	m_dx11Context->Unmap( vSolveLinksKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &vSolveLinksKernel.constBuffer );
-	// Set resources and dispatch
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_linkData.m_dx11Links.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 1, 1, &(m_linkData.m_dx11LinksLengthRatio.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 2, 1, &(m_linkData.m_dx11LinksCLength.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 3, 1, &(m_vertexData.m_dx11VertexInverseMass.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexVelocity.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( vSolveLinksKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numLinks + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks , 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 3, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::solveLinksForVelocity
-void btDX11SoftBodySolver::updateVelocitiesFromPositionsWithVelocities( float isolverdt )
-	// Copy kernel parameters to GPU
-	UpdateVelocitiesFromPositionsWithVelocitiesCB constBuffer;
-	// Set the first link of the batch
-	// and the batch size
-	constBuffer.numNodes = m_vertexData.getNumVertices();
-	constBuffer.isolverdt = isolverdt;
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(UpdateVelocitiesFromPositionsWithVelocitiesCB) );	
-	m_dx11Context->Unmap( updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer );
-	// Set resources and dispatch
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11VertexPosition.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexPreviousPosition.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11ClothIdentifier.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 3, 1, &(m_dx11PerClothVelocityCorrectionCoefficient.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 4, 1, &(m_dx11PerClothDampingFactor.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexVelocity.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexForceAccumulator.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( updateVelocitiesFromPositionsWithVelocitiesKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks , 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 3, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 4, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::updateVelocitiesFromPositionsWithVelocities
-void btDX11SoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float isolverdt )
-	// Copy kernel parameters to GPU
-	UpdateVelocitiesFromPositionsWithoutVelocitiesCB constBuffer;
-	// Set the first link of the batch
-	// and the batch size
-	constBuffer.numNodes = m_vertexData.getNumVertices();
-	constBuffer.isolverdt = isolverdt;
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(UpdateVelocitiesFromPositionsWithoutVelocitiesCB) );	
-	m_dx11Context->Unmap( updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer );
-	// Set resources and dispatch
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11VertexPosition.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexPreviousPosition.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11ClothIdentifier.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 3, 1, &(m_dx11PerClothDampingFactor.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexVelocity.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexForceAccumulator.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks , 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 3, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities
-void btDX11SoftBodySolver::computeBounds( )
-	ComputeBoundsCB constBuffer;
-	m_vertexData.moveToAccelerator();
-	// Set the first link of the batch
-	// and the batch size
-	constBuffer.numNodes = m_vertexData.getNumVertices();
-	constBuffer.numSoftBodies = m_softBodySet.size();
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( computeBoundsKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(ComputeBoundsCB) );	
-	m_dx11Context->Unmap( computeBoundsKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &computeBoundsKernel.constBuffer );
-	// Set resources and dispatch
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11ClothIdentifier.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexPosition.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_dx11PerClothMinBounds.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_dx11PerClothMaxBounds.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( computeBoundsKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks , 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-void btDX11SoftBodySolver::solveCollisionsAndUpdateVelocities( float isolverdt )
-	// Copy kernel parameters to GPU
-	m_vertexData.moveToAccelerator();
-	m_dx11PerClothFriction.moveToGPU();
-	m_dx11PerClothDampingFactor.moveToGPU();
-	m_dx11PerClothCollisionObjects.moveToGPU();
-	m_dx11CollisionObjectDetails.moveToGPU();
-	SolveCollisionsAndUpdateVelocitiesCB constBuffer;
-	// Set the first link of the batch
-	// and the batch size
-	constBuffer.numNodes = m_vertexData.getNumVertices();
-	constBuffer.isolverdt = isolverdt;
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( solveCollisionsAndUpdateVelocitiesKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(SolveCollisionsAndUpdateVelocitiesCB) );	
-	m_dx11Context->Unmap( solveCollisionsAndUpdateVelocitiesKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &solveCollisionsAndUpdateVelocitiesKernel.constBuffer );
-	// Set resources and dispatch
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11ClothIdentifier.getSRV()) );	
-	m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexPreviousPosition.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 2, 1, &(m_dx11PerClothFriction.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 3, 1, &(m_dx11PerClothDampingFactor.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 4, 1, &(m_dx11PerClothCollisionObjects.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 5, 1, &(m_dx11CollisionObjectDetails.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexForceAccumulator.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexVelocity.getUAV()), NULL );
-	m_dx11Context->CSSetUnorderedAccessViews( 2, 1, &(m_vertexData.m_dx11VertexPosition.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( solveCollisionsAndUpdateVelocitiesKernel.kernel, NULL, 0 );
-	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
-	m_dx11Context->Dispatch(numBlocks , 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 3, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 4, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 5, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
-		m_dx11Context->CSSetUnorderedAccessViews( 2, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SoftBodySolver::solveCollisionsAndUpdateVelocities
-// End kernel dispatches
-btDX11SoftBodySolver::btAcceleratedSoftBodyInterface *btDX11SoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		btAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
-		if( softBodyInterface->getSoftBody() == softBody )
-			return softBodyInterface;
-	}
-	return 0;
-const btDX11SoftBodySolver::btAcceleratedSoftBodyInterface * const btDX11SoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody ) const
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		btAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
-		if( softBodyInterface->getSoftBody() == softBody )
-			return softBodyInterface;
-	}
-	return 0;
-int btDX11SoftBodySolver::findSoftBodyIndex( const btSoftBody* const softBody )
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		btAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
-		if( softBodyInterface->getSoftBody() == softBody )
-			return softBodyIndex;
-	}
-	return 1;
-void btSoftBodySolverOutputDXtoCPU::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer )
-	btSoftBodySolver *solver = softBody->getSoftBodySolver();
-	btAssert( solver->getSolverType() == btSoftBodySolver::DX_SOLVER || solver->getSolverType() == btSoftBodySolver::DX_SIMD_SOLVER );
-	btDX11SoftBodySolver *dxSolver = static_cast< btDX11SoftBodySolver * >( solver );
-	btDX11SoftBodySolver::btAcceleratedSoftBodyInterface * currentCloth = dxSolver->findSoftBodyInterface( softBody );
-	btSoftBodyVertexDataDX11 &vertexData( dxSolver->m_vertexData );
-	const int firstVertex = currentCloth->getFirstVertex();
-	const int lastVertex = firstVertex + currentCloth->getNumVertices();
-	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::CPU_BUFFER )
-	{		
-		// If we're doing a CPU-buffer copy must copy the data back to the host first
-		vertexData.m_dx11VertexPosition.copyFromGPU();
-		vertexData.m_dx11VertexNormal.copyFromGPU();
-		const int firstVertex = currentCloth->getFirstVertex();
-		const int lastVertex = firstVertex + currentCloth->getNumVertices();
-		const btCPUVertexBufferDescriptor *cpuVertexBuffer = static_cast< btCPUVertexBufferDescriptor* >(vertexBuffer);						
-		float *basePointer = cpuVertexBuffer->getBasePointer();						
-		if( vertexBuffer->hasVertexPositions() )
-		{
-			const int vertexOffset = cpuVertexBuffer->getVertexOffset();
-			const int vertexStride = cpuVertexBuffer->getVertexStride();
-			float *vertexPointer = basePointer + vertexOffset;
-			for( int vertexIndex = firstVertex; vertexIndex < lastVertex; ++vertexIndex )
-			{
-				Vectormath::Aos::Point3 position = vertexData.getPosition(vertexIndex);
-				*(vertexPointer + 0) = position.getX();
-				*(vertexPointer + 1) = position.getY();
-				*(vertexPointer + 2) = position.getZ();
-				vertexPointer += vertexStride;
-			}
-		}
-		if( vertexBuffer->hasNormals() )
-		{
-			const int normalOffset = cpuVertexBuffer->getNormalOffset();
-			const int normalStride = cpuVertexBuffer->getNormalStride();
-			float *normalPointer = basePointer + normalOffset;
-			for( int vertexIndex = firstVertex; vertexIndex < lastVertex; ++vertexIndex )
-			{
-				Vectormath::Aos::Vector3 normal = vertexData.getNormal(vertexIndex);
-				*(normalPointer + 0) = normal.getX();
-				*(normalPointer + 1) = normal.getY();
-				*(normalPointer + 2) = normal.getZ();
-				normalPointer += normalStride;
-			}
-		}
-	} 
-} // btDX11SoftBodySolver::outputToVertexBuffers
-bool btSoftBodySolverOutputDXtoDX::checkInitialized()
-	if( !m_shadersInitialized )
-		if( buildShaders() )
-			m_shadersInitialized = true;
-	return m_shadersInitialized;
-void btSoftBodySolverOutputDXtoDX::releaseKernels()
-	SAFE_RELEASE( outputToVertexArrayWithNormalsKernel.constBuffer );
-	SAFE_RELEASE( outputToVertexArrayWithNormalsKernel.kernel );
-	SAFE_RELEASE( outputToVertexArrayWithoutNormalsKernel.constBuffer );
-	SAFE_RELEASE( outputToVertexArrayWithoutNormalsKernel.kernel );
-	m_shadersInitialized = false;
-bool btSoftBodySolverOutputDXtoDX::buildShaders()
-	// Ensure current kernels are released first
-	releaseKernels();
-	bool returnVal = true;
-	if( m_shadersInitialized )
-		return true;
-	outputToVertexArrayWithNormalsKernel = dxFunctions.compileComputeShaderFromString( OutputToVertexArrayHLSLString, "OutputToVertexArrayWithNormalsKernel", sizeof(OutputToVertexArrayCB) );
-	if( !outputToVertexArrayWithNormalsKernel.constBuffer)
-		returnVal = false;
-	outputToVertexArrayWithoutNormalsKernel = dxFunctions.compileComputeShaderFromString( OutputToVertexArrayHLSLString, "OutputToVertexArrayWithoutNormalsKernel", sizeof(OutputToVertexArrayCB) );
-	if( !outputToVertexArrayWithoutNormalsKernel.constBuffer )
-		returnVal = false;
-	if( returnVal )
-		m_shadersInitialized = true;
-	return returnVal;
-void btSoftBodySolverOutputDXtoDX::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer )
-	btSoftBodySolver *solver = softBody->getSoftBodySolver();
-	btAssert( solver->getSolverType() == btSoftBodySolver::DX_SOLVER || solver->getSolverType() == btSoftBodySolver::DX_SIMD_SOLVER );
-	btDX11SoftBodySolver *dxSolver = static_cast< btDX11SoftBodySolver * >( solver );
-	checkInitialized();
-	btDX11SoftBodySolver::btAcceleratedSoftBodyInterface * currentCloth = dxSolver->findSoftBodyInterface( softBody );
-	btSoftBodyVertexDataDX11 &vertexData( dxSolver->m_vertexData );
-	const int firstVertex = currentCloth->getFirstVertex();
-	const int lastVertex = firstVertex + currentCloth->getNumVertices();
-	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::CPU_BUFFER )
-	{		
-		btSoftBodySolverOutputDXtoDX::copySoftBodyToVertexBuffer( softBody, vertexBuffer );
-	} else 	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::DX11_BUFFER )
-	{
-		// Do a DX11 copy shader DX to DX copy
-		const btDX11VertexBufferDescriptor *dx11VertexBuffer = static_cast< btDX11VertexBufferDescriptor* >(vertexBuffer);	
-		// No need to batch link solver, it is entirely parallel
-		// Copy kernel parameters to GPU
-		OutputToVertexArrayCB constBuffer;
-		ID3D11ComputeShader* outputToVertexArrayShader = outputToVertexArrayWithoutNormalsKernel.kernel;
-		ID3D11Buffer* outputToVertexArrayConstBuffer = outputToVertexArrayWithoutNormalsKernel.constBuffer;
-		constBuffer.startNode = firstVertex;
-		constBuffer.numNodes = currentCloth->getNumVertices();
-		constBuffer.positionOffset = vertexBuffer->getVertexOffset();
-		constBuffer.positionStride = vertexBuffer->getVertexStride();
-		if( vertexBuffer->hasNormals() )
-		{
-			constBuffer.normalOffset = vertexBuffer->getNormalOffset();
-			constBuffer.normalStride = vertexBuffer->getNormalStride();
-			outputToVertexArrayShader = outputToVertexArrayWithNormalsKernel.kernel;
-			outputToVertexArrayConstBuffer = outputToVertexArrayWithNormalsKernel.constBuffer;
-		}	
-		// TODO: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
-		D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-		dxFunctions.m_dx11Context->Map( outputToVertexArrayConstBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-		memcpy( MappedResource.pData, &constBuffer, sizeof(OutputToVertexArrayCB) );	
-		dxFunctions.m_dx11Context->Unmap( outputToVertexArrayConstBuffer, 0 );
-		dxFunctions.m_dx11Context->CSSetConstantBuffers( 0, 1, &outputToVertexArrayConstBuffer );
-		// Set resources and dispatch
-		dxFunctions.m_dx11Context->CSSetShaderResources( 0, 1, &(vertexData.m_dx11VertexPosition.getSRV()) );
-		dxFunctions.m_dx11Context->CSSetShaderResources( 1, 1, &(vertexData.m_dx11VertexNormal.getSRV()) );
-		ID3D11UnorderedAccessView* dx11UAV = dx11VertexBuffer->getDX11UAV();
-		dxFunctions.m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(dx11UAV), NULL );
-		// Execute the kernel
-		dxFunctions.m_dx11Context->CSSetShader( outputToVertexArrayShader, NULL, 0 );
-		int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
-		dxFunctions.m_dx11Context->Dispatch(numBlocks, 1, 1 );
-		{
-			// Tidy up 
-			ID3D11ShaderResourceView* pViewNULL = NULL;
-			dxFunctions.m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-			dxFunctions.m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
-			ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-			dxFunctions.m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-			ID3D11Buffer *pBufferNull = NULL;
-			dxFunctions.m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-		}	
-	}
-} // btDX11SoftBodySolver::outputToVertexBuffers
-DXFunctions::KernelDesc DXFunctions::compileComputeShaderFromString( const char* shaderString, const char* shaderName, int constBufferSize, D3D10_SHADER_MACRO *compileMacros )
-	const char *cs5String = "cs_5_0";
-	HRESULT hr = S_OK;
-	ID3DBlob* pErrorBlob = NULL;
-	ID3DBlob* pBlob = NULL;
-	ID3D11ComputeShader*		kernelPointer = 0;
-	hr = m_dx11CompileFromMemory( 
-		shaderString,
-		strlen(shaderString),
-		shaderName,
-		compileMacros,
-		NULL,
-		shaderName,
-		cs5String,
-		NULL,
-		NULL,
-		&pBlob,
-		&pErrorBlob,
-		);
-	if( FAILED(hr) )
-	{
-		if( pErrorBlob ) {
-			btAssert( "Compilation of compute shader failed\n" );
-			char *debugString = (char*)pErrorBlob->GetBufferPointer();
-			OutputDebugStringA( debugString );
-		}
-		SAFE_RELEASE( pErrorBlob );
-		SAFE_RELEASE( pBlob );    
-		DXFunctions::KernelDesc descriptor;
-		descriptor.kernel = 0;
-		descriptor.constBuffer = 0;
-		return descriptor;
-	}    
-	// Create the Compute Shader
-	hr = m_dx11Device->CreateComputeShader( pBlob->GetBufferPointer(), pBlob->GetBufferSize(), NULL, &kernelPointer );
-	if( FAILED( hr ) )
-	{
-		DXFunctions::KernelDesc descriptor;
-		descriptor.kernel = 0;
-		descriptor.constBuffer = 0;
-		return descriptor;
-	}
-	ID3D11Buffer* constBuffer = 0;
-	if( constBufferSize > 0 )
-	{
-		// Create the constant buffer
-		D3D11_BUFFER_DESC constant_buffer_desc;
-		ZeroMemory(&constant_buffer_desc, sizeof(constant_buffer_desc));
-		constant_buffer_desc.ByteWidth = constBufferSize;
-		constant_buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
-		constant_buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
-		constant_buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
-		m_dx11Device->CreateBuffer(&constant_buffer_desc, NULL, &constBuffer);
-		if( FAILED( hr ) )
-		{
-			KernelDesc descriptor;
-			descriptor.kernel = 0;
-			descriptor.constBuffer = 0;
-			return descriptor;
-		}
-	}
-	SAFE_RELEASE( pErrorBlob );
-	SAFE_RELEASE( pBlob );
-	DXFunctions::KernelDesc descriptor;
-	descriptor.kernel = kernelPointer;
-	descriptor.constBuffer = constBuffer;
-	return descriptor;
-} // compileComputeShader
-bool btDX11SoftBodySolver::buildShaders()
-	// Ensure current kernels are released first
-	releaseKernels();
-	bool returnVal = true;
-	if( m_shadersInitialized )
-		return true;
-	prepareLinksKernel = dxFunctions.compileComputeShaderFromString( PrepareLinksHLSLString, "PrepareLinksKernel", sizeof(PrepareLinksCB) );
-	if( !prepareLinksKernel.constBuffer )
-		returnVal = false;
-	updatePositionsFromVelocitiesKernel = dxFunctions.compileComputeShaderFromString( UpdatePositionsFromVelocitiesHLSLString, "UpdatePositionsFromVelocitiesKernel", sizeof(UpdatePositionsFromVelocitiesCB) );
-	if( !updatePositionsFromVelocitiesKernel.constBuffer )
-		returnVal = false;
-	solvePositionsFromLinksKernel = dxFunctions.compileComputeShaderFromString( SolvePositionsHLSLString, "SolvePositionsFromLinksKernel", sizeof(SolvePositionsFromLinksKernelCB) );
-	if( !updatePositionsFromVelocitiesKernel.constBuffer )
-		returnVal = false;
-	vSolveLinksKernel = dxFunctions.compileComputeShaderFromString( VSolveLinksHLSLString, "VSolveLinksKernel", sizeof(VSolveLinksCB) );
-	if( !vSolveLinksKernel.constBuffer )
-		returnVal = false;
-	updateVelocitiesFromPositionsWithVelocitiesKernel = dxFunctions.compileComputeShaderFromString( UpdateNodesHLSLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", sizeof(UpdateVelocitiesFromPositionsWithVelocitiesCB) );
-	if( !updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer )
-		returnVal = false;
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel = dxFunctions.compileComputeShaderFromString( UpdatePositionsHLSLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", sizeof(UpdateVelocitiesFromPositionsWithoutVelocitiesCB) );
-	if( !updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer )
-		returnVal = false;
-	integrateKernel = dxFunctions.compileComputeShaderFromString( IntegrateHLSLString, "IntegrateKernel", sizeof(IntegrateCB) );
-	if( !integrateKernel.constBuffer )
-		returnVal = false;
-	applyForcesKernel = dxFunctions.compileComputeShaderFromString( ApplyForcesHLSLString, "ApplyForcesKernel", sizeof(ApplyForcesCB) );
-	if( !applyForcesKernel.constBuffer )
-		returnVal = false;
-	solveCollisionsAndUpdateVelocitiesKernel = dxFunctions.compileComputeShaderFromString( SolveCollisionsAndUpdateVelocitiesHLSLString, "SolveCollisionsAndUpdateVelocitiesKernel", sizeof(SolveCollisionsAndUpdateVelocitiesCB) );
-	if( !solveCollisionsAndUpdateVelocitiesKernel.constBuffer )
-		returnVal = false;
-	// TODO: Rename to UpdateSoftBodies
-	resetNormalsAndAreasKernel = dxFunctions.compileComputeShaderFromString( UpdateNormalsHLSLString, "ResetNormalsAndAreasKernel", sizeof(UpdateSoftBodiesCB) );
-	if( !resetNormalsAndAreasKernel.constBuffer )
-		returnVal = false;
-	normalizeNormalsAndAreasKernel = dxFunctions.compileComputeShaderFromString( UpdateNormalsHLSLString, "NormalizeNormalsAndAreasKernel", sizeof(UpdateSoftBodiesCB) );
-	if( !normalizeNormalsAndAreasKernel.constBuffer )
-		returnVal = false;
-	updateSoftBodiesKernel = dxFunctions.compileComputeShaderFromString( UpdateNormalsHLSLString, "UpdateSoftBodiesKernel", sizeof(UpdateSoftBodiesCB) );
-	if( !updateSoftBodiesKernel.constBuffer )
-		returnVal = false;
-	computeBoundsKernel = dxFunctions.compileComputeShaderFromString( ComputeBoundsHLSLString, "ComputeBoundsKernel", sizeof(ComputeBoundsCB) );
-	if( !computeBoundsKernel.constBuffer )
-		returnVal = false;
-	if( returnVal )
-		m_shadersInitialized = true;
-	return returnVal;
-static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform )
-	Vectormath::Aos::Transform3 outTransform;
-	outTransform.setCol(0, toVector3(transform.getBasis().getColumn(0)));
-	outTransform.setCol(1, toVector3(transform.getBasis().getColumn(1)));
-	outTransform.setCol(2, toVector3(transform.getBasis().getColumn(2)));
-	outTransform.setCol(3, toVector3(transform.getOrigin()));
-	return outTransform;	
-void btDX11SoftBodySolver::btAcceleratedSoftBodyInterface::updateBounds( const btVector3 &lowerBound, const btVector3 &upperBound )
-	float scalarMargin = this->getSoftBody()->getCollisionShape()->getMargin();
-	btVector3 vectorMargin( scalarMargin, scalarMargin, scalarMargin );
-	m_softBody->m_bounds[0] = lowerBound - vectorMargin;
-	m_softBody->m_bounds[1] = upperBound + vectorMargin;
-void btDX11SoftBodySolver::processCollision( btSoftBody*, btSoftBody* )
-// Add the collision object to the set to deal with for a particular soft body
-void btDX11SoftBodySolver::processCollision( btSoftBody *softBody, btCollisionObject* collisionObject )
-	int softBodyIndex = findSoftBodyIndex( softBody );
-	if( softBodyIndex >= 0 )
-	{
-		btCollisionShape *collisionShape = collisionObject->getCollisionShape();
-		float friction = collisionObject->getFriction();
-		int shapeType = collisionShape->getShapeType();
-		if( shapeType == CAPSULE_SHAPE_PROXYTYPE )
-		{
-			// Add to the list of expected collision objects
-			CollisionShapeDescription newCollisionShapeDescription;
-			newCollisionShapeDescription.softBodyIdentifier = softBodyIndex;
-			newCollisionShapeDescription.collisionShapeType = shapeType;
-			// TODO: May need to transpose this matrix either here or in HLSL
-			newCollisionShapeDescription.shapeTransform = toTransform3(collisionObject->getWorldTransform());
-			btCapsuleShape *capsule = static_cast<btCapsuleShape*>( collisionShape );
-			newCollisionShapeDescription.radius = capsule->getRadius();
-			newCollisionShapeDescription.halfHeight = capsule->getHalfHeight();
-			newCollisionShapeDescription.margin = capsule->getMargin();
-			newCollisionShapeDescription.friction = friction;
-			btRigidBody* body = static_cast< btRigidBody* >( collisionObject );
-			newCollisionShapeDescription.linearVelocity = toVector3(body->getLinearVelocity());
-			newCollisionShapeDescription.angularVelocity = toVector3(body->getAngularVelocity());
-			m_collisionObjectDetails.push_back( newCollisionShapeDescription );
-		} else {
-#ifdef _DEBUG
-			printf("Unsupported collision shape type\n");
-		}
-	} else {
-		btAssert("Unknown soft body");
-	}
-} // btDX11SoftBodySolver::processCollision
-void btDX11SoftBodySolver::predictMotion( float timeStep )
-	// Clear the collision shape array for the next frame
-	// Ensure that the DX11 ones are moved off the device so they will be updated correctly
-	m_dx11CollisionObjectDetails.changedOnCPU();
-	m_dx11PerClothCollisionObjects.changedOnCPU();
-	m_collisionObjectDetails.clear();
-	// Fill the force arrays with current acceleration data etc
-	m_perClothWindVelocity.resize( m_softBodySet.size() );
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();
-		m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
-	}
-	m_dx11PerClothWindVelocity.changedOnCPU();
-	// Apply forces that we know about to the cloths
-	applyForces(  timeStep * getTimeScale() );
-	// Itegrate motion for all soft bodies dealt with by the solver
-	integrate( timeStep * getTimeScale() );
-	// Update bounds
-	// Will update the bounds for all softBodies being dealt with by the solver and 
-	// set the values in the btSoftBody object
-	if (m_enableUpdateBounds)
-		updateBounds();
-	// End prediction work for solvers
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
deleted file mode 100644
index 939eabaf..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
+++ /dev/null
@@ -1,691 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "vectormath/vmInclude.h"
-#include "BulletSoftBody/btSoftBodySolvers.h"
-#include "btSoftBodySolverVertexBuffer_DX11.h"
-#include "btSoftBodySolverLinkData_DX11.h"
-#include "btSoftBodySolverVertexData_DX11.h"
-#include "btSoftBodySolverTriangleData_DX11.h"
-class DXFunctions
-	ID3D11Device *		 m_dx11Device;
-	ID3D11DeviceContext* m_dx11Context;
-	CompileFromMemoryFunc m_dx11CompileFromMemory;
-	DXFunctions(ID3D11Device *dx11Device, ID3D11DeviceContext* dx11Context, CompileFromMemoryFunc dx11CompileFromMemory) :
-		m_dx11Device( dx11Device ),
-		m_dx11Context( dx11Context ),
-		m_dx11CompileFromMemory( dx11CompileFromMemory )
-	{
-	}
-	class KernelDesc
-	{
-	protected:
-	public:
-		ID3D11ComputeShader* kernel;
-		ID3D11Buffer* constBuffer;
-		KernelDesc()
-		{
-			kernel = 0;
-			constBuffer = 0;
-		}
-		virtual ~KernelDesc()
-		{
-			// TODO: this should probably destroy its kernel but we need to be careful
-			// in case KernelDescs are copied
-		}
-	}; 
-	/**
-	 * Compile a compute shader kernel from a string and return the appropriate KernelDesc object.
-	 */
-	KernelDesc compileComputeShaderFromString( const char* shaderString, const char* shaderName, int constBufferSize, D3D10_SHADER_MACRO *compileMacros = 0 );
-class btDX11SoftBodySolver : public btSoftBodySolver
-	/**
-	 * Entry in the collision shape array.
-	 * Specifies the shape type, the transform matrix and the necessary details of the collisionShape.
-	 */
-	struct CollisionShapeDescription
-	{
-		Vectormath::Aos::Transform3 shapeTransform;
-		Vectormath::Aos::Vector3 linearVelocity;
-		Vectormath::Aos::Vector3 angularVelocity;
-		int softBodyIdentifier;
-		int collisionShapeType;
-		// Both needed for capsule
-		float radius;
-		float halfHeight;
-		float margin;
-		float friction;
-		CollisionShapeDescription()
-		{
-			collisionShapeType = 0;
-			margin = 0;
-			friction = 0;
-		}
-	};
-	struct UIntVector3
-	{
-		UIntVector3()
-		{
-			x = 0;
-			y = 0;
-			z = 0;
-			_padding = 0;
-		}
-		UIntVector3( unsigned int x_, unsigned int y_, unsigned int z_ )
-		{
-			x = x_;
-			y = y_;
-			z = z_;
-			_padding = 0;
-		}
-		unsigned int x;
-		unsigned int y;
-		unsigned int z;
-		unsigned int _padding;
-	};
-	/**
-	 * SoftBody class to maintain information about a soft body instance
-	 * within a solver.
-	 * This data addresses the main solver arrays.
-	 */
-	class btAcceleratedSoftBodyInterface
-	{
-	protected:
-		/** Current number of vertices that are part of this cloth */
-		int m_numVertices;
-		/** Maximum number of vertices allocated to be part of this cloth */
-		int m_maxVertices;
-		/** Current number of triangles that are part of this cloth */
-		int m_numTriangles;
-		/** Maximum number of triangles allocated to be part of this cloth */
-		int m_maxTriangles;
-		/** Index of first vertex in the world allocated to this cloth */
-		int m_firstVertex;
-		/** Index of first triangle in the world allocated to this cloth */
-		int m_firstTriangle;
-		/** Index of first link in the world allocated to this cloth */
-		int m_firstLink;
-		/** Maximum number of links allocated to this cloth */
-		int m_maxLinks;
-		/** Current number of links allocated to this cloth */
-		int m_numLinks;
-		/** The actual soft body this data represents */
-		btSoftBody *m_softBody;
-	public:
-		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
-		  m_softBody( softBody )
-		{
-			m_numVertices = 0;
-			m_maxVertices = 0;
-			m_numTriangles = 0;
-			m_maxTriangles = 0;
-			m_firstVertex = 0;
-			m_firstTriangle = 0;
-			m_firstLink = 0;
-			m_maxLinks = 0;
-			m_numLinks = 0;
-		}
-		int getNumVertices() const
-		{
-			return m_numVertices;
-		}
-		int getNumTriangles() const
-		{
-			return m_numTriangles;
-		}
-		int getMaxVertices() const
-		{
-			return m_maxVertices;
-		}
-		int getMaxTriangles() const
-		{
-			return m_maxTriangles;
-		}
-		int getFirstVertex() const
-		{
-			return m_firstVertex;
-		}
-		int getFirstTriangle() const
-		{
-			return m_firstTriangle;
-		}
-		/**
-		 * Update the bounds in the btSoftBody object
-		 */
-		void updateBounds( const btVector3 &lowerBound, const btVector3 &upperBound );
-		// TODO: All of these set functions will have to do checks and
-		// update the world because restructuring of the arrays will be necessary
-		// Reasonable use of "friend"?
-		void setNumVertices( int numVertices )
-		{
-			m_numVertices = numVertices;
-		}	
-		void setNumTriangles( int numTriangles )
-		{
-			m_numTriangles = numTriangles;
-		}
-		void setMaxVertices( int maxVertices )
-		{
-			m_maxVertices = maxVertices;
-		}
-		void setMaxTriangles( int maxTriangles )
-		{
-			m_maxTriangles = maxTriangles;
-		}
-		void setFirstVertex( int firstVertex )
-		{
-			m_firstVertex = firstVertex;
-		}
-		void setFirstTriangle( int firstTriangle )
-		{
-			m_firstTriangle = firstTriangle;
-		}
-		void setMaxLinks( int maxLinks )
-		{
-			m_maxLinks = maxLinks;
-		}
-		void setNumLinks( int numLinks )
-		{
-			m_numLinks = numLinks;
-		}
-		void setFirstLink( int firstLink )
-		{
-			m_firstLink = firstLink;
-		}
-		int getMaxLinks()
-		{
-			return m_maxLinks;
-		}
-		int getNumLinks()
-		{
-			return m_numLinks;
-		}
-		int getFirstLink()
-		{
-			return m_firstLink;
-		}
-		btSoftBody* getSoftBody()
-		{
-			return m_softBody;
-		}
-	};
-	struct CollisionObjectIndices
-	{
-		CollisionObjectIndices( int f, int e )
-		{
-			firstObject = f;
-			endObject = e;
-		}
-		int firstObject;
-		int endObject;
-	};
-	struct PrepareLinksCB
-	{		
-		int numLinks;
-		int padding0;
-		int padding1;
-		int padding2;
-	};
-	struct SolvePositionsFromLinksKernelCB
-	{		
-		int startLink;
-		int numLinks;
-		float kst;
-		float ti;
-	};
-	struct IntegrateCB
-	{
-		int numNodes;
-		float solverdt;
-		int padding1;
-		int padding2;
-	};
-	struct UpdatePositionsFromVelocitiesCB
-	{
-		int numNodes;
-		float solverSDT;
-		int padding1;
-		int padding2;
-	};
-	struct UpdateVelocitiesFromPositionsWithoutVelocitiesCB
-	{
-		int numNodes;
-		float isolverdt;
-		int padding1;
-		int padding2;
-	};
-	struct UpdateVelocitiesFromPositionsWithVelocitiesCB
-	{
-		int numNodes;
-		float isolverdt;
-		int padding1;
-		int padding2;
-	};
-	struct UpdateSoftBodiesCB
-	{
-		int numNodes;
-		int startFace;
-		int numFaces;
-		float epsilon;
-	};
-	struct ApplyForcesCB
-	{
-		unsigned int numNodes;
-		float solverdt;
-		float epsilon;
-		int padding3;
-	};
-	struct AddVelocityCB
-	{
-		int startNode;
-		int lastNode;
-		float velocityX;
-		float velocityY;
-		float velocityZ;
-		int padding1;
-		int padding2;
-		int padding3;
-	};
-	struct VSolveLinksCB
-	{
-		int startLink;
-		int numLinks;
-		float kst;
-		int padding;
-	};
-	struct ComputeBoundsCB
-	{
-		int numNodes;
-		int numSoftBodies;
-		int padding1;
-		int padding2;
-	};
-	struct SolveCollisionsAndUpdateVelocitiesCB
-	{
-		unsigned int numNodes;
-		float isolverdt;
-		int padding0;
-		int padding1;
-	};
-	ID3D11Device *		 m_dx11Device;
-	ID3D11DeviceContext* m_dx11Context;
-	DXFunctions dxFunctions;
-	/** Link data for all cloths. Note that this will be sorted batch-wise for efficient computation and m_linkAddresses will maintain the addressing. */
-	btSoftBodyLinkDataDX11 m_linkData;
-	btSoftBodyVertexDataDX11 m_vertexData;
-	btSoftBodyTriangleDataDX11 m_triangleData;
-	/** Variable to define whether we need to update solver constants on the next iteration */
-	bool m_updateSolverConstants;
-	bool m_shadersInitialized;
-	/** 
-	 * Cloths owned by this solver.
-	 * Only our cloths are in this array.
-	 */
-	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;
-	/** Acceleration value to be applied to all non-static vertices in the solver. 
-	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
-	 */
-	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothAcceleration;
-	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11PerClothAcceleration;
-	/** Wind velocity to be applied normal to all non-static vertices in the solver. 
-	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
-	 */
-	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothWindVelocity;
-	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11PerClothWindVelocity;
-	/** Velocity damping factor */
-	btAlignedObjectArray< float >						m_perClothDampingFactor;
-	btDX11Buffer<float>									m_dx11PerClothDampingFactor;
-	/** Velocity correction coefficient */
-	btAlignedObjectArray< float >						m_perClothVelocityCorrectionCoefficient;
-	btDX11Buffer<float>									m_dx11PerClothVelocityCorrectionCoefficient;
-	/** Lift parameter for wind effect on cloth. */
-	btAlignedObjectArray< float >						m_perClothLiftFactor;
-	btDX11Buffer<float>									m_dx11PerClothLiftFactor;
-	/** Drag parameter for wind effect on cloth. */
-	btAlignedObjectArray< float >						m_perClothDragFactor;
-	btDX11Buffer<float>									m_dx11PerClothDragFactor;
-	/** Density of the medium in which each cloth sits */
-	btAlignedObjectArray< float >						m_perClothMediumDensity;
-	btDX11Buffer<float>									m_dx11PerClothMediumDensity;
-	/** 
-	 * Collision shape details: pair of index of first collision shape for the cloth and number of collision objects.
-	 */
-	btAlignedObjectArray< CollisionObjectIndices >		m_perClothCollisionObjects;
-	btDX11Buffer<CollisionObjectIndices>				m_dx11PerClothCollisionObjects;
-	/** 
-	 * Collision shapes being passed across to the cloths in this solver.
-	 */
-	btAlignedObjectArray< CollisionShapeDescription >	m_collisionObjectDetails;
-	btDX11Buffer< CollisionShapeDescription >			m_dx11CollisionObjectDetails;
-	/** 
-	 * Minimum bounds for each cloth.
-	 * Updated by GPU and returned for use by broad phase.
-	 * These are int vectors as a reminder that they store the int representation of a float, not a float.
-	 * Bit 31 is inverted - is floats are stored with int-sortable values.
-	 */
-	btAlignedObjectArray< UIntVector3 >	m_perClothMinBounds;
-	btDX11Buffer< UIntVector3 >			m_dx11PerClothMinBounds;
-	/** 
-	 * Maximum bounds for each cloth.
-	 * Updated by GPU and returned for use by broad phase.
-	 * These are int vectors as a reminder that they store the int representation of a float, not a float.
-	 * Bit 31 is inverted - is floats are stored with int-sortable values.
-	 */
-	btAlignedObjectArray< UIntVector3 >	m_perClothMaxBounds;
-	btDX11Buffer< UIntVector3 >			m_dx11PerClothMaxBounds;
-	/** 
-	 * Friction coefficient for each cloth
-	 */
-	btAlignedObjectArray< float >	m_perClothFriction;
-	btDX11Buffer< float >			m_dx11PerClothFriction;
-	DXFunctions::KernelDesc		prepareLinksKernel;
-	DXFunctions::KernelDesc		solvePositionsFromLinksKernel;
-	DXFunctions::KernelDesc		vSolveLinksKernel;
-	DXFunctions::KernelDesc		integrateKernel;
-	DXFunctions::KernelDesc		addVelocityKernel;
-	DXFunctions::KernelDesc		updatePositionsFromVelocitiesKernel;
-	DXFunctions::KernelDesc		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
-	DXFunctions::KernelDesc		updateVelocitiesFromPositionsWithVelocitiesKernel;
-	DXFunctions::KernelDesc		solveCollisionsAndUpdateVelocitiesKernel;
-	DXFunctions::KernelDesc		resetNormalsAndAreasKernel;
-	DXFunctions::KernelDesc		normalizeNormalsAndAreasKernel;
-	DXFunctions::KernelDesc		computeBoundsKernel;
-	DXFunctions::KernelDesc		updateSoftBodiesKernel;
-	DXFunctions::KernelDesc		applyForcesKernel;
-	bool	m_enableUpdateBounds;
-	/**
-	 * Integrate motion on the solver.
-	 */
-	virtual void integrate( float solverdt );
-	float computeTriangleArea( 
-		const Vectormath::Aos::Point3 &vertex0,
-		const Vectormath::Aos::Point3 &vertex1,
-		const Vectormath::Aos::Point3 &vertex2 );
-	virtual bool buildShaders();
-	void resetNormalsAndAreas( int numVertices );
-	void normalizeNormalsAndAreas( int numVertices );
-	void executeUpdateSoftBodies( int firstTriangle, int numTriangles );
-	void prepareCollisionConstraints();
-	Vectormath::Aos::Vector3 ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a );
-	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );
-	virtual void applyForces( float solverdt );
-	virtual void updateConstants( float timeStep );
-	int findSoftBodyIndex( const btSoftBody* const softBody );
-	//////////////////////////////////////
-	// Kernel dispatches
-	virtual void prepareLinks();
-	void updatePositionsFromVelocities( float solverdt );
-	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
-	void solveLinksForVelocity( int startLink, int numLinks, float kst );
-	void updateVelocitiesFromPositionsWithVelocities( float isolverdt );
-	void updateVelocitiesFromPositionsWithoutVelocities( float isolverdt );
-	void computeBounds( );
-	void solveCollisionsAndUpdateVelocities( float isolverdt );
-	// End kernel dispatches
-	/////////////////////////////////////
-	void updateBounds();
-	void releaseKernels();
-	btDX11SoftBodySolver(ID3D11Device * dx11Device, ID3D11DeviceContext* dx11Context, DXFunctions::CompileFromMemoryFunc dx11CompileFromMemory = &D3DX11CompileFromMemory);
-	virtual ~btDX11SoftBodySolver();
-	virtual SolverTypes getSolverType() const
-	{
-		return DX_SOLVER;
-	}
-	void	setEnableUpdateBounds(bool enableBounds)
-	{
-		m_enableUpdateBounds = enableBounds;
-	}
-	bool getEnableUpdateBounds() const
-	{
-		return  m_enableUpdateBounds;
-	}
-	virtual btSoftBodyLinkData &getLinkData();
-	virtual btSoftBodyVertexData &getVertexData();
-	virtual btSoftBodyTriangleData &getTriangleData();
-	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
-	const btAcceleratedSoftBodyInterface * const findSoftBodyInterface( const btSoftBody* const softBody ) const;
-	virtual bool checkInitialized();
-	virtual void updateSoftBodies( );
-	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
-	virtual void copyBackToSoftBodies(bool bMove = true);
-	virtual void solveConstraints( float solverdt );
-	virtual void predictMotion( float solverdt );
-	virtual void processCollision( btSoftBody *, btCollisionObject* );
-	virtual void processCollision( btSoftBody*, btSoftBody* );
- * Class to manage movement of data from a solver to a given target.
- * This version is the DX to CPU version.
- */
-class btSoftBodySolverOutputDXtoCPU : public btSoftBodySolverOutput
-	btSoftBodySolverOutputDXtoCPU()
-	{
-	}
-	/** Output current computed vertex data to the vertex buffers for all cloths in the solver. */
-	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
- * Class to manage movement of data from a solver to a given target.
- * This version is the DX to DX version and subclasses DX to CPU so that it works for that too.
- */
-class btSoftBodySolverOutputDXtoDX : public btSoftBodySolverOutputDXtoCPU
-	struct OutputToVertexArrayCB
-	{
-		int startNode;
-		int numNodes;
-		int positionOffset;
-		int positionStride;
-		int normalOffset;	
-		int normalStride;
-		int padding1;
-		int padding2;
-	};
-	DXFunctions dxFunctions;
-	DXFunctions::KernelDesc outputToVertexArrayWithNormalsKernel;
-	DXFunctions::KernelDesc outputToVertexArrayWithoutNormalsKernel;
-	bool m_shadersInitialized;
-	bool checkInitialized();
-	bool buildShaders();
-	void releaseKernels();
-	btSoftBodySolverOutputDXtoDX(ID3D11Device *dx11Device, ID3D11DeviceContext* dx11Context, DXFunctions::CompileFromMemoryFunc dx11CompileFromMemory = &D3DX11CompileFromMemory) :
-	  dxFunctions( dx11Device, dx11Context, dx11CompileFromMemory )
-	{
-		m_shadersInitialized = false;
-	}
-	~btSoftBodySolverOutputDXtoDX()
-	{
-		releaseKernels();
-	}
-	/** Output current computed vertex data to the vertex buffers for all cloths in the solver. */
-	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
deleted file mode 100644
index 5c73ee5d..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
+++ /dev/null
@@ -1,1051 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <cstdio>
-#define WAVEFRONT_SIZE 32
-#define STRINGIFY2( S ) #S
-#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
-#include "vectormath/vmInclude.h"
-#include "btSoftBodySolverLinkData_DX11SIMDAware.h"
-#include "btSoftBodySolver_DX11SIMDAware.h"
-#include "btSoftBodySolverVertexBuffer_DX11.h"
-#include "BulletSoftBody/btSoftBody.h"
-#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
-#define MSTRINGIFY(A) #A
-static char* UpdatePositionsFromVelocitiesHLSLString = 
-#include "HLSL/UpdatePositionsFromVelocities.hlsl"
-static char* SolvePositionsSIMDBatchedHLSLString = 
-#include "HLSL/SolvePositionsSIMDBatched.hlsl"
-static char* UpdateNodesHLSLString = 
-#include "HLSL/UpdateNodes.hlsl"
-static char* UpdatePositionsHLSLString = 
-#include "HLSL/UpdatePositions.hlsl"
-static char* UpdateConstantsHLSLString = 
-#include "HLSL/UpdateConstants.hlsl"
-static char* IntegrateHLSLString = 
-#include "HLSL/Integrate.hlsl"
-static char* ApplyForcesHLSLString = 
-#include "HLSL/ApplyForces.hlsl"
-static char* UpdateNormalsHLSLString = 
-#include "HLSL/UpdateNormals.hlsl"
-static char* OutputToVertexArrayHLSLString = 
-#include "HLSL/OutputToVertexArray.hlsl"
-static char* VSolveLinksHLSLString = 
-#include "HLSL/VSolveLinks.hlsl"
-static char* ComputeBoundsHLSLString = 
-#include "HLSL/ComputeBounds.hlsl"
-static char* SolveCollisionsAndUpdateVelocitiesHLSLString =
-#include "HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl"
-btSoftBodyLinkDataDX11SIMDAware::btSoftBodyLinkDataDX11SIMDAware( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext ) : 
-		m_d3dDevice( d3dDevice ),
-		m_d3dDeviceContext( d3dDeviceContext ),
-		m_wavefrontSize( WAVEFRONT_SIZE ),
-		m_linksPerWorkItem( LINKS_PER_SIMD_LANE ),
-		m_maxBatchesWithinWave( 0 ),
-		m_maxLinksPerWavefront( m_wavefrontSize * m_linksPerWorkItem ),
-		m_numWavefronts( 0 ),
-		m_maxVertex( 0 ),
-		m_dx11NumBatchesAndVerticesWithinWaves( d3dDevice, d3dDeviceContext, &m_numBatchesAndVerticesWithinWaves, true ),
-		m_dx11WavefrontVerticesGlobalAddresses( d3dDevice, d3dDeviceContext, &m_wavefrontVerticesGlobalAddresses, true ),
-		m_dx11LinkVerticesLocalAddresses( d3dDevice, d3dDeviceContext, &m_linkVerticesLocalAddresses, true ),
-		m_dx11LinkStrength( d3dDevice, d3dDeviceContext, &m_linkStrength, true ),
-		m_dx11LinksMassLSC( d3dDevice, d3dDeviceContext, &m_linksMassLSC, true ),
-		m_dx11LinksRestLengthSquared( d3dDevice, d3dDeviceContext, &m_linksRestLengthSquared, true ),
-		m_dx11LinksRestLength( d3dDevice, d3dDeviceContext, &m_linksRestLength, true ),
-		m_dx11LinksMaterialLinearStiffnessCoefficient( d3dDevice, d3dDeviceContext, &m_linksMaterialLinearStiffnessCoefficient, true )
-	m_d3dDevice = d3dDevice;
-	m_d3dDeviceContext = d3dDeviceContext;
-static Vectormath::Aos::Vector3 toVector3( const btVector3 &vec )
-	Vectormath::Aos::Vector3 outVec( vec.getX(), vec.getY(), vec.getZ() );
-	return outVec;
-void btSoftBodyLinkDataDX11SIMDAware::createLinks( int numLinks )
-	int previousSize = m_links.size();
-	int newSize = previousSize + numLinks;
-	btSoftBodyLinkData::createLinks( numLinks );
-	// Resize the link addresses array as well
-	m_linkAddresses.resize( newSize );
-void btSoftBodyLinkDataDX11SIMDAware::setLinkAt( const btSoftBodyLinkData::LinkDescription &link, int linkIndex )
-	btSoftBodyLinkData::setLinkAt( link, linkIndex );
-	if( link.getVertex0() > m_maxVertex )
-		m_maxVertex = link.getVertex0();
-	if( link.getVertex1() > m_maxVertex )
-		m_maxVertex = link.getVertex1();
-	// Set the link index correctly for initialisation
-	m_linkAddresses[linkIndex] = linkIndex;
-bool btSoftBodyLinkDataDX11SIMDAware::onAccelerator()
-	return m_onGPU;
-bool btSoftBodyLinkDataDX11SIMDAware::moveToAccelerator()
-	bool success = true;
-	success = success && m_dx11NumBatchesAndVerticesWithinWaves.moveToGPU();
-	success = success && m_dx11WavefrontVerticesGlobalAddresses.moveToGPU();
-	success = success && m_dx11LinkVerticesLocalAddresses.moveToGPU();
-	success = success && m_dx11LinkStrength.moveToGPU();
-	success = success && m_dx11LinksMassLSC.moveToGPU();
-	success = success && m_dx11LinksRestLengthSquared.moveToGPU();
-	success = success && m_dx11LinksRestLength.moveToGPU();
-	success = success && m_dx11LinksMaterialLinearStiffnessCoefficient.moveToGPU();
-	if( success )
-		m_onGPU = true;
-	return success;
-bool btSoftBodyLinkDataDX11SIMDAware::moveFromAccelerator()
-	bool success = true;
-	success = success && m_dx11NumBatchesAndVerticesWithinWaves.moveFromGPU();
-	success = success && m_dx11WavefrontVerticesGlobalAddresses.moveFromGPU();
-	success = success && m_dx11LinkVerticesLocalAddresses.moveFromGPU();
-	success = success && m_dx11LinkStrength.moveFromGPU();
-	success = success && m_dx11LinksMassLSC.moveFromGPU();
-	success = success && m_dx11LinksRestLengthSquared.moveFromGPU();
-	success = success && m_dx11LinksRestLength.moveFromGPU();
-	success = success && m_dx11LinksMaterialLinearStiffnessCoefficient.moveFromGPU();
-	if( success )
-		m_onGPU = false;
-	return success;
-btDX11SIMDAwareSoftBodySolver::btDX11SIMDAwareSoftBodySolver(ID3D11Device * dx11Device, ID3D11DeviceContext* dx11Context, DXFunctions::CompileFromMemoryFunc dx11CompileFromMemory) :
-	btDX11SoftBodySolver( dx11Device, dx11Context, dx11CompileFromMemory ),
-	m_linkData(m_dx11Device, m_dx11Context)
-	// Initial we will clearly need to update solver constants
-	// For now this is global for the cloths linked with this solver - we should probably make this body specific 
-	// for performance in future once we understand more clearly when constants need to be updated
-	m_updateSolverConstants = true;
-	m_shadersInitialized = false;
-	releaseKernels();
-btSoftBodyLinkData &btDX11SIMDAwareSoftBodySolver::getLinkData()
-	// TODO: Consider setting link data to "changed" here
-	return m_linkData;
-void btDX11SIMDAwareSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate)
-	if(forceUpdate || m_softBodySet.size() != softBodies.size() )
-	{
-		// Have a change in the soft body set so update, reloading all the data
-		getVertexData().clear();
-		getTriangleData().clear();
-		getLinkData().clear();
-		m_softBodySet.resize(0);
-		for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
-		{
-			btSoftBody *softBody = softBodies[ softBodyIndex ];
-			using Vectormath::Aos::Matrix3;
-			using Vectormath::Aos::Point3;
-			// Create SoftBody that will store the information within the solver
-			btAcceleratedSoftBodyInterface *newSoftBody = new btAcceleratedSoftBodyInterface( softBody );
-			m_softBodySet.push_back( newSoftBody );
-			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
-			m_perClothDampingFactor.push_back(softBody->m_cfg.kDP);
-			m_perClothVelocityCorrectionCoefficient.push_back( softBody->m_cfg.kVCF );
-			m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );
-			m_perClothDragFactor.push_back( softBody->m_cfg.kDG );
-			m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);
-			// Simple init values. Actually we'll put 0 and -1 into them at the appropriate time
-			m_perClothMinBounds.push_back( UIntVector3( 0, 0, 0 ) );
-			m_perClothMaxBounds.push_back( UIntVector3( UINT_MAX, UINT_MAX, UINT_MAX ) );
-			m_perClothFriction.push_back( softBody->getFriction() );
-			m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) );
-			// Add space for new vertices and triangles in the default solver for now
-			// TODO: Include space here for tearing too later
-			int firstVertex = getVertexData().getNumVertices();
-			int numVertices = softBody->m_nodes.size();
-			// Round maxVertices to a multiple of the workgroup size so we know we're safe to run over in a given group
-			// maxVertices can be increased to allow tearing, but should be used sparingly because these extra verts will always be processed
-			int maxVertices = GROUP_SIZE*((numVertices+GROUP_SIZE)/GROUP_SIZE);
-			// Allocate space for new vertices in all the vertex arrays
-			getVertexData().createVertices( numVertices, softBodyIndex, maxVertices );
-			int firstTriangle = getTriangleData().getNumTriangles();
-			int numTriangles = softBody->m_faces.size();
-			int maxTriangles = numTriangles;
-			getTriangleData().createTriangles( maxTriangles );
-			// Copy vertices from softbody into the solver
-			for( int vertex = 0; vertex < numVertices; ++vertex )
-			{
-				Point3 multPoint(softBody->m_nodes[vertex].m_x.getX(), softBody->m_nodes[vertex].m_x.getY(), softBody->m_nodes[vertex].m_x.getZ());
-				btSoftBodyVertexData::VertexDescription desc;
-				// TODO: Position in the softbody might be pre-transformed
-				// or we may need to adapt for the pose.
-				//desc.setPosition( cloth.getMeshTransform()*multPoint );
-				desc.setPosition( multPoint );
-				float vertexInverseMass = softBody->m_nodes[vertex].m_im;
-				desc.setInverseMass(vertexInverseMass);
-				getVertexData().setVertexAt( desc, firstVertex + vertex );
-			}
-			// Copy triangles similarly
-			// We're assuming here that vertex indices are based on the firstVertex rather than the entire scene
-			for( int triangle = 0; triangle < numTriangles; ++triangle )
-			{
-				// Note that large array storage is relative to the array not to the cloth
-				// So we need to add firstVertex to each value
-				int vertexIndex0 = (softBody->m_faces[triangle].m_n[0] - &(softBody->m_nodes[0]));
-				int vertexIndex1 = (softBody->m_faces[triangle].m_n[1] - &(softBody->m_nodes[0]));
-				int vertexIndex2 = (softBody->m_faces[triangle].m_n[2] - &(softBody->m_nodes[0]));
-				btSoftBodyTriangleData::TriangleDescription newTriangle(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, vertexIndex2 + firstVertex);
-				getTriangleData().setTriangleAt( newTriangle, firstTriangle + triangle );
-				// Increase vertex triangle counts for this triangle		
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex0)++;
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex1)++;
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex2)++;
-			}
-			int firstLink = getLinkData().getNumLinks();
-			int numLinks = softBody->m_links.size();
-			int maxLinks = numLinks;
-			// Allocate space for the links
-			getLinkData().createLinks( numLinks );
-			// Add the links
-			for( int link = 0; link < numLinks; ++link )
-			{
-				int vertexIndex0 = softBody->m_links[link].m_n[0] - &(softBody->m_nodes[0]);
-				int vertexIndex1 = softBody->m_links[link].m_n[1] - &(softBody->m_nodes[0]);
-				btSoftBodyLinkData::LinkDescription newLink(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, softBody->m_links[link].m_material->m_kLST);
-				newLink.setLinkStrength(1.f);
-				getLinkData().setLinkAt(newLink, firstLink + link);
-			}
-			newSoftBody->setFirstVertex( firstVertex );
-			newSoftBody->setFirstTriangle( firstTriangle );
-			newSoftBody->setNumVertices( numVertices );
-			newSoftBody->setMaxVertices( maxVertices );
-			newSoftBody->setNumTriangles( numTriangles );
-			newSoftBody->setMaxTriangles( maxTriangles );
-			newSoftBody->setFirstLink( firstLink );
-			newSoftBody->setNumLinks( numLinks );
-		}
-		updateConstants(0.f);
-		m_linkData.generateBatches();		
-		m_triangleData.generateBatches();
-		// Build the shaders to match the batching parameters
-		buildShaders();
-	}
-void btDX11SIMDAwareSoftBodySolver::solveConstraints( float solverdt )
-	//std::cerr << "'GPU' solve constraints\n";
-	using Vectormath::Aos::Vector3;
-	using Vectormath::Aos::Point3;
-	using Vectormath::Aos::lengthSqr;
-	using Vectormath::Aos::dot;
-	// Prepare links
-	int numLinks = m_linkData.getNumLinks();
-	int numVertices = m_vertexData.getNumVertices();
-	float kst = 1.f;
-	float ti = 0.f;
-	m_dx11PerClothDampingFactor.moveToGPU();
-	m_dx11PerClothVelocityCorrectionCoefficient.moveToGPU();
-	// Ensure data is on accelerator
-	m_linkData.moveToAccelerator();
-	m_vertexData.moveToAccelerator();
-	prepareCollisionConstraints();
-	// Solve drift
-  	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
-	{
-		for( int i = 0; i < m_linkData.m_wavefrontBatchStartLengths.size(); ++i )
-		{
-			int startWave = m_linkData.m_wavefrontBatchStartLengths[i].start;
-			int numWaves = m_linkData.m_wavefrontBatchStartLengths[i].length;
-			solveLinksForPosition( startWave, numWaves, kst, ti );
-		}	
-	} // for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
-	// At this point assume that the force array is blank - we will overwrite it
-	solveCollisionsAndUpdateVelocities( 1.f/solverdt );
-} // btDX11SIMDAwareSoftBodySolver::solveConstraints
-void btDX11SIMDAwareSoftBodySolver::updateConstants( float timeStep )
-	using namespace Vectormath::Aos;
-	if( m_updateSolverConstants )
-	{
-		m_updateSolverConstants = false;
-		// Will have to redo this if we change the structure (tear, maybe) or various other possible changes
-		// Initialise link constants
-		const int numLinks = m_linkData.getNumLinks();
-		for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
-		{
-			btSoftBodyLinkData::LinkNodePair &vertices( m_linkData.getVertexPair(linkIndex) );
-			m_linkData.getRestLength(linkIndex) = length((m_vertexData.getPosition( vertices.vertex0 ) - m_vertexData.getPosition( vertices.vertex1 )));
-			float invMass0 = m_vertexData.getInverseMass(vertices.vertex0);
-			float invMass1 = m_vertexData.getInverseMass(vertices.vertex1);
-			float linearStiffness = m_linkData.getLinearStiffnessCoefficient(linkIndex);
-			float massLSC = (invMass0 + invMass1)/linearStiffness;
-			m_linkData.getMassLSC(linkIndex) = massLSC;
-			float restLength = m_linkData.getRestLength(linkIndex);
-			float restLengthSquared = restLength*restLength;
-			m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared;
-		}
-	}
-} // btDX11SIMDAwareSoftBodySolver::updateConstants
-// Kernel dispatches
-void btDX11SIMDAwareSoftBodySolver::solveLinksForPosition( int startWave, int numWaves, float kst, float ti )
-	m_vertexData.moveToAccelerator();
-	m_linkData.moveToAccelerator();
-	// Copy kernel parameters to GPU
-	SolvePositionsFromLinksKernelCB constBuffer;
-	// Set the first wave of the batch and the number of waves
-	constBuffer.startWave = startWave;
-	constBuffer.numWaves = numWaves;
-	constBuffer.kst = kst;
-	constBuffer.ti = ti;
-	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
-	m_dx11Context->Map( solvePositionsFromLinksKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
-	memcpy( MappedResource.pData, &constBuffer, sizeof(SolvePositionsFromLinksKernelCB) );	
-	m_dx11Context->Unmap( solvePositionsFromLinksKernel.constBuffer, 0 );
-	m_dx11Context->CSSetConstantBuffers( 0, 1, &solvePositionsFromLinksKernel.constBuffer );
-	// Set resources and dispatch
-	m_dx11Context->CSSetShaderResources( 0, 1, &(m_linkData.m_dx11NumBatchesAndVerticesWithinWaves.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 1, 1, &(m_linkData.m_dx11WavefrontVerticesGlobalAddresses.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11VertexInverseMass.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 3, 1, &(m_linkData.m_dx11LinkVerticesLocalAddresses.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 4, 1, &(m_linkData.m_dx11LinksMassLSC.getSRV()) );
-	m_dx11Context->CSSetShaderResources( 5, 1, &(m_linkData.m_dx11LinksRestLengthSquared.getSRV()) );
-	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexPosition.getUAV()), NULL );
-	// Execute the kernel
-	m_dx11Context->CSSetShader( solvePositionsFromLinksKernel.kernel, NULL, 0 );
-	int	numBlocks = ((constBuffer.numWaves + WAVEFRONT_BLOCK_MULTIPLIER - 1) / WAVEFRONT_BLOCK_MULTIPLIER );
-	m_dx11Context->Dispatch(numBlocks , 1, 1 );
-	{
-		// Tidy up 
-		ID3D11ShaderResourceView* pViewNULL = NULL;
-		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 3, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 4, 1, &pViewNULL );
-		m_dx11Context->CSSetShaderResources( 5, 1, &pViewNULL );
-		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
-		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
-		ID3D11Buffer *pBufferNull = NULL;
-		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
-	}	
-} // btDX11SIMDAwareSoftBodySolver::solveLinksForPosition
-// End kernel dispatches
-bool btDX11SIMDAwareSoftBodySolver::buildShaders()
-	// Ensure current kernels are released first
-	releaseKernels();
-	bool returnVal = true;
-	if( m_shadersInitialized )
-		return true;
-	updatePositionsFromVelocitiesKernel = dxFunctions.compileComputeShaderFromString( UpdatePositionsFromVelocitiesHLSLString, "UpdatePositionsFromVelocitiesKernel", sizeof(UpdatePositionsFromVelocitiesCB) );
-	if( !updatePositionsFromVelocitiesKernel.constBuffer )
-		returnVal = false;
-	char maxVerticesPerWavefront[20];
-	char maxBatchesPerWavefront[20];
-	char waveFrontSize[20];
-	char waveFrontBlockMultiplier[20];
-	char blockSize[20];
-	sprintf(maxVerticesPerWavefront, "%d", m_linkData.getMaxVerticesPerWavefront());
-	sprintf(maxBatchesPerWavefront, "%d", m_linkData.getMaxBatchesPerWavefront());
-	sprintf(waveFrontSize, "%d", m_linkData.getWavefrontSize());	
-	sprintf(waveFrontBlockMultiplier, "%d", WAVEFRONT_BLOCK_MULTIPLIER);
-	sprintf(blockSize, "%d", WAVEFRONT_BLOCK_MULTIPLIER*m_linkData.getWavefrontSize());
-	D3D10_SHADER_MACRO solvePositionsMacros[6] = { "MAX_NUM_VERTICES_PER_WAVE", maxVerticesPerWavefront, "MAX_BATCHES_PER_WAVE", maxBatchesPerWavefront, "WAVEFRONT_SIZE", waveFrontSize, "WAVEFRONT_BLOCK_MULTIPLIER", waveFrontBlockMultiplier, "BLOCK_SIZE", blockSize, 0, 0 };
-	solvePositionsFromLinksKernel = dxFunctions.compileComputeShaderFromString( SolvePositionsSIMDBatchedHLSLString, "SolvePositionsFromLinksKernel", sizeof(SolvePositionsFromLinksKernelCB), solvePositionsMacros );
-	if( !solvePositionsFromLinksKernel.constBuffer )
-		returnVal = false;
-	updateVelocitiesFromPositionsWithVelocitiesKernel = dxFunctions.compileComputeShaderFromString( UpdateNodesHLSLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", sizeof(UpdateVelocitiesFromPositionsWithVelocitiesCB) );
-	if( !updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer )
-		returnVal = false;
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel = dxFunctions.compileComputeShaderFromString( UpdatePositionsHLSLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", sizeof(UpdateVelocitiesFromPositionsWithoutVelocitiesCB));
-	if( !updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer )
-		returnVal = false;
-	integrateKernel = dxFunctions.compileComputeShaderFromString( IntegrateHLSLString, "IntegrateKernel", sizeof(IntegrateCB) );
-	if( !integrateKernel.constBuffer )
-		returnVal = false;
-	applyForcesKernel = dxFunctions.compileComputeShaderFromString( ApplyForcesHLSLString, "ApplyForcesKernel", sizeof(ApplyForcesCB) );
-	if( !applyForcesKernel.constBuffer )
-		returnVal = false;
-	solveCollisionsAndUpdateVelocitiesKernel = dxFunctions.compileComputeShaderFromString( SolveCollisionsAndUpdateVelocitiesHLSLString, "SolveCollisionsAndUpdateVelocitiesKernel", sizeof(SolveCollisionsAndUpdateVelocitiesCB) );
-	if( !solveCollisionsAndUpdateVelocitiesKernel.constBuffer )
-		returnVal = false;
-	resetNormalsAndAreasKernel = dxFunctions.compileComputeShaderFromString( UpdateNormalsHLSLString, "ResetNormalsAndAreasKernel", sizeof(UpdateSoftBodiesCB) );
-	if( !resetNormalsAndAreasKernel.constBuffer )
-		returnVal = false;
-	normalizeNormalsAndAreasKernel = dxFunctions.compileComputeShaderFromString( UpdateNormalsHLSLString, "NormalizeNormalsAndAreasKernel", sizeof(UpdateSoftBodiesCB) );
-	if( !normalizeNormalsAndAreasKernel.constBuffer )
-		returnVal = false;
-	updateSoftBodiesKernel = dxFunctions.compileComputeShaderFromString( UpdateNormalsHLSLString, "UpdateSoftBodiesKernel", sizeof(UpdateSoftBodiesCB) );
-	if( !updateSoftBodiesKernel.constBuffer )
-		returnVal = false;
-	computeBoundsKernel = dxFunctions.compileComputeShaderFromString( ComputeBoundsHLSLString, "ComputeBoundsKernel", sizeof(ComputeBoundsCB) );
-	if( !computeBoundsKernel.constBuffer )
-		returnVal = false;
-	if( returnVal )
-		m_shadersInitialized = true;
-	return returnVal;
-} // btDX11SIMDAwareSoftBodySolver::buildShaders
-static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform )
-	Vectormath::Aos::Transform3 outTransform;
-	outTransform.setCol(0, toVector3(transform.getBasis().getColumn(0)));
-	outTransform.setCol(1, toVector3(transform.getBasis().getColumn(1)));
-	outTransform.setCol(2, toVector3(transform.getBasis().getColumn(2)));
-	outTransform.setCol(3, toVector3(transform.getOrigin()));
-	return outTransform;	
-static void generateBatchesOfWavefronts( btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, btSoftBodyLinkData &linkData, int numVertices, btAlignedObjectArray < btAlignedObjectArray <int> > &wavefrontBatches )
-	// A per-batch map of truth values stating whether a given vertex is in that batch
-	// This allows us to significantly optimize the batching
-	btAlignedObjectArray <btAlignedObjectArray<bool> > mapOfVerticesInBatches;
-	for( int waveIndex = 0; waveIndex < linksForWavefronts.size(); ++waveIndex )
-	{
-		btAlignedObjectArray <int> &wavefront( linksForWavefronts[waveIndex] );
-		int batch = 0;
-		bool placed = false;
-		while( batch < wavefrontBatches.size() && !placed )
-		{
-			// Test the current batch, see if this wave shares any vertex with the waves in the batch
-			bool foundSharedVertex = false;
-			for( int link = 0; link < wavefront.size(); ++link )
-			{
-				btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
-				if( (mapOfVerticesInBatches[batch])[vertices.vertex0] || (mapOfVerticesInBatches[batch])[vertices.vertex1] )
-				{
-					foundSharedVertex = true;
-				}
-			}
-			if( !foundSharedVertex )
-			{
-				wavefrontBatches[batch].push_back( waveIndex );	
-				// Insert vertices into this batch too
-				for( int link = 0; link < wavefront.size(); ++link )
-				{
-					btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
-					(mapOfVerticesInBatches[batch])[vertices.vertex0] = true;
-					(mapOfVerticesInBatches[batch])[vertices.vertex1] = true;
-				}
-				placed = true;
-			}
-			batch++;
-		}
-		if( batch == wavefrontBatches.size() && !placed )
-		{
-			wavefrontBatches.resize( batch + 1 );
-			wavefrontBatches[batch].push_back( waveIndex );
-			// And resize map as well
-			mapOfVerticesInBatches.resize( batch + 1 );
-			// Resize maps with total number of vertices
-			mapOfVerticesInBatches[batch].resize( numVertices+1, false );
-			// Insert vertices into this batch too
-			for( int link = 0; link < wavefront.size(); ++link )
-			{
-				btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
-				(mapOfVerticesInBatches[batch])[vertices.vertex0] = true;
-				(mapOfVerticesInBatches[batch])[vertices.vertex1] = true;
-			}
-		}
-	}
-	mapOfVerticesInBatches.clear();
-// Function to remove an object from a vector maintaining correct ordering of the vector
-template< typename T > static void removeFromVector( btAlignedObjectArray< T > &vectorToUpdate, int indexToRemove )
-	int currentSize = vectorToUpdate.size();
-	for( int i = indexToRemove; i < (currentSize-1); ++i )
-	{
-		vectorToUpdate[i] = vectorToUpdate[i+1];
-	}
-	if( currentSize > 0 )
-		vectorToUpdate.resize( currentSize - 1 );
- * Insert element into vectorToUpdate at index index.
- */
-template< typename T > static void insertAtIndex( btAlignedObjectArray< T > &vectorToUpdate, int index, T element )
-	vectorToUpdate.resize( vectorToUpdate.size() + 1 );
-	for( int i = (vectorToUpdate.size() - 1); i > index; --i )
-	{
-		vectorToUpdate[i] = vectorToUpdate[i-1];
-	}
-	vectorToUpdate[index] = element;
- * Insert into btAlignedObjectArray assuming the array is ordered and maintaining both ordering and uniqueness.
- * ie it treats vectorToUpdate as an ordered set.
- */
-template< typename T > static void insertUniqueAndOrderedIntoVector( btAlignedObjectArray<T> &vectorToUpdate, T element )
-	int index = 0;
-	while( index < vectorToUpdate.size() && vectorToUpdate[index] < element )
-	{
-		index++;
-	}
-	if( index == vectorToUpdate.size() || vectorToUpdate[index] != element )
-		insertAtIndex( vectorToUpdate, index, element );
-static void generateLinksPerVertex( int numVertices, btSoftBodyLinkData &linkData, btAlignedObjectArray< int > &listOfLinksPerVertex, btAlignedObjectArray <int> &numLinksPerVertex, int &maxLinks )
-	for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
-	{
-		btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
-		numLinksPerVertex[nodes.vertex0]++;
-		numLinksPerVertex[nodes.vertex1]++;
-	}
-	int maxLinksPerVertex = 0;
-	for( int vertexIndex = 0; vertexIndex < numVertices; ++vertexIndex )
-	{
-		maxLinksPerVertex = btMax(numLinksPerVertex[vertexIndex], maxLinksPerVertex);
-	}
-	maxLinks = maxLinksPerVertex;
-	btAlignedObjectArray< int > linksFoundPerVertex;
-	linksFoundPerVertex.resize( numVertices, 0 );
-	listOfLinksPerVertex.resize( maxLinksPerVertex * numVertices );
-	for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
-	{
-		btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
-		{
-			// Do vertex 0
-			int vertexIndex = nodes.vertex0;
-			int linkForVertex = linksFoundPerVertex[nodes.vertex0];
-			int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex;
-			listOfLinksPerVertex[linkAddress] = linkIndex;
-			linksFoundPerVertex[nodes.vertex0] = linkForVertex + 1;
-		}
-		{
-			// Do vertex 1
-			int vertexIndex = nodes.vertex1;
-			int linkForVertex = linksFoundPerVertex[nodes.vertex1];
-			int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex;
-			listOfLinksPerVertex[linkAddress] = linkIndex;
-			linksFoundPerVertex[nodes.vertex1] = linkForVertex + 1;
-		}
-	}
-static void computeBatchingIntoWavefronts( 
-	btSoftBodyLinkData &linkData, 
-	int wavefrontSize, 
-	int linksPerWorkItem, 
-	int maxLinksPerWavefront, 
-	btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, 
-	btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > &batchesWithinWaves, /* wave, batch, links in batch */
-	btAlignedObjectArray< btAlignedObjectArray< int > > &verticesForWavefronts /* wavefront, vertex */
-	)
-	// Attempt generation of larger batches of links.
-	btAlignedObjectArray< bool > processedLink;
-	processedLink.resize( linkData.getNumLinks() );
-	btAlignedObjectArray< int > listOfLinksPerVertex;
-	int maxLinksPerVertex = 0;
-	// Count num vertices
-	int numVertices = 0;
-	for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
-	{
-		btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
-		numVertices = btMax( numVertices, nodes.vertex0 + 1 );
-		numVertices = btMax( numVertices, nodes.vertex1 + 1 );
-	}
-	// Need list of links per vertex
-	// Compute valence of each vertex
-	btAlignedObjectArray <int> numLinksPerVertex;
-	numLinksPerVertex.resize(0);
-	numLinksPerVertex.resize( numVertices, 0 );
-	generateLinksPerVertex( numVertices, linkData, listOfLinksPerVertex, numLinksPerVertex, maxLinksPerVertex );
-	// At this point we know what links we have for each vertex so we can start batching
-	// We want a vertex to start with, let's go with 0
-	int currentVertex = 0;
-	int linksProcessed = 0;
-	btAlignedObjectArray <int> verticesToProcess;
-	while( linksProcessed < linkData.getNumLinks() )
-	{
-		// Next wavefront
-		int nextWavefront = linksForWavefronts.size();
-		linksForWavefronts.resize( nextWavefront + 1 );
-		btAlignedObjectArray <int> &linksForWavefront(linksForWavefronts[nextWavefront]);
-		verticesForWavefronts.resize( nextWavefront + 1 );
-		btAlignedObjectArray<int> &vertexSet( verticesForWavefronts[nextWavefront] );
-		linksForWavefront.resize(0);
-		// Loop to find enough links to fill the wavefront
-		// Stopping if we either run out of links, or fill it
-		while( linksProcessed < linkData.getNumLinks() && linksForWavefront.size() < maxLinksPerWavefront )
-		{
-			// Go through the links for the current vertex
-			for( int link = 0; link < numLinksPerVertex[currentVertex] && linksForWavefront.size() < maxLinksPerWavefront; ++link )
-			{
-				int linkAddress = currentVertex * maxLinksPerVertex + link;
-				int linkIndex = listOfLinksPerVertex[linkAddress];
-				// If we have not already processed this link, add it to the wavefront
-				// Claim it as another processed link
-				// Add the vertex at the far end to the list of vertices to process.
-				if( !processedLink[linkIndex] )
-				{
-					linksForWavefront.push_back( linkIndex );
-					linksProcessed++;
-					processedLink[linkIndex] = true;
-					int v0 = linkData.getVertexPair(linkIndex).vertex0;
-					int v1 = linkData.getVertexPair(linkIndex).vertex1;
-					if( v0 == currentVertex )
-						verticesToProcess.push_back( v1 );
-					else
-						verticesToProcess.push_back( v0 );
-				}
-			}
-			if( verticesToProcess.size() > 0 )
-			{
-				// Get the element on the front of the queue and remove it
-				currentVertex = verticesToProcess[0];
-				removeFromVector( verticesToProcess, 0 );
-			} else {		
-				// If we've not yet processed all the links, find the first unprocessed one
-				// and select one of its vertices as the current vertex
-				if( linksProcessed < linkData.getNumLinks() )
-				{
-					int searchLink = 0;
-					while( processedLink[searchLink] )
-						searchLink++;
-					currentVertex = linkData.getVertexPair(searchLink).vertex0;
-				}	
-			}
-		}
-		// We have either finished or filled a wavefront
-		for( int link = 0; link < linksForWavefront.size(); ++link )
-		{
-			int v0 = linkData.getVertexPair( linksForWavefront[link] ).vertex0;
-			int v1 = linkData.getVertexPair( linksForWavefront[link] ).vertex1;
-			insertUniqueAndOrderedIntoVector( vertexSet, v0 );
-			insertUniqueAndOrderedIntoVector( vertexSet, v1 );
-		}
-		// Iterate over links mapped to the wave and batch those
-		// We can run a batch on each cycle trivially
-		batchesWithinWaves.resize( batchesWithinWaves.size() + 1 );
-		btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWave( batchesWithinWaves[batchesWithinWaves.size()-1] );
-		for( int link = 0; link < linksForWavefront.size(); ++link )
-		{
-			int linkIndex = linksForWavefront[link];
-			btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( linkIndex );
-			int batch = 0;
-			bool placed = false;
-			while( batch < batchesWithinWave.size() && !placed )
-			{
-				bool foundSharedVertex = false;
-				if( batchesWithinWave[batch].size() >= wavefrontSize )
-				{
-					// If we have already filled this batch, move on to another
-					foundSharedVertex = true;
-				} else {
-					for( int link2 = 0; link2 < batchesWithinWave[batch].size(); ++link2 )
-					{
-						btSoftBodyLinkData::LinkNodePair vertices2 = linkData.getVertexPair( (batchesWithinWave[batch])[link2] );
-						if( vertices.vertex0 == vertices2.vertex0 ||
-							vertices.vertex1 == vertices2.vertex0 ||
-							vertices.vertex0 == vertices2.vertex1 ||
-							vertices.vertex1 == vertices2.vertex1 )
-						{
-							foundSharedVertex = true;
-							break;
-						}
-					}
-				}
-				if( !foundSharedVertex )
-				{
-					batchesWithinWave[batch].push_back( linkIndex );
-					placed = true;
-				} else {
-					++batch;
-				}
-			}
-			if( batch == batchesWithinWave.size() && !placed )
-			{
-				batchesWithinWave.resize( batch + 1 );
-				batchesWithinWave[batch].push_back( linkIndex );
-			}
-		}
-	}
-void btSoftBodyLinkDataDX11SIMDAware::generateBatches()
-	btAlignedObjectArray < btAlignedObjectArray <int> > linksForWavefronts;
-	btAlignedObjectArray < btAlignedObjectArray <int> > wavefrontBatches;
-	btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > batchesWithinWaves;
-	btAlignedObjectArray< btAlignedObjectArray< int > > verticesForWavefronts; // wavefronts, vertices in wavefront as an ordered set
-	// Group the links into wavefronts
-	computeBatchingIntoWavefronts( *this, m_wavefrontSize, m_linksPerWorkItem, m_maxLinksPerWavefront, linksForWavefronts, batchesWithinWaves, verticesForWavefronts );
-	// Batch the wavefronts
-	generateBatchesOfWavefronts( linksForWavefronts, *this, m_maxVertex, wavefrontBatches );
-	m_numWavefronts = linksForWavefronts.size();
-	// At this point we have a description of which links we need to process in each wavefront
-	// First correctly fill the batch ranges vector
-	int numBatches = wavefrontBatches.size();
-	m_wavefrontBatchStartLengths.resize(0);
-	int prefixSum = 0;
-	for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )
-	{
-		int wavesInBatch = wavefrontBatches[batchIndex].size();
-		int nextPrefixSum = prefixSum + wavesInBatch;
-		m_wavefrontBatchStartLengths.push_back( BatchPair( prefixSum, nextPrefixSum - prefixSum ) );
-		prefixSum += wavesInBatch;
-	}
-	// Also find max number of batches within a wave
-	m_maxBatchesWithinWave = 0;
-	m_maxVerticesWithinWave = 0;
-	m_numBatchesAndVerticesWithinWaves.resize( m_numWavefronts );
-	for( int waveIndex = 0; waveIndex < m_numWavefronts; ++waveIndex )
-	{
-		// See if the number of batches in this wave is greater than the current maxium
-		int batchesInCurrentWave = batchesWithinWaves[waveIndex].size();
-		int verticesInCurrentWave = verticesForWavefronts[waveIndex].size();
-		m_maxBatchesWithinWave = btMax( batchesInCurrentWave, m_maxBatchesWithinWave );
-		m_maxVerticesWithinWave = btMax( verticesInCurrentWave, m_maxVerticesWithinWave );
-	}
-	// Add padding values both for alignment and as dudd addresses within LDS to compute junk rather than branch around
-	m_maxVerticesWithinWave = 16*((m_maxVerticesWithinWave/16)+2);
-	// Now we know the maximum number of vertices per-wave we can resize the global vertices array
-	m_wavefrontVerticesGlobalAddresses.resize( m_maxVerticesWithinWave * m_numWavefronts );
-	// Grab backup copies of all the link data arrays for the sorting process
-	btAlignedObjectArray<btSoftBodyLinkData::LinkNodePair>				m_links_Backup(m_links);
-	btAlignedObjectArray<float>											m_linkStrength_Backup(m_linkStrength);
-	btAlignedObjectArray<float>											m_linksMassLSC_Backup(m_linksMassLSC);
-	btAlignedObjectArray<float>											m_linksRestLengthSquared_Backup(m_linksRestLengthSquared);
-	//btAlignedObjectArray<Vectormath::Aos::Vector3>						m_linksCLength_Backup(m_linksCLength);
-	//btAlignedObjectArray<float>											m_linksLengthRatio_Backup(m_linksLengthRatio);
-	btAlignedObjectArray<float>											m_linksRestLength_Backup(m_linksRestLength);
-	btAlignedObjectArray<float>											m_linksMaterialLinearStiffnessCoefficient_Backup(m_linksMaterialLinearStiffnessCoefficient);
-	// Resize to a wavefront sized batch per batch per wave so we get perfectly coherent memory accesses.
-	m_links.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linkVerticesLocalAddresses.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linkStrength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linksMassLSC.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linksRestLengthSquared.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linksRestLength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linksMaterialLinearStiffnessCoefficient.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );	
-	// Then re-order links into wavefront blocks
-	// Total number of wavefronts moved. This will decide the ordering of sorted wavefronts.
-	int wavefrontCount = 0;
-	// Iterate over batches of wavefronts, then wavefronts in the batch
-	for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )
-	{
-		btAlignedObjectArray <int> &batch( wavefrontBatches[batchIndex] );
-		int wavefrontsInBatch = batch.size();
-		for( int wavefrontIndex = 0; wavefrontIndex < wavefrontsInBatch; ++wavefrontIndex )
-		{	
-			int originalWavefrontIndex = batch[wavefrontIndex];
-			btAlignedObjectArray< int > &wavefrontVertices( verticesForWavefronts[originalWavefrontIndex] );
-			int verticesUsedByWavefront = wavefrontVertices.size();
-			// Copy the set of vertices into the correctly structured array for use on the device
-			// Fill the non-vertices with -1s
-			// so we can mask out those reads
-			for( int vertex = 0; vertex < verticesUsedByWavefront; ++vertex )
-			{
-				m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = wavefrontVertices[vertex];
-			}
-			for( int vertex = verticesUsedByWavefront; vertex < m_maxVerticesWithinWave; ++vertex )
-			{
-				m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = -1;
-			}
-			// Obtain the set of batches within the current wavefront
-			btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWavefront( batchesWithinWaves[originalWavefrontIndex] );
-			// Set the size of the batches for use in the solver, correctly ordered
-			NumBatchesVerticesPair batchesAndVertices;
-			batchesAndVertices.numBatches = batchesWithinWavefront.size();
-			batchesAndVertices.numVertices = verticesUsedByWavefront;
-			m_numBatchesAndVerticesWithinWaves[wavefrontCount] = batchesAndVertices;
-			// Now iterate over batches within the wavefront to structure the links correctly
-			for( int wavefrontBatch = 0; wavefrontBatch < batchesWithinWavefront.size(); ++wavefrontBatch )
-			{
-				btAlignedObjectArray <int> &linksInBatch( batchesWithinWavefront[wavefrontBatch] );
-				int wavefrontBatchSize = linksInBatch.size();
-				int batchAddressInTarget = m_maxBatchesWithinWave * m_wavefrontSize * wavefrontCount + m_wavefrontSize * wavefrontBatch;
-				for( int linkIndex = 0; linkIndex < wavefrontBatchSize; ++linkIndex )
-				{
-					int originalLinkAddress = linksInBatch[linkIndex];
-					// Reorder simple arrays trivially
-					m_links[batchAddressInTarget + linkIndex] = m_links_Backup[originalLinkAddress];
-					m_linkStrength[batchAddressInTarget + linkIndex] = m_linkStrength_Backup[originalLinkAddress];
-					m_linksMassLSC[batchAddressInTarget + linkIndex] = m_linksMassLSC_Backup[originalLinkAddress];
-					m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = m_linksRestLengthSquared_Backup[originalLinkAddress];
-					m_linksRestLength[batchAddressInTarget + linkIndex] = m_linksRestLength_Backup[originalLinkAddress];
-					m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = m_linksMaterialLinearStiffnessCoefficient_Backup[originalLinkAddress];
-					// The local address is more complicated. We need to work out where a given vertex will end up
-					// by searching the set of vertices for this link and using the index as the local address
-					btSoftBodyLinkData::LinkNodePair localPair;
-					btSoftBodyLinkData::LinkNodePair globalPair = m_links[batchAddressInTarget + linkIndex];
-					localPair.vertex0 = wavefrontVertices.findLinearSearch( globalPair.vertex0 );
-					localPair.vertex1 = wavefrontVertices.findLinearSearch( globalPair.vertex1 );
-					m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;
-				}
-				for( int linkIndex = wavefrontBatchSize; linkIndex < m_wavefrontSize; ++linkIndex )
-				{
-					// Put 0s into these arrays for padding for cleanliness
-					m_links[batchAddressInTarget + linkIndex] = btSoftBodyLinkData::LinkNodePair(0, 0);
-					m_linkStrength[batchAddressInTarget + linkIndex] = 0.f;
-					m_linksMassLSC[batchAddressInTarget + linkIndex] = 0.f;
-					m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = 0.f;
-					m_linksRestLength[batchAddressInTarget + linkIndex] = 0.f;
-					m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = 0.f;
-					// For local addresses of junk data choose a set of addresses just above the range of valid ones 
-					// and cycling tyhrough % 16 so that we don't have bank conficts between all dud addresses
-					// The valid addresses will do scatter and gather in the valid range, the junk ones should happily work
-					// off the end of that range so we need no control
-					btSoftBodyLinkData::LinkNodePair localPair;
-					localPair.vertex0 = verticesUsedByWavefront + (linkIndex % 16);
-					localPair.vertex1 = verticesUsedByWavefront + (linkIndex % 16);
-					m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;
-				}
-			}
-			wavefrontCount++;
-		}
-	}
-} // void btSoftBodyLinkDataDX11SIMDAware::generateBatches()
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h
deleted file mode 100644
index 34881973..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h
+++ /dev/null
@@ -1,81 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "vectormath/vmInclude.h"
-#include "btSoftBodySolver_DX11.h"
-#include "btSoftBodySolverVertexBuffer_DX11.h"
-#include "btSoftBodySolverLinkData_DX11SIMDAware.h"
-#include "btSoftBodySolverVertexData_DX11.h"
-#include "btSoftBodySolverTriangleData_DX11.h"
-class btDX11SIMDAwareSoftBodySolver : public btDX11SoftBodySolver
-	struct SolvePositionsFromLinksKernelCB
-	{		
-		int startWave;
-		int numWaves;
-		float kst;
-		float ti;
-	};
-	/** Link data for all cloths. Note that this will be sorted batch-wise for efficient computation and m_linkAddresses will maintain the addressing. */
-	btSoftBodyLinkDataDX11SIMDAware m_linkData;
-	/** Variable to define whether we need to update solver constants on the next iteration */
-	bool m_updateSolverConstants;
-	virtual bool buildShaders();
-	void updateConstants( float timeStep );
-	//////////////////////////////////////
-	// Kernel dispatches
-	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
-	// End kernel dispatches
-	/////////////////////////////////////
-	btDX11SIMDAwareSoftBodySolver(ID3D11Device * dx11Device, ID3D11DeviceContext* dx11Context, DXFunctions::CompileFromMemoryFunc dx11CompileFromMemory = &D3DX11CompileFromMemory);
-	virtual ~btDX11SIMDAwareSoftBodySolver();
-	virtual btSoftBodyLinkData &getLinkData();
-	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
-	virtual void solveConstraints( float solverdt );
-	virtual SolverTypes getSolverType() const
-	{
-		return DX_SIMD_SOLVER;
-	}
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
deleted file mode 100644
index dfa60e66..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <MiniCL/cl_MiniCL_Defs.h>
-#define MSTRINGIFY(A) A
-#include "../OpenCLC10/ApplyForces.cl"
-#include "../OpenCLC10/Integrate.cl"
-#include "../OpenCLC10/PrepareLinks.cl"
-#include "../OpenCLC10/SolvePositions.cl"
-#include "../OpenCLC10/UpdateNodes.cl"
-#include "../OpenCLC10/UpdateNormals.cl"
-#include "../OpenCLC10/UpdatePositions.cl"
-#include "../OpenCLC10/UpdatePositionsFromVelocities.cl"
-#include "../OpenCLC10/VSolveLinks.cl"
-#include "../OpenCLC10/UpdateFixedVertexPositions.cl"
-//#include "../OpenCLC10/SolveCollisionsAndUpdateVelocities.cl"
-float mydot3a(float4 a, float4 b)
-   return a.x*b.x + a.y*b.y + a.z*b.z;
-typedef struct 
-	int firstObject;
-	int endObject;
-} CollisionObjectIndices;
-typedef struct 
-	float4 shapeTransform[4]; // column major 4x4 matrix
-	float4 linearVelocity;
-	float4 angularVelocity;
-	int softBodyIdentifier;
-	int collisionShapeType;
-	// Shape information
-	// Compressed from the union
-	float radius;
-	float halfHeight;
-	int upAxis;
-	float margin;
-	float friction;
-	int padding0;
-} CollisionShapeDescription;
-// From btBroadphaseProxy.h
-__constant int CAPSULE_SHAPE_PROXYTYPE = 10;
-// Multiply column-major matrix against vector
-float4 matrixVectorMul( float4 matrix[4], float4 vector )
-	float4 returnVector;
-	float4 row0 = float4(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
-	float4 row1 = float4(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
-	float4 row2 = float4(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
-	float4 row3 = float4(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
-	returnVector.x = dot(row0, vector);
-	returnVector.y = dot(row1, vector);
-	returnVector.z = dot(row2, vector);
-	returnVector.w = dot(row3, vector);
-	return returnVector;
-__kernel void 
-	const int numNodes,
-	const float isolverdt,
-	__global int *g_vertexClothIdentifier,
-	__global float4 *g_vertexPreviousPositions,
-	__global float * g_perClothFriction,
-	__global float * g_clothDampingFactor,
-	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
-	__global CollisionShapeDescription * g_collisionObjectDetails,
-	__global float4 * g_vertexForces,
-	__global float4 *g_vertexVelocities,
-	__global float4 *g_vertexPositions GUID_ARG)
-	int nodeID = get_global_id(0);
-	float4 forceOnVertex = (float4)(0.f, 0.f, 0.f, 0.f);
-	if( get_global_id(0) < numNodes )
-	{	
-		int clothIdentifier = g_vertexClothIdentifier[nodeID];
-		// Abort if this is not a valid cloth
-		if( clothIdentifier < 0 )
-			return;
-		float4 position (g_vertexPositions[nodeID].xyz, 1.f);
-		float4 previousPosition (g_vertexPreviousPositions[nodeID].xyz, 1.f);
-		float clothFriction = g_perClothFriction[clothIdentifier];
-		float dampingFactor = g_clothDampingFactor[clothIdentifier];
-		float velocityCoefficient = (1.f - dampingFactor);		
-		float4 difference = position - previousPosition;
-		float4 velocity = difference*velocityCoefficient*isolverdt;
-		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
-		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
-		if( numObjects > 0 )
-		{
-			// We have some possible collisions to deal with
-			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
-			{
-				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
-				float colliderFriction = shapeDescription.friction;
-				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
-				{
-					// Colliding with a capsule
-					float capsuleHalfHeight = shapeDescription.halfHeight;
-					float capsuleRadius = shapeDescription.radius;
-					float capsuleMargin = shapeDescription.margin;
-					int capsuleupAxis = shapeDescription.upAxis;
-					// Four columns of worldTransform matrix
-					float4 worldTransform[4];
-					worldTransform[0] = shapeDescription.shapeTransform[0];
-					worldTransform[1] = shapeDescription.shapeTransform[1];
-					worldTransform[2] = shapeDescription.shapeTransform[2];
-					worldTransform[3] = shapeDescription.shapeTransform[3];
-					// Correctly define capsule centerline vector 
-					float4 c1 (0.f, 0.f, 0.f, 1.f); 
-					float4 c2 (0.f, 0.f, 0.f, 1.f);
-					c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
-					c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
-					c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
-					c2.x = -c1.x;
-					c2.y = -c1.y;
-					c2.z = -c1.z;
-					float4 worldC1 = matrixVectorMul(worldTransform, c1);
-					float4 worldC2 = matrixVectorMul(worldTransform, c2);
-					float4 segment = (worldC2 - worldC1);
-					// compute distance of tangent to vertex along line segment in capsule
-					float distanceAlongSegment = -( mydot3a( (worldC1 - position), segment ) / mydot3a(segment, segment) );
-					float4 closestPoint = (worldC1 + (segment * distanceAlongSegment));
-					float distanceFromLine = length(position - closestPoint);
-					float distanceFromC1 = length(worldC1 - position);
-					float distanceFromC2 = length(worldC2 - position);
-					// Final distance from collision, point to push from, direction to push in
-					// for impulse force
-					float dist;
-					float4 normalVector;
-					if( distanceAlongSegment < 0 )
-					{
-						dist = distanceFromC1;
-						normalVector = float4(normalize(position - worldC1).xyz, 0.f);
-					} else if( distanceAlongSegment > 1.f ) {
-						dist = distanceFromC2;
-						normalVector = float4(normalize(position - worldC2).xyz, 0.f);	
-					} else {
-						dist = distanceFromLine;
-						normalVector = float4(normalize(position - closestPoint).xyz, 0.f);
-					}
-					float4 colliderLinearVelocity = shapeDescription.linearVelocity;
-					float4 colliderAngularVelocity = shapeDescription.angularVelocity;
-					float4 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position - float4(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w, 0.f));
-					float minDistance = capsuleRadius + capsuleMargin;
-					// In case of no collision, this is the value of velocity
-					velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
-					// Check for a collision
-					if( dist < minDistance )
-					{
-						// Project back to surface along normal
-						position = position + float4(normalVector*(minDistance - dist)*0.9f);
-						velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
-						float4 relativeVelocity = velocity - velocityOfSurfacePoint;
-						float4 p1 = normalize(cross(normalVector, segment));
-						float4 p2 = normalize(cross(p1, normalVector));
-						// Full friction is sum of velocities in each direction of plane
-						float4 frictionVector = p1*mydot3a(relativeVelocity, p1) + p2*mydot3a(relativeVelocity, p2);
-						// Real friction is peak friction corrected by friction coefficients
-						frictionVector = frictionVector * (colliderFriction*clothFriction);
-						float approachSpeed = dot(relativeVelocity, normalVector);
-						if( approachSpeed <= 0.0f )
-							forceOnVertex -= frictionVector;
-					}
-				}
-			}
-		}
-		g_vertexVelocities[nodeID] = float4(velocity.xyz, 0.f);	
-		// Update external force
-		g_vertexForces[nodeID] = float4(forceOnVertex.xyz, 0.f);
-		g_vertexPositions[nodeID] = float4(position.xyz, 0.f);
-	}
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
deleted file mode 100644
index f824f281..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
+++ /dev/null
@@ -1,209 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-// OpenCL support
-#ifdef USE_MINICL
-	#include "MiniCL/cl.h"
-#else //USE_MINICL
-	#ifdef __APPLE__
-		#include <OpenCL/OpenCL.h>
-	#else
-		#include <CL/cl.h>
-	#endif //__APPLE__
-#define SAFE_RELEASE(p)      { if(p) { (p)->Release(); (p)=NULL; } }
-template <typename ElementType> class btOpenCLBuffer
-	cl_command_queue	m_cqCommandQue;
-	cl_context			m_clContext;
-	cl_mem				m_buffer;
-	btAlignedObjectArray< ElementType > * m_CPUBuffer;
-	int  m_gpuSize;
-	bool m_onGPU;
-	bool m_readOnlyOnGPU;
-	bool m_allocated;
-	bool createBuffer( cl_mem* preexistingBuffer = 0)
-	{
-		cl_int err;
-		if( preexistingBuffer )
-		{
-			m_buffer = *preexistingBuffer;
-		} 
-		else {
-			cl_mem_flags flags= m_readOnlyOnGPU ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
-			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
-			// At a minimum the buffer must exist
-			if( size == 0 )
-				size = sizeof(ElementType);
-			m_buffer = clCreateBuffer(m_clContext, flags, size, 0, &err);
-			if( err != CL_SUCCESS )
-			{
-				btAssert( "Buffer::Buffer(m_buffer)");
-			}
-		}
-		m_gpuSize = m_CPUBuffer->size();
-		return true;
-	}
-	btOpenCLBuffer( cl_command_queue	commandQue,cl_context ctx, btAlignedObjectArray< ElementType >* CPUBuffer, bool readOnly)
-		:m_cqCommandQue(commandQue),
-		m_clContext(ctx),
-		m_buffer(0),
-		m_CPUBuffer(CPUBuffer),
-		m_gpuSize(0),
-		m_onGPU(false),
-		m_readOnlyOnGPU(readOnly),
-		m_allocated(false)
-	{
-	}
-	~btOpenCLBuffer()
-	{
-		clReleaseMemObject(m_buffer);
-	}
-	bool moveToGPU()
-	{
-		cl_int err;
-		if( (m_CPUBuffer->size() != m_gpuSize) )
-		{
-			m_onGPU = false;
-		}
-		if( !m_allocated && m_CPUBuffer->size() == 0  )
-		{
-			// If it isn't on the GPU and yet there is no data on the CPU side this may cause a problem with some kernels.
-			// We should create *something* on the device side
-			if (!createBuffer()) {
-				return false;
-			}
-			m_allocated = true;
-		}
-		if( !m_onGPU && m_CPUBuffer->size() > 0 )
-		{
-			if (!m_allocated || (m_CPUBuffer->size() != m_gpuSize)) {
-				if (!createBuffer()) {
-					return false;
-				}
-				m_allocated = true;
-			}
-			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
-			err = clEnqueueWriteBuffer(m_cqCommandQue,m_buffer,
-				CL_FALSE,
-				0,
-				size, 
-				&((*m_CPUBuffer)[0]),0,0,0);
-			if( err != CL_SUCCESS )
-			{
-				btAssert( "CommandQueue::enqueueWriteBuffer(m_buffer)" );
-			}
-			m_onGPU = true;
-		}
-		return true;
-	}
-	bool moveFromGPU()
-	{
-		cl_int err;
-		if (m_CPUBuffer->size() > 0) {
-			if (m_onGPU && !m_readOnlyOnGPU) {
-				size_t size = m_CPUBuffer->size() * sizeof(ElementType);
-				err = clEnqueueReadBuffer(m_cqCommandQue,
-					m_buffer,
-					CL_TRUE,
-					0,
-					size,
-					&((*m_CPUBuffer)[0]),0,0,0);
-				if( err != CL_SUCCESS )
-				{
-					btAssert( "CommandQueue::enqueueReadBuffer(m_buffer)" );
-				}
-				m_onGPU = false;
-			}
-		}
-		return true;
-	}
-	bool copyFromGPU()
-	{
-		cl_int err;
-		size_t size = m_CPUBuffer->size() * sizeof(ElementType);
-		if (m_CPUBuffer->size() > 0) {
-			if (m_onGPU && !m_readOnlyOnGPU) {
-				err = clEnqueueReadBuffer(m_cqCommandQue,
-					m_buffer,
-					CL_TRUE,
-					0,size, 
-					&((*m_CPUBuffer)[0]),0,0,0);
-				if( err != CL_SUCCESS )
-				{
-					btAssert( "CommandQueue::enqueueReadBuffer(m_buffer)");
-				}
-			}
-		}
-		return true;
-	}
-	virtual void changedOnCPU()
-	{
-		m_onGPU = false;
-	}
-}; // class btOpenCLBuffer
\ No newline at end of file
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
deleted file mode 100644
index 6921f7da..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
+++ /dev/null
@@ -1,99 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
-#include "btSoftBodySolverBuffer_OpenCL.h"
-class btSoftBodyLinkDataOpenCL : public btSoftBodyLinkData
-	bool				m_onGPU;
-	cl_command_queue	m_cqCommandQue;
-	btOpenCLBuffer<LinkNodePair> m_clLinks;
-	btOpenCLBuffer<float>							      m_clLinkStrength;
-	btOpenCLBuffer<float>								  m_clLinksMassLSC;
-	btOpenCLBuffer<float>								  m_clLinksRestLengthSquared;
-	btOpenCLBuffer<Vectormath::Aos::Vector3>			  m_clLinksCLength;
-	btOpenCLBuffer<float>								  m_clLinksLengthRatio;
-	btOpenCLBuffer<float>								  m_clLinksRestLength;
-	btOpenCLBuffer<float>								  m_clLinksMaterialLinearStiffnessCoefficient;
-	struct BatchPair
-	{
-		int start;
-		int length;
-		BatchPair() :
-			start(0),
-			length(0)
-		{
-		}
-		BatchPair( int s, int l ) : 
-			start( s ),
-			length( l )
-		{
-		}
-	};
-	/**
-	 * Link addressing information for each cloth.
-	 * Allows link locations to be computed independently of data batching.
-	 */
-	btAlignedObjectArray< int >							m_linkAddresses;
-	/**
-	 * Start and length values for computation batches over link data.
-	 */
-	btAlignedObjectArray< BatchPair >		m_batchStartLengths;
-	btSoftBodyLinkDataOpenCL(cl_command_queue queue, cl_context ctx);
-	virtual ~btSoftBodyLinkDataOpenCL();
-	/** Allocate enough space in all link-related arrays to fit numLinks links */
-	virtual void createLinks( int numLinks );
-	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-	virtual void setLinkAt( 
-		const LinkDescription &link, 
-		int linkIndex );
-	virtual bool onAccelerator();
-	virtual bool moveToAccelerator();
-	virtual bool moveFromAccelerator();
-	/**
-	 * Generate (and later update) the batching for the entire link set.
-	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
-	 * In theory we could delay it until just before we need the cloth.
-	 * It's a one-off overhead, though, so that is a later optimisation.
-	 */
-	void generateBatches();
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h
deleted file mode 100644
index b20e8055..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h
+++ /dev/null
@@ -1,169 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
-#include "btSoftBodySolverBuffer_OpenCL.h"
-class btSoftBodyLinkDataOpenCLSIMDAware : public btSoftBodyLinkData
-	bool				m_onGPU;
-	cl_command_queue	m_cqCommandQue;
-	const int m_wavefrontSize;
-	const int m_linksPerWorkItem;
-	const int m_maxLinksPerWavefront;
-	int m_maxBatchesWithinWave;
-	int m_maxVerticesWithinWave;
-	int m_numWavefronts;
-	int m_maxVertex;
-	struct NumBatchesVerticesPair
-	{
-		int numBatches;
-		int numVertices;
-	};
-	btAlignedObjectArray<int>							  m_linksPerWavefront;
-	btAlignedObjectArray<NumBatchesVerticesPair>		  m_numBatchesAndVerticesWithinWaves;
-	btOpenCLBuffer< NumBatchesVerticesPair >			  m_clNumBatchesAndVerticesWithinWaves;
-	// All arrays here will contain batches of m_maxLinksPerWavefront links
-	// ordered by wavefront.
-	// with either global vertex pairs or local vertex pairs
-	btAlignedObjectArray< int >							  m_wavefrontVerticesGlobalAddresses; // List of global vertices per wavefront
-	btOpenCLBuffer<int>									  m_clWavefrontVerticesGlobalAddresses;
-	btAlignedObjectArray< LinkNodePair >				  m_linkVerticesLocalAddresses; // Vertex pair for the link
-	btOpenCLBuffer<LinkNodePair>						  m_clLinkVerticesLocalAddresses;
-	btOpenCLBuffer<float>							      m_clLinkStrength;
-	btOpenCLBuffer<float>								  m_clLinksMassLSC;
-	btOpenCLBuffer<float>								  m_clLinksRestLengthSquared;
-	btOpenCLBuffer<float>								  m_clLinksRestLength;
-	btOpenCLBuffer<float>								  m_clLinksMaterialLinearStiffnessCoefficient;
-	struct BatchPair
-	{
-		int start;
-		int length;
-		BatchPair() :
-			start(0),
-			length(0)
-		{
-		}
-		BatchPair( int s, int l ) : 
-			start( s ),
-			length( l )
-		{
-		}
-	};
-	/**
-	 * Link addressing information for each cloth.
-	 * Allows link locations to be computed independently of data batching.
-	 */
-	btAlignedObjectArray< int >							m_linkAddresses;
-	/**
-	 * Start and length values for computation batches over link data.
-	 */
-	btAlignedObjectArray< BatchPair >		m_wavefrontBatchStartLengths;
-	btSoftBodyLinkDataOpenCLSIMDAware(cl_command_queue queue, cl_context ctx);
-	virtual ~btSoftBodyLinkDataOpenCLSIMDAware();
-	/** Allocate enough space in all link-related arrays to fit numLinks links */
-	virtual void createLinks( int numLinks );
-	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-	virtual void setLinkAt( 
-		const LinkDescription &link, 
-		int linkIndex );
-	virtual bool onAccelerator();
-	virtual bool moveToAccelerator();
-	virtual bool moveFromAccelerator();
-	/**
-	 * Generate (and later update) the batching for the entire link set.
-	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
-	 * In theory we could delay it until just before we need the cloth.
-	 * It's a one-off overhead, though, so that is a later optimisation.
-	 */
-	void generateBatches();
-	int getMaxVerticesPerWavefront()
-	{
-		return m_maxVerticesWithinWave;
-	}
-	int getWavefrontSize()
-	{
-		return m_wavefrontSize;
-	}
-	int getLinksPerWorkItem()
-	{
-		return m_linksPerWorkItem;
-	}
-	int getMaxLinksPerWavefront()
-	{
-		return m_maxLinksPerWavefront;
-	}
-	int getMaxBatchesPerWavefront()
-	{
-		return m_maxBatchesWithinWave;
-	}
-	int getNumWavefronts()
-	{
-		return m_numWavefronts;
-	}
-	NumBatchesVerticesPair getNumBatchesAndVerticesWithinWavefront( int wavefront )
-	{
-		return m_numBatchesAndVerticesWithinWaves[wavefront];
-	}
-	int getVertexGlobalAddresses( int vertexIndex )
-	{
-		return m_wavefrontVerticesGlobalAddresses[vertexIndex];
-	}
-	/**
-	 * Get post-batching local addresses of the vertex pair for a link assuming all vertices used by a wavefront are loaded locally.
-	 */
-	LinkNodePair getVertexPairLocalAddresses( int linkIndex )
-	{
-		return m_linkVerticesLocalAddresses[linkIndex];
-	}
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
deleted file mode 100644
index 1000440b..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "btSoftBodySolverOutputCLtoGL.h"
-#include <stdio.h> //@todo: remove the debugging printf at some stage
-#include "btSoftBodySolver_OpenCL.h"
-#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
-#include "btSoftBodySolverVertexBuffer_OpenGL.h"
-#include "BulletSoftBody/btSoftBody.h"
-////OpenCL 1.0 kernels don't use float3
-#define MSTRINGIFY(A) #A
-static char* OutputToVertexArrayCLString =
-#include "OpenCLC10/OutputToVertexArray.cl"
-#define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }}
-static const size_t workGroupSize = 128;
-void btSoftBodySolverOutputCLtoGL::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer )
-	btSoftBodySolver *solver = softBody->getSoftBodySolver();
-	btAssert( solver->getSolverType() == btSoftBodySolver::CL_SOLVER || solver->getSolverType() == btSoftBodySolver::CL_SIMD_SOLVER );
-	btOpenCLSoftBodySolver *dxSolver = static_cast< btOpenCLSoftBodySolver * >( solver );
-	checkInitialized();
-	btOpenCLAcceleratedSoftBodyInterface* currentCloth = dxSolver->findSoftBodyInterface( softBody );
-	btSoftBodyVertexDataOpenCL &vertexData( dxSolver->m_vertexData );	
-	const int firstVertex = currentCloth->getFirstVertex();
-	const int lastVertex = firstVertex + currentCloth->getNumVertices();
-	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::OPENGL_BUFFER ) {		
-		const btOpenGLInteropVertexBufferDescriptor *openGLVertexBuffer = static_cast< btOpenGLInteropVertexBufferDescriptor* >(vertexBuffer);						
-		cl_int ciErrNum = CL_SUCCESS;    
-		cl_mem clBuffer = openGLVertexBuffer->getBuffer();		
-		cl_kernel outputKernel = outputToVertexArrayWithNormalsKernel;
-		if( !vertexBuffer->hasNormals() )
-			outputKernel = outputToVertexArrayWithoutNormalsKernel;
-		ciErrNum = clEnqueueAcquireGLObjects(m_cqCommandQue, 1, &clBuffer, 0, 0, NULL);
-		if( ciErrNum != CL_SUCCESS )
-		{
-			btAssert( 0 &&  "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)");
-		}
-		int numVertices = currentCloth->getNumVertices();
-		ciErrNum = clSetKernelArg(outputKernel, 0, sizeof(int), &firstVertex );
-		ciErrNum = clSetKernelArg(outputKernel, 1, sizeof(int), &numVertices );
-		ciErrNum = clSetKernelArg(outputKernel, 2, sizeof(cl_mem), (void*)&clBuffer );
-		if( vertexBuffer->hasVertexPositions() )
-		{
-			int vertexOffset = vertexBuffer->getVertexOffset();
-			int vertexStride = vertexBuffer->getVertexStride();
-			ciErrNum = clSetKernelArg(outputKernel, 3, sizeof(int), &vertexOffset );
-			ciErrNum = clSetKernelArg(outputKernel, 4, sizeof(int), &vertexStride );
-			ciErrNum = clSetKernelArg(outputKernel, 5, sizeof(cl_mem), (void*)&vertexData.m_clVertexPosition.m_buffer );
-		}
-		if( vertexBuffer->hasNormals() )
-		{
-			int normalOffset = vertexBuffer->getNormalOffset();
-			int normalStride = vertexBuffer->getNormalStride();
-			ciErrNum = clSetKernelArg(outputKernel, 6, sizeof(int), &normalOffset );
-			ciErrNum = clSetKernelArg(outputKernel, 7, sizeof(int), &normalStride );
-			ciErrNum = clSetKernelArg(outputKernel, 8, sizeof(cl_mem), (void*)&vertexData.m_clVertexNormal.m_buffer );
-		}
-		size_t	numWorkItems = workGroupSize*((vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, outputKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
-		if( ciErrNum != CL_SUCCESS ) 
-		{
-			btAssert( 0 &&  "enqueueNDRangeKernel(copySoftBodyToVertexBuffer)");
-		}
-		ciErrNum = clEnqueueReleaseGLObjects(m_cqCommandQue, 1, &clBuffer, 0, 0, 0);
-		if( ciErrNum != CL_SUCCESS )
-		{
-			btAssert( 0 &&  "clEnqueueReleaseGLObjects(copySoftBodyToVertexBuffer)");
-		}
-	} else {
-		btAssert( "Undefined output for this solver output" == false );
-	}
-	// clFinish in here may not be the best thing. It's possible that we should have a waitForFrameComplete function.
-	clFinish(m_cqCommandQue);
-} // btSoftBodySolverOutputCLtoGL::outputToVertexBuffers
-bool btSoftBodySolverOutputCLtoGL::buildShaders()
-	// Ensure current kernels are released first
-	releaseKernels();
-	bool returnVal = true;
-	if( m_shadersInitialized )
-		return true;
-	outputToVertexArrayWithNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithNormalsKernel" ,"","OpenCLC10/OutputToVertexArray.cl");
-	outputToVertexArrayWithoutNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithoutNormalsKernel" ,"","OpenCLC10/OutputToVertexArray.cl");
-	if( returnVal )
-		m_shadersInitialized = true;
-	return returnVal;
-} // btSoftBodySolverOutputCLtoGL::buildShaders
-void btSoftBodySolverOutputCLtoGL::releaseKernels()
-	RELEASE_CL_KERNEL( outputToVertexArrayWithNormalsKernel );
-	RELEASE_CL_KERNEL( outputToVertexArrayWithoutNormalsKernel );
-	m_shadersInitialized = false;
-} // btSoftBodySolverOutputCLtoGL::releaseKernels
-bool btSoftBodySolverOutputCLtoGL::checkInitialized()
-	if( !m_shadersInitialized )
-		if( buildShaders() )
-			m_shadersInitialized = true;
-	return m_shadersInitialized;
\ No newline at end of file
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h
deleted file mode 100644
index ab3ea264..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h
+++ /dev/null
@@ -1,62 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "btSoftBodySolver_OpenCL.h"
- * Class to manage movement of data from a solver to a given target.
- * This version is the CL to GL interop version.
- */
-class btSoftBodySolverOutputCLtoGL : public btSoftBodySolverOutput
-	cl_command_queue	m_cqCommandQue;
-	cl_context			m_cxMainContext;
-	CLFunctions			clFunctions;
-	cl_kernel		outputToVertexArrayWithNormalsKernel;
-	cl_kernel		outputToVertexArrayWithoutNormalsKernel;
-	bool m_shadersInitialized;
-	virtual bool checkInitialized();	
-	virtual bool buildShaders();
-	void releaseKernels();
-	btSoftBodySolverOutputCLtoGL(cl_command_queue cqCommandQue, cl_context cxMainContext) :
-		m_cqCommandQue( cqCommandQue ),
-		m_cxMainContext( cxMainContext ),
-		clFunctions(cqCommandQue, cxMainContext),
-		outputToVertexArrayWithNormalsKernel( 0 ),
-		outputToVertexArrayWithoutNormalsKernel( 0 ),
-		m_shadersInitialized( false )
-	{
-	}
-	virtual ~btSoftBodySolverOutputCLtoGL()
-	{
-		releaseKernels();
-	}
-	/** Output current computed vertex data to the vertex buffers for all cloths in the solver. */
-	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
\ No newline at end of file
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
deleted file mode 100644
index 7e376785..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
+++ /dev/null
@@ -1,84 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
-#include "btSoftBodySolverBuffer_OpenCL.h"
-class btSoftBodyTriangleDataOpenCL : public btSoftBodyTriangleData
-	bool				m_onGPU;
-	cl_command_queue    m_queue;
-	btOpenCLBuffer<btSoftBodyTriangleData::TriangleNodeSet>					m_clVertexIndices;
-	btOpenCLBuffer<float>								m_clArea;
-	btOpenCLBuffer<Vectormath::Aos::Vector3>			m_clNormal;
-	/**
-	 * Link addressing information for each cloth.
-	 * Allows link locations to be computed independently of data batching.
-	 */
-	btAlignedObjectArray< int >							m_triangleAddresses;
-	/**
-	 * Start and length values for computation batches over link data.
-	 */
-	struct btSomePair
-	{
-		btSomePair() {}
-		btSomePair(int f,int s)
-			:first(f),second(s)
-		{
-		}
-		int first;
-		int second;
-	};
-	btAlignedObjectArray< btSomePair >		m_batchStartLengths;
-	btSoftBodyTriangleDataOpenCL( cl_command_queue queue, cl_context ctx );
-	virtual ~btSoftBodyTriangleDataOpenCL();
-	/** Allocate enough space in all link-related arrays to fit numLinks links */
-	virtual void createTriangles( int numTriangles );
-	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-	virtual void setTriangleAt( const btSoftBodyTriangleData::TriangleDescription &triangle, int triangleIndex );
-	virtual bool onAccelerator();
-	virtual bool moveToAccelerator();
-	virtual bool moveFromAccelerator();
-	/**
-	 * Generate (and later update) the batching for the entire triangle set.
-	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
-	 * In theory we could delay it until just before we need the cloth.
-	 * It's a one-off overhead, though, so that is a later optimisation.
-	 */
-	void generateBatches();
-}; // class btSoftBodyTriangleDataOpenCL
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h
deleted file mode 100644
index 7c223ecc..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h
+++ /dev/null
@@ -1,166 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
-#ifdef USE_MINICL
-	#include "MiniCL/cl.h"
-#else //USE_MINICL
-	#ifdef __APPLE__
-		#include <OpenCL/OpenCL.h>
-	#else
-		#include <CL/cl.h>
-		#include <CL/cl_gl.h>
-	#endif //__APPLE__
-#ifdef _WIN32//for glut.h
-#include <windows.h>
-//think different
-#if defined(__APPLE__) && !defined (VMDMESA)
-#include <OpenGL/OpenGL.h>
-#include <OpenGL/gl.h>
-#include <OpenGL/glu.h>
-#include <GLUT/glut.h>
-#ifdef _WINDOWS
-#include <windows.h>
-#include <GL/gl.h>
-#include <GL/glu.h>
-#include <GL/glut.h>
-#endif //_WINDOWS
-#endif //APPLE
-class btOpenGLInteropVertexBufferDescriptor : public btVertexBufferDescriptor
-	/** OpenCL context */
-	cl_context			m_context;
-	/** OpenCL command queue */
-	cl_command_queue	m_commandQueue;
-	/** OpenCL interop buffer */
-	cl_mem m_buffer;
-	/** VBO in GL that is the basis of the interop buffer */
-	GLuint m_openGLVBO;
-	/**
-	 * context is the OpenCL context this interop buffer will work in.
-	 * queue is the command queue that kernels and data movement will be enqueued into.
-	 * openGLVBO is the OpenGL vertex buffer data will be copied into.
-	 * vertexOffset is the offset in floats to the first vertex.
-	 * vertexStride is the stride in floats between vertices.
-	 */
-	btOpenGLInteropVertexBufferDescriptor( cl_command_queue cqCommandQue, cl_context context, GLuint openGLVBO, int vertexOffset, int vertexStride )
-	{
-#ifndef USE_MINICL
-		cl_int ciErrNum = CL_SUCCESS;
-		m_context = context;
-		m_commandQueue = cqCommandQue;
-		m_vertexOffset = vertexOffset;
-		m_vertexStride = vertexStride;
-		m_openGLVBO = openGLVBO;
-		m_buffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, openGLVBO, &ciErrNum);
-		if( ciErrNum != CL_SUCCESS )
-		{
-			btAssert( 0 &&  "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)");
-		}
-		m_hasVertexPositions = true;
-		btAssert(0);//MiniCL shouldn't get here
-	}
-	/**
-	 * context is the OpenCL context this interop buffer will work in.
-	 * queue is the command queue that kernels and data movement will be enqueued into.
-	 * openGLVBO is the OpenGL vertex buffer data will be copied into.
-	 * vertexOffset is the offset in floats to the first vertex.
-	 * vertexStride is the stride in floats between vertices.
-	 * normalOffset is the offset in floats to the first normal.
-	 * normalStride is the stride in floats between normals.
-	 */
-	btOpenGLInteropVertexBufferDescriptor( cl_command_queue cqCommandQue, cl_context context, GLuint openGLVBO, int vertexOffset, int vertexStride, int normalOffset, int normalStride )
-	{
-#ifndef USE_MINICL
-		cl_int ciErrNum = CL_SUCCESS;
-		m_context = context;
-		m_commandQueue = cqCommandQue;
-		m_openGLVBO = openGLVBO;
-		m_buffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, openGLVBO, &ciErrNum);
-		if( ciErrNum != CL_SUCCESS )
-		{
-			btAssert( 0 &&  "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)");
-		}
-		m_vertexOffset = vertexOffset;
-		m_vertexStride = vertexStride;
-		m_hasVertexPositions = true;
-		m_normalOffset = normalOffset;
-		m_normalStride = normalStride;
-		m_hasNormals = true;
-		btAssert(0);
-#endif //USE_MINICL
-	}
-	virtual ~btOpenGLInteropVertexBufferDescriptor()
-	{
-		clReleaseMemObject( m_buffer );
-	}
-	/**
-	 * Return the type of the vertex buffer descriptor.
-	 */
-	virtual BufferTypes getBufferType() const
-	{
-		return OPENGL_BUFFER;
-	}
-	virtual cl_context getContext() const
-	{
-		return m_context;
-	}
-	virtual cl_mem getBuffer() const
-	{
-		return m_buffer;
-	}	
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
deleted file mode 100644
index 531c3427..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
+++ /dev/null
@@ -1,52 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
-#include "btSoftBodySolverBuffer_OpenCL.h"
-class btSoftBodyVertexDataOpenCL : public btSoftBodyVertexData
-	bool		m_onGPU;
-	cl_command_queue	m_queue;
-	btOpenCLBuffer<int>									m_clClothIdentifier;
-	btOpenCLBuffer<Vectormath::Aos::Point3>				m_clVertexPosition;
-	btOpenCLBuffer<Vectormath::Aos::Point3>				m_clVertexPreviousPosition;
-	btOpenCLBuffer<Vectormath::Aos::Vector3>				m_clVertexVelocity;
-	btOpenCLBuffer<Vectormath::Aos::Vector3>				m_clVertexForceAccumulator;
-	btOpenCLBuffer<Vectormath::Aos::Vector3>				m_clVertexNormal;
-	btOpenCLBuffer<float>									m_clVertexInverseMass;
-	btOpenCLBuffer<float>									m_clVertexArea;
-	btOpenCLBuffer<int>									m_clVertexTriangleCount;
-	btSoftBodyVertexDataOpenCL( cl_command_queue queue,  cl_context ctx);
-	virtual ~btSoftBodyVertexDataOpenCL();
-	virtual bool onAccelerator();
-	virtual bool moveToAccelerator();
-	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true);
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
deleted file mode 100644
index f84448a6..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
+++ /dev/null
@@ -1,1820 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
-#include "vectormath/vmInclude.h"
-#include <stdio.h> //@todo: remove the debugging printf at some stage
-#include "btSoftBodySolver_OpenCL.h"
-#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
-#include "BulletSoftBody/btSoftBody.h"
-#include "BulletSoftBody/btSoftBodyInternals.h"
-#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
-#include "BulletCollision/CollisionShapes/btSphereShape.h"
-#include "LinearMath/btQuickprof.h"
-#include <limits.h>
-#ifdef USE_MINICL
-	#include "MiniCL/cl.h"
-#else //USE_MINICL
-	#ifdef __APPLE__
-		#include <OpenCL/OpenCL.h>
-	#else
-		#include <CL/cl.h>
-	#endif //__APPLE__
-#define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }}
-//CL_VERSION_1_1 seems broken on NVidia SDK so just disable it
-////OpenCL 1.0 kernels don't use float3
-#define MSTRINGIFY(A) #A
-static const char* PrepareLinksCLString = 
-#include "OpenCLC10/PrepareLinks.cl"
-static const char* UpdatePositionsFromVelocitiesCLString = 
-#include "OpenCLC10/UpdatePositionsFromVelocities.cl"
-static const char* SolvePositionsCLString = 
-#include "OpenCLC10/SolvePositions.cl"
-static const char* UpdateNodesCLString = 
-#include "OpenCLC10/UpdateNodes.cl"
-static const char* UpdatePositionsCLString = 
-#include "OpenCLC10/UpdatePositions.cl"
-static const char* UpdateConstantsCLString = 
-#include "OpenCLC10/UpdateConstants.cl"
-static const char* IntegrateCLString = 
-#include "OpenCLC10/Integrate.cl"
-static const char* ApplyForcesCLString = 
-#include "OpenCLC10/ApplyForces.cl"
-static const char* UpdateFixedVertexPositionsCLString = 
-#include "OpenCLC10/UpdateFixedVertexPositions.cl"
-static const char* UpdateNormalsCLString = 
-#include "OpenCLC10/UpdateNormals.cl"
-static const char* VSolveLinksCLString = 
-#include "OpenCLC10/VSolveLinks.cl"
-static const char* SolveCollisionsAndUpdateVelocitiesCLString =
-#include "OpenCLC10/SolveCollisionsAndUpdateVelocities.cl"
-btSoftBodyVertexDataOpenCL::btSoftBodyVertexDataOpenCL( cl_command_queue queue, cl_context ctx) :
-    m_queue(queue),
-	m_clClothIdentifier( queue, ctx, &m_clothIdentifier, false ),
-	m_clVertexPosition( queue, ctx, &m_vertexPosition, false ),
-	m_clVertexPreviousPosition( queue, ctx, &m_vertexPreviousPosition, false ),
-	m_clVertexVelocity( queue, ctx, &m_vertexVelocity, false ),
-	m_clVertexForceAccumulator( queue, ctx, &m_vertexForceAccumulator, false ),
-	m_clVertexNormal( queue, ctx, &m_vertexNormal, false ),
-	m_clVertexInverseMass( queue, ctx, &m_vertexInverseMass, false ),
-	m_clVertexArea( queue, ctx, &m_vertexArea, false ),
-	m_clVertexTriangleCount( queue, ctx, &m_vertexTriangleCount, false )
-bool btSoftBodyVertexDataOpenCL::onAccelerator()
-	return m_onGPU;
-bool btSoftBodyVertexDataOpenCL::moveToAccelerator()
-	bool success = true;
-	success = success && m_clClothIdentifier.moveToGPU();
-	success = success && m_clVertexPosition.moveToGPU();
-	success = success && m_clVertexPreviousPosition.moveToGPU();
-	success = success && m_clVertexVelocity.moveToGPU();
-	success = success && m_clVertexForceAccumulator.moveToGPU();
-	success = success && m_clVertexNormal.moveToGPU();
-	success = success && m_clVertexInverseMass.moveToGPU();
-	success = success && m_clVertexArea.moveToGPU();
-	success = success && m_clVertexTriangleCount.moveToGPU();
-	if( success )
-		m_onGPU = true;
-	return success;
-bool btSoftBodyVertexDataOpenCL::moveFromAccelerator(bool bCopy, bool bCopyMinimum)
-	bool success = true;
-	if (!bCopy)
-	{
-		success = success && m_clClothIdentifier.moveFromGPU();
-		success = success && m_clVertexPosition.moveFromGPU();
-		success = success && m_clVertexPreviousPosition.moveFromGPU();
-		success = success && m_clVertexVelocity.moveFromGPU();
-		success = success && m_clVertexForceAccumulator.moveFromGPU();
-		success = success && m_clVertexNormal.moveFromGPU();
-		success = success && m_clVertexInverseMass.moveFromGPU();
-		success = success && m_clVertexArea.moveFromGPU();
-		success = success && m_clVertexTriangleCount.moveFromGPU();
-	}
-	else
-	{
-		if (bCopyMinimum)
-		{
-			success = success && m_clVertexPosition.copyFromGPU();
-			success = success && m_clVertexNormal.copyFromGPU();
-		}
-		else
-		{
-			success = success && m_clClothIdentifier.copyFromGPU();
-			success = success && m_clVertexPosition.copyFromGPU();
-			success = success && m_clVertexPreviousPosition.copyFromGPU();
-			success = success && m_clVertexVelocity.copyFromGPU();
-			success = success && m_clVertexForceAccumulator.copyFromGPU();
-			success = success && m_clVertexNormal.copyFromGPU();
-			success = success && m_clVertexInverseMass.copyFromGPU();
-			success = success && m_clVertexArea.copyFromGPU();
-			success = success && m_clVertexTriangleCount.copyFromGPU();
-		}
-	}
-	if( success )
-		m_onGPU = true;
-	return success;
-btSoftBodyLinkDataOpenCL::btSoftBodyLinkDataOpenCL(cl_command_queue queue,  cl_context ctx) 
-	m_clLinks( queue, ctx, &m_links, false ),
-	m_clLinkStrength( queue, ctx, &m_linkStrength, false ),
-	m_clLinksMassLSC( queue, ctx, &m_linksMassLSC, false ),
-	m_clLinksRestLengthSquared( queue, ctx, &m_linksRestLengthSquared, false ),
-	m_clLinksCLength( queue, ctx, &m_linksCLength, false ),
-	m_clLinksLengthRatio( queue, ctx, &m_linksLengthRatio, false ),
-	m_clLinksRestLength( queue, ctx, &m_linksRestLength, false ),
-	m_clLinksMaterialLinearStiffnessCoefficient( queue, ctx, &m_linksMaterialLinearStiffnessCoefficient, false )
-static Vectormath::Aos::Vector3 toVector3( const btVector3 &vec )
-	Vectormath::Aos::Vector3 outVec( vec.getX(), vec.getY(), vec.getZ() );
-	return outVec;
-/** Allocate enough space in all link-related arrays to fit numLinks links */
-void btSoftBodyLinkDataOpenCL::createLinks( int numLinks )
-	int previousSize = m_links.size();
-	int newSize = previousSize + numLinks;
-	btSoftBodyLinkData::createLinks( numLinks );
-	// Resize the link addresses array as well
-	m_linkAddresses.resize( newSize );
-/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-void btSoftBodyLinkDataOpenCL::setLinkAt( 
-	const LinkDescription &link, 
-	int linkIndex )
-	btSoftBodyLinkData::setLinkAt( link, linkIndex );
-	// Set the link index correctly for initialisation
-	m_linkAddresses[linkIndex] = linkIndex;
-bool btSoftBodyLinkDataOpenCL::onAccelerator()
-	return m_onGPU;
-bool btSoftBodyLinkDataOpenCL::moveToAccelerator()
-	bool success = true;
-	success = success && m_clLinks.moveToGPU();
-	success = success && m_clLinkStrength.moveToGPU();
-	success = success && m_clLinksMassLSC.moveToGPU();
-	success = success && m_clLinksRestLengthSquared.moveToGPU();
-	success = success && m_clLinksCLength.moveToGPU();
-	success = success && m_clLinksLengthRatio.moveToGPU();
-	success = success && m_clLinksRestLength.moveToGPU();
-	success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveToGPU();
-	if( success ) {
-		m_onGPU = true;
-	}
-	return success;
-bool btSoftBodyLinkDataOpenCL::moveFromAccelerator()
-	bool success = true;
-	success = success && m_clLinks.moveFromGPU();
-	success = success && m_clLinkStrength.moveFromGPU();
-	success = success && m_clLinksMassLSC.moveFromGPU();
-	success = success && m_clLinksRestLengthSquared.moveFromGPU();
-	success = success && m_clLinksCLength.moveFromGPU();
-	success = success && m_clLinksLengthRatio.moveFromGPU();
-	success = success && m_clLinksRestLength.moveFromGPU();
-	success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveFromGPU();
-	if( success ) {
-		m_onGPU = false;
-	}
-	return success;
- * Generate (and later update) the batching for the entire link set.
- * This redoes a lot of work because it batches the entire set when each cloth is inserted.
- * In theory we could delay it until just before we need the cloth.
- * It's a one-off overhead, though, so that is a later optimisation.
- */
-void btSoftBodyLinkDataOpenCL::generateBatches()
-	int numLinks = getNumLinks();
-	// Do the graph colouring here temporarily
-	btAlignedObjectArray< int > batchValues;
-	batchValues.resize( numLinks, 0 );
-	// Find the maximum vertex value internally for now
-	int maxVertex = 0;
-	for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
-	{
-		int vertex0 = getVertexPair(linkIndex).vertex0;
-		int vertex1 = getVertexPair(linkIndex).vertex1;
-		if( vertex0 > maxVertex )
-			maxVertex = vertex0;
-		if( vertex1 > maxVertex )
-			maxVertex = vertex1;
-	}
-	int numVertices = maxVertex + 1;
-	// Set of lists, one for each node, specifying which colours are connected
-	// to that node.
-	// No two edges into a node can share a colour.
-	btAlignedObjectArray< btAlignedObjectArray< int > > vertexConnectedColourLists;
-	vertexConnectedColourLists.resize(numVertices);
-	// Simple algorithm that chooses the lowest batch number
-	// that none of the links attached to either of the connected 
-	// nodes is in
-	for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
-	{				
-		int linkLocation = m_linkAddresses[linkIndex];
-		int vertex0 = getVertexPair(linkLocation).vertex0;
-		int vertex1 = getVertexPair(linkLocation).vertex1;
-		// Get the two node colour lists
-		btAlignedObjectArray< int > &colourListVertex0( vertexConnectedColourLists[vertex0] );
-		btAlignedObjectArray< int > &colourListVertex1( vertexConnectedColourLists[vertex1] );
-		// Choose the minimum colour that is in neither list
-		int colour = 0;
-		while( colourListVertex0.findLinearSearch(colour) != colourListVertex0.size() || colourListVertex1.findLinearSearch(colour) != colourListVertex1.size()  )
-			++colour;
-		// i should now be the minimum colour in neither list
-		// Add to the two lists so that future edges don't share
-		// And store the colour against this edge
-		colourListVertex0.push_back(colour);
-		colourListVertex1.push_back(colour);
-		batchValues[linkIndex] = colour;
-	}
-	// Check the colour counts
-	btAlignedObjectArray< int > batchCounts;
-	for( int i = 0; i < numLinks; ++i )
-	{
-		int batch = batchValues[i];
-		if( batch >= batchCounts.size() )
-			batchCounts.push_back(1);
-		else
-			++(batchCounts[batch]);
-	}
-	m_batchStartLengths.resize(batchCounts.size());
-	if( m_batchStartLengths.size() > 0 )
-	{
-		m_batchStartLengths.resize(batchCounts.size());
-		m_batchStartLengths[0] = BatchPair(0, 0);
-		int sum = 0;
-		for( int batchIndex = 0; batchIndex < batchCounts.size(); ++batchIndex )
-		{
-			m_batchStartLengths[batchIndex].start = sum;
-			m_batchStartLengths[batchIndex].length = batchCounts[batchIndex];
-			sum += batchCounts[batchIndex];
-		}
-	}
-	/////////////////////////////
-	// Sort data based on batches
-	// Create source arrays by copying originals
-	btAlignedObjectArray<LinkNodePair>									m_links_Backup(m_links);
-	btAlignedObjectArray<float>											m_linkStrength_Backup(m_linkStrength);
-	btAlignedObjectArray<float>											m_linksMassLSC_Backup(m_linksMassLSC);
-	btAlignedObjectArray<float>											m_linksRestLengthSquared_Backup(m_linksRestLengthSquared);
-	btAlignedObjectArray<Vectormath::Aos::Vector3>						m_linksCLength_Backup(m_linksCLength);
-	btAlignedObjectArray<float>											m_linksLengthRatio_Backup(m_linksLengthRatio);
-	btAlignedObjectArray<float>											m_linksRestLength_Backup(m_linksRestLength);
-	btAlignedObjectArray<float>											m_linksMaterialLinearStiffnessCoefficient_Backup(m_linksMaterialLinearStiffnessCoefficient);
-	for( int batch = 0; batch < batchCounts.size(); ++batch )
-		batchCounts[batch] = 0;
-	// Do sort as single pass into destination arrays	
-	for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
-	{
-		// To maintain locations run off the original link locations rather than the current position.
-		// It's not cache efficient, but as we run this rarely that should not matter.
-		// It's faster than searching the link location array for the current location and then updating it.
-		// The other alternative would be to unsort before resorting, but this is equivalent to doing that.
-		int linkLocation = m_linkAddresses[linkIndex];
-		// Obtain batch and calculate target location for the
-		// next element in that batch, incrementing the batch counter
-		// afterwards
-		int batch = batchValues[linkIndex];
-		int newLocation = m_batchStartLengths[batch].start + batchCounts[batch];
-		batchCounts[batch] = batchCounts[batch] + 1;
-		m_links[newLocation] = m_links_Backup[linkLocation];
-#if 1
-		m_linkStrength[newLocation] = m_linkStrength_Backup[linkLocation];
-		m_linksMassLSC[newLocation] = m_linksMassLSC_Backup[linkLocation];
-		m_linksRestLengthSquared[newLocation] = m_linksRestLengthSquared_Backup[linkLocation];
-		m_linksLengthRatio[newLocation] = m_linksLengthRatio_Backup[linkLocation];
-		m_linksRestLength[newLocation] = m_linksRestLength_Backup[linkLocation];
-		m_linksMaterialLinearStiffnessCoefficient[newLocation] = m_linksMaterialLinearStiffnessCoefficient_Backup[linkLocation];
-		// Update the locations array to account for the moved entry
-		m_linkAddresses[linkIndex] = newLocation;
-	}
-} // void generateBatches()
-btSoftBodyTriangleDataOpenCL::btSoftBodyTriangleDataOpenCL( cl_command_queue queue , cl_context ctx) : 
-    m_queue( queue ),
-	m_clVertexIndices( queue, ctx, &m_vertexIndices, false ),
-	m_clArea( queue, ctx, &m_area, false ),
-	m_clNormal( queue, ctx, &m_normal, false )
-/** Allocate enough space in all link-related arrays to fit numLinks links */
-void btSoftBodyTriangleDataOpenCL::createTriangles( int numTriangles )
-	int previousSize = getNumTriangles();
-	int newSize = previousSize + numTriangles;
-	btSoftBodyTriangleData::createTriangles( numTriangles );
-	// Resize the link addresses array as well
-	m_triangleAddresses.resize( newSize );
-/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-void btSoftBodyTriangleDataOpenCL::setTriangleAt( const btSoftBodyTriangleData::TriangleDescription &triangle, int triangleIndex )
-	btSoftBodyTriangleData::setTriangleAt( triangle, triangleIndex );
-	m_triangleAddresses[triangleIndex] = triangleIndex;
-bool btSoftBodyTriangleDataOpenCL::onAccelerator()
-	return m_onGPU;
-bool btSoftBodyTriangleDataOpenCL::moveToAccelerator()
-	bool success = true;
-	success = success && m_clVertexIndices.moveToGPU();
-	success = success && m_clArea.moveToGPU();
-	success = success && m_clNormal.moveToGPU();
-	if( success )
-		m_onGPU = true;
-	return success;
-bool btSoftBodyTriangleDataOpenCL::moveFromAccelerator()
-	bool success = true;
-	success = success && m_clVertexIndices.moveFromGPU();
-	success = success && m_clArea.moveFromGPU();
-	success = success && m_clNormal.moveFromGPU();
-	if( success )
-		m_onGPU = true;
-	return success;
- * Generate (and later update) the batching for the entire triangle set.
- * This redoes a lot of work because it batches the entire set when each cloth is inserted.
- * In theory we could delay it until just before we need the cloth.
- * It's a one-off overhead, though, so that is a later optimisation.
- */
-void btSoftBodyTriangleDataOpenCL::generateBatches()
-	int numTriangles = getNumTriangles();
-	if( numTriangles == 0 )
-		return;
-	// Do the graph colouring here temporarily
-	btAlignedObjectArray< int > batchValues;
-	batchValues.resize( numTriangles );
-	// Find the maximum vertex value internally for now
-	int maxVertex = 0;
-	for( int triangleIndex = 0; triangleIndex < numTriangles; ++triangleIndex )
-	{
-		int vertex0 = getVertexSet(triangleIndex).vertex0;
-		int vertex1 = getVertexSet(triangleIndex).vertex1;
-		int vertex2 = getVertexSet(triangleIndex).vertex2;
-		if( vertex0 > maxVertex )
-			maxVertex = vertex0;
-		if( vertex1 > maxVertex )
-			maxVertex = vertex1;
-		if( vertex2 > maxVertex )
-			maxVertex = vertex2;
-	}
-	int numVertices = maxVertex + 1;
-	// Set of lists, one for each node, specifying which colours are connected
-	// to that node.
-	// No two edges into a node can share a colour.
-	btAlignedObjectArray< btAlignedObjectArray< int > > vertexConnectedColourLists;
-	vertexConnectedColourLists.resize(numVertices);
-	//std::cout << "\n";
-	// Simple algorithm that chooses the lowest batch number
-	// that none of the faces attached to either of the connected 
-	// nodes is in
-	for( int triangleIndex = 0; triangleIndex < numTriangles; ++triangleIndex )
-	{
-		// To maintain locations run off the original link locations rather than the current position.
-		// It's not cache efficient, but as we run this rarely that should not matter.
-		// It's faster than searching the link location array for the current location and then updating it.
-		// The other alternative would be to unsort before resorting, but this is equivalent to doing that.
-		int triangleLocation = m_triangleAddresses[triangleIndex];
-		int vertex0 = getVertexSet(triangleLocation).vertex0;
-		int vertex1 = getVertexSet(triangleLocation).vertex1;
-		int vertex2 = getVertexSet(triangleLocation).vertex2;
-		// Get the three node colour lists
-		btAlignedObjectArray< int > &colourListVertex0( vertexConnectedColourLists[vertex0] );
-		btAlignedObjectArray< int > &colourListVertex1( vertexConnectedColourLists[vertex1] );
-		btAlignedObjectArray< int > &colourListVertex2( vertexConnectedColourLists[vertex2] );
-		// Choose the minimum colour that is in none of the lists
-		int colour = 0;
-		while( 
-			colourListVertex0.findLinearSearch(colour) != colourListVertex0.size() || 
-			colourListVertex1.findLinearSearch(colour) != colourListVertex1.size() ||
-			colourListVertex2.findLinearSearch(colour) != colourListVertex2.size() )
-		{
-			++colour;
-		}
-		// i should now be the minimum colour in neither list
-		// Add to the three lists so that future edges don't share
-		// And store the colour against this face
-		colourListVertex0.push_back(colour);
-		colourListVertex1.push_back(colour);
-		colourListVertex2.push_back(colour);
-		batchValues[triangleIndex] = colour;
-	}
-	// Check the colour counts
-	btAlignedObjectArray< int > batchCounts;
-	for( int i = 0; i < numTriangles; ++i )
-	{
-		int batch = batchValues[i];
-		if( batch >= batchCounts.size() )
-			batchCounts.push_back(1);
-		else
-			++(batchCounts[batch]);
-	}
-	m_batchStartLengths.resize(batchCounts.size());
-	m_batchStartLengths[0] = btSomePair(0,0);
-	int sum = 0;
-	for( int batchIndex = 0; batchIndex < batchCounts.size(); ++batchIndex )
-	{
-		m_batchStartLengths[batchIndex].first = sum;
-		m_batchStartLengths[batchIndex].second = batchCounts[batchIndex];
-		sum += batchCounts[batchIndex];
-	}
-	/////////////////////////////
-	// Sort data based on batches
-	// Create source arrays by copying originals
-	btAlignedObjectArray<btSoftBodyTriangleData::TriangleNodeSet>							m_vertexIndices_Backup(m_vertexIndices);
-	btAlignedObjectArray<float>										m_area_Backup(m_area);
-	btAlignedObjectArray<Vectormath::Aos::Vector3>					m_normal_Backup(m_normal);
-	for( int batch = 0; batch < batchCounts.size(); ++batch )
-		batchCounts[batch] = 0;
-	// Do sort as single pass into destination arrays	
-	for( int triangleIndex = 0; triangleIndex < numTriangles; ++triangleIndex )
-	{
-		// To maintain locations run off the original link locations rather than the current position.
-		// It's not cache efficient, but as we run this rarely that should not matter.
-		// It's faster than searching the link location array for the current location and then updating it.
-		// The other alternative would be to unsort before resorting, but this is equivalent to doing that.
-		int triangleLocation = m_triangleAddresses[triangleIndex];
-		// Obtain batch and calculate target location for the
-		// next element in that batch, incrementing the batch counter
-		// afterwards
-		int batch = batchValues[triangleIndex];
-		int newLocation = m_batchStartLengths[batch].first + batchCounts[batch];
-		batchCounts[batch] = batchCounts[batch] + 1;
-		m_vertexIndices[newLocation] = m_vertexIndices_Backup[triangleLocation];
-		m_area[newLocation] = m_area_Backup[triangleLocation];
-		m_normal[newLocation] = m_normal_Backup[triangleLocation];
-		// Update the locations array to account for the moved entry
-		m_triangleAddresses[triangleIndex] = newLocation;
-	}
-} // btSoftBodyTriangleDataOpenCL::generateBatches
-btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_context ctx, bool bUpdateAchchoredNodePos) :
-	m_linkData(queue, ctx),
-	m_vertexData(queue, ctx),
-	m_triangleData(queue, ctx),
-	m_defaultCLFunctions(queue, ctx),
-	m_currentCLFunctions(&m_defaultCLFunctions),
-	m_clPerClothAcceleration(queue, ctx, &m_perClothAcceleration, true ),
-	m_clPerClothWindVelocity(queue, ctx, &m_perClothWindVelocity, true ),
-	m_clPerClothDampingFactor(queue,ctx, &m_perClothDampingFactor, true ),
-	m_clPerClothVelocityCorrectionCoefficient(queue, ctx,&m_perClothVelocityCorrectionCoefficient, true ),
-	m_clPerClothLiftFactor(queue, ctx,&m_perClothLiftFactor, true ),
-	m_clPerClothDragFactor(queue, ctx,&m_perClothDragFactor, true ),
-	m_clPerClothMediumDensity(queue, ctx,&m_perClothMediumDensity, true ),
-	m_clPerClothCollisionObjects( queue, ctx, &m_perClothCollisionObjects, true ),
-	m_clCollisionObjectDetails( queue, ctx, &m_collisionObjectDetails, true ),
-	m_clPerClothFriction( queue, ctx, &m_perClothFriction, false ),
-	m_clAnchorPosition( queue, ctx, &m_anchorPosition, true ),
-	m_clAnchorIndex( queue, ctx, &m_anchorIndex, true),
-	m_cqCommandQue( queue ),
-	m_cxMainContext(ctx),
-	m_defaultWorkGroupSize(BT_DEFAULT_WORKGROUPSIZE),
-	m_bUpdateAnchoredNodePos(bUpdateAchchoredNodePos)
-	// Initial we will clearly need to update solver constants
-	// For now this is global for the cloths linked with this solver - we should probably make this body specific 
-	// for performance in future once we understand more clearly when constants need to be updated
-	m_updateSolverConstants = true;
-	m_shadersInitialized = false;
-	m_prepareLinksKernel = 0;
-	m_solvePositionsFromLinksKernel = 0;
-	m_updateConstantsKernel = 0;
-	m_integrateKernel = 0;
-	m_addVelocityKernel = 0;
-	m_updatePositionsFromVelocitiesKernel = 0;
-	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = 0;
-	m_updateVelocitiesFromPositionsWithVelocitiesKernel = 0;
-	m_vSolveLinksKernel = 0;
-	m_solveCollisionsAndUpdateVelocitiesKernel = 0;
-	m_resetNormalsAndAreasKernel = 0;
-	m_updateSoftBodiesKernel = 0;
-	m_normalizeNormalsAndAreasKernel = 0;
-	m_outputToVertexArrayKernel = 0;
-	m_applyForcesKernel = 0;
-	m_updateFixedVertexPositionsKernel = 0;
-	releaseKernels();
-void btOpenCLSoftBodySolver::releaseKernels()
-	RELEASE_CL_KERNEL( m_prepareLinksKernel );
-	RELEASE_CL_KERNEL( m_solvePositionsFromLinksKernel );
-	RELEASE_CL_KERNEL( m_updateConstantsKernel );
-	RELEASE_CL_KERNEL( m_integrateKernel );
-	RELEASE_CL_KERNEL( m_addVelocityKernel );
-	RELEASE_CL_KERNEL( m_updatePositionsFromVelocitiesKernel );
-	RELEASE_CL_KERNEL( m_updateVelocitiesFromPositionsWithoutVelocitiesKernel );
-	RELEASE_CL_KERNEL( m_updateVelocitiesFromPositionsWithVelocitiesKernel );
-	RELEASE_CL_KERNEL( m_vSolveLinksKernel );
-	RELEASE_CL_KERNEL( m_solveCollisionsAndUpdateVelocitiesKernel );
-	RELEASE_CL_KERNEL( m_resetNormalsAndAreasKernel );
-	RELEASE_CL_KERNEL( m_normalizeNormalsAndAreasKernel );
-	RELEASE_CL_KERNEL( m_outputToVertexArrayKernel );
-	RELEASE_CL_KERNEL( m_applyForcesKernel );
-	RELEASE_CL_KERNEL( m_updateFixedVertexPositionsKernel );
-	m_shadersInitialized = false;
-void btOpenCLSoftBodySolver::copyBackToSoftBodies(bool bMove)
-	// Move the vertex data back to the host first
-	m_vertexData.moveFromAccelerator(!bMove);
-	// Loop over soft bodies, copying all the vertex positions back for each body in turn
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		btOpenCLAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[ softBodyIndex ];
-		btSoftBody *softBody = softBodyInterface->getSoftBody();
-		int firstVertex = softBodyInterface->getFirstVertex();
-		int numVertices = softBodyInterface->getNumVertices();
-		// Copy vertices from solver back into the softbody
-		for( int vertex = 0; vertex < numVertices; ++vertex )
-		{
-			using Vectormath::Aos::Point3;
-			Point3 vertexPosition( m_vertexData.getVertexPositions()[firstVertex + vertex] );
-			Point3 normal(m_vertexData.getNormal(firstVertex + vertex));
-			softBody->m_nodes[vertex].m_x.setX( vertexPosition.getX() );
-			softBody->m_nodes[vertex].m_x.setY( vertexPosition.getY() );
-			softBody->m_nodes[vertex].m_x.setZ( vertexPosition.getZ() );
-			softBody->m_nodes[vertex].m_n.setX( normal.getX() );
-			softBody->m_nodes[vertex].m_n.setY( normal.getY() );
-			softBody->m_nodes[vertex].m_n.setZ( normal.getZ() );
-		}
-	}	
-} // btOpenCLSoftBodySolver::copyBackToSoftBodies
-void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &softBodies, bool forceUpdate )
-	if( forceUpdate || m_softBodySet.size() != softBodies.size() )
-	{
-		// Have a change in the soft body set so update, reloading all the data
-		getVertexData().clear();
-		getTriangleData().clear();
-		getLinkData().clear();
-		m_softBodySet.resize(0);
-		m_anchorIndex.clear();
-		int maxPiterations = 0;
-		int maxViterations = 0;
-		for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
-		{
-			btSoftBody *softBody = softBodies[ softBodyIndex ];
-			using Vectormath::Aos::Matrix3;
-			using Vectormath::Aos::Point3;
-			// Create SoftBody that will store the information within the solver
-			btOpenCLAcceleratedSoftBodyInterface *newSoftBody = new btOpenCLAcceleratedSoftBodyInterface( softBody );
-			m_softBodySet.push_back( newSoftBody );
-			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
-			m_perClothDampingFactor.push_back(softBody->m_cfg.kDP);
-			m_perClothVelocityCorrectionCoefficient.push_back( softBody->m_cfg.kVCF );
-			m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );
-			m_perClothDragFactor.push_back( softBody->m_cfg.kDG );
-			m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);
-			// Simple init values. Actually we'll put 0 and -1 into them at the appropriate time
-			m_perClothFriction.push_back(softBody->m_cfg.kDF);
-			m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) );
-			// Add space for new vertices and triangles in the default solver for now
-			// TODO: Include space here for tearing too later
-			int firstVertex = getVertexData().getNumVertices();
-			int numVertices = softBody->m_nodes.size();
-			int maxVertices = numVertices;
-			// Allocate space for new vertices in all the vertex arrays
-			getVertexData().createVertices( maxVertices, softBodyIndex );
-			int firstTriangle = getTriangleData().getNumTriangles();
-			int numTriangles = softBody->m_faces.size();
-			int maxTriangles = numTriangles;
-			getTriangleData().createTriangles( maxTriangles );
-			// Copy vertices from softbody into the solver
-			for( int vertex = 0; vertex < numVertices; ++vertex )
-			{
-				Point3 multPoint(softBody->m_nodes[vertex].m_x.getX(), softBody->m_nodes[vertex].m_x.getY(), softBody->m_nodes[vertex].m_x.getZ());
-				btSoftBodyVertexData::VertexDescription desc;
-				// TODO: Position in the softbody might be pre-transformed
-				// or we may need to adapt for the pose.
-				//desc.setPosition( cloth.getMeshTransform()*multPoint );
-				desc.setPosition( multPoint );
-				float vertexInverseMass = softBody->m_nodes[vertex].m_im;
-				desc.setInverseMass(vertexInverseMass);
-				getVertexData().setVertexAt( desc, firstVertex + vertex );
-				m_anchorIndex.push_back(-1.0);
-			}
-			// Copy triangles similarly
-			// We're assuming here that vertex indices are based on the firstVertex rather than the entire scene
-			for( int triangle = 0; triangle < numTriangles; ++triangle )
-			{
-				// Note that large array storage is relative to the array not to the cloth
-				// So we need to add firstVertex to each value
-				int vertexIndex0 = (softBody->m_faces[triangle].m_n[0] - &(softBody->m_nodes[0]));
-				int vertexIndex1 = (softBody->m_faces[triangle].m_n[1] - &(softBody->m_nodes[0]));
-				int vertexIndex2 = (softBody->m_faces[triangle].m_n[2] - &(softBody->m_nodes[0]));
-				btSoftBodyTriangleData::TriangleDescription newTriangle(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, vertexIndex2 + firstVertex);
-				getTriangleData().setTriangleAt( newTriangle, firstTriangle + triangle );
-				// Increase vertex triangle counts for this triangle		
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex0)++;
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex1)++;
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex2)++;
-			}
-			int firstLink = getLinkData().getNumLinks();
-			int numLinks = softBody->m_links.size();
-			int maxLinks = numLinks;
-			// Allocate space for the links
-			getLinkData().createLinks( numLinks );
-			// Add the links
-			for( int link = 0; link < numLinks; ++link )
-			{
-				int vertexIndex0 = softBody->m_links[link].m_n[0] - &(softBody->m_nodes[0]);
-				int vertexIndex1 = softBody->m_links[link].m_n[1] - &(softBody->m_nodes[0]);
-				btSoftBodyLinkData::LinkDescription newLink(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, softBody->m_links[link].m_material->m_kLST);
-				newLink.setLinkStrength(1.f);
-				getLinkData().setLinkAt(newLink, firstLink + link);
-			}
-			newSoftBody->setFirstVertex( firstVertex );
-			newSoftBody->setFirstTriangle( firstTriangle );
-			newSoftBody->setNumVertices( numVertices );
-			newSoftBody->setMaxVertices( maxVertices );
-			newSoftBody->setNumTriangles( numTriangles );
-			newSoftBody->setMaxTriangles( maxTriangles );
-			newSoftBody->setFirstLink( firstLink );
-			newSoftBody->setNumLinks( numLinks );
-			// Find maximum piterations and viterations
-			int piterations = softBody->m_cfg.piterations;
-            if ( piterations > maxPiterations )
-                  maxPiterations = piterations;
-            int viterations = softBody->m_cfg.viterations;
-			if ( viterations > maxViterations )
-                  maxViterations = viterations;
-			// zero mass
-			for( int vertex = 0; vertex < numVertices; ++vertex )
-			{
-				if ( softBody->m_nodes[vertex].m_im == 0 )
-				{
-					AnchorNodeInfoCL nodeInfo;
-					nodeInfo.clVertexIndex = firstVertex + vertex;
-					nodeInfo.pNode = &softBody->m_nodes[vertex];
-					m_anchorNodeInfoArray.push_back(nodeInfo);
-				}
-			}			
-			// anchor position
-			if ( numVertices > 0 )
-			{
-				for ( int anchorIndex = 0; anchorIndex < softBody->m_anchors.size(); anchorIndex++ )
-				{
-					btSoftBody::Node* anchorNode = softBody->m_anchors[anchorIndex].m_node;
-					btSoftBody::Node* firstNode = &softBody->m_nodes[0];
-					AnchorNodeInfoCL nodeInfo;
-					nodeInfo.clVertexIndex = firstVertex + (int)(anchorNode - firstNode);
-					nodeInfo.pNode = anchorNode;
-					m_anchorNodeInfoArray.push_back(nodeInfo);
-				}
-			}			
-		}
-		m_anchorPosition.clear();		
-		m_anchorPosition.resize(m_anchorNodeInfoArray.size());
-		for ( int anchorNode = 0; anchorNode < m_anchorNodeInfoArray.size(); anchorNode++ )
-		{
-			const AnchorNodeInfoCL& anchorNodeInfo = m_anchorNodeInfoArray[anchorNode];
-			m_anchorIndex[anchorNodeInfo.clVertexIndex] = anchorNode;
-			getVertexData().getInverseMass(anchorNodeInfo.clVertexIndex) = 0.0f;
-		}
-		updateConstants(0.f);
-		// set position and velocity iterations
-		setNumberOfPositionIterations(maxPiterations);
-		setNumberOfVelocityIterations(maxViterations);
-		// set wind velocity
-		m_perClothWindVelocity.resize( m_softBodySet.size() );
-		for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-		{
-			btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();			
-			m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
-		}
-		m_clPerClothWindVelocity.changedOnCPU();
-		// generate batches
-		m_linkData.generateBatches();		
-		m_triangleData.generateBatches();
-		// Build the shaders to match the batching parameters
-		buildShaders();
-	}
-btSoftBodyLinkData &btOpenCLSoftBodySolver::getLinkData()
-	// TODO: Consider setting link data to "changed" here
-	return m_linkData;
-btSoftBodyVertexData &btOpenCLSoftBodySolver::getVertexData()
-	// TODO: Consider setting vertex data to "changed" here
-	return m_vertexData;
-btSoftBodyTriangleData &btOpenCLSoftBodySolver::getTriangleData()
-	// TODO: Consider setting triangle data to "changed" here
-	return m_triangleData;
-void btOpenCLSoftBodySolver::resetNormalsAndAreas( int numVertices )
-	cl_int ciErrNum;
-	ciErrNum = clSetKernelArg(m_resetNormalsAndAreasKernel, 0, sizeof(numVertices), (void*)&numVertices); //oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	ciErrNum = clSetKernelArg(m_resetNormalsAndAreasKernel, 1, sizeof(cl_mem), (void*)&m_vertexData.m_clVertexNormal.m_buffer);//oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	ciErrNum = clSetKernelArg(m_resetNormalsAndAreasKernel,  2, sizeof(cl_mem), (void*)&m_vertexData.m_clVertexArea.m_buffer); //oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	size_t numWorkItems = m_defaultWorkGroupSize*((numVertices + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	if (numWorkItems)
-	{
-		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_resetNormalsAndAreasKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0 );
-		if( ciErrNum != CL_SUCCESS )
-		{
-			btAssert( 0 && "enqueueNDRangeKernel(m_resetNormalsAndAreasKernel)" );
-		}
-	}
-void btOpenCLSoftBodySolver::normalizeNormalsAndAreas( int numVertices )
-	cl_int ciErrNum;
-	ciErrNum = clSetKernelArg(m_normalizeNormalsAndAreasKernel, 0, sizeof(int),(void*) &numVertices);
-	ciErrNum = clSetKernelArg(m_normalizeNormalsAndAreasKernel, 1, sizeof(cl_mem), &m_vertexData.m_clVertexTriangleCount.m_buffer);
-	ciErrNum = clSetKernelArg(m_normalizeNormalsAndAreasKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
-	ciErrNum = clSetKernelArg(m_normalizeNormalsAndAreasKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
-	size_t	numWorkItems = m_defaultWorkGroupSize*((numVertices + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	if (numWorkItems)
-	{
-		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_normalizeNormalsAndAreasKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0);
-		if( ciErrNum != CL_SUCCESS ) 
-		{
-			btAssert( 0 && "enqueueNDRangeKernel(m_normalizeNormalsAndAreasKernel)");
-		}
-	}
-void btOpenCLSoftBodySolver::executeUpdateSoftBodies( int firstTriangle, int numTriangles )
-	cl_int ciErrNum;
-	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 0, sizeof(int), (void*) &firstTriangle);
-	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 1, sizeof(int), &numTriangles);
-	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 2, sizeof(cl_mem), &m_triangleData.m_clVertexIndices.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 6, sizeof(cl_mem), &m_triangleData.m_clNormal.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 7, sizeof(cl_mem), &m_triangleData.m_clArea.m_buffer);
-	size_t numWorkItems = m_defaultWorkGroupSize*((numTriangles + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_updateSoftBodiesKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize,0,0,0);
-	if( ciErrNum != CL_SUCCESS ) 
-	{
-		btAssert( 0 &&  "enqueueNDRangeKernel(m_normalizeNormalsAndAreasKernel)");
-	}
-void btOpenCLSoftBodySolver::updateSoftBodies()
-	using namespace Vectormath::Aos;
-	int numVertices = m_vertexData.getNumVertices();
-	int numTriangles = m_triangleData.getNumTriangles();
-	// Ensure data is on accelerator
-	m_vertexData.moveToAccelerator();
-	m_triangleData.moveToAccelerator();
-	resetNormalsAndAreas( numVertices );
-	// Go through triangle batches so updates occur correctly
-	for( int batchIndex = 0; batchIndex < m_triangleData.m_batchStartLengths.size(); ++batchIndex )
-	{
-		int startTriangle = m_triangleData.m_batchStartLengths[batchIndex].first;
-		int numTriangles = m_triangleData.m_batchStartLengths[batchIndex].second;
-		executeUpdateSoftBodies( startTriangle, numTriangles );
-	}
-	normalizeNormalsAndAreas( numVertices );
-} // updateSoftBodies
-Vectormath::Aos::Vector3 btOpenCLSoftBodySolver::ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a )
-	return a*Vectormath::Aos::dot(v, a);
-void btOpenCLSoftBodySolver::ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce )
-	float dtInverseMass = solverdt*inverseMass;
-	if( Vectormath::Aos::lengthSqr(force * dtInverseMass) > Vectormath::Aos::lengthSqr(vertexVelocity) )
-	{
-		vertexForce -= ProjectOnAxis( vertexVelocity, normalize( force ) )/dtInverseMass;
-	} else {
-		vertexForce += force;
-	}
-void btOpenCLSoftBodySolver::updateFixedVertexPositions()
-	// Ensure data is on accelerator
-	m_vertexData.moveToAccelerator();
-	m_clAnchorPosition.moveToGPU();
-	m_clAnchorIndex.moveToGPU();
-	cl_int ciErrNum ;
-	int numVerts = m_vertexData.getNumVertices();
-	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel, 0, sizeof(int), &numVerts);
-	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel,1, sizeof(cl_mem), &m_clAnchorIndex.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel,2, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel,3, sizeof(cl_mem), &m_clAnchorPosition.m_buffer);
-	size_t numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	if (numWorkItems)
-	{
-		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_updateFixedVertexPositionsKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0);
-		if( ciErrNum != CL_SUCCESS ) 
-		{
-			btAssert( 0 &&  "enqueueNDRangeKernel(m_updateFixedVertexPositionsKernel)");
-		}
-	}
-void btOpenCLSoftBodySolver::applyForces( float solverdt )
-	// Ensure data is on accelerator
-	m_vertexData.moveToAccelerator();
-	m_clPerClothAcceleration.moveToGPU();
-	m_clPerClothLiftFactor.moveToGPU();
-	m_clPerClothDragFactor.moveToGPU();
-	m_clPerClothMediumDensity.moveToGPU();
-	m_clPerClothWindVelocity.moveToGPU();	
-	cl_int ciErrNum ;
-	int numVerts = m_vertexData.getNumVertices();
-	ciErrNum = clSetKernelArg(m_applyForcesKernel, 0, sizeof(int), &numVerts);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel, 1, sizeof(float), &solverdt);
-	float fl = FLT_EPSILON;
-	ciErrNum = clSetKernelArg(m_applyForcesKernel, 2, sizeof(float), &fl);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clClothIdentifier.m_buffer);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel, 7, sizeof(cl_mem), &m_clPerClothLiftFactor.m_buffer);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel, 8 ,sizeof(cl_mem), &m_clPerClothDragFactor.m_buffer);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel, 9, sizeof(cl_mem), &m_clPerClothWindVelocity.m_buffer);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel,10, sizeof(cl_mem), &m_clPerClothAcceleration.m_buffer);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel,11, sizeof(cl_mem), &m_clPerClothMediumDensity.m_buffer);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel,12, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
-	ciErrNum = clSetKernelArg(m_applyForcesKernel,13, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
-	size_t numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	if (numWorkItems)
-	{
-		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_applyForcesKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0);
-		if( ciErrNum != CL_SUCCESS ) 
-		{
-			btAssert( 0 &&  "enqueueNDRangeKernel(m_applyForcesKernel)");
-		}
-	}
- * Integrate motion on the solver.
- */
-void btOpenCLSoftBodySolver::integrate( float solverdt )
-	// Ensure data is on accelerator
-	m_vertexData.moveToAccelerator();
-	cl_int ciErrNum;
-	int numVerts = m_vertexData.getNumVertices();
-	ciErrNum = clSetKernelArg(m_integrateKernel, 0, sizeof(int), &numVerts);
-	ciErrNum = clSetKernelArg(m_integrateKernel, 1, sizeof(float), &solverdt);
-	ciErrNum = clSetKernelArg(m_integrateKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
-	ciErrNum = clSetKernelArg(m_integrateKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_integrateKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
-	ciErrNum = clSetKernelArg(m_integrateKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_integrateKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
-	size_t numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	if (numWorkItems)
-	{
-		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_integrateKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize,0,0,0);
-		if( ciErrNum != CL_SUCCESS )
-		{
-			btAssert( 0 &&  "enqueueNDRangeKernel(m_integrateKernel)");
-		}
-	}
-float btOpenCLSoftBodySolver::computeTriangleArea( 
-	const Vectormath::Aos::Point3 &vertex0,
-	const Vectormath::Aos::Point3 &vertex1,
-	const Vectormath::Aos::Point3 &vertex2 )
-	Vectormath::Aos::Vector3 a = vertex1 - vertex0;
-	Vectormath::Aos::Vector3 b = vertex2 - vertex0;
-	Vectormath::Aos::Vector3 crossProduct = cross(a, b);
-	float area = length( crossProduct );
-	return area;
-void btOpenCLSoftBodySolver::updateBounds()
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		btVector3 minBound(-1e30,-1e30,-1e30), maxBound(1e30,1e30,1e30);
-		m_softBodySet[softBodyIndex]->updateBounds( minBound, maxBound );
-	}
-} // btOpenCLSoftBodySolver::updateBounds
-void btOpenCLSoftBodySolver::updateConstants( float timeStep )
-	using namespace Vectormath::Aos;
-	if( m_updateSolverConstants )
-	{
-		m_updateSolverConstants = false;
-		// Will have to redo this if we change the structure (tear, maybe) or various other possible changes
-		// Initialise link constants
-		const int numLinks = m_linkData.getNumLinks();
-		for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
-		{
-			btSoftBodyLinkData::LinkNodePair &vertices( m_linkData.getVertexPair(linkIndex) );
-			m_linkData.getRestLength(linkIndex) = length((m_vertexData.getPosition( vertices.vertex0 ) - m_vertexData.getPosition( vertices.vertex1 )));
-			float invMass0 = m_vertexData.getInverseMass(vertices.vertex0);
-			float invMass1 = m_vertexData.getInverseMass(vertices.vertex1);
-			float linearStiffness = m_linkData.getLinearStiffnessCoefficient(linkIndex);
-			float massLSC = (invMass0 + invMass1)/linearStiffness;
-			m_linkData.getMassLSC(linkIndex) = massLSC;
-			float restLength = m_linkData.getRestLength(linkIndex);
-			float restLengthSquared = restLength*restLength;
-			m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared;
-		}
-	}
-class QuickSortCompare
-	public:
-	bool operator() ( const CollisionShapeDescription& a, const CollisionShapeDescription& b ) const
-	{
-		return ( a.softBodyIdentifier < b.softBodyIdentifier );
-	}
- * Sort the collision object details array and generate indexing into it for the per-cloth collision object array.
- */
-void btOpenCLSoftBodySolver::prepareCollisionConstraints()
-	// First do a simple sort on the collision objects
-	btAlignedObjectArray<int> numObjectsPerClothPrefixSum;
-	btAlignedObjectArray<int> numObjectsPerCloth;
-	numObjectsPerCloth.resize( m_softBodySet.size(), 0 );
-	numObjectsPerClothPrefixSum.resize( m_softBodySet.size(), 0 );
-	m_collisionObjectDetails.quickSort( QuickSortCompare() );
-	if (!m_perClothCollisionObjects.size())
-		return;
-	// Generating indexing for perClothCollisionObjects
-	// First clear the previous values with the "no collision object for cloth" constant
-	for( int clothIndex = 0; clothIndex < m_perClothCollisionObjects.size(); ++clothIndex )
-	{
-		m_perClothCollisionObjects[clothIndex].firstObject = -1;
-		m_perClothCollisionObjects[clothIndex].endObject = -1;
-	}
-	int currentCloth = 0;
-	int startIndex = 0;
-	for( int collisionObject = 0; collisionObject < m_collisionObjectDetails.size(); ++collisionObject )
-	{
-		int nextCloth = m_collisionObjectDetails[collisionObject].softBodyIdentifier;
-		if( nextCloth != currentCloth )
-		{	
-			// Changed cloth in the array
-			// Set the end index and the range is what we need for currentCloth
-			m_perClothCollisionObjects[currentCloth].firstObject = startIndex;
-			m_perClothCollisionObjects[currentCloth].endObject = collisionObject;
-			currentCloth = nextCloth;
-			startIndex = collisionObject;
-		}
-	}
-	// And update last cloth	
-	m_perClothCollisionObjects[currentCloth].firstObject = startIndex;
-	m_perClothCollisionObjects[currentCloth].endObject =  m_collisionObjectDetails.size();
-} // btOpenCLSoftBodySolver::prepareCollisionConstraints
-void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
-	using Vectormath::Aos::Vector3;
-	using Vectormath::Aos::Point3;
-	using Vectormath::Aos::lengthSqr;
-	using Vectormath::Aos::dot;
-	// Prepare links
-	int numLinks = m_linkData.getNumLinks();
-	int numVertices = m_vertexData.getNumVertices();
-	float kst = 1.f;
-	float ti = 0.f;
-	m_clPerClothDampingFactor.moveToGPU();
-	m_clPerClothVelocityCorrectionCoefficient.moveToGPU();
-	// Ensure data is on accelerator
-	m_linkData.moveToAccelerator();
-	m_vertexData.moveToAccelerator();
-	prepareLinks();	
-	for( int iteration = 0; iteration < m_numberOfVelocityIterations ; ++iteration )
-	{
-		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
-		{
-			int startLink = m_linkData.m_batchStartLengths[i].start;
-			int numLinks = m_linkData.m_batchStartLengths[i].length;
-			solveLinksForVelocity( startLink, numLinks, kst );
-		}
-	}
-	prepareCollisionConstraints();
-	// Compute new positions from velocity
-	// Also update the previous position so that our position computation is now based on the new position from the velocity solution
-	// rather than based directly on the original positions
-	if( m_numberOfVelocityIterations > 0 )
-	{
-		updateVelocitiesFromPositionsWithVelocities( 1.f/solverdt );
-	} else {
-		updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
-	}
-	// Solve position
-	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
-	{
-		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
-		{
-			int startLink = m_linkData.m_batchStartLengths[i].start;
-			int numLinks = m_linkData.m_batchStartLengths[i].length;
-			solveLinksForPosition( startLink, numLinks, kst, ti );
-		}
-	} // for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
-	// At this point assume that the force array is blank - we will overwrite it
-	solveCollisionsAndUpdateVelocities( 1.f/solverdt );
-// Kernel dispatches
-void btOpenCLSoftBodySolver::prepareLinks()
-	cl_int ciErrNum;
-	int numLinks = m_linkData.getNumLinks();
-	ciErrNum = clSetKernelArg(m_prepareLinksKernel,0, sizeof(int), &numLinks);
-	ciErrNum = clSetKernelArg(m_prepareLinksKernel,1, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
-	ciErrNum = clSetKernelArg(m_prepareLinksKernel,2, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
-	ciErrNum = clSetKernelArg(m_prepareLinksKernel,3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_prepareLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clLinksLengthRatio.m_buffer);
-	ciErrNum = clSetKernelArg(m_prepareLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clLinksCLength.m_buffer);
-	size_t	numWorkItems = m_defaultWorkGroupSize*((m_linkData.getNumLinks() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_prepareLinksKernel, 1 , NULL, &numWorkItems, &m_defaultWorkGroupSize,0,0,0);
-	if( ciErrNum != CL_SUCCESS ) 
-	{
-		btAssert( 0 &&  "enqueueNDRangeKernel(m_prepareLinksKernel)");
-	}
-void btOpenCLSoftBodySolver::updatePositionsFromVelocities( float solverdt )
-	cl_int ciErrNum;
-	int numVerts = m_vertexData.getNumVertices();
-	ciErrNum = clSetKernelArg(m_updatePositionsFromVelocitiesKernel,0, sizeof(int), &numVerts);
-	ciErrNum = clSetKernelArg(m_updatePositionsFromVelocitiesKernel,1, sizeof(float), &solverdt);
-	ciErrNum = clSetKernelArg(m_updatePositionsFromVelocitiesKernel,2, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
-	ciErrNum = clSetKernelArg(m_updatePositionsFromVelocitiesKernel,3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_updatePositionsFromVelocitiesKernel,4, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
-	size_t	numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_updatePositionsFromVelocitiesKernel, 1, NULL, &numWorkItems,&m_defaultWorkGroupSize,0,0,0);
-	if( ciErrNum != CL_SUCCESS ) 
-	{
-		btAssert( 0 &&  "enqueueNDRangeKernel(m_updatePositionsFromVelocitiesKernel)");
-	}
-void btOpenCLSoftBodySolver::solveLinksForPosition( int startLink, int numLinks, float kst, float ti )
-	cl_int ciErrNum;
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,0, sizeof(int), &startLink);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,1, sizeof(int), &numLinks);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,2, sizeof(float), &kst);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,3, sizeof(float), &ti);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,6, sizeof(cl_mem), &m_linkData.m_clLinksRestLengthSquared.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,7, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,8, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
-	size_t	numWorkItems = m_defaultWorkGroupSize*((numLinks + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_solvePositionsFromLinksKernel,1,NULL,&numWorkItems,&m_defaultWorkGroupSize,0,0,0);
-	if( ciErrNum!= CL_SUCCESS ) 
-	{
-		btAssert( 0 &&  "enqueueNDRangeKernel(m_solvePositionsFromLinksKernel)");
-	}
-} // solveLinksForPosition
-void btOpenCLSoftBodySolver::solveLinksForVelocity( int startLink, int numLinks, float kst )
-	cl_int ciErrNum;
-	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 0, sizeof(int), &startLink);
-	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 1, sizeof(int), &numLinks);
-	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 2, sizeof(float), &kst);
-	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 3, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
-	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 4, sizeof(cl_mem), &m_linkData.m_clLinksLengthRatio.m_buffer);
-	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 5, sizeof(cl_mem), &m_linkData.m_clLinksCLength.m_buffer);
-	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
-	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 7, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
-	size_t	numWorkItems = m_defaultWorkGroupSize*((numLinks + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_vSolveLinksKernel,1,NULL,&numWorkItems, &m_defaultWorkGroupSize,0,0,0);
-	if( ciErrNum != CL_SUCCESS ) 
-	{
-		btAssert( 0 &&  "enqueueNDRangeKernel(m_vSolveLinksKernel)");
-	}
-void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithVelocities( float isolverdt )
-	cl_int ciErrNum;
-	int numVerts = m_vertexData.getNumVertices();
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel,0, sizeof(int), &numVerts);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel, 1, sizeof(float), &isolverdt);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clClothIdentifier.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel, 5, sizeof(cl_mem), &m_clPerClothVelocityCorrectionCoefficient.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel, 6, sizeof(cl_mem), &m_clPerClothDampingFactor.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel, 7, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel, 8, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
-	size_t	numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_updateVelocitiesFromPositionsWithVelocitiesKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize,0,0,0);
-	if( ciErrNum != CL_SUCCESS ) 
-	{
-		btAssert( 0 &&  "enqueueNDRangeKernel(m_updateVelocitiesFromPositionsWithVelocitiesKernel)");
-	}
-} // updateVelocitiesFromPositionsWithVelocities
-void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float isolverdt )
-	cl_int ciErrNum;
-	int numVerts = m_vertexData.getNumVertices();
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 0, sizeof(int), &numVerts);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 1, sizeof(float), &isolverdt);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 4, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 6, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer);
-	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 7, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer);
-	size_t	numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize,0,0,0);
-	if( ciErrNum != CL_SUCCESS ) 
-	{
-		btAssert( 0 &&  "enqueueNDRangeKernel(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel)");
-	}
-} // updateVelocitiesFromPositionsWithoutVelocities
-void btOpenCLSoftBodySolver::solveCollisionsAndUpdateVelocities( float isolverdt )
-	// Copy kernel parameters to GPU
-	m_vertexData.moveToAccelerator();
-	m_clPerClothFriction.moveToGPU();
-	m_clPerClothDampingFactor.moveToGPU();
-	m_clPerClothCollisionObjects.moveToGPU();
-	m_clCollisionObjectDetails.moveToGPU();
-	cl_int ciErrNum;
-	int numVerts = m_vertexData.getNumVertices();
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 0, sizeof(int), &numVerts);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 1, sizeof(int), &isolverdt);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 4, sizeof(cl_mem),&m_clPerClothFriction.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 6, sizeof(cl_mem),&m_clPerClothCollisionObjects.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 7, sizeof(cl_mem),&m_clCollisionObjectDetails.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 8, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 9, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 10, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);
-	size_t	numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	if (numWorkItems)
-	{
-		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_solveCollisionsAndUpdateVelocitiesKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize,0,0,0);
-		if( ciErrNum != CL_SUCCESS ) 
-		{
-			btAssert( 0 &&  "enqueueNDRangeKernel(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel)");
-		}
-	}
-} // btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities
-// End kernel dispatches
-void btSoftBodySolverOutputCLtoCPU::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer )
-	btSoftBodySolver *solver = softBody->getSoftBodySolver();
-	btAssert( solver->getSolverType() == btSoftBodySolver::CL_SOLVER || solver->getSolverType() == btSoftBodySolver::CL_SIMD_SOLVER );
-	btOpenCLSoftBodySolver *dxSolver = static_cast< btOpenCLSoftBodySolver * >( solver );
-	btOpenCLAcceleratedSoftBodyInterface* currentCloth = dxSolver->findSoftBodyInterface( softBody );
-	btSoftBodyVertexDataOpenCL &vertexData( dxSolver->m_vertexData );
-	const int firstVertex = currentCloth->getFirstVertex();
-	const int lastVertex = firstVertex + currentCloth->getNumVertices();
-	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::CPU_BUFFER )
-	{		
-		const btCPUVertexBufferDescriptor *cpuVertexBuffer = static_cast< btCPUVertexBufferDescriptor* >(vertexBuffer);						
-		float *basePointer = cpuVertexBuffer->getBasePointer();						
-		vertexData.m_clVertexPosition.copyFromGPU();
-		vertexData.m_clVertexNormal.copyFromGPU();
-		if( vertexBuffer->hasVertexPositions() )
-		{
-			const int vertexOffset = cpuVertexBuffer->getVertexOffset();
-			const int vertexStride = cpuVertexBuffer->getVertexStride();
-			float *vertexPointer = basePointer + vertexOffset;
-			for( int vertexIndex = firstVertex; vertexIndex < lastVertex; ++vertexIndex )
-			{
-				Vectormath::Aos::Point3 position = vertexData.getPosition(vertexIndex);
-				*(vertexPointer + 0) = position.getX();
-				*(vertexPointer + 1) = position.getY();
-				*(vertexPointer + 2) = position.getZ();
-				vertexPointer += vertexStride;
-			}
-		}
-		if( vertexBuffer->hasNormals() )
-		{
-			const int normalOffset = cpuVertexBuffer->getNormalOffset();
-			const int normalStride = cpuVertexBuffer->getNormalStride();
-			float *normalPointer = basePointer + normalOffset;
-			for( int vertexIndex = firstVertex; vertexIndex < lastVertex; ++vertexIndex )
-			{
-				Vectormath::Aos::Vector3 normal = vertexData.getNormal(vertexIndex);
-				*(normalPointer + 0) = normal.getX();
-				*(normalPointer + 1) = normal.getY();
-				*(normalPointer + 2) = normal.getZ();
-				normalPointer += normalStride;
-			}
-		}
-	}
-} // btSoftBodySolverOutputCLtoCPU::outputToVertexBuffers
-cl_kernel CLFunctions::compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros ,const char* orgSrcFileNameForCaching)
-	printf("compiling kernelName: %s ",kernelName);
-	cl_kernel kernel=0;
-	cl_int ciErrNum;
-	size_t program_length = strlen(kernelSource);
-	cl_program m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&kernelSource, &program_length, &ciErrNum);
-    // Build the program with 'mad' Optimization option
-#ifdef MAC
-	char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
-	//const char* flags = "-DGUID_ARG= -fno-alias";
-	const char* flags = "-DGUID_ARG= ";
-	char* compileFlags = new char[strlen(additionalMacros) + strlen(flags) + 5];
-	sprintf(compileFlags, "%s %s", flags, additionalMacros);
-    ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, compileFlags, NULL, NULL);
-    if (ciErrNum != CL_SUCCESS)
-    {
-		size_t numDevices;
-		clGetProgramInfo( m_cpProgram, CL_PROGRAM_DEVICES, 0, 0, &numDevices );
-		cl_device_id *devices = new cl_device_id[numDevices];
-		clGetProgramInfo( m_cpProgram, CL_PROGRAM_DEVICES, numDevices, devices, &numDevices );
-        for( int i = 0; i < 2; ++i )
-		{
-			char *build_log;
-			size_t ret_val_size;
-			clGetProgramBuildInfo(m_cpProgram, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
-			build_log = new char[ret_val_size+1];
-			clGetProgramBuildInfo(m_cpProgram, devices[i], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-			// to be carefully, terminate with \0
-			// there's no information in the reference whether the string is 0 terminated or not
-			build_log[ret_val_size] = '\0';
-			printf("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
-			delete[] build_log;
-		}
-		btAssert(0);
-		m_kernelCompilationFailures++;
-		return 0;
-    }
-    // Create the kernel
-    kernel = clCreateKernel(m_cpProgram, kernelName, &ciErrNum);
-    if (ciErrNum != CL_SUCCESS)
-    {
-		const char* msg = "";
-        switch(ciErrNum)
-        {
-        case CL_INVALID_PROGRAM:
-            msg = "Program is not a valid program object.";
-            break;
-            msg = "There is no successfully built executable for program.";
-            break;
-            msg = "kernel_name is not found in program.";
-            break;
-            msg = "the function definition for __kernel function given by kernel_name such as the number of arguments, the argument types are not the same for all devices for which the program executable has been built.";
-            break;
-        case CL_INVALID_VALUE:
-            msg = "kernel_name is NULL.";
-            break;
-        case CL_OUT_OF_HOST_MEMORY:
-            msg = "Failure to allocate resources required by the OpenCL implementation on the host.";
-            break;
-		default:
-			{
-			}
-        }
-        printf("Error in clCreateKernel for kernel '%s', error is \"%s\", Line %u in file %s !!!\n\n", kernelName, msg, __LINE__, __FILE__);
-		btAssert(0);
-		m_kernelCompilationFailures++;
-		return 0;
-    }
-	printf("ready. \n");
-	delete [] compileFlags;
-	if (!kernel)
-		m_kernelCompilationFailures++;
-	return kernel;
-void btOpenCLSoftBodySolver::predictMotion( float timeStep )
-	// Clear the collision shape array for the next frame
-	// Ensure that the DX11 ones are moved off the device so they will be updated correctly
-	m_clCollisionObjectDetails.changedOnCPU();
-	m_clPerClothCollisionObjects.changedOnCPU();
-	m_collisionObjectDetails.clear();	
-	if ( m_bUpdateAnchoredNodePos )
-	{
-		// In OpenCL cloth solver, if softbody node has zero inverse mass(infinite mass) or anchor attached, 
-		// we need to update the node position in case the node or anchor is animated externally.
-		// If there is no such node, we can eliminate the unnecessary CPU-to-GPU data trasferring. 
-		for ( int i = 0; i < m_anchorNodeInfoArray.size(); i++ )
-		{
-			const AnchorNodeInfoCL& anchorNodeInfo = m_anchorNodeInfoArray[i];
-			btSoftBody::Node* node = anchorNodeInfo.pNode;
-			using Vectormath::Aos::Point3;
-			Point3 pos((float)node->m_x.getX(), (float)node->m_x.getY(), (float)node->m_x.getZ());				
-			m_anchorPosition[i] = pos;
-		}
-		if ( m_anchorNodeInfoArray.size() > 0 )
-			m_clAnchorPosition.changedOnCPU();
-		updateFixedVertexPositions();
-	}
-	{
-		BT_PROFILE("applyForces");
-		// Apply forces that we know about to the cloths
-		applyForces(  timeStep * getTimeScale() );
-	}
-	{
-		BT_PROFILE("integrate");
-		// Itegrate motion for all soft bodies dealt with by the solver
-		integrate( timeStep * getTimeScale() );
-	}
-	{
-		BT_PROFILE("updateBounds");
-		updateBounds();
-	}
-	// End prediction work for solvers
-static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform )
-	Vectormath::Aos::Transform3 outTransform;
-	outTransform.setCol(0, toVector3(transform.getBasis().getColumn(0)));
-	outTransform.setCol(1, toVector3(transform.getBasis().getColumn(1)));
-	outTransform.setCol(2, toVector3(transform.getBasis().getColumn(2)));
-	outTransform.setCol(3, toVector3(transform.getOrigin()));
-	return outTransform;	
-void btOpenCLAcceleratedSoftBodyInterface::updateBounds( const btVector3 &lowerBound, const btVector3 &upperBound )
-	float scalarMargin = (float)getSoftBody()->getCollisionShape()->getMargin();
-	btVector3 vectorMargin( scalarMargin, scalarMargin, scalarMargin );
-	m_softBody->m_bounds[0] = lowerBound - vectorMargin;
-	m_softBody->m_bounds[1] = upperBound + vectorMargin;
-}  // btOpenCLSoftBodySolver::btDX11AcceleratedSoftBodyInterface::updateBounds
-void btOpenCLSoftBodySolver::processCollision( btSoftBody*, btSoftBody* )
-// Add the collision object to the set to deal with for a particular soft body
-void btOpenCLSoftBodySolver::processCollision( btSoftBody *softBody, btCollisionObject* collisionObject )
- 	int softBodyIndex = findSoftBodyIndex( softBody );
-	if( softBodyIndex >= 0 )
-	{
-		btCollisionShape *collisionShape = collisionObject->getCollisionShape();
-		float friction = collisionObject->getFriction();
-		int shapeType = collisionShape->getShapeType();
-		if( shapeType == CAPSULE_SHAPE_PROXYTYPE )
-		{
-			// Add to the list of expected collision objects
-			CollisionShapeDescription newCollisionShapeDescription;
-			newCollisionShapeDescription.softBodyIdentifier = softBodyIndex;
-			newCollisionShapeDescription.collisionShapeType = shapeType;
-			// TODO: May need to transpose this matrix either here or in HLSL
-			newCollisionShapeDescription.shapeTransform = toTransform3(collisionObject->getWorldTransform());
-			btCapsuleShape *capsule = static_cast<btCapsuleShape*>( collisionShape );
-			newCollisionShapeDescription.radius = capsule->getRadius();
-			newCollisionShapeDescription.halfHeight = capsule->getHalfHeight();
-			newCollisionShapeDescription.margin = capsule->getMargin();
-			newCollisionShapeDescription.upAxis = capsule->getUpAxis();
-			newCollisionShapeDescription.friction = friction;
-			btRigidBody* body = static_cast< btRigidBody* >( collisionObject );
-			newCollisionShapeDescription.linearVelocity = toVector3(body->getLinearVelocity());
-			newCollisionShapeDescription.angularVelocity = toVector3(body->getAngularVelocity());
-			m_collisionObjectDetails.push_back( newCollisionShapeDescription );
-		} 		
-		else {
-#ifdef _DEBUG
-			printf("Unsupported collision shape type\n");
-			//btAssert(0 && "Unsupported collision shape type\n");
-		}
-	} else {
-		btAssert(0 && "Unknown soft body");
-	}
-} // btOpenCLSoftBodySolver::processCollision
-btOpenCLAcceleratedSoftBodyInterface* btOpenCLSoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		btOpenCLAcceleratedSoftBodyInterface* softBodyInterface = m_softBodySet[softBodyIndex];
-		if( softBodyInterface->getSoftBody() == softBody )
-			return softBodyInterface;
-	}
-	return 0;
-int btOpenCLSoftBodySolver::findSoftBodyIndex( const btSoftBody* const softBody )
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-	{
-		btOpenCLAcceleratedSoftBodyInterface* softBodyInterface = m_softBodySet[softBodyIndex];
-		if( softBodyInterface->getSoftBody() == softBody )
-			return softBodyIndex;
-	}
-	return 1;
-bool btOpenCLSoftBodySolver::checkInitialized()
-	if( !m_shadersInitialized )
-		if( buildShaders() )
-			m_shadersInitialized = true;
-	return m_shadersInitialized;
-bool btOpenCLSoftBodySolver::buildShaders()
-	if( m_shadersInitialized )
-		return true;
-	const char* additionalMacros="";
-	// Ensure current kernels are released first
-	releaseKernels();
-	m_currentCLFunctions->clearKernelCompilationFailures();
-	m_prepareLinksKernel = m_currentCLFunctions->compileCLKernelFromString( PrepareLinksCLString, "PrepareLinksKernel",additionalMacros,"OpenCLC10/PrepareLinks.cl" );
-	m_updatePositionsFromVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel" ,additionalMacros,"OpenCLC10/UpdatePositionsFromVelocities.cl");
-	m_solvePositionsFromLinksKernel = m_currentCLFunctions->compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel",additionalMacros,"OpenCLC10/SolvePositions.cl" );
-	m_vSolveLinksKernel = m_currentCLFunctions->compileCLKernelFromString( VSolveLinksCLString, "VSolveLinksKernel" ,additionalMacros,"OpenCLC10/VSolveLinks.cl");
-	m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel" ,additionalMacros,"OpenCLC10/UpdateNodes.cl");
-	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel" ,additionalMacros,"OpenCLC10/UpdatePositions.cl");
-	m_solveCollisionsAndUpdateVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel" ,additionalMacros,"OpenCLC10/SolveCollisionsAndUpdateVelocities.cl");
-	m_integrateKernel = m_currentCLFunctions->compileCLKernelFromString( IntegrateCLString, "IntegrateKernel" ,additionalMacros,"OpenCLC10/Integrate.cl");
-	m_applyForcesKernel = m_currentCLFunctions->compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel" ,additionalMacros,"OpenCLC10/ApplyForces.cl");
-	m_updateFixedVertexPositionsKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateFixedVertexPositionsCLString, "UpdateFixedVertexPositions" , additionalMacros, "OpenCLC10/UpdateFixedVertexPositions.cl");
-	// TODO: Rename to UpdateSoftBodies
-	m_resetNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel" ,additionalMacros,"OpenCLC10/UpdateNormals.cl");
-	m_normalizeNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel" ,additionalMacros,"OpenCLC10/UpdateNormals.cl");
-	m_updateSoftBodiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel" ,additionalMacros,"OpenCLC10/UpdateNormals.cl");
-	if( m_currentCLFunctions->getKernelCompilationFailures()==0 )
-		m_shadersInitialized = true;
-	return m_shadersInitialized;
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
deleted file mode 100644
index cc8db089..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
+++ /dev/null
@@ -1,527 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "stddef.h" //for size_t
-#include "vectormath/vmInclude.h"
-#include "BulletSoftBody/btSoftBodySolvers.h"
-#include "BulletSoftBody/btSoftBody.h"
-#include "btSoftBodySolverBuffer_OpenCL.h"
-#include "btSoftBodySolverLinkData_OpenCL.h"
-#include "btSoftBodySolverVertexData_OpenCL.h"
-#include "btSoftBodySolverTriangleData_OpenCL.h"
-class CLFunctions
-	cl_command_queue	m_cqCommandQue;
-	cl_context			m_cxMainContext;
-	int	m_kernelCompilationFailures;
-	CLFunctions(cl_command_queue cqCommandQue, cl_context cxMainContext) :
-		m_cqCommandQue( cqCommandQue ),
-		m_cxMainContext( cxMainContext ),
-		m_kernelCompilationFailures(0)
-	{
-	}
-	int getKernelCompilationFailures() const
-	{
-		return m_kernelCompilationFailures;
-	}
-	/**
-	 * Compile a compute shader kernel from a string and return the appropriate cl_kernel object.
-	 */	
-	virtual cl_kernel compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros, const char* srcFileNameForCaching);
-	void	clearKernelCompilationFailures()
-	{
-		m_kernelCompilationFailures=0;
-	}
- * Entry in the collision shape array.
- * Specifies the shape type, the transform matrix and the necessary details of the collisionShape.
- */
-struct CollisionShapeDescription
-	Vectormath::Aos::Transform3 shapeTransform;
-	Vectormath::Aos::Vector3 linearVelocity;
-	Vectormath::Aos::Vector3 angularVelocity;
-	int softBodyIdentifier;
-	int collisionShapeType;
-	// Both needed for capsule
-	float radius;
-	float halfHeight;
-	int upAxis;
-	float margin;
-	float friction;
-	CollisionShapeDescription()
-	{
-		collisionShapeType = 0;
-		margin = 0;
-		friction = 0;
-	}
-	 * SoftBody class to maintain information about a soft body instance
-	 * within a solver.
-	 * This data addresses the main solver arrays.
-	 */
-class btOpenCLAcceleratedSoftBodyInterface
-	/** Current number of vertices that are part of this cloth */
-	int m_numVertices;
-	/** Maximum number of vertices allocated to be part of this cloth */
-	int m_maxVertices;
-	/** Current number of triangles that are part of this cloth */
-	int m_numTriangles;
-	/** Maximum number of triangles allocated to be part of this cloth */
-	int m_maxTriangles;
-	/** Index of first vertex in the world allocated to this cloth */
-	int m_firstVertex;
-	/** Index of first triangle in the world allocated to this cloth */
-	int m_firstTriangle;
-	/** Index of first link in the world allocated to this cloth */
-	int m_firstLink;
-	/** Maximum number of links allocated to this cloth */
-	int m_maxLinks;
-	/** Current number of links allocated to this cloth */
-	int m_numLinks;
-	/** The actual soft body this data represents */
-	btSoftBody *m_softBody;
-	btOpenCLAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
-	  m_softBody( softBody )
-	{
-		m_numVertices = 0;
-		m_maxVertices = 0;
-		m_numTriangles = 0;
-		m_maxTriangles = 0;
-		m_firstVertex = 0;
-		m_firstTriangle = 0;
-		m_firstLink = 0;
-		m_maxLinks = 0;
-		m_numLinks = 0;
-	}
-	int getNumVertices()
-	{
-		return m_numVertices;
-	}
-	int getNumTriangles()
-	{
-		return m_numTriangles;
-	}
-	int getMaxVertices()
-	{
-		return m_maxVertices;
-	}
-	int getMaxTriangles()
-	{
-		return m_maxTriangles;
-	}
-	int getFirstVertex()
-	{
-		return m_firstVertex;
-	}
-	int getFirstTriangle()
-	{
-		return m_firstTriangle;
-	}
-	/**
-	 * Update the bounds in the btSoftBody object
-	 */
-	void updateBounds( const btVector3 &lowerBound, const btVector3 &upperBound );
-	// TODO: All of these set functions will have to do checks and
-	// update the world because restructuring of the arrays will be necessary
-	// Reasonable use of "friend"?
-	void setNumVertices( int numVertices )
-	{
-		m_numVertices = numVertices;
-	}	
-	void setNumTriangles( int numTriangles )
-	{
-		m_numTriangles = numTriangles;
-	}
-	void setMaxVertices( int maxVertices )
-	{
-		m_maxVertices = maxVertices;
-	}
-	void setMaxTriangles( int maxTriangles )
-	{
-		m_maxTriangles = maxTriangles;
-	}
-	void setFirstVertex( int firstVertex )
-	{
-		m_firstVertex = firstVertex;
-	}
-	void setFirstTriangle( int firstTriangle )
-	{
-		m_firstTriangle = firstTriangle;
-	}
-	void setMaxLinks( int maxLinks )
-	{
-		m_maxLinks = maxLinks;
-	}
-	void setNumLinks( int numLinks )
-	{
-		m_numLinks = numLinks;
-	}
-	void setFirstLink( int firstLink )
-	{
-		m_firstLink = firstLink;
-	}
-	int getMaxLinks()
-	{
-		return m_maxLinks;
-	}
-	int getNumLinks()
-	{
-		return m_numLinks;
-	}
-	int getFirstLink()
-	{
-		return m_firstLink;
-	}
-	btSoftBody* getSoftBody()
-	{
-		return m_softBody;
-	}
-class btOpenCLSoftBodySolver : public btSoftBodySolver
-	struct UIntVector3
-	{
-		UIntVector3()
-		{
-			x = 0;
-			y = 0;
-			z = 0;
-			_padding = 0;
-		}
-		UIntVector3( unsigned int x_, unsigned int y_, unsigned int z_ )
-		{
-			x = x_;
-			y = y_;
-			z = z_;
-			_padding = 0;
-		}
-		unsigned int x;
-		unsigned int y;
-		unsigned int z;
-		unsigned int _padding;
-	};
-	struct CollisionObjectIndices
-	{
-		CollisionObjectIndices( int f, int e )
-		{
-			firstObject = f;
-			endObject = e;
-		}
-		int firstObject;
-		int endObject;
-	};
-	btSoftBodyLinkDataOpenCL m_linkData;
-	btSoftBodyVertexDataOpenCL m_vertexData;
-	btSoftBodyTriangleDataOpenCL m_triangleData;
-	CLFunctions m_defaultCLFunctions;
-	CLFunctions* m_currentCLFunctions;
-	/** Variable to define whether we need to update solver constants on the next iteration */
-	bool m_updateSolverConstants;
-	bool m_shadersInitialized;
-	/** 
-	 * Cloths owned by this solver.
-	 * Only our cloths are in this array.
-	 */
-	btAlignedObjectArray< btOpenCLAcceleratedSoftBodyInterface * > m_softBodySet;
-	/** Acceleration value to be applied to all non-static vertices in the solver. 
-	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
-	 */
-	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothAcceleration;
-	btOpenCLBuffer<Vectormath::Aos::Vector3>			m_clPerClothAcceleration;
-	/** Wind velocity to be applied normal to all non-static vertices in the solver. 
-	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
-	 */
-	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothWindVelocity;
-	btOpenCLBuffer<Vectormath::Aos::Vector3>			m_clPerClothWindVelocity;
-	/** Velocity damping factor */
-	btAlignedObjectArray< float >						m_perClothDampingFactor;
-	btOpenCLBuffer<float>								m_clPerClothDampingFactor;
-	/** Velocity correction coefficient */
-	btAlignedObjectArray< float >						m_perClothVelocityCorrectionCoefficient;
-	btOpenCLBuffer<float>								m_clPerClothVelocityCorrectionCoefficient;
-	/** Lift parameter for wind effect on cloth. */
-	btAlignedObjectArray< float >						m_perClothLiftFactor;
-	btOpenCLBuffer<float>								m_clPerClothLiftFactor;
-	/** Drag parameter for wind effect on cloth. */
-	btAlignedObjectArray< float >						m_perClothDragFactor;
-	btOpenCLBuffer<float>								m_clPerClothDragFactor;
-	/** Density of the medium in which each cloth sits */
-	btAlignedObjectArray< float >						m_perClothMediumDensity;
-	btOpenCLBuffer<float>								m_clPerClothMediumDensity;
-	/** 
-	 * Collision shape details: pair of index of first collision shape for the cloth and number of collision objects.
-	 */
-	btAlignedObjectArray< CollisionObjectIndices >		m_perClothCollisionObjects;
-	btOpenCLBuffer<CollisionObjectIndices>				m_clPerClothCollisionObjects;
-	/** 
-	 * Collision shapes being passed across to the cloths in this solver.
-	 */
-	btAlignedObjectArray< CollisionShapeDescription >	m_collisionObjectDetails;
-	btOpenCLBuffer< CollisionShapeDescription >			m_clCollisionObjectDetails;
-	/** 
-	 * Friction coefficient for each cloth
-	 */
-	btAlignedObjectArray< float >	m_perClothFriction;
-	btOpenCLBuffer< float >			m_clPerClothFriction;
-	// anchor node info
-	struct AnchorNodeInfoCL
-	{
-		int clVertexIndex;
-		btSoftBody::Node* pNode;
-	};
-	btAlignedObjectArray<AnchorNodeInfoCL> m_anchorNodeInfoArray;
-	btAlignedObjectArray<Vectormath::Aos::Point3> m_anchorPosition;
-	btOpenCLBuffer<Vectormath::Aos::Point3>		  m_clAnchorPosition;
-	btAlignedObjectArray<int> m_anchorIndex;
-	btOpenCLBuffer<int>		  m_clAnchorIndex;
-	bool m_bUpdateAnchoredNodePos;
-	cl_kernel		m_prepareLinksKernel;
-	cl_kernel		m_solvePositionsFromLinksKernel;
-	cl_kernel		m_updateConstantsKernel;
-	cl_kernel		m_integrateKernel;
-	cl_kernel		m_addVelocityKernel;
-	cl_kernel		m_updatePositionsFromVelocitiesKernel;
-	cl_kernel		m_updateVelocitiesFromPositionsWithoutVelocitiesKernel;
-	cl_kernel		m_updateVelocitiesFromPositionsWithVelocitiesKernel;
-	cl_kernel		m_vSolveLinksKernel;
-	cl_kernel		m_solveCollisionsAndUpdateVelocitiesKernel;
-	cl_kernel		m_resetNormalsAndAreasKernel;
-	cl_kernel		m_normalizeNormalsAndAreasKernel;
-	cl_kernel		m_updateSoftBodiesKernel;
-	cl_kernel		m_outputToVertexArrayKernel;
-	cl_kernel		m_applyForcesKernel;
-	cl_kernel       m_updateFixedVertexPositionsKernel;	
-	cl_command_queue	m_cqCommandQue;
-	cl_context			m_cxMainContext;
-	size_t				m_defaultWorkGroupSize;
-	virtual bool buildShaders();
-	void resetNormalsAndAreas( int numVertices );
-	void normalizeNormalsAndAreas( int numVertices );
-	void executeUpdateSoftBodies( int firstTriangle, int numTriangles );
-	void prepareCollisionConstraints();
-	Vectormath::Aos::Vector3 ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a );
-	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );
-	int findSoftBodyIndex( const btSoftBody* const softBody );
-	virtual void applyForces( float solverdt );
-	void updateFixedVertexPositions();
-	/**
-	 * Integrate motion on the solver.
-	 */
-	virtual void integrate( float solverdt );
-	virtual void updateConstants( float timeStep );
-	float computeTriangleArea( 
-		const Vectormath::Aos::Point3 &vertex0,
-		const Vectormath::Aos::Point3 &vertex1,
-		const Vectormath::Aos::Point3 &vertex2 );
-	//////////////////////////////////////
-	// Kernel dispatches
-	void prepareLinks();
-	void solveLinksForVelocity( int startLink, int numLinks, float kst );
-	void updatePositionsFromVelocities( float solverdt );
-	virtual void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
-	void updateVelocitiesFromPositionsWithVelocities( float isolverdt );
-	void updateVelocitiesFromPositionsWithoutVelocities( float isolverdt );
-	virtual void solveCollisionsAndUpdateVelocities( float isolverdt );
-	// End kernel dispatches
-	/////////////////////////////////////
-	void updateBounds();
-	void releaseKernels();
-	btOpenCLSoftBodySolver(cl_command_queue queue,cl_context	ctx, bool bUpdateAchchoredNodePos = false);
-	virtual ~btOpenCLSoftBodySolver();
-	btOpenCLAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
-	virtual btSoftBodyLinkData &getLinkData();
-	virtual btSoftBodyVertexData &getVertexData();
-	virtual btSoftBodyTriangleData &getTriangleData();
-	virtual SolverTypes getSolverType() const
-	{
-		return CL_SOLVER;
-	}
-	virtual bool checkInitialized();
-	virtual void updateSoftBodies( );
-	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
-	virtual void copyBackToSoftBodies(bool bMove = true);
-	virtual void solveConstraints( float solverdt );
-	virtual void predictMotion( float solverdt );
-	virtual void processCollision( btSoftBody *, btCollisionObject* );
-	virtual void processCollision( btSoftBody*, btSoftBody* );
-	virtual void	setDefaultWorkgroupSize(size_t workGroupSize)
-	{
-		m_defaultWorkGroupSize = workGroupSize;
-	}
-	virtual size_t	getDefaultWorkGroupSize() const
-	{
-		return m_defaultWorkGroupSize;
-	}
-	void	setCLFunctions(CLFunctions* funcs)
-	{
-		if (funcs)
-			m_currentCLFunctions = funcs;
-		else
-			m_currentCLFunctions  = &m_defaultCLFunctions;
-	}
-}; // btOpenCLSoftBodySolver
- * Class to manage movement of data from a solver to a given target.
- * This version is the CL to CPU version.
- */
-class btSoftBodySolverOutputCLtoCPU : public btSoftBodySolverOutput
-	btSoftBodySolverOutputCLtoCPU()
-	{
-	}
-	/** Output current computed vertex data to the vertex buffers for all cloths in the solver. */
-	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
-#endif // #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
deleted file mode 100644
index f97af57f..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
+++ /dev/null
@@ -1,1101 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
-#include "vectormath/vmInclude.h"
-#include <stdio.h> //@todo: remove the debugging printf at some stage
-#include "btSoftBodySolver_OpenCLSIMDAware.h"
-#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
-#include "BulletSoftBody/btSoftBody.h"
-#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
-#include <limits.h>
-#define WAVEFRONT_SIZE 32
-static const size_t workGroupSize = GROUP_SIZE;
-//CL_VERSION_1_1 seems broken on NVidia SDK so just disable it
-////OpenCL 1.0 kernels don't use float3
-#define MSTRINGIFY(A) #A
-static const char* UpdatePositionsFromVelocitiesCLString = 
-#include "OpenCLC10/UpdatePositionsFromVelocities.cl"
-static const char* SolvePositionsCLString = 
-#include "OpenCLC10/SolvePositionsSIMDBatched.cl"
-static const char* UpdateNodesCLString = 
-#include "OpenCLC10/UpdateNodes.cl"
-static const char* UpdatePositionsCLString = 
-#include "OpenCLC10/UpdatePositions.cl"
-static const char* UpdateConstantsCLString = 
-#include "OpenCLC10/UpdateConstants.cl"
-static const char* IntegrateCLString = 
-#include "OpenCLC10/Integrate.cl"
-static const char* ApplyForcesCLString = 
-#include "OpenCLC10/ApplyForces.cl"
-static const char* UpdateFixedVertexPositionsCLString = 
-#include "OpenCLC10/UpdateFixedVertexPositions.cl"
-static const char* UpdateNormalsCLString = 
-#include "OpenCLC10/UpdateNormals.cl"
-static const char* VSolveLinksCLString = 
-#include "OpenCLC10/VSolveLinks.cl"
-static const char* SolveCollisionsAndUpdateVelocitiesCLString =
-#include "OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl"
-static const char* OutputToVertexArrayCLString =
-#include "OpenCLC10/OutputToVertexArray.cl"
-btSoftBodyLinkDataOpenCLSIMDAware::btSoftBodyLinkDataOpenCLSIMDAware(cl_command_queue queue,  cl_context ctx) :
-	m_cqCommandQue(queue),
-	m_wavefrontSize( WAVEFRONT_SIZE ),
-	m_linksPerWorkItem( LINKS_PER_SIMD_LANE ),
-	m_maxBatchesWithinWave( 0 ),
-	m_maxLinksPerWavefront( m_wavefrontSize * m_linksPerWorkItem ),
-	m_numWavefronts( 0 ),
-	m_maxVertex( 0 ),
-	m_clNumBatchesAndVerticesWithinWaves( queue, ctx, &m_numBatchesAndVerticesWithinWaves, true ),
-	m_clWavefrontVerticesGlobalAddresses( queue, ctx, &m_wavefrontVerticesGlobalAddresses, true ),
-	m_clLinkVerticesLocalAddresses( queue, ctx, &m_linkVerticesLocalAddresses, true ),
-	m_clLinkStrength( queue, ctx, &m_linkStrength, false ),
-	m_clLinksMassLSC( queue, ctx, &m_linksMassLSC, false ),
-	m_clLinksRestLengthSquared( queue, ctx, &m_linksRestLengthSquared, false ),
-	m_clLinksRestLength( queue, ctx, &m_linksRestLength, false ),
-	m_clLinksMaterialLinearStiffnessCoefficient( queue, ctx, &m_linksMaterialLinearStiffnessCoefficient, false )
-static Vectormath::Aos::Vector3 toVector3( const btVector3 &vec )
-	Vectormath::Aos::Vector3 outVec( vec.getX(), vec.getY(), vec.getZ() );
-	return outVec;
-/** Allocate enough space in all link-related arrays to fit numLinks links */
-void btSoftBodyLinkDataOpenCLSIMDAware::createLinks( int numLinks )
-	int previousSize = m_links.size();
-	int newSize = previousSize + numLinks;
-	btSoftBodyLinkData::createLinks( numLinks );
-	// Resize the link addresses array as well
-	m_linkAddresses.resize( newSize );
-/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-void btSoftBodyLinkDataOpenCLSIMDAware::setLinkAt( 
-	const LinkDescription &link, 
-	int linkIndex )
-	btSoftBodyLinkData::setLinkAt( link, linkIndex );
-	if( link.getVertex0() > m_maxVertex )
-		m_maxVertex = link.getVertex0();
-	if( link.getVertex1() > m_maxVertex )
-		m_maxVertex = link.getVertex1();
-	// Set the link index correctly for initialisation
-	m_linkAddresses[linkIndex] = linkIndex;
-bool btSoftBodyLinkDataOpenCLSIMDAware::onAccelerator()
-	return m_onGPU;
-bool btSoftBodyLinkDataOpenCLSIMDAware::moveToAccelerator()
-	bool success = true;
-	success = success && m_clNumBatchesAndVerticesWithinWaves.moveToGPU();
-	success = success && m_clWavefrontVerticesGlobalAddresses.moveToGPU();
-	success = success && m_clLinkVerticesLocalAddresses.moveToGPU();
-	success = success && m_clLinkStrength.moveToGPU();
-	success = success && m_clLinksMassLSC.moveToGPU();
-	success = success && m_clLinksRestLengthSquared.moveToGPU();
-	success = success && m_clLinksRestLength.moveToGPU();
-	success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveToGPU();
-	if( success ) {
-		m_onGPU = true;
-	}
-	return success;
-bool btSoftBodyLinkDataOpenCLSIMDAware::moveFromAccelerator()
-	bool success = true;
-	success = success && m_clNumBatchesAndVerticesWithinWaves.moveToGPU();
-	success = success && m_clWavefrontVerticesGlobalAddresses.moveToGPU();
-	success = success && m_clLinkVerticesLocalAddresses.moveToGPU();
-	success = success && m_clLinkStrength.moveFromGPU();
-	success = success && m_clLinksMassLSC.moveFromGPU();
-	success = success && m_clLinksRestLengthSquared.moveFromGPU();
-	success = success && m_clLinksRestLength.moveFromGPU();
-	success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveFromGPU();
-	if( success ) {
-		m_onGPU = false;
-	}
-	return success;
-btOpenCLSoftBodySolverSIMDAware::btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue, cl_context ctx, bool bUpdateAchchoredNodePos) :
-	btOpenCLSoftBodySolver( queue, ctx, bUpdateAchchoredNodePos ),
-	m_linkData(queue, ctx)
-	// Initial we will clearly need to update solver constants
-	// For now this is global for the cloths linked with this solver - we should probably make this body specific 
-	// for performance in future once we understand more clearly when constants need to be updated
-	m_updateSolverConstants = true;
-	m_shadersInitialized = false;
-	releaseKernels();
-void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody * > &softBodies ,bool forceUpdate)
-	if( forceUpdate || m_softBodySet.size() != softBodies.size() )
-	{
-		// Have a change in the soft body set so update, reloading all the data
-		getVertexData().clear();
-		getTriangleData().clear();
-		getLinkData().clear();
-		m_softBodySet.resize(0);
-		m_anchorIndex.clear();
-		int maxPiterations = 0;
-		int maxViterations = 0;
-		for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
-		{
-			btSoftBody *softBody = softBodies[ softBodyIndex ];
-			using Vectormath::Aos::Matrix3;
-			using Vectormath::Aos::Point3;
-			// Create SoftBody that will store the information within the solver
-			btOpenCLAcceleratedSoftBodyInterface* newSoftBody = new btOpenCLAcceleratedSoftBodyInterface( softBody );
-			m_softBodySet.push_back( newSoftBody );
-			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
-			m_perClothDampingFactor.push_back(softBody->m_cfg.kDP);
-			m_perClothVelocityCorrectionCoefficient.push_back( softBody->m_cfg.kVCF );
-			m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );
-			m_perClothDragFactor.push_back( softBody->m_cfg.kDG );
-			m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);
-			// Simple init values. Actually we'll put 0 and -1 into them at the appropriate time
-			m_perClothFriction.push_back(softBody->m_cfg.kDF);
-			m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) );
-			// Add space for new vertices and triangles in the default solver for now
-			// TODO: Include space here for tearing too later
-			int firstVertex = getVertexData().getNumVertices();
-			int numVertices = softBody->m_nodes.size();
-			// Round maxVertices to a multiple of the workgroup size so we know we're safe to run over in a given group
-			// maxVertices can be increased to allow tearing, but should be used sparingly because these extra verts will always be processed
-			int maxVertices = GROUP_SIZE*((numVertices+GROUP_SIZE)/GROUP_SIZE);
-			// Allocate space for new vertices in all the vertex arrays
-			getVertexData().createVertices( numVertices, softBodyIndex, maxVertices );
-			int firstTriangle = getTriangleData().getNumTriangles();
-			int numTriangles = softBody->m_faces.size();
-			int maxTriangles = numTriangles;
-			getTriangleData().createTriangles( maxTriangles );
-			// Copy vertices from softbody into the solver
-			for( int vertex = 0; vertex < numVertices; ++vertex )
-			{
-				Point3 multPoint(softBody->m_nodes[vertex].m_x.getX(), softBody->m_nodes[vertex].m_x.getY(), softBody->m_nodes[vertex].m_x.getZ());
-				btSoftBodyVertexData::VertexDescription desc;
-				// TODO: Position in the softbody might be pre-transformed
-				// or we may need to adapt for the pose.
-				//desc.setPosition( cloth.getMeshTransform()*multPoint );
-				desc.setPosition( multPoint );
-				float vertexInverseMass = softBody->m_nodes[vertex].m_im;
-				desc.setInverseMass(vertexInverseMass);
-				getVertexData().setVertexAt( desc, firstVertex + vertex );
-				m_anchorIndex.push_back(-1.0);
-			}
-			for( int vertex = numVertices; vertex < maxVertices; ++vertex )
-			{
-				m_anchorIndex.push_back(-1.0);
-			}
-			// Copy triangles similarly
-			// We're assuming here that vertex indices are based on the firstVertex rather than the entire scene
-			for( int triangle = 0; triangle < numTriangles; ++triangle )
-			{
-				// Note that large array storage is relative to the array not to the cloth
-				// So we need to add firstVertex to each value
-				int vertexIndex0 = (softBody->m_faces[triangle].m_n[0] - &(softBody->m_nodes[0]));
-				int vertexIndex1 = (softBody->m_faces[triangle].m_n[1] - &(softBody->m_nodes[0]));
-				int vertexIndex2 = (softBody->m_faces[triangle].m_n[2] - &(softBody->m_nodes[0]));
-				btSoftBodyTriangleData::TriangleDescription newTriangle(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, vertexIndex2 + firstVertex);
-				getTriangleData().setTriangleAt( newTriangle, firstTriangle + triangle );
-				// Increase vertex triangle counts for this triangle		
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex0)++;
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex1)++;
-				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex2)++;
-			}
-			int firstLink = getLinkData().getNumLinks();
-			int numLinks = softBody->m_links.size();
-			int maxLinks = numLinks;
-			// Allocate space for the links
-			getLinkData().createLinks( numLinks );
-			// Add the links
-			for( int link = 0; link < numLinks; ++link )
-			{
-				int vertexIndex0 = softBody->m_links[link].m_n[0] - &(softBody->m_nodes[0]);
-				int vertexIndex1 = softBody->m_links[link].m_n[1] - &(softBody->m_nodes[0]);
-				btSoftBodyLinkData::LinkDescription newLink(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, softBody->m_links[link].m_material->m_kLST);
-				newLink.setLinkStrength(1.f);
-				getLinkData().setLinkAt(newLink, firstLink + link);
-			}
-			newSoftBody->setFirstVertex( firstVertex );
-			newSoftBody->setFirstTriangle( firstTriangle );
-			newSoftBody->setNumVertices( numVertices );
-			newSoftBody->setMaxVertices( maxVertices );
-			newSoftBody->setNumTriangles( numTriangles );
-			newSoftBody->setMaxTriangles( maxTriangles );
-			newSoftBody->setFirstLink( firstLink );
-			newSoftBody->setNumLinks( numLinks );
-			// Find maximum piterations and viterations
-			int piterations = softBody->m_cfg.piterations;
-            if ( piterations > maxPiterations )
-                  maxPiterations = piterations;
-            int viterations = softBody->m_cfg.viterations;
-			if ( viterations > maxViterations )
-                  maxViterations = viterations;
-			// zero mass
-			for( int vertex = 0; vertex < numVertices; ++vertex )
-			{
-				if ( softBody->m_nodes[vertex].m_im == 0 )
-				{
-					AnchorNodeInfoCL nodeInfo;
-					nodeInfo.clVertexIndex = firstVertex + vertex;
-					nodeInfo.pNode = &softBody->m_nodes[vertex];
-					m_anchorNodeInfoArray.push_back(nodeInfo);
-				}
-			}			
-			// anchor position
-			if ( numVertices > 0 )
-			{
-				for ( int anchorIndex = 0; anchorIndex < softBody->m_anchors.size(); anchorIndex++ )
-				{
-					btSoftBody::Node* anchorNode = softBody->m_anchors[anchorIndex].m_node;
-					btSoftBody::Node* firstNode = &softBody->m_nodes[0];
-					AnchorNodeInfoCL nodeInfo;
-					nodeInfo.clVertexIndex = firstVertex + (int)(anchorNode - firstNode);
-					nodeInfo.pNode = anchorNode;
-					m_anchorNodeInfoArray.push_back(nodeInfo);
-				}
-			}			
-		}
-		m_anchorPosition.clear();		
-		m_anchorPosition.resize(m_anchorNodeInfoArray.size());
-		for ( int anchorNode = 0; anchorNode < m_anchorNodeInfoArray.size(); anchorNode++ )
-		{
-			const AnchorNodeInfoCL& anchorNodeInfo = m_anchorNodeInfoArray[anchorNode];
-			m_anchorIndex[anchorNodeInfo.clVertexIndex] = anchorNode;
-			getVertexData().getInverseMass(anchorNodeInfo.clVertexIndex) = 0.0f;
-		}
-		updateConstants(0.f);
-		// set position and velocity iterations
-		setNumberOfPositionIterations(maxPiterations);
-		setNumberOfVelocityIterations(maxViterations);
-		// set wind velocity
-		m_perClothWindVelocity.resize( m_softBodySet.size() );
-		for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
-		{
-			btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();			
-			m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
-		}
-		m_clPerClothWindVelocity.changedOnCPU();
-		// generate batches
-		m_linkData.generateBatches();		
-		m_triangleData.generateBatches();
-		// Build the shaders to match the batching parameters
-		buildShaders();
-	}
-btSoftBodyLinkData &btOpenCLSoftBodySolverSIMDAware::getLinkData()
-	// TODO: Consider setting link data to "changed" here
-	return m_linkData;
-void btOpenCLSoftBodySolverSIMDAware::updateConstants( float timeStep )
-	using namespace Vectormath::Aos;
-	if( m_updateSolverConstants )
-	{
-		m_updateSolverConstants = false;
-		// Will have to redo this if we change the structure (tear, maybe) or various other possible changes
-		// Initialise link constants
-		const int numLinks = m_linkData.getNumLinks();
-		for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
-		{
-			btSoftBodyLinkData::LinkNodePair &vertices( m_linkData.getVertexPair(linkIndex) );
-			m_linkData.getRestLength(linkIndex) = length((m_vertexData.getPosition( vertices.vertex0 ) - m_vertexData.getPosition( vertices.vertex1 )));
-			float invMass0 = m_vertexData.getInverseMass(vertices.vertex0);
-			float invMass1 = m_vertexData.getInverseMass(vertices.vertex1);
-			float linearStiffness = m_linkData.getLinearStiffnessCoefficient(linkIndex);
-			float massLSC = (invMass0 + invMass1)/linearStiffness;
-			m_linkData.getMassLSC(linkIndex) = massLSC;
-			float restLength = m_linkData.getRestLength(linkIndex);
-			float restLengthSquared = restLength*restLength;
-			m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared;
-		}
-	}
-void btOpenCLSoftBodySolverSIMDAware::solveConstraints( float solverdt )
-	using Vectormath::Aos::Vector3;
-	using Vectormath::Aos::Point3;
-	using Vectormath::Aos::lengthSqr;
-	using Vectormath::Aos::dot;
-	// Prepare links
-	int numLinks = m_linkData.getNumLinks();
-	int numVertices = m_vertexData.getNumVertices();
-	float kst = 1.f;
-	float ti = 0.f;
-	m_clPerClothDampingFactor.moveToGPU();
-	m_clPerClothVelocityCorrectionCoefficient.moveToGPU();
-	// Ensure data is on accelerator
-	m_linkData.moveToAccelerator();
-	m_vertexData.moveToAccelerator();
-	//prepareLinks();	
-	prepareCollisionConstraints();
-	// Solve drift
-	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
-	{
-		for( int i = 0; i < m_linkData.m_wavefrontBatchStartLengths.size(); ++i )
-		{
-			int startWave = m_linkData.m_wavefrontBatchStartLengths[i].start;
-			int numWaves = m_linkData.m_wavefrontBatchStartLengths[i].length;
-			solveLinksForPosition( startWave, numWaves, kst, ti );
-		}
-	} // for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
-	// At this point assume that the force array is blank - we will overwrite it
-	solveCollisionsAndUpdateVelocities( 1.f/solverdt );
-// Kernel dispatches
-void btOpenCLSoftBodySolverSIMDAware::solveLinksForPosition( int startWave, int numWaves, float kst, float ti )
-	cl_int ciErrNum;
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,0, sizeof(int), &startWave);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,1, sizeof(int), &numWaves);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,2, sizeof(float), &kst);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,3, sizeof(float), &ti);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clNumBatchesAndVerticesWithinWaves.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clWavefrontVerticesGlobalAddresses.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,6, sizeof(cl_mem), &m_linkData.m_clLinkVerticesLocalAddresses.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,7, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,8, sizeof(cl_mem), &m_linkData.m_clLinksRestLengthSquared.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,9, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,10, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,11, WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_int2), 0);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,12, m_linkData.getMaxVerticesPerWavefront()*WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_float4), 0);
-	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,13, m_linkData.getMaxVerticesPerWavefront()*WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_float), 0);
-	size_t	numWorkItems = workGroupSize*((numWaves*WAVEFRONT_SIZE + (workGroupSize-1)) / workGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_solvePositionsFromLinksKernel,1,NULL,&numWorkItems,&workGroupSize,0,0,0);
-	if( ciErrNum!= CL_SUCCESS ) 
-	{
-		btAssert( 0 &&  "enqueueNDRangeKernel(m_solvePositionsFromLinksKernel)");
-	}
-} // solveLinksForPosition
-void btOpenCLSoftBodySolverSIMDAware::solveCollisionsAndUpdateVelocities( float isolverdt )
-	// Copy kernel parameters to GPU
-	m_vertexData.moveToAccelerator();
-	m_clPerClothFriction.moveToGPU();
-	m_clPerClothDampingFactor.moveToGPU();
-	m_clPerClothCollisionObjects.moveToGPU();
-	m_clCollisionObjectDetails.moveToGPU();
-	cl_int ciErrNum;
-	int numVerts = m_vertexData.getNumVertices();
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 0, sizeof(int), &numVerts);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 1, sizeof(int), &isolverdt);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 4, sizeof(cl_mem),&m_clPerClothFriction.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 6, sizeof(cl_mem),&m_clPerClothCollisionObjects.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 7, sizeof(cl_mem),&m_clCollisionObjectDetails.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 8, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 9, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 10, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 11, sizeof(CollisionShapeDescription)*16,0);
-	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 12, sizeof(cl_mem),&m_vertexData.m_clVertexInverseMass.m_buffer);
-	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-	if (numWorkItems)
-	{
-		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_solveCollisionsAndUpdateVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
-		if( ciErrNum != CL_SUCCESS ) 
-		{
-			btAssert( 0 &&  "enqueueNDRangeKernel(m_solveCollisionsAndUpdateVelocitiesKernel)");
-		}
-	}
-} // btOpenCLSoftBodySolverSIMDAware::updateVelocitiesFromPositionsWithoutVelocities
-// End kernel dispatches
-bool btOpenCLSoftBodySolverSIMDAware::buildShaders()
-	releaseKernels();
-	if( m_shadersInitialized )
-		return true;
-	const char* additionalMacros="";
-	m_currentCLFunctions->clearKernelCompilationFailures();
-	char *wavefrontMacros = new char[256];
-	sprintf(
-		wavefrontMacros, 
-		m_linkData.getMaxVerticesPerWavefront(),
-		m_linkData.getMaxBatchesPerWavefront(),
-		m_linkData.getWavefrontSize(),
-		WAVEFRONT_BLOCK_MULTIPLIER*m_linkData.getWavefrontSize());
-	m_updatePositionsFromVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel", additionalMacros,"OpenCLC10/UpdatePositionsFromVelocities.cl");
-	m_solvePositionsFromLinksKernel = m_currentCLFunctions->compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel", wavefrontMacros ,"OpenCLC10/SolvePositionsSIMDBatched.cl");
-	m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", additionalMacros ,"OpenCLC10/UpdateNodes.cl");
-	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", additionalMacros,"OpenCLC10/UpdatePositions.cl");
-	m_integrateKernel = m_currentCLFunctions->compileCLKernelFromString( IntegrateCLString, "IntegrateKernel", additionalMacros ,"OpenCLC10/Integrate.cl");
-	m_applyForcesKernel = m_currentCLFunctions->compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel", additionalMacros,"OpenCLC10/ApplyForces.cl" );
-	m_updateFixedVertexPositionsKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateFixedVertexPositionsCLString, "UpdateFixedVertexPositions" ,additionalMacros,"OpenCLC10/UpdateFixedVertexPositions.cl");
-	m_solveCollisionsAndUpdateVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel", additionalMacros ,"OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl");
-	// TODO: Rename to UpdateSoftBodies
-	m_resetNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");
-	m_normalizeNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");
-	m_updateSoftBodiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");
-	delete [] wavefrontMacros;
-	if( m_currentCLFunctions->getKernelCompilationFailures()==0)
-	{
-		m_shadersInitialized = true;
-	}
-	return m_shadersInitialized;
-static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform )
-	Vectormath::Aos::Transform3 outTransform;
-	outTransform.setCol(0, toVector3(transform.getBasis().getColumn(0)));
-	outTransform.setCol(1, toVector3(transform.getBasis().getColumn(1)));
-	outTransform.setCol(2, toVector3(transform.getBasis().getColumn(2)));
-	outTransform.setCol(3, toVector3(transform.getOrigin()));
-	return outTransform;	
-static void generateBatchesOfWavefronts( btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, btSoftBodyLinkData &linkData, int numVertices, btAlignedObjectArray < btAlignedObjectArray <int> > &wavefrontBatches )
-	// A per-batch map of truth values stating whether a given vertex is in that batch
-	// This allows us to significantly optimize the batching
-	btAlignedObjectArray <btAlignedObjectArray<bool> > mapOfVerticesInBatches;
-	for( int waveIndex = 0; waveIndex < linksForWavefronts.size(); ++waveIndex )
-	{
-		btAlignedObjectArray <int> &wavefront( linksForWavefronts[waveIndex] );
-		int batch = 0;
-		bool placed = false;
-		while( batch < wavefrontBatches.size() && !placed )
-		{
-			// Test the current batch, see if this wave shares any vertex with the waves in the batch
-			bool foundSharedVertex = false;
-			for( int link = 0; link < wavefront.size(); ++link )
-			{
-				btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
-				if( (mapOfVerticesInBatches[batch])[vertices.vertex0] || (mapOfVerticesInBatches[batch])[vertices.vertex1] )
-				{
-					foundSharedVertex = true;
-				}
-			}
-			if( !foundSharedVertex )
-			{
-				wavefrontBatches[batch].push_back( waveIndex );	
-				// Insert vertices into this batch too
-				for( int link = 0; link < wavefront.size(); ++link )
-				{
-					btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
-					(mapOfVerticesInBatches[batch])[vertices.vertex0] = true;
-					(mapOfVerticesInBatches[batch])[vertices.vertex1] = true;
-				}
-				placed = true;
-			}
-			batch++;
-		}
-		if( batch == wavefrontBatches.size() && !placed )
-		{
-			wavefrontBatches.resize( batch + 1 );
-			wavefrontBatches[batch].push_back( waveIndex );
-			// And resize map as well
-			mapOfVerticesInBatches.resize( batch + 1 );
-			// Resize maps with total number of vertices
-			mapOfVerticesInBatches[batch].resize( numVertices+1, false );
-			// Insert vertices into this batch too
-			for( int link = 0; link < wavefront.size(); ++link )
-			{
-				btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
-				(mapOfVerticesInBatches[batch])[vertices.vertex0] = true;
-				(mapOfVerticesInBatches[batch])[vertices.vertex1] = true;
-			}
-		}
-	}
-	mapOfVerticesInBatches.clear();
-// Function to remove an object from a vector maintaining correct ordering of the vector
-template< typename T > static void removeFromVector( btAlignedObjectArray< T > &vectorToUpdate, int indexToRemove )
-	int currentSize = vectorToUpdate.size();
-	for( int i = indexToRemove; i < (currentSize-1); ++i )
-	{
-		vectorToUpdate[i] = vectorToUpdate[i+1];
-	}
-	if( currentSize > 0 )
-		vectorToUpdate.resize( currentSize - 1 );
- * Insert element into vectorToUpdate at index index.
- */
-template< typename T > static void insertAtIndex( btAlignedObjectArray< T > &vectorToUpdate, int index, T element )
-	vectorToUpdate.resize( vectorToUpdate.size() + 1 );
-	for( int i = (vectorToUpdate.size() - 1); i > index; --i )
-	{
-		vectorToUpdate[i] = vectorToUpdate[i-1];
-	}
-	vectorToUpdate[index] = element;
- * Insert into btAlignedObjectArray assuming the array is ordered and maintaining both ordering and uniqueness.
- * ie it treats vectorToUpdate as an ordered set.
- */
-template< typename T > static void insertUniqueAndOrderedIntoVector( btAlignedObjectArray<T> &vectorToUpdate, T element )
-	int index = 0;
-	while( index < vectorToUpdate.size() && vectorToUpdate[index] < element )
-	{
-		index++;
-	}
-	if( index == vectorToUpdate.size() || vectorToUpdate[index] != element )
-		insertAtIndex( vectorToUpdate, index, element );
-static void generateLinksPerVertex( int numVertices, btSoftBodyLinkData &linkData, btAlignedObjectArray< int > &listOfLinksPerVertex, btAlignedObjectArray <int> &numLinksPerVertex, int &maxLinks )
-	for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
-	{
-		btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
-		numLinksPerVertex[nodes.vertex0]++;
-		numLinksPerVertex[nodes.vertex1]++;
-	}
-	int maxLinksPerVertex = 0;
-	for( int vertexIndex = 0; vertexIndex < numVertices; ++vertexIndex )
-	{
-		maxLinksPerVertex = btMax(numLinksPerVertex[vertexIndex], maxLinksPerVertex);
-	}
-	maxLinks = maxLinksPerVertex;
-	btAlignedObjectArray< int > linksFoundPerVertex;
-	linksFoundPerVertex.resize( numVertices, 0 );
-	listOfLinksPerVertex.resize( maxLinksPerVertex * numVertices );
-	for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
-	{
-		btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
-		{
-			// Do vertex 0
-			int vertexIndex = nodes.vertex0;
-			int linkForVertex = linksFoundPerVertex[nodes.vertex0];
-			int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex;
-			listOfLinksPerVertex[linkAddress] = linkIndex;
-			linksFoundPerVertex[nodes.vertex0] = linkForVertex + 1;
-		}
-		{
-			// Do vertex 1
-			int vertexIndex = nodes.vertex1;
-			int linkForVertex = linksFoundPerVertex[nodes.vertex1];
-			int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex;
-			listOfLinksPerVertex[linkAddress] = linkIndex;
-			linksFoundPerVertex[nodes.vertex1] = linkForVertex + 1;
-		}
-	}
-static void computeBatchingIntoWavefronts( 
-	btSoftBodyLinkData &linkData, 
-	int wavefrontSize, 
-	int linksPerWorkItem, 
-	int maxLinksPerWavefront, 
-	btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, 
-	btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > &batchesWithinWaves, /* wave, batch, links in batch */
-	btAlignedObjectArray< btAlignedObjectArray< int > > &verticesForWavefronts /* wavefront, vertex */
-	)
-	// Attempt generation of larger batches of links.
-	btAlignedObjectArray< bool > processedLink;
-	processedLink.resize( linkData.getNumLinks() );
-	btAlignedObjectArray< int > listOfLinksPerVertex;
-	int maxLinksPerVertex = 0;
-	// Count num vertices
-	int numVertices = 0;
-	for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
-	{
-		btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
-		numVertices = btMax( numVertices, nodes.vertex0 + 1 );
-		numVertices = btMax( numVertices, nodes.vertex1 + 1 );
-	}
-	// Need list of links per vertex
-	// Compute valence of each vertex
-	btAlignedObjectArray <int> numLinksPerVertex;
-	numLinksPerVertex.resize(0);
-	numLinksPerVertex.resize( numVertices, 0 );
-	generateLinksPerVertex( numVertices, linkData, listOfLinksPerVertex, numLinksPerVertex, maxLinksPerVertex );
-	if (!numVertices)
-		return;
-	for( int vertex = 0; vertex < 10; ++vertex )
-	{
-		for( int link = 0; link < numLinksPerVertex[vertex]; ++link )
-		{
-			int linkAddress = vertex * maxLinksPerVertex + link;
-		}
-	}
-	// At this point we know what links we have for each vertex so we can start batching
-	// We want a vertex to start with, let's go with 0
-	int currentVertex = 0;
-	int linksProcessed = 0;
-	btAlignedObjectArray <int> verticesToProcess;
-	while( linksProcessed < linkData.getNumLinks() )
-	{
-		// Next wavefront
-		int nextWavefront = linksForWavefronts.size();
-		linksForWavefronts.resize( nextWavefront + 1 );
-		btAlignedObjectArray <int> &linksForWavefront(linksForWavefronts[nextWavefront]);
-		verticesForWavefronts.resize( nextWavefront + 1 );
-		btAlignedObjectArray<int> &vertexSet( verticesForWavefronts[nextWavefront] );
-		linksForWavefront.resize(0);
-		// Loop to find enough links to fill the wavefront
-		// Stopping if we either run out of links, or fill it
-		while( linksProcessed < linkData.getNumLinks() && linksForWavefront.size() < maxLinksPerWavefront )
-		{
-			// Go through the links for the current vertex
-			for( int link = 0; link < numLinksPerVertex[currentVertex] && linksForWavefront.size() < maxLinksPerWavefront; ++link )
-			{
-				int linkAddress = currentVertex * maxLinksPerVertex + link;
-				int linkIndex = listOfLinksPerVertex[linkAddress];
-				// If we have not already processed this link, add it to the wavefront
-				// Claim it as another processed link
-				// Add the vertex at the far end to the list of vertices to process.
-				if( !processedLink[linkIndex] )
-				{
-					linksForWavefront.push_back( linkIndex );
-					linksProcessed++;
-					processedLink[linkIndex] = true;
-					int v0 = linkData.getVertexPair(linkIndex).vertex0;
-					int v1 = linkData.getVertexPair(linkIndex).vertex1;
-					if( v0 == currentVertex )
-						verticesToProcess.push_back( v1 );
-					else
-						verticesToProcess.push_back( v0 );
-				}
-			}
-			if( verticesToProcess.size() > 0 )
-			{
-				// Get the element on the front of the queue and remove it
-				currentVertex = verticesToProcess[0];
-				removeFromVector( verticesToProcess, 0 );
-			} else {		
-				// If we've not yet processed all the links, find the first unprocessed one
-				// and select one of its vertices as the current vertex
-				if( linksProcessed < linkData.getNumLinks() )
-				{
-					int searchLink = 0;
-					while( processedLink[searchLink] )
-						searchLink++;
-					currentVertex = linkData.getVertexPair(searchLink).vertex0;
-				}	
-			}
-		}
-		// We have either finished or filled a wavefront
-		for( int link = 0; link < linksForWavefront.size(); ++link )
-		{
-			int v0 = linkData.getVertexPair( linksForWavefront[link] ).vertex0;
-			int v1 = linkData.getVertexPair( linksForWavefront[link] ).vertex1;
-			insertUniqueAndOrderedIntoVector( vertexSet, v0 );
-			insertUniqueAndOrderedIntoVector( vertexSet, v1 );
-		}
-		// Iterate over links mapped to the wave and batch those
-		// We can run a batch on each cycle trivially
-		batchesWithinWaves.resize( batchesWithinWaves.size() + 1 );
-		btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWave( batchesWithinWaves[batchesWithinWaves.size()-1] );
-		for( int link = 0; link < linksForWavefront.size(); ++link )
-		{
-			int linkIndex = linksForWavefront[link];
-			btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( linkIndex );
-			int batch = 0;
-			bool placed = false;
-			while( batch < batchesWithinWave.size() && !placed )
-			{
-				bool foundSharedVertex = false;
-				if( batchesWithinWave[batch].size() >= wavefrontSize )
-				{
-					// If we have already filled this batch, move on to another
-					foundSharedVertex = true;
-				} else {
-					for( int link2 = 0; link2 < batchesWithinWave[batch].size(); ++link2 )
-					{
-						btSoftBodyLinkData::LinkNodePair vertices2 = linkData.getVertexPair( (batchesWithinWave[batch])[link2] );
-						if( vertices.vertex0 == vertices2.vertex0 ||
-							vertices.vertex1 == vertices2.vertex0 ||
-							vertices.vertex0 == vertices2.vertex1 ||
-							vertices.vertex1 == vertices2.vertex1 )
-						{
-							foundSharedVertex = true;
-							break;
-						}
-					}
-				}
-				if( !foundSharedVertex )
-				{
-					batchesWithinWave[batch].push_back( linkIndex );
-					placed = true;
-				} else {
-					++batch;
-				}
-			}
-			if( batch == batchesWithinWave.size() && !placed )
-			{
-				batchesWithinWave.resize( batch + 1 );
-				batchesWithinWave[batch].push_back( linkIndex );
-			}
-		}
-	}
-void btSoftBodyLinkDataOpenCLSIMDAware::generateBatches()
-	btAlignedObjectArray < btAlignedObjectArray <int> > linksForWavefronts;
-	btAlignedObjectArray < btAlignedObjectArray <int> > wavefrontBatches;
-	btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > batchesWithinWaves;
-	btAlignedObjectArray< btAlignedObjectArray< int > > verticesForWavefronts; // wavefronts, vertices in wavefront as an ordered set
-	// Group the links into wavefronts
-	computeBatchingIntoWavefronts( *this, m_wavefrontSize, m_linksPerWorkItem, m_maxLinksPerWavefront, linksForWavefronts, batchesWithinWaves, verticesForWavefronts );
-	// Batch the wavefronts
-	generateBatchesOfWavefronts( linksForWavefronts, *this, m_maxVertex, wavefrontBatches );
-	m_numWavefronts = linksForWavefronts.size();
-	// At this point we have a description of which links we need to process in each wavefront
-	// First correctly fill the batch ranges vector
-	int numBatches = wavefrontBatches.size();
-	m_wavefrontBatchStartLengths.resize(0);
-	int prefixSum = 0;
-	for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )
-	{
-		int wavesInBatch = wavefrontBatches[batchIndex].size();
-		int nextPrefixSum = prefixSum + wavesInBatch;
-		m_wavefrontBatchStartLengths.push_back( BatchPair( prefixSum, nextPrefixSum - prefixSum ) );
-		prefixSum += wavesInBatch;
-	}
-	// Also find max number of batches within a wave
-	m_maxBatchesWithinWave = 0;
-	m_maxVerticesWithinWave = 0;
-	m_numBatchesAndVerticesWithinWaves.resize( m_numWavefronts );
-	for( int waveIndex = 0; waveIndex < m_numWavefronts; ++waveIndex )
-	{
-		// See if the number of batches in this wave is greater than the current maxium
-		int batchesInCurrentWave = batchesWithinWaves[waveIndex].size();
-		int verticesInCurrentWave = verticesForWavefronts[waveIndex].size();
-		m_maxBatchesWithinWave = btMax( batchesInCurrentWave, m_maxBatchesWithinWave );
-		m_maxVerticesWithinWave = btMax( verticesInCurrentWave, m_maxVerticesWithinWave );
-	}
-	// Add padding values both for alignment and as dudd addresses within LDS to compute junk rather than branch around
-	m_maxVerticesWithinWave = 16*((m_maxVerticesWithinWave/16)+2);
-	// Now we know the maximum number of vertices per-wave we can resize the global vertices array
-	m_wavefrontVerticesGlobalAddresses.resize( m_maxVerticesWithinWave * m_numWavefronts );
-	// Grab backup copies of all the link data arrays for the sorting process
-	btAlignedObjectArray<btSoftBodyLinkData::LinkNodePair>				m_links_Backup(m_links);
-	btAlignedObjectArray<float>											m_linkStrength_Backup(m_linkStrength);
-	btAlignedObjectArray<float>											m_linksMassLSC_Backup(m_linksMassLSC);
-	btAlignedObjectArray<float>											m_linksRestLengthSquared_Backup(m_linksRestLengthSquared);
-	//btAlignedObjectArray<Vectormath::Aos::Vector3>						m_linksCLength_Backup(m_linksCLength);
-	//btAlignedObjectArray<float>											m_linksLengthRatio_Backup(m_linksLengthRatio);
-	btAlignedObjectArray<float>											m_linksRestLength_Backup(m_linksRestLength);
-	btAlignedObjectArray<float>											m_linksMaterialLinearStiffnessCoefficient_Backup(m_linksMaterialLinearStiffnessCoefficient);
-	// Resize to a wavefront sized batch per batch per wave so we get perfectly coherent memory accesses.
-	m_links.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linkVerticesLocalAddresses.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linkStrength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linksMassLSC.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linksRestLengthSquared.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linksRestLength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
-	m_linksMaterialLinearStiffnessCoefficient.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );	
-	// Then re-order links into wavefront blocks
-	// Total number of wavefronts moved. This will decide the ordering of sorted wavefronts.
-	int wavefrontCount = 0;
-	// Iterate over batches of wavefronts, then wavefronts in the batch
-	for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )
-	{
-		btAlignedObjectArray <int> &batch( wavefrontBatches[batchIndex] );
-		int wavefrontsInBatch = batch.size();
-		for( int wavefrontIndex = 0; wavefrontIndex < wavefrontsInBatch; ++wavefrontIndex )
-		{	
-			int originalWavefrontIndex = batch[wavefrontIndex];
-			btAlignedObjectArray< int > &wavefrontVertices( verticesForWavefronts[originalWavefrontIndex] );
-			int verticesUsedByWavefront = wavefrontVertices.size();
-			// Copy the set of vertices into the correctly structured array for use on the device
-			// Fill the non-vertices with -1s
-			// so we can mask out those reads
-			for( int vertex = 0; vertex < verticesUsedByWavefront; ++vertex )
-			{
-				m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = wavefrontVertices[vertex];
-			}
-			for( int vertex = verticesUsedByWavefront; vertex < m_maxVerticesWithinWave; ++vertex )
-			{
-				m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = -1;
-			}
-			// Obtain the set of batches within the current wavefront
-			btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWavefront( batchesWithinWaves[originalWavefrontIndex] );
-			// Set the size of the batches for use in the solver, correctly ordered
-			NumBatchesVerticesPair batchesAndVertices;
-			batchesAndVertices.numBatches = batchesWithinWavefront.size();
-			batchesAndVertices.numVertices = verticesUsedByWavefront;
-			m_numBatchesAndVerticesWithinWaves[wavefrontCount] = batchesAndVertices;
-			// Now iterate over batches within the wavefront to structure the links correctly
-			for( int wavefrontBatch = 0; wavefrontBatch < batchesWithinWavefront.size(); ++wavefrontBatch )
-			{
-				btAlignedObjectArray <int> &linksInBatch( batchesWithinWavefront[wavefrontBatch] );
-				int wavefrontBatchSize = linksInBatch.size();
-				int batchAddressInTarget = m_maxBatchesWithinWave * m_wavefrontSize * wavefrontCount + m_wavefrontSize * wavefrontBatch;
-				for( int linkIndex = 0; linkIndex < wavefrontBatchSize; ++linkIndex )
-				{
-					int originalLinkAddress = linksInBatch[linkIndex];
-					// Reorder simple arrays trivially
-					m_links[batchAddressInTarget + linkIndex] = m_links_Backup[originalLinkAddress];
-					m_linkStrength[batchAddressInTarget + linkIndex] = m_linkStrength_Backup[originalLinkAddress];
-					m_linksMassLSC[batchAddressInTarget + linkIndex] = m_linksMassLSC_Backup[originalLinkAddress];
-					m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = m_linksRestLengthSquared_Backup[originalLinkAddress];
-					m_linksRestLength[batchAddressInTarget + linkIndex] = m_linksRestLength_Backup[originalLinkAddress];
-					m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = m_linksMaterialLinearStiffnessCoefficient_Backup[originalLinkAddress];
-					// The local address is more complicated. We need to work out where a given vertex will end up
-					// by searching the set of vertices for this link and using the index as the local address
-					btSoftBodyLinkData::LinkNodePair localPair;
-					btSoftBodyLinkData::LinkNodePair globalPair = m_links[batchAddressInTarget + linkIndex];
-					localPair.vertex0 = wavefrontVertices.findLinearSearch( globalPair.vertex0 );
-					localPair.vertex1 = wavefrontVertices.findLinearSearch( globalPair.vertex1 );
-					m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;
-				}
-				for( int linkIndex = wavefrontBatchSize; linkIndex < m_wavefrontSize; ++linkIndex )
-				{
-					// Put 0s into these arrays for padding for cleanliness
-					m_links[batchAddressInTarget + linkIndex] = btSoftBodyLinkData::LinkNodePair(0, 0);
-					m_linkStrength[batchAddressInTarget + linkIndex] = 0.f;
-					m_linksMassLSC[batchAddressInTarget + linkIndex] = 0.f;
-					m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = 0.f;
-					m_linksRestLength[batchAddressInTarget + linkIndex] = 0.f;
-					m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = 0.f;
-					// For local addresses of junk data choose a set of addresses just above the range of valid ones 
-					// and cycling tyhrough % 16 so that we don't have bank conficts between all dud addresses
-					// The valid addresses will do scatter and gather in the valid range, the junk ones should happily work
-					// off the end of that range so we need no control
-					btSoftBodyLinkData::LinkNodePair localPair;
-					localPair.vertex0 = verticesUsedByWavefront + (linkIndex % 16);
-					localPair.vertex1 = verticesUsedByWavefront + (linkIndex % 16);
-					m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;
-				}
-			}
-			wavefrontCount++;
-		}
-	}
-} // void btSoftBodyLinkDataDX11SIMDAware::generateBatches()
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
deleted file mode 100644
index 8cd838ad..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
+++ /dev/null
@@ -1,81 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "stddef.h" //for size_t
-#include "vectormath/vmInclude.h"
-#include "btSoftBodySolver_OpenCL.h"
-#include "btSoftBodySolverBuffer_OpenCL.h"
-#include "btSoftBodySolverLinkData_OpenCLSIMDAware.h"
-#include "btSoftBodySolverVertexData_OpenCL.h"
-#include "btSoftBodySolverTriangleData_OpenCL.h"
-class btOpenCLSoftBodySolverSIMDAware : public btOpenCLSoftBodySolver
-	btSoftBodyLinkDataOpenCLSIMDAware m_linkData;
-	virtual bool buildShaders();
-	void updateConstants( float timeStep );
-	float computeTriangleArea( 
-		const Vectormath::Aos::Point3 &vertex0,
-		const Vectormath::Aos::Point3 &vertex1,
-		const Vectormath::Aos::Point3 &vertex2 );
-	//////////////////////////////////////
-	// Kernel dispatches
-	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
-	void solveCollisionsAndUpdateVelocities( float isolverdt );
-	// End kernel dispatches
-	/////////////////////////////////////
-	btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue,cl_context	ctx, bool bUpdateAchchoredNodePos = false);
-	virtual ~btOpenCLSoftBodySolverSIMDAware();
-	virtual SolverTypes getSolverType() const
-	{
-		return CL_SIMD_SOLVER;
-	}
-	virtual btSoftBodyLinkData &getLinkData();
-	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
-	virtual void solveConstraints( float solverdt );
-}; // btOpenCLSoftBodySolverSIMDAware
diff --git a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h b/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h
deleted file mode 100644
index ab6721fb..00000000
--- a/src/bullet/BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h
+++ /dev/null
@@ -1,748 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
-#include "vectormath/vmInclude.h"
-class btSoftBodyLinkData
-	/**
-	 * Class representing a link as a set of three indices into the vertex array.
-	 */
-	class LinkNodePair
-	{
-	public:
-		int vertex0;
-		int vertex1;
-		LinkNodePair()
-		{
-			vertex0 = 0;
-			vertex1 = 0;
-		}
-		LinkNodePair( int v0, int v1 )
-		{
-			vertex0 = v0;
-			vertex1 = v1;
-		}
-	};
-	/**
-	 * Class describing a link for input into the system.
-	 */
-	class LinkDescription
-	{
-	protected:
-		int m_vertex0;
-		int m_vertex1;
-		float m_linkLinearStiffness;
-		float m_linkStrength;
-	public:
-		LinkDescription()
-		{
-			m_vertex0 = 0;
-			m_vertex1 = 0;
-			m_linkLinearStiffness = 1.0;
-			m_linkStrength = 1.0;
-		}
-		LinkDescription( int newVertex0, int newVertex1, float linkLinearStiffness )
-		{
-			m_vertex0 = newVertex0;
-			m_vertex1 = newVertex1;
-			m_linkLinearStiffness = linkLinearStiffness;
-			m_linkStrength = 1.0;
-		}
-		LinkNodePair getVertexPair() const
-		{
-			LinkNodePair nodes;
-			nodes.vertex0 = m_vertex0;
-			nodes.vertex1 = m_vertex1;
-			return nodes;
-		}
-		void setVertex0( int vertex )
-		{
-			m_vertex0 = vertex;
-		}
-		void setVertex1( int vertex )
-		{
-			m_vertex1 = vertex;
-		}
-		void setLinkLinearStiffness( float linearStiffness )
-		{
-			m_linkLinearStiffness = linearStiffness;
-		}
-		void setLinkStrength( float strength )
-		{
-			m_linkStrength = strength;
-		}
-		int getVertex0() const
-		{
-			return m_vertex0;
-		}
-		int getVertex1() const
-		{
-			return m_vertex1;
-		}
-		float getLinkStrength() const
-		{
-			return m_linkStrength;
-		}
-		float getLinkLinearStiffness() const
-		{
-			return m_linkLinearStiffness;
-		}
-	};
-	// NOTE:
-	// Vertex reference data is stored relative to global array, not relative to individual cloth.
-	// Values must be correct if being passed into single-cloth VBOs or when migrating from one solver
-	// to another.
-	btAlignedObjectArray< LinkNodePair > m_links; // Vertex pair for the link
-	btAlignedObjectArray< float >								m_linkStrength; // Strength of each link
-	// (inverseMassA + inverseMassB)/ linear stiffness coefficient
-	btAlignedObjectArray< float >								m_linksMassLSC; 
-	btAlignedObjectArray< float >								m_linksRestLengthSquared; 
-	// Current vector length of link
-	btAlignedObjectArray< Vectormath::Aos::Vector3 >			m_linksCLength;
-	// 1/(current length * current length * massLSC)
-	btAlignedObjectArray< float >								m_linksLengthRatio; 
-	btAlignedObjectArray< float >								m_linksRestLength;
-	btAlignedObjectArray< float >								m_linksMaterialLinearStiffnessCoefficient;
-	btSoftBodyLinkData()
-	{
-	}
-	virtual ~btSoftBodyLinkData()
-	{
-	}
-	virtual void clear()
-	{
-		m_links.resize(0);
-		m_linkStrength.resize(0);
-		m_linksMassLSC.resize(0);
-		m_linksRestLengthSquared.resize(0);
-		m_linksLengthRatio.resize(0);
-		m_linksRestLength.resize(0);
-		m_linksMaterialLinearStiffnessCoefficient.resize(0);
-	}
-	int getNumLinks()
-	{
-		return m_links.size();
-	}
-	/** Allocate enough space in all link-related arrays to fit numLinks links */
-	virtual void createLinks( int numLinks )
-	{
-		int previousSize = m_links.size();
-		int newSize = previousSize + numLinks;
-		// Resize all the arrays that store link data
-		m_links.resize( newSize );
-		m_linkStrength.resize( newSize );
-		m_linksMassLSC.resize( newSize );
-		m_linksRestLengthSquared.resize( newSize );
-		m_linksCLength.resize( newSize );
-		m_linksLengthRatio.resize( newSize );
-		m_linksRestLength.resize( newSize );
-		m_linksMaterialLinearStiffnessCoefficient.resize( newSize );
-	}
-	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
-	virtual void setLinkAt( const LinkDescription &link, int linkIndex )
-	{
-		m_links[linkIndex] = link.getVertexPair();
-		m_linkStrength[linkIndex] = link.getLinkStrength();
-		m_linksMassLSC[linkIndex] = 0.f;
-		m_linksRestLengthSquared[linkIndex] = 0.f;
-		m_linksCLength[linkIndex] = Vectormath::Aos::Vector3(0.f, 0.f, 0.f);
-		m_linksLengthRatio[linkIndex] = 0.f;
-		m_linksRestLength[linkIndex] = 0.f;
-		m_linksMaterialLinearStiffnessCoefficient[linkIndex] = link.getLinkLinearStiffness();
-	}
-	/**
-	 * Return true if data is on the accelerator.
-	 * The CPU version of this class will return true here because
-	 * the CPU is the same as the accelerator.
-	 */
-	virtual bool onAccelerator()
-	{
-		return true;
-	}
-	/**
-	 * Move data from host memory to the accelerator.
-	 * The CPU version will always return that it has moved it.
-	 */
-	virtual bool moveToAccelerator()
-	{
-		return true;
-	}
-	/**
-	 * Move data from host memory from the accelerator.
-	 * The CPU version will always return that it has moved it.
-	 */
-	virtual bool moveFromAccelerator()
-	{
-		return true;
-	}
-	/**
-	 * Return reference to the vertex index pair for link linkIndex as stored on the host.
-	 */
-	LinkNodePair &getVertexPair( int linkIndex )
-	{
-		return m_links[linkIndex];
-	}
-	/** 
-	 * Return reference to strength of link linkIndex as stored on the host.
-	 */
-	float &getStrength( int linkIndex )
-	{
-		return m_linkStrength[linkIndex];
-	}
-	/**
-	 * Return a reference to the strength of the link corrected for link sorting.
-	 * This is important if we are using data on an accelerator which has the data sorted in some fashion.
-	 */
-	virtual float &getStrengthCorrected( int linkIndex )
-	{
-		return getStrength( linkIndex );
-	}
-	/**
-	 * Return reference to the rest length of link linkIndex as stored on the host.
-	 */
-	float &getRestLength( int linkIndex )
-	{
-		return m_linksRestLength[linkIndex];
-	}
-	/**
-	 * Return reference to linear stiffness coefficient for link linkIndex as stored on the host.
-	 */
-	float &getLinearStiffnessCoefficient( int linkIndex )
-	{
-		return m_linksMaterialLinearStiffnessCoefficient[linkIndex];
-	}
-	/**
-	 * Return reference to the MassLSC value for link linkIndex as stored on the host.
-	 */
-	float &getMassLSC( int linkIndex )
-	{
-		return m_linksMassLSC[linkIndex];
-	}
-	/**
-	 * Return reference to rest length squared for link linkIndex as stored on the host.
-	 */
-	float &getRestLengthSquared( int linkIndex )
-	{
-		return m_linksRestLengthSquared[linkIndex];
-	}
-	/**
-	 * Return reference to current length of link linkIndex as stored on the host.
-	 */
-	Vectormath::Aos::Vector3 &getCurrentLength( int linkIndex )
-	{
-		return m_linksCLength[linkIndex];
-	}
-	 /**
-	  * Return the link length ratio from for link linkIndex as stored on the host.
-	  */
-	 float &getLinkLengthRatio( int linkIndex )
-	 {
-		 return m_linksLengthRatio[linkIndex];
-	 }
- * Wrapper for vertex data information.
- * By wrapping it like this we stand a good chance of being able to optimise for storage format easily.
- * It should also help us make sure all the data structures remain consistent.
- */
-class btSoftBodyVertexData
-	/**
-	 * Class describing a vertex for input into the system.
-	 */
-	class VertexDescription
-	{
-	private:
-		Vectormath::Aos::Point3 m_position;
-		/** Inverse mass. If this is 0f then the mass was 0 because that simplifies calculations. */
-		float m_inverseMass;
-	public:
-		VertexDescription()
-		{	
-			m_position = Vectormath::Aos::Point3( 0.f, 0.f, 0.f );
-			m_inverseMass = 0.f;
-		}
-		VertexDescription( const Vectormath::Aos::Point3 &position, float mass )
-		{
-			m_position = position;
-			if( mass > 0.f )
-				m_inverseMass = 1.0f/mass;
-			else
-				m_inverseMass = 0.f;
-		}
-		void setPosition( const Vectormath::Aos::Point3 &position )
-		{
-			m_position = position;
-		}
-		void setInverseMass( float inverseMass )
-		{
-			m_inverseMass = inverseMass;
-		}
-		void setMass( float mass )
-		{
-			if( mass > 0.f )
-				m_inverseMass = 1.0f/mass;
-			else
-				m_inverseMass = 0.f;
-		}
-		Vectormath::Aos::Point3 getPosition() const
-		{
-			return m_position;
-		}
-		float getInverseMass() const
-		{
-			return m_inverseMass;
-		}
-		float getMass() const
-		{
-			if( m_inverseMass == 0.f )
-				return 0.f;
-			else
-				return 1.0f/m_inverseMass;
-		}
-	};
-	// identifier for the individual cloth
-	// For the CPU we don't really need this as we can grab the cloths and iterate over only their vertices
-	// For a parallel accelerator knowing on a per-vertex basis which cloth we're part of will help for obtaining
-	// per-cloth data
-	// For sorting etc it might also be helpful to be able to use in-array data such as this.
-	btAlignedObjectArray< int >							m_clothIdentifier;
-	btAlignedObjectArray< Vectormath::Aos::Point3 >		m_vertexPosition;			// vertex positions
-	btAlignedObjectArray< Vectormath::Aos::Point3 >		m_vertexPreviousPosition;	// vertex positions
-	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_vertexVelocity;			// Velocity
-	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_vertexForceAccumulator;	// Force accumulator
-	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_vertexNormal;				// Normals
-	btAlignedObjectArray< float >						m_vertexInverseMass;		// Inverse mass
-	btAlignedObjectArray< float >						m_vertexArea;				// Area controlled by the vertex
-	btAlignedObjectArray< int >							m_vertexTriangleCount;		// Number of triangles touching this vertex
-	btSoftBodyVertexData()
-	{
-	}
-	virtual ~btSoftBodyVertexData()
-	{
-	}
-	virtual void clear()
-	{
-		m_clothIdentifier.resize(0);
-		m_vertexPosition.resize(0);
-		m_vertexPreviousPosition.resize(0);
-		m_vertexVelocity.resize(0);
-		m_vertexForceAccumulator.resize(0);
-		m_vertexNormal.resize(0);
-		m_vertexInverseMass.resize(0);
-		m_vertexArea.resize(0);
-		m_vertexTriangleCount.resize(0);
-	}
-	int getNumVertices()
-	{
-		return m_vertexPosition.size();
-	}
-	int getClothIdentifier( int vertexIndex )
-	{
-		return m_clothIdentifier[vertexIndex];
-	}
-	void setVertexAt( const VertexDescription &vertex, int vertexIndex )
-	{
-		m_vertexPosition[vertexIndex] = vertex.getPosition();
-		m_vertexPreviousPosition[vertexIndex] = vertex.getPosition();
-		m_vertexVelocity[vertexIndex] = Vectormath::Aos::Vector3(0.f, 0.f, 0.f);
-		m_vertexForceAccumulator[vertexIndex] = Vectormath::Aos::Vector3(0.f, 0.f, 0.f);
-		m_vertexNormal[vertexIndex] = Vectormath::Aos::Vector3(0.f, 0.f, 0.f);
-		m_vertexInverseMass[vertexIndex] = vertex.getInverseMass();
-		m_vertexArea[vertexIndex] = 0.f;
-		m_vertexTriangleCount[vertexIndex] = 0;
-	}
-	/** 
-	 * Create numVertices new vertices for cloth clothIdentifier 
-	 * maxVertices allows a buffer zone of extra vertices for alignment or tearing reasons.
-	 */
-	void createVertices( int numVertices, int clothIdentifier, int maxVertices = 0 )
-	{
-		int previousSize = m_vertexPosition.size();
-		if( maxVertices == 0 )
-			maxVertices = numVertices;
-		int newSize = previousSize + maxVertices;
-		// Resize all the arrays that store vertex data
-		m_clothIdentifier.resize( newSize );
-		m_vertexPosition.resize( newSize );
-		m_vertexPreviousPosition.resize( newSize );
-		m_vertexVelocity.resize( newSize );
-		m_vertexForceAccumulator.resize( newSize );
-		m_vertexNormal.resize( newSize );
-		m_vertexInverseMass.resize( newSize );
-		m_vertexArea.resize( newSize );
-		m_vertexTriangleCount.resize( newSize );
-		for( int vertexIndex = previousSize; vertexIndex < newSize; ++vertexIndex )
-			m_clothIdentifier[vertexIndex] = clothIdentifier;
-		for( int vertexIndex = (previousSize + numVertices); vertexIndex < newSize; ++vertexIndex )
-			m_clothIdentifier[vertexIndex] = -1;
-	}
-	// Get and set methods in header so they can be inlined
-	/**
-	 * Return a reference to the position of vertex vertexIndex as stored on the host.
-	 */
-	Vectormath::Aos::Point3 &getPosition( int vertexIndex )
-	{
-		return m_vertexPosition[vertexIndex];
-	}
-	Vectormath::Aos::Point3 getPosition( int vertexIndex ) const
-	{
-		return m_vertexPosition[vertexIndex];
-	}
-	/**
-	 * Return a reference to the previous position of vertex vertexIndex as stored on the host.
-	 */
-	Vectormath::Aos::Point3 &getPreviousPosition( int vertexIndex )
-	{
-		return m_vertexPreviousPosition[vertexIndex];
-	}
-	/**
-	 * Return a reference to the velocity of vertex vertexIndex as stored on the host.
-	 */
-	Vectormath::Aos::Vector3 &getVelocity( int vertexIndex )
-	{
-		return m_vertexVelocity[vertexIndex];
-	}
-	/**
-	 * Return a reference to the force accumulator of vertex vertexIndex as stored on the host.
-	 */
-	Vectormath::Aos::Vector3 &getForceAccumulator( int vertexIndex )
-	{
-		return m_vertexForceAccumulator[vertexIndex];
-	}
-	/**
-	 * Return a reference to the normal of vertex vertexIndex as stored on the host.
-	 */
-	Vectormath::Aos::Vector3 &getNormal( int vertexIndex )
-	{
-		return m_vertexNormal[vertexIndex];
-	}
-	Vectormath::Aos::Vector3 getNormal( int vertexIndex ) const
-	{
-		return m_vertexNormal[vertexIndex];
-	}
-	/**
-	 * Return a reference to the inverse mass of vertex vertexIndex as stored on the host.
-	 */
-	float &getInverseMass( int vertexIndex )
-	{
-		return m_vertexInverseMass[vertexIndex];
-	}
-	/**
-	 * Get access to the area controlled by this vertex.
-	 */
-	float &getArea( int vertexIndex )
-	{
-		return m_vertexArea[vertexIndex];
-	}
-	/**
-	 * Get access to the array of how many triangles touch each vertex.
-	 */
-	int &getTriangleCount( int vertexIndex )
-	{
-		return m_vertexTriangleCount[vertexIndex];
-	}
-	/**
-	 * Return true if data is on the accelerator.
-	 * The CPU version of this class will return true here because
-	 * the CPU is the same as the accelerator.
-	 */
-	virtual bool onAccelerator()
-	{
-		return true;
-	}
-	/**
-	 * Move data from host memory to the accelerator.
-	 * The CPU version will always return that it has moved it.
-	 */
-	virtual bool moveToAccelerator()
-	{
-		return true;
-	}
-	/**
-	 * Move data to host memory from the accelerator if bCopy is false.
-	 * If bCopy is true, copy data to host memory from the accelerator so that data 
-	 * won't be moved to accelerator when moveToAccelerator() is called next time. 
-	 * If bCopyMinimum is true, only vertex position and normal are copied.
-	 * bCopyMinimum will be meaningful only if bCopy is true.
-	 * The CPU version will always return that it has moved it.
-	 */
-	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true)
-	{
-		return true;
-	}
-	btAlignedObjectArray< Vectormath::Aos::Point3 >	&getVertexPositions()
-	{
-		return m_vertexPosition;
-	}
-class btSoftBodyTriangleData
-	/**
-	 * Class representing a triangle as a set of three indices into the
-	 * vertex array.
-	 */
-	class TriangleNodeSet
-	{
-	public:
-		int vertex0;
-		int vertex1;
-		int vertex2;
-		int _padding;
-		TriangleNodeSet( )
-		{
-			vertex0 = 0;
-			vertex1 = 0;
-			vertex2 = 0;
-			_padding = -1;
-		}
-		TriangleNodeSet( int newVertex0, int newVertex1, int newVertex2 )
-		{
-			vertex0 = newVertex0;
-			vertex1 = newVertex1;
-			vertex2 = newVertex2;
-		}
-	};
-	class TriangleDescription
-	{
-	protected:
-		int m_vertex0;
-		int m_vertex1;
-		int m_vertex2;
-	public:
-		TriangleDescription()
-		{
-			m_vertex0 = 0;
-			m_vertex1 = 0;
-			m_vertex2 = 0;
-		}
-		TriangleDescription( int newVertex0, int newVertex1, int newVertex2 )
-		{
-			m_vertex0 = newVertex0;
-			m_vertex1 = newVertex1;
-			m_vertex2 = newVertex2;
-		}
-		TriangleNodeSet getVertexSet() const
-		{
-			btSoftBodyTriangleData::TriangleNodeSet nodes;
-			nodes.vertex0 = m_vertex0;
-			nodes.vertex1 = m_vertex1;
-			nodes.vertex2 = m_vertex2;
-			return nodes;
-		}
-	};
-	// NOTE:
-	// Vertex reference data is stored relative to global array, not relative to individual cloth.
-	// Values must be correct if being passed into single-cloth VBOs or when migrating from one solver
-	// to another.
-	btAlignedObjectArray< TriangleNodeSet > m_vertexIndices;
-	btAlignedObjectArray< float > m_area;
-	btAlignedObjectArray< Vectormath::Aos::Vector3 > m_normal;
-	btSoftBodyTriangleData()
-	{
-	}
-	virtual ~btSoftBodyTriangleData()
-	{
-	}
-	virtual void clear()
-	{
-		m_vertexIndices.resize(0);
-		m_area.resize(0);
-		m_normal.resize(0);
-	}
-	int getNumTriangles()
-	{
-		return m_vertexIndices.size();
-	}
-	virtual void setTriangleAt( const TriangleDescription &triangle, int triangleIndex )
-	{
-		m_vertexIndices[triangleIndex] = triangle.getVertexSet();
-	}
-	virtual void createTriangles( int numTriangles )		
-	{
-		int previousSize = m_vertexIndices.size();
-		int newSize = previousSize + numTriangles;
-		// Resize all the arrays that store triangle data
-		m_vertexIndices.resize( newSize );
-		m_area.resize( newSize );
-		m_normal.resize( newSize );
-	}
-	/**
-	 * Return the vertex index set for triangle triangleIndex as stored on the host.
-	 */
-	const TriangleNodeSet &getVertexSet( int triangleIndex )
-	{
-		return m_vertexIndices[triangleIndex];
-	}
-	/**
-	 * Get access to the triangle area.
-	 */
-	float &getTriangleArea( int triangleIndex )
-	{
-		return m_area[triangleIndex];
-	}
-	/**
-	 * Get access to the normal vector for this triangle.
-	 */
-	Vectormath::Aos::Vector3 &getNormal( int triangleIndex )
-	{
-		return m_normal[triangleIndex];
-	}
-	/**
-	 * Return true if data is on the accelerator.
-	 * The CPU version of this class will return true here because
-	 * the CPU is the same as the accelerator.
-	 */
-	virtual bool onAccelerator()
-	{
-		return true;
-	}
-	/**
-	 * Move data from host memory to the accelerator.
-	 * The CPU version will always return that it has moved it.
-	 */
-	virtual bool moveToAccelerator()
-	{
-		return true;
-	}
-	/**
-	 * Move data from host memory from the accelerator.
-	 * The CPU version will always return that it has moved it.
-	 */
-	virtual bool moveFromAccelerator()
-	{
-		return true;
-	}
-#endif // #ifndef BT_SOFT_BODY_SOLVER_DATA_H
diff --git a/src/bullet/BulletMultiThreaded/HeapManager.h b/src/bullet/BulletMultiThreaded/HeapManager.h
deleted file mode 100644
index b2da4ef5..00000000
--- a/src/bullet/BulletMultiThreaded/HeapManager.h
+++ /dev/null
@@ -1,117 +0,0 @@
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#ifndef BT_HEAP_MANAGER_H__
-#define BT_HEAP_MANAGER_H__
-#ifdef __SPU__
-	#define HEAP_STACK_SIZE 32
-	#define HEAP_STACK_SIZE 64
-#define MIN_ALLOC_SIZE 16
-class HeapManager
-	ATTRIBUTE_ALIGNED16(unsigned char *mHeap);
-	ATTRIBUTE_ALIGNED16(unsigned int mHeapBytes);
-	ATTRIBUTE_ALIGNED16(unsigned char *mPoolStack[HEAP_STACK_SIZE]);
-	ATTRIBUTE_ALIGNED16(unsigned int mCurStack);
-	enum {ALIGN16,ALIGN128};
-	HeapManager(unsigned char *buf,int bytes)
-	{
-		mHeap = buf;
-		mHeapBytes = bytes;
-		clear();
-	}
-	~HeapManager()
-	{
-	}
-	int getAllocated()
-	{
-		return (int)(mPoolStack[mCurStack]-mHeap);
-	}
-	int getRest()
-	{
-		return mHeapBytes-getAllocated();
-	}
-	void *allocate(size_t bytes,int alignment = ALIGN16)
-	{
-		if(bytes <= 0) bytes = MIN_ALLOC_SIZE;
-		btAssert(mCurStack < (HEAP_STACK_SIZE-1));
-#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__)
-		unsigned long long p = (unsigned long long )mPoolStack[mCurStack];
-		if(alignment == ALIGN128) {
-			p = ((p+127) & 0xffffffffffffff80);
-			bytes = (bytes+127) & 0xffffffffffffff80;
-		}
-		else {
-			bytes = (bytes+15) & 0xfffffffffffffff0;
-		}
-		btAssert(bytes <=(mHeapBytes-(p-(unsigned long long )mHeap)) );
-		unsigned long p = (unsigned long )mPoolStack[mCurStack];
-		if(alignment == ALIGN128) {
-			p = ((p+127) & 0xffffff80);
-			bytes = (bytes+127) & 0xffffff80;
-		}
-		else {
-			bytes = (bytes+15) & 0xfffffff0;
-		}
-		btAssert(bytes <=(mHeapBytes-(p-(unsigned long)mHeap)) );
-		unsigned char * bla = (unsigned char *)(p + bytes);
-		mPoolStack[++mCurStack] = bla;
-		return (void*)p;
-	}
-	void deallocate(void *p)
-	{
-		(void) p;
-		mCurStack--;
-	}
-	void clear()
-	{
-		mPoolStack[0] = mHeap;
-		mCurStack = 0;
-	}
-//	void printStack()
-//	{
-//		for(unsigned int i=0;i<=mCurStack;i++) {
-//			PRINTF("memStack %2d 0x%x\n",i,(uint32_t)mPoolStack[i]);
-//		}
-//	}
-#endif //BT_HEAP_MANAGER_H__
diff --git a/src/bullet/BulletMultiThreaded/PlatformDefinitions.h b/src/bullet/BulletMultiThreaded/PlatformDefinitions.h
deleted file mode 100644
index 142103a0..00000000
--- a/src/bullet/BulletMultiThreaded/PlatformDefinitions.h
+++ /dev/null
@@ -1,99 +0,0 @@
-///This file provides some platform/compiler checks for common definitions
-#include "LinearMath/btScalar.h"
-#include "LinearMath/btMinMax.h"
-#include "physics_effects/base_level/base/pfx_vectormath_include.win32.h"
-typedef Vectormath::Aos::Vector3    vmVector3;
-typedef Vectormath::Aos::Quat       vmQuat;
-typedef Vectormath::Aos::Matrix3    vmMatrix3;
-typedef Vectormath::Aos::Transform3 vmTransform3;
-typedef Vectormath::Aos::Point3     vmPoint3;
-#include "vectormath/vmInclude.h"
-#ifdef _WIN32
-typedef union
-  unsigned int u;
-  void *p;
-} addr64;
-#define USE_WIN32_THREADING 1
-		#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300)
-		#else
-		#endif //__MINGW32__
-		typedef unsigned char     uint8_t;
-#ifndef __PHYSICS_COMMON_H__
-#ifndef __BT_SKIP_UINT64_H
-		typedef unsigned long int uint64_t;
-#endif //__BT_SKIP_UINT64_H
-		typedef unsigned int      uint32_t;
-#endif //__PHYSICS_COMMON_H__
-		typedef unsigned short    uint16_t;
-		#include <malloc.h>
-		#define memalign(alignment, size) malloc(size);
-#include <string.h> //memcpy
-		#include <stdio.h>		
-		#define spu_printf printf
-		#include <stdint.h>
-		#include <stdlib.h>
-		#include <string.h> //for memcpy
-#if defined	(__CELLOS_LV2__)
-	// Playstation 3 Cell SDK
-#include <spu_printf.h>
-	// posix system
-#define USE_PTHREADS    (1)
-#ifdef USE_LIBSPE2
-#include <stdio.h>		
-#define spu_printf printf	
-#define DWORD unsigned int
-			typedef union
-			{
-			  unsigned long long ull;
-			  unsigned int ui[2];
-			  void *p;
-			} addr64;
-#endif // USE_LIBSPE2
-#endif	//__CELLOS_LV2__
-#ifdef __SPU__
-#include <stdio.h>		
-#define printf spu_printf
-/* Included here because we need uint*_t typedefs */
-#include "PpuAddressSpace.h"
diff --git a/src/bullet/BulletMultiThreaded/PosixThreadSupport.cpp b/src/bullet/BulletMultiThreaded/PosixThreadSupport.cpp
deleted file mode 100644
index c8b49ee3..00000000
--- a/src/bullet/BulletMultiThreaded/PosixThreadSupport.cpp
+++ /dev/null
@@ -1,399 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <stdio.h>
-#include "PosixThreadSupport.h"
-#include <errno.h>
-#include <unistd.h>
-#include "SpuCollisionTaskProcess.h"
-#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
-#define checkPThreadFunction(returnValue) \
-    if(0 != returnValue) { \
-        printf("PThread problem at line %i in file %s: %i %d\n", __LINE__, __FILE__, returnValue, errno); \
-    }
-// The number of threads should be equal to the number of available cores
-// Todo: each worker should be linked to a single core, using SetThreadIdealProcessor.
-// PosixThreadSupport helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
-// Setup and initialize SPU/CELL/Libspe2
-PosixThreadSupport::PosixThreadSupport(ThreadConstructionInfo& threadConstructionInfo)
-	startThreads(threadConstructionInfo);
-// cleanup/shutdown Libspe2
-	stopSPU();
-#if (defined (__APPLE__))
-// this semaphore will signal, if and how many threads are finished with their work
-static sem_t* mainSemaphore=0;
-static sem_t* createSem(const char* baseName)
-	static int semCount = 0;
-        /// Named semaphore begin
-        char name[32];
-        snprintf(name, 32, "/%s-%d-%4.4d", baseName, getpid(), semCount++); 
-        sem_t* tempSem = sem_open(name, O_CREAT, 0600, 0);
-        if (tempSem != reinterpret_cast<sem_t *>(SEM_FAILED))
-        {
-//        printf("Created \"%s\" Semaphore %p\n", name, tempSem);
-        }
-        else
-	{
-		//printf("Error creating Semaphore %d\n", errno);
-		exit(-1);
-	}
-        /// Named semaphore end
-	sem_t* tempSem = new sem_t;
-	checkPThreadFunction(sem_init(tempSem, 0, 0));
-	return tempSem;
-static void destroySem(sem_t* semaphore)
-	checkPThreadFunction(sem_close(semaphore));
-	checkPThreadFunction(sem_destroy(semaphore));
-	delete semaphore;
-static void *threadFunction(void *argument) 
-	PosixThreadSupport::btSpuStatus* status = (PosixThreadSupport::btSpuStatus*)argument;
-	while (1)
-	{
-            checkPThreadFunction(sem_wait(status->startSemaphore));
-		void* userPtr = status->m_userPtr;
-		if (userPtr)
-		{
-			btAssert(status->m_status);
-			status->m_userThreadFunc(userPtr,status->m_lsMemory);
-			status->m_status = 2;
-			checkPThreadFunction(sem_post(mainSemaphore));
-	                status->threadUsed++;
-		} else {
-			//exit Thread
-			status->m_status = 3;
-			checkPThreadFunction(sem_post(mainSemaphore));
-			printf("Thread with taskId %i exiting\n",status->m_taskId);
-			break;
-		}
-	}
-	printf("Thread TERMINATED\n");
-	return 0;
-///send messages to SPUs
-void PosixThreadSupport::sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t taskId)
-	///	gMidphaseSPU.sendRequest(CMD_GATHER_AND_PROCESS_PAIRLIST, (uint32_t) &taskDesc);
-	///we should spawn an SPU task here, and in 'waitForResponse' it should wait for response of the (one of) the first tasks that finished
-	switch (uiCommand)
-	{
-		{
-			btSpuStatus&	spuStatus = m_activeSpuStatus[taskId];
-			btAssert(taskId >= 0);
-			btAssert(taskId < m_activeSpuStatus.size());
-			spuStatus.m_commandId = uiCommand;
-			spuStatus.m_status = 1;
-			spuStatus.m_userPtr = (void*)uiArgument0;
-			// fire event to start new task
-			checkPThreadFunction(sem_post(spuStatus.startSemaphore));
-			break;
-		}
-	default:
-		{
-			///not implemented
-			btAssert(0);
-		}
-	};
-///check for messages from SPUs
-void PosixThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
-	///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response
-	///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback'
-	btAssert(m_activeSpuStatus.size());
-        // wait for any of the threads to finish
-	checkPThreadFunction(sem_wait(mainSemaphore));
-	// get at least one thread which has finished
-        size_t last = -1;
-        for(size_t t=0; t < size_t(m_activeSpuStatus.size()); ++t) {
-            if(2 == m_activeSpuStatus[t].m_status) {
-                last = t;
-                break;
-            }
-        }
-	btSpuStatus& spuStatus = m_activeSpuStatus[last];
-	btAssert(spuStatus.m_status > 1);
-	spuStatus.m_status = 0;
-	// need to find an active spu
-	btAssert(last >= 0);
-	*puiArgument0 = spuStatus.m_taskId;
-	*puiArgument1 = spuStatus.m_status;
-void PosixThreadSupport::startThreads(ThreadConstructionInfo& threadConstructionInfo)
-        printf("%s creating %i threads.\n", __FUNCTION__, threadConstructionInfo.m_numThreads);
-	m_activeSpuStatus.resize(threadConstructionInfo.m_numThreads);
-	mainSemaphore = createSem("main");                
-	//checkPThreadFunction(sem_wait(mainSemaphore));
-	for (int i=0;i < threadConstructionInfo.m_numThreads;i++)
-	{
-		printf("starting thread %d\n",i);
-		btSpuStatus&	spuStatus = m_activeSpuStatus[i];
-		spuStatus.startSemaphore = createSem("threadLocal");                
-                checkPThreadFunction(pthread_create(&spuStatus.thread, NULL, &threadFunction, (void*)&spuStatus));
-		spuStatus.m_userPtr=0;
-		spuStatus.m_taskId = i;
-		spuStatus.m_commandId = 0;
-		spuStatus.m_status = 0;
-		spuStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
-		spuStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
-        spuStatus.threadUsed = 0;
-		printf("started thread %d \n",i);
-	}
-void PosixThreadSupport::startSPU()
-///tell the task scheduler we are done with the SPU tasks
-void PosixThreadSupport::stopSPU()
-	for(size_t t=0; t < size_t(m_activeSpuStatus.size()); ++t) 
-	{
-            btSpuStatus&	spuStatus = m_activeSpuStatus[t];
-            printf("%s: Thread %i used: %ld\n", __FUNCTION__, int(t), spuStatus.threadUsed);
-	spuStatus.m_userPtr = 0;       
- 	checkPThreadFunction(sem_post(spuStatus.startSemaphore));
-	checkPThreadFunction(sem_wait(mainSemaphore));
-	printf("destroy semaphore\n"); 
-            destroySem(spuStatus.startSemaphore);
-            printf("semaphore destroyed\n");
-		checkPThreadFunction(pthread_join(spuStatus.thread,0));
-        }
-	printf("destroy main semaphore\n");
-        destroySem(mainSemaphore);
-	printf("main semaphore destroyed\n");
-	m_activeSpuStatus.clear();
-class PosixCriticalSection : public btCriticalSection 
-	pthread_mutex_t m_mutex;
-	PosixCriticalSection() 
-	{
-		pthread_mutex_init(&m_mutex, NULL);
-	}
-	virtual ~PosixCriticalSection() 
-	{
-		pthread_mutex_destroy(&m_mutex);
-	}
-	ATTRIBUTE_ALIGNED16(unsigned int mCommonBuff[32]);
-	virtual unsigned int getSharedParam(int i)
-	{
-		return mCommonBuff[i];
-	}
-	virtual void setSharedParam(int i,unsigned int p)
-	{
-		mCommonBuff[i] = p;
-	}
-	virtual void lock()
-	{
-		pthread_mutex_lock(&m_mutex);
-	}
-	virtual void unlock()
-	{
-		pthread_mutex_unlock(&m_mutex);
-	}
-#if defined(_POSIX_BARRIERS) && (_POSIX_BARRIERS - 20012L) >= 0
-/* OK to use barriers on this platform */
-class PosixBarrier : public btBarrier 
-	pthread_barrier_t m_barr;
-	int m_numThreads;
-	PosixBarrier()
-	:m_numThreads(0)	{	}
-	virtual ~PosixBarrier()	{
-		pthread_barrier_destroy(&m_barr);
-	}
-	virtual void sync()
-	{
-		int rc = pthread_barrier_wait(&m_barr);
-		if(rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD)
-		{
-			printf("Could not wait on barrier\n");
-			exit(-1);
-		}
-	}
-	virtual void setMaxCount(int numThreads)
-	{
-		int result = pthread_barrier_init(&m_barr, NULL, numThreads);
-		m_numThreads = numThreads;
-		btAssert(result==0);
-	}
-	virtual int  getMaxCount()
-	{
-		return m_numThreads;
-	}
-/* Not OK to use barriers on this platform - insert alternate code here */
-class PosixBarrier : public btBarrier 
-	pthread_mutex_t m_mutex;
-	pthread_cond_t m_cond;
-	int m_numThreads;
-	int	m_called;
-	PosixBarrier()
-	:m_numThreads(0)
-	{
-	}
-	virtual ~PosixBarrier() 
-	{
-		if (m_numThreads>0)
-		{
-			pthread_mutex_destroy(&m_mutex);
-			pthread_cond_destroy(&m_cond);
-		}
-	}
-	virtual void sync()
-	{		
-		pthread_mutex_lock(&m_mutex);
-		m_called++;
-		if (m_called == m_numThreads) {
-			m_called = 0;
-			pthread_cond_broadcast(&m_cond);
-		} else {
-			pthread_cond_wait(&m_cond,&m_mutex);
-		}
-		pthread_mutex_unlock(&m_mutex);
-	}
-	virtual void setMaxCount(int numThreads)
-	{
-		if (m_numThreads>0)
-		{
-			pthread_mutex_destroy(&m_mutex);
-			pthread_cond_destroy(&m_cond);
-		}
-		m_called = 0;
-		pthread_mutex_init(&m_mutex,NULL);
-		pthread_cond_init(&m_cond,NULL);
-		m_numThreads = numThreads;
-	}
-	virtual int  getMaxCount()
-	{
-		return m_numThreads;
-	}
-btBarrier* PosixThreadSupport::createBarrier()
-	PosixBarrier* barrier = new PosixBarrier();
-	barrier->setMaxCount(getNumTasks());
-	return barrier;
-btCriticalSection* PosixThreadSupport::createCriticalSection()
-	return new PosixCriticalSection();
-#endif // USE_PTHREADS
diff --git a/src/bullet/BulletMultiThreaded/PosixThreadSupport.h b/src/bullet/BulletMultiThreaded/PosixThreadSupport.h
deleted file mode 100644
index ca47e450..00000000
--- a/src/bullet/BulletMultiThreaded/PosixThreadSupport.h
+++ /dev/null
@@ -1,142 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "LinearMath/btScalar.h"
-#include "PlatformDefinitions.h"
-#ifdef USE_PTHREADS //platform specifc defines are defined in PlatformDefinitions.h
-#ifndef _XOPEN_SOURCE
-#define _XOPEN_SOURCE 600 //for definition of pthread_barrier_t, see http://pages.cs.wisc.edu/~travitch/pthreads_primer.html
-#endif //_XOPEN_SOURCE
-#include <pthread.h>
-#include <semaphore.h>
-#include "LinearMath/btAlignedObjectArray.h"
-#include "btThreadSupportInterface.h"
-typedef void (*PosixThreadFunc)(void* userPtr,void* lsMemory);
-typedef void* (*PosixlsMemorySetupFunc)();
-// PosixThreadSupport helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
-class PosixThreadSupport : public btThreadSupportInterface 
-    typedef enum sStatus {
-        STATUS_BUSY,
-        STATUS_READY,
-    } Status;
-	// placeholder, until libspe2 support is there
-	struct	btSpuStatus
-	{
-		uint32_t	m_taskId;
-		uint32_t	m_commandId;
-		uint32_t	m_status;
-		PosixThreadFunc	m_userThreadFunc;
-		void*	m_userPtr; //for taskDesc etc
-		void*	m_lsMemory; //initialized using PosixLocalStoreMemorySetupFunc
-                pthread_t thread;
-                sem_t* startSemaphore;
-        unsigned long threadUsed;
-	};
-	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
-	///Setup and initialize SPU/CELL/Libspe2
-	struct	ThreadConstructionInfo
-	{
-		ThreadConstructionInfo(const char* uniqueName,
-									PosixThreadFunc userThreadFunc,
-									PosixlsMemorySetupFunc	lsMemoryFunc,
-									int numThreads=1,
-									int threadStackSize=65535
-									)
-									:m_uniqueName(uniqueName),
-									m_userThreadFunc(userThreadFunc),
-									m_lsMemoryFunc(lsMemoryFunc),
-									m_numThreads(numThreads),
-									m_threadStackSize(threadStackSize)
-		{
-		}
-		const char*					m_uniqueName;
-		PosixThreadFunc			m_userThreadFunc;
-		PosixlsMemorySetupFunc	m_lsMemoryFunc;
-		int						m_numThreads;
-		int						m_threadStackSize;
-	};
-	PosixThreadSupport(ThreadConstructionInfo& threadConstructionInfo);
-///cleanup/shutdown Libspe2
-	virtual	~PosixThreadSupport();
-	void	startThreads(ThreadConstructionInfo&	threadInfo);
-///send messages to SPUs
-	virtual	void sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t uiArgument1);
-///check for messages from SPUs
-	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
-///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
-	virtual	void startSPU();
-///tell the task scheduler we are done with the SPU tasks
-	virtual	void stopSPU();
-	virtual void setNumTasks(int numTasks) {}
-	virtual int getNumTasks() const
-	{
-		return m_activeSpuStatus.size();
-	}
-	virtual btBarrier* createBarrier();
-	virtual btCriticalSection* createCriticalSection();
-	virtual void*	getThreadLocalMemory(int taskId)
-	{
-		return m_activeSpuStatus[taskId].m_lsMemory;
-	}
-#endif // USE_PTHREADS
diff --git a/src/bullet/BulletMultiThreaded/SequentialThreadSupport.cpp b/src/bullet/BulletMultiThreaded/SequentialThreadSupport.cpp
deleted file mode 100644
index 8cc72418..00000000
--- a/src/bullet/BulletMultiThreaded/SequentialThreadSupport.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "SequentialThreadSupport.h"
-#include "SpuCollisionTaskProcess.h"
-#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
-SequentialThreadSupport::SequentialThreadSupport(SequentialThreadConstructionInfo& threadConstructionInfo)
-	startThreads(threadConstructionInfo);
-///cleanup/shutdown Libspe2
-	stopSPU();
-#include <stdio.h>
-///send messages to SPUs
-void SequentialThreadSupport::sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t taskId)
-	switch (uiCommand)
-	{
-		{
-			btSpuStatus&	spuStatus = m_activeSpuStatus[0];
-			spuStatus.m_userPtr=(void*)uiArgument0;
-			spuStatus.m_userThreadFunc(spuStatus.m_userPtr,spuStatus.m_lsMemory);
-		}
-	break;
-	default:
-		{
-			///not implemented
-			btAssert(0 && "Not implemented");
-		}
-	};
-///check for messages from SPUs
-void SequentialThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
-	btAssert(m_activeSpuStatus.size());
-	btSpuStatus& spuStatus = m_activeSpuStatus[0];
-	*puiArgument0 = spuStatus.m_taskId;
-	*puiArgument1 = spuStatus.m_status;
-void SequentialThreadSupport::startThreads(SequentialThreadConstructionInfo& threadConstructionInfo)
-	m_activeSpuStatus.resize(1);
-	printf("STS: Not starting any threads\n");
-	btSpuStatus& spuStatus = m_activeSpuStatus[0];
-	spuStatus.m_userPtr = 0;
-	spuStatus.m_taskId = 0;
-	spuStatus.m_commandId = 0;
-	spuStatus.m_status = 0;
-	spuStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
-	spuStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
-	printf("STS: Created local store at %p for task %s\n", spuStatus.m_lsMemory, threadConstructionInfo.m_uniqueName);
-void SequentialThreadSupport::startSPU()
-void SequentialThreadSupport::stopSPU()
-	m_activeSpuStatus.clear();
-void SequentialThreadSupport::setNumTasks(int numTasks)
-	printf("SequentialThreadSupport::setNumTasks(%d) is not implemented and has no effect\n",numTasks);
-class btDummyBarrier : public btBarrier
-	btDummyBarrier()
-	{
-	}
-	virtual ~btDummyBarrier()
-	{
-	}
-	void sync()
-	{
-	}
-	virtual void setMaxCount(int n) {}
-	virtual int  getMaxCount() {return 1;}
-class btDummyCriticalSection : public btCriticalSection
-	btDummyCriticalSection()
-	{
-	}
-	virtual ~btDummyCriticalSection()
-	{
-	}
-	unsigned int getSharedParam(int i)
-	{
-		btAssert(i>=0&&i<31);
-		return mCommonBuff[i+1];
-	}
-	void setSharedParam(int i,unsigned int p)
-	{
-		btAssert(i>=0&&i<31);
-		mCommonBuff[i+1] = p;
-	}
-	void lock()
-	{
-		mCommonBuff[0] = 1;
-	}
-	void unlock()
-	{
-		mCommonBuff[0] = 0;
-	}
-btBarrier*	SequentialThreadSupport::createBarrier()
-	return new btDummyBarrier();
-btCriticalSection* SequentialThreadSupport::createCriticalSection()
-	return new btDummyCriticalSection();
diff --git a/src/bullet/BulletMultiThreaded/SequentialThreadSupport.h b/src/bullet/BulletMultiThreaded/SequentialThreadSupport.h
deleted file mode 100644
index 2b9ade82..00000000
--- a/src/bullet/BulletMultiThreaded/SequentialThreadSupport.h
+++ /dev/null
@@ -1,96 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "LinearMath/btScalar.h"
-#include "PlatformDefinitions.h"
-#include "LinearMath/btAlignedObjectArray.h"
-#include "btThreadSupportInterface.h"
-typedef void (*SequentialThreadFunc)(void* userPtr,void* lsMemory);
-typedef void* (*SequentiallsMemorySetupFunc)();
-///The SequentialThreadSupport is a portable non-parallel implementation of the btThreadSupportInterface
-///This is useful for debugging and porting SPU Tasks to other platforms.
-class SequentialThreadSupport : public btThreadSupportInterface 
-	struct	btSpuStatus
-	{
-		uint32_t	m_taskId;
-		uint32_t	m_commandId;
-		uint32_t	m_status;
-		SequentialThreadFunc	m_userThreadFunc;
-		void*	m_userPtr; //for taskDesc etc
-		void*	m_lsMemory; //initialized using SequentiallsMemorySetupFunc
-	};
-	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
-	btAlignedObjectArray<void*>			m_completeHandles;	
-	struct	SequentialThreadConstructionInfo
-	{
-		SequentialThreadConstructionInfo (const char* uniqueName,
-									SequentialThreadFunc userThreadFunc,
-									SequentiallsMemorySetupFunc	lsMemoryFunc
-									)
-									:m_uniqueName(uniqueName),
-									m_userThreadFunc(userThreadFunc),
-									m_lsMemoryFunc(lsMemoryFunc)
-		{
-		}
-		const char*						m_uniqueName;
-		SequentialThreadFunc		m_userThreadFunc;
-		SequentiallsMemorySetupFunc	m_lsMemoryFunc;
-	};
-	SequentialThreadSupport(SequentialThreadConstructionInfo& threadConstructionInfo);
-	virtual	~SequentialThreadSupport();
-	void	startThreads(SequentialThreadConstructionInfo&	threadInfo);
-///send messages to SPUs
-	virtual	void sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t uiArgument1);
-///check for messages from SPUs
-	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
-///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
-	virtual	void startSPU();
-///tell the task scheduler we are done with the SPU tasks
-	virtual	void stopSPU();
-	virtual void setNumTasks(int numTasks);
-	virtual int getNumTasks() const
-	{
-		return 1;
-	}
-	virtual btBarrier*	createBarrier();
-	virtual btCriticalSection* createCriticalSection();
diff --git a/src/bullet/BulletMultiThreaded/SpuCollisionTaskProcess.cpp b/src/bullet/BulletMultiThreaded/SpuCollisionTaskProcess.cpp
deleted file mode 100644
index f606d136..00000000
--- a/src/bullet/BulletMultiThreaded/SpuCollisionTaskProcess.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-//class OptimizedBvhNode;
-#include "SpuCollisionTaskProcess.h"
-void	SpuCollisionTaskProcess::setNumTasks(int maxNumTasks)
-	if (int(m_maxNumOutstandingTasks) != maxNumTasks)
-	{
-		m_maxNumOutstandingTasks = maxNumTasks;
-		m_taskBusy.resize(m_maxNumOutstandingTasks);
-		m_spuGatherTaskDesc.resize(m_maxNumOutstandingTasks);
-		for (int i = 0; i < m_taskBusy.size(); i++)
-		{
-			m_taskBusy[i] = false;
-		}
-		///re-allocate task memory buffers
-		if (m_workUnitTaskBuffers != 0)
-		{
-			btAlignedFree(m_workUnitTaskBuffers);
-		}
-		m_workUnitTaskBuffers = (unsigned char *)btAlignedAlloc(MIDPHASE_WORKUNIT_TASK_SIZE*m_maxNumOutstandingTasks, 128);
-	}
-SpuCollisionTaskProcess::SpuCollisionTaskProcess(class	btThreadSupportInterface*	threadInterface, unsigned int	maxNumOutstandingTasks)
-	m_workUnitTaskBuffers = (unsigned char *)0;
-	setNumTasks(maxNumOutstandingTasks);
-	m_numBusyTasks = 0;
-	m_currentTask = 0;
-	m_currentPage = 0;
-	m_currentPageEntry = 0;
-#ifdef DEBUG_SpuCollisionTaskProcess
-	m_initialized = false;
-	m_threadInterface->startSPU();
-	//printf("sizeof vec_float4: %d\n", sizeof(vec_float4));
-	printf("sizeof SpuGatherAndProcessWorkUnitInput: %d\n", int(sizeof(SpuGatherAndProcessWorkUnitInput)));
-	if (m_workUnitTaskBuffers != 0)
-	{
-		btAlignedFree(m_workUnitTaskBuffers);
-		m_workUnitTaskBuffers = 0;
-	}
-	m_threadInterface->stopSPU();
-void SpuCollisionTaskProcess::initialize2(bool useEpa)
-	printf("SpuCollisionTaskProcess::initialize()\n");
-	for (int i = 0; i < int (m_maxNumOutstandingTasks); i++)
-	{
-		m_taskBusy[i] = false;
-	}
-	m_numBusyTasks = 0;
-	m_currentTask = 0;
-	m_currentPage = 0;
-	m_currentPageEntry = 0;
-	m_useEpa = useEpa;
-#ifdef DEBUG_SpuCollisionTaskProcess
-	m_initialized = true;
-void SpuCollisionTaskProcess::issueTask2()
-	printf("SpuCollisionTaskProcess::issueTask (m_currentTask= %d\n)", m_currentTask);
-	m_taskBusy[m_currentTask] = true;
-	m_numBusyTasks++;
-	SpuGatherAndProcessPairsTaskDesc& taskDesc = m_spuGatherTaskDesc[m_currentTask];
-	taskDesc.m_useEpa = m_useEpa;
-	{
-		// send task description in event message
-		// no error checking here...
-		// but, currently, event queue can be no larger than NUM_WORKUNIT_TASKS.
-		taskDesc.m_inPairPtr = reinterpret_cast<uint64_t>(MIDPHASE_TASK_PTR(m_currentTask));
-		taskDesc.taskId = m_currentTask;
-		taskDesc.numPages = m_currentPage+1;
-		taskDesc.numOnLastPage = m_currentPageEntry;
-	}
-	m_threadInterface->sendRequest(CMD_GATHER_AND_PROCESS_PAIRLIST, (ppu_address_t) &taskDesc,m_currentTask);
-	// if all tasks busy, wait for spu event to clear the task.
-	if (m_numBusyTasks >= m_maxNumOutstandingTasks)
-	{
-		unsigned int taskId;
-		unsigned int outputSize;
-		for (int i=0;i<int (m_maxNumOutstandingTasks);i++)
-		  {
-			  if (m_taskBusy[i])
-			  {
-				  taskId = i;
-				  break;
-			  }
-		  }
-	  btAssert(taskId>=0);
-		m_threadInterface->waitForResponse(&taskId, &outputSize);
-//		printf("issueTask taskId %d completed, numBusy=%d\n",taskId,m_numBusyTasks);
-		//printf("PPU: after issue, received event: %u %d\n", taskId, outputSize);
-		//postProcess(taskId, outputSize);
-		m_taskBusy[taskId] = false;
-		m_numBusyTasks--;
-	}
-void SpuCollisionTaskProcess::addWorkToTask(void* pairArrayPtr,int startIndex,int endIndex)
-	printf("#");
-#ifdef DEBUG_SpuCollisionTaskProcess
-	btAssert(m_initialized);
-	btAssert(m_workUnitTaskBuffers);
-	bool batch = true;
-	if (batch)
-	{
-		if (m_currentPageEntry == MIDPHASE_NUM_WORKUNITS_PER_PAGE)
-		{
-			if (m_currentPage == MIDPHASE_NUM_WORKUNIT_PAGES-1)
-			{
-				// task buffer is full, issue current task.
-				// if all task buffers busy, this waits until SPU is done.
-				issueTask2();
-				// find new task buffer
-				for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
-				{
-					if (!m_taskBusy[i])
-					{
-						m_currentTask = i;
-						//init the task data
-						break;
-					}
-				}
-				m_currentPage = 0;
-			}
-			else
-			{
-				m_currentPage++;
-			}
-			m_currentPageEntry = 0;
-		}
-	}
-	{
-		SpuGatherAndProcessWorkUnitInput &wuInput = 
-			*(reinterpret_cast<SpuGatherAndProcessWorkUnitInput*>
-			(MIDPHASE_ENTRY_PTR(m_currentTask, m_currentPage, m_currentPageEntry)));
-		wuInput.m_pairArrayPtr = reinterpret_cast<uint64_t>(pairArrayPtr);
-		wuInput.m_startIndex = startIndex;
-		wuInput.m_endIndex = endIndex;
-		m_currentPageEntry++;
-		if (!batch)
-		{
-			issueTask2();
-			// find new task buffer
-			for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
-			{
-				if (!m_taskBusy[i])
-				{
-					m_currentTask = i;
-					//init the task data
-					break;
-				}
-			}
-			m_currentPage = 0;
-			m_currentPageEntry =0;
-		}
-	}
-	printf("\nSpuCollisionTaskProcess::flush()\n");
-	// if there's a partially filled task buffer, submit that task
-	if (m_currentPage > 0 || m_currentPageEntry > 0)
-	{
-		issueTask2();
-	}
-	// all tasks are issued, wait for all tasks to be complete
-	while(m_numBusyTasks > 0)
-	{
-	  // Consolidating SPU code
-	  unsigned int taskId=-1;
-	  unsigned int outputSize;
-	  for (int i=0;i<int (m_maxNumOutstandingTasks);i++)
-	  {
-		  if (m_taskBusy[i])
-		  {
-			  taskId = i;
-			  break;
-		  }
-	  }
-	  btAssert(taskId>=0);
-	  {
-		// SPURS support.
-		  m_threadInterface->waitForResponse(&taskId, &outputSize);
-	  }
-//		 printf("flush2 taskId %d completed, numBusy =%d \n",taskId,m_numBusyTasks);
-		//printf("PPU: flushing, received event: %u %d\n", taskId, outputSize);
-		//postProcess(taskId, outputSize);
-		m_taskBusy[taskId] = false;
-		m_numBusyTasks--;
-	}
diff --git a/src/bullet/BulletMultiThreaded/SpuCollisionTaskProcess.h b/src/bullet/BulletMultiThreaded/SpuCollisionTaskProcess.h
deleted file mode 100644
index 23b5b05a..00000000
--- a/src/bullet/BulletMultiThreaded/SpuCollisionTaskProcess.h
+++ /dev/null
@@ -1,163 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <assert.h>
-#include "LinearMath/btScalar.h"
-#include "PlatformDefinitions.h"
-#include "LinearMath/btAlignedObjectArray.h"
-#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h" // for definitions processCollisionTask and createCollisionLocalStoreMemory
-#include "btThreadSupportInterface.h"
-//#include "SPUAssert.h"
-#include <string.h>
-#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
-#include "BulletCollision/CollisionShapes/btCollisionShape.h"
-#include "BulletCollision/CollisionShapes/btConvexShape.h"
-#include "LinearMath/btAlignedAllocator.h"
-#include <stdio.h>
-#define DEBUG_SpuCollisionTaskProcess 1
-class btCollisionObject;
-class btPersistentManifold;
-class btDispatcher;
-/////Task Description for SPU collision detection
-//struct SpuGatherAndProcessPairsTaskDesc
-//	uint64_t	inPtr;//m_pairArrayPtr;
-//	//mutex variable
-//	uint32_t	m_someMutexVariableInMainMemory;
-//	uint64_t	m_dispatcher;
-//	uint32_t	numOnLastPage;
-//	uint16_t numPages;
-//	uint16_t taskId;
-//	struct	CollisionTask_LocalStoreMemory*	m_lsMemory; 
-//#if  defined(__CELLOS_LV2__) || defined(USE_LIBSPE2)
-//__attribute__ ((aligned (16)))
-///MidphaseWorkUnitInput stores individual primitive versus mesh collision detection input, to be processed by the SPU.
-ATTRIBUTE_ALIGNED16(struct) SpuGatherAndProcessWorkUnitInput
-	uint64_t m_pairArrayPtr;
-	int		m_startIndex;
-	int		m_endIndex;
-/// SpuCollisionTaskProcess handles SPU processing of collision pairs.
-/// Maintains a set of task buffers.
-/// When the task is full, the task is issued for SPUs to process.  Contact output goes into btPersistentManifold
-/// associated with each task.
-/// When PPU issues a task, it will look for completed task buffers
-/// PPU will do postprocessing, dependent on workunit output (not likely)
-class SpuCollisionTaskProcess
-  unsigned char  *m_workUnitTaskBuffers;
-	// track task buffers that are being used, and total busy tasks
-	btAlignedObjectArray<bool>	m_taskBusy;
-	btAlignedObjectArray<SpuGatherAndProcessPairsTaskDesc>	m_spuGatherTaskDesc;
-	class	btThreadSupportInterface*	m_threadInterface;
-	unsigned int	m_maxNumOutstandingTasks;
-	unsigned int   m_numBusyTasks;
-	// the current task and the current entry to insert a new work unit
-	unsigned int   m_currentTask;
-	unsigned int   m_currentPage;
-	unsigned int   m_currentPageEntry;
-	bool m_useEpa;
-#ifdef DEBUG_SpuCollisionTaskProcess
-	bool m_initialized;
-	void issueTask2();
-	//void postProcess(unsigned int taskId, int outputSize);
-	SpuCollisionTaskProcess(btThreadSupportInterface*	threadInterface, unsigned int maxNumOutstandingTasks);
-	~SpuCollisionTaskProcess();
-	///call initialize in the beginning of the frame, before addCollisionPairToTask
-	void initialize2(bool useEpa = false);
-	///batch up additional work to a current task for SPU processing. When batch is full, it issues the task.
-	void addWorkToTask(void* pairArrayPtr,int startIndex,int endIndex);
-	///call flush to submit potential outstanding work to SPUs and wait for all involved SPUs to be finished
-	void flush2();
-	/// set the maximum number of SPU tasks allocated
-	void	setNumTasks(int maxNumTasks);
-	int		getNumTasks() const
-	{
-		return m_maxNumOutstandingTasks;
-	}
-#define MIDPHASE_TASK_PTR(task) (&m_workUnitTaskBuffers[0] + MIDPHASE_WORKUNIT_TASK_SIZE*task)
-#define MIDPHASE_ENTRY_PTR(task,page,entry) (MIDPHASE_TASK_PTR(task) + MIDPHASE_WORKUNIT_PAGE_SIZE*page + sizeof(SpuGatherAndProcessWorkUnitInput)*entry)
-#define MIDPHASE_OUTPUT_PTR(task) (&m_contactOutputBuffers[0] + MIDPHASE_MAX_CONTACT_BUFFER_SIZE*task)
-#define MIDPHASE_TREENODES_PTR(task) (&m_complexShapeBuffers[0] + MIDPHASE_COMPLEX_SHAPE_BUFFER_SIZE*task)
diff --git a/src/bullet/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp b/src/bullet/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
deleted file mode 100644
index 286b6319..00000000
--- a/src/bullet/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "SpuContactManifoldCollisionAlgorithm.h"
-#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
-#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
-#include "BulletCollision/CollisionShapes/btCollisionShape.h"
-#include "BulletCollision/CollisionShapes/btPolyhedralConvexShape.h"
-void SpuContactManifoldCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
-	btAssert(0);
-btScalar SpuContactManifoldCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
-	btAssert(0);
-	return 1.f;
-#ifndef __SPU__
-SpuContactManifoldCollisionAlgorithm::SpuContactManifoldCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1)
-	m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
-	m_shapeType0 = body0->getCollisionShape()->getShapeType();
-	m_shapeType1 = body1->getCollisionShape()->getShapeType();
-	m_collisionMargin0 = body0->getCollisionShape()->getMargin();
-	m_collisionMargin1 = body1->getCollisionShape()->getMargin();
-	m_collisionObject0 = body0;
-	m_collisionObject1 = body1;
-	if (body0->getCollisionShape()->isPolyhedral())
-	{
-		btPolyhedralConvexShape* convex0 = (btPolyhedralConvexShape*)body0->getCollisionShape();
-		m_shapeDimensions0 = convex0->getImplicitShapeDimensions();
-	}
-	if (body1->getCollisionShape()->isPolyhedral())
-	{
-		btPolyhedralConvexShape* convex1 = (btPolyhedralConvexShape*)body1->getCollisionShape();
-		m_shapeDimensions1 = convex1->getImplicitShapeDimensions();
-	}
-#endif //__SPU__
-	if (m_manifoldPtr)
-			m_dispatcher->releaseManifold(m_manifoldPtr);
diff --git a/src/bullet/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h b/src/bullet/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
deleted file mode 100644
index d28d4db3..00000000
--- a/src/bullet/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
+++ /dev/null
@@ -1,120 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h"
-#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
-#include "BulletCollision/CollisionDispatch/btCollisionCreateFunc.h"
-#include "BulletCollision/BroadphaseCollision/btDispatcher.h"
-#include "LinearMath/btTransformUtil.h"
-class btPersistentManifold;
-/// SpuContactManifoldCollisionAlgorithm  provides contact manifold and should be processed on SPU.
-ATTRIBUTE_ALIGNED16(class) SpuContactManifoldCollisionAlgorithm : public btCollisionAlgorithm
-	btVector3	m_shapeDimensions0;
-	btVector3	m_shapeDimensions1;
-	btPersistentManifold*	m_manifoldPtr;
-	int		m_shapeType0;
-	int		m_shapeType1;
-	float	m_collisionMargin0;
-	float	m_collisionMargin1;
-	btCollisionObject*	m_collisionObject0;
-	btCollisionObject*	m_collisionObject1;
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
-	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
-	SpuContactManifoldCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
-	btConvexSeparatingDistanceUtil	m_sepDistance;
-	virtual ~SpuContactManifoldCollisionAlgorithm();
-	virtual	void	getAllContactManifolds(btManifoldArray&	manifoldArray)
-	{
-		if (m_manifoldPtr)
-			manifoldArray.push_back(m_manifoldPtr);
-	}
-	btPersistentManifold*	getContactManifoldPtr()
-	{
-		return m_manifoldPtr;
-	}
-	btCollisionObject*	getCollisionObject0()
-	{
-		return m_collisionObject0;
-	}
-	btCollisionObject*	getCollisionObject1()
-	{
-		return m_collisionObject1;
-	}
-	int		getShapeType0() const
-	{
-		return m_shapeType0;
-	}
-	int		getShapeType1() const
-	{
-		return m_shapeType1;
-	}
-	float	getCollisionMargin0() const
-	{
-		return m_collisionMargin0;
-	}
-	float	getCollisionMargin1() const
-	{
-		return m_collisionMargin1;
-	}
-	const btVector3&	getShapeDimensions0() const
-	{
-		return m_shapeDimensions0;
-	}
-	const btVector3&	getShapeDimensions1() const
-	{
-		return m_shapeDimensions1;
-	}
-	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
-		{
-			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(SpuContactManifoldCollisionAlgorithm));
-			return new(mem) SpuContactManifoldCollisionAlgorithm(ci,body0,body1);
-		}
-	};
diff --git a/src/bullet/BulletMultiThreaded/SpuDoubleBuffer.h b/src/bullet/BulletMultiThreaded/SpuDoubleBuffer.h
deleted file mode 100644
index 558d6152..00000000
--- a/src/bullet/BulletMultiThreaded/SpuDoubleBuffer.h
+++ /dev/null
@@ -1,126 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "SpuFakeDma.h"
-#include "LinearMath/btScalar.h"
-template<class T, int size>
-class DoubleBuffer
-#if defined(__SPU__) || defined(USE_LIBSPE2)
-	ATTRIBUTE_ALIGNED128( T m_buffer0[size] ) ;
-	ATTRIBUTE_ALIGNED128( T m_buffer1[size] ) ;
-	T m_buffer0[size];
-	T m_buffer1[size];
-	T *m_frontBuffer;
-	T *m_backBuffer;
-	unsigned int m_dmaTag;
-	bool m_dmaPending;
-	bool	isPending() const { return m_dmaPending;}
-	DoubleBuffer();
-	void init ();
-	// dma get and put commands
-	void backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag);
-	void backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag);
-	// gets pointer to a buffer
-	T *getFront();
-	T *getBack();
-	// if back buffer dma was started, wait for it to complete
-	// then move back to front and vice versa
-	T *swapBuffers();
-template<class T, int size>
-	init ();
-template<class T, int size>
-void DoubleBuffer<T,size>::init()
-	this->m_dmaPending = false;
-	this->m_frontBuffer = &this->m_buffer0[0];
-	this->m_backBuffer = &this->m_buffer1[0];
-template<class T, int size>
-DoubleBuffer<T,size>::backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag)
-	m_dmaPending = true;
-	m_dmaTag = tag;
-	if (numBytes)
-	{
-		m_backBuffer = (T*)cellDmaLargeGetReadOnly(m_backBuffer, ea, numBytes, tag, 0, 0);
-	}
-template<class T, int size>
-DoubleBuffer<T,size>::backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag)
-	m_dmaPending = true;
-	m_dmaTag = tag;
-	cellDmaLargePut(m_backBuffer, ea, numBytes, tag, 0, 0);
-template<class T, int size>
-T *
-	return m_frontBuffer;
-template<class T, int size>
-T *
-	return m_backBuffer;
-template<class T, int size>
-T *
-	if (m_dmaPending)
-	{
-		cellDmaWaitTagStatusAll(1<<m_dmaTag);
-		m_dmaPending = false;
-	}
-	T *tmp = m_backBuffer;
-	m_backBuffer = m_frontBuffer;
-	m_frontBuffer = tmp;
-	return m_frontBuffer;
diff --git a/src/bullet/BulletMultiThreaded/SpuFakeDma.cpp b/src/bullet/BulletMultiThreaded/SpuFakeDma.cpp
deleted file mode 100644
index b776a120..00000000
--- a/src/bullet/BulletMultiThreaded/SpuFakeDma.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "SpuFakeDma.h"
-#include <LinearMath/btScalar.h> //for btAssert
-//Disabling memcpy sometimes helps debugging DMA
-#define USE_MEMCPY 1
-#ifdef USE_MEMCPY
-void*	cellDmaLargeGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
-#if defined (__SPU__) || defined (USE_LIBSPE2)
-	cellDmaLargeGet(ls,ea,size,tag,tid,rid);
-	return ls;
-	return (void*)(ppu_address_t)ea;
-void*	cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
-#if defined (__SPU__) || defined (USE_LIBSPE2)
-	mfc_get(ls,ea,size,tag,0,0);
-	return ls;
-	return (void*)(ppu_address_t)ea;
-void*	cellDmaGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
-#if defined (__SPU__) || defined (USE_LIBSPE2)
-	cellDmaGet(ls,ea,size,tag,tid,rid);
-	return ls;
-	return (void*)(ppu_address_t)ea;
-///this unalignedDma should not be frequently used, only for small data. It handles alignment and performs check on size (<16 bytes)
-int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size)
-	btAssert(size<32);
-	ATTRIBUTE_ALIGNED16(char	tmpBuffer[32]);
-	char* localStore = (char*)ls;
-	uint32_t i;
-	///make sure last 4 bits are the same, for cellDmaSmallGet
-	uint32_t last4BitsOffset = ea & 0x0f;
-	char* tmpTarget = tmpBuffer + last4BitsOffset;
-#if defined (__SPU__) || defined (USE_LIBSPE2)
-	int remainingSize = size;
-//#define FORCE_cellDmaUnalignedGet 1
-#ifdef FORCE_cellDmaUnalignedGet
-	cellDmaUnalignedGet(tmpTarget,ea,size,DMA_TAG(1),0,0);
-	char* remainingTmpTarget = tmpTarget;
-	uint64_t remainingEa = ea;
-	while (remainingSize)
-	{
-		switch (remainingSize)
-		{
-		case 1:
-		case 2:
-		case 4:
-		case 8:
-		case 16:
-			{
-				mfc_get(remainingTmpTarget,remainingEa,remainingSize,DMA_TAG(1),0,0);
-				remainingSize=0;
-				break;
-			}
-		default:
-			{
-				//spu_printf("unaligned DMA with non-natural size:%d\n",remainingSize);
-				int actualSize = 0;
-				if (remainingSize > 16)
-					actualSize = 16;
-				else
-					if (remainingSize >8)
-						actualSize=8;
-					else
-						if (remainingSize >4)
-							actualSize=4;
-						else
-							if (remainingSize >2)
-								actualSize=2;
-				mfc_get(remainingTmpTarget,remainingEa,actualSize,DMA_TAG(1),0,0);
-				remainingSize-=actualSize;
-				remainingTmpTarget+=actualSize;
-				remainingEa += actualSize;
-			}
-		}
-	}
-	char* mainMem = (char*)ea;
-	//copy into final destination
-#ifdef USE_MEMCPY
-		memcpy(tmpTarget,mainMem,size);
-		for ( i=0;i<size;i++)
-		{
-			tmpTarget[i] = mainMem[i];
-		}
-#endif //USE_MEMCPY
-	cellDmaWaitTagStatusAll(DMA_MASK(1));
-	//this is slowish, perhaps memcpy on SPU is smarter?
-	for (i=0; btLikely( i<size );i++)
-	{
-		localStore[i] = tmpTarget[i];
-	}
-	return 0;
-#if defined (__SPU__) || defined (USE_LIBSPE2)
-int	cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
-	char* mainMem = (char*)ea;
-	char* localStore = (char*)ls;
-#ifdef USE_MEMCPY
-	memcpy(localStore,mainMem,size);
-	for (uint32_t i=0;i<size;i++)
-	{
-		localStore[i] = mainMem[i];
-	}
-	return 0;
-int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
-	char* mainMem = (char*)ea;
-	char* localStore = (char*)ls;
-//	printf("mainMem=%x, localStore=%x",mainMem,localStore);
-#ifdef USE_MEMCPY
-	memcpy(localStore,mainMem,size);
-	for (uint32_t i=0;i<size;i++)
-	{
-		localStore[i] = mainMem[i];
-	}	
-#endif //#ifdef USE_MEMCPY
-//	printf(" finished\n");
-	return 0;
-int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
-	char* mainMem = (char*)ea;
-	const char* localStore = (const char*)ls;
-#ifdef USE_MEMCPY
-	memcpy(mainMem,localStore,size);
-	for (uint32_t i=0;i<size;i++)
-	{
-		mainMem[i] = localStore[i];
-	}	
-#endif //#ifdef USE_MEMCPY
-	return 0;
-void	cellDmaWaitTagStatusAll(int ignore)
diff --git a/src/bullet/BulletMultiThreaded/SpuFakeDma.h b/src/bullet/BulletMultiThreaded/SpuFakeDma.h
deleted file mode 100644
index 40e20393..00000000
--- a/src/bullet/BulletMultiThreaded/SpuFakeDma.h
+++ /dev/null
@@ -1,135 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#ifndef BT_FAKE_DMA_H
-#define BT_FAKE_DMA_H
-#include "PlatformDefinitions.h"
-#include "LinearMath/btScalar.h"
-#ifdef __SPU__
-#ifndef USE_LIBSPE2
-#include <cell/dma.h>
-#include <stdint.h>
-#define DMA_TAG(xfer) (xfer + 1)
-#define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
-#else // !USE_LIBSPE2
-#define DMA_TAG(xfer) (xfer + 1)
-#define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
-#include <spu_mfcio.h>		
-#define DEBUG_DMA		
-#ifdef DEBUG_DMA
-#define dUASSERT(a,b) if (!(a)) { printf(b);}
-#define uintsize ppu_address_t
-#define cellDmaLargeGet(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
-															dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
-															dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
-															dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
-															dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
-															dUASSERT(size < 16384, "size too big: "); \
-															dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
-	    													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
-															printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
-															} \
-															mfc_get(ls, ea, size, tag, tid, rid)
-#define cellDmaGet(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
-														dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
-														dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
-														dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
-														dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
-    													dUASSERT(size < 16384, "size too big: "); \
-														dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
-    													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
-    													printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
-														} \
-														mfc_get(ls, ea, size, tag, tid, rid)
-#define cellDmaLargePut(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
-															dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
-															dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
-															dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
-															dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
-        													dUASSERT(size < 16384, "size too big: "); \
-															dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
-        													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
-    														printf("PUT %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ls,(unsigned int)ea,(unsigned int)size); \
-															} \
-															mfc_put(ls, ea, size, tag, tid, rid)
-#define cellDmaSmallGet(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
-																dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
-																dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
-																dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
-    															dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
-    															dUASSERT(size < 16384, "size too big: "); \
-    															dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
-    	    													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
-    															printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
-																} \
-																mfc_get(ls, ea, size, tag, tid, rid)
-#define cellDmaWaitTagStatusAll(ignore) mfc_write_tag_mask(ignore) ; mfc_read_tag_status_all()
-#define cellDmaLargeGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
-#define cellDmaGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
-#define cellDmaLargePut(ls, ea, size, tag, tid, rid) mfc_put(ls, ea, size, tag, tid, rid)
-#define cellDmaSmallGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
-#define cellDmaWaitTagStatusAll(ignore) mfc_write_tag_mask(ignore) ; mfc_read_tag_status_all()
-#endif // DEBUG_DMA
-#endif // USE_LIBSPE2
-#else // !__SPU__
-//Simulate DMA using memcpy or direct access on non-CELL platforms that don't have DMAs and SPUs (Win32, Mac, Linux etc)
-//Potential to add networked simulation using this interface
-#define DMA_TAG(a) (a)
-#define DMA_MASK(a) (a)
-		/// cellDmaLargeGet Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
-		int	cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
-		int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
-		/// cellDmaLargePut Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
-		int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
-		/// cellDmaWaitTagStatusAll Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
-		void	cellDmaWaitTagStatusAll(int ignore);
-#endif //__CELLOS_LV2__
-///stallingUnalignedDmaSmallGet internally uses DMA_TAG(1)
-int	stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size);
-void*	cellDmaLargeGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
-void*	cellDmaGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
-void*	cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
-#endif //BT_FAKE_DMA_H
diff --git a/src/bullet/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp b/src/bullet/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
deleted file mode 100644
index 1a76be08..00000000
--- a/src/bullet/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
+++ /dev/null
@@ -1,276 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "SpuGatheringCollisionDispatcher.h"
-#include "SpuCollisionTaskProcess.h"
-#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
-#include "BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h"
-#include "SpuContactManifoldCollisionAlgorithm.h"
-#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
-#include "BulletCollision/CollisionShapes/btCollisionShape.h"
-#include "LinearMath/btQuickprof.h"
-#include "BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h"
-SpuGatheringCollisionDispatcher::SpuGatheringCollisionDispatcher(class	btThreadSupportInterface*	threadInterface, unsigned int	maxNumOutstandingTasks,btCollisionConfiguration* collisionConfiguration)
-bool	SpuGatheringCollisionDispatcher::supportsDispatchPairOnSpu(int proxyType0,int proxyType1)
-	bool supported0 = (
-		(proxyType0 == BOX_SHAPE_PROXYTYPE) ||
-		(proxyType0 == TRIANGLE_SHAPE_PROXYTYPE) ||
-		(proxyType0 == SPHERE_SHAPE_PROXYTYPE) ||
-		(proxyType0 == CAPSULE_SHAPE_PROXYTYPE) ||
-		(proxyType0 == CYLINDER_SHAPE_PROXYTYPE) ||
-//		(proxyType0 == CONE_SHAPE_PROXYTYPE) ||
-		(proxyType0 == STATIC_PLANE_PROXYTYPE)||
-		);
-	bool supported1 = (
-		(proxyType1 == BOX_SHAPE_PROXYTYPE) ||
-		(proxyType1 == TRIANGLE_SHAPE_PROXYTYPE) ||
-		(proxyType1 == SPHERE_SHAPE_PROXYTYPE) ||
-		(proxyType1 == CAPSULE_SHAPE_PROXYTYPE) ||
-		(proxyType1 == CYLINDER_SHAPE_PROXYTYPE) ||
-//		(proxyType1 == CONE_SHAPE_PROXYTYPE) ||
-		(proxyType1 == STATIC_PLANE_PROXYTYPE) ||
-		);
-	return supported0 && supported1;
-	if (m_spuCollisionTaskProcess)
-		delete m_spuCollisionTaskProcess;
-#include "stdio.h"
-///interface for iterating all overlapping collision pairs, no matter how those pairs are stored (array, set, map etc)
-///this is useful for the collision dispatcher.
-class btSpuCollisionPairCallback : public btOverlapCallback
-	const btDispatcherInfo& m_dispatchInfo;
-	SpuGatheringCollisionDispatcher*	m_dispatcher;
-	btSpuCollisionPairCallback(const btDispatcherInfo& dispatchInfo, SpuGatheringCollisionDispatcher*	dispatcher)
-	:m_dispatchInfo(dispatchInfo),
-	m_dispatcher(dispatcher)
-	{
-	}
-	virtual bool	processOverlap(btBroadphasePair& collisionPair)
-	{
-		//PPU version
-		//(*m_dispatcher->getNearCallback())(collisionPair,*m_dispatcher,m_dispatchInfo);
-		//only support discrete collision detection for now, we could fallback on PPU/unoptimized version for TOI/CCD
-		btAssert(m_dispatchInfo.m_dispatchFunc == btDispatcherInfo::DISPATCH_DISCRETE);
-		//by default, Bullet will use this near callback
-		{
-			///userInfo is used to determine if the SPU has to handle this case or not (skip PPU tasks)
-			if (!collisionPair.m_internalTmpValue)
-			{
-				collisionPair.m_internalTmpValue = 1;
-			}
-			if (!collisionPair.m_algorithm)
-			{
-				btCollisionObject* colObj0 = (btCollisionObject*)collisionPair.m_pProxy0->m_clientObject;
-				btCollisionObject* colObj1 = (btCollisionObject*)collisionPair.m_pProxy1->m_clientObject;
-				btCollisionAlgorithmConstructionInfo ci;
-				ci.m_dispatcher1 = m_dispatcher;
-				ci.m_manifold = 0;
-				if (m_dispatcher->needsCollision(colObj0,colObj1))
-				{
-					int	proxyType0 = colObj0->getCollisionShape()->getShapeType();
-					int	proxyType1 = colObj1->getCollisionShape()->getShapeType();
-					bool supportsSpuDispatch = m_dispatcher->supportsDispatchPairOnSpu(proxyType0,proxyType1) 
-						&& ((colObj0->getCollisionFlags() & btCollisionObject::CF_DISABLE_SPU_COLLISION_PROCESSING) == 0)
-						&& ((colObj1->getCollisionFlags() & btCollisionObject::CF_DISABLE_SPU_COLLISION_PROCESSING) == 0);
-					if (proxyType0 == COMPOUND_SHAPE_PROXYTYPE)
-					{
-						btCompoundShape* compound = (btCompoundShape*)colObj0->getCollisionShape();
-						if (compound->getNumChildShapes()>MAX_SPU_COMPOUND_SUBSHAPES)
-						{
-							//printf("PPU fallback, compound->getNumChildShapes(%d)>%d\n",compound->getNumChildShapes(),MAX_SPU_COMPOUND_SUBSHAPES);
-							supportsSpuDispatch = false;
-						}
-					}
-					if (proxyType1 == COMPOUND_SHAPE_PROXYTYPE)
-					{
-						btCompoundShape* compound = (btCompoundShape*)colObj1->getCollisionShape();
-						if (compound->getNumChildShapes()>MAX_SPU_COMPOUND_SUBSHAPES)
-						{
-							//printf("PPU fallback, compound->getNumChildShapes(%d)>%d\n",compound->getNumChildShapes(),MAX_SPU_COMPOUND_SUBSHAPES);
-							supportsSpuDispatch = false;
-						}
-					}
-					if (supportsSpuDispatch)
-					{
-						int so = sizeof(SpuContactManifoldCollisionAlgorithm);
-						void* mem = btAlignedAlloc(so,16);//m_dispatcher->allocateCollisionAlgorithm(so);
-						void* mem = m_dispatcher->allocateCollisionAlgorithm(so);
-						collisionPair.m_algorithm = new(mem) SpuContactManifoldCollisionAlgorithm(ci,colObj0,colObj1);
-						collisionPair.m_internalTmpValue =  2;
-					} else
-					{
-						collisionPair.m_algorithm = m_dispatcher->findAlgorithm(colObj0,colObj1);
-						collisionPair.m_internalTmpValue = 3;
-					}
-				} 
-			}
-		}
-		return false;
-	}
-void	SpuGatheringCollisionDispatcher::dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo, btDispatcher* dispatcher) 
-	if (dispatchInfo.m_enableSPU)
-	{
-		m_maxNumOutstandingTasks = m_threadInterface->getNumTasks();
-		{
-			BT_PROFILE("processAllOverlappingPairs");
-			if (!m_spuCollisionTaskProcess)
-				m_spuCollisionTaskProcess = new SpuCollisionTaskProcess(m_threadInterface,m_maxNumOutstandingTasks);
-			m_spuCollisionTaskProcess->setNumTasks(m_maxNumOutstandingTasks);
-	//		printf("m_maxNumOutstandingTasks =%d\n",m_maxNumOutstandingTasks);
-			m_spuCollisionTaskProcess->initialize2(dispatchInfo.m_useEpa);
-			///modified version of btCollisionDispatcher::dispatchAllCollisionPairs:
-			{
-				btSpuCollisionPairCallback	collisionCallback(dispatchInfo,this);
-				pairCache->processAllOverlappingPairs(&collisionCallback,dispatcher);
-			}
-		}
-		//send one big batch
-		int numTotalPairs = pairCache->getNumOverlappingPairs();
-		if (numTotalPairs)
-		{
-			btBroadphasePair* pairPtr = pairCache->getOverlappingPairArrayPtr();
-			int i;
-			{
-				if (numTotalPairs < (m_spuCollisionTaskProcess->getNumTasks()*SPU_BATCHSIZE_BROADPHASE_PAIRS))
-				{
-					pairRange = (numTotalPairs/m_spuCollisionTaskProcess->getNumTasks())+1;
-				}
-				BT_PROFILE("addWorkToTask");
-				for (i=0;i<numTotalPairs;)
-				{
-					//Performance Hint: tweak this number during benchmarking
-					int endIndex = (i+pairRange) < numTotalPairs ? i+pairRange : numTotalPairs;
-					m_spuCollisionTaskProcess->addWorkToTask(pairPtr,i,endIndex);
-					i = endIndex;
-				}
-			}
-			{
-				BT_PROFILE("PPU fallback");
-				//handle PPU fallback pairs
-				for (i=0;i<numTotalPairs;i++)
-				{
-					btBroadphasePair& collisionPair = pairPtr[i];
-					if (collisionPair.m_internalTmpValue == 3)
-					{
-						if (collisionPair.m_algorithm)
-						{
-							btCollisionObject* colObj0 = (btCollisionObject*)collisionPair.m_pProxy0->m_clientObject;
-							btCollisionObject* colObj1 = (btCollisionObject*)collisionPair.m_pProxy1->m_clientObject;
-							if (dispatcher->needsCollision(colObj0,colObj1))
-							{
-								btManifoldResult contactPointResult(colObj0,colObj1);
-								if (dispatchInfo.m_dispatchFunc == 		btDispatcherInfo::DISPATCH_DISCRETE)
-								{
-									//discrete collision detection query
-									collisionPair.m_algorithm->processCollision(colObj0,colObj1,dispatchInfo,&contactPointResult);
-								} else
-								{
-									//continuous collision detection query, time of impact (toi)
-									btScalar toi = collisionPair.m_algorithm->calculateTimeOfImpact(colObj0,colObj1,dispatchInfo,&contactPointResult);
-									if (dispatchInfo.m_timeOfImpact > toi)
-										dispatchInfo.m_timeOfImpact = toi;
-								}
-							}
-						}
-					}
-				}
-			}
-		}
-		{
-			BT_PROFILE("flush2");
-			//make sure all SPU work is done
-			m_spuCollisionTaskProcess->flush2();
-		}
-	} else
-	{
-		///PPU fallback
-		///!Need to make sure to clear all 'algorithms' when switching between SPU and PPU
-		btCollisionDispatcher::dispatchAllCollisionPairs(pairCache,dispatchInfo,dispatcher);
-	}
diff --git a/src/bullet/BulletMultiThreaded/SpuGatheringCollisionDispatcher.h b/src/bullet/BulletMultiThreaded/SpuGatheringCollisionDispatcher.h
deleted file mode 100644
index f8bc7da6..00000000
--- a/src/bullet/BulletMultiThreaded/SpuGatheringCollisionDispatcher.h
+++ /dev/null
@@ -1,72 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
-///Tuning value to optimized SPU utilization 
-///Too small value means Task overhead is large compared to computation (too fine granularity)
-///Too big value might render some SPUs are idle, while a few other SPUs are doing all work.
-class SpuCollisionTaskProcess;
-///SpuGatheringCollisionDispatcher can use SPU to gather and calculate collision detection
-///Time of Impact, Closest Points and Penetration Depth.
-class SpuGatheringCollisionDispatcher : public btCollisionDispatcher
-	SpuCollisionTaskProcess*	m_spuCollisionTaskProcess;
-	class	btThreadSupportInterface*	m_threadInterface;
-	unsigned int	m_maxNumOutstandingTasks;
-	//can be used by SPU collision algorithms	
-	SpuCollisionTaskProcess*	getSpuCollisionTaskProcess()
-	{
-			return m_spuCollisionTaskProcess;
-	}
-	SpuGatheringCollisionDispatcher (class	btThreadSupportInterface*	threadInterface, unsigned int	maxNumOutstandingTasks,btCollisionConfiguration* collisionConfiguration);
-	virtual ~SpuGatheringCollisionDispatcher();
-	bool	supportsDispatchPairOnSpu(int proxyType0,int proxyType1);
-	virtual void	dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher) ;
diff --git a/src/bullet/BulletMultiThreaded/SpuLibspe2Support.cpp b/src/bullet/BulletMultiThreaded/SpuLibspe2Support.cpp
deleted file mode 100644
index a312450e..00000000
--- a/src/bullet/BulletMultiThreaded/SpuLibspe2Support.cpp
+++ /dev/null
@@ -1,257 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#ifdef USE_LIBSPE2
-#include "SpuLibspe2Support.h"
-//SpuLibspe2Support helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
-///Setup and initialize SPU/CELL/Libspe2
-SpuLibspe2Support::SpuLibspe2Support(spe_program_handle_t *speprog, int numThreads)
-	this->program = speprog;
-	this->numThreads =  ((numThreads <= spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1)) ? numThreads : spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1));
-///cleanup/shutdown Libspe2
-	stopSPU();
-///send messages to SPUs
-void SpuLibspe2Support::sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t uiArgument1)
-	spe_context_ptr_t context;
-	switch (uiCommand)
-	{
-	{
-		//get taskdescription
-		SpuSampleTaskDesc* taskDesc = (SpuSampleTaskDesc*) uiArgument0;
-		btAssert(taskDesc->m_taskId<m_activeSpuStatus.size());
-		//get status of SPU on which task should run
-		btSpuStatus&	spuStatus = m_activeSpuStatus[taskDesc->m_taskId];
-		//set data for spuStatus
-		spuStatus.m_commandId = uiCommand;
-		spuStatus.m_status = Spu_Status_Occupied; //set SPU as "occupied"
-		spuStatus.m_taskDesc.p = taskDesc; 
-		//get context
-		context = data[taskDesc->m_taskId].context;
-		taskDesc->m_mainMemoryPtr = reinterpret_cast<uint64_t> (spuStatus.m_lsMemory.p);
-		break;
-	}
-		{
-			//get taskdescription
-			SpuGatherAndProcessPairsTaskDesc* taskDesc = (SpuGatherAndProcessPairsTaskDesc*) uiArgument0;
-			btAssert(taskDesc->taskId<m_activeSpuStatus.size());
-			//get status of SPU on which task should run
-			btSpuStatus&	spuStatus = m_activeSpuStatus[taskDesc->taskId];
-			//set data for spuStatus
-			spuStatus.m_commandId = uiCommand;
-			spuStatus.m_status = Spu_Status_Occupied; //set SPU as "occupied"
-			spuStatus.m_taskDesc.p = taskDesc; 
-			//get context
-			context = data[taskDesc->taskId].context;
-			taskDesc->m_lsMemory = (CollisionTask_LocalStoreMemory*)spuStatus.m_lsMemory.p;
-			break;
-		}
-	default:
-		{
-			///not implemented
-			btAssert(0);
-		}
-	};
-	//write taskdescription in mailbox
-	unsigned int event = Spu_Mailbox_Event_Task;
-	spe_in_mbox_write(context, &event, 1, SPE_MBOX_ANY_NONBLOCKING);
-///check for messages from SPUs
-void SpuLibspe2Support::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
-	///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response
-	///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback'
-	btAssert(m_activeSpuStatus.size());
-	int last = -1;
-	//find an active spu/thread
-	while(last < 0)
-	{
-		for (int i=0;i<m_activeSpuStatus.size();i++)
-		{
-			if ( m_activeSpuStatus[i].m_status == Spu_Status_Free)
-			{
-				last = i;
-				break;
-			}
-		}
-		if(last < 0)
-			sched_yield();
-	}
-	btSpuStatus& spuStatus = m_activeSpuStatus[last];
-	///need to find an active spu
-	btAssert(last>=0);
-	*puiArgument0 = spuStatus.m_taskId;
-	*puiArgument1 = spuStatus.m_status;
-void SpuLibspe2Support::startSPU()
-	this->internal_startSPU();
-///start the spus group (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
-void SpuLibspe2Support::internal_startSPU()
-	m_activeSpuStatus.resize(numThreads);
-	for (int i=0; i < numThreads; i++)
-	{
-		if(data[i].context == NULL) 
-		{
-			 /* Create context */
-			if ((data[i].context = spe_context_create(0, NULL)) == NULL)
-			{
-			      perror ("Failed creating context");
-		          exit(1);
-			}
-			/* Load program into context */
-			if(spe_program_load(data[i].context, this->program))
-			{
-			      perror ("Failed loading program");
-		          exit(1);
-			}
-			m_activeSpuStatus[i].m_status = Spu_Status_Startup; 
-			m_activeSpuStatus[i].m_taskId = i; 
-			m_activeSpuStatus[i].m_commandId = 0; 
-			m_activeSpuStatus[i].m_lsMemory.p = NULL; 
-			data[i].entry = SPE_DEFAULT_ENTRY;
-			data[i].flags = 0;
-			data[i].argp.p = &m_activeSpuStatus[i];
-			data[i].envp.p = NULL;
-		    /* Create thread for each SPE context */
-			if (pthread_create(&data[i].pthread, NULL, &ppu_pthread_function, &(data[i]) ))
-			{
-			      perror ("Failed creating thread");
-		          exit(1);
-			}
-			/*
-			else
-			{
-				printf("started thread %d\n",i);
-			}*/
-		}		
-	}
-	for (int i=0; i < numThreads; i++)
-	{
-		if(data[i].context != NULL) 
-		{
-			while( m_activeSpuStatus[i].m_status == Spu_Status_Startup)
-			{
-				// wait for spu to set up
-				sched_yield();
-			}
-			printf("Spu %d is ready\n", i);
-		}
-	}
-///tell the task scheduler we are done with the SPU tasks
-void SpuLibspe2Support::stopSPU()
-	// wait for all threads to finish 
-	int i;
-	for ( i = 0; i < this->numThreads; i++ ) 
-	{ 
-		unsigned int event = Spu_Mailbox_Event_Shutdown;
-		spe_context_ptr_t context = data[i].context;
-		spe_in_mbox_write(context, &event, 1, SPE_MBOX_ALL_BLOCKING);
-		pthread_join (data[i].pthread, NULL); 
-	} 
-	// close SPE program 
-	spe_image_close(program); 
-	// destroy SPE contexts 
-	for ( i = 0; i < this->numThreads; i++ ) 
-	{ 
-		if(data[i].context != NULL)
-		{
-			spe_context_destroy (data[i].context);
-		}
-	} 
-	m_activeSpuStatus.clear();
-#endif //USE_LIBSPE2
diff --git a/src/bullet/BulletMultiThreaded/SpuLibspe2Support.h b/src/bullet/BulletMultiThreaded/SpuLibspe2Support.h
deleted file mode 100644
index 37a5e79f..00000000
--- a/src/bullet/BulletMultiThreaded/SpuLibspe2Support.h
+++ /dev/null
@@ -1,180 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <LinearMath/btScalar.h> //for uint32_t etc.
-#ifdef USE_LIBSPE2
-#include <stdlib.h>
-#include <stdio.h>
-//#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
-#include "PlatformDefinitions.h"
-//extern struct SpuGatherAndProcessPairsTaskDesc;
-	Spu_Mailbox_Event_Nothing = 0,
-	Spu_Mailbox_Event_Task = 1,
-	Spu_Mailbox_Event_Shutdown = 2,
-	Spu_Mailbox_Event_ForceDword = 0xFFFFFFFF
-	Spu_Status_Free = 0,
-	Spu_Status_Occupied = 1,
-	Spu_Status_Startup = 2,
-	Spu_Status_ForceDword = 0xFFFFFFFF
-struct btSpuStatus
-	uint32_t	m_taskId;
-	uint32_t	m_commandId;
-	uint32_t	m_status;
-	addr64 m_taskDesc;
-	addr64 m_lsMemory;
-__attribute__ ((aligned (128)))
-#ifndef __SPU__
-#include "LinearMath/btAlignedObjectArray.h"
-#include "SpuCollisionTaskProcess.h"
-#include "SpuSampleTaskProcess.h"
-#include "btThreadSupportInterface.h"
-#include <libspe2.h>
-#include <pthread.h>
-#include <sched.h>
-#define MAX_SPUS 4 
-typedef struct ppu_pthread_data 
-	spe_context_ptr_t context;
-	pthread_t pthread;
-	unsigned int entry;
-	unsigned int flags;
-	addr64 argp;
-	addr64 envp;
-	spe_stop_info_t stopinfo;
-} ppu_pthread_data_t;
-static void *ppu_pthread_function(void *arg)
-    ppu_pthread_data_t * datap = (ppu_pthread_data_t *)arg;
-    /*
-    int rc;
-    do 
-    {*/
-        spe_context_run(datap->context, &datap->entry, datap->flags, datap->argp.p, datap->envp.p, &datap->stopinfo);
-        if (datap->stopinfo.stop_reason == SPE_EXIT) 
-        {
-           if (datap->stopinfo.result.spe_exit_code != 0) 
-           {
-             perror("FAILED: SPE returned a non-zero exit status: \n");
-             exit(1);
-           }
-         } 
-        else 
-         {
-           perror("FAILED: SPE abnormally terminated\n");
-           exit(1);
-         }
-    //} while (rc > 0); // loop until exit or error, and while any stop & signal
-    pthread_exit(NULL);
-///SpuLibspe2Support helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
-class SpuLibspe2Support : public btThreadSupportInterface
-	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
-	//Setup and initialize SPU/CELL/Libspe2
-	SpuLibspe2Support(spe_program_handle_t *speprog,int numThreads);
-	// SPE program handle ptr.
-	spe_program_handle_t *program;
-	// SPE program data
-	ppu_pthread_data_t data[MAX_SPUS];
-	//cleanup/shutdown Libspe2
-	~SpuLibspe2Support();
-	///send messages to SPUs
-	void sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t uiArgument1=0);
-	//check for messages from SPUs
-	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
-	//start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
-	virtual void startSPU();
-	//tell the task scheduler we are done with the SPU tasks
-	virtual void stopSPU();
-	virtual void setNumTasks(int numTasks)
-	{
-		//changing the number of tasks after initialization is not implemented (yet)
-	}
-	///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
-	void internal_startSPU();
-	int numThreads;
-#endif // NOT __SPU__
-#endif //USE_LIBSPE2
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
deleted file mode 100644
index e5179611..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
+++ /dev/null
@@ -1,167 +0,0 @@
-   Copyright (C) 2006, 2008 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#ifndef __BOX_H__
-#define __BOX_H__
-#ifndef PE_REF
-#define PE_REF(a) a&
-#include <math.h>
-#include "../PlatformDefinitions.h"
-enum FeatureType { F, E, V };
-// Box
-///The Box is an internal class used by the boxBoxDistance calculation.
-class Box
-	vmVector3 mHalf;
-	inline Box()
-	{}
-	inline Box(PE_REF(vmVector3) half_);
-	inline Box(float hx, float hy, float hz);
-	inline void Set(PE_REF(vmVector3) half_);
-	inline void Set(float hx, float hy, float hz);
-	inline vmVector3 GetAABB(const vmMatrix3& rotation) const;
-Box::Box(PE_REF(vmVector3) half_)
-	Set(half_);
-Box::Box(float hx, float hy, float hz)
-	Set(hx, hy, hz);
-Box::Set(PE_REF(vmVector3) half_)
-	mHalf = half_;
-Box::Set(float hx, float hy, float hz)
-	mHalf = vmVector3(hx, hy, hz);
-Box::GetAABB(const vmMatrix3& rotation) const
-	return absPerElem(rotation) * mHalf;
-// BoxPoint
-///The BoxPoint class is an internally used class to contain feature information for boxBoxDistance calculation.
-class BoxPoint
-	BoxPoint() : localPoint(0.0f) {}
-	vmPoint3      localPoint;
-	FeatureType featureType;
-	int         featureIdx;
-	inline void setVertexFeature(int plusX, int plusY, int plusZ);
-	inline void setEdgeFeature(int dim0, int plus0, int dim1, int plus1);
-	inline void setFaceFeature(int dim, int plus);
-	inline void getVertexFeature(int & plusX, int & plusY, int & plusZ) const;
-	inline void getEdgeFeature(int & dim0, int & plus0, int & dim1, int & plus1) const;
-	inline void getFaceFeature(int & dim, int & plus) const;
-BoxPoint::setVertexFeature(int plusX, int plusY, int plusZ)
-	featureType = V;
-	featureIdx = plusX << 2 | plusY << 1 | plusZ;
-BoxPoint::setEdgeFeature(int dim0, int plus0, int dim1, int plus1)
-	featureType = E;
-	if (dim0 > dim1) {
-		featureIdx = plus1 << 5 | dim1 << 3 | plus0 << 2 | dim0;
-	} else {
-		featureIdx = plus0 << 5 | dim0 << 3 | plus1 << 2 | dim1;
-	}
-BoxPoint::setFaceFeature(int dim, int plus)
-	featureType = F;
-	featureIdx = plus << 2 | dim;
-BoxPoint::getVertexFeature(int & plusX, int & plusY, int & plusZ) const
-	plusX = featureIdx >> 2;
-	plusY = featureIdx >> 1 & 1;
-	plusZ = featureIdx & 1;
-BoxPoint::getEdgeFeature(int & dim0, int & plus0, int & dim1, int & plus1) const
-	plus0 = featureIdx >> 5;
-	dim0 = featureIdx >> 3 & 3;
-	plus1 = featureIdx >> 2 & 1;
-	dim1 = featureIdx & 3;
-BoxPoint::getFaceFeature(int & dim, int & plus) const
-	plus = featureIdx >> 2;
-	dim = featureIdx & 3;
-#endif /* __BOX_H__ */
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
deleted file mode 100644
index dfcd8426..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
+++ /dev/null
@@ -1,302 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "SpuCollisionShapes.h"
-///not supported on IBM SDK, until we fix the alignment of btVector3
-#if defined (__CELLOS_LV2__) && defined (__SPU__)
-#include <spu_intrinsics.h>
-static inline vec_float4 vec_dot3( vec_float4 vec0, vec_float4 vec1 )
-    vec_float4 result;
-    result = spu_mul( vec0, vec1 );
-    result = spu_madd( spu_rlqwbyte( vec0, 4 ), spu_rlqwbyte( vec1, 4 ), result );
-    return spu_madd( spu_rlqwbyte( vec0, 8 ), spu_rlqwbyte( vec1, 8 ), result );
-#endif //__SPU__
-void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape* convexShape, ppu_address_t convexShapePtr, int shapeType, const btTransform& xform)
-	//calculate the aabb, given the types...
-	switch (shapeType)
-	{
-		/* fall through */
-	{
-		btScalar margin=convexShape->getMarginNV();
-		btVector3 halfExtents = convexShape->getImplicitShapeDimensions();
-		halfExtents += btVector3(margin,margin,margin);
-		const btTransform& t = xform;
-		btMatrix3x3 abs_b = t.getBasis().absolute();  
-		btVector3 center = t.getOrigin();
-		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));
-		aabbMin = center - extent;
-		aabbMax = center + extent;
-		break;
-	}
-	{
-		btScalar margin=convexShape->getMarginNV();
-		btVector3 halfExtents = convexShape->getImplicitShapeDimensions();
-		//add the radius to y-axis to get full height
-		btScalar radius = halfExtents[0];
-		halfExtents[1] += radius;
-		halfExtents += btVector3(margin,margin,margin);
-#if 0
-		int capsuleUpAxis = convexShape->getUpAxis();
-		btScalar halfHeight = convexShape->getHalfHeight();
-		btScalar radius = convexShape->getRadius();
-		halfExtents[capsuleUpAxis] = radius + halfHeight;
-		const btTransform& t = xform;
-		btMatrix3x3 abs_b = t.getBasis().absolute();  
-		btVector3 center = t.getOrigin();
-		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));
-		aabbMin = center - extent;
-		aabbMax = center + extent;
-		break;
-	}
-	{
-		btScalar radius = convexShape->getImplicitShapeDimensions().getX();// * convexShape->getLocalScaling().getX();
-		btScalar margin = radius + convexShape->getMarginNV();
-		const btTransform& t = xform;
-		const btVector3& center = t.getOrigin();
-		btVector3 extent(margin,margin,margin);
-		aabbMin = center - extent;
-		aabbMax = center + extent;
-		break;
-	}
-	{
-		ATTRIBUTE_ALIGNED16(char convexHullShape0[sizeof(btConvexHullShape)]);
-		cellDmaGet(&convexHullShape0, convexShapePtr  , sizeof(btConvexHullShape), DMA_TAG(1), 0, 0);
-		cellDmaWaitTagStatusAll(DMA_MASK(1));
-		btConvexHullShape* localPtr = (btConvexHullShape*)&convexHullShape0;
-		const btTransform& t = xform;
-		btScalar margin = convexShape->getMarginNV();
-		localPtr->getNonvirtualAabb(t,aabbMin,aabbMax,margin);
-		//spu_printf("SPU convex aabbMin=%f,%f,%f=\n",aabbMin.getX(),aabbMin.getY(),aabbMin.getZ());
-		//spu_printf("SPU convex aabbMax=%f,%f,%f=\n",aabbMax.getX(),aabbMax.getY(),aabbMax.getZ());
-		break;
-	}
-	default:
-		{
-	//	spu_printf("SPU: unsupported shapetype %d in AABB calculation\n");
-		}
-	};
-void dmaBvhShapeData (bvhMeshShape_LocalStoreMemory* bvhMeshShape, btBvhTriangleMeshShape* triMeshShape)
-	register int dmaSize;
-	register ppu_address_t	dmaPpuAddress2;
-	dmaSize = sizeof(btTriangleIndexVertexArray);
-	dmaPpuAddress2 = reinterpret_cast<ppu_address_t>(triMeshShape->getMeshInterface());
-	//	spu_printf("trimeshShape->getMeshInterface() == %llx\n",dmaPpuAddress2);
-#ifdef __SPU__
-	cellDmaGet(&bvhMeshShape->gTriangleMeshInterfaceStorage, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-	bvhMeshShape->gTriangleMeshInterfacePtr = &bvhMeshShape->gTriangleMeshInterfaceStorage;
-	bvhMeshShape->gTriangleMeshInterfacePtr = (btTriangleIndexVertexArray*)cellDmaGetReadOnly(&bvhMeshShape->gTriangleMeshInterfaceStorage, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-	//cellDmaWaitTagStatusAll(DMA_MASK(1));
-	///now DMA over the BVH
-	dmaSize = sizeof(btOptimizedBvh);
-	dmaPpuAddress2 = reinterpret_cast<ppu_address_t>(triMeshShape->getOptimizedBvh());
-	//spu_printf("trimeshShape->getOptimizedBvh() == %llx\n",dmaPpuAddress2);
-	cellDmaGet(&bvhMeshShape->gOptimizedBvh, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
-	//cellDmaWaitTagStatusAll(DMA_MASK(2));
-	cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-void dmaBvhIndexedMesh (btIndexedMesh* IndexMesh, IndexedMeshArray& indexArray, int index, uint32_t dmaTag)
-	cellDmaGet(IndexMesh, (ppu_address_t)&indexArray[index]  , sizeof(btIndexedMesh), DMA_TAG(dmaTag), 0, 0);
-void dmaBvhSubTreeHeaders (btBvhSubtreeInfo* subTreeHeaders, ppu_address_t subTreePtr, int batchSize, uint32_t dmaTag)
-	cellDmaGet(subTreeHeaders, subTreePtr, batchSize * sizeof(btBvhSubtreeInfo), DMA_TAG(dmaTag), 0, 0);
-void dmaBvhSubTreeNodes (btQuantizedBvhNode* nodes, const btBvhSubtreeInfo& subtree, QuantizedNodeArray&	nodeArray, int dmaTag)
-	cellDmaGet(nodes, reinterpret_cast<ppu_address_t>(&nodeArray[subtree.m_rootNodeIndex]) , subtree.m_subtreeSize* sizeof(btQuantizedBvhNode), DMA_TAG(2), 0, 0);
-///getShapeTypeSize could easily be optimized, but it is not likely a bottleneck
-int		getShapeTypeSize(int shapeType)
-	switch (shapeType)
-	{
-		{
-			int shapeSize = sizeof(btCylinderShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-		{
-			int shapeSize = sizeof(btBoxShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-		{
-			int shapeSize = sizeof(btSphereShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-		{
-			int shapeSize = sizeof(btBvhTriangleMeshShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-		{
-			int shapeSize = sizeof(btCapsuleShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-		{
-			int shapeSize = sizeof(btConvexHullShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-		{
-			int shapeSize = sizeof(btCompoundShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-		{
-			int shapeSize = sizeof(btStaticPlaneShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-	default:
-		btAssert(0);
-		//unsupported shapetype, please add here
-		return 0;
-	}
-void dmaConvexVertexData (SpuConvexPolyhedronVertexData* convexVertexData, btConvexHullShape* convexShapeSPU)
-	convexVertexData->gNumConvexPoints = convexShapeSPU->getNumPoints();
-	if (convexVertexData->gNumConvexPoints>MAX_NUM_SPU_CONVEX_POINTS)
-	{
-		btAssert(0);
-	//	spu_printf("SPU: Error: MAX_NUM_SPU_CONVEX_POINTS(%d) exceeded: %d\n",MAX_NUM_SPU_CONVEX_POINTS,convexVertexData->gNumConvexPoints);
-		return;
-	}
-	register int dmaSize = convexVertexData->gNumConvexPoints*sizeof(btVector3);
-	ppu_address_t pointsPPU = (ppu_address_t) convexShapeSPU->getUnscaledPoints();
-	cellDmaGet(&convexVertexData->g_convexPointBuffer[0], pointsPPU  , dmaSize, DMA_TAG(2), 0, 0);
-void dmaCollisionShape (void* collisionShapeLocation, ppu_address_t collisionShapePtr, uint32_t dmaTag, int shapeType)
-	register int dmaSize = getShapeTypeSize(shapeType);
-	cellDmaGet(collisionShapeLocation, collisionShapePtr  , dmaSize, DMA_TAG(dmaTag), 0, 0);
-	//cellDmaGetReadOnly(collisionShapeLocation, collisionShapePtr  , dmaSize, DMA_TAG(dmaTag), 0, 0);
-	//cellDmaWaitTagStatusAll(DMA_MASK(dmaTag));
-void dmaCompoundShapeInfo (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag)
-	register int dmaSize;
-	register	ppu_address_t	dmaPpuAddress2;
-	int childShapeCount = spuCompoundShape->getNumChildShapes();
-	dmaSize = childShapeCount * sizeof(btCompoundShapeChild);
-	dmaPpuAddress2 = (ppu_address_t)spuCompoundShape->getChildList();
-	cellDmaGet(&compoundShapeLocation->gSubshapes[0], dmaPpuAddress2, dmaSize, DMA_TAG(dmaTag), 0, 0);
-void dmaCompoundSubShapes (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag)
-	int childShapeCount = spuCompoundShape->getNumChildShapes();
-	int i;
-	// DMA all the subshapes 
-	for ( i = 0; i < childShapeCount; ++i)
-	{
-		btCompoundShapeChild& childShape = compoundShapeLocation->gSubshapes[i];
-		dmaCollisionShape (&compoundShapeLocation->gSubshapeShape[i],(ppu_address_t)childShape.m_childShape, dmaTag, childShape.m_childShapeType);
-	}
-void	spuWalkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,const btQuantizedBvhNode* rootNode,int startNodeIndex,int endNodeIndex)
-	int curIndex = startNodeIndex;
-	int walkIterations = 0;
-#ifdef BT_DEBUG
-	int subTreeSize = endNodeIndex - startNodeIndex;
-	int escapeIndex;
-	unsigned int aabbOverlap, isLeafNode;
-	while (curIndex < endNodeIndex)
-	{
-		//catch bugs in tree data
-		btAssert (walkIterations < subTreeSize);
-		walkIterations++;
-		aabbOverlap = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
-		isLeafNode = rootNode->isLeafNode();
-		if (isLeafNode && aabbOverlap)
-		{
-			//printf("overlap with node %d\n",rootNode->getTriangleIndex());
-			nodeCallback->processNode(0,rootNode->getTriangleIndex());
-			//			spu_printf("SPU: overlap detected with triangleIndex:%d\n",rootNode->getTriangleIndex());
-		} 
-		if (aabbOverlap || isLeafNode)
-		{
-			rootNode++;
-			curIndex++;
-		} else
-		{
-			escapeIndex = rootNode->getEscapeIndex();
-			rootNode += escapeIndex;
-			curIndex += escapeIndex;
-		}
-	}
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
deleted file mode 100644
index aa8a2910..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
+++ /dev/null
@@ -1,128 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "../SpuDoubleBuffer.h"
-#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
-#include "BulletCollision/CollisionShapes/btConvexInternalShape.h"
-#include "BulletCollision/CollisionShapes/btCylinderShape.h"
-#include "BulletCollision/CollisionShapes/btStaticPlaneShape.h"
-#include "BulletCollision/CollisionShapes/btOptimizedBvh.h"
-#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
-#include "BulletCollision/CollisionShapes/btSphereShape.h"
-#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
-#include "BulletCollision/CollisionShapes/btConvexShape.h"
-#include "BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h"
-#include "BulletCollision/CollisionShapes/btConvexHullShape.h"
-#include "BulletCollision/CollisionShapes/btCompoundShape.h"
-#define MAX_NUM_SPU_CONVEX_POINTS 128 //@fallback to PPU if a btConvexHullShape has more than MAX_NUM_SPU_CONVEX_POINTS points
-#define MAX_SPU_COMPOUND_SUBSHAPES 16 //@fallback on PPU if compound has more than MAX_SPU_COMPOUND_SUBSHAPES child shapes
-#define MAX_SHAPE_SIZE 256 //@todo: assert on this
-ATTRIBUTE_ALIGNED16(struct)	SpuConvexPolyhedronVertexData
-	void*	gSpuConvexShapePtr;
-	btVector3* gConvexPoints;
-	int gNumConvexPoints;
-	int unused;
-	ATTRIBUTE_ALIGNED16(btVector3 g_convexPointBuffer[MAX_NUM_SPU_CONVEX_POINTS]);
-ATTRIBUTE_ALIGNED16(struct) CollisionShape_LocalStoreMemory
-	ATTRIBUTE_ALIGNED16(char collisionShape[MAX_SHAPE_SIZE]);
-ATTRIBUTE_ALIGNED16(struct) CompoundShape_LocalStoreMemory
-	// Compound data
-ATTRIBUTE_ALIGNED16(struct) bvhMeshShape_LocalStoreMemory
-	//ATTRIBUTE_ALIGNED16(btOptimizedBvh	gOptimizedBvh);
-	ATTRIBUTE_ALIGNED16(char gOptimizedBvh[sizeof(btOptimizedBvh)+16]);
-	btOptimizedBvh*	getOptimizedBvh()
-	{
-		return (btOptimizedBvh*) gOptimizedBvh;
-	}
-	ATTRIBUTE_ALIGNED16(btTriangleIndexVertexArray	gTriangleMeshInterfaceStorage);
-	btTriangleIndexVertexArray*	gTriangleMeshInterfacePtr;
-	///only a single mesh part for now, we can add support for multiple parts, but quantized trees don't support this at the moment 
-	ATTRIBUTE_ALIGNED16(btIndexedMesh	gIndexMesh);
-	//1024
-	ATTRIBUTE_ALIGNED16(btQuantizedBvhNode	gSubtreeNodes[MAX_SUBTREE_SIZE_IN_BYTES/sizeof(btQuantizedBvhNode)]);
-void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape* convexShape, ppu_address_t convexShapePtr, int shapeType, const btTransform& xform);
-void dmaBvhShapeData (bvhMeshShape_LocalStoreMemory* bvhMeshShape, btBvhTriangleMeshShape* triMeshShape);
-void dmaBvhIndexedMesh (btIndexedMesh* IndexMesh, IndexedMeshArray& indexArray, int index, uint32_t dmaTag);
-void dmaBvhSubTreeHeaders (btBvhSubtreeInfo* subTreeHeaders, ppu_address_t subTreePtr, int batchSize, uint32_t dmaTag);
-void dmaBvhSubTreeNodes (btQuantizedBvhNode* nodes, const btBvhSubtreeInfo& subtree, QuantizedNodeArray&	nodeArray, int dmaTag);
-int  getShapeTypeSize(int shapeType);
-void dmaConvexVertexData (SpuConvexPolyhedronVertexData* convexVertexData, btConvexHullShape* convexShapeSPU);
-void dmaCollisionShape (void* collisionShapeLocation, ppu_address_t collisionShapePtr, uint32_t dmaTag, int shapeType);
-void dmaCompoundShapeInfo (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag);
-void dmaCompoundSubShapes (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag);
-SIMD_FORCE_INLINE unsigned int spuTestQuantizedAabbAgainstQuantizedAabb(unsigned short int* aabbMin1,unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2)
-#if defined(__CELLOS_LV2__) && defined (__SPU__)
-	vec_ushort8 vecMin = {aabbMin1[0],aabbMin2[0],aabbMin1[2],aabbMin2[2],aabbMin1[1],aabbMin2[1],0,0};
-	vec_ushort8 vecMax = {aabbMax2[0],aabbMax1[0],aabbMax2[2],aabbMax1[2],aabbMax2[1],aabbMax1[1],0,0};
-	vec_ushort8 isGt = spu_cmpgt(vecMin,vecMax);
-	return spu_extract(spu_gather(isGt),0)==0;
-	return btSelect((unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= aabbMin2[0])
-		& (aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2])
-		& (aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1])),
-		1, 0);
-SIMD_FORCE_INLINE unsigned int spuTestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int*  aabbMax2)
-	unsigned int overlap = 1;
-	overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? 0 : overlap;
-	overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? 0 : overlap;
-	overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? 0 : overlap;
-	return overlap;
-void	spuWalkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,const btQuantizedBvhNode* rootNode,int startNodeIndex,int endNodeIndex);
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
deleted file mode 100644
index 8584e74c..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
+++ /dev/null
@@ -1,248 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "SpuContactResult.h"
-#ifndef __SPU__
-#include <stdio.h>
-#define spu_printf printf
-	m_manifoldAddress = 0;
-	m_spuManifold = NULL;
-	m_RequiresWriteBack = false;
- SpuContactResult::~SpuContactResult()
-	g_manifoldDmaExport.swapBuffers();
- 	///User can override this material combiner by implementing gContactAddedCallback and setting body0->m_collisionFlags |= btCollisionObject::customMaterialCallback;
-inline btScalar	calculateCombinedFriction(btScalar friction0,btScalar friction1)
-	btScalar friction = friction0*friction1;
-	const btScalar MAX_FRICTION  = btScalar(10.);
-	if (friction < -MAX_FRICTION)
-		friction = -MAX_FRICTION;
-	if (friction > MAX_FRICTION)
-		friction = MAX_FRICTION;
-	return friction;
-inline btScalar	calculateCombinedRestitution(btScalar restitution0,btScalar restitution1)
-	return restitution0*restitution1;
- void	SpuContactResult::setContactInfo(btPersistentManifold* spuManifold, ppu_address_t	manifoldAddress,const btTransform& worldTrans0,const btTransform& worldTrans1, btScalar restitution0,btScalar restitution1, btScalar friction0,btScalar friction1, bool isSwapped)
- {
-	//spu_printf("SpuContactResult::setContactInfo ManifoldAddress: %lu\n", manifoldAddress);
-	m_rootWorldTransform0 = worldTrans0;
-	m_rootWorldTransform1 = worldTrans1;
-	m_manifoldAddress = manifoldAddress;    
-	m_spuManifold = spuManifold;
-	m_combinedFriction = calculateCombinedFriction(friction0,friction1);
-	m_combinedRestitution = calculateCombinedRestitution(restitution0,restitution1);
-	m_isSwapped = isSwapped;
- }
- void SpuContactResult::setShapeIdentifiersA(int partId0,int index0)
- {
- }
- void SpuContactResult::setShapeIdentifiersB(int partId1,int index1)
- {
- }
- ///return true if it requires a dma transfer back
-bool ManifoldResultAddContactPoint(const btVector3& normalOnBInWorld,
-								   const btVector3& pointInWorld,
-								   float depth,
-								   btPersistentManifold* manifoldPtr,
-								   btTransform& transA,
-								   btTransform& transB,
-									btScalar	combinedFriction,
-									btScalar	combinedRestitution,
-								   bool isSwapped)
-//	float contactTreshold = manifoldPtr->getContactBreakingThreshold();
-	//spu_printf("SPU: add contactpoint, depth:%f, contactTreshold %f, manifoldPtr %llx\n",depth,contactTreshold,manifoldPtr);
-	spu_printf("SPU: contactTreshold %f\n",contactTreshold);
-	if (depth > manifoldPtr->getContactBreakingThreshold())
-		return false;
-	//if (depth > manifoldPtr->getContactProcessingThreshold())
-	//	return false;
-	btVector3 pointA;
-	btVector3 localA;
-	btVector3 localB;
-	btVector3 normal;
-	if (isSwapped)
-	{
-		normal = normalOnBInWorld * -1;
-		pointA = pointInWorld + normal * depth;
-		localA = transA.invXform(pointA );
-		localB = transB.invXform(pointInWorld);
-	}
-	else
-	{
-		normal = normalOnBInWorld;
-		pointA = pointInWorld + normal * depth;
-		localA = transA.invXform(pointA );
-		localB = transB.invXform(pointInWorld);
-	}
-	btManifoldPoint newPt(localA,localB,normal,depth);
-	newPt.m_positionWorldOnA = pointA;
-	newPt.m_positionWorldOnB = pointInWorld;
-	newPt.m_combinedFriction = combinedFriction;
-	newPt.m_combinedRestitution = combinedRestitution;
-	int insertIndex = manifoldPtr->getCacheEntry(newPt);
-	if (insertIndex >= 0)
-	{
-		// we need to replace the current contact point, otherwise small errors will accumulate (spheres start rolling etc)
-		manifoldPtr->replaceContactPoint(newPt,insertIndex);
-		return true;
-	} else
-	{
-		/*
-		///@todo: SPU callbacks, either immediate (local on the SPU), or deferred
-		//User can override friction and/or restitution
-		if (gContactAddedCallback &&
-			//and if either of the two bodies requires custom material
-			 ((m_body0->m_collisionFlags & btCollisionObject::customMaterialCallback) ||
-			   (m_body1->m_collisionFlags & btCollisionObject::customMaterialCallback)))
-		{
-			//experimental feature info, for per-triangle material etc.
-			(*gContactAddedCallback)(newPt,m_body0,m_partId0,m_index0,m_body1,m_partId1,m_index1);
-		}
-		*/
-		manifoldPtr->addManifoldPoint(newPt);
-		return true;
-	}
-	return false;
-void SpuContactResult::writeDoubleBufferedManifold(btPersistentManifold* lsManifold, btPersistentManifold* mmManifold)
-	///only write back the contact information on SPU. Other platforms avoid copying, and use the data in-place
-	///see SpuFakeDma.cpp 'cellDmaLargeGetReadOnly'
-#if defined (__SPU__) || defined (USE_LIBSPE2)
-    memcpy(g_manifoldDmaExport.getFront(),lsManifold,sizeof(btPersistentManifold));
-    g_manifoldDmaExport.swapBuffers();
-    ppu_address_t mmAddr = (ppu_address_t)mmManifold;
-    g_manifoldDmaExport.backBufferDmaPut(mmAddr, sizeof(btPersistentManifold), DMA_TAG(9));
-	// Should there be any kind of wait here?  What if somebody tries to use this tag again?  What if we call this function again really soon?
-	//no, the swapBuffers does the wait
-void SpuContactResult::addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth)
-	spu_printf("*** SpuContactResult::addContactPoint: depth = %f\n",depth);
-	spu_printf("*** normal = %f,%f,%f\n",normalOnBInWorld.getX(),normalOnBInWorld.getY(),normalOnBInWorld.getZ());
-	spu_printf("*** position = %f,%f,%f\n",pointInWorld.getX(),pointInWorld.getY(),pointInWorld.getZ());
- //   int sman = sizeof(rage::phManifold);
-//	spu_printf("sizeof_manifold = %i\n",sman);
-	btPersistentManifold* localManifold = m_spuManifold;
-	btVector3	normalB(normalOnBInWorld.getX(),normalOnBInWorld.getY(),normalOnBInWorld.getZ());
-	btVector3	pointWrld(pointInWorld.getX(),pointInWorld.getY(),pointInWorld.getZ());
-	//process the contact point
-	const bool retVal = ManifoldResultAddContactPoint(normalB,
-		pointWrld,
-		depth,
-		localManifold,
-		m_rootWorldTransform0,
-		m_rootWorldTransform1,
-		m_combinedFriction,
-		m_combinedRestitution,
-		m_isSwapped);
-	m_RequiresWriteBack = m_RequiresWriteBack || retVal;
-void SpuContactResult::flush()
-	if (m_spuManifold && m_spuManifold->getNumContacts())
-	{
-		m_spuManifold->refreshContactPoints(m_rootWorldTransform0,m_rootWorldTransform1);
-		m_RequiresWriteBack = true;
-	}
-	if (m_RequiresWriteBack)
-	{
-		spu_printf("SPU: Start SpuContactResult::flush (Put) DMA\n");
-		spu_printf("Num contacts:%d\n", m_spuManifold->getNumContacts());
-		spu_printf("Manifold address: %llu\n", m_manifoldAddress);
-	//	spu_printf("writeDoubleBufferedManifold\n");
-		writeDoubleBufferedManifold(m_spuManifold, (btPersistentManifold*)m_manifoldAddress);
-		spu_printf("SPU: Finished (Put) DMA\n");
-	}
-	m_spuManifold = NULL;
-	m_RequiresWriteBack = false;
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
deleted file mode 100644
index 394f56dc..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
+++ /dev/null
@@ -1,106 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#ifndef _WIN32
-#include <stdint.h>
-#include "../SpuDoubleBuffer.h"
-#include "LinearMath/btTransform.h"
-#include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
-#include "BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h"
-class btCollisionShape;
-struct SpuCollisionPairInput
-	ppu_address_t m_collisionShapes[2];
-	btCollisionShape*	m_spuCollisionShapes[2];
-	ppu_address_t m_persistentManifoldPtr;
-	btVector3	m_primitiveDimensions0;
-	btVector3	m_primitiveDimensions1;
-	int		m_shapeType0;
-	int		m_shapeType1;	
-	float	m_collisionMargin0;
-	float	m_collisionMargin1;
-	btTransform	m_worldTransform0;
-	btTransform m_worldTransform1;
-	bool	m_isSwapped;
-	bool    m_useEpa;
-struct SpuClosestPointInput : public btDiscreteCollisionDetectorInterface::ClosestPointInput
-	struct SpuConvexPolyhedronVertexData* m_convexVertexData[2];
-///SpuContactResult exports the contact points using double-buffered DMA transfers, only when needed
-///So when an existing contact point is duplicated, no transfer/refresh is performed.
-class SpuContactResult : public btDiscreteCollisionDetectorInterface::Result
-    btTransform		m_rootWorldTransform0;
-	btTransform		m_rootWorldTransform1;
-	ppu_address_t	m_manifoldAddress;
-    btPersistentManifold* m_spuManifold;
-	bool m_RequiresWriteBack;
-	btScalar	m_combinedFriction;
-	btScalar	m_combinedRestitution;
-	bool m_isSwapped;
-	DoubleBuffer<btPersistentManifold, 1> g_manifoldDmaExport;
-	public:
-		SpuContactResult();
-		virtual ~SpuContactResult();
-		btPersistentManifold*	GetSpuManifold() const
-		{
-			return m_spuManifold;
-		}
-		virtual void setShapeIdentifiersA(int partId0,int index0);
-		virtual void setShapeIdentifiersB(int partId1,int index1);
-		void	setContactInfo(btPersistentManifold* spuManifold, ppu_address_t	manifoldAddress,const btTransform& worldTrans0,const btTransform& worldTrans1, btScalar restitution0,btScalar restitution1, btScalar friction0,btScalar friction01, bool isSwapped);
-        void writeDoubleBufferedManifold(btPersistentManifold* lsManifold, btPersistentManifold* mmManifold);
-        virtual void addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth);
-		void flush();
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
deleted file mode 100644
index 449f1928..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
+++ /dev/null
@@ -1,51 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-class btStackAlloc;
-class btIDebugDraw;
-#include "BulletCollision/NarrowphaseCollision/btConvexPenetrationDepthSolver.h"
-#include "LinearMath/btTransform.h"
-///ConvexPenetrationDepthSolver provides an interface for penetration depth calculation.
-class SpuConvexPenetrationDepthSolver : public btConvexPenetrationDepthSolver
-	virtual ~SpuConvexPenetrationDepthSolver() {};
-	virtual bool calcPenDepth( SpuVoronoiSimplexSolver& simplexSolver,
-	        void* convexA,void* convexB,int shapeTypeA, int shapeTypeB, float marginA, float marginB,
-            btTransform& transA,const btTransform& transB,
-			btVector3& v, btVector3& pa, btVector3& pb,
-			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc,
-			struct SpuConvexPolyhedronVertexData* convexVertexDataA,
-			struct SpuConvexPolyhedronVertexData* convexVertexDataB
-			) const = 0;
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
deleted file mode 100644
index 42f5f45c..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+++ /dev/null
@@ -1,1415 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "SpuGatheringCollisionTask.h"
-#include "../SpuDoubleBuffer.h"
-#include "../SpuCollisionTaskProcess.h"
-#include "../SpuGatheringCollisionDispatcher.h" //for SPU_BATCHSIZE_BROADPHASE_PAIRS
-#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
-#include "../SpuContactManifoldCollisionAlgorithm.h"
-#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
-#include "SpuContactResult.h"
-#include "BulletCollision/CollisionShapes/btOptimizedBvh.h"
-#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
-#include "BulletCollision/CollisionShapes/btSphereShape.h"
-#include "BulletCollision/CollisionShapes/btConvexPointCloudShape.h"
-#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
-#include "BulletCollision/CollisionShapes/btConvexShape.h"
-#include "BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h"
-#include "BulletCollision/CollisionShapes/btConvexHullShape.h"
-#include "BulletCollision/CollisionShapes/btCompoundShape.h"
-#include "SpuMinkowskiPenetrationDepthSolver.h"
-//#include "SpuEpaPenetrationDepthSolver.h"
-#include "BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h"
-#include "boxBoxDistance.h"
-#include "BulletMultiThreaded/vectormath2bullet.h"
-#include "SpuCollisionShapes.h" //definition of SpuConvexPolyhedronVertexData
-#include "BulletCollision/CollisionDispatch/btBoxBoxDetector.h"
-#include "BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h"
-#include "BulletCollision/CollisionShapes/btTriangleShape.h"
-#ifdef __SPU__
-///Software caching from the IBM Cell SDK, it reduces 25% SPU time for our test cases
-#ifndef USE_LIBSPE2
-#endif //__SPU__
-int gSkippedCol = 0;
-int gProcessedCol = 0;
-/// software caching
-#include <spu_intrinsics.h>
-#include <sys/spu_thread.h>
-#include <sys/spu_event.h>
-#include <stdint.h>
-#define SPE_CACHE_NWAY   		4
-//#define SPE_CACHE_NSETS 		32, 16
-#define SPE_CACHE_NSETS 		8
-//#define SPE_CACHELINE_SIZE 		512
-#define SPE_CACHELINE_SIZE 		128
-#define SPE_CACHE_SET_TAGID(set) 	15
-///make sure that spe_cache.h is below those defines!
-#include "../Extras/software_cache/cache/include/spe_cache.h"
-int g_CacheMisses=0;
-int g_CacheHits=0;
-#if 0 // Added to allow cache misses and hits to be tracked, change this to 1 to restore unmodified version
-#define spe_cache_read(ea)		_spe_cache_lookup_xfer_wait_(ea, 0, 1)
-#define spe_cache_read(ea)		\
-({								\
-    int set, idx, line, byte;					\
-    _spe_cache_nway_lookup_(ea, set, idx);			\
-								\
-    if (btUnlikely(idx < 0)) {					\
-        ++g_CacheMisses;                        \
-	    idx = _spe_cache_miss_(ea, set, -1);			\
-        spu_writech(22, SPE_CACHE_SET_TAGMASK(set));		\
-        spu_mfcstat(MFC_TAG_UPDATE_ALL);			\
-    } 								\
-    else                            \
-    {                               \
-        ++g_CacheHits;              \
-    }                               \
-    line = _spe_cacheline_num_(set, idx);			\
-    byte = _spe_cacheline_byte_offset_(ea);			\
-    (void *) &spe_cache_mem[line + byte];			\
-bool gUseEpa = false;
-#ifdef USE_SN_TUNER
-#include <LibSN_SPU.h>
-#endif //USE_SN_TUNER
-#if defined (__SPU__) && !defined (USE_LIBSPE2)
-#include <spu_printf.h>
-#elif defined (USE_LIBSPE2)
-#define spu_printf(a)
-#include <stdio.h>
-#include <stdlib.h>
-#define spu_printf printf
-//int gNumConvexPoints0=0;
-///Make sure no destructors are called on this memory
-struct	CollisionTask_LocalStoreMemory
-	///This CollisionTask_LocalStoreMemory is mainly used for the SPU version, using explicit DMA
-	///Other platforms can use other memory programming models.
-	DoubleBuffer<unsigned char, MIDPHASE_WORKUNIT_PAGE_SIZE> g_workUnitTaskBuffers;
-	ATTRIBUTE_ALIGNED16(char gSpuContactManifoldAlgoBuffer [sizeof(SpuContactManifoldCollisionAlgorithm)+16]);
-	ATTRIBUTE_ALIGNED16(char gColObj0Buffer [sizeof(btCollisionObject)+16]);
-	ATTRIBUTE_ALIGNED16(char gColObj1Buffer [sizeof(btCollisionObject)+16]);
-	///we reserve 32bit integer indices, even though they might be 16bit
-	ATTRIBUTE_ALIGNED16(int	spuIndices[16]);
-	btPersistentManifold	gPersistentManifoldBuffer;
-	CollisionShape_LocalStoreMemory gCollisionShapes[2];
-	bvhMeshShape_LocalStoreMemory bvhShapeData;
-	SpuConvexPolyhedronVertexData convexVertexData[2];
-	CompoundShape_LocalStoreMemory compoundShapeData[2];
-	///The following pointers might either point into this local store memory, or to the original/other memory locations.
-	///See SpuFakeDma for implementation of cellDmaSmallGetReadOnly.
-	btCollisionObject*	m_lsColObj0Ptr;
-	btCollisionObject*	m_lsColObj1Ptr;
-	btBroadphasePair* m_pairsPointer;
-	btPersistentManifold*	m_lsManifoldPtr;
-	SpuContactManifoldCollisionAlgorithm*	m_lsCollisionAlgorithmPtr;
-	bool	needsDmaPutContactManifoldAlgo;
-	btCollisionObject* getColObj0()
-	{
-		return m_lsColObj0Ptr;
-	}
-	btCollisionObject* getColObj1()
-	{
-		return m_lsColObj1Ptr;
-	}
-	btBroadphasePair* getBroadphasePairPtr()
-	{
-		return m_pairsPointer;
-	}
-	SpuContactManifoldCollisionAlgorithm*	getlocalCollisionAlgorithm()
-	{
-		return m_lsCollisionAlgorithmPtr;
-	}
-	btPersistentManifold*	getContactManifoldPtr()
-	{
-		return m_lsManifoldPtr;
-	}
-#if defined(__CELLOS_LV2__) || defined(USE_LIBSPE2) 
-ATTRIBUTE_ALIGNED16(CollisionTask_LocalStoreMemory	gLocalStoreMemory);
-void* createCollisionLocalStoreMemory()
-	return &gLocalStoreMemory;
-void* createCollisionLocalStoreMemory()
-        return new CollisionTask_LocalStoreMemory;
-void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTask_LocalStoreMemory* lsMemPtr, SpuContactResult& spuContacts);
-SIMD_FORCE_INLINE void small_cache_read(void* buffer, ppu_address_t ea, size_t size)
-	// Check for alignment requirements. We need to make sure the entire request fits within one cache line,
-	// so the first and last bytes should fall on the same cache line
-	btAssert((ea & ~SPE_CACHELINE_MASK) == ((ea + size - 1) & ~SPE_CACHELINE_MASK));
-	void* ls = spe_cache_read(ea);
-	memcpy(buffer, ls, size);
-	stallingUnalignedDmaSmallGet(buffer,ea,size);
-SIMD_FORCE_INLINE void small_cache_read_triple(	void* ls0, ppu_address_t ea0,
-												void* ls1, ppu_address_t ea1,
-												void* ls2, ppu_address_t ea2,
-												size_t size)
-		btAssert(size<16);
-		ATTRIBUTE_ALIGNED16(char	tmpBuffer0[32]);
-		ATTRIBUTE_ALIGNED16(char	tmpBuffer1[32]);
-		ATTRIBUTE_ALIGNED16(char	tmpBuffer2[32]);
-		uint32_t i;
-		///make sure last 4 bits are the same, for cellDmaSmallGet
-		char* localStore0 = (char*)ls0;
-		uint32_t last4BitsOffset = ea0 & 0x0f;
-		char* tmpTarget0 = tmpBuffer0 + last4BitsOffset;
-#ifdef __SPU__
-		cellDmaSmallGet(tmpTarget0,ea0,size,DMA_TAG(1),0,0);
-		tmpTarget0 = (char*)cellDmaSmallGetReadOnly(tmpTarget0,ea0,size,DMA_TAG(1),0,0);
-		char* localStore1 = (char*)ls1;
-		last4BitsOffset = ea1 & 0x0f;
-		char* tmpTarget1 = tmpBuffer1 + last4BitsOffset;
-#ifdef __SPU__
-		cellDmaSmallGet(tmpTarget1,ea1,size,DMA_TAG(1),0,0);
-		tmpTarget1 = (char*)cellDmaSmallGetReadOnly(tmpTarget1,ea1,size,DMA_TAG(1),0,0);
-		char* localStore2 = (char*)ls2;
-		last4BitsOffset = ea2 & 0x0f;
-		char* tmpTarget2 = tmpBuffer2 + last4BitsOffset;
-#ifdef __SPU__
-		cellDmaSmallGet(tmpTarget2,ea2,size,DMA_TAG(1),0,0);
-		tmpTarget2 = (char*)cellDmaSmallGetReadOnly(tmpTarget2,ea2,size,DMA_TAG(1),0,0);
-		cellDmaWaitTagStatusAll( DMA_MASK(1) );
-		//this is slowish, perhaps memcpy on SPU is smarter?
-		for (i=0; btLikely( i<size );i++)
-		{
-			localStore0[i] = tmpTarget0[i];
-			localStore1[i] = tmpTarget1[i];
-			localStore2[i] = tmpTarget2[i];
-		}
-class spuNodeCallback : public btNodeOverlapCallback
-	SpuCollisionPairInput* m_wuInput;
-	SpuContactResult&		m_spuContacts;
-	CollisionTask_LocalStoreMemory*	m_lsMemPtr;
-	ATTRIBUTE_ALIGNED16(btTriangleShape)	m_tmpTriangleShape;
-	ATTRIBUTE_ALIGNED16(btVector3	spuTriangleVertices[3]);
-	ATTRIBUTE_ALIGNED16(btScalar	spuUnscaledVertex[4]);
-	spuNodeCallback(SpuCollisionPairInput* wuInput, CollisionTask_LocalStoreMemory*	lsMemPtr,SpuContactResult& spuContacts)
-		:	m_wuInput(wuInput),
-		m_spuContacts(spuContacts),
-		m_lsMemPtr(lsMemPtr)
-	{
-	}
-	virtual void processNode(int subPart, int triangleIndex)
-	{
-		///Create a triangle on the stack, call process collision, with GJK
-		///DMA the vertices, can benefit from software caching
-		//		spu_printf("processNode with triangleIndex %d\n",triangleIndex);
-		if (m_lsMemPtr->bvhShapeData.gIndexMesh.m_indexType == PHY_SHORT)
-		{
-			unsigned short int* indexBasePtr = (unsigned short int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase+triangleIndex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);
-			ATTRIBUTE_ALIGNED16(unsigned short int tmpIndices[3]);
-			small_cache_read_triple(&tmpIndices[0],(ppu_address_t)&indexBasePtr[0],
-									&tmpIndices[1],(ppu_address_t)&indexBasePtr[1],
-									&tmpIndices[2],(ppu_address_t)&indexBasePtr[2],
-									sizeof(unsigned short int));
-			m_lsMemPtr->spuIndices[0] = int(tmpIndices[0]);
-			m_lsMemPtr->spuIndices[1] = int(tmpIndices[1]);
-			m_lsMemPtr->spuIndices[2] = int(tmpIndices[2]);
-		} else
-		{
-			unsigned int* indexBasePtr = (unsigned int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase+triangleIndex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);
-			small_cache_read_triple(&m_lsMemPtr->spuIndices[0],(ppu_address_t)&indexBasePtr[0],
-								&m_lsMemPtr->spuIndices[1],(ppu_address_t)&indexBasePtr[1],
-								&m_lsMemPtr->spuIndices[2],(ppu_address_t)&indexBasePtr[2],
-								sizeof(int));
-		}
-		//		spu_printf("SPU index0=%d ,",spuIndices[0]);
-		//		spu_printf("SPU index1=%d ,",spuIndices[1]);
-		//		spu_printf("SPU index2=%d ,",spuIndices[2]);
-		//		spu_printf("SPU: indexBasePtr=%llx\n",indexBasePtr);
-		const btVector3& meshScaling = m_lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getScaling();
-		for (int j=2;btLikely( j>=0 );j--)
-		{
-			int graphicsindex = m_lsMemPtr->spuIndices[j];
-			//			spu_printf("SPU index=%d ,",graphicsindex);
-			btScalar* graphicsbasePtr = (btScalar*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexBase+graphicsindex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexStride);
-			//			spu_printf("SPU graphicsbasePtr=%llx\n",graphicsbasePtr);
-			///handle un-aligned vertices...
-			//another DMA for each vertex
-			small_cache_read_triple(&spuUnscaledVertex[0],(ppu_address_t)&graphicsbasePtr[0],
-									&spuUnscaledVertex[1],(ppu_address_t)&graphicsbasePtr[1],
-									&spuUnscaledVertex[2],(ppu_address_t)&graphicsbasePtr[2],
-									sizeof(btScalar));
-			m_tmpTriangleShape.getVertexPtr(j).setValue(spuUnscaledVertex[0]*meshScaling.getX(),
-				spuUnscaledVertex[1]*meshScaling.getY(),
-				spuUnscaledVertex[2]*meshScaling.getZ());
-			//			spu_printf("SPU:triangle vertices:%f,%f,%f\n",spuTriangleVertices[j].x(),spuTriangleVertices[j].y(),spuTriangleVertices[j].z());
-		}
-		SpuCollisionPairInput triangleConcaveInput(*m_wuInput);
-//		triangleConcaveInput.m_spuCollisionShapes[1] = &spuTriangleVertices[0];
-		triangleConcaveInput.m_spuCollisionShapes[1] = &m_tmpTriangleShape;
-		triangleConcaveInput.m_shapeType1 = TRIANGLE_SHAPE_PROXYTYPE;
-		m_spuContacts.setShapeIdentifiersB(subPart,triangleIndex);
-		//		m_spuContacts.flush();
-		ProcessSpuConvexConvexCollision(&triangleConcaveInput, m_lsMemPtr,m_spuContacts);
-		///this flush should be automatic
-		//	m_spuContacts.flush();
-	}
-void btConvexPlaneCollideSingleContact (SpuCollisionPairInput* wuInput,CollisionTask_LocalStoreMemory* lsMemPtr,SpuContactResult&  spuContacts)
-	btConvexShape* convexShape = (btConvexShape*) wuInput->m_spuCollisionShapes[0];
-	btStaticPlaneShape* planeShape = (btStaticPlaneShape*) wuInput->m_spuCollisionShapes[1];
-    bool hasCollision = false;
-	const btVector3& planeNormal = planeShape->getPlaneNormal();
-	const btScalar& planeConstant = planeShape->getPlaneConstant();
-	btTransform convexWorldTransform = wuInput->m_worldTransform0;
-	btTransform convexInPlaneTrans;
-	convexInPlaneTrans= wuInput->m_worldTransform1.inverse() * convexWorldTransform;
-	btTransform planeInConvex;
-	planeInConvex= convexWorldTransform.inverse() * wuInput->m_worldTransform1;
-	//btVector3 vtx = convexShape->localGetSupportVertexWithoutMarginNonVirtual(planeInConvex.getBasis()*-planeNormal);
-	btVector3 vtx = convexShape->localGetSupportVertexNonVirtual(planeInConvex.getBasis()*-planeNormal);
-	btVector3 vtxInPlane = convexInPlaneTrans(vtx);
-	btScalar distance = (planeNormal.dot(vtxInPlane) - planeConstant);
-	btVector3 vtxInPlaneProjected = vtxInPlane - distance*planeNormal;
-	btVector3 vtxInPlaneWorld = wuInput->m_worldTransform1 * vtxInPlaneProjected;
-	hasCollision = distance < lsMemPtr->getContactManifoldPtr()->getContactBreakingThreshold();
-	//resultOut->setPersistentManifold(m_manifoldPtr);
-	if (hasCollision)
-	{
-		/// report a contact. internally this will be kept persistent, and contact reduction is done
-		btVector3 normalOnSurfaceB =wuInput->m_worldTransform1.getBasis() * planeNormal;
-		btVector3 pOnB = vtxInPlaneWorld;
-		spuContacts.addContactPoint(normalOnSurfaceB,pOnB,distance);
-	}
-void	ProcessConvexPlaneSpuCollision(SpuCollisionPairInput* wuInput, CollisionTask_LocalStoreMemory* lsMemPtr, SpuContactResult& spuContacts)
-		register	int dmaSize = 0;
-		register ppu_address_t	dmaPpuAddress2;
-		btPersistentManifold* manifold = (btPersistentManifold*)wuInput->m_persistentManifoldPtr;
-		///DMA in the vertices for convex shapes
-		ATTRIBUTE_ALIGNED16(char convexHullShape0[sizeof(btConvexHullShape)]);
-		ATTRIBUTE_ALIGNED16(char convexHullShape1[sizeof(btConvexHullShape)]);
-		if ( btLikely( wuInput->m_shapeType0== CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{
-			//	spu_printf("SPU: DMA btConvexHullShape\n");
-			dmaSize = sizeof(btConvexHullShape);
-			dmaPpuAddress2 = wuInput->m_collisionShapes[0];
-			cellDmaGet(&convexHullShape0, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(1));
-		}
-		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{
-			//	spu_printf("SPU: DMA btConvexHullShape\n");
-			dmaSize = sizeof(btConvexHullShape);
-			dmaPpuAddress2 = wuInput->m_collisionShapes[1];
-			cellDmaGet(&convexHullShape1, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(1));
-		}
-		if ( btLikely( wuInput->m_shapeType0 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{		
-			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			dmaConvexVertexData (&lsMemPtr->convexVertexData[0], (btConvexHullShape*)&convexHullShape0);
-			lsMemPtr->convexVertexData[0].gSpuConvexShapePtr = wuInput->m_spuCollisionShapes[0];
-		}
-		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{
-			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			dmaConvexVertexData (&lsMemPtr->convexVertexData[1], (btConvexHullShape*)&convexHullShape1);
-			lsMemPtr->convexVertexData[1].gSpuConvexShapePtr = wuInput->m_spuCollisionShapes[1];
-		}
-		btConvexPointCloudShape cpc0,cpc1;
-		if ( btLikely( wuInput->m_shapeType0 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{
-			cellDmaWaitTagStatusAll(DMA_MASK(2));
-			lsMemPtr->convexVertexData[0].gConvexPoints = &lsMemPtr->convexVertexData[0].g_convexPointBuffer[0];
-			btConvexHullShape* ch = (btConvexHullShape*)wuInput->m_spuCollisionShapes[0];
-			const btVector3& localScaling = ch->getLocalScalingNV();
-			cpc0.setPoints(lsMemPtr->convexVertexData[0].gConvexPoints,lsMemPtr->convexVertexData[0].gNumConvexPoints,false,localScaling);
-			wuInput->m_spuCollisionShapes[0] = &cpc0;
-		}
-		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{
-			cellDmaWaitTagStatusAll(DMA_MASK(2));		
-			lsMemPtr->convexVertexData[1].gConvexPoints = &lsMemPtr->convexVertexData[1].g_convexPointBuffer[0];
-			btConvexHullShape* ch = (btConvexHullShape*)wuInput->m_spuCollisionShapes[1];
-			const btVector3& localScaling = ch->getLocalScalingNV();
-			cpc1.setPoints(lsMemPtr->convexVertexData[1].gConvexPoints,lsMemPtr->convexVertexData[1].gNumConvexPoints,false,localScaling);
-			wuInput->m_spuCollisionShapes[1] = &cpc1;
-		}
-//		const btConvexShape* shape0Ptr = (const btConvexShape*)wuInput->m_spuCollisionShapes[0];
-//		const btConvexShape* shape1Ptr = (const btConvexShape*)wuInput->m_spuCollisionShapes[1];
-//		int shapeType0 = wuInput->m_shapeType0;
-//		int shapeType1 = wuInput->m_shapeType1;
-		float marginA = wuInput->m_collisionMargin0;
-		float marginB = wuInput->m_collisionMargin1;
-		SpuClosestPointInput	cpInput;
-		cpInput.m_convexVertexData[0] = &lsMemPtr->convexVertexData[0];
-		cpInput.m_convexVertexData[1] = &lsMemPtr->convexVertexData[1];
-		cpInput.m_transformA = wuInput->m_worldTransform0;
-		cpInput.m_transformB = wuInput->m_worldTransform1;
-		float sumMargin = (marginA+marginB+lsMemPtr->getContactManifoldPtr()->getContactBreakingThreshold());
-		cpInput.m_maximumDistanceSquared = sumMargin * sumMargin;
-		ppu_address_t manifoldAddress = (ppu_address_t)manifold;
-		btPersistentManifold* spuManifold=lsMemPtr->getContactManifoldPtr();
-		//spuContacts.setContactInfo(spuManifold,manifoldAddress,wuInput->m_worldTransform0,wuInput->m_worldTransform1,wuInput->m_isSwapped);
-		spuContacts.setContactInfo(spuManifold,manifoldAddress,lsMemPtr->getColObj0()->getWorldTransform(),
-			lsMemPtr->getColObj1()->getWorldTransform(),
-			lsMemPtr->getColObj0()->getRestitution(),lsMemPtr->getColObj1()->getRestitution(),
-			lsMemPtr->getColObj0()->getFriction(),lsMemPtr->getColObj1()->getFriction(),
-			wuInput->m_isSwapped);
-		btConvexPlaneCollideSingleContact(wuInput,lsMemPtr,spuContacts);
-/// Convex versus Concave triangle mesh collision detection (handles concave triangle mesh versus sphere, box, cylinder, triangle, cone, convex polyhedron etc)
-void	ProcessConvexConcaveSpuCollision(SpuCollisionPairInput* wuInput, CollisionTask_LocalStoreMemory* lsMemPtr, SpuContactResult& spuContacts)
-	//order: first collision shape is convex, second concave. m_isSwapped is true, if the original order was opposite
-	btBvhTriangleMeshShape*	trimeshShape = (btBvhTriangleMeshShape*)wuInput->m_spuCollisionShapes[1];
-	//need the mesh interface, for access to triangle vertices
-	dmaBvhShapeData (&lsMemPtr->bvhShapeData, trimeshShape);
-	btVector3 aabbMin(-1,-400,-1);
-	btVector3 aabbMax(1,400,1);
-	//recalc aabbs
-	btTransform convexInTriangleSpace;
-	convexInTriangleSpace = wuInput->m_worldTransform1.inverse() * wuInput->m_worldTransform0;
-	btConvexInternalShape* convexShape = (btConvexInternalShape*)wuInput->m_spuCollisionShapes[0];
-	computeAabb (aabbMin, aabbMax, convexShape, wuInput->m_collisionShapes[0], wuInput->m_shapeType0, convexInTriangleSpace);
-	//CollisionShape* triangleShape = static_cast<btCollisionShape*>(triBody->m_collisionShape);
-	//convexShape->getAabb(convexInTriangleSpace,m_aabbMin,m_aabbMax);
-	//	btScalar extraMargin = collisionMarginTriangle;
-	//	btVector3 extra(extraMargin,extraMargin,extraMargin);
-	//	aabbMax += extra;
-	//	aabbMin -= extra;
-	///quantize query AABB
-	unsigned short int quantizedQueryAabbMin[3];
-	unsigned short int quantizedQueryAabbMax[3];
-	lsMemPtr->bvhShapeData.getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMin,aabbMin,0);
-	lsMemPtr->bvhShapeData.getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMax,aabbMax,1);
-	QuantizedNodeArray&	nodeArray = lsMemPtr->bvhShapeData.getOptimizedBvh()->getQuantizedNodeArray();
-	//spu_printf("SPU: numNodes = %d\n",nodeArray.size());
-	BvhSubtreeInfoArray& subTrees = lsMemPtr->bvhShapeData.getOptimizedBvh()->getSubtreeInfoArray();
-	spuNodeCallback	nodeCallback(wuInput,lsMemPtr,spuContacts);
-	IndexedMeshArray&	indexArray = lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getIndexedMeshArray();
-	//spu_printf("SPU:indexArray.size() = %d\n",indexArray.size());
-	//	spu_printf("SPU: numSubTrees = %d\n",subTrees.size());
-	//not likely to happen
-	if (subTrees.size() && indexArray.size() == 1)
-	{
-		///DMA in the index info
-		dmaBvhIndexedMesh (&lsMemPtr->bvhShapeData.gIndexMesh, indexArray, 0 /* index into indexArray */, 1 /* dmaTag */);
-		cellDmaWaitTagStatusAll(DMA_MASK(1));
-		//display the headers
-		int numBatch = subTrees.size();
-		for (int i=0;i<numBatch;)
-		{
-			//@todo- can reorder DMA transfers for less stall
-			int remaining = subTrees.size() - i;
-			int nextBatch = remaining < MAX_SPU_SUBTREE_HEADERS ? remaining : MAX_SPU_SUBTREE_HEADERS;
-			dmaBvhSubTreeHeaders (&lsMemPtr->bvhShapeData.gSubtreeHeaders[0], (ppu_address_t)(&subTrees[i]), nextBatch, 1);
-			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			//			spu_printf("nextBatch = %d\n",nextBatch);
-			for (int j=0;j<nextBatch;j++)
-			{
-				const btBvhSubtreeInfo& subtree = lsMemPtr->bvhShapeData.gSubtreeHeaders[j];
-				unsigned int overlap = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
-				if (overlap)
-				{
-					btAssert(subtree.m_subtreeSize);
-					//dma the actual nodes of this subtree
-					dmaBvhSubTreeNodes (&lsMemPtr->bvhShapeData.gSubtreeNodes[0], subtree, nodeArray, 2);
-					cellDmaWaitTagStatusAll(DMA_MASK(2));
-					/* Walk this subtree */
-					spuWalkStacklessQuantizedTree(&nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,
-						&lsMemPtr->bvhShapeData.gSubtreeNodes[0],
-						0,
-						subtree.m_subtreeSize);
-				}
-				//				spu_printf("subtreeSize = %d\n",gSubtreeHeaders[j].m_subtreeSize);
-			}
-			//	unsigned short int	m_quantizedAabbMin[3];
-			//	unsigned short int	m_quantizedAabbMax[3];
-			//	int			m_rootNodeIndex;
-			//	int			m_subtreeSize;
-			i+=nextBatch;
-		}
-		//pre-fetch first tree, then loop and double buffer
-	}
-int stats[MAX_DEGENERATE_STATS]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-int degenerateStats[MAX_DEGENERATE_STATS]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-/// Convex versus Convex collision detection (handles collision between sphere, box, cylinder, triangle, cone, convex polyhedron etc)
-void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTask_LocalStoreMemory* lsMemPtr, SpuContactResult& spuContacts)
-	register int dmaSize;
-	register ppu_address_t	dmaPpuAddress2;
-	//spu_printf("SPU: ProcessSpuConvexConvexCollision\n");
-	//CollisionShape* shape0 = (CollisionShape*)wuInput->m_collisionShapes[0];
-	//CollisionShape* shape1 = (CollisionShape*)wuInput->m_collisionShapes[1];
-	btPersistentManifold* manifold = (btPersistentManifold*)wuInput->m_persistentManifoldPtr;
-	bool genericGjk = true;
-	if (genericGjk)
-	{
-		//try generic GJK
-		//SpuConvexPenetrationDepthSolver* penetrationSolver=0;
-		btVoronoiSimplexSolver simplexSolver;
-		btGjkEpaPenetrationDepthSolver	epaPenetrationSolver2;
-		btConvexPenetrationDepthSolver* penetrationSolver = &epaPenetrationSolver2;
-		//SpuMinkowskiPenetrationDepthSolver	minkowskiPenetrationSolver;
-#ifdef ENABLE_EPA
-		if (gUseEpa)
-		{
-			penetrationSolver = &epaPenetrationSolver2;
-		} else
-		{
-			//penetrationSolver = &minkowskiPenetrationSolver;
-		}
-		///DMA in the vertices for convex shapes
-		ATTRIBUTE_ALIGNED16(char convexHullShape0[sizeof(btConvexHullShape)]);
-		ATTRIBUTE_ALIGNED16(char convexHullShape1[sizeof(btConvexHullShape)]);
-		if ( btLikely( wuInput->m_shapeType0== CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{
-			//	spu_printf("SPU: DMA btConvexHullShape\n");
-			dmaSize = sizeof(btConvexHullShape);
-			dmaPpuAddress2 = wuInput->m_collisionShapes[0];
-			cellDmaGet(&convexHullShape0, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(1));
-		}
-		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{
-			//	spu_printf("SPU: DMA btConvexHullShape\n");
-			dmaSize = sizeof(btConvexHullShape);
-			dmaPpuAddress2 = wuInput->m_collisionShapes[1];
-			cellDmaGet(&convexHullShape1, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(1));
-		}
-		if ( btLikely( wuInput->m_shapeType0 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{		
-			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			dmaConvexVertexData (&lsMemPtr->convexVertexData[0], (btConvexHullShape*)&convexHullShape0);
-			lsMemPtr->convexVertexData[0].gSpuConvexShapePtr = wuInput->m_spuCollisionShapes[0];
-		}
-		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{
-			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			dmaConvexVertexData (&lsMemPtr->convexVertexData[1], (btConvexHullShape*)&convexHullShape1);
-			lsMemPtr->convexVertexData[1].gSpuConvexShapePtr = wuInput->m_spuCollisionShapes[1];
-		}
-		btConvexPointCloudShape cpc0,cpc1;
-		if ( btLikely( wuInput->m_shapeType0 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{
-			cellDmaWaitTagStatusAll(DMA_MASK(2));
-			lsMemPtr->convexVertexData[0].gConvexPoints = &lsMemPtr->convexVertexData[0].g_convexPointBuffer[0];
-			btConvexHullShape* ch = (btConvexHullShape*)wuInput->m_spuCollisionShapes[0];
-			const btVector3& localScaling = ch->getLocalScalingNV();
-			cpc0.setPoints(lsMemPtr->convexVertexData[0].gConvexPoints,lsMemPtr->convexVertexData[0].gNumConvexPoints,false,localScaling);
-			wuInput->m_spuCollisionShapes[0] = &cpc0;
-		}
-		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
-		{
-			cellDmaWaitTagStatusAll(DMA_MASK(2));		
-			lsMemPtr->convexVertexData[1].gConvexPoints = &lsMemPtr->convexVertexData[1].g_convexPointBuffer[0];
-			btConvexHullShape* ch = (btConvexHullShape*)wuInput->m_spuCollisionShapes[1];
-			const btVector3& localScaling = ch->getLocalScalingNV();
-			cpc1.setPoints(lsMemPtr->convexVertexData[1].gConvexPoints,lsMemPtr->convexVertexData[1].gNumConvexPoints,false,localScaling);
-			wuInput->m_spuCollisionShapes[1] = &cpc1;
-		}
-		const btConvexShape* shape0Ptr = (const btConvexShape*)wuInput->m_spuCollisionShapes[0];
-		const btConvexShape* shape1Ptr = (const btConvexShape*)wuInput->m_spuCollisionShapes[1];
-		int shapeType0 = wuInput->m_shapeType0;
-		int shapeType1 = wuInput->m_shapeType1;
-		float marginA = wuInput->m_collisionMargin0;
-		float marginB = wuInput->m_collisionMargin1;
-		SpuClosestPointInput	cpInput;
-		cpInput.m_convexVertexData[0] = &lsMemPtr->convexVertexData[0];
-		cpInput.m_convexVertexData[1] = &lsMemPtr->convexVertexData[1];
-		cpInput.m_transformA = wuInput->m_worldTransform0;
-		cpInput.m_transformB = wuInput->m_worldTransform1;
-		float sumMargin = (marginA+marginB+lsMemPtr->getContactManifoldPtr()->getContactBreakingThreshold());
-		cpInput.m_maximumDistanceSquared = sumMargin * sumMargin;
-		ppu_address_t manifoldAddress = (ppu_address_t)manifold;
-		btPersistentManifold* spuManifold=lsMemPtr->getContactManifoldPtr();
-		//spuContacts.setContactInfo(spuManifold,manifoldAddress,wuInput->m_worldTransform0,wuInput->m_worldTransform1,wuInput->m_isSwapped);
-		spuContacts.setContactInfo(spuManifold,manifoldAddress,lsMemPtr->getColObj0()->getWorldTransform(),
-			lsMemPtr->getColObj1()->getWorldTransform(),
-			lsMemPtr->getColObj0()->getRestitution(),lsMemPtr->getColObj1()->getRestitution(),
-			lsMemPtr->getColObj0()->getFriction(),lsMemPtr->getColObj1()->getFriction(),
-			wuInput->m_isSwapped);
-		{
-			btGjkPairDetector gjk(shape0Ptr,shape1Ptr,shapeType0,shapeType1,marginA,marginB,&simplexSolver,penetrationSolver);//&vsSolver,penetrationSolver);
-			gjk.getClosestPoints(cpInput,spuContacts,0);//,debugDraw);
-			btAssert(gjk.m_lastUsedMethod <MAX_DEGENERATE_STATS);
-			stats[gjk.m_lastUsedMethod]++;
-			btAssert(gjk.m_degenerateSimplex <MAX_DEGENERATE_STATS);
-			degenerateStats[gjk.m_degenerateSimplex]++;
-			btScalar sepDist = gjk.getCachedSeparatingDistance()+spuManifold->getContactBreakingThreshold();
-			lsMemPtr->getlocalCollisionAlgorithm()->m_sepDistance.initSeparatingDistance(gjk.getCachedSeparatingAxis(),sepDist,wuInput->m_worldTransform0,wuInput->m_worldTransform1);
-			lsMemPtr->needsDmaPutContactManifoldAlgo = true;
-		}
-	}
-template<typename T> void DoSwap(T& a, T& b)
-	char tmp[sizeof(T)];
-	memcpy(tmp, &a, sizeof(T));
-	memcpy(&a, &b, sizeof(T));
-	memcpy(&b, tmp, sizeof(T));
-SIMD_FORCE_INLINE void	dmaAndSetupCollisionObjects(SpuCollisionPairInput& collisionPairInput, CollisionTask_LocalStoreMemory& lsMem)
-	register int dmaSize;
-	register ppu_address_t	dmaPpuAddress2;
-	dmaSize = sizeof(btCollisionObject);//btTransform);
-	dmaPpuAddress2 = /*collisionPairInput.m_isSwapped ? (ppu_address_t)lsMem.gProxyPtr1->m_clientObject :*/ (ppu_address_t)lsMem.getlocalCollisionAlgorithm()->getCollisionObject0();
-	lsMem.m_lsColObj0Ptr = (btCollisionObject*)cellDmaGetReadOnly(&lsMem.gColObj0Buffer, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);		
-	dmaSize = sizeof(btCollisionObject);//btTransform);
-	dmaPpuAddress2 = /*collisionPairInput.m_isSwapped ? (ppu_address_t)lsMem.gProxyPtr0->m_clientObject :*/ (ppu_address_t)lsMem.getlocalCollisionAlgorithm()->getCollisionObject1();
-	lsMem.m_lsColObj1Ptr = (btCollisionObject*)cellDmaGetReadOnly(&lsMem.gColObj1Buffer, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);		
-	cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-	btCollisionObject* ob0 = lsMem.getColObj0();
-	btCollisionObject* ob1 = lsMem.getColObj1();
-	collisionPairInput.m_worldTransform0 = ob0->getWorldTransform();
-	collisionPairInput.m_worldTransform1 = ob1->getWorldTransform();
-void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTask_LocalStoreMemory& lsMem,
-							SpuContactResult &spuContacts,
-							ppu_address_t collisionShape0Ptr, void* collisionShape0Loc,
-							ppu_address_t collisionShape1Ptr, void* collisionShape1Loc, bool dmaShapes = true)
-	if (btBroadphaseProxy::isConvex(collisionPairInput.m_shapeType0) 
-		&& btBroadphaseProxy::isConvex(collisionPairInput.m_shapeType1))
-	{
-		if (dmaShapes)
-		{
-			dmaCollisionShape (collisionShape0Loc, collisionShape0Ptr, 1, collisionPairInput.m_shapeType0);
-			dmaCollisionShape (collisionShape1Loc, collisionShape1Ptr, 2, collisionPairInput.m_shapeType1);
-			cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-		}
-		btConvexInternalShape* spuConvexShape0 = (btConvexInternalShape*)collisionShape0Loc;
-		btConvexInternalShape* spuConvexShape1 = (btConvexInternalShape*)collisionShape1Loc;
-		btVector3 dim0 = spuConvexShape0->getImplicitShapeDimensions();
-		btVector3 dim1 = spuConvexShape1->getImplicitShapeDimensions();
-		collisionPairInput.m_primitiveDimensions0 = dim0;
-		collisionPairInput.m_primitiveDimensions1 = dim1;
-		collisionPairInput.m_collisionShapes[0] = collisionShape0Ptr;
-		collisionPairInput.m_collisionShapes[1] = collisionShape1Ptr;
-		collisionPairInput.m_spuCollisionShapes[0] = spuConvexShape0;
-		collisionPairInput.m_spuCollisionShapes[1] = spuConvexShape1;
-		ProcessSpuConvexConvexCollision(&collisionPairInput,&lsMem,spuContacts);
-	} 
-	else if (btBroadphaseProxy::isCompound(collisionPairInput.m_shapeType0) && 
-			btBroadphaseProxy::isCompound(collisionPairInput.m_shapeType1))
-	{
-		//snPause();
-		dmaCollisionShape (collisionShape0Loc, collisionShape0Ptr, 1, collisionPairInput.m_shapeType0);
-		dmaCollisionShape (collisionShape1Loc, collisionShape1Ptr, 2, collisionPairInput.m_shapeType1);
-		cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-		// Both are compounds, do N^2 CD for now
-		///@todo: add some AABB-based pruning (probably not -> slower)
-		btCompoundShape* spuCompoundShape0 = (btCompoundShape*)collisionShape0Loc;
-		btCompoundShape* spuCompoundShape1 = (btCompoundShape*)collisionShape1Loc;
-		dmaCompoundShapeInfo (&lsMem.compoundShapeData[0], spuCompoundShape0, 1);
-		dmaCompoundShapeInfo (&lsMem.compoundShapeData[1], spuCompoundShape1, 2);
-		cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-		dmaCompoundSubShapes (&lsMem.compoundShapeData[0], spuCompoundShape0, 1);
-		cellDmaWaitTagStatusAll(DMA_MASK(1));
-		dmaCompoundSubShapes (&lsMem.compoundShapeData[1], spuCompoundShape1, 1);
-		cellDmaWaitTagStatusAll(DMA_MASK(1));
-		int childShapeCount0 = spuCompoundShape0->getNumChildShapes();
-		btAssert(childShapeCount0< MAX_SPU_COMPOUND_SUBSHAPES);
-		int childShapeCount1 = spuCompoundShape1->getNumChildShapes();
-		btAssert(childShapeCount1< MAX_SPU_COMPOUND_SUBSHAPES);
-		// Start the N^2
-		for (int i = 0; i < childShapeCount0; ++i)
-		{
-			btCompoundShapeChild& childShape0 = lsMem.compoundShapeData[0].gSubshapes[i];
-			btAssert(!btBroadphaseProxy::isCompound(childShape0.m_childShapeType));
-			for (int j = 0; j < childShapeCount1; ++j)
-			{
-				btCompoundShapeChild& childShape1 = lsMem.compoundShapeData[1].gSubshapes[j];
-				btAssert(!btBroadphaseProxy::isCompound(childShape1.m_childShapeType));
-				/* Create a new collision pair input struct using the two child shapes */
-				SpuCollisionPairInput cinput (collisionPairInput);
-				cinput.m_worldTransform0 = collisionPairInput.m_worldTransform0 * childShape0.m_transform;
-				cinput.m_shapeType0 = childShape0.m_childShapeType;
-				cinput.m_collisionMargin0 = childShape0.m_childMargin;
-				cinput.m_worldTransform1 = collisionPairInput.m_worldTransform1 * childShape1.m_transform;
-				cinput.m_shapeType1 = childShape1.m_childShapeType;
-				cinput.m_collisionMargin1 = childShape1.m_childMargin;
-				/* Recursively call handleCollisionPair () with new collision pair input */
-				handleCollisionPair(cinput, lsMem, spuContacts,			
-					(ppu_address_t)childShape0.m_childShape, lsMem.compoundShapeData[0].gSubshapeShape[i], 
-					(ppu_address_t)childShape1.m_childShape, lsMem.compoundShapeData[1].gSubshapeShape[j], false);
-			}
-		}
-	}
-	else if (btBroadphaseProxy::isCompound(collisionPairInput.m_shapeType0) )
-	{
-		//snPause();
-		dmaCollisionShape (collisionShape0Loc, collisionShape0Ptr, 1, collisionPairInput.m_shapeType0);
-		dmaCollisionShape (collisionShape1Loc, collisionShape1Ptr, 2, collisionPairInput.m_shapeType1);
-		cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-		// object 0 compound, object 1 non-compound
-		btCompoundShape* spuCompoundShape = (btCompoundShape*)collisionShape0Loc;
-		dmaCompoundShapeInfo (&lsMem.compoundShapeData[0], spuCompoundShape, 1);
-		cellDmaWaitTagStatusAll(DMA_MASK(1));
-		int childShapeCount = spuCompoundShape->getNumChildShapes();
-		btAssert(childShapeCount< MAX_SPU_COMPOUND_SUBSHAPES);
-		for (int i = 0; i < childShapeCount; ++i)
-		{
-			btCompoundShapeChild& childShape = lsMem.compoundShapeData[0].gSubshapes[i];
-			btAssert(!btBroadphaseProxy::isCompound(childShape.m_childShapeType));
-			// Dma the child shape
-			dmaCollisionShape (&lsMem.compoundShapeData[0].gSubshapeShape[i], (ppu_address_t)childShape.m_childShape, 1, childShape.m_childShapeType);
-			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			SpuCollisionPairInput cinput (collisionPairInput);
-			cinput.m_worldTransform0 = collisionPairInput.m_worldTransform0 * childShape.m_transform;
-			cinput.m_shapeType0 = childShape.m_childShapeType;
-			cinput.m_collisionMargin0 = childShape.m_childMargin;
-			handleCollisionPair(cinput, lsMem, spuContacts,			
-				(ppu_address_t)childShape.m_childShape, lsMem.compoundShapeData[0].gSubshapeShape[i], 
-				collisionShape1Ptr, collisionShape1Loc, false);
-		}
-	}
-	else if (btBroadphaseProxy::isCompound(collisionPairInput.m_shapeType1) )
-	{
-		//snPause();
-		dmaCollisionShape (collisionShape0Loc, collisionShape0Ptr, 1, collisionPairInput.m_shapeType0);
-		dmaCollisionShape (collisionShape1Loc, collisionShape1Ptr, 2, collisionPairInput.m_shapeType1);
-		cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-		// object 0 non-compound, object 1 compound
-		btCompoundShape* spuCompoundShape = (btCompoundShape*)collisionShape1Loc;
-		dmaCompoundShapeInfo (&lsMem.compoundShapeData[0], spuCompoundShape, 1);
-		cellDmaWaitTagStatusAll(DMA_MASK(1));
-		int childShapeCount = spuCompoundShape->getNumChildShapes();
-		btAssert(childShapeCount< MAX_SPU_COMPOUND_SUBSHAPES);
-		for (int i = 0; i < childShapeCount; ++i)
-		{
-			btCompoundShapeChild& childShape = lsMem.compoundShapeData[0].gSubshapes[i];
-			btAssert(!btBroadphaseProxy::isCompound(childShape.m_childShapeType));
-			// Dma the child shape
-			dmaCollisionShape (&lsMem.compoundShapeData[0].gSubshapeShape[i], (ppu_address_t)childShape.m_childShape, 1, childShape.m_childShapeType);
-			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			SpuCollisionPairInput cinput (collisionPairInput);
-			cinput.m_worldTransform1 = collisionPairInput.m_worldTransform1 * childShape.m_transform;
-			cinput.m_shapeType1 = childShape.m_childShapeType;
-			cinput.m_collisionMargin1 = childShape.m_childMargin;
-			handleCollisionPair(cinput, lsMem, spuContacts,
-				collisionShape0Ptr, collisionShape0Loc, 
-				(ppu_address_t)childShape.m_childShape, lsMem.compoundShapeData[0].gSubshapeShape[i], false);
-		}
-	}
-	else
-	{
-		//a non-convex shape is involved									
-		bool handleConvexConcave = false;
-		//snPause();
-		if (btBroadphaseProxy::isConcave(collisionPairInput.m_shapeType0) &&
-			btBroadphaseProxy::isConvex(collisionPairInput.m_shapeType1))
-		{
-			// Swap stuff
-			DoSwap(collisionShape0Ptr, collisionShape1Ptr);
-			DoSwap(collisionShape0Loc, collisionShape1Loc);
-			DoSwap(collisionPairInput.m_shapeType0, collisionPairInput.m_shapeType1);
-			DoSwap(collisionPairInput.m_worldTransform0, collisionPairInput.m_worldTransform1);
-			DoSwap(collisionPairInput.m_collisionMargin0, collisionPairInput.m_collisionMargin1);
-			collisionPairInput.m_isSwapped = true;
-		}
-		if (btBroadphaseProxy::isConvex(collisionPairInput.m_shapeType0)&&
-			btBroadphaseProxy::isConcave(collisionPairInput.m_shapeType1))
-		{
-			handleConvexConcave = true;
-		}
-		if (handleConvexConcave)
-		{
-			if (dmaShapes)
-			{
-				dmaCollisionShape (collisionShape0Loc, collisionShape0Ptr, 1, collisionPairInput.m_shapeType0);
-				dmaCollisionShape (collisionShape1Loc, collisionShape1Ptr, 2, collisionPairInput.m_shapeType1);
-				cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-			}
-			if (collisionPairInput.m_shapeType1 == STATIC_PLANE_PROXYTYPE)
-			{
-				btConvexInternalShape* spuConvexShape0 = (btConvexInternalShape*)collisionShape0Loc;
-				btStaticPlaneShape* planeShape= (btStaticPlaneShape*)collisionShape1Loc;
-				btVector3 dim0 = spuConvexShape0->getImplicitShapeDimensions();
-				collisionPairInput.m_primitiveDimensions0 = dim0;
-				collisionPairInput.m_collisionShapes[0] = collisionShape0Ptr;
-				collisionPairInput.m_collisionShapes[1] = collisionShape1Ptr;
-				collisionPairInput.m_spuCollisionShapes[0] = spuConvexShape0;
-				collisionPairInput.m_spuCollisionShapes[1] = planeShape;
-				ProcessConvexPlaneSpuCollision(&collisionPairInput,&lsMem,spuContacts);
-			} else
-			{
-				btConvexInternalShape* spuConvexShape0 = (btConvexInternalShape*)collisionShape0Loc;
-				btBvhTriangleMeshShape* trimeshShape = (btBvhTriangleMeshShape*)collisionShape1Loc;
-				btVector3 dim0 = spuConvexShape0->getImplicitShapeDimensions();
-				collisionPairInput.m_primitiveDimensions0 = dim0;
-				collisionPairInput.m_collisionShapes[0] = collisionShape0Ptr;
-				collisionPairInput.m_collisionShapes[1] = collisionShape1Ptr;
-				collisionPairInput.m_spuCollisionShapes[0] = spuConvexShape0;
-				collisionPairInput.m_spuCollisionShapes[1] = trimeshShape;
-				ProcessConvexConcaveSpuCollision(&collisionPairInput,&lsMem,spuContacts);
-			}
-		}
-	}
-	spuContacts.flush();
-void	processCollisionTask(void* userPtr, void* lsMemPtr)
-	SpuGatherAndProcessPairsTaskDesc* taskDescPtr = (SpuGatherAndProcessPairsTaskDesc*)userPtr;
-	SpuGatherAndProcessPairsTaskDesc& taskDesc = *taskDescPtr;
-	CollisionTask_LocalStoreMemory*	colMemPtr = (CollisionTask_LocalStoreMemory*)lsMemPtr;
-	CollisionTask_LocalStoreMemory& lsMem = *(colMemPtr);
-	gUseEpa = taskDesc.m_useEpa;
-	//	spu_printf("taskDescPtr=%llx\n",taskDescPtr);
-	SpuContactResult spuContacts;
-	////////////////////
-	ppu_address_t dmaInPtr = taskDesc.m_inPairPtr;
-	unsigned int numPages = taskDesc.numPages;
-	unsigned int numOnLastPage = taskDesc.numOnLastPage;
-	// prefetch first set of inputs and wait
-	lsMem.g_workUnitTaskBuffers.init();
-	unsigned int nextNumOnPage = (numPages > 1)? MIDPHASE_NUM_WORKUNITS_PER_PAGE : numOnLastPage;
-	lsMem.g_workUnitTaskBuffers.backBufferDmaGet(dmaInPtr, nextNumOnPage*sizeof(SpuGatherAndProcessWorkUnitInput), DMA_TAG(3));
-	register unsigned char *inputPtr;
-	register unsigned int numOnPage;
-	register unsigned int j;
-	SpuGatherAndProcessWorkUnitInput* wuInputs;	
-	register int dmaSize;
-	register ppu_address_t	dmaPpuAddress;
-	register ppu_address_t	dmaPpuAddress2;
-	int numPairs;
-	register int p;
-	SpuCollisionPairInput collisionPairInput;
-	for (unsigned int i = 0; btLikely(i < numPages); i++)
-	{
-		// wait for back buffer dma and swap buffers
-		inputPtr = lsMem.g_workUnitTaskBuffers.swapBuffers();
-		// number on current page is number prefetched last iteration
-		numOnPage = nextNumOnPage;
-		// prefetch next set of inputs
-		if ( btLikely( i < numPages-1 ) )
-		if ( btUnlikely( i < numPages-1 ) )
-		{
-			nextNumOnPage = (i == numPages-2)? numOnLastPage : MIDPHASE_NUM_WORKUNITS_PER_PAGE;
-			lsMem.g_workUnitTaskBuffers.backBufferDmaGet(dmaInPtr, nextNumOnPage*sizeof(SpuGatherAndProcessWorkUnitInput), DMA_TAG(3));
-		}
-		wuInputs = reinterpret_cast<SpuGatherAndProcessWorkUnitInput *>(inputPtr);
-		for (j = 0; btLikely( j < numOnPage ); j++)
-		{
-		//	printMidphaseInput(&wuInputs[j]);
-			numPairs = wuInputs[j].m_endIndex - wuInputs[j].m_startIndex;
-			if ( btLikely( numPairs ) )
-			{
-					dmaSize = numPairs*sizeof(btBroadphasePair);
-					dmaPpuAddress = wuInputs[j].m_pairArrayPtr+wuInputs[j].m_startIndex * sizeof(btBroadphasePair);
-					lsMem.m_pairsPointer = (btBroadphasePair*)cellDmaGetReadOnly(&lsMem.gBroadphasePairsBuffer, dmaPpuAddress  , dmaSize, DMA_TAG(1), 0, 0);
-					cellDmaWaitTagStatusAll(DMA_MASK(1));
-				for (p=0;p<numPairs;p++)
-				{
-					//for each broadphase pair, do something
-					btBroadphasePair& pair = lsMem.getBroadphasePairPtr()[p];
-					spu_printf("pair->m_userInfo = %d\n",pair.m_userInfo);
-					spu_printf("pair->m_algorithm = %d\n",pair.m_algorithm);
-					spu_printf("pair->m_pProxy0 = %d\n",pair.m_pProxy0);
-					spu_printf("pair->m_pProxy1 = %d\n",pair.m_pProxy1);
-					if (pair.m_internalTmpValue == 2 && pair.m_algorithm && pair.m_pProxy0 && pair.m_pProxy1)
-					{
-						dmaSize = sizeof(SpuContactManifoldCollisionAlgorithm);
-						dmaPpuAddress2 = (ppu_address_t)pair.m_algorithm;
-						lsMem.m_lsCollisionAlgorithmPtr = (SpuContactManifoldCollisionAlgorithm*)cellDmaGetReadOnly(&lsMem.gSpuContactManifoldAlgoBuffer, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-						cellDmaWaitTagStatusAll(DMA_MASK(1));
-						lsMem.needsDmaPutContactManifoldAlgo = false;
-						collisionPairInput.m_persistentManifoldPtr = (ppu_address_t) lsMem.getlocalCollisionAlgorithm()->getContactManifoldPtr();
-						collisionPairInput.m_isSwapped = false;
-						if (1)
-						{
-							///can wait on the combined DMA_MASK, or dma on the same tag
-					//		spu_printf("SPU collisionPairInput->m_shapeType0 = %d\n",collisionPairInput->m_shapeType0);
-					//		spu_printf("SPU collisionPairInput->m_shapeType1 = %d\n",collisionPairInput->m_shapeType1);
-							dmaSize = sizeof(btPersistentManifold);
-							dmaPpuAddress2 = collisionPairInput.m_persistentManifoldPtr;
-							lsMem.m_lsManifoldPtr = (btPersistentManifold*)cellDmaGetReadOnly(&lsMem.gPersistentManifoldBuffer, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-							collisionPairInput.m_shapeType0 = lsMem.getlocalCollisionAlgorithm()->getShapeType0();
-							collisionPairInput.m_shapeType1 = lsMem.getlocalCollisionAlgorithm()->getShapeType1();
-							collisionPairInput.m_collisionMargin0 = lsMem.getlocalCollisionAlgorithm()->getCollisionMargin0();
-							collisionPairInput.m_collisionMargin1 = lsMem.getlocalCollisionAlgorithm()->getCollisionMargin1();
-							//??cellDmaWaitTagStatusAll(DMA_MASK(1));
-							if (1)
-							{
-								//snPause();
-								// Get the collision objects
-								dmaAndSetupCollisionObjects(collisionPairInput, lsMem);
-								if (lsMem.getColObj0()->isActive() || lsMem.getColObj1()->isActive())
-								{
-									lsMem.needsDmaPutContactManifoldAlgo = true;
-									lsMem.getlocalCollisionAlgorithm()->m_sepDistance.updateSeparatingDistance(collisionPairInput.m_worldTransform0,collisionPairInput.m_worldTransform1);
-									bool boxbox = ((lsMem.getlocalCollisionAlgorithm()->getShapeType0()==BOX_SHAPE_PROXYTYPE)&&
-										(lsMem.getlocalCollisionAlgorithm()->getShapeType1()==BOX_SHAPE_PROXYTYPE));
-									if (boxbox)
-									{
-										//spu_printf("boxbox dist = %f\n",distance);
-										btPersistentManifold* spuManifold=lsMem.getContactManifoldPtr();
-										btPersistentManifold* manifold = (btPersistentManifold*)collisionPairInput.m_persistentManifoldPtr;
-										ppu_address_t manifoldAddress = (ppu_address_t)manifold;
-										spuContacts.setContactInfo(spuManifold,manifoldAddress,lsMem.getColObj0()->getWorldTransform(),
-											lsMem.getColObj1()->getWorldTransform(),
-											lsMem.getColObj0()->getRestitution(),lsMem.getColObj1()->getRestitution(),
-											lsMem.getColObj0()->getFriction(),lsMem.getColObj1()->getFriction(),
-											collisionPairInput.m_isSwapped);
-									//float distance=0.f;
-									btVector3 normalInB;
-									if (//!gUseEpa &&
-										lsMem.getlocalCollisionAlgorithm()->m_sepDistance.getConservativeSeparatingDistance()<=0.f
-										1
-										)
-										{
-//#define USE_PE_BOX_BOX 1
-#ifdef USE_PE_BOX_BOX
-											{
-												//getCollisionMargin0
-												btScalar margin0 = lsMem.getlocalCollisionAlgorithm()->getCollisionMargin0();
-												btScalar margin1 = lsMem.getlocalCollisionAlgorithm()->getCollisionMargin1();
-												btVector3 shapeDim0 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions0()+btVector3(margin0,margin0,margin0);
-												btVector3 shapeDim1 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions1()+btVector3(margin1,margin1,margin1);
-												//Box boxA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
-												vmVector3 vmPos0 = getVmVector3(collisionPairInput.m_worldTransform0.getOrigin());
-												vmVector3 vmPos1 = getVmVector3(collisionPairInput.m_worldTransform1.getOrigin());
-												vmMatrix3 vmMatrix0 = getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis());
-												vmMatrix3 vmMatrix1 = getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis());
-												vmTransform3 transformA(vmMatrix0,vmPos0);
-												Box boxB(shapeDim1.getX(),shapeDim1.getY(),shapeDim1.getZ());
-												vmTransform3 transformB(vmMatrix1,vmPos1);
-												BoxPoint resultClosestBoxPointA;
-												BoxPoint resultClosestBoxPointB;
-												vmVector3 resultNormal;
-												*/
-												float distanceThreshold = FLT_MAX
-												//float distanceThreshold = 0.f;
-												vmVector3 n;
-												Box boxA;
-												vmVector3 hA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
-												vmVector3 hB(shapeDim1.getX(),shapeDim1.getY(),shapeDim1.getZ());
-												boxA.mHalf= hA;
-												vmTransform3 trA;
-												trA.setTranslation(getVmVector3(collisionPairInput.m_worldTransform0.getOrigin()));
-												trA.setUpper3x3(getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis()));
-												Box boxB;
-												boxB.mHalf = hB;
-												vmTransform3 trB;
-												trB.setTranslation(getVmVector3(collisionPairInput.m_worldTransform1.getOrigin()));
-												trB.setUpper3x3(getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis()));
-												float distanceThreshold = spuManifold->getContactBreakingThreshold();//0.001f;
-												BoxPoint ptA,ptB;
-												float dist = boxBoxDistance(n, ptA, ptB,
-														   boxA, trA, boxB,	   trB,
-															distanceThreshold );
-//												float distance = boxBoxDistance(resultNormal,resultClosestBoxPointA,resultClosestBoxPointB,  boxA, transformA, boxB,transformB,distanceThreshold);
-												normalInB = -getBtVector3(n);//resultNormal);
-												//if(dist < distanceThreshold)//spuManifold->getContactBreakingThreshold())
-												if(dist < spuManifold->getContactBreakingThreshold())
-												{
-													btVector3 pointOnB = collisionPairInput.m_worldTransform1(getBtVector3(ptB.localPoint));
-													spuContacts.addContactPoint(
-														normalInB,
-														pointOnB,
-														dist);
-												}
-											} 
-											{
-												btScalar margin0 = lsMem.getlocalCollisionAlgorithm()->getCollisionMargin0();
-												btScalar margin1 = lsMem.getlocalCollisionAlgorithm()->getCollisionMargin1();
-												btVector3 shapeDim0 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions0()+btVector3(margin0,margin0,margin0);
-												btVector3 shapeDim1 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions1()+btVector3(margin1,margin1,margin1);
-												btBoxShape box0(shapeDim0);
-												btBoxShape box1(shapeDim1);
-												struct SpuBridgeContactCollector : public btDiscreteCollisionDetectorInterface::Result
-												{
-													SpuContactResult&	m_spuContacts;
-													virtual void setShapeIdentifiersA(int partId0,int index0)
-													{
-														m_spuContacts.setShapeIdentifiersA(partId0,index0);
-													}
-													virtual void setShapeIdentifiersB(int partId1,int index1)
-													{
-														m_spuContacts.setShapeIdentifiersB(partId1,index1);
-													}
-													virtual void addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth)
-													{
-														m_spuContacts.addContactPoint(normalOnBInWorld,pointInWorld,depth);
-													}
-													SpuBridgeContactCollector(SpuContactResult& spuContacts)
-														:m_spuContacts(spuContacts)
-													{
-													}
-												};
-												SpuBridgeContactCollector  bridgeOutput(spuContacts);
-												btDiscreteCollisionDetectorInterface::ClosestPointInput input;
-												input.m_maximumDistanceSquared = BT_LARGE_FLOAT;
-												input.m_transformA = collisionPairInput.m_worldTransform0;
-												input.m_transformB = collisionPairInput.m_worldTransform1;
-												btBoxBoxDetector detector(&box0,&box1);
-												detector.getClosestPoints(input,bridgeOutput,0);
-											}
-#endif //USE_PE_BOX_BOX
-											lsMem.needsDmaPutContactManifoldAlgo = true;
-											btScalar sepDist2 = distance+spuManifold->getContactBreakingThreshold();
-											lsMem.getlocalCollisionAlgorithm()->m_sepDistance.initSeparatingDistance(normalInB,sepDist2,collisionPairInput.m_worldTransform0,collisionPairInput.m_worldTransform1);
-											gProcessedCol++;
-										} else
-										{
-											gSkippedCol++;
-										}
-										spuContacts.flush();
-									} else
-									{
-										if (
-											lsMem.getlocalCollisionAlgorithm()->m_sepDistance.getConservativeSeparatingDistance()<=0.f
-											1
-											)
-										{
-											handleCollisionPair(collisionPairInput, lsMem, spuContacts,
-												(ppu_address_t)lsMem.getColObj0()->getRootCollisionShape(), &lsMem.gCollisionShapes[0].collisionShape,
-												(ppu_address_t)lsMem.getColObj1()->getRootCollisionShape(), &lsMem.gCollisionShapes[1].collisionShape);
-										} else
-										{
-												//spu_printf("boxbox dist = %f\n",distance);
-											btPersistentManifold* spuManifold=lsMem.getContactManifoldPtr();
-											btPersistentManifold* manifold = (btPersistentManifold*)collisionPairInput.m_persistentManifoldPtr;
-											ppu_address_t manifoldAddress = (ppu_address_t)manifold;
-											spuContacts.setContactInfo(spuManifold,manifoldAddress,lsMem.getColObj0()->getWorldTransform(),
-												lsMem.getColObj1()->getWorldTransform(),
-												lsMem.getColObj0()->getRestitution(),lsMem.getColObj1()->getRestitution(),
-												lsMem.getColObj0()->getFriction(),lsMem.getColObj1()->getFriction(),
-												collisionPairInput.m_isSwapped);
-											spuContacts.flush();
-										}
-									}
-								}
-							}
-						}
-#if defined (__SPU__) || defined (USE_LIBSPE2)
-						if (lsMem.needsDmaPutContactManifoldAlgo)
-						{
-							dmaSize = sizeof(SpuContactManifoldCollisionAlgorithm);
-							dmaPpuAddress2 = (ppu_address_t)pair.m_algorithm;
-							cellDmaLargePut(&lsMem.gSpuContactManifoldAlgoBuffer, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-							cellDmaWaitTagStatusAll(DMA_MASK(1));
-						}
-#endif //#ifdef USE_SEPDISTANCE_UTIL
-					}
-				}
-			}
-		} //end for (j = 0; j < numOnPage; j++)
-	}//	for 
-	return;
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
deleted file mode 100644
index bbaa555e..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
+++ /dev/null
@@ -1,140 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "../PlatformDefinitions.h"
-///Task Description for SPU collision detection
-struct SpuGatherAndProcessPairsTaskDesc 
-	ppu_address_t	m_inPairPtr;//m_pairArrayPtr;
-	//mutex variable
-	uint32_t	m_someMutexVariableInMainMemory;
-	ppu_address_t	m_dispatcher;
-	uint32_t	numOnLastPage;
-	uint16_t numPages;
-	uint16_t taskId;
-	bool m_useEpa;
-	struct	CollisionTask_LocalStoreMemory*	m_lsMemory; 
-#if  defined(__CELLOS_LV2__) || defined(USE_LIBSPE2)
-__attribute__ ((aligned (128)))
-void	processCollisionTask(void* userPtr, void* lsMemory);
-void*	createCollisionLocalStoreMemory();
-#if defined(USE_LIBSPE2) && defined(__SPU__)
-#include "../SpuLibspe2Support.h"
-#include <spu_intrinsics.h>
-#include <spu_mfcio.h>
-#include <SpuFakeDma.h>
-int main(unsigned long long speid, addr64 argp, addr64 envp)
-	printf("SPU: hello \n");
-	ATTRIBUTE_ALIGNED128(btSpuStatus status);
-	ATTRIBUTE_ALIGNED16( SpuGatherAndProcessPairsTaskDesc taskDesc ) ;
-	unsigned int received_message = Spu_Mailbox_Event_Nothing;
-    bool shutdown = false;
-	cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-	cellDmaWaitTagStatusAll(DMA_MASK(3));
-	status.m_status = Spu_Status_Free;
-	status.m_lsMemory.p = createCollisionLocalStoreMemory();
-	cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-	cellDmaWaitTagStatusAll(DMA_MASK(3));
-	while ( btLikely( !shutdown ) )
-	{
-		received_message = spu_read_in_mbox();
-		if( btLikely( received_message == Spu_Mailbox_Event_Task ))
-		{
-			printf("SPU: received Spu_Mailbox_Event_Task\n");
-			// refresh the status
-			cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(3));
-			btAssert(status.m_status==Spu_Status_Occupied);
-			cellDmaGet(&taskDesc, status.m_taskDesc.p, sizeof(SpuGatherAndProcessPairsTaskDesc), DMA_TAG(3), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(3));
-			printf("SPU:processCollisionTask\n");	
-			processCollisionTask((void*)&taskDesc, taskDesc.m_lsMemory);
-			printf("SPU:finished processCollisionTask\n");
-		}
-		else
-		{
-			printf("SPU: received ShutDown\n");
-			if( btLikely( received_message == Spu_Mailbox_Event_Shutdown ) )
-			{
-				shutdown = true;
-			}
-			else
-			{
-				//printf("SPU - Sth. recieved\n");
-			}
-		}
-		// set to status free and wait for next task
-		status.m_status = Spu_Status_Free;
-		cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-		cellDmaWaitTagStatusAll(DMA_MASK(3));		
-  	}
-	printf("SPU: shutdown\n");
-  	return 0;
-#endif // USE_LIBSPE2
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
deleted file mode 100644
index 9f7e64dd..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
+++ /dev/null
@@ -1,348 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "SpuMinkowskiPenetrationDepthSolver.h"
-#include "SpuContactResult.h"
-#include "SpuPreferredPenetrationDirections.h"
-#include "BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h"
-#include "BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h"
-#include "SpuCollisionShapes.h"
-btVector3(btScalar(0.000000) , btScalar(-0.000000),btScalar(-1.000000)),
-btVector3(btScalar(0.723608) , btScalar(-0.525725),btScalar(-0.447219)),
-btVector3(btScalar(-0.276388) , btScalar(-0.850649),btScalar(-0.447219)),
-btVector3(btScalar(-0.894426) , btScalar(-0.000000),btScalar(-0.447216)),
-btVector3(btScalar(-0.276388) , btScalar(0.850649),btScalar(-0.447220)),
-btVector3(btScalar(0.723608) , btScalar(0.525725),btScalar(-0.447219)),
-btVector3(btScalar(0.276388) , btScalar(-0.850649),btScalar(0.447220)),
-btVector3(btScalar(-0.723608) , btScalar(-0.525725),btScalar(0.447219)),
-btVector3(btScalar(-0.723608) , btScalar(0.525725),btScalar(0.447219)),
-btVector3(btScalar(0.276388) , btScalar(0.850649),btScalar(0.447219)),
-btVector3(btScalar(0.894426) , btScalar(0.000000),btScalar(0.447216)),
-btVector3(btScalar(-0.000000) , btScalar(0.000000),btScalar(1.000000)),
-btVector3(btScalar(0.425323) , btScalar(-0.309011),btScalar(-0.850654)),
-btVector3(btScalar(-0.162456) , btScalar(-0.499995),btScalar(-0.850654)),
-btVector3(btScalar(0.262869) , btScalar(-0.809012),btScalar(-0.525738)),
-btVector3(btScalar(0.425323) , btScalar(0.309011),btScalar(-0.850654)),
-btVector3(btScalar(0.850648) , btScalar(-0.000000),btScalar(-0.525736)),
-btVector3(btScalar(-0.525730) , btScalar(-0.000000),btScalar(-0.850652)),
-btVector3(btScalar(-0.688190) , btScalar(-0.499997),btScalar(-0.525736)),
-btVector3(btScalar(-0.162456) , btScalar(0.499995),btScalar(-0.850654)),
-btVector3(btScalar(-0.688190) , btScalar(0.499997),btScalar(-0.525736)),
-btVector3(btScalar(0.262869) , btScalar(0.809012),btScalar(-0.525738)),
-btVector3(btScalar(0.951058) , btScalar(0.309013),btScalar(0.000000)),
-btVector3(btScalar(0.951058) , btScalar(-0.309013),btScalar(0.000000)),
-btVector3(btScalar(0.587786) , btScalar(-0.809017),btScalar(0.000000)),
-btVector3(btScalar(0.000000) , btScalar(-1.000000),btScalar(0.000000)),
-btVector3(btScalar(-0.587786) , btScalar(-0.809017),btScalar(0.000000)),
-btVector3(btScalar(-0.951058) , btScalar(-0.309013),btScalar(-0.000000)),
-btVector3(btScalar(-0.951058) , btScalar(0.309013),btScalar(-0.000000)),
-btVector3(btScalar(-0.587786) , btScalar(0.809017),btScalar(-0.000000)),
-btVector3(btScalar(-0.000000) , btScalar(1.000000),btScalar(-0.000000)),
-btVector3(btScalar(0.587786) , btScalar(0.809017),btScalar(-0.000000)),
-btVector3(btScalar(0.688190) , btScalar(-0.499997),btScalar(0.525736)),
-btVector3(btScalar(-0.262869) , btScalar(-0.809012),btScalar(0.525738)),
-btVector3(btScalar(-0.850648) , btScalar(0.000000),btScalar(0.525736)),
-btVector3(btScalar(-0.262869) , btScalar(0.809012),btScalar(0.525738)),
-btVector3(btScalar(0.688190) , btScalar(0.499997),btScalar(0.525736)),
-btVector3(btScalar(0.525730) , btScalar(0.000000),btScalar(0.850652)),
-btVector3(btScalar(0.162456) , btScalar(-0.499995),btScalar(0.850654)),
-btVector3(btScalar(-0.425323) , btScalar(-0.309011),btScalar(0.850654)),
-btVector3(btScalar(-0.425323) , btScalar(0.309011),btScalar(0.850654)),
-btVector3(btScalar(0.162456) , btScalar(0.499995),btScalar(0.850654))
-bool SpuMinkowskiPenetrationDepthSolver::calcPenDepth( btSimplexSolverInterface& simplexSolver,
-		const btConvexShape* convexA,const btConvexShape* convexB,
-					const btTransform& transA,const btTransform& transB,
-				btVector3& v, btVector3& pa, btVector3& pb,
-				class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc)
-#if 0
-	(void)stackAlloc;
-	(void)v;
-	struct btIntermediateResult : public SpuContactResult
-	{
-		btIntermediateResult():m_hasResult(false)
-		{
-		}
-		btVector3 m_normalOnBInWorld;
-		btVector3 m_pointInWorld;
-		btScalar m_depth;
-		bool	m_hasResult;
-		virtual void setShapeIdentifiersA(int partId0,int index0)
-		{
-			(void)partId0;
-			(void)index0;
-		}
-		virtual void setShapeIdentifiersB(int partId1,int index1)
-		{
-			(void)partId1;
-			(void)index1;
-		}
-		void addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth)
-		{
-			m_normalOnBInWorld = normalOnBInWorld;
-			m_pointInWorld = pointInWorld;
-			m_depth = depth;
-			m_hasResult = true;
-		}
-	};
-	//just take fixed number of orientation, and sample the penetration depth in that direction
-	btScalar minProj = btScalar(BT_LARGE_FLOAT);
-	btVector3 minNorm(0.f,0.f,0.f);
-	btVector3 minVertex;
-	btVector3 minA,minB;
-	btVector3 seperatingAxisInA,seperatingAxisInB;
-	btVector3 pInA,qInB,pWorld,qWorld,w;
-	int i;
-	int numSampleDirections = NUM_UNITSPHERE_POINTS;
-	for (i=0;i<numSampleDirections;i++)
-	{
-		const btVector3& norm = sPenetrationDirections[i];
-		seperatingAxisInABatch[i] =  (-norm) * transA.getBasis() ;
-		seperatingAxisInBBatch[i] =  norm   * transB.getBasis() ;
-	}
-	{
-		int numPDA = convexA->getNumPreferredPenetrationDirections();
-		if (numPDA)
-		{
-			for (int i=0;i<numPDA;i++)
-			{
-				btVector3 norm;
-				convexA->getPreferredPenetrationDirection(i,norm);
-				norm  = transA.getBasis() * norm;
-				sPenetrationDirections[numSampleDirections] = norm;
-				seperatingAxisInABatch[numSampleDirections] = (-norm) * transA.getBasis();
-				seperatingAxisInBBatch[numSampleDirections] = norm * transB.getBasis();
-				numSampleDirections++;
-			}
-		}
-	}
-	{
-		int numPDB = convexB->getNumPreferredPenetrationDirections();
-		if (numPDB)
-		{
-			for (int i=0;i<numPDB;i++)
-			{
-				btVector3 norm;
-				convexB->getPreferredPenetrationDirection(i,norm);
-				norm  = transB.getBasis() * norm;
-				sPenetrationDirections[numSampleDirections] = norm;
-				seperatingAxisInABatch[numSampleDirections] = (-norm) * transA.getBasis();
-				seperatingAxisInBBatch[numSampleDirections] = norm * transB.getBasis();
-				numSampleDirections++;
-			}
-		}
-	}
-	convexA->batchedUnitVectorGetSupportingVertexWithoutMargin(seperatingAxisInABatch,supportVerticesABatch,numSampleDirections);
-	convexB->batchedUnitVectorGetSupportingVertexWithoutMargin(seperatingAxisInBBatch,supportVerticesBBatch,numSampleDirections);
-	for (i=0;i<numSampleDirections;i++)
-	{
-		const btVector3& norm = sPenetrationDirections[i];
-		seperatingAxisInA = seperatingAxisInABatch[i];
-		seperatingAxisInB = seperatingAxisInBBatch[i];
-		pInA = supportVerticesABatch[i];
-		qInB = supportVerticesBBatch[i];
-		pWorld = transA(pInA);	
-		qWorld = transB(qInB);
-		w	= qWorld - pWorld;
-		btScalar delta = norm.dot(w);
-		//find smallest delta
-		if (delta < minProj)
-		{
-			minProj = delta;
-			minNorm = norm;
-			minA = pWorld;
-			minB = qWorld;
-		}
-	}	
-	int numSampleDirections = NUM_UNITSPHERE_POINTS;
-///this is necessary, otherwise the normal is not correct, and sphere will rotate forever on a sloped triangle mesh
-	{
-		int numPDA = spuGetNumPreferredPenetrationDirections(shapeTypeA,convexA);
-		if (numPDA)
-		{
-			for (int i=0;i<numPDA;i++)
-			{
-				btVector3 norm;
-				spuGetPreferredPenetrationDirection(shapeTypeA,convexA,i,norm);
-				norm  = transA.getBasis() * norm;
-				sPenetrationDirections[numSampleDirections] = norm;
-				numSampleDirections++;
-			}
-		}
-	}
-	{
-		int numPDB = spuGetNumPreferredPenetrationDirections(shapeTypeB,convexB);
-		if (numPDB)
-		{
-			for (int i=0;i<numPDB;i++)
-			{
-				btVector3 norm;
-				spuGetPreferredPenetrationDirection(shapeTypeB,convexB,i,norm);
-				norm  = transB.getBasis() * norm;
-				sPenetrationDirections[numSampleDirections] = norm;
-				numSampleDirections++;
-			}
-		}
-	}
-	for (int i=0;i<numSampleDirections;i++)
-	{
-		const btVector3& norm = sPenetrationDirections[i];
-		seperatingAxisInA = (-norm)* transA.getBasis();
-		seperatingAxisInB = norm* transB.getBasis();
-		pInA = convexA->localGetSupportVertexWithoutMarginNonVirtual( seperatingAxisInA);//, NULL);
-		qInB = convexB->localGetSupportVertexWithoutMarginNonVirtual(seperatingAxisInB);//, NULL);
-	//	pInA = convexA->localGetSupportingVertexWithoutMargin(seperatingAxisInA);
-	//	qInB = convexB->localGetSupportingVertexWithoutMargin(seperatingAxisInB);
-		pWorld = transA(pInA);	
-		qWorld = transB(qInB);
-		w	= qWorld - pWorld;
-		btScalar delta = norm.dot(w);
-		//find smallest delta
-		if (delta < minProj)
-		{
-			minProj = delta;
-			minNorm = norm;
-			minA = pWorld;
-			minB = qWorld;
-		}
-	}
-	//add the margins
-	minA += minNorm*marginA;
-	minB -= minNorm*marginB;
-	//no penetration
-	if (minProj < btScalar(0.))
-		return false;
-	minProj += (marginA + marginB) + btScalar(1.00);
-//#define DEBUG_DRAW 1
-#ifdef DEBUG_DRAW
-	if (debugDraw)
-	{
-		btVector3 color(0,1,0);
-		debugDraw->drawLine(minA,minB,color);
-		color = btVector3 (1,1,1);
-		btVector3 vec = minB-minA;
-		btScalar prj2 = minNorm.dot(vec);
-		debugDraw->drawLine(minA,minA+(minNorm*minProj),color);
-	}
-#endif //DEBUG_DRAW
-	btGjkPairDetector gjkdet(convexA,convexB,&simplexSolver,0);
-	btScalar offsetDist = minProj;
-	btVector3 offset = minNorm * offsetDist;
-	SpuClosestPointInput input;
-	input.m_convexVertexData[0] = convexVertexDataA;
-	input.m_convexVertexData[1] = convexVertexDataB;
-	btVector3 newOrg = transA.getOrigin() + offset;
-	btTransform displacedTrans = transA;
-	displacedTrans.setOrigin(newOrg);
-	input.m_transformA = displacedTrans;
-	input.m_transformB = transB;
-	input.m_maximumDistanceSquared = btScalar(BT_LARGE_FLOAT);//minProj;
-	btIntermediateResult res;
-	gjkdet.getClosestPoints(input,res,0);
-	btScalar correctedMinNorm = minProj - res.m_depth;
-	//the penetration depth is over-estimated, relax it
-	btScalar penetration_relaxation= btScalar(1.);
-	minNorm*=penetration_relaxation;
-	if (res.m_hasResult)
-	{
-		pa = res.m_pointInWorld - minNorm * correctedMinNorm;
-		pb = res.m_pointInWorld;
-#ifdef DEBUG_DRAW
-		if (debugDraw)
-		{
-			btVector3 color(1,0,0);
-			debugDraw->drawLine(pa,pb,color);
-		}
-	} else {
-		// could not seperate shapes
-		//btAssert (false);
-	}
-	return res.m_hasResult;
-	return false;
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
deleted file mode 100644
index 18ad223e..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
+++ /dev/null
@@ -1,48 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletCollision/NarrowPhaseCollision/btConvexPenetrationDepthSolver.h"
-class btStackAlloc;
-class btIDebugDraw;
-class btVoronoiSimplexSolver;
-class btConvexShape;
-///MinkowskiPenetrationDepthSolver implements bruteforce penetration depth estimation.
-///Implementation is based on sampling the depth using support mapping, and using GJK step to get the witness points.
-class SpuMinkowskiPenetrationDepthSolver : public btConvexPenetrationDepthSolver
-	SpuMinkowskiPenetrationDepthSolver() {}
-	virtual ~SpuMinkowskiPenetrationDepthSolver() {};
-		virtual bool calcPenDepth( btSimplexSolverInterface& simplexSolver,
-		const btConvexShape* convexA,const btConvexShape* convexB,
-					const btTransform& transA,const btTransform& transB,
-				btVector3& v, btVector3& pa, btVector3& pb,
-				class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc
-				);
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h
deleted file mode 100644
index 774a0cb2..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h
+++ /dev/null
@@ -1,70 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://continuousphysics.com/Bullet/
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
-int		spuGetNumPreferredPenetrationDirections(int shapeType, void* shape)
-	switch (shapeType)
-    {
-		{
-			return 2;
-			//spu_printf("2\n");
-			break;
-		}
-		default:
-			{
-#if __ASSERT
-        spu_printf("spuGetNumPreferredPenetrationDirections() - Unsupported bound type: %d.\n", shapeType);
-#endif // __ASSERT
-			}
-	}
-	return 0;	
-void	spuGetPreferredPenetrationDirection(int shapeType, void* shape, int index, btVector3& penetrationVector)
-	switch (shapeType)
-    {
-		{
-			btVector3* vertices = (btVector3*)shape;
-			///calcNormal
-			penetrationVector = (vertices[1]-vertices[0]).cross(vertices[2]-vertices[0]);
-			penetrationVector.normalize();
-			if (index)
-				penetrationVector *= btScalar(-1.);
-			break;
-		}
-		default:
-			{
-#if __ASSERT
-        spu_printf("spuGetNumPreferredPenetrationDirections() - Unsupported bound type: %d.\n", shapeType);
-#endif // __ASSERT
-			}
-	}
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
deleted file mode 100644
index 5e1202c0..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
+++ /dev/null
@@ -1,1160 +0,0 @@
-   Copyright (C) 2006, 2008 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-//#include "PfxContactBoxBox.h"
-#include <math.h>
-#include "../PlatformDefinitions.h"
-#include "boxBoxDistance.h"
-static inline float sqr( float a )
-	return (a * a);
-enum BoxSepAxisType
-// voronoiTol: bevels Voronoi planes slightly which helps when features are parallel.
-static const float voronoiTol = -1.0e-5f;
-// separating axis tests: gaps along each axis are computed, and the axis with the maximum
-// gap is stored.  cross product axes are normalized.
-#define AaxisTest( dim, letter, first )                                                         \
-{                                                                                               \
-   if ( first )                                                                                 \
-   {                                                                                            \
-      maxGap = gap = gapsA.get##letter();                                                      \
-      if ( gap > distanceThreshold ) return gap;                                                \
-      axisType = A_AXIS;                                                                        \
-      faceDimA = dim;                                                                           \
-      axisA = identity.getCol##dim();                                                          \
-   }                                                                                            \
-   else                                                                                         \
-   {                                                                                            \
-      gap = gapsA.get##letter();                                                               \
-      if ( gap > distanceThreshold ) return gap;                                                \
-      else if ( gap > maxGap )                                                                  \
-      {                                                                                         \
-         maxGap = gap;                                                                          \
-         axisType = A_AXIS;                                                                     \
-         faceDimA = dim;                                                                        \
-         axisA = identity.getCol##dim();                                                       \
-      }                                                                                         \
-   }                                                                                            \
-#define BaxisTest( dim, letter )                                                                \
-{                                                                                               \
-   gap = gapsB.get##letter();                                                                  \
-   if ( gap > distanceThreshold ) return gap;                                                   \
-   else if ( gap > maxGap )                                                                     \
-   {                                                                                            \
-      maxGap = gap;                                                                             \
-      axisType = B_AXIS;                                                                        \
-      faceDimB = dim;                                                                           \
-      axisB = identity.getCol##dim();                                                          \
-   }                                                                                            \
-#define CrossAxisTest( dima, dimb, letterb )                                                    \
-{                                                                                               \
-   const float lsqr_tolerance = 1.0e-30f;                                                       \
-   float lsqr;                                                                                  \
-                                                                                                \
-   lsqr = lsqrs.getCol##dima().get##letterb();                                                \
-                                                                                                \
-   if ( lsqr > lsqr_tolerance )                                                                 \
-   {                                                                                            \
-      float l_recip = 1.0f / sqrtf( lsqr );                                                     \
-      gap = float(gapsAxB.getCol##dima().get##letterb()) * l_recip;                           \
-                                                                                                \
-      if ( gap > distanceThreshold )                                                            \
-      {                                                                                         \
-         return gap;                                                                            \
-      }                                                                                         \
-                                                                                                \
-      if ( gap > maxGap )                                                                       \
-      {                                                                                         \
-         maxGap = gap;                                                                          \
-         axisType = CROSS_AXIS;                                                                 \
-         edgeDimA = dima;                                                                       \
-         edgeDimB = dimb;                                                                       \
-         axisA = cross(identity.getCol##dima(),matrixAB.getCol##dimb()) * l_recip;            \
-      }                                                                                         \
-   }                                                                                            \
-// tests whether a vertex of box B and a face of box A are the closest features
-	bool & inVoronoi,
-	float & t0,
-	float & t1,
-	const vmVector3 & hA,
-	PE_REF(vmVector3) faceOffsetAB,
-	PE_REF(vmVector3) faceOffsetBA,
-	const vmMatrix3 & matrixAB,
-	const vmMatrix3 & matrixBA,
-	PE_REF(vmVector3) signsB,
-	PE_REF(vmVector3) scalesB )
-	// compute a corner of box B in A's coordinate system
-	vmVector3 corner =
-		vmVector3( faceOffsetAB + matrixAB.getCol0() * scalesB.getX() + matrixAB.getCol1() * scalesB.getY() );
-	// compute the parameters of the point on A, closest to this corner
-	t0 = corner[0];
-	t1 = corner[1];
-	if ( t0 > hA[0] )
-		t0 = hA[0];
-	else if ( t0 < -hA[0] )
-		t0 = -hA[0];
-	if ( t1 > hA[1] )
-		t1 = hA[1];
-	else if ( t1 < -hA[1] )
-		t1 = -hA[1];
-	// do the Voronoi test: already know the point on B is in the Voronoi region of the
-	// point on A, check the reverse.
-	vmVector3 facePointB =
-		vmVector3( mulPerElem( faceOffsetBA + matrixBA.getCol0() * t0 + matrixBA.getCol1() * t1 - scalesB, signsB ) );
-	inVoronoi = ( ( facePointB[0] >= voronoiTol * facePointB[2] ) &&
-				  ( facePointB[1] >= voronoiTol * facePointB[0] ) &&
-				  ( facePointB[2] >= voronoiTol * facePointB[1] ) );
-	return (sqr( corner[0] - t0 ) + sqr( corner[1] - t1 ) + sqr( corner[2] ));
-#define VertexBFaceA_SetNewMin()                \
-{                                               \
-   minDistSqr = distSqr;                        \
-   localPointA.setX(t0);                        \
-   localPointA.setY(t1);                        \
-   localPointB.setX( scalesB.getX() );          \
-   localPointB.setY( scalesB.getY() );          \
-   featureA = F;                                \
-   featureB = V;                                \
-	bool & done,
-	float & minDistSqr,
-	vmPoint3 & localPointA,
-	vmPoint3 & localPointB,
-	FeatureType & featureA,
-	FeatureType & featureB,
-	const vmVector3 & hA,
-	PE_REF(vmVector3) faceOffsetAB,
-	PE_REF(vmVector3) faceOffsetBA,
-	const vmMatrix3 & matrixAB,
-	const vmMatrix3 & matrixBA,
-	PE_REF(vmVector3) signsB,
-	PE_REF(vmVector3) scalesB,
-	bool first )
-	float t0, t1;
-	float distSqr;
-	distSqr = VertexBFaceATest( done, t0, t1, hA, faceOffsetAB, faceOffsetBA,
-								matrixAB, matrixBA, signsB, scalesB );
-	if ( first ) {
-		VertexBFaceA_SetNewMin();
-	} else {
-		if ( distSqr < minDistSqr ) {
-			VertexBFaceA_SetNewMin();
-		}
-	}
-	if ( done )
-		return;
-	signsB.setX( -signsB.getX() );
-	scalesB.setX( -scalesB.getX() );
-	distSqr = VertexBFaceATest( done, t0, t1, hA, faceOffsetAB, faceOffsetBA,
-								matrixAB, matrixBA, signsB, scalesB );
-	if ( distSqr < minDistSqr ) {
-		VertexBFaceA_SetNewMin();
-	}
-	if ( done )
-		return;
-	signsB.setY( -signsB.getY() );
-	scalesB.setY( -scalesB.getY() );
-	distSqr = VertexBFaceATest( done, t0, t1, hA, faceOffsetAB, faceOffsetBA,
-								matrixAB, matrixBA, signsB, scalesB );
-	if ( distSqr < minDistSqr ) {
-		VertexBFaceA_SetNewMin();
-	}
-	if ( done )
-		return;
-	signsB.setX( -signsB.getX() );
-	scalesB.setX( -scalesB.getX() );
-	distSqr = VertexBFaceATest( done, t0, t1, hA, faceOffsetAB, faceOffsetBA,
-								matrixAB, matrixBA, signsB, scalesB );
-	if ( distSqr < minDistSqr ) {
-		VertexBFaceA_SetNewMin();
-	}
-// VertexAFaceBTest: tests whether a vertex of box A and a face of box B are the closest features
-	bool & inVoronoi,
-	float & t0,
-	float & t1,
-	const vmVector3 & hB,
-	PE_REF(vmVector3) faceOffsetAB,
-	PE_REF(vmVector3) faceOffsetBA,
-	const vmMatrix3 & matrixAB,
-	const vmMatrix3 & matrixBA,
-	PE_REF(vmVector3) signsA,
-	PE_REF(vmVector3) scalesA )
-	vmVector3 corner =
-		vmVector3( faceOffsetBA + matrixBA.getCol0() * scalesA.getX() + matrixBA.getCol1() * scalesA.getY() );
-	t0 = corner[0];
-	t1 = corner[1];
-	if ( t0 > hB[0] )
-		t0 = hB[0];
-	else if ( t0 < -hB[0] )
-		t0 = -hB[0];
-	if ( t1 > hB[1] )
-		t1 = hB[1];
-	else if ( t1 < -hB[1] )
-		t1 = -hB[1];
-	vmVector3 facePointA =
-		vmVector3( mulPerElem( faceOffsetAB + matrixAB.getCol0() * t0 + matrixAB.getCol1() * t1 - scalesA, signsA ) );
-	inVoronoi = ( ( facePointA[0] >= voronoiTol * facePointA[2] ) &&
-				  ( facePointA[1] >= voronoiTol * facePointA[0] ) &&
-				  ( facePointA[2] >= voronoiTol * facePointA[1] ) );
-	return (sqr( corner[0] - t0 ) + sqr( corner[1] - t1 ) + sqr( corner[2] ));
-#define VertexAFaceB_SetNewMin()                \
-{                                               \
-   minDistSqr = distSqr;                        \
-   localPointB.setX(t0);                        \
-   localPointB.setY(t1);                        \
-   localPointA.setX( scalesA.getX() );          \
-   localPointA.setY( scalesA.getY() );          \
-   featureA = V;                                \
-   featureB = F;                                \
-	bool & done,
-	float & minDistSqr,
-	vmPoint3 & localPointA,
-	vmPoint3 & localPointB,
-	FeatureType & featureA,
-	FeatureType & featureB,
-	const vmVector3 & hB,
-	PE_REF(vmVector3) faceOffsetAB,
-	PE_REF(vmVector3) faceOffsetBA,
-	const vmMatrix3 & matrixAB,
-	const vmMatrix3 & matrixBA,
-	PE_REF(vmVector3) signsA,
-	PE_REF(vmVector3) scalesA,
-	bool first )
-	float t0, t1;
-	float distSqr;
-	distSqr = VertexAFaceBTest( done, t0, t1, hB, faceOffsetAB, faceOffsetBA,
-								matrixAB, matrixBA, signsA, scalesA );
-	if ( first ) {
-		VertexAFaceB_SetNewMin();
-	} else {
-		if ( distSqr < minDistSqr ) {
-			VertexAFaceB_SetNewMin();
-		}
-	}
-	if ( done )
-		return;
-	signsA.setX( -signsA.getX() );
-	scalesA.setX( -scalesA.getX() );
-	distSqr = VertexAFaceBTest( done, t0, t1, hB, faceOffsetAB, faceOffsetBA,
-								matrixAB, matrixBA, signsA, scalesA );
-	if ( distSqr < minDistSqr ) {
-		VertexAFaceB_SetNewMin();
-	}
-	if ( done )
-		return;
-	signsA.setY( -signsA.getY() );
-	scalesA.setY( -scalesA.getY() );
-	distSqr = VertexAFaceBTest( done, t0, t1, hB, faceOffsetAB, faceOffsetBA,
-								matrixAB, matrixBA, signsA, scalesA );
-	if ( distSqr < minDistSqr ) {
-		VertexAFaceB_SetNewMin();
-	}
-	if ( done )
-		return;
-	signsA.setX( -signsA.getX() );
-	scalesA.setX( -scalesA.getX() );
-	distSqr = VertexAFaceBTest( done, t0, t1, hB, faceOffsetAB, faceOffsetBA,
-								matrixAB, matrixBA, signsA, scalesA );
-	if ( distSqr < minDistSqr ) {
-		VertexAFaceB_SetNewMin();
-	}
-// CustomEdgeEdgeTest:
-// tests whether a pair of edges are the closest features
-// note on the shorthand:
-// 'a' & 'b' refer to the edges.
-// 'c' is the dimension of the axis that points from the face center to the edge Center
-// 'd' is the dimension of the edge Direction
-// the dimension of the face normal is 2
-#define CustomEdgeEdgeTest( ac, ac_letter, ad, ad_letter, bc, bc_letter, bd, bd_letter )              \
-{                                                                                               \
-   vmVector3 edgeOffsetAB;                                                                          \
-   vmVector3 edgeOffsetBA;                                                                          \
-                                                                                                \
-   edgeOffsetAB = faceOffsetAB + matrixAB.getCol##bc() * scalesB.get##bc_letter();            \
-   edgeOffsetAB.set##ac_letter( edgeOffsetAB.get##ac_letter() - scalesA.get##ac_letter() );  \
-                                                                                                \
-   edgeOffsetBA = faceOffsetBA + matrixBA.getCol##ac() * scalesA.get##ac_letter();            \
-   edgeOffsetBA.set##bc_letter( edgeOffsetBA.get##bc_letter() - scalesB.get##bc_letter() );  \
-                                                                                                \
-   float dirDot = matrixAB.getCol##bd().get##ad_letter();                                     \
-   float denom = 1.0f - dirDot*dirDot;                                                          \
-   float edgeOffsetAB_ad = edgeOffsetAB.get##ad_letter();                                      \
-   float edgeOffsetBA_bd = edgeOffsetBA.get##bd_letter();                                      \
-                                                                                                \
-   if ( denom == 0.0f )                                                                         \
-   {                                                                                            \
-      tA = 0.0f;                                                                                \
-   }                                                                                            \
-   else                                                                                         \
-   {                                                                                            \
-      tA = ( edgeOffsetAB_ad + edgeOffsetBA_bd * dirDot ) / denom;                              \
-   }                                                                                            \
-                                                                                                \
-   if ( tA < -hA[ad] ) tA = -hA[ad];                                                            \
-   else if ( tA > hA[ad] ) tA = hA[ad];                                                         \
-                                                                                                \
-   tB = tA * dirDot + edgeOffsetBA_bd;                                                          \
-                                                                                                \
-   if ( tB < -hB[bd] )                                                                          \
-   {                                                                                            \
-      tB = -hB[bd];                                                                             \
-      tA = tB * dirDot + edgeOffsetAB_ad;                                                       \
-                                                                                                \
-      if ( tA < -hA[ad] ) tA = -hA[ad];                                                         \
-      else if ( tA > hA[ad] ) tA = hA[ad];                                                      \
-   }                                                                                            \
-   else if ( tB > hB[bd] )                                                                      \
-   {                                                                                            \
-      tB = hB[bd];                                                                              \
-      tA = tB * dirDot + edgeOffsetAB_ad;                                                       \
-                                                                                                \
-      if ( tA < -hA[ad] ) tA = -hA[ad];                                                         \
-      else if ( tA > hA[ad] ) tA = hA[ad];                                                      \
-   }                                                                                            \
-                                                                                                \
-   vmVector3 edgeOffAB = vmVector3( mulPerElem( edgeOffsetAB + matrixAB.getCol##bd() * tB, signsA ) );\
-   vmVector3 edgeOffBA = vmVector3( mulPerElem( edgeOffsetBA + matrixBA.getCol##ad() * tA, signsB ) );\
-                                                                                                \
-   inVoronoi = ( edgeOffAB[ac] >= voronoiTol * edgeOffAB[2] ) &&                                \
-               ( edgeOffAB[2] >= voronoiTol * edgeOffAB[ac] ) &&                                \
-               ( edgeOffBA[bc] >= voronoiTol * edgeOffBA[2] ) &&                                \
-               ( edgeOffBA[2] >= voronoiTol * edgeOffBA[bc] );                                  \
-                                                                                                \
-   edgeOffAB[ad] -= tA;                                                                         \
-   edgeOffBA[bd] -= tB;                                                                         \
-                                                                                                \
-   return dot(edgeOffAB,edgeOffAB);                                                             \
-	bool & inVoronoi,
-	float & tA,
-	float & tB,
-	const vmVector3 & hA,
-	const vmVector3 & hB,
-	PE_REF(vmVector3) faceOffsetAB,
-	PE_REF(vmVector3) faceOffsetBA,
-	const vmMatrix3 & matrixAB,
-	const vmMatrix3 & matrixBA,
-	PE_REF(vmVector3) signsA,
-	PE_REF(vmVector3) signsB,
-	PE_REF(vmVector3) scalesA,
-	PE_REF(vmVector3) scalesB )
-	CustomEdgeEdgeTest( 0, X, 1, Y, 0, X, 1, Y );
-	bool & inVoronoi,
-	float & tA,
-	float & tB,
-	const vmVector3 & hA,
-	const vmVector3 & hB,
-	PE_REF(vmVector3) faceOffsetAB,
-	PE_REF(vmVector3) faceOffsetBA,
-	const vmMatrix3 & matrixAB,
-	const vmMatrix3 & matrixBA,
-	PE_REF(vmVector3) signsA,
-	PE_REF(vmVector3) signsB,
-	PE_REF(vmVector3) scalesA,
-	PE_REF(vmVector3) scalesB )
-	CustomEdgeEdgeTest( 0, X, 1, Y, 1, Y, 0, X );
-	bool & inVoronoi,
-	float & tA,
-	float & tB,
-	const vmVector3 & hA,
-	const vmVector3 & hB,
-	PE_REF(vmVector3) faceOffsetAB,
-	PE_REF(vmVector3) faceOffsetBA,
-	const vmMatrix3 & matrixAB,
-	const vmMatrix3 & matrixBA,
-	PE_REF(vmVector3) signsA,
-	PE_REF(vmVector3) signsB,
-	PE_REF(vmVector3) scalesA,
-	PE_REF(vmVector3) scalesB )
-	CustomEdgeEdgeTest( 1, Y, 0, X, 0, X, 1, Y );
-	bool & inVoronoi,
-	float & tA,
-	float & tB,
-	const vmVector3 & hA,
-	const vmVector3 & hB,
-	PE_REF(vmVector3) faceOffsetAB,
-	PE_REF(vmVector3) faceOffsetBA,
-	const vmMatrix3 & matrixAB,
-	const vmMatrix3 & matrixBA,
-	PE_REF(vmVector3) signsA,
-	PE_REF(vmVector3) signsB,
-	PE_REF(vmVector3) scalesA,
-	PE_REF(vmVector3) scalesB )
-	CustomEdgeEdgeTest( 1, Y, 0, X, 1, Y, 0, X );
-#define EdgeEdge_SetNewMin( ac_letter, ad_letter, bc_letter, bd_letter )   \
-{                                                                          \
-   minDistSqr = distSqr;                                                   \
-   localPointA.set##ac_letter(scalesA.get##ac_letter());                 \
-   localPointA.set##ad_letter(tA);                                        \
-   localPointB.set##bc_letter(scalesB.get##bc_letter());                 \
-   localPointB.set##bd_letter(tB);                                        \
-   otherFaceDimA = testOtherFaceDimA;                                      \
-   otherFaceDimB = testOtherFaceDimB;                                      \
-   featureA = E;                                                           \
-   featureB = E;                                                           \
-	bool & done,
-	float & minDistSqr,
-	vmPoint3 & localPointA,
-	vmPoint3 & localPointB,
-	int & otherFaceDimA,
-	int & otherFaceDimB,
-	FeatureType & featureA,
-	FeatureType & featureB,
-	const vmVector3 & hA,
-	const vmVector3 & hB,
-	PE_REF(vmVector3) faceOffsetAB,
-	PE_REF(vmVector3) faceOffsetBA,
-	const vmMatrix3 & matrixAB,
-	const vmMatrix3 & matrixBA,
-	PE_REF(vmVector3) signsA,
-	PE_REF(vmVector3) signsB,
-	PE_REF(vmVector3) scalesA,
-	PE_REF(vmVector3) scalesB,
-	bool first )
-	float distSqr;
-	float tA, tB;
-	int testOtherFaceDimA, testOtherFaceDimB;
-	testOtherFaceDimA = 0;
-	testOtherFaceDimB = 0;
-	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( first ) {
-		EdgeEdge_SetNewMin( X, Y, X, Y );
-	} else {
-		if ( distSqr < minDistSqr ) {
-			EdgeEdge_SetNewMin( X, Y, X, Y );
-		}
-	}
-	if ( done )
-		return;
-	signsA.setX( -signsA.getX() );
-	scalesA.setX( -scalesA.getX() );
-	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( X, Y, X, Y );
-	}
-	if ( done )
-		return;
-	signsB.setX( -signsB.getX() );
-	scalesB.setX( -scalesB.getX() );
-	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( X, Y, X, Y );
-	}
-	if ( done )
-		return;
-	signsA.setX( -signsA.getX() );
-	scalesA.setX( -scalesA.getX() );
-	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( X, Y, X, Y );
-	}
-	if ( done )
-		return;
-	testOtherFaceDimA = 1;
-	testOtherFaceDimB = 0;
-	signsB.setX( -signsB.getX() );
-	scalesB.setX( -scalesB.getX() );
-	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( Y, X, X, Y );
-	}
-	if ( done )
-		return;
-	signsA.setY( -signsA.getY() );
-	scalesA.setY( -scalesA.getY() );
-	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( Y, X, X, Y );
-	}
-	if ( done )
-		return;
-	signsB.setX( -signsB.getX() );
-	scalesB.setX( -scalesB.getX() );
-	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( Y, X, X, Y );
-	}
-	if ( done )
-		return;
-	signsA.setY( -signsA.getY() );
-	scalesA.setY( -scalesA.getY() );
-	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( Y, X, X, Y );
-	}
-	if ( done )
-		return;
-	testOtherFaceDimA = 0;
-	testOtherFaceDimB = 1;
-	signsB.setX( -signsB.getX() );
-	scalesB.setX( -scalesB.getX() );
-	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( X, Y, Y, X );
-	}
-	if ( done )
-		return;
-	signsA.setX( -signsA.getX() );
-	scalesA.setX( -scalesA.getX() );
-	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( X, Y, Y, X );
-	}
-	if ( done )
-		return;
-	signsB.setY( -signsB.getY() );
-	scalesB.setY( -scalesB.getY() );
-	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( X, Y, Y, X );
-	}
-	if ( done )
-		return;
-	signsA.setX( -signsA.getX() );
-	scalesA.setX( -scalesA.getX() );
-	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( X, Y, Y, X );
-	}
-	if ( done )
-		return;
-	testOtherFaceDimA = 1;
-	testOtherFaceDimB = 1;
-	signsB.setY( -signsB.getY() );
-	scalesB.setY( -scalesB.getY() );
-	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( Y, X, Y, X );
-	}
-	if ( done )
-		return;
-	signsA.setY( -signsA.getY() );
-	scalesA.setY( -scalesA.getY() );
-	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( Y, X, Y, X );
-	}
-	if ( done )
-		return;
-	signsB.setY( -signsB.getY() );
-	scalesB.setY( -scalesB.getY() );
-	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( Y, X, Y, X );
-	}
-	if ( done )
-		return;
-	signsA.setY( -signsA.getY() );
-	scalesA.setY( -scalesA.getY() );
-	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
-								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
-	if ( distSqr < minDistSqr ) {
-		EdgeEdge_SetNewMin( Y, X, Y, X );
-	}
-boxBoxDistance(vmVector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB,
-			   PE_REF(Box) boxA, const vmTransform3 & transformA, PE_REF(Box) boxB,
-			   const vmTransform3 & transformB,
-			   float distanceThreshold)
-	vmMatrix3 identity;
-	identity = vmMatrix3::identity();
-	vmVector3 ident[3];
-	ident[0] = identity.getCol0();
-	ident[1] = identity.getCol1();
-	ident[2] = identity.getCol2();
-	// get relative transformations
-	vmTransform3 transformAB, transformBA;
-	vmMatrix3 matrixAB, matrixBA;
-	vmVector3 offsetAB, offsetBA;
-	transformAB = orthoInverse(transformA) * transformB;
-	transformBA = orthoInverse(transformAB);
-	matrixAB = transformAB.getUpper3x3();
-	offsetAB = transformAB.getTranslation();
-	matrixBA = transformBA.getUpper3x3();
-	offsetBA = transformBA.getTranslation();
-	vmMatrix3 absMatrixAB = absPerElem(matrixAB);
-	vmMatrix3 absMatrixBA = absPerElem(matrixBA);
-	// find separating axis with largest gap between projections
-	BoxSepAxisType axisType;
-	vmVector3 axisA(0.0f), axisB(0.0f);
-	float gap, maxGap;
-	int faceDimA = 0, faceDimB = 0, edgeDimA = 0, edgeDimB = 0;
-	// face axes
-	vmVector3  gapsA   = absPerElem(offsetAB) - boxA.mHalf - absMatrixAB * boxB.mHalf;
-	AaxisTest(0,X,true);
-	AaxisTest(1,Y,false);
-	AaxisTest(2,Z,false);
-	vmVector3  gapsB   = absPerElem(offsetBA) - boxB.mHalf - absMatrixBA * boxA.mHalf;
-	BaxisTest(0,X);
-	BaxisTest(1,Y);
-	BaxisTest(2,Z);
-	// cross product axes
-	// �O�ς��O�̂Ƃ��̑΍�
-	absMatrixAB += vmMatrix3(1.0e-5f);
-	absMatrixBA += vmMatrix3(1.0e-5f);
-	vmMatrix3 lsqrs, projOffset, projAhalf, projBhalf;
-	lsqrs.setCol0( mulPerElem( matrixBA.getCol2(), matrixBA.getCol2() ) +
-				   mulPerElem( matrixBA.getCol1(), matrixBA.getCol1() ) );
-	lsqrs.setCol1( mulPerElem( matrixBA.getCol2(), matrixBA.getCol2() ) +
-				   mulPerElem( matrixBA.getCol0(), matrixBA.getCol0() ) );
-	lsqrs.setCol2( mulPerElem( matrixBA.getCol1(), matrixBA.getCol1() ) +
-				   mulPerElem( matrixBA.getCol0(), matrixBA.getCol0() ) );
-	projOffset.setCol0(matrixBA.getCol1() * offsetAB.getZ() - matrixBA.getCol2() * offsetAB.getY());
-	projOffset.setCol1(matrixBA.getCol2() * offsetAB.getX() - matrixBA.getCol0() * offsetAB.getZ());
-	projOffset.setCol2(matrixBA.getCol0() * offsetAB.getY() - matrixBA.getCol1() * offsetAB.getX());
-	projAhalf.setCol0(absMatrixBA.getCol1() * boxA.mHalf.getZ() + absMatrixBA.getCol2() * boxA.mHalf.getY());
-	projAhalf.setCol1(absMatrixBA.getCol2() * boxA.mHalf.getX() + absMatrixBA.getCol0() * boxA.mHalf.getZ());
-	projAhalf.setCol2(absMatrixBA.getCol0() * boxA.mHalf.getY() + absMatrixBA.getCol1() * boxA.mHalf.getX());
-	projBhalf.setCol0(absMatrixAB.getCol1() * boxB.mHalf.getZ() + absMatrixAB.getCol2() * boxB.mHalf.getY());
-	projBhalf.setCol1(absMatrixAB.getCol2() * boxB.mHalf.getX() + absMatrixAB.getCol0() * boxB.mHalf.getZ());
-	projBhalf.setCol2(absMatrixAB.getCol0() * boxB.mHalf.getY() + absMatrixAB.getCol1() * boxB.mHalf.getX());
-	vmMatrix3 gapsAxB = absPerElem(projOffset) - projAhalf - transpose(projBhalf);
-	CrossAxisTest(0,0,X);
-	CrossAxisTest(0,1,Y);
-	CrossAxisTest(0,2,Z);
-	CrossAxisTest(1,0,X);
-	CrossAxisTest(1,1,Y);
-	CrossAxisTest(1,2,Z);
-	CrossAxisTest(2,0,X);
-	CrossAxisTest(2,1,Y);
-	CrossAxisTest(2,2,Z);
-	// need to pick the face on each box whose normal best matches the separating axis.
-	// will transform vectors to be in the coordinate system of this face to simplify things later.
-	// for this, a permutation matrix can be used, which the next section computes.
-	int dimA[3], dimB[3];
-	if ( axisType == A_AXIS ) {
-		if ( dot(axisA,offsetAB) < 0.0f )
-			axisA = -axisA;
-		axisB = matrixBA * -axisA;
-		vmVector3 absAxisB = vmVector3(absPerElem(axisB));
-		if ( ( absAxisB[0] > absAxisB[1] ) && ( absAxisB[0] > absAxisB[2] ) )
-			faceDimB = 0;
-		else if ( absAxisB[1] > absAxisB[2] )
-			faceDimB = 1;
-		else
-			faceDimB = 2;
-	} else if ( axisType == B_AXIS ) {
-		if ( dot(axisB,offsetBA) < 0.0f )
-			axisB = -axisB;
-		axisA = matrixAB * -axisB;
-		vmVector3 absAxisA = vmVector3(absPerElem(axisA));
-		if ( ( absAxisA[0] > absAxisA[1] ) && ( absAxisA[0] > absAxisA[2] ) )
-			faceDimA = 0;
-		else if ( absAxisA[1] > absAxisA[2] )
-			faceDimA = 1;
-		else
-			faceDimA = 2;
-	}
-	if ( axisType == CROSS_AXIS ) {
-		if ( dot(axisA,offsetAB) < 0.0f )
-			axisA = -axisA;
-		axisB = matrixBA * -axisA;
-		vmVector3 absAxisA = vmVector3(absPerElem(axisA));
-		vmVector3 absAxisB = vmVector3(absPerElem(axisB));
-		dimA[1] = edgeDimA;
-		dimB[1] = edgeDimB;
-		if ( edgeDimA == 0 ) {
-			if ( absAxisA[1] > absAxisA[2] ) {
-				dimA[0] = 2;
-				dimA[2] = 1;
-			} else                             {
-				dimA[0] = 1;
-				dimA[2] = 2;
-			}
-		} else if ( edgeDimA == 1 ) {
-			if ( absAxisA[2] > absAxisA[0] ) {
-				dimA[0] = 0;
-				dimA[2] = 2;
-			} else                             {
-				dimA[0] = 2;
-				dimA[2] = 0;
-			}
-		} else {
-			if ( absAxisA[0] > absAxisA[1] ) {
-				dimA[0] = 1;
-				dimA[2] = 0;
-			} else                             {
-				dimA[0] = 0;
-				dimA[2] = 1;
-			}
-		}
-		if ( edgeDimB == 0 ) {
-			if ( absAxisB[1] > absAxisB[2] ) {
-				dimB[0] = 2;
-				dimB[2] = 1;
-			} else                             {
-				dimB[0] = 1;
-				dimB[2] = 2;
-			}
-		} else if ( edgeDimB == 1 ) {
-			if ( absAxisB[2] > absAxisB[0] ) {
-				dimB[0] = 0;
-				dimB[2] = 2;
-			} else                             {
-				dimB[0] = 2;
-				dimB[2] = 0;
-			}
-		} else {
-			if ( absAxisB[0] > absAxisB[1] ) {
-				dimB[0] = 1;
-				dimB[2] = 0;
-			} else                             {
-				dimB[0] = 0;
-				dimB[2] = 1;
-			}
-		}
-	} else {
-		dimA[2] = faceDimA;
-		dimA[0] = (faceDimA+1)%3;
-		dimA[1] = (faceDimA+2)%3;
-		dimB[2] = faceDimB;
-		dimB[0] = (faceDimB+1)%3;
-		dimB[1] = (faceDimB+2)%3;
-	}
-	vmMatrix3 aperm_col, bperm_col;
-	aperm_col.setCol0(ident[dimA[0]]);
-	aperm_col.setCol1(ident[dimA[1]]);
-	aperm_col.setCol2(ident[dimA[2]]);
-	bperm_col.setCol0(ident[dimB[0]]);
-	bperm_col.setCol1(ident[dimB[1]]);
-	bperm_col.setCol2(ident[dimB[2]]);
-	vmMatrix3 aperm_row, bperm_row;
-	aperm_row = transpose(aperm_col);
-	bperm_row = transpose(bperm_col);
-	// permute all box parameters to be in the face coordinate systems
-	vmMatrix3 matrixAB_perm = aperm_row * matrixAB * bperm_col;
-	vmMatrix3 matrixBA_perm = transpose(matrixAB_perm);
-	vmVector3 offsetAB_perm, offsetBA_perm;
-	offsetAB_perm = aperm_row * offsetAB;
-	offsetBA_perm = bperm_row * offsetBA;
-	vmVector3 halfA_perm, halfB_perm;
-	halfA_perm = aperm_row * boxA.mHalf;
-	halfB_perm = bperm_row * boxB.mHalf;
-	// compute the vector between the centers of each face, in each face's coordinate frame
-	vmVector3 signsA_perm, signsB_perm, scalesA_perm, scalesB_perm, faceOffsetAB_perm, faceOffsetBA_perm;
-	signsA_perm = copySignPerElem(vmVector3(1.0f),aperm_row * axisA);
-	signsB_perm = copySignPerElem(vmVector3(1.0f),bperm_row * axisB);
-	scalesA_perm = mulPerElem( signsA_perm, halfA_perm );
-	scalesB_perm = mulPerElem( signsB_perm, halfB_perm );
-	faceOffsetAB_perm = offsetAB_perm + matrixAB_perm.getCol2() * scalesB_perm.getZ();
-	faceOffsetAB_perm.setZ( faceOffsetAB_perm.getZ() - scalesA_perm.getZ() );
-	faceOffsetBA_perm = offsetBA_perm + matrixBA_perm.getCol2() * scalesA_perm.getZ();
-	faceOffsetBA_perm.setZ( faceOffsetBA_perm.getZ() - scalesB_perm.getZ() );
-	if ( maxGap < 0.0f ) {
-		// if boxes overlap, this will separate the faces for finding points of penetration.
-		faceOffsetAB_perm -= aperm_row * axisA * maxGap * 1.01f;
-		faceOffsetBA_perm -= bperm_row * axisB * maxGap * 1.01f;
-	}
-	// for each vertex/face or edge/edge pair of the two faces, find the closest points.
-	//
-	// these points each have an associated box feature (vertex, edge, or face).  if each
-	// point is in the external Voronoi region of the other's feature, they are the
-	// closest points of the boxes, and the algorithm can exit.
-	//
-	// the feature pairs are arranged so that in the general case, the first test will
-	// succeed.  degenerate cases (parallel faces) may require up to all tests in the
-	// worst case.
-	//
-	// if for some reason no case passes the Voronoi test, the features with the minimum
-	// distance are returned.
-	vmPoint3 localPointA_perm, localPointB_perm;
-	float minDistSqr;
-	bool done;
-	vmVector3 hA_perm( halfA_perm ), hB_perm( halfB_perm );
-	localPointA_perm.setZ( scalesA_perm.getZ() );
-	localPointB_perm.setZ( scalesB_perm.getZ() );
-	scalesA_perm.setZ(0.0f);
-	scalesB_perm.setZ(0.0f);
-	int otherFaceDimA, otherFaceDimB;
-	FeatureType featureA, featureB;
-	if ( axisType == CROSS_AXIS ) {
-		EdgeEdgeTests( done, minDistSqr, localPointA_perm, localPointB_perm,
-					   otherFaceDimA, otherFaceDimB, featureA, featureB,
-					   hA_perm, hB_perm, faceOffsetAB_perm, faceOffsetBA_perm,
-					   matrixAB_perm, matrixBA_perm, signsA_perm, signsB_perm,
-					   scalesA_perm, scalesB_perm, true );
-		if ( !done ) {
-			VertexBFaceATests( done, minDistSqr, localPointA_perm, localPointB_perm,
-							   featureA, featureB,
-							   hA_perm, faceOffsetAB_perm, faceOffsetBA_perm,
-							   matrixAB_perm, matrixBA_perm, signsB_perm, scalesB_perm, false );
-			if ( !done ) {
-				VertexAFaceBTests( done, minDistSqr, localPointA_perm, localPointB_perm,
-								   featureA, featureB,
-								   hB_perm, faceOffsetAB_perm, faceOffsetBA_perm,
-								   matrixAB_perm, matrixBA_perm, signsA_perm, scalesA_perm, false );
-			}
-		}
-	} else if ( axisType == B_AXIS ) {
-		VertexAFaceBTests( done, minDistSqr, localPointA_perm, localPointB_perm,
-						   featureA, featureB,
-						   hB_perm, faceOffsetAB_perm, faceOffsetBA_perm,
-						   matrixAB_perm, matrixBA_perm, signsA_perm, scalesA_perm, true );
-		if ( !done ) {
-			VertexBFaceATests( done, minDistSqr, localPointA_perm, localPointB_perm,
-							   featureA, featureB,
-							   hA_perm, faceOffsetAB_perm, faceOffsetBA_perm,
-							   matrixAB_perm, matrixBA_perm, signsB_perm, scalesB_perm, false );
-			if ( !done ) {
-				EdgeEdgeTests( done, minDistSqr, localPointA_perm, localPointB_perm,
-							   otherFaceDimA, otherFaceDimB, featureA, featureB,
-							   hA_perm, hB_perm, faceOffsetAB_perm, faceOffsetBA_perm,
-							   matrixAB_perm, matrixBA_perm, signsA_perm, signsB_perm,
-							   scalesA_perm, scalesB_perm, false );
-			}
-		}
-	} else {
-		VertexBFaceATests( done, minDistSqr, localPointA_perm, localPointB_perm,
-						   featureA, featureB,
-						   hA_perm, faceOffsetAB_perm, faceOffsetBA_perm,
-						   matrixAB_perm, matrixBA_perm, signsB_perm, scalesB_perm, true );
-		if ( !done ) {
-			VertexAFaceBTests( done, minDistSqr, localPointA_perm, localPointB_perm,
-							   featureA, featureB,
-							   hB_perm, faceOffsetAB_perm, faceOffsetBA_perm,
-							   matrixAB_perm, matrixBA_perm, signsA_perm, scalesA_perm, false );
-			if ( !done ) {
-				EdgeEdgeTests( done, minDistSqr, localPointA_perm, localPointB_perm,
-							   otherFaceDimA, otherFaceDimB, featureA, featureB,
-							   hA_perm, hB_perm, faceOffsetAB_perm, faceOffsetBA_perm,
-							   matrixAB_perm, matrixBA_perm, signsA_perm, signsB_perm,
-							   scalesA_perm, scalesB_perm, false );
-			}
-		}
-	}
-	// convert local points from face-local to box-local coordinate system
-	boxPointA.localPoint = vmPoint3( aperm_col * vmVector3( localPointA_perm )) ;
-	boxPointB.localPoint = vmPoint3( bperm_col * vmVector3( localPointB_perm )) ;
-#if 0
-	// find which features of the boxes are involved.
-	// the only feature pairs which occur in this function are VF, FV, and EE, even though the
-	// closest points might actually lie on sub-features, as in a VF contact might be used for
-	// what's actually a VV contact.  this means some feature pairs could possibly seem distinct
-	// from others, although their contact positions are the same.  don't know yet whether this
-	// matters.
-	int sA[3], sB[3];
-	sA[0] = boxPointA.localPoint.getX() > 0.0f;
-	sA[1] = boxPointA.localPoint.getY() > 0.0f;
-	sA[2] = boxPointA.localPoint.getZ() > 0.0f;
-	sB[0] = boxPointB.localPoint.getX() > 0.0f;
-	sB[1] = boxPointB.localPoint.getY() > 0.0f;
-	sB[2] = boxPointB.localPoint.getZ() > 0.0f;
-	if ( featureA == F ) {
-		boxPointA.setFaceFeature( dimA[2], sA[dimA[2]] );
-	} else if ( featureA == E ) {
-		boxPointA.setEdgeFeature( dimA[2], sA[dimA[2]], dimA[otherFaceDimA], sA[dimA[otherFaceDimA]] );
-	} else {
-		boxPointA.setVertexFeature( sA[0], sA[1], sA[2] );
-	}
-	if ( featureB == F ) {
-		boxPointB.setFaceFeature( dimB[2], sB[dimB[2]] );
-	} else if ( featureB == E ) {
-		boxPointB.setEdgeFeature( dimB[2], sB[dimB[2]], dimB[otherFaceDimB], sB[dimB[otherFaceDimB]] );
-	} else {
-		boxPointB.setVertexFeature( sB[0], sB[1], sB[2] );
-	}
-	normal = transformA * axisA;
-	if ( maxGap < 0.0f ) {
-		return (maxGap);
-	} else {
-		return (sqrtf( minDistSqr ));
-	}
diff --git a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h b/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h
deleted file mode 100644
index 0d4957de..00000000
--- a/src/bullet/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h
+++ /dev/null
@@ -1,65 +0,0 @@
-   Copyright (C) 2006, 2008 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "Box.h"
-// boxBoxDistance:
-// description:
-//    this computes info that can be used for the collision response of two boxes.  when the boxes
-//    do not overlap, the points are set to the closest points of the boxes, and a positive
-//    distance between them is returned.  if the boxes do overlap, a negative distance is returned
-//    and the points are set to two points that would touch after the boxes are translated apart.
-//    the contact normal gives the direction to repel or separate the boxes when they touch or
-//    overlap (it's being approximated here as one of the 15 "separating axis" directions).
-// returns:
-//    positive or negative distance between two boxes.
-// args:
-//    vmVector3& normal: set to a unit contact normal pointing from box A to box B.
-//    BoxPoint& boxPointA, BoxPoint& boxPointB:
-//       set to a closest point or point of penetration on each box.
-//    Box boxA, Box boxB:
-//       boxes, represented as 3 half-widths
-//    const vmTransform3& transformA, const vmTransform3& transformB:
-//       box transformations, in world coordinates
-//    float distanceThreshold:
-//       the algorithm will exit early if it finds that the boxes are more distant than this
-//       threshold, and not compute a contact normal or points.  if this distance returned
-//       exceeds the threshold, all the other output data may not have been computed.  by
-//       default, this is set to MAX_FLOAT so it will have no effect.
-boxBoxDistance(vmVector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB,
-			   PE_REF(Box) boxA, const vmTransform3 & transformA, PE_REF(Box) boxB,
-			   const vmTransform3 & transformB,
-			   float distanceThreshold = FLT_MAX );
-#endif /* __BOXBOXDISTANCE_H__ */
diff --git a/src/bullet/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.cpp b/src/bullet/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.cpp
deleted file mode 100644
index fe619555..00000000
--- a/src/bullet/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "SpuSampleTask.h"
-#include "BulletDynamics/Dynamics/btRigidBody.h"
-#include "../PlatformDefinitions.h"
-#include "../SpuFakeDma.h"
-#include "LinearMath/btMinMax.h"
-#ifdef __SPU__
-#include <spu_printf.h>
-#include <stdio.h>
-#define spu_printf printf
-#define MAX_NUM_BODIES 8192
-struct SampleTask_LocalStoreMemory
-	ATTRIBUTE_ALIGNED16(char gLocalRigidBody [sizeof(btRigidBody)+16]);
-void processSampleTask(void* userPtr, void* lsMemory)
-	//	BT_PROFILE("processSampleTask");
-	SampleTask_LocalStoreMemory* localMemory = (SampleTask_LocalStoreMemory*)lsMemory;
-	SpuSampleTaskDesc* taskDescPtr = (SpuSampleTaskDesc*)userPtr;
-	SpuSampleTaskDesc& taskDesc = *taskDescPtr;
-	switch (taskDesc.m_sampleCommand)
-	{
-		{
-			btTransform predictedTrans;
-			btCollisionObject** eaPtr = (btCollisionObject**)taskDesc.m_mainMemoryPtr;
-			int batchSize = taskDesc.m_sampleValue;
-			if (batchSize>MAX_NUM_BODIES)
-			{
-				spu_printf("SPU Error: exceed number of bodies, see MAX_NUM_BODIES in SpuSampleTask.cpp\n");
-				break;
-			}
-			int dmaArraySize = batchSize*sizeof(void*);
-			uint64_t ppuArrayAddress = reinterpret_cast<uint64_t>(eaPtr);
-			//			spu_printf("array location is at %llx, batchSize = %d, DMA size = %d\n",ppuArrayAddress,batchSize,dmaArraySize);
-			if (dmaArraySize>=16)
-			{
-				cellDmaLargeGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress  , dmaArraySize, DMA_TAG(1), 0, 0);	
-				cellDmaWaitTagStatusAll(DMA_MASK(1));
-			} else
-			{
-				stallingUnalignedDmaSmallGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress  , dmaArraySize);
-			}
-			for ( int i=0;i<batchSize;i++)
-			{
-				///DMA rigid body
-				void* localPtr = &localMemory->gLocalRigidBody[0];
-				void* shortAdd = localMemory->gPointerArray[i];
-				uint64_t ppuRigidBodyAddress = reinterpret_cast<uint64_t>(shortAdd);
-				//	spu_printf("cellDmaGet at CMD_SAMPLE_INTEGRATE_BODIES from %llx to %llx\n",ppuRigidBodyAddress,localPtr);
-				int dmaBodySize = sizeof(btRigidBody);
-				cellDmaGet((void*)localPtr, ppuRigidBodyAddress  , dmaBodySize, DMA_TAG(1), 0, 0);	
-				cellDmaWaitTagStatusAll(DMA_MASK(1));
-				float timeStep = 1.f/60.f;
-				btRigidBody* body = (btRigidBody*) localPtr;//btRigidBody::upcast(colObj);
-				if (body)
-				{
-					if (body->isActive() && (!body->isStaticOrKinematicObject()))
-					{
-						body->predictIntegratedTransform(timeStep, predictedTrans);
-						body->proceedToTransform( predictedTrans);
-						void* ptr = (void*)localPtr;
-						//	spu_printf("cellDmaLargePut from %llx to LS %llx\n",ptr,ppuRigidBodyAddress);
-						cellDmaLargePut(ptr, ppuRigidBodyAddress  , dmaBodySize, DMA_TAG(1), 0, 0);
-						cellDmaWaitTagStatusAll(DMA_MASK(1));
-					}
-				}
-			}
-			break;
-		}
-		{
-			btTransform predictedTrans;
-			btCollisionObject** eaPtr = (btCollisionObject**)taskDesc.m_mainMemoryPtr;
-			int batchSize = taskDesc.m_sampleValue;
-			int dmaArraySize = batchSize*sizeof(void*);
-			if (batchSize>MAX_NUM_BODIES)
-			{
-				spu_printf("SPU Error: exceed number of bodies, see MAX_NUM_BODIES in SpuSampleTask.cpp\n");
-				break;
-			}
-			uint64_t ppuArrayAddress = reinterpret_cast<uint64_t>(eaPtr);
-			//			spu_printf("array location is at %llx, batchSize = %d, DMA size = %d\n",ppuArrayAddress,batchSize,dmaArraySize);
-			if (dmaArraySize>=16)
-			{
-				cellDmaLargeGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress  , dmaArraySize, DMA_TAG(1), 0, 0);	
-				cellDmaWaitTagStatusAll(DMA_MASK(1));
-			} else
-			{
-				stallingUnalignedDmaSmallGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress  , dmaArraySize);
-			}
-			for ( int i=0;i<batchSize;i++)
-			{
-				///DMA rigid body
-				void* localPtr = &localMemory->gLocalRigidBody[0];
-				void* shortAdd = localMemory->gPointerArray[i];
-				uint64_t ppuRigidBodyAddress = reinterpret_cast<uint64_t>(shortAdd);
-				//	spu_printf("cellDmaGet at CMD_SAMPLE_INTEGRATE_BODIES from %llx to %llx\n",ppuRigidBodyAddress,localPtr);
-				int dmaBodySize = sizeof(btRigidBody);
-				cellDmaGet((void*)localPtr, ppuRigidBodyAddress  , dmaBodySize, DMA_TAG(1), 0, 0);	
-				cellDmaWaitTagStatusAll(DMA_MASK(1));
-				float timeStep = 1.f/60.f;
-				btRigidBody* body = (btRigidBody*) localPtr;//btRigidBody::upcast(colObj);
-				if (body)
-				{
-					if (!body->isStaticOrKinematicObject())
-					{
-						if (body->isActive())
-						{
-							body->integrateVelocities( timeStep);
-							//damping
-							body->applyDamping(timeStep);
-							body->predictIntegratedTransform(timeStep,body->getInterpolationWorldTransform());
-							void* ptr = (void*)localPtr;
-							cellDmaLargePut(ptr, ppuRigidBodyAddress  , dmaBodySize, DMA_TAG(1), 0, 0);
-							cellDmaWaitTagStatusAll(DMA_MASK(1));
-						}
-					}
-				}
-			}
-			break;
-		}
-	default:
-		{
-		}
-	};
-#if defined(__CELLOS_LV2__) || defined (LIBSPE2)
-ATTRIBUTE_ALIGNED16(SampleTask_LocalStoreMemory	gLocalStoreMemory);
-void* createSampleLocalStoreMemory()
-	return &gLocalStoreMemory;
-void* createSampleLocalStoreMemory()
-	return new SampleTask_LocalStoreMemory;
diff --git a/src/bullet/BulletMultiThreaded/SpuSampleTaskProcess.cpp b/src/bullet/BulletMultiThreaded/SpuSampleTaskProcess.cpp
deleted file mode 100644
index 11cb9e7c..00000000
--- a/src/bullet/BulletMultiThreaded/SpuSampleTaskProcess.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-//#define __CELLOS_LV2__ 1
-#include "SpuSampleTaskProcess.h"
-#include <stdio.h>
-#ifdef __SPU__
-void	SampleThreadFunc(void* userPtr,void* lsMemory)
-	//do nothing
-	printf("hello world\n");
-void*	SamplelsMemoryFunc()
-	//don't create local store memory, just return 0
-	return 0;
-#include "btThreadSupportInterface.h"
-//#	include "SPUAssert.h"
-#include <string.h>
-extern "C" {
-	extern char SPU_SAMPLE_ELF_SYMBOL[];
-SpuSampleTaskProcess::SpuSampleTaskProcess(btThreadSupportInterface*	threadInterface,  int maxNumOutstandingTasks)
-	m_taskBusy.resize(m_maxNumOutstandingTasks);
-	m_spuSampleTaskDesc.resize(m_maxNumOutstandingTasks);
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		m_taskBusy[i] = false;
-	}
-	m_numBusyTasks = 0;
-	m_currentTask = 0;
-	m_initialized = false;
-	m_threadInterface->startSPU();
-	m_threadInterface->stopSPU();
-void	SpuSampleTaskProcess::initialize()
-	printf("SpuSampleTaskProcess::initialize()\n");
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		m_taskBusy[i] = false;
-	}
-	m_numBusyTasks = 0;
-	m_currentTask = 0;
-	m_initialized = true;
-void SpuSampleTaskProcess::issueTask(void* sampleMainMemPtr,int sampleValue,int sampleCommand)
-	printf("SpuSampleTaskProcess::issueTask (m_currentTask= %d\)n", m_currentTask);
-	m_taskBusy[m_currentTask] = true;
-	m_numBusyTasks++;
-	SpuSampleTaskDesc& taskDesc = m_spuSampleTaskDesc[m_currentTask];
-	{
-		// send task description in event message
-		// no error checking here...
-		// but, currently, event queue can be no larger than NUM_WORKUNIT_TASKS.
-		taskDesc.m_mainMemoryPtr = reinterpret_cast<uint64_t>(sampleMainMemPtr);
-		taskDesc.m_sampleValue = sampleValue;
-		taskDesc.m_sampleCommand = sampleCommand;
-		//some bookkeeping to recognize finished tasks
-		taskDesc.m_taskId = m_currentTask;
-	}
-	m_threadInterface->sendRequest(1, (ppu_address_t) &taskDesc, m_currentTask);
-	// if all tasks busy, wait for spu event to clear the task.
-	if (m_numBusyTasks >= m_maxNumOutstandingTasks)
-	{
-		unsigned int taskId;
-		unsigned int outputSize;
-		for (int i=0;i<m_maxNumOutstandingTasks;i++)
-	  {
-		  if (m_taskBusy[i])
-		  {
-			  taskId = i;
-			  break;
-		  }
-	  }
-		m_threadInterface->waitForResponse(&taskId, &outputSize);
-		//printf("PPU: after issue, received event: %u %d\n", taskId, outputSize);
-		postProcess(taskId, outputSize);
-		m_taskBusy[taskId] = false;
-		m_numBusyTasks--;
-	}
-	// find new task buffer
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		if (!m_taskBusy[i])
-		{
-			m_currentTask = i;
-			break;
-		}
-	}
-///Optional PPU-size post processing for each task
-void SpuSampleTaskProcess::postProcess(int taskId, int outputSize)
-void SpuSampleTaskProcess::flush()
-	printf("\nSpuCollisionTaskProcess::flush()\n");
-	// all tasks are issued, wait for all tasks to be complete
-	while(m_numBusyTasks > 0)
-	{
-// Consolidating SPU code
-	  unsigned int taskId;
-	  unsigned int outputSize;
-	  for (int i=0;i<m_maxNumOutstandingTasks;i++)
-	  {
-		  if (m_taskBusy[i])
-		  {
-			  taskId = i;
-			  break;
-		  }
-	  }
-	  {
-		  m_threadInterface->waitForResponse(&taskId, &outputSize);
-	  }
-		//printf("PPU: flushing, received event: %u %d\n", taskId, outputSize);
-		postProcess(taskId, outputSize);
-		m_taskBusy[taskId] = false;
-		m_numBusyTasks--;
-	}
diff --git a/src/bullet/BulletMultiThreaded/SpuSampleTaskProcess.h b/src/bullet/BulletMultiThreaded/SpuSampleTaskProcess.h
deleted file mode 100644
index 6173225a..00000000
--- a/src/bullet/BulletMultiThreaded/SpuSampleTaskProcess.h
+++ /dev/null
@@ -1,153 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <assert.h>
-#include "PlatformDefinitions.h"
-#include <stdlib.h>
-#include "LinearMath/btAlignedObjectArray.h"
-#include "SpuSampleTask/SpuSampleTask.h"
-//just add your commands here, try to keep them globally unique for debugging purposes
-/// SpuSampleTaskProcess handles SPU processing of collision pairs.
-/// When PPU issues a task, it will look for completed task buffers
-/// PPU will do postprocessing, dependent on workunit output (not likely)
-class SpuSampleTaskProcess
-	// track task buffers that are being used, and total busy tasks
-	btAlignedObjectArray<bool>	m_taskBusy;
-	btAlignedObjectArray<SpuSampleTaskDesc>m_spuSampleTaskDesc;
-	int   m_numBusyTasks;
-	// the current task and the current entry to insert a new work unit
-	int   m_currentTask;
-	bool m_initialized;
-	void postProcess(int taskId, int outputSize);
-	class	btThreadSupportInterface*	m_threadInterface;
-	int	m_maxNumOutstandingTasks;
-	SpuSampleTaskProcess(btThreadSupportInterface*	threadInterface, int maxNumOutstandingTasks);
-	~SpuSampleTaskProcess();
-	///call initialize in the beginning of the frame, before addCollisionPairToTask
-	void initialize();
-	void issueTask(void* sampleMainMemPtr,int sampleValue,int sampleCommand);
-	///call flush to submit potential outstanding work to SPUs and wait for all involved SPUs to be finished
-	void flush();
-#if defined(USE_LIBSPE2) && defined(__SPU__)
-#include "../SpuLibspe2Support.h"
-#include <spu_intrinsics.h>
-#include <spu_mfcio.h>
-#include <SpuFakeDma.h>
-void * SamplelsMemoryFunc();
-void SampleThreadFunc(void* userPtr,void* lsMemory);
-int main(unsigned long long speid, addr64 argp, addr64 envp)
-	printf("SPU is up \n");
-	ATTRIBUTE_ALIGNED128(btSpuStatus status);
-	ATTRIBUTE_ALIGNED16( SpuSampleTaskDesc taskDesc ) ;
-	unsigned int received_message = Spu_Mailbox_Event_Nothing;
-        bool shutdown = false;
-	cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-	cellDmaWaitTagStatusAll(DMA_MASK(3));
-	status.m_status = Spu_Status_Free;
-	status.m_lsMemory.p = SamplelsMemoryFunc();
-	cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-	cellDmaWaitTagStatusAll(DMA_MASK(3));
-	while (!shutdown)
-	{
-		received_message = spu_read_in_mbox();
-		switch(received_message)
-		{
-		case Spu_Mailbox_Event_Shutdown:
-			shutdown = true;
-			break; 
-		case Spu_Mailbox_Event_Task:
-			// refresh the status
-			printf("SPU recieved Task \n");
-			cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(3));
-			btAssert(status.m_status==Spu_Status_Occupied);
-			cellDmaGet(&taskDesc, status.m_taskDesc.p, sizeof(SpuSampleTaskDesc), DMA_TAG(3), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(3));
-			SampleThreadFunc((void*)&taskDesc, reinterpret_cast<void*> (taskDesc.m_mainMemoryPtr) );
-			break;
-		case Spu_Mailbox_Event_Nothing:
-		default:
-			break;
-		}
-		// set to status free and wait for next task
-		status.m_status = Spu_Status_Free;
-		cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-		cellDmaWaitTagStatusAll(DMA_MASK(3));		
-  	}
-  	return 0;
diff --git a/src/bullet/BulletMultiThreaded/SpuSync.h b/src/bullet/BulletMultiThreaded/SpuSync.h
deleted file mode 100644
index 4157b8f0..00000000
--- a/src/bullet/BulletMultiThreaded/SpuSync.h
+++ /dev/null
@@ -1,149 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2007 Starbreeze Studios
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-Written by: Marten Svanfeldt
-#ifndef BT_SPU_SYNC_H
-#define	BT_SPU_SYNC_H
-#include "PlatformDefinitions.h"
-#if defined(WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifdef _XBOX
-#include <Xtl.h>
-#include <Windows.h>
-///The btSpinlock is a structure to allow multi-platform synchronization. This allows to port the SPU tasks to other platforms.
-class btSpinlock
-	//typedef volatile LONG SpinVariable;
-	typedef CRITICAL_SECTION SpinVariable;
-	btSpinlock (SpinVariable* var)
-		: spinVariable (var)
-	{}
-	void Init ()
-	{
-		//*spinVariable = 0;
-		InitializeCriticalSection(spinVariable);
-	}
-	void Lock ()
-	{
-		EnterCriticalSection(spinVariable);
-	}
-	void Unlock ()
-	{
-		LeaveCriticalSection(spinVariable);
-	}
-	SpinVariable* spinVariable;
-#elif defined (__CELLOS_LV2__)
-//#include <cell/atomic.h>
-#include <cell/sync/mutex.h>
-///The btSpinlock is a structure to allow multi-platform synchronization. This allows to port the SPU tasks to other platforms.
-class btSpinlock
-	typedef CellSyncMutex SpinVariable;
-	btSpinlock (SpinVariable* var)
-		: spinVariable (var)
-	{}
-	void Init ()
-	{
-#ifndef __SPU__
-		//*spinVariable = 1;
-		cellSyncMutexInitialize(spinVariable);
-	}
-	void Lock ()
-	{
-#ifdef __SPU__
-		// lock semaphore
-		/*while (cellAtomicTestAndDecr32(atomic_buf, (uint64_t)spinVariable) == 0) 
-		{
-		};*/
-		cellSyncMutexLock((uint64_t)spinVariable);
-	}
-	void Unlock ()
-	{
-#ifdef __SPU__
-		//cellAtomicIncr32(atomic_buf, (uint64_t)spinVariable);
-		cellSyncMutexUnlock((uint64_t)spinVariable);
-	}
-	SpinVariable*	spinVariable;
-	ATTRIBUTE_ALIGNED128(uint32_t		atomic_buf[32]);
-//create a dummy implementation (without any locking) useful for serial processing
-class btSpinlock
-	typedef int  SpinVariable;
-	btSpinlock (SpinVariable* var)
-		: spinVariable (var)
-	{}
-	void Init ()
-	{
-	}
-	void Lock ()
-	{
-	}
-	void Unlock ()
-	{
-	}
-	SpinVariable* spinVariable;
-#endif //BT_SPU_SYNC_H
diff --git a/src/bullet/BulletMultiThreaded/TrbDynBody.h b/src/bullet/BulletMultiThreaded/TrbDynBody.h
deleted file mode 100644
index a7f4bf1b..00000000
--- a/src/bullet/BulletMultiThreaded/TrbDynBody.h
+++ /dev/null
@@ -1,79 +0,0 @@
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#ifndef BT_RB_DYN_BODY_H__
-#define BT_RB_DYN_BODY_H__
-#include "vectormath/vmInclude.h"
-using namespace Vectormath::Aos;
-#include "TrbStateVec.h"
-class CollObject;
-class TrbDynBody
-	TrbDynBody()
-	{
-		fMass   = 0.0f;
-		fCollObject = NULL;
-		fElasticity = 0.2f;
-		fFriction = 0.8f;
-	}
-	// Get methods
-	float          getMass() const {return fMass;};
-	float          getElasticity() const {return fElasticity;}
-	float          getFriction() const {return fFriction;}
-	CollObject*    getCollObject() const {return fCollObject;}
-	const Matrix3 &getBodyInertia() const {return fIBody;}
-	const Matrix3 &getBodyInertiaInv() const {return fIBodyInv;}
-	float          getMassInv() const {return fMassInv;}
-	// Set methods
-	void           setMass(float mass) {fMass=mass;fMassInv=mass>0.0f?1.0f/mass:0.0f;}
-	void           setBodyInertia(const Matrix3 bodyInertia) {fIBody = bodyInertia;fIBodyInv = inverse(bodyInertia);}
-	void           setElasticity(float elasticity) {fElasticity = elasticity;}
-	void           setFriction(float friction) {fFriction = friction;}
-	void           setCollObject(CollObject *collObj) {fCollObject = collObj;}
-	void           setBodyInertiaInv(const Matrix3 bodyInertiaInv) 
-	{
-		fIBody = inverse(bodyInertiaInv);
-		fIBodyInv = bodyInertiaInv;
-	}
-	void           setMassInv(float invMass) {
-		fMass= invMass>0.0f ? 1.0f/invMass :0.0f;
-		fMassInv=invMass;
-	}
-	// Rigid Body constants
-	float          fMass;        // Rigid Body mass
-	float          fMassInv;     // Inverse of mass
-	Matrix3        fIBody;       // Inertia matrix in body's coords
-	Matrix3        fIBodyInv;    // Inertia matrix inverse in body's coords
-	float          fElasticity;  // Coefficient of restitution
-	float          fFriction;    // Coefficient of friction
-	CollObject*    fCollObject;  // Collision object corresponding the RB
-} __attribute__ ((aligned(16)));
-#endif //BT_RB_DYN_BODY_H__
diff --git a/src/bullet/BulletMultiThreaded/TrbStateVec.h b/src/bullet/BulletMultiThreaded/TrbStateVec.h
deleted file mode 100644
index b6d895e1..00000000
--- a/src/bullet/BulletMultiThreaded/TrbStateVec.h
+++ /dev/null
@@ -1,339 +0,0 @@
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <stdlib.h>
-#include "vecmath/vmInclude.h"
-#include "vectormath/vmInclude.h"
-#include "PlatformDefinitions.h"
-static inline vmVector3 read_Vector3(const float* p)
-	vmVector3 v;
-	loadXYZ(v, p);
-	return v;
-static inline vmQuat read_Quat(const float* p)
-	vmQuat vq;
-	loadXYZW(vq, p);
-	return vq;
-static inline void store_Vector3(const vmVector3 &src, float* p)
-	vmVector3 v = src;
-	storeXYZ(v, p);
-static inline void store_Quat(const vmQuat &src, float* p)
-	vmQuat vq = src;
-	storeXYZW(vq, p);
-// Motion Type
-enum {
-	PfxMotionTypeFixed = 0,
-	PfxMotionTypeActive,
-	PfxMotionTypeKeyframe,
-	PfxMotionTypeOneWay,
-	PfxMotionTypeTrigger,
-	PfxMotionTypeCount
-#define PFX_MOTION_MASK_DYNAMIC 0x0a // Active,OneWay
-#define PFX_MOTION_MASK_STATIC  0x95 // Fixed,Keyframe,Trigger,Sleeping
-#define PFX_MOTION_MASK_SLEEP   0x0e // Can sleep
-#define PFX_MOTION_MASK_TYPE    0x7f
-// Rigid Body state
-#ifdef __CELLOS_LV2__
-ATTRIBUTE_ALIGNED128(class) TrbState
-ATTRIBUTE_ALIGNED16(class) TrbState
-	TrbState()
-	{
-		setMotionType(PfxMotionTypeActive);
-		contactFilterSelf=contactFilterTarget=0xffffffff;
-		deleted = 0;
-		mSleeping = 0;
-		useSleep = 1;
-		trbBodyIdx=0;
-		mSleepCount=0;
-		useCcd = 0;
-		useContactCallback = 0;
-		useSleepCallback = 0;
-		linearDamping = 1.0f;
-		angularDamping = 0.99f;
-	}
-	TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega );
-	uint16_t	mSleepCount;
-	uint8_t		mMotionType;
-	uint8_t		deleted            : 1;
-	uint8_t		mSleeping           : 1;
-	uint8_t		useSleep           : 1;
-	uint8_t		useCcd		       : 1;
-	uint8_t		useContactCallback : 1;
-	uint8_t		useSleepCallback   : 1;
-	uint16_t	trbBodyIdx;
-	uint32_t	contactFilterSelf;
-	uint32_t	contactFilterTarget;
-	float		center[3];		// AABB center(World)
-	float		half[3];		// AABB half(World)
-	float		linearDamping;
-	float		angularDamping;
-	float		deltaLinearVelocity[3];
-	float		deltaAngularVelocity[3];
-	float     fX[3];				// position
-	float     fQ[4];				// orientation
-	float     fV[3];				// velocity
-	float     fOmega[3];			// angular velocity
-	inline void setZero();      // Zeroes out the elements
-	inline void setIdentity();  // Sets the rotation to identity and zeroes out the other elements
-	bool		isDeleted() const {return deleted==1;}
-	uint16_t	getRigidBodyId() const {return trbBodyIdx;}
-	void		setRigidBodyId(uint16_t i) {trbBodyIdx = i;}
-	uint32_t	getContactFilterSelf() const {return contactFilterSelf;}
-	void		setContactFilterSelf(uint32_t filter) {contactFilterSelf = filter;}
-	uint32_t	getContactFilterTarget() const {return contactFilterTarget;}
-	void		setContactFilterTarget(uint32_t filter) {contactFilterTarget = filter;}
-	float getLinearDamping() const {return linearDamping;}
-	float getAngularDamping() const {return angularDamping;}
-	void setLinearDamping(float damping) {linearDamping=damping;}
-	void setAngularDamping(float damping) {angularDamping=damping;}
-	uint8_t		getMotionType() const {return mMotionType;}
-	void		setMotionType(uint8_t t) {mMotionType = t;mSleeping=0;mSleepCount=0;}
-	uint8_t		getMotionMask() const {return (1<<mMotionType)|(mSleeping<<7);}
-	bool		isAsleep() const {return mSleeping==1;}
-	bool		isAwake() const {return mSleeping==0;}
-	void		wakeup() {mSleeping=0;mSleepCount=0;}
-	void		sleep() {if(useSleep) {mSleeping=1;mSleepCount=0;}}
-	uint8_t		getUseSleep() const {return useSleep;}
-	void		setUseSleep(uint8_t b) {useSleep=b;}
-	uint8_t		getUseCcd() const {return useCcd;}
-	void		setUseCcd(uint8_t b) {useCcd=b;}
-	uint8_t		getUseContactCallback() const {return useContactCallback;}
-	void		setUseContactCallback(uint8_t b) {useContactCallback=b;}
-	uint8_t		getUseSleepCallback() const {return useSleepCallback;}
-	void		setUseSleepCallback(uint8_t b) {useSleepCallback=b;}
-	void	 	incrementSleepCount() {mSleepCount++;}
-	void		resetSleepCount() {mSleepCount=0;}
-	uint16_t	getSleepCount() const {return mSleepCount;}
-	vmVector3 getPosition() const {return read_Vector3(fX);}
-	vmQuat    getOrientation() const {return read_Quat(fQ);}
-	vmVector3 getLinearVelocity() const {return read_Vector3(fV);}
-	vmVector3 getAngularVelocity() const {return read_Vector3(fOmega);}
-	vmVector3 getDeltaLinearVelocity() const {return read_Vector3(deltaLinearVelocity);}
-	vmVector3 getDeltaAngularVelocity() const {return read_Vector3(deltaAngularVelocity);}
-	void setPosition(const vmVector3 &pos) {store_Vector3(pos, fX);}
-	void setLinearVelocity(const vmVector3 &vel) {store_Vector3(vel, fV);}
-	void setAngularVelocity(const vmVector3 &vel) {store_Vector3(vel, fOmega);}
-	void setDeltaLinearVelocity(const vmVector3 &vel) {store_Vector3(vel, deltaLinearVelocity);}
-	void setDeltaAngularVelocity(const vmVector3 &vel) {store_Vector3(vel, deltaAngularVelocity);}
-	void setOrientation(const vmQuat &rot) {store_Quat(rot, fQ);}
-	inline void setAuxils(const vmVector3 &centerLocal,const vmVector3 &halfLocal);
-	inline void	setAuxilsCcd(const vmVector3 &centerLocal,const vmVector3 &halfLocal,float timeStep);
-	inline	void reset();
-TrbState::TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega)
-	setMotionType(m);
-	fX[0] = x[0];
-	fX[1] = x[1];
-	fX[2] = x[2];
-	fQ[0] = q[0];
-	fQ[1] = q[1];
-	fQ[2] = q[2];
-	fQ[3] = q[3];
-	fV[0] = v[0];
-	fV[1] = v[1];
-	fV[2] = v[2];
-	fOmega[0] = omega[0];
-	fOmega[1] = omega[1];
-	fOmega[2] = omega[2];
-	contactFilterSelf=contactFilterTarget=0xffff;
-	trbBodyIdx=0;
-	mSleeping = 0;
-	deleted = 0;
-	useSleep = 1;
-	useCcd = 0;
-	useContactCallback = 0;
-	useSleepCallback = 0;
-	mSleepCount=0;
-	linearDamping = 1.0f;
-	angularDamping = 0.99f;
-inline void
-	fX[0] = 0.0f;
-	fX[1] = 0.0f;
-	fX[2] = 0.0f;
-	fQ[0] = 0.0f;
-	fQ[1] = 0.0f;
-	fQ[2] = 0.0f;
-	fQ[3] = 1.0f;
-	fV[0] = 0.0f;
-	fV[1] = 0.0f;
-	fV[2] = 0.0f;
-	fOmega[0] = 0.0f;
-	fOmega[1] = 0.0f;
-	fOmega[2] = 0.0f;
-inline void
-	fX[0] = 0.0f;
-	fX[1] = 0.0f;
-	fX[2] = 0.0f;
-	fQ[0] = 0.0f;
-	fQ[1] = 0.0f;
-	fQ[2] = 0.0f;
-	fQ[3] = 0.0f;
-	fV[0] = 0.0f;
-	fV[1] = 0.0f;
-	fV[2] = 0.0f;
-	fOmega[0] = 0.0f;
-	fOmega[1] = 0.0f;
-	fOmega[2] = 0.0f;
-inline void
-TrbState::setAuxils(const vmVector3 &centerLocal,const vmVector3 &halfLocal)
-	vmVector3 centerW = getPosition() + rotate(getOrientation(),centerLocal);
-	vmVector3 halfW = absPerElem(vmMatrix3(getOrientation())) * halfLocal;
-	center[0] = centerW[0];
-	center[1] = centerW[1];
-	center[2] = centerW[2];
-	half[0] = halfW[0];
-	half[1] = halfW[1];
-	half[2] = halfW[2];
-inline void
-TrbState::setAuxilsCcd(const vmVector3 &centerLocal,const vmVector3 &halfLocal,float timeStep)
-	vmVector3 centerW = getPosition() + rotate(getOrientation(),centerLocal);
-	vmVector3 halfW = absPerElem(vmMatrix3(getOrientation())) * halfLocal;
-	vmVector3 diffvec = getLinearVelocity()*timeStep;
-	vmVector3 newCenter = centerW + diffvec;
-	vmVector3 aabbMin = minPerElem(newCenter - halfW,centerW - halfW);
-	vmVector3 aabbMax = maxPerElem(newCenter + halfW,centerW + halfW);
-	centerW = 0.5f * (aabbMin + aabbMax);
-	halfW =0.5f * (aabbMax - aabbMin);
-	center[0] = centerW[0];
-	center[1] = centerW[1];
-	center[2] = centerW[2];
-	half[0] = halfW[0];
-	half[1] = halfW[1];
-	half[2] = halfW[2];
-void TrbState::reset()
-#if 0
-	mSleepCount = 0;
-	mMotionType = PfxMotionTypeActive;
-	mDeleted = 0;
-	mSleeping = 0;
-	mUseSleep = 1;
-	mUseCcd = 0;
-	mUseContactCallback = 0;
-	mUseSleepCallback = 0;
-	mRigidBodyId = 0;
-	mContactFilterSelf = 0xffffffff;
-	mContactFilterTarget = 0xffffffff;
-	mLinearDamping = 1.0f;
-	mAngularDamping = 0.99f;
-	mPosition = vmVector3(0.0f);
-	mOrientation = vmQuat::identity();
-	mLinearVelocity = vmVector3(0.0f);
-	mAngularVelocity = vmVector3(0.0f);
-	setMotionType(PfxMotionTypeActive);
-	contactFilterSelf=contactFilterTarget=0xffffffff;
-	deleted = 0;
-	mSleeping = 0;
-	useSleep = 1;
-	trbBodyIdx=0;
-	mSleepCount=0;
-	useCcd = 0;
-	useContactCallback = 0;
-	useSleepCallback = 0;
-	linearDamping = 1.0f;
-	angularDamping = 0.99f;
-#endif //BT_TRBSTATEVEC_H__
diff --git a/src/bullet/BulletMultiThreaded/Win32ThreadSupport.cpp b/src/bullet/BulletMultiThreaded/Win32ThreadSupport.cpp
deleted file mode 100644
index 1197bbe0..00000000
--- a/src/bullet/BulletMultiThreaded/Win32ThreadSupport.cpp
+++ /dev/null
@@ -1,446 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "Win32ThreadSupport.h"
-#include <windows.h>
-#include "SpuCollisionTaskProcess.h"
-#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
-///The number of threads should be equal to the number of available cores
-///@todo: each worker should be linked to a single core, using SetThreadIdealProcessor.
-///Win32ThreadSupport helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
-///Setup and initialize SPU/CELL/Libspe2
-Win32ThreadSupport::Win32ThreadSupport(const Win32ThreadConstructionInfo & threadConstructionInfo)
-	m_maxNumTasks = threadConstructionInfo.m_numThreads;
-	startThreads(threadConstructionInfo);
-///cleanup/shutdown Libspe2
-	stopSPU();
-#include <stdio.h>
-DWORD WINAPI Thread_no_1( LPVOID lpParam ) 
-	Win32ThreadSupport::btSpuStatus* status = (Win32ThreadSupport::btSpuStatus*)lpParam;
-	while (1)
-	{
-		WaitForSingleObject(status->m_eventStartHandle,INFINITE);
-		void* userPtr = status->m_userPtr;
-		if (userPtr)
-		{
-			btAssert(status->m_status);
-			status->m_userThreadFunc(userPtr,status->m_lsMemory);
-			status->m_status = 2;
-			SetEvent(status->m_eventCompletetHandle);
-		} else
-		{
-			//exit Thread
-			status->m_status = 3;
-			printf("Thread with taskId %i with handle %p exiting\n",status->m_taskId, status->m_threadHandle);
-			SetEvent(status->m_eventCompletetHandle);
-			break;
-		}
-	}
-	printf("Thread TERMINATED\n");
-	return 0;
-///send messages to SPUs
-void Win32ThreadSupport::sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t taskId)
-	///	gMidphaseSPU.sendRequest(CMD_GATHER_AND_PROCESS_PAIRLIST, (ppu_address_t) &taskDesc);
-	///we should spawn an SPU task here, and in 'waitForResponse' it should wait for response of the (one of) the first tasks that finished
-	switch (uiCommand)
-	{
-		{
-//#define SINGLE_THREADED 1
-			btSpuStatus&	spuStatus = m_activeSpuStatus[0];
-			spuStatus.m_userPtr=(void*)uiArgument0;
-			spuStatus.m_userThreadFunc(spuStatus.m_userPtr,spuStatus.m_lsMemory);
-			HANDLE handle =0;
-			btSpuStatus&	spuStatus = m_activeSpuStatus[taskId];
-			btAssert(taskId>=0);
-			btAssert(int(taskId)<m_activeSpuStatus.size());
-			spuStatus.m_commandId = uiCommand;
-			spuStatus.m_status = 1;
-			spuStatus.m_userPtr = (void*)uiArgument0;
-			///fire event to start new task
-			SetEvent(spuStatus.m_eventStartHandle);
-#endif //CollisionTask_LocalStoreMemory
-			break;
-		}
-	default:
-		{
-			///not implemented
-			btAssert(0);
-		}
-	};
-///check for messages from SPUs
-void Win32ThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
-	///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response
-	///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback'
-	btAssert(m_activeSpuStatus.size());
-	int last = -1;
-	DWORD res = WaitForMultipleObjects(m_completeHandles.size(), &m_completeHandles[0], FALSE, INFINITE);
-	btAssert(res != WAIT_FAILED);
-	last = res - WAIT_OBJECT_0;
-	btSpuStatus& spuStatus = m_activeSpuStatus[last];
-	btAssert(spuStatus.m_threadHandle);
-	btAssert(spuStatus.m_eventCompletetHandle);
-	//WaitForSingleObject(spuStatus.m_eventCompletetHandle, INFINITE);
-	btAssert(spuStatus.m_status > 1);
-	spuStatus.m_status = 0;
-	///need to find an active spu
-	btAssert(last>=0);
-	last=0;
-	btSpuStatus& spuStatus = m_activeSpuStatus[last];
-	*puiArgument0 = spuStatus.m_taskId;
-	*puiArgument1 = spuStatus.m_status;
-///check for messages from SPUs
-bool Win32ThreadSupport::isTaskCompleted(unsigned int *puiArgument0, unsigned int *puiArgument1, int timeOutInMilliseconds)
-	///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response
-	///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback'
-	btAssert(m_activeSpuStatus.size());
-	int last = -1;
-	DWORD res = WaitForMultipleObjects(m_completeHandles.size(), &m_completeHandles[0], FALSE, timeOutInMilliseconds);
-	if ((res != STATUS_TIMEOUT) && (res != WAIT_FAILED))
-	{
-		btAssert(res != WAIT_FAILED);
-		last = res - WAIT_OBJECT_0;
-		btSpuStatus& spuStatus = m_activeSpuStatus[last];
-		btAssert(spuStatus.m_threadHandle);
-		btAssert(spuStatus.m_eventCompletetHandle);
-		//WaitForSingleObject(spuStatus.m_eventCompletetHandle, INFINITE);
-		btAssert(spuStatus.m_status > 1);
-		spuStatus.m_status = 0;
-		///need to find an active spu
-		btAssert(last>=0);
-	#else
-		last=0;
-		btSpuStatus& spuStatus = m_activeSpuStatus[last];
-		*puiArgument0 = spuStatus.m_taskId;
-		*puiArgument1 = spuStatus.m_status;
-		return true;
-	} 
-	return false;
-void Win32ThreadSupport::startThreads(const Win32ThreadConstructionInfo& threadConstructionInfo)
-	m_activeSpuStatus.resize(threadConstructionInfo.m_numThreads);
-	m_completeHandles.resize(threadConstructionInfo.m_numThreads);
-	m_maxNumTasks = threadConstructionInfo.m_numThreads;
-	for (int i=0;i<threadConstructionInfo.m_numThreads;i++)
-	{
-		printf("starting thread %d\n",i);
-		btSpuStatus&	spuStatus = m_activeSpuStatus[i];
-		SIZE_T dwStackSize=threadConstructionInfo.m_threadStackSize;
-		LPTHREAD_START_ROUTINE lpStartAddress=&Thread_no_1;
-		LPVOID lpParameter=&spuStatus;
-		DWORD dwCreationFlags=0;
-		LPDWORD lpThreadId=0;
-		spuStatus.m_userPtr=0;
-		sprintf(spuStatus.m_eventStartHandleName,"eventStart%s%d",threadConstructionInfo.m_uniqueName,i);
-		spuStatus.m_eventStartHandle = CreateEventA (0,false,false,spuStatus.m_eventStartHandleName);
-		sprintf(spuStatus.m_eventCompletetHandleName,"eventComplete%s%d",threadConstructionInfo.m_uniqueName,i);
-		spuStatus.m_eventCompletetHandle = CreateEventA (0,false,false,spuStatus.m_eventCompletetHandleName);
-		m_completeHandles[i] = spuStatus.m_eventCompletetHandle;
-		HANDLE handle = CreateThread(lpThreadAttributes,dwStackSize,lpStartAddress,lpParameter,	dwCreationFlags,lpThreadId);
-		SetThreadPriority(handle,THREAD_PRIORITY_HIGHEST);
-		//SetThreadPriority(handle,THREAD_PRIORITY_TIME_CRITICAL);
-		SetThreadAffinityMask(handle, 1<<i);
-		spuStatus.m_taskId = i;
-		spuStatus.m_commandId = 0;
-		spuStatus.m_status = 0;
-		spuStatus.m_threadHandle = handle;
-		spuStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
-		spuStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
-		printf("started thread %d with threadHandle %p\n",i,handle);
-	}
-void Win32ThreadSupport::startSPU()
-///tell the task scheduler we are done with the SPU tasks
-void Win32ThreadSupport::stopSPU()
-	int i;
-	for (i=0;i<m_activeSpuStatus.size();i++)
-	{
-		btSpuStatus& spuStatus = m_activeSpuStatus[i];
-		if (spuStatus.m_status>0)
-		{
-			WaitForSingleObject(spuStatus.m_eventCompletetHandle, INFINITE);
-		}
-		spuStatus.m_userPtr = 0;
-		SetEvent(spuStatus.m_eventStartHandle);
-		WaitForSingleObject(spuStatus.m_eventCompletetHandle, INFINITE);
-		CloseHandle(spuStatus.m_eventCompletetHandle);
-		CloseHandle(spuStatus.m_eventStartHandle);
-		CloseHandle(spuStatus.m_threadHandle);
-	}
-	m_activeSpuStatus.clear();
-	m_completeHandles.clear();
-class btWin32Barrier : public btBarrier
-	CRITICAL_SECTION mExternalCriticalSection;
-	CRITICAL_SECTION mLocalCriticalSection;
-	HANDLE mRunEvent,mNotifyEvent;
-	int mCounter,mEnableCounter;
-	int mMaxCount;
-	btWin32Barrier()
-	{
-		mCounter = 0;
-		mMaxCount = 1;
-		mEnableCounter = 0;
-		InitializeCriticalSection(&mExternalCriticalSection);
-		InitializeCriticalSection(&mLocalCriticalSection);
-		mRunEvent = CreateEvent(NULL,TRUE,FALSE,NULL);
-		mNotifyEvent = CreateEvent(NULL,TRUE,FALSE,NULL);
-	}
-	virtual ~btWin32Barrier()
-	{
-		DeleteCriticalSection(&mExternalCriticalSection);
-		DeleteCriticalSection(&mLocalCriticalSection);
-		CloseHandle(mRunEvent);
-		CloseHandle(mNotifyEvent);
-	}
-	void sync()
-	{
-		int eventId;
-		EnterCriticalSection(&mExternalCriticalSection);
-		//PFX_PRINTF("enter taskId %d count %d stage %d phase %d mEnableCounter %d\n",taskId,mCounter,debug&0xff,debug>>16,mEnableCounter);
-		if(mEnableCounter > 0) {
-			ResetEvent(mNotifyEvent);
-			LeaveCriticalSection(&mExternalCriticalSection);
-			WaitForSingleObject(mNotifyEvent,INFINITE); 
-			EnterCriticalSection(&mExternalCriticalSection);
-		}
-		eventId = mCounter;
-		mCounter++;
-		if(eventId == mMaxCount-1) {
-			SetEvent(mRunEvent);
-			mEnableCounter = mCounter-1;
-			mCounter = 0;
-		}
-		else {
-			ResetEvent(mRunEvent);
-			LeaveCriticalSection(&mExternalCriticalSection);
-			WaitForSingleObject(mRunEvent,INFINITE); 
-			EnterCriticalSection(&mExternalCriticalSection);
-			mEnableCounter--;
-		}
-		if(mEnableCounter == 0) {
-			SetEvent(mNotifyEvent);
-		}
-		//PFX_PRINTF("leave taskId %d count %d stage %d phase %d mEnableCounter %d\n",taskId,mCounter,debug&0xff,debug>>16,mEnableCounter);
-		LeaveCriticalSection(&mExternalCriticalSection);
-	}
-	virtual void setMaxCount(int n) {mMaxCount = n;}
-	virtual int  getMaxCount() {return mMaxCount;}
-class btWin32CriticalSection : public btCriticalSection
-	CRITICAL_SECTION mCriticalSection;
-	btWin32CriticalSection()
-	{
-		InitializeCriticalSection(&mCriticalSection);
-	}
-	~btWin32CriticalSection()
-	{
-		DeleteCriticalSection(&mCriticalSection);
-	}
-	unsigned int getSharedParam(int i)
-	{
-		btAssert(i>=0&&i<31);
-		return mCommonBuff[i+1];
-	}
-	void setSharedParam(int i,unsigned int p)
-	{
-		btAssert(i>=0&&i<31);
-		mCommonBuff[i+1] = p;
-	}
-	void lock()
-	{
-		EnterCriticalSection(&mCriticalSection);
-		mCommonBuff[0] = 1;
-	}
-	void unlock()
-	{
-		mCommonBuff[0] = 0;
-		LeaveCriticalSection(&mCriticalSection);
-	}
-btBarrier*	Win32ThreadSupport::createBarrier()
-	unsigned char* mem = (unsigned char*)btAlignedAlloc(sizeof(btWin32Barrier),16);
-	btWin32Barrier* barrier = new(mem) btWin32Barrier();
-	barrier->setMaxCount(getNumTasks());
-	return barrier;
-btCriticalSection* Win32ThreadSupport::createCriticalSection()
-	unsigned char* mem = (unsigned char*) btAlignedAlloc(sizeof(btWin32CriticalSection),16);
-	btWin32CriticalSection* cs = new(mem) btWin32CriticalSection();
-	return cs;
diff --git a/src/bullet/BulletMultiThreaded/Win32ThreadSupport.h b/src/bullet/BulletMultiThreaded/Win32ThreadSupport.h
deleted file mode 100644
index abf5d21e..00000000
--- a/src/bullet/BulletMultiThreaded/Win32ThreadSupport.h
+++ /dev/null
@@ -1,138 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "LinearMath/btScalar.h"
-#include "PlatformDefinitions.h"
-#ifdef USE_WIN32_THREADING  //platform specific defines are defined in PlatformDefinitions.h
-#include "LinearMath/btAlignedObjectArray.h"
-#include "btThreadSupportInterface.h"
-typedef void (*Win32ThreadFunc)(void* userPtr,void* lsMemory);
-typedef void* (*Win32lsMemorySetupFunc)();
-///Win32ThreadSupport helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
-class Win32ThreadSupport : public btThreadSupportInterface 
-	///placeholder, until libspe2 support is there
-	struct	btSpuStatus
-	{
-		uint32_t	m_taskId;
-		uint32_t	m_commandId;
-		uint32_t	m_status;
-		Win32ThreadFunc	m_userThreadFunc;
-		void*	m_userPtr; //for taskDesc etc
-		void*	m_lsMemory; //initialized using Win32LocalStoreMemorySetupFunc
-		void*	m_threadHandle; //this one is calling 'Win32ThreadFunc'
-		void*	m_eventStartHandle;
-		char	m_eventStartHandleName[32];
-		void*	m_eventCompletetHandle;
-		char	m_eventCompletetHandleName[32];
-	};
-	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
-	btAlignedObjectArray<void*>			m_completeHandles;
-	int m_maxNumTasks;
-	///Setup and initialize SPU/CELL/Libspe2
-	struct	Win32ThreadConstructionInfo
-	{
-		Win32ThreadConstructionInfo(const char* uniqueName,
-									Win32ThreadFunc userThreadFunc,
-									Win32lsMemorySetupFunc	lsMemoryFunc,
-									int numThreads=1,
-									int threadStackSize=65535
-									)
-									:m_uniqueName(uniqueName),
-									m_userThreadFunc(userThreadFunc),
-									m_lsMemoryFunc(lsMemoryFunc),
-									m_numThreads(numThreads),
-									m_threadStackSize(threadStackSize)
-		{
-		}
-		const char*				m_uniqueName;
-		Win32ThreadFunc			m_userThreadFunc;
-		Win32lsMemorySetupFunc	m_lsMemoryFunc;
-		int						m_numThreads;
-		int						m_threadStackSize;
-	};
-	Win32ThreadSupport(const Win32ThreadConstructionInfo& threadConstructionInfo);
-///cleanup/shutdown Libspe2
-	virtual	~Win32ThreadSupport();
-	void	startThreads(const Win32ThreadConstructionInfo&	threadInfo);
-///send messages to SPUs
-	virtual	void sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t uiArgument1);
-///check for messages from SPUs
-	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
-	virtual bool isTaskCompleted(unsigned int *puiArgument0, unsigned int *puiArgument1, int timeOutInMilliseconds);
-///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
-	virtual	void startSPU();
-///tell the task scheduler we are done with the SPU tasks
-	virtual	void stopSPU();
-	virtual	void	setNumTasks(int numTasks)
-	{
-		m_maxNumTasks = numTasks;
-	}
-	virtual int getNumTasks() const
-	{
-		return m_maxNumTasks;
-	}
-	virtual void*	getThreadLocalMemory(int taskId)
-	{
-		return m_activeSpuStatus[taskId].m_lsMemory;
-	}
-	virtual btBarrier*	createBarrier();
-	virtual btCriticalSection* createCriticalSection();
diff --git a/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphase.cpp b/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphase.cpp
deleted file mode 100644
index 84a5e59f..00000000
--- a/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphase.cpp
+++ /dev/null
@@ -1,590 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
-Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-///The 3 following lines include the CPU implementation of the kernels, keep them in this order.
-#include "BulletMultiThreaded/btGpuDefines.h"
-#include "BulletMultiThreaded/btGpuUtilsSharedDefs.h"
-#include "BulletMultiThreaded/btGpuUtilsSharedCode.h"
-#include "LinearMath/btAlignedAllocator.h"
-#include "LinearMath/btQuickprof.h"
-#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
-#include "btGpuDefines.h"
-#include "btGpuUtilsSharedDefs.h"
-#include "btGpu3DGridBroadphaseSharedDefs.h"
-#include "btGpu3DGridBroadphase.h"
-#include <string.h> //for memset
-#include <stdio.h>
-static bt3DGridBroadphaseParams s3DGridBroadphaseParams;
-btGpu3DGridBroadphase::btGpu3DGridBroadphase(	const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
-										int gridSizeX, int gridSizeY, int gridSizeZ, 
-										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
-										int maxBodiesPerCell,
-										btScalar cellFactorAABB) :
-	btSimpleBroadphase(maxSmallProxies,
-//				     new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache),
-				     new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache),
-	m_bInitialized(false),
-    m_numBodies(0)
-	_initialize(worldAabbMin, worldAabbMax, gridSizeX, gridSizeY, gridSizeZ, 
-				maxSmallProxies, maxLargeProxies, maxPairsPerBody,
-				maxBodiesPerCell, cellFactorAABB);
-btGpu3DGridBroadphase::btGpu3DGridBroadphase(	btOverlappingPairCache* overlappingPairCache,
-										const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
-										int gridSizeX, int gridSizeY, int gridSizeZ, 
-										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
-										int maxBodiesPerCell,
-										btScalar cellFactorAABB) :
-	btSimpleBroadphase(maxSmallProxies, overlappingPairCache),
-	m_bInitialized(false),
-    m_numBodies(0)
-	_initialize(worldAabbMin, worldAabbMax, gridSizeX, gridSizeY, gridSizeZ, 
-				maxSmallProxies, maxLargeProxies, maxPairsPerBody,
-				maxBodiesPerCell, cellFactorAABB);
-	//btSimpleBroadphase will free memory of btSortedOverlappingPairCache, because m_ownsPairCache
-	assert(m_bInitialized);
-	_finalize();
-void btGpu3DGridBroadphase::_initialize(	const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
-										int gridSizeX, int gridSizeY, int gridSizeZ, 
-										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
-										int maxBodiesPerCell,
-										btScalar cellFactorAABB)
-	// set various paramerers
-	m_ownsPairCache = true;
-	m_params.m_gridSizeX = gridSizeX;
-	m_params.m_gridSizeY = gridSizeY;
-	m_params.m_gridSizeZ = gridSizeZ;
-	m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
-	btVector3 w_org = worldAabbMin;
-	m_params.m_worldOriginX = w_org.getX();
-	m_params.m_worldOriginY = w_org.getY();
-	m_params.m_worldOriginZ = w_org.getZ();
-	btVector3 w_size = worldAabbMax - worldAabbMin;
-	m_params.m_cellSizeX = w_size.getX() / m_params.m_gridSizeX;
-	m_params.m_cellSizeY = w_size.getY() / m_params.m_gridSizeY;
-	m_params.m_cellSizeZ = w_size.getZ() / m_params.m_gridSizeZ;
-	m_maxRadius = btMin(btMin(m_params.m_cellSizeX, m_params.m_cellSizeY), m_params.m_cellSizeZ);
-	m_maxRadius *= btScalar(0.5f);
-	m_params.m_numBodies = m_numBodies;
-	m_params.m_maxBodiesPerCell = maxBodiesPerCell;
-	m_numLargeHandles = 0;						
-	m_maxLargeHandles = maxLargeProxies;
-	m_maxPairsPerBody = maxPairsPerBody;
-	m_cellFactorAABB = cellFactorAABB;
-	m_LastLargeHandleIndex = -1;
-    assert(!m_bInitialized);
-    // allocate host storage
-    m_hBodiesHash = new unsigned int[m_maxHandles * 2];
-    memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int));
-    m_hCellStart = new unsigned int[m_params.m_numCells];
-    memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));
-	m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
-	// --------------- for now, init with m_maxPairsPerBody for each body
-	m_hPairBuffStartCurr[0] = 0;
-	m_hPairBuffStartCurr[1] = 0;
-	for(int i = 1; i <= m_maxHandles; i++) 
-	{
-		m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
-		m_hPairBuffStartCurr[i * 2 + 1] = 0;
-	}
-	//----------------
-	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
-	m_hAABB = new bt3DGrid3F1U[numAABB * 2]; // AABB Min & Max
-	m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody];
-	memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed?
-	m_hPairScan = new unsigned int[m_maxHandles + 1];
-	m_hPairOut = new unsigned int[m_maxHandles * m_maxPairsPerBody];
-// large proxies
-	// allocate handles buffer and put all handles on free list
-	m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
-	m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
-	m_firstFreeLargeHandle = 0;
-	{
-		for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
-		{
-			m_pLargeHandles[i].SetNextFree(i + 1);
-			m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
-		}
-		m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
-	}
-// debug data
-	m_numPairsAdded = 0;
-	m_numOverflows = 0;
-    m_bInitialized = true;
-void btGpu3DGridBroadphase::_finalize()
-    assert(m_bInitialized);
-    delete [] m_hBodiesHash;
-    delete [] m_hCellStart;
-    delete [] m_hPairBuffStartCurr;
-    delete [] m_hAABB;
-	delete [] m_hPairBuff;
-	delete [] m_hPairScan;
-	delete [] m_hPairOut;
-	btAlignedFree(m_pLargeHandlesRawPtr);
-	m_bInitialized = false;
-void btGpu3DGridBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher)
-	if(m_numHandles <= 0)
-	{
-		BT_PROFILE("addLarge2LargePairsToCache");
-		addLarge2LargePairsToCache(dispatcher);
-		return;
-	}
-	// update constants
-	setParameters(&m_params);
-	// prepare AABB array
-	prepareAABB();
-	// calculate hash
-	calcHashAABB();
-	// sort bodies based on hash
-	sortHash();
-	// find start of each cell
-	findCellStart();
-	// findOverlappingPairs (small/small)
-	findOverlappingPairs();
-	// findOverlappingPairs (small/large)
-	findPairsLarge();
-	// add pairs to CPU cache
-	computePairCacheChanges();
-	scanOverlappingPairBuff();
-	squeezeOverlappingPairBuff();
-	addPairsToCache(dispatcher);
-	// find and add large/large pairs to CPU cache
-	addLarge2LargePairsToCache(dispatcher);
-	return;
-void btGpu3DGridBroadphase::addPairsToCache(btDispatcher* dispatcher)
-	m_numPairsAdded = 0;
-	m_numPairsRemoved = 0;
-	for(int i = 0; i < m_numHandles; i++) 
-	{
-		unsigned int num = m_hPairScan[i+1] - m_hPairScan[i];
-		if(!num)
-		{
-			continue;
-		}
-		unsigned int* pInp = m_hPairOut + m_hPairScan[i];
-		unsigned int index0 = m_hAABB[i * 2].uw;
-		btSimpleBroadphaseProxy* proxy0 = &m_pHandles[index0];
-		for(unsigned int j = 0; j < num; j++)
-		{
-			unsigned int indx1_s = pInp[j];
-			unsigned int index1 = indx1_s & (~BT_3DGRID_PAIR_ANY_FLG);
-			btSimpleBroadphaseProxy* proxy1;
-			if(index1 < (unsigned int)m_maxHandles)
-			{
-				proxy1 = &m_pHandles[index1];
-			}
-			else
-			{
-				index1 -= m_maxHandles;
-				btAssert((index1 >= 0) && (index1 < (unsigned int)m_maxLargeHandles));
-				proxy1 = &m_pLargeHandles[index1];
-			}
-			if(indx1_s & BT_3DGRID_PAIR_NEW_FLG)
-			{
-				m_pairCache->addOverlappingPair(proxy0,proxy1);
-				m_numPairsAdded++;
-			}
-			else
-			{
-				m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
-				m_numPairsRemoved++;
-			}
-		}
-	}
-btBroadphaseProxy* btGpu3DGridBroadphase::createProxy(  const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy)
-	btBroadphaseProxy*  proxy;
-	bool bIsLarge = isLargeProxy(aabbMin, aabbMax);
-	if(bIsLarge)
-	{
-		if (m_numLargeHandles >= m_maxLargeHandles)
-		{
-			///you have to increase the cell size, so 'large' proxies become 'small' proxies (fitting a cell)
-			btAssert(0);
-			return 0; //should never happen, but don't let the game crash ;-)
-		}
-		btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2]));
-		int newHandleIndex = allocLargeHandle();
-		proxy = new (&m_pLargeHandles[newHandleIndex])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy);
-	}
-	else
-	{
-		proxy = btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy);
-	}
-	return proxy;
-void btGpu3DGridBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher)
-	bool bIsLarge = isLargeProxy(proxy);
-	if(bIsLarge)
-	{
-		btSimpleBroadphaseProxy* proxy0 = static_cast<btSimpleBroadphaseProxy*>(proxy);
-		freeLargeHandle(proxy0);
-		m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher);
-	}
-	else
-	{
-		btSimpleBroadphase::destroyProxy(proxy, dispatcher);
-	}
-	return;
-void btGpu3DGridBroadphase::resetPool(btDispatcher* dispatcher)
-	m_hPairBuffStartCurr[0] = 0;
-	m_hPairBuffStartCurr[1] = 0;
-	for(int i = 1; i <= m_maxHandles; i++) 
-	{
-		m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
-		m_hPairBuffStartCurr[i * 2 + 1] = 0;
-	}
-bool btGpu3DGridBroadphase::isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax)
-	btVector3 diag = aabbMax - aabbMin;
-	///use the bounding sphere radius of this bounding box, to include rotation
-	btScalar radius = diag.length() * btScalar(0.5f);
-	radius *= m_cellFactorAABB; // user-defined factor
-	return (radius > m_maxRadius);
-bool btGpu3DGridBroadphase::isLargeProxy(btBroadphaseProxy* proxy)
-	return (proxy->getUid() >= (m_maxHandles+2));
-void btGpu3DGridBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher)
-	int i,j;
-	if (m_numLargeHandles <= 0)
-	{
-		return;
-	}
-	int new_largest_index = -1;
-	for(i = 0; i <= m_LastLargeHandleIndex; i++)
-	{
-		btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
-		if(!proxy0->m_clientObject)
-		{
-			continue;
-		}
-		new_largest_index = i;
-		for(j = i + 1; j <= m_LastLargeHandleIndex; j++)
-		{
-			btSimpleBroadphaseProxy* proxy1 = &m_pLargeHandles[j];
-			if(!proxy1->m_clientObject)
-			{
-				continue;
-			}
-			btAssert(proxy0 != proxy1);
-			btSimpleBroadphaseProxy* p0 = getSimpleProxyFromProxy(proxy0);
-			btSimpleBroadphaseProxy* p1 = getSimpleProxyFromProxy(proxy1);
-			if(aabbOverlap(p0,p1))
-			{
-				if (!m_pairCache->findPair(proxy0,proxy1))
-				{
-					m_pairCache->addOverlappingPair(proxy0,proxy1);
-				}
-			} 
-			else
-			{
-				if(m_pairCache->findPair(proxy0,proxy1))
-				{
-					m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
-				}
-			}
-		}
-	}
-	m_LastLargeHandleIndex = new_largest_index;
-	return;
-void btGpu3DGridBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback)
-	btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback);
-	for (int i=0; i <= m_LastLargeHandleIndex; i++)
-	{
-		btSimpleBroadphaseProxy* proxy = &m_pLargeHandles[i];
-		if(!proxy->m_clientObject)
-		{
-			continue;
-		}
-		rayCallback.process(proxy);
-	}
-// overrides for CPU version
-void btGpu3DGridBroadphase::prepareAABB()
-	BT_PROFILE("prepareAABB");
-	bt3DGrid3F1U* pBB = m_hAABB;
-	int i;
-	int new_largest_index = -1;
-	unsigned int num_small = 0;
-	for(i = 0; i <= m_LastHandleIndex; i++) 
-	{
-		btSimpleBroadphaseProxy* proxy0 = &m_pHandles[i];
-		if(!proxy0->m_clientObject)
-		{
-			continue;
-		}
-		new_largest_index = i;
-		pBB->fx = proxy0->m_aabbMin.getX();
-		pBB->fy = proxy0->m_aabbMin.getY();
-		pBB->fz = proxy0->m_aabbMin.getZ();
-		pBB->uw = i;
-		pBB++;
-		pBB->fx = proxy0->m_aabbMax.getX();
-		pBB->fy = proxy0->m_aabbMax.getY();
-		pBB->fz = proxy0->m_aabbMax.getZ();
-		pBB->uw = num_small;
-		pBB++;
-		num_small++;
-	}
-	m_LastHandleIndex = new_largest_index;
-	new_largest_index = -1;
-	unsigned int num_large = 0;
-	for(i = 0; i <= m_LastLargeHandleIndex; i++) 
-	{
-		btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
-		if(!proxy0->m_clientObject)
-		{
-			continue;
-		}
-		new_largest_index = i;
-		pBB->fx = proxy0->m_aabbMin.getX();
-		pBB->fy = proxy0->m_aabbMin.getY();
-		pBB->fz = proxy0->m_aabbMin.getZ();
-		pBB->uw = i + m_maxHandles;
-		pBB++;
-		pBB->fx = proxy0->m_aabbMax.getX();
-		pBB->fy = proxy0->m_aabbMax.getY();
-		pBB->fz = proxy0->m_aabbMax.getZ();
-		pBB->uw = num_large + m_maxHandles;
-		pBB++;
-		num_large++;
-	}
-	m_LastLargeHandleIndex = new_largest_index;
-	// paranoid checks
-	btAssert(num_small == m_numHandles);
-	btAssert(num_large == m_numLargeHandles);
-	return;
-void btGpu3DGridBroadphase::setParameters(bt3DGridBroadphaseParams* hostParams)
-	s3DGridBroadphaseParams = *hostParams;
-	return;
-void btGpu3DGridBroadphase::calcHashAABB()
-	BT_PROFILE("bt3DGrid_calcHashAABB");
-	btGpu_calcHashAABB(m_hAABB, m_hBodiesHash, m_numHandles);
-	return;
-void btGpu3DGridBroadphase::sortHash()
-	class bt3DGridHashKey
-	{
-	public:
-	   unsigned int hash;
-	   unsigned int index;
-	   void quickSort(bt3DGridHashKey* pData, int lo, int hi)
-	   {
-			int i=lo, j=hi;
-			bt3DGridHashKey x = pData[(lo+hi)/2];
-			do
-			{    
-				while(pData[i].hash > x.hash) i++; 
-				while(x.hash > pData[j].hash) j--;
-				if(i <= j)
-				{
-					bt3DGridHashKey t = pData[i];
-					pData[i] = pData[j];
-					pData[j] = t;
-					i++; j--;
-				}
-			} while(i <= j);
-			if(lo < j) pData->quickSort(pData, lo, j);
-			if(i < hi) pData->quickSort(pData, i, hi);
-	   }
-	};
-	BT_PROFILE("bt3DGrid_sortHash");
-	bt3DGridHashKey* pHash = (bt3DGridHashKey*)m_hBodiesHash;
-	pHash->quickSort(pHash, 0, m_numHandles - 1);
-	return;
-void btGpu3DGridBroadphase::findCellStart()
-	BT_PROFILE("bt3DGrid_findCellStart");
-	btGpu_findCellStart(m_hBodiesHash, m_hCellStart, m_numHandles, m_params.m_numCells);
-	return;
-void btGpu3DGridBroadphase::findOverlappingPairs()
-	BT_PROFILE("bt3DGrid_findOverlappingPairs");
-	btGpu_findOverlappingPairs(m_hAABB, m_hBodiesHash, m_hCellStart, m_hPairBuff, m_hPairBuffStartCurr, m_numHandles);
-	return;
-void btGpu3DGridBroadphase::findPairsLarge()
-	BT_PROFILE("bt3DGrid_findPairsLarge");
-	btGpu_findPairsLarge(m_hAABB, m_hBodiesHash, m_hCellStart, m_hPairBuff, m_hPairBuffStartCurr,	m_numHandles, m_numLargeHandles);
-	return;
-void btGpu3DGridBroadphase::computePairCacheChanges()
-	BT_PROFILE("bt3DGrid_computePairCacheChanges");
-	btGpu_computePairCacheChanges(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScan, m_hAABB, m_numHandles);
-	return;
-void btGpu3DGridBroadphase::scanOverlappingPairBuff()
-	BT_PROFILE("bt3DGrid_scanOverlappingPairBuff");
-	m_hPairScan[0] = 0;
-	for(int i = 1; i <= m_numHandles; i++) 
-	{
-		unsigned int delta = m_hPairScan[i];
-		m_hPairScan[i] = m_hPairScan[i-1] + delta;
-	}
-	return;
-void btGpu3DGridBroadphase::squeezeOverlappingPairBuff()
-	BT_PROFILE("bt3DGrid_squeezeOverlappingPairBuff");
-	btGpu_squeezeOverlappingPairBuff(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScan, m_hPairOut, m_hAABB, m_numHandles);
-	return;
-#include "btGpu3DGridBroadphaseSharedCode.h"
diff --git a/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphase.h b/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphase.h
deleted file mode 100644
index 1d49a055..00000000
--- a/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphase.h
+++ /dev/null
@@ -1,138 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
-Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
-#include "btGpu3DGridBroadphaseSharedTypes.h"
-///The btGpu3DGridBroadphase uses GPU-style code compiled for CPU to compute overlapping pairs
-class btGpu3DGridBroadphase : public btSimpleBroadphase
-	bool			m_bInitialized;
-    unsigned int	m_numBodies;
-    unsigned int	m_numCells;
-	unsigned int	m_maxPairsPerBody;
-	btScalar		m_cellFactorAABB;
-    unsigned int	m_maxBodiesPerCell;
-	bt3DGridBroadphaseParams m_params;
-	btScalar		m_maxRadius;
-	// CPU data
-    unsigned int*	m_hBodiesHash;
-    unsigned int*	m_hCellStart;
-	unsigned int*	m_hPairBuffStartCurr;
-	bt3DGrid3F1U*		m_hAABB;
-	unsigned int*	m_hPairBuff;
-	unsigned int*	m_hPairScan;
-	unsigned int*	m_hPairOut;
-// large proxies
-	int		m_numLargeHandles;						
-	int		m_maxLargeHandles;						
-	int		m_LastLargeHandleIndex;							
-	btSimpleBroadphaseProxy* m_pLargeHandles;
-	void* m_pLargeHandlesRawPtr;
-	int		m_firstFreeLargeHandle;
-	int allocLargeHandle()
-	{
-		btAssert(m_numLargeHandles < m_maxLargeHandles);
-		int freeLargeHandle = m_firstFreeLargeHandle;
-		m_firstFreeLargeHandle = m_pLargeHandles[freeLargeHandle].GetNextFree();
-		m_numLargeHandles++;
-		if(freeLargeHandle > m_LastLargeHandleIndex)
-		{
-			m_LastLargeHandleIndex = freeLargeHandle;
-		}
-		return freeLargeHandle;
-	}
-	void freeLargeHandle(btSimpleBroadphaseProxy* proxy)
-	{
-		int handle = int(proxy - m_pLargeHandles);
-		btAssert((handle >= 0) && (handle < m_maxHandles));
-		if(handle == m_LastLargeHandleIndex)
-		{
-			m_LastLargeHandleIndex--;
-		}
-		proxy->SetNextFree(m_firstFreeLargeHandle);
-		m_firstFreeLargeHandle = handle;
-		proxy->m_clientObject = 0;
-		m_numLargeHandles--;
-	}
-	bool isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax);
-	bool isLargeProxy(btBroadphaseProxy* proxy);
-// debug
-	unsigned int	m_numPairsAdded;
-	unsigned int	m_numPairsRemoved;
-	unsigned int	m_numOverflows;
-	btGpu3DGridBroadphase(const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
-					   int gridSizeX, int gridSizeY, int gridSizeZ, 
-					   int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
-					   int maxBodiesPerCell = 8,
-					   btScalar cellFactorAABB = btScalar(1.0f));
-	btGpu3DGridBroadphase(	btOverlappingPairCache* overlappingPairCache,
-						const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
-						int gridSizeX, int gridSizeY, int gridSizeZ, 
-						int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
-						int maxBodiesPerCell = 8,
-						btScalar cellFactorAABB = btScalar(1.0f));
-	virtual ~btGpu3DGridBroadphase();
-	virtual void	calculateOverlappingPairs(btDispatcher* dispatcher);
-	virtual btBroadphaseProxy*	createProxy(const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy);
-	virtual void	destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher);
-	virtual void	rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback);
-	virtual void	resetPool(btDispatcher* dispatcher);
-	void _initialize(	const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
-						int gridSizeX, int gridSizeY, int gridSizeZ, 
-						int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
-						int maxBodiesPerCell = 8,
-						btScalar cellFactorAABB = btScalar(1.0f));
-	void _finalize();
-	void addPairsToCache(btDispatcher* dispatcher);
-	void addLarge2LargePairsToCache(btDispatcher* dispatcher);
-// overrides for CPU version
-	virtual void setParameters(bt3DGridBroadphaseParams* hostParams);
-	virtual void prepareAABB();
-	virtual void calcHashAABB();
-	virtual void sortHash();	
-	virtual void findCellStart();
-	virtual void findOverlappingPairs();
-	virtual void findPairsLarge();
-	virtual void computePairCacheChanges();
-	virtual void scanOverlappingPairBuff();
-	virtual void squeezeOverlappingPairBuff();
diff --git a/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphaseSharedCode.h b/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphaseSharedCode.h
deleted file mode 100644
index e0afb87b..00000000
--- a/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphaseSharedCode.h
+++ /dev/null
@@ -1,430 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
-Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-//               K E R N E L    F U N C T I O N S 
-// calculate position in uniform grid
-BT_GPU___device__ int3 bt3DGrid_calcGridPos(float4 p)
-    int3 gridPos;
-    gridPos.x = (int)floor((p.x - BT_GPU_params.m_worldOriginX) / BT_GPU_params.m_cellSizeX);
-    gridPos.y = (int)floor((p.y - BT_GPU_params.m_worldOriginY) / BT_GPU_params.m_cellSizeY);
-    gridPos.z = (int)floor((p.z - BT_GPU_params.m_worldOriginZ) / BT_GPU_params.m_cellSizeZ);
-    return gridPos;
-} // bt3DGrid_calcGridPos()
-// calculate address in grid from position (clamping to edges)
-BT_GPU___device__ uint bt3DGrid_calcGridHash(int3 gridPos)
-    gridPos.x = BT_GPU_max(0, BT_GPU_min(gridPos.x, (int)BT_GPU_params.m_gridSizeX - 1));
-    gridPos.y = BT_GPU_max(0, BT_GPU_min(gridPos.y, (int)BT_GPU_params.m_gridSizeY - 1));
-    gridPos.z = BT_GPU_max(0, BT_GPU_min(gridPos.z, (int)BT_GPU_params.m_gridSizeZ - 1));
-    return BT_GPU___mul24(BT_GPU___mul24(gridPos.z, BT_GPU_params.m_gridSizeY), BT_GPU_params.m_gridSizeX) + BT_GPU___mul24(gridPos.y, BT_GPU_params.m_gridSizeX) + gridPos.x;
-} // bt3DGrid_calcGridHash()
-// calculate grid hash value for each body using its AABB
-BT_GPU___global__ void calcHashAABBD(bt3DGrid3F1U* pAABB, uint2* pHash, uint numBodies)
-    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
-    if(index >= (int)numBodies)
-	{
-		return;
-	}
-	bt3DGrid3F1U bbMin = pAABB[index*2];
-	bt3DGrid3F1U bbMax = pAABB[index*2 + 1];
-	float4 pos;
-	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
-	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
-	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
-    // get address in grid
-    int3 gridPos = bt3DGrid_calcGridPos(pos);
-    uint gridHash = bt3DGrid_calcGridHash(gridPos);
-    // store grid hash and body index
-    pHash[index] = BT_GPU_make_uint2(gridHash, index);
-} // calcHashAABBD()
-BT_GPU___global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
-    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
-    if(index >= (int)numBodies)
-	{
-		return;
-	}
-    uint2 sortedData = pHash[index];
-	// Load hash data into shared memory so that we can look 
-	// at neighboring body's hash value without loading
-	// two hash values per thread
-	BT_GPU___shared__ uint sharedHash[257];
-	sharedHash[BT_GPU_threadIdx.x+1] = sortedData.x;
-	if((index > 0) && (BT_GPU_threadIdx.x == 0))
-	{
-		// first thread in block must load neighbor body hash
-		volatile uint2 prevData = pHash[index-1];
-		sharedHash[0] = prevData.x;
-	}
-	BT_GPU___syncthreads();
-	if((index == 0) || (sortedData.x != sharedHash[BT_GPU_threadIdx.x]))
-	{
-		cellStart[sortedData.x] = index;
-	}
-} // findCellStartD()
-BT_GPU___device__ uint cudaTestAABBOverlap(bt3DGrid3F1U min0, bt3DGrid3F1U max0, bt3DGrid3F1U min1, bt3DGrid3F1U max1)
-	return	(min0.fx <= max1.fx)&& (min1.fx <= max0.fx) && 
-			(min0.fy <= max1.fy)&& (min1.fy <= max0.fy) && 
-			(min0.fz <= max1.fz)&& (min1.fz <= max0.fz); 
-} // cudaTestAABBOverlap()
-BT_GPU___device__ void findPairsInCell(	int3	gridPos,
-										uint    index,
-										uint2*  pHash,
-										uint*   pCellStart,
-										bt3DGrid3F1U* pAABB, 
-										uint*   pPairBuff,
-										uint2*	pPairBuffStartCurr,
-										uint	numBodies)
-    if (	(gridPos.x < 0) || (gridPos.x > (int)BT_GPU_params.m_gridSizeX - 1)
-		||	(gridPos.y < 0) || (gridPos.y > (int)BT_GPU_params.m_gridSizeY - 1)
-		||  (gridPos.z < 0) || (gridPos.z > (int)BT_GPU_params.m_gridSizeZ - 1)) 
-    {
-		return;
-	}
-    uint gridHash = bt3DGrid_calcGridHash(gridPos);
-    // get start of bucket for this cell
-    uint bucketStart = pCellStart[gridHash];
-    if (bucketStart == 0xffffffff)
-	{
-        return;   // cell empty
-	}
-	// iterate over bodies in this cell
-    uint2 sortedData = pHash[index];
-	uint unsorted_indx = sortedData.y;
-    bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2); 
-	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
-	uint handleIndex =  min0.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
-	uint curr_max = start_curr_next.x - start - 1;
-	uint bucketEnd = bucketStart + BT_GPU_params.m_maxBodiesPerCell;
-	bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
-	for(uint index2 = bucketStart; index2 < bucketEnd; index2++) 
-	{
-        uint2 cellData = pHash[index2];
-        if (cellData.x != gridHash)
-        {
-			break;   // no longer in same bucket
-		}
-		uint unsorted_indx2 = cellData.y;
-        if (unsorted_indx2 < unsorted_indx) // check not colliding with self
-        {   
-			bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2);
-			bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2 + 1);
-			if(cudaTestAABBOverlap(min0, max0, min1, max1))
-			{
-				uint handleIndex2 = min1.uw;
-				uint k;
-				for(k = 0; k < curr; k++)
-				{
-					uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
-					if(old_pair == handleIndex2)
-					{
-						pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
-						break;
-					}
-				}
-				if(k == curr)
-				{
-					if(curr >= curr_max) 
-					{ // not a good solution, but let's avoid crash
-						break;
-					}
-					pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
-					curr++;
-				}
-			}
-		}
-	}
-	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
-    return;
-} // findPairsInCell()
-BT_GPU___global__ void findOverlappingPairsD(	bt3DGrid3F1U*	pAABB, uint2* pHash, uint* pCellStart, 
-												uint* pPairBuff, uint2* pPairBuffStartCurr, uint numBodies)
-    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
-    if(index >= (int)numBodies)
-	{
-		return;
-	}
-    uint2 sortedData = pHash[index];
-	uint unsorted_indx = sortedData.y;
-	bt3DGrid3F1U bbMin = BT_GPU_FETCH(pAABB, unsorted_indx*2);
-	bt3DGrid3F1U bbMax = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
-	float4 pos;
-	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
-	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
-	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
-    // get address in grid
-    int3 gridPos = bt3DGrid_calcGridPos(pos);
-    // examine only neighbouring cells
-    for(int z=-1; z<=1; z++) {
-        for(int y=-1; y<=1; y++) {
-            for(int x=-1; x<=1; x++) {
-                findPairsInCell(gridPos + BT_GPU_make_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
-            }
-        }
-    }
-} // findOverlappingPairsD()
-BT_GPU___global__ void findPairsLargeD(	bt3DGrid3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
-										uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
-    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
-    if(index >= (int)numBodies)
-	{
-		return;
-	}
-    uint2 sortedData = pHash[index];
-	uint unsorted_indx = sortedData.y;
-	bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2);
-	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
-	uint handleIndex =  min0.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
-	uint curr_max = start_curr_next.x - start - 1;
-    for(uint i = 0; i < numLarge; i++)
-    {
-		uint indx2 = numBodies + i;
-		bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, indx2*2);
-		bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, indx2*2 + 1);
-		if(cudaTestAABBOverlap(min0, max0, min1, max1))
-		{
-			uint k;
-			uint handleIndex2 =  min1.uw;
-			for(k = 0; k < curr; k++)
-			{
-				uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
-				if(old_pair == handleIndex2)
-				{
-					pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
-					break;
-				}
-			}
-			if(k == curr)
-			{
-				pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
-				if(curr >= curr_max) 
-				{ // not a good solution, but let's avoid crash
-					break;
-				}
-				curr++;
-			}
-		}
-    }
-	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
-    return;
-} // findPairsLargeD()
-BT_GPU___global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, 
-												uint* pPairScan, bt3DGrid3F1U* pAABB, uint numBodies)
-    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
-    if(index >= (int)numBodies)
-	{
-		return;
-	}
-	bt3DGrid3F1U bbMin = pAABB[index * 2];
-	uint handleIndex = bbMin.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint *pInp = pPairBuff + start;
-	uint num_changes = 0;
-	for(uint k = 0; k < curr; k++, pInp++)
-	{
-		if(!((*pInp) & BT_3DGRID_PAIR_FOUND_FLG))
-		{
-			num_changes++;
-		}
-	}
-	pPairScan[index+1] = num_changes;
-} // computePairCacheChangesD()
-BT_GPU___global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan,
-												   uint* pPairOut, bt3DGrid3F1U* pAABB, uint numBodies)
-    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
-    if(index >= (int)numBodies)
-	{
-		return;
-	}
-	bt3DGrid3F1U bbMin = pAABB[index * 2];
-	uint handleIndex = bbMin.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint* pInp = pPairBuff + start;
-	uint* pOut = pPairOut + pPairScan[index];
-	uint* pOut2 = pInp;
-	uint num = 0; 
-	for(uint k = 0; k < curr; k++, pInp++)
-	{
-		if(!((*pInp) & BT_3DGRID_PAIR_FOUND_FLG))
-		{
-			*pOut = *pInp;
-			pOut++;
-		}
-		if((*pInp) & BT_3DGRID_PAIR_ANY_FLG)
-		{
-			*pOut2 = (*pInp) & (~BT_3DGRID_PAIR_ANY_FLG);
-			pOut2++;
-			num++;
-		}
-	}
-	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, num);
-} // squeezeOverlappingPairBuffD()
-//               E N D   O F    K E R N E L    F U N C T I O N S 
-extern "C"
-void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash,	unsigned int numBodies)
-    int numThreads, numBlocks;
-    BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
-    // execute the kernel
-    BT_GPU_EXECKERNEL(numBlocks, numThreads, calcHashAABBD, (pAABB, (uint2*)hash, numBodies));
-    // check if kernel invocation generated an error
-    BT_GPU_CHECK_ERROR("calcHashAABBD kernel execution failed");
-} // calcHashAABB()
-void BT_GPU_PREF(findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells))
-    int numThreads, numBlocks;
-    BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
-	BT_GPU_SAFE_CALL(BT_GPU_Memset(cellStart, 0xffffffff, numCells*sizeof(uint)));
-	BT_GPU_EXECKERNEL(numBlocks, numThreads, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies));
-    BT_GPU_CHECK_ERROR("Kernel execution failed: findCellStartD");
-} // findCellStart()
-void BT_GPU_PREF(findOverlappingPairs(bt3DGrid3F1U* pAABB, unsigned int* pHash,	unsigned int* pCellStart, unsigned int*	pPairBuff, unsigned int*	pPairBuffStartCurr, unsigned int	numBodies))
-    BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(bt3DGrid3F1U)));
-    int numThreads, numBlocks;
-    BT_GPU_PREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
-    BT_GPU_EXECKERNEL(numBlocks, numThreads, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies));
-    BT_GPU_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
-    BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
-} // findOverlappingPairs()
-void BT_GPU_PREF(findPairsLarge(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge))
-    BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(bt3DGrid3F1U)));
-    int numThreads, numBlocks;
-    BT_GPU_PREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
-    BT_GPU_EXECKERNEL(numBlocks, numThreads, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge));
-    BT_GPU_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
-    BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
-} // findPairsLarge()
-void BT_GPU_PREF(computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies))
-    int numThreads, numBlocks;
-    BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
-    BT_GPU_EXECKERNEL(numBlocks, numThreads, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies));
-    BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
-} // computePairCacheChanges()
-void BT_GPU_PREF(squeezeOverlappingPairBuff(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies))
-    int numThreads, numBlocks;
-    BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
-    BT_GPU_EXECKERNEL(numBlocks, numThreads, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint*)pPairOut,pAABB,numBodies));
-    BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
-} // btCuda_squeezeOverlappingPairBuff()
-} // extern "C"
diff --git a/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphaseSharedDefs.h b/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphaseSharedDefs.h
deleted file mode 100644
index 607bda7e..00000000
--- a/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphaseSharedDefs.h
+++ /dev/null
@@ -1,61 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
-Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-// Shared definitions for GPU-based 3D Grid collision detection broadphase
-//  Keep this file free from Bullet headers
-//  it is included into both CUDA and CPU code
-#include "btGpu3DGridBroadphaseSharedTypes.h"
-extern "C"
-void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash,	unsigned int numBodies);
-void BT_GPU_PREF(findCellStart)(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells);
-void BT_GPU_PREF(findOverlappingPairs)(bt3DGrid3F1U* pAABB, unsigned int* pHash,	unsigned int* pCellStart, unsigned int*	pPairBuff, unsigned int*	pPairBuffStartCurr, unsigned int	numBodies);
-void BT_GPU_PREF(findPairsLarge)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge);
-void BT_GPU_PREF(computePairCacheChanges)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies);
-void BT_GPU_PREF(squeezeOverlappingPairBuff)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies);
-} // extern "C"
diff --git a/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphaseSharedTypes.h b/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphaseSharedTypes.h
deleted file mode 100644
index 616a4009..00000000
--- a/src/bullet/BulletMultiThreaded/btGpu3DGridBroadphaseSharedTypes.h
+++ /dev/null
@@ -1,67 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
-Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-// Shared definitions for GPU-based 3D Grid collision detection broadphase
-//  Keep this file free from Bullet headers
-//  it is included into both CUDA and CPU code
-#define BT_3DGRID_PAIR_FOUND_FLG (0x40000000)
-#define BT_3DGRID_PAIR_NEW_FLG   (0x20000000)
-struct bt3DGridBroadphaseParams 
-	unsigned int	m_gridSizeX;
-	unsigned int	m_gridSizeY;
-	unsigned int	m_gridSizeZ;
-	unsigned int	m_numCells;
-	float			m_worldOriginX;
-	float			m_worldOriginY;
-	float			m_worldOriginZ;
-	float			m_cellSizeX;
-	float			m_cellSizeY;
-	float			m_cellSizeZ;
-	unsigned int	m_numBodies;
-	unsigned int	m_maxBodiesPerCell;
-struct bt3DGrid3F1U
-	float			fx;
-	float			fy;
-	float			fz;
-	unsigned int	uw;
diff --git a/src/bullet/BulletMultiThreaded/btGpuDefines.h b/src/bullet/BulletMultiThreaded/btGpuDefines.h
deleted file mode 100644
index f9315ab6..00000000
--- a/src/bullet/BulletMultiThreaded/btGpuDefines.h
+++ /dev/null
@@ -1,211 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
-Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-// definitions for "GPU on CPU" code
-typedef unsigned int uint;
-struct int2
-	int x, y;
-struct uint2
-	unsigned int x, y;
-struct int3
-	int x, y, z;
-struct uint3
-	unsigned int x, y, z;
-struct float4
-	float x, y, z, w;
-struct float3
-	float x, y, z;
-#define BT_GPU___device__ inline
-#define BT_GPU___devdata__
-#define BT_GPU___constant__
-#define BT_GPU_max(a, b) ((a) > (b) ? (a) : (b))
-#define BT_GPU_min(a, b) ((a) < (b) ? (a) : (b))
-#define BT_GPU_params s3DGridBroadphaseParams
-#define BT_GPU___mul24(a, b) ((a)*(b))
-#define BT_GPU___global__ inline
-#define BT_GPU___shared__ static
-#define BT_GPU___syncthreads()
-static inline uint2 bt3dGrid_make_uint2(unsigned int x, unsigned int y)
-  uint2 t; t.x = x; t.y = y; return t;
-#define BT_GPU_make_uint2(x, y) bt3dGrid_make_uint2(x, y)
-static inline int3 bt3dGrid_make_int3(int x, int y, int z)
-  int3 t; t.x = x; t.y = y; t.z = z; return t;
-#define BT_GPU_make_int3(x, y, z) bt3dGrid_make_int3(x, y, z)
-static inline float3 bt3dGrid_make_float3(float x, float y, float z)
-  float3 t; t.x = x; t.y = y; t.z = z; return t;
-#define BT_GPU_make_float3(x, y, z) bt3dGrid_make_float3(x, y, z)
-static inline float3 bt3dGrid_make_float34(float4 f)
-  float3 t; t.x = f.x; t.y = f.y; t.z = f.z; return t;
-#define BT_GPU_make_float34(f) bt3dGrid_make_float34(f)
-static inline float3 bt3dGrid_make_float31(float f)
-  float3 t; t.x = t.y = t.z = f; return t;
-#define BT_GPU_make_float31(x) bt3dGrid_make_float31(x)
-static inline float4 bt3dGrid_make_float42(float3 v, float f)
-  float4 t; t.x = v.x; t.y = v.y; t.z = v.z; t.w = f; return t;
-#define BT_GPU_make_float42(a, b) bt3dGrid_make_float42(a, b) 
-static inline float4 bt3dGrid_make_float44(float a, float b, float c, float d)
-  float4 t; t.x = a; t.y = b; t.z = c; t.w = d; return t;
-#define BT_GPU_make_float44(a, b, c, d) bt3dGrid_make_float44(a, b, c, d) 
-inline int3 operator+(int3 a, int3 b)
-    return bt3dGrid_make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-inline float4 operator+(const float4& a, const float4& b)
-	float4 r; r.x = a.x+b.x; r.y = a.y+b.y; r.z = a.z+b.z; r.w = a.w+b.w; return r;
-inline float4 operator*(const float4& a, float fact)
-	float4 r; r.x = a.x*fact; r.y = a.y*fact; r.z = a.z*fact; r.w = a.w*fact; return r;
-inline float4 operator*(float fact, float4& a)
-	return (a * fact);
-inline float4& operator*=(float4& a, float fact)
-	a = fact * a;
-	return a;
-inline float4& operator+=(float4& a, const float4& b)
-	a = a + b;
-	return a;
-inline float3 operator+(const float3& a, const float3& b)
-	float3 r; r.x = a.x+b.x; r.y = a.y+b.y; r.z = a.z+b.z; return r;
-inline float3 operator-(const float3& a, const float3& b)
-	float3 r; r.x = a.x-b.x; r.y = a.y-b.y; r.z = a.z-b.z; return r;
-static inline float bt3dGrid_dot(float3& a, float3& b)
-	return a.x*b.x+a.y*b.y+a.z*b.z;
-#define BT_GPU_dot(a,b) bt3dGrid_dot(a,b)
-static inline float bt3dGrid_dot4(float4& a, float4& b)
-	return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
-#define BT_GPU_dot4(a,b) bt3dGrid_dot4(a,b)
-static inline float3 bt3dGrid_cross(const float3& a, const float3& b)
-	float3 r; r.x = a.y*b.z-a.z*b.y; r.y = -a.x*b.z+a.z*b.x; r.z = a.x*b.y-a.y*b.x;	return r;
-#define BT_GPU_cross(a,b) bt3dGrid_cross(a,b)
-inline float3 operator*(const float3& a, float fact)
-	float3 r; r.x = a.x*fact; r.y = a.y*fact; r.z = a.z*fact; return r;
-inline float3& operator+=(float3& a, const float3& b)
-	a = a + b;
-	return a;
-inline float3& operator-=(float3& a, const float3& b)
-	a = a - b;
-	return a;
-inline float3& operator*=(float3& a, float fact)
-	a = a * fact;
-	return a;
-inline float3 operator-(const float3& v)
-	float3 r; r.x = -v.x; r.y = -v.y; r.z = -v.z; return r;
-#define BT_GPU_FETCH(a, b) a[b]
-#define BT_GPU_FETCH4(a, b) a[b]
-#define BT_GPU_PREF(func) btGpu_##func
-#define BT_GPU_SAFE_CALL(func) func
-#define BT_GPU_Memset memset
-#define BT_GPU_MemcpyToSymbol(a, b, c) memcpy(&a, b, c)
-#define BT_GPU_BindTexture(a, b, c, d)
-#define BT_GPU_UnbindTexture(a)
-static uint2 s_blockIdx, s_blockDim, s_threadIdx;
-#define BT_GPU_blockIdx s_blockIdx
-#define BT_GPU_blockDim s_blockDim
-#define BT_GPU_threadIdx s_threadIdx
-#define BT_GPU_EXECKERNEL(numb, numt, kfunc, args) {s_blockDim.x=numt;for(int nb=0;nb<numb;nb++){s_blockIdx.x=nb;for(int nt=0;nt<numt;nt++){s_threadIdx.x=nt;kfunc args;}}}
-#define BT_GPU_CHECK_ERROR(s)
-#endif //BT_GPU_DEFINES_H
diff --git a/src/bullet/BulletMultiThreaded/btGpuUtilsSharedCode.h b/src/bullet/BulletMultiThreaded/btGpuUtilsSharedCode.h
deleted file mode 100644
index 5761e790..00000000
--- a/src/bullet/BulletMultiThreaded/btGpuUtilsSharedCode.h
+++ /dev/null
@@ -1,55 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
-Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-// Shared code for GPU-based utilities
-//  Keep this file free from Bullet headers
-//  will be compiled by both CPU and CUDA compilers
-//	file with definitions of BT_GPU_xxx should be included first
-#include "btGpuUtilsSharedDefs.h"
-extern "C"
-//Round a / b to nearest higher integer value
-int BT_GPU_PREF(iDivUp)(int a, int b)
-    return (a % b != 0) ? (a / b + 1) : (a / b);
-} // iDivUp()
-// compute grid and thread block size for a given number of elements
-void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads)
-    numThreads = BT_GPU_min(blockSize, n);
-    numBlocks = BT_GPU_PREF(iDivUp)(n, numThreads);
-} // computeGridSize()
-} // extern "C"
diff --git a/src/bullet/BulletMultiThreaded/btGpuUtilsSharedDefs.h b/src/bullet/BulletMultiThreaded/btGpuUtilsSharedDefs.h
deleted file mode 100644
index dccfda54..00000000
--- a/src/bullet/BulletMultiThreaded/btGpuUtilsSharedDefs.h
+++ /dev/null
@@ -1,52 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
-Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. 
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-// Shared definitions for GPU-based utilities
-//  Keep this file free from Bullet headers
-//  it is included into both CUDA and CPU code
-//	file with definitions of BT_GPU_xxx should be included first
-extern "C"
-//Round a / b to nearest higher integer value
-int BT_GPU_PREF(iDivUp)(int a, int b);
-// compute grid and thread block size for a given number of elements
-void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads);
-void BT_GPU_PREF(allocateArray)(void** devPtr, unsigned int size);
-void BT_GPU_PREF(freeArray)(void* devPtr);
-void BT_GPU_PREF(copyArrayFromDevice)(void* host, const void* device, unsigned int size);
-void BT_GPU_PREF(copyArrayToDevice)(void* device, const void* host, unsigned int size);
-void BT_GPU_PREF(registerGLBufferObject(unsigned int vbo));
-void* BT_GPU_PREF(mapGLBufferObject(unsigned int vbo));
-void BT_GPU_PREF(unmapGLBufferObject(unsigned int vbo));
-} // extern "C"
diff --git a/src/bullet/BulletMultiThreaded/btParallelConstraintSolver.cpp b/src/bullet/BulletMultiThreaded/btParallelConstraintSolver.cpp
deleted file mode 100644
index 10164f8e..00000000
--- a/src/bullet/BulletMultiThreaded/btParallelConstraintSolver.cpp
+++ /dev/null
@@ -1,1391 +0,0 @@
-   Copyright (C) 2010 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "btParallelConstraintSolver.h"
-#include "BulletDynamics/ConstraintSolver/btContactSolverInfo.h"
-#include "BulletCollision/BroadphaseCollision/btDispatcher.h"
-#include "LinearMath/btPoolAllocator.h"
-#include "BulletMultiThreaded/vectormath2bullet.h"
-#include "LinearMath/btQuickprof.h"
-#include "BulletMultiThreaded/btThreadSupportInterface.h"
-#include "vecmath/vmInclude.h"
-#include "vectormath/vmInclude.h"
-#include "HeapManager.h"
-#include "PlatformDefinitions.h"
-//#include "PfxSimdUtils.h"
-#include "LinearMath/btScalar.h"
-#include "TrbStateVec.h"
-#define TMP_BUFF_BYTES (15*1024*1024)
-unsigned char ATTRIBUTE_ALIGNED128(tmp_buff[TMP_BUFF_BYTES]);
-// Project Gauss Seidel or the equivalent Sequential Impulse
- inline void resolveSingleConstraintRowGeneric(PfxSolverBody& body1,PfxSolverBody& body2,const btSolverConstraint& c)
-	btScalar deltaImpulse = c.m_rhs-btScalar(c.m_appliedImpulse)*c.m_cfm;
-	const btScalar deltaVel1Dotn	=	c.m_contactNormal.dot(getBtVector3(body1.mDeltaLinearVelocity)) 	+ c.m_relpos1CrossNormal.dot(getBtVector3(body1.mDeltaAngularVelocity));
-	const btScalar deltaVel2Dotn	=	-c.m_contactNormal.dot(getBtVector3(body2.mDeltaLinearVelocity)) + c.m_relpos2CrossNormal.dot(getBtVector3(body2.mDeltaAngularVelocity));
-//	const btScalar delta_rel_vel	=	deltaVel1Dotn-deltaVel2Dotn;
-	deltaImpulse	-=	deltaVel1Dotn*c.m_jacDiagABInv;
-	deltaImpulse	-=	deltaVel2Dotn*c.m_jacDiagABInv;
-	const btScalar sum = btScalar(c.m_appliedImpulse) + deltaImpulse;
-	if (sum < c.m_lowerLimit)
-	{
-		deltaImpulse = c.m_lowerLimit-c.m_appliedImpulse;
-		c.m_appliedImpulse = c.m_lowerLimit;
-	}
-	else if (sum > c.m_upperLimit) 
-	{
-		deltaImpulse = c.m_upperLimit-c.m_appliedImpulse;
-		c.m_appliedImpulse = c.m_upperLimit;
-	}
-	else
-	{
-		c.m_appliedImpulse = sum;
-	}
-	if (body1.mMassInv)
-	{
-		btVector3 linearComponent = c.m_contactNormal*body1.mMassInv;
-		body1.mDeltaLinearVelocity += vmVector3(linearComponent.getX()*deltaImpulse,linearComponent.getY()*deltaImpulse,linearComponent.getZ()*deltaImpulse);
-		btVector3 tmp=c.m_angularComponentA*(btVector3(deltaImpulse,deltaImpulse,deltaImpulse));
-		body1.mDeltaAngularVelocity += vmVector3(tmp.getX(),tmp.getY(),tmp.getZ());
-	}
-	if (body2.mMassInv)
-	{
-		btVector3 linearComponent = -c.m_contactNormal*body2.mMassInv;
-		body2.mDeltaLinearVelocity += vmVector3(linearComponent.getX()*deltaImpulse,linearComponent.getY()*deltaImpulse,linearComponent.getZ()*deltaImpulse);
-		btVector3 tmp = c.m_angularComponentB*((btVector3(deltaImpulse,deltaImpulse,deltaImpulse)));//*m_angularFactor);
-		body2.mDeltaAngularVelocity += vmVector3(tmp.getX(),tmp.getY(),tmp.getZ());
-	}
-	//body1.internalApplyImpulse(c.m_contactNormal*body1.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
-	//body2.internalApplyImpulse(-c.m_contactNormal*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
-void pfxSolveLinearConstraintRow(btConstraintRow &constraint,
-	vmVector3 &deltaLinearVelocityA,vmVector3 &deltaAngularVelocityA,
-	float massInvA,const vmMatrix3 &inertiaInvA,const vmVector3 &rA,
-	vmVector3 &deltaLinearVelocityB,vmVector3 &deltaAngularVelocityB,
-	float massInvB,const vmMatrix3 &inertiaInvB,const vmVector3 &rB)
-	const vmVector3 normal(btReadVector3(constraint.m_normal));
-	btScalar deltaImpulse = constraint.m_rhs;
-	vmVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
-	vmVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
-	deltaImpulse -= constraint.m_jacDiagInv * dot(normal,dVA-dVB);
-	btScalar oldImpulse = constraint.m_accumImpulse;
-	constraint.m_accumImpulse = btClamped(oldImpulse + deltaImpulse,constraint.m_lowerLimit,constraint.m_upperLimit);
-	deltaImpulse = constraint.m_accumImpulse - oldImpulse;
-	deltaLinearVelocityA += deltaImpulse * massInvA * normal;
-	deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
-	deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
-	deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
-void btSolveContactConstraint(
-	btConstraintRow &constraintResponse,
-	btConstraintRow &constraintFriction1,
-	btConstraintRow &constraintFriction2,
-	const vmVector3 &contactPointA,
-	const vmVector3 &contactPointB,
-	PfxSolverBody &solverBodyA,
-	PfxSolverBody &solverBodyB,
-	float friction
-	)
-	vmVector3 rA = rotate(solverBodyA.mOrientation,contactPointA);
-	vmVector3 rB = rotate(solverBodyB.mOrientation,contactPointB);
-	pfxSolveLinearConstraintRow(constraintResponse,
-		solverBodyA.mDeltaLinearVelocity,solverBodyA.mDeltaAngularVelocity,solverBodyA.mMassInv,solverBodyA.mInertiaInv,rA,
-		solverBodyB.mDeltaLinearVelocity,solverBodyB.mDeltaAngularVelocity,solverBodyB.mMassInv,solverBodyB.mInertiaInv,rB);
-	float mf = friction*fabsf(constraintResponse.m_accumImpulse);
-	constraintFriction1.m_lowerLimit = -mf;
-	constraintFriction1.m_upperLimit =  mf;
-	constraintFriction2.m_lowerLimit = -mf;
-	constraintFriction2.m_upperLimit =  mf;
-	pfxSolveLinearConstraintRow(constraintFriction1,
-		solverBodyA.mDeltaLinearVelocity,solverBodyA.mDeltaAngularVelocity,solverBodyA.mMassInv,solverBodyA.mInertiaInv,rA,
-		solverBodyB.mDeltaLinearVelocity,solverBodyB.mDeltaAngularVelocity,solverBodyB.mMassInv,solverBodyB.mInertiaInv,rB);
-	pfxSolveLinearConstraintRow(constraintFriction2,
-		solverBodyA.mDeltaLinearVelocity,solverBodyA.mDeltaAngularVelocity,solverBodyA.mMassInv,solverBodyA.mInertiaInv,rA,
-		solverBodyB.mDeltaLinearVelocity,solverBodyB.mDeltaAngularVelocity,solverBodyB.mMassInv,solverBodyB.mInertiaInv,rB);
-void CustomSolveConstraintsTaskParallel(
-	const PfxParallelGroup *contactParallelGroup,const PfxParallelBatch *contactParallelBatches,
-	PfxConstraintPair *contactPairs,uint32_t numContactPairs,
-	btPersistentManifold* offsetContactManifolds,
-	const PfxParallelGroup *jointParallelGroup,const PfxParallelBatch *jointParallelBatches,
-	PfxConstraintPair *jointPairs,uint32_t numJointPairs,
-	btSolverConstraint* offsetSolverConstraints,
-	TrbState *offsetRigStates,
-	PfxSolverBody *offsetSolverBodies,
-	uint32_t numRigidBodies,
-	int iteration,unsigned int taskId,unsigned int numTasks,btBarrier *barrier)
-	PfxSolverBody staticBody;
-	staticBody.mMassInv = 0.f;
-	staticBody.mDeltaAngularVelocity=vmVector3(0,0,0);
-	staticBody.mDeltaLinearVelocity =vmVector3(0,0,0);
-	for(int k=0;k<iteration+1;k++) {
-		// Joint
-		for(uint32_t phaseId=0;phaseId<jointParallelGroup->numPhases;phaseId++) {
-			for(uint32_t batchId=0;batchId<jointParallelGroup->numBatches[phaseId];batchId++) {
-				uint32_t numPairs = jointParallelGroup->numPairs[phaseId*PFX_MAX_SOLVER_BATCHES+batchId];
-				if(batchId%numTasks == taskId && numPairs > 0) {
-					const PfxParallelBatch &batch = jointParallelBatches[phaseId*PFX_MAX_SOLVER_BATCHES+batchId];
-					for(uint32_t i=0;i<numPairs;i++) {
-						PfxConstraintPair &pair = jointPairs[batch.pairIndices[i]];
-						uint16_t iA = pfxGetRigidBodyIdA(pair);
-						uint16_t iB = pfxGetRigidBodyIdB(pair);
-						PfxSolverBody &solverBodyA = iA != 65535 ? offsetSolverBodies[iA] : staticBody;
-						PfxSolverBody &solverBodyB = iB != 65535 ? offsetSolverBodies[iB] : staticBody;
-						if(k==0) {
-						}
-						else {
-							btSolverConstraint* constraintRow = &offsetSolverConstraints[pfxGetContactId1(pair)];
-							int numRows = pfxGetNumConstraints(pair);
-							int i;
-							for (i=0;i<numRows;i++)
-							{
-								resolveSingleConstraintRowGeneric(solverBodyA,solverBodyB,constraintRow[i]);
-							}
-						}
-					}
-				}
-			}
-			barrier->sync();
-		}
-		// Contact
-		for(uint32_t phaseId=0;phaseId<contactParallelGroup->numPhases;phaseId++) {
-			for(uint32_t batchId=0;batchId<contactParallelGroup->numBatches[phaseId];batchId++) {
-				uint32_t numPairs = contactParallelGroup->numPairs[phaseId*PFX_MAX_SOLVER_BATCHES+batchId];
-				if(batchId%numTasks == taskId && numPairs > 0) {
-					const PfxParallelBatch &batch = contactParallelBatches[phaseId*PFX_MAX_SOLVER_BATCHES+batchId];
-					for(uint32_t i=0;i<numPairs;i++) {
-						PfxConstraintPair &pair = contactPairs[batch.pairIndices[i]];
-						uint16_t iA = pfxGetRigidBodyIdA(pair);
-						uint16_t iB = pfxGetRigidBodyIdB(pair);
-						btPersistentManifold& contact = offsetContactManifolds[pfxGetConstraintId1(pair)];
-						PfxSolverBody &solverBodyA = offsetSolverBodies[iA];
-						PfxSolverBody &solverBodyB = offsetSolverBodies[iB];
-						for(int j=0;j<contact.getNumContacts();j++) {
-							btManifoldPoint& cp = contact.getContactPoint(j);
-							if(k==0) {
-								vmVector3 rA = rotate(solverBodyA.mOrientation,btReadVector3(cp.m_localPointA));
-								vmVector3 rB = rotate(solverBodyB.mOrientation,btReadVector3(cp.m_localPointB));
-								for(int k=0;k<3;k++) {
-									vmVector3 normal = btReadVector3(cp.mConstraintRow[k].m_normal);
-									float deltaImpulse = cp.mConstraintRow[k].m_accumImpulse;
-									solverBodyA.mDeltaLinearVelocity += deltaImpulse * solverBodyA.mMassInv * normal;
-									solverBodyA.mDeltaAngularVelocity += deltaImpulse * solverBodyA.mInertiaInv * cross(rA,normal);
-									solverBodyB.mDeltaLinearVelocity -= deltaImpulse * solverBodyB.mMassInv * normal;
-									solverBodyB.mDeltaAngularVelocity -= deltaImpulse * solverBodyB.mInertiaInv * cross(rB,normal);
-								}
-							}
-							else {
-								btSolveContactConstraint(
-									cp.mConstraintRow[0],
-									cp.mConstraintRow[1],
-									cp.mConstraintRow[2],
-									btReadVector3(cp.m_localPointA),
-									btReadVector3(cp.m_localPointB),
-									solverBodyA,
-									solverBodyB,
-									cp.m_combinedFriction
-									);
-							}
-						}
-					}
-				}
-			}
-			if (barrier)
-				barrier->sync();
-		}
-	}
-void CustomPostSolverTask(
-	TrbState *states,
-	PfxSolverBody *solverBodies,
-	uint32_t numRigidBodies)
-	for(uint32_t i=0;i<numRigidBodies;i++) {
-		TrbState &state = states[i];
-		PfxSolverBody &solverBody = solverBodies[i];
-		state.setLinearVelocity(state.getLinearVelocity()+solverBody.mDeltaLinearVelocity);
-		state.setAngularVelocity(state.getAngularVelocity()+solverBody.mDeltaAngularVelocity);
-	}
-void*	SolverlsMemoryFunc()
-	//don't create local store memory, just return 0
-	return 0;
-void pfxGetPlaneSpace(const vmVector3& n, vmVector3& p, vmVector3& q)
-	if(fabsf(n[2]) > 0.707f) {
-		// choose p in y-z plane
-		float a = n[1]*n[1] + n[2]*n[2];
-		float k = 1.0f/sqrtf(a);
-		p[0] = 0;
-		p[1] = -n[2]*k;
-		p[2] = n[1]*k;
-		// set q = n x p
-		q[0] = a*k;
-		q[1] = -n[0]*p[2];
-		q[2] = n[0]*p[1];
-	}
-	else {
-		// choose p in x-y plane
-		float a = n[0]*n[0] + n[1]*n[1];
-		float k = 1.0f/sqrtf(a);
-		p[0] = -n[1]*k;
-		p[1] = n[0]*k;
-		p[2] = 0;
-		// set q = n x p
-		q[0] = -n[2]*p[1];
-		q[1] = n[2]*p[0];
-		q[2] = a*k;
-	}
-#define PFX_CONTACT_SLOP 0.001f
-void btSetupContactConstraint(
-	btConstraintRow &constraintResponse,
-	btConstraintRow &constraintFriction1,
-	btConstraintRow &constraintFriction2,
-	float penetrationDepth,
-	float restitution,
-	float friction,
-	const vmVector3 &contactNormal,
-	const vmVector3 &contactPointA,
-	const vmVector3 &contactPointB,
-	const TrbState &stateA,
-	const TrbState &stateB,
-	PfxSolverBody &solverBodyA,
-	PfxSolverBody &solverBodyB,
-	float separateBias,
-	float timeStep
-	)
-	vmVector3 rA = rotate(solverBodyA.mOrientation,contactPointA);
-	vmVector3 rB = rotate(solverBodyB.mOrientation,contactPointB);
-	vmMatrix3 K = vmMatrix3::scale(vmVector3(solverBodyA.mMassInv + solverBodyB.mMassInv)) - 
-			crossMatrix(rA) * solverBodyA.mInertiaInv * crossMatrix(rA) - 
-			crossMatrix(rB) * solverBodyB.mInertiaInv * crossMatrix(rB);
-	vmVector3 vA = stateA.getLinearVelocity() + cross(stateA.getAngularVelocity(),rA);
-	vmVector3 vB = stateB.getLinearVelocity() + cross(stateB.getAngularVelocity(),rB);
-	vmVector3 vAB = vA-vB;
-	vmVector3 tangent1,tangent2;
-	btPlaneSpace1(contactNormal,tangent1,tangent2);
-//	constraintResponse.m_accumImpulse = 0.f;
-//	constraintFriction1.m_accumImpulse = 0.f;
-//	constraintFriction2.m_accumImpulse = 0.f;
-	// Contact Constraint
-	{
-		vmVector3 normal = contactNormal;
-		float denom = dot(K*normal,normal);
-		constraintResponse.m_rhs = -(1.0f+restitution)*dot(vAB,normal); // velocity error
-		constraintResponse.m_rhs -= (separateBias * btMin(0.0f,penetrationDepth+PFX_CONTACT_SLOP)) / timeStep; // position error
-		constraintResponse.m_rhs /= denom;
-		constraintResponse.m_jacDiagInv = 1.0f/denom;
-		constraintResponse.m_lowerLimit = 0.0f;
-		constraintResponse.m_upperLimit = SIMD_INFINITY;
-		btStoreVector3(normal,constraintResponse.m_normal);
-	}
-	// Friction Constraint 1
-	{
-		vmVector3 normal = tangent1;
-		float denom = dot(K*normal,normal);
-		constraintFriction1.m_jacDiagInv = 1.0f/denom;
-		constraintFriction1.m_rhs = -dot(vAB,normal);
-		constraintFriction1.m_rhs *= constraintFriction1.m_jacDiagInv;
-		constraintFriction1.m_lowerLimit = 0.0f;
-		constraintFriction1.m_upperLimit = SIMD_INFINITY;
-		btStoreVector3(normal,constraintFriction1.m_normal);
-	}
-	// Friction Constraint 2
-	{
-		vmVector3 normal = tangent2;
-		float denom = dot(K*normal,normal);
-		constraintFriction2.m_jacDiagInv = 1.0f/denom;
-		constraintFriction2.m_rhs = -dot(vAB,normal);
-		constraintFriction2.m_rhs *= constraintFriction2.m_jacDiagInv;
-		constraintFriction2.m_lowerLimit = 0.0f;
-		constraintFriction2.m_upperLimit = SIMD_INFINITY;
-		btStoreVector3(normal,constraintFriction2.m_normal);
-	}
-void CustomSetupContactConstraintsTask(
-	PfxConstraintPair *contactPairs,uint32_t numContactPairs,
-	btPersistentManifold*	offsetContactManifolds,
-	TrbState *offsetRigStates,
-	PfxSolverBody *offsetSolverBodies,
-	uint32_t numRigidBodies,
-	float separateBias,
-	float timeStep)
-	for(uint32_t i=0;i<numContactPairs;i++) {
-		PfxConstraintPair &pair = contactPairs[i];
-		if(!pfxGetActive(pair) || pfxGetNumConstraints(pair) == 0 ||
-			((pfxGetMotionMaskA(pair)&PFX_MOTION_MASK_STATIC) && (pfxGetMotionMaskB(pair)&PFX_MOTION_MASK_STATIC)) ) {
-			continue;
-		}
-		uint16_t iA = pfxGetRigidBodyIdA(pair);
-		uint16_t iB = pfxGetRigidBodyIdB(pair);
-		int id = pfxGetConstraintId1(pair);
-		btPersistentManifold& contact = offsetContactManifolds[id];
-		TrbState &stateA = offsetRigStates[iA];
-//		PfxRigBody &bodyA = offsetRigBodies[iA];
-		PfxSolverBody &solverBodyA = offsetSolverBodies[iA];
-		TrbState &stateB = offsetRigStates[iB];
-//		PfxRigBody &bodyB = offsetRigBodies[iB];
-		PfxSolverBody &solverBodyB = offsetSolverBodies[iB];
-		float restitution = 0.5f * (solverBodyA.restitution + solverBodyB.restitution);
-		//if(contact.getDuration() > 1) restitution = 0.0f;
-		float friction = sqrtf(solverBodyA.friction * solverBodyB.friction);
-		for(int j=0;j<contact.getNumContacts();j++) {
-			btManifoldPoint& cp = contact.getContactPoint(j);
-			btSetupContactConstraint(
-				cp.mConstraintRow[0],
-				cp.mConstraintRow[1],
-				cp.mConstraintRow[2],
-				cp.getDistance(),
-				restitution,
-				friction,
-				btReadVector3(cp.m_normalWorldOnB),//.mConstraintRow[0].m_normal),
-				btReadVector3(cp.m_localPointA),
-				btReadVector3(cp.m_localPointB),
-				stateA,
-				stateB,
-				solverBodyA,
-				solverBodyB,
-				separateBias,
-				timeStep
-				);
-		}
-		//contact.setCompositeFriction(friction);
-	}
-void	SolverThreadFunc(void* userPtr,void* lsMemory)
-	btConstraintSolverIO* io = (btConstraintSolverIO*)(userPtr);//arg->io);
-	btCriticalSection* criticalsection = io->setupContactConstraints.criticalSection;
-	//CustomCriticalSection *criticalsection = &io->m_cs;
-	switch(io->cmd) {
-		CustomSolveConstraintsTaskParallel(
-			io->solveConstraints.contactParallelGroup,
-			io->solveConstraints.contactParallelBatches,
-			io->solveConstraints.contactPairs,
-			io->solveConstraints.numContactPairs,
-			io->solveConstraints.offsetContactManifolds,
-			io->solveConstraints.jointParallelGroup,
-			io->solveConstraints.jointParallelBatches,
-			io->solveConstraints.jointPairs,
-			io->solveConstraints.numJointPairs,
-			io->solveConstraints.offsetSolverConstraints,
-			io->solveConstraints.offsetRigStates1,
-			io->solveConstraints.offsetSolverBodies,
-			io->solveConstraints.numRigidBodies,
-			io->solveConstraints.iteration,
-			io->solveConstraints.taskId,
-			io->maxTasks1,
-			io->solveConstraints.barrier
-			);
-		break;
-			CustomPostSolverTask(	io->postSolver.states,io->postSolver.solverBodies,	io->postSolver.numRigidBodies);
-			break;
-		{
-			bool empty = false;
-			while(!empty) {
-				int start,batch;
-				criticalsection->lock();
-				start = (int)criticalsection->getSharedParam(0);
-				batch = (int)criticalsection->getSharedParam(1);
-				//PFX_PRINTF("taskId %d start %d num %d\n",arg->taskId,start,batch);
-				// ���̃o�b�t�@���Z�b�g
-				int nextStart = start + batch;
-				int rest = btMax((int)io->setupContactConstraints.numContactPairs1 - nextStart,0);
-				int nextBatch = (rest > batch)?batch:rest;
-				criticalsection->setSharedParam(0,nextStart);
-                criticalsection->setSharedParam(1,nextBatch);
-				criticalsection->unlock();
-				if(batch > 0) {
-					CustomSetupContactConstraintsTask(
-						io->setupContactConstraints.offsetContactPairs+start,batch,
-						io->setupContactConstraints.offsetContactManifolds,
-						io->setupContactConstraints.offsetRigStates,
-//						io->setupContactConstraints.offsetRigBodies,
-						io->setupContactConstraints.offsetSolverBodies,
-						io->setupContactConstraints.numRigidBodies,
-						io->setupContactConstraints.separateBias,
-						io->setupContactConstraints.timeStep);
-				}
-				else {
-					empty = true;
-				}
-			}
-		}
-		break;
-		default:
-			{
-				btAssert(0);
-			}
-	}
-void CustomSetupContactConstraintsNew(
-	PfxConstraintPair *contactPairs1,uint32_t numContactPairs,
-	btPersistentManifold *offsetContactManifolds,
-	TrbState *offsetRigStates,
-	PfxSolverBody *offsetSolverBodies,
-	uint32_t numRigidBodies,
-	float separationBias,
-	float timeStep,
-	class btThreadSupportInterface* threadSupport,
-	btCriticalSection* criticalSection,
-	btConstraintSolverIO *io 
-	)
-	int maxTasks = threadSupport->getNumTasks();
-	int div = (int)maxTasks * 4;
-	int batch = ((int)numContactPairs + div - 1) / div;
-#ifdef __PPU__
-		BulletPE2ConstraintSolverSpursSupport* spursThread = (BulletPE2ConstraintSolverSpursSupport*) threadSupport;
-	if (criticalSection)
-	{
-		criticalSection->setSharedParam(0,0);
-		criticalSection->setSharedParam(1,btMin(batch,64)); // batched number
-	} else
-	{
-#ifdef __PPU__
-		spursThread->setSharedParam(0,0);
-		spursThread->setSharedParam(1,btMin(batch,64)); // batched number
-#endif //__PPU__
-	}
-	for(int t=0;t<maxTasks;t++) {
-		io[t].setupContactConstraints.offsetContactPairs = contactPairs1;
-		io[t].setupContactConstraints.numContactPairs1 = numContactPairs;
-		io[t].setupContactConstraints.offsetRigStates = offsetRigStates;
-		io[t].setupContactConstraints.offsetContactManifolds = offsetContactManifolds;		
-		io[t].setupContactConstraints.offsetSolverBodies = offsetSolverBodies;
-		io[t].setupContactConstraints.numRigidBodies = numRigidBodies;
-		io[t].setupContactConstraints.separateBias = separationBias;
-		io[t].setupContactConstraints.timeStep = timeStep;
-		io[t].setupContactConstraints.criticalSection = criticalSection;
-		io[t].maxTasks1 = maxTasks;
-#ifdef __PPU__
-		io[t].barrierAddr2 = (unsigned int)spursThread->getBarrierAddress();
-		io[t].criticalsectionAddr2 = (unsigned int)spursThread->getCriticalSectionAddress();
-		CustomSetupContactConstraintsTask(contactPairs1,numContactPairs,offsetContactManifolds,offsetRigStates,offsetSolverBodies,numRigidBodies,separationBias,timeStep);
-		threadSupport->sendRequest(1,(ppu_address_t)&io[t],t);
-	}
-	unsigned int arg0,arg1;
-	for(int t=0;t<maxTasks;t++) {
-		arg0 = t;
-		threadSupport->waitForResponse(&arg0,&arg1);
-	}
-void CustomSplitConstraints(
-	PfxConstraintPair *pairs,uint32_t numPairs,
-	PfxParallelGroup &group,PfxParallelBatch *batches,
-	uint32_t numTasks,
-	uint32_t numRigidBodies,
-	void *poolBuff,
-	uint32_t poolBytes
-	)
-	HeapManager pool((unsigned char*)poolBuff,poolBytes);
-	// �X�e�[�g�`�F�b�N�p�r�b�g�t���O�e�[�u��
-	int bufSize = sizeof(uint8_t)*numRigidBodies;
-	bufSize = ((bufSize+127)>>7)<<7; // 128 bytes alignment
-	uint8_t *bodyTable = (uint8_t*)pool.allocate(bufSize,HeapManager::ALIGN128);
-	// �y�A�`�F�b�N�p�r�b�g�t���O�e�[�u��
-	uint32_t *pairTable;
-	size_t allocSize = sizeof(uint32_t)*((numPairs+31)/32);
-	pairTable = (uint32_t*)pool.allocate(allocSize);
-	memset(pairTable,0,allocSize);
-	// �ڕW�Ƃ��镪����
-	uint32_t targetCount = btMax(uint32_t(PFX_MIN_SOLVER_PAIRS),btMin(numPairs / (numTasks*2),uint32_t(PFX_MAX_SOLVER_PAIRS)));
-	uint32_t startIndex = 0;
-	uint32_t phaseId;
-	uint32_t batchId;
-	uint32_t totalCount=0;
-	uint32_t maxBatches = btMin(numTasks,uint32_t(PFX_MAX_SOLVER_BATCHES));
-	for(phaseId=0;phaseId<PFX_MAX_SOLVER_PHASES&&totalCount<numPairs;phaseId++) {
-		bool startIndexCheck = true;
-		group.numBatches[phaseId] = 0;
-		uint32_t i = startIndex;
-        // �`�F�b�N�p�r�b�g�t���O�e�[�u�����N���A
-		memset(bodyTable,0xff,bufSize);
-		for(batchId=0;i<numPairs&&totalCount<numPairs&&batchId<maxBatches;batchId++) {
-			uint32_t pairCount=0;
-			PfxParallelBatch &batch = batches[phaseId*PFX_MAX_SOLVER_BATCHES+batchId];
-			uint32_t pairId = 0;
-			for(;i<numPairs&&pairCount<targetCount;i++) {
-				uint32_t idxP = i>>5;
-				uint32_t maskP = 1L << (i & 31);
-				//pair is already assigned to a phase/batch
-				if(pairTable[idxP] & maskP) {
-					continue;
-				}
-				uint32_t idxA = pfxGetRigidBodyIdA(pairs[i]);
-				uint32_t idxB = pfxGetRigidBodyIdB(pairs[i]);
-				// �����Ƃ��A�N�e�B�u�łȂ��A�܂��͏Փ˓_���O�̃y�A�͓o�^�Ώۂ���͂���
-				if(!pfxGetActive(pairs[i]) || pfxGetNumConstraints(pairs[i]) == 0 ||
-					((pfxGetMotionMaskA(pairs[i])&PFX_MOTION_MASK_STATIC) && (pfxGetMotionMaskB(pairs[i])&PFX_MOTION_MASK_STATIC)) ) {
-					if(startIndexCheck) 
-						startIndex++;
-					//assign pair -> skip it because it has no constraints
-					pairTable[idxP] |= maskP;
-					totalCount++;
-					continue;
-				}
-				// �ˑ����̃`�F�b�N
-				if( (bodyTable[idxA] != batchId && bodyTable[idxA] != 0xff) || 
-					(bodyTable[idxB] != batchId && bodyTable[idxB] != 0xff) ) {
-					startIndexCheck = false;
-					//bodies of the pair are already assigned to another batch within this phase
-					continue;
-				}
-				// �ˑ�������e�[�u���ɓo�^
-				if(pfxGetMotionMaskA(pairs[i])&PFX_MOTION_MASK_DYNAMIC) 
-						bodyTable[idxA] = batchId;
-				if(pfxGetMotionMaskB(pairs[i])&PFX_MOTION_MASK_DYNAMIC) 
-						bodyTable[idxB] = batchId;
-				if(startIndexCheck) 
-					startIndex++;
-				pairTable[idxP] |= maskP;
-				//add the pair 'i' to the current batch
-				batch.pairIndices[pairId++] = i;
-				pairCount++;
-			}
-			group.numPairs[phaseId*PFX_MAX_SOLVER_BATCHES+batchId] = (uint16_t)pairId;
-			totalCount += pairCount;
-		}
-		group.numBatches[phaseId] = batchId;
-	}
-	group.numPhases = phaseId;
-	pool.clear();
-void CustomSolveConstraintsParallel(
-	PfxConstraintPair *contactPairs,uint32_t numContactPairs,
-	PfxConstraintPair *jointPairs,uint32_t numJointPairs,
-	btPersistentManifold* offsetContactManifolds,
-	btSolverConstraint* offsetSolverConstraints,
-	TrbState *offsetRigStates,
-	PfxSolverBody *offsetSolverBodies,
-	uint32_t numRigidBodies,
-	struct btConstraintSolverIO* io,
-	class btThreadSupportInterface* threadSupport,
-	int iteration,
-	void* poolBuf,
-	int poolBytes,
-	class btBarrier* barrier)
-	{
-	int maxTasks = threadSupport->getNumTasks();
-//	config.taskManager->setTaskEntry(PFX_SOLVER_ENTRY);
-	HeapManager pool((unsigned char*)poolBuf,poolBytes);
-	{
-		PfxParallelGroup *cgroup = (PfxParallelGroup*)pool.allocate(sizeof(PfxParallelGroup));
-		PfxParallelBatch *cbatches = (PfxParallelBatch*)pool.allocate(sizeof(PfxParallelBatch)*(PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES),128);
-		PfxParallelGroup *jgroup = (PfxParallelGroup*)pool.allocate(sizeof(PfxParallelGroup));
-		PfxParallelBatch *jbatches = (PfxParallelBatch*)pool.allocate(sizeof(PfxParallelBatch)*(PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES),128);
-		uint32_t tmpBytes = poolBytes - 2 * (sizeof(PfxParallelGroup) + sizeof(PfxParallelBatch)*(PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES) + 128);
-		void *tmpBuff = pool.allocate(tmpBytes);
-		{
-			BT_PROFILE("CustomSplitConstraints");
-			CustomSplitConstraints(contactPairs,numContactPairs,*cgroup,cbatches,maxTasks,numRigidBodies,tmpBuff,tmpBytes);
-			CustomSplitConstraints(jointPairs,numJointPairs,*jgroup,jbatches,maxTasks,numRigidBodies,tmpBuff,tmpBytes);
-		}
-		{
-		CustomSolveConstraintsTask(
-			io->solveConstraints.contactParallelGroup,
-			io->solveConstraints.contactParallelBatches,
-			io->solveConstraints.contactPairs,
-			io->solveConstraints.numContactPairs,
-			io->solveConstraints.offsetContactManifolds,
-			io->solveConstraints.jointParallelGroup,
-			io->solveConstraints.jointParallelBatches,
-			io->solveConstraints.jointPairs,
-			io->solveConstraints.numJointPairs,
-			io->solveConstraints.offsetJoints,
-			io->solveConstraints.offsetRigStates,
-			io->solveConstraints.offsetSolverBodies,
-			io->solveConstraints.numRigidBodies,
-			io->solveConstraints.iteration,0,1,0);//arg->taskId,1,0);//,arg->maxTasks,arg->barrier);
-		for(int t=0;t<maxTasks;t++) {
-			io[t].solveConstraints.contactParallelGroup = cgroup;
-			io[t].solveConstraints.contactParallelBatches = cbatches;
-			io[t].solveConstraints.contactPairs = contactPairs;
-			io[t].solveConstraints.numContactPairs = numContactPairs;
-			io[t].solveConstraints.offsetContactManifolds = offsetContactManifolds;
-			io[t].solveConstraints.jointParallelGroup = jgroup;
-			io[t].solveConstraints.jointParallelBatches = jbatches;
-			io[t].solveConstraints.jointPairs = jointPairs;
-			io[t].solveConstraints.numJointPairs = numJointPairs;
-			io[t].solveConstraints.offsetSolverConstraints = offsetSolverConstraints;
-			io[t].solveConstraints.offsetRigStates1 = offsetRigStates;
-			io[t].solveConstraints.offsetSolverBodies = offsetSolverBodies;
-			io[t].solveConstraints.numRigidBodies = numRigidBodies;
-			io[t].solveConstraints.iteration = iteration;
-			io[t].solveConstraints.taskId = t;
-			io[t].solveConstraints.barrier = barrier;
-		io[t].maxTasks1 = maxTasks;
-#ifdef __PPU__
-		BulletPE2ConstraintSolverSpursSupport* spursThread = (BulletPE2ConstraintSolverSpursSupport*) threadSupport;
-		io[t].barrierAddr2 = (unsigned int) spursThread->getBarrierAddress();
-		io[t].criticalsectionAddr2 = (unsigned int)spursThread->getCriticalSectionAddress();
-			threadSupport->sendRequest(1,(ppu_address_t)&io[t],t);
-		}
-		unsigned int arg0,arg1;
-		for(int t=0;t<maxTasks;t++) {
-			arg0 = t;
-			threadSupport->waitForResponse(&arg0,&arg1);
-		}
-		}
-		pool.clear();
-	}
-	{
-		int batch = ((int)numRigidBodies + maxTasks - 1) / maxTasks;
-		int rest = (int)numRigidBodies;
-		int start = 0;
-		for(int t=0;t<maxTasks;t++) {
-			int num = (rest - batch ) > 0 ? batch : rest;
-			io[t].postSolver.states = offsetRigStates + start;
-			io[t].postSolver.solverBodies = offsetSolverBodies + start;
-			io[t].postSolver.numRigidBodies = (uint32_t)num;
-		io[t].maxTasks1 = maxTasks;
-#ifdef __PPU__
-		BulletPE2ConstraintSolverSpursSupport* spursThread = (BulletPE2ConstraintSolverSpursSupport*) threadSupport;
-		io[t].barrierAddr2 = (unsigned int)spursThread->getBarrierAddress();
-		io[t].criticalsectionAddr2 = (unsigned int)spursThread->getCriticalSectionAddress();
-			CustomPostSolverTask(	io[t].postSolver.states,io[t].postSolver.solverBodies,	io[t].postSolver.numRigidBodies);
-			threadSupport->sendRequest(1,(ppu_address_t)&io[t],t);
-			rest -= num;
-			start += num;
-		}
-		unsigned int arg0,arg1;
-		for(int t=0;t<maxTasks;t++) {
-			arg0 = t;
-			threadSupport->waitForResponse(&arg0,&arg1);
-		}
-	}
-void BPE_customConstraintSolverSequentialNew(unsigned int new_num, PfxBroadphasePair *new_pairs1 ,
-									btPersistentManifold* offsetContactManifolds,
-									  TrbState* states,int numRigidBodies, 
-									  struct PfxSolverBody* solverBodies, 
-									  PfxConstraintPair* jointPairs, unsigned int numJoints,
-									  btSolverConstraint* offsetSolverConstraints,
-									  float separateBias,
-									  float timeStep,
-									  int iteration,
-									  btThreadSupportInterface* solverThreadSupport,
-									  btCriticalSection* criticalSection,
-									  struct btConstraintSolverIO* solverIO,
-									  btBarrier* barrier
-									  )
-	{
-		BT_PROFILE("pfxSetupConstraints");
-		for(uint32_t i=0;i<numJoints;i++) {
-			// ���̍X�V
-			PfxConstraintPair &pair = jointPairs[i];
-			int idA = pfxGetRigidBodyIdA(pair);
-			if (idA != 65535)
-			{
-				pfxSetMotionMaskA(pair,states[pfxGetRigidBodyIdA(pair)].getMotionMask());
-			}
-			else
-			{
-				pfxSetMotionMaskA(pair,PFX_MOTION_MASK_STATIC);
-			}
-			int idB = pfxGetRigidBodyIdB(pair);
-			if (idB!= 65535)
-			{
-				pfxSetMotionMaskB(pair,states[pfxGetRigidBodyIdB(pair)].getMotionMask());
-			} else
-			{
-				pfxSetMotionMaskB(pair,PFX_MOTION_MASK_STATIC);
-			}
-		}
-//		CustomSetupJointConstraintsSeq(			jointPairs,numJoints,joints,			states,			solverBodies,			numRigidBodies,			timeStep);
-		CustomSetupContactConstraintsSeqNew(
-			(PfxConstraintPair*)new_pairs1,new_num,contacts,
-			states,
-			solverBodies,
-			numRigidBodies,
-			separateBias,
-			timeStep);
-		CustomSetupContactConstraintsNew(
-			(PfxConstraintPair*)new_pairs1,new_num,
-			offsetContactManifolds,
-			states,
-			solverBodies,
-			numRigidBodies,
-			separateBias,
-			timeStep,
-			solverThreadSupport,
-			criticalSection,solverIO
-			);
-	}
-	{
-		BT_PROFILE("pfxSolveConstraints");
-//#define SEQUENTIAL
-		CustomSolveConstraintsSeq(
-			(PfxConstraintPair*)new_pairs1,new_num,contacts,
-			jointPairs,numJoints,
-			states,
-			solverBodies,
-			numRigidBodies,
-			separateBias,
-			timeStep,
-			iteration);
-		CustomSolveConstraintsParallel(
-			(PfxConstraintPair*)new_pairs1,new_num,
-			jointPairs,numJoints,
-			offsetContactManifolds,
-			offsetSolverConstraints,
-			states,
-			solverBodies,
-			numRigidBodies,
-			solverIO, solverThreadSupport,
-			iteration,
-			tmp_buff,
-			barrier
-			);
-#endif //SEQUENTIAL
-	}
-struct	btParallelSolverMemoryCache
-	btAlignedObjectArray<TrbState>	m_mystates;
-	btAlignedObjectArray<PfxSolverBody>  m_mysolverbodies;
-	btAlignedObjectArray<PfxBroadphasePair> m_mypairs;
-	btAlignedObjectArray<PfxConstraintPair> m_jointPairs;
-btConstraintSolverIO* createSolverIO(int numThreads)
-	return new btConstraintSolverIO[numThreads];
-btParallelConstraintSolver::btParallelConstraintSolver(btThreadSupportInterface* solverThreadSupport)
-	m_solverThreadSupport = solverThreadSupport;//createSolverThreadSupport(maxNumThreads);
-	m_solverIO = createSolverIO(m_solverThreadSupport->getNumTasks());
-	m_barrier = m_solverThreadSupport->createBarrier();
-	m_criticalSection = m_solverThreadSupport->createCriticalSection();
-	m_memoryCache = new btParallelSolverMemoryCache();
-	delete m_memoryCache;
-	delete m_solverIO;
-btScalar btParallelConstraintSolver::solveGroup(btCollisionObject** bodies1,int numRigidBodies,btPersistentManifold** manifoldPtr,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher)
-/*	int sz = sizeof(PfxSolverBody);
-	int sz2 = sizeof(vmVector3);
-	int sz3 = sizeof(vmMatrix3);
-	int sz4 = sizeof(vmQuat);
-	int sz5 = sizeof(btConstraintRow);
-	int sz6 = sizeof(btSolverConstraint);
-	int sz7 = sizeof(TrbState);
-	btPersistentManifold* offsetContactManifolds= (btPersistentManifold*) dispatcher->getInternalManifoldPool()->getPoolAddress();
-	m_memoryCache->m_mysolverbodies.resize(numRigidBodies);
-	m_memoryCache->m_mystates.resize(numRigidBodies);
-	{
-			BT_PROFILE("create states and solver bodies");
-	for (int i=0;i<numRigidBodies;i++)
-	{
-		btCollisionObject* obj = bodies1[i];
-		obj->setCompanionId(i);
-		PfxSolverBody& solverBody = m_memoryCache->m_mysolverbodies[i];
-		btRigidBody* rb = btRigidBody::upcast(obj);
-		TrbState& state = m_memoryCache->m_mystates[i];
-		state.reset();
-		const btQuaternion& orgOri = obj->getWorldTransform().getRotation();
-		vmQuat orn(orgOri.getX(),orgOri.getY(),orgOri.getZ(),orgOri.getW());
-		state.setPosition(getVmVector3(obj->getWorldTransform().getOrigin()));
-		state.setOrientation(orn);
-		state.setPosition(state.getPosition());
-		state.setRigidBodyId(i);
-		state.setAngularDamping(0);
-		state.setLinearDamping(0);
-		solverBody.mOrientation = state.getOrientation();
-		solverBody.mDeltaLinearVelocity = vmVector3(0.0f);
-		solverBody.mDeltaAngularVelocity = vmVector3(0.0f);
-		solverBody.friction = obj->getFriction();
-		solverBody.restitution = obj->getRestitution();
-		state.resetSleepCount();
-		//if(state.getMotionMask()&PFX_MOTION_MASK_DYNAMIC) {
-		if (rb && (rb->getInvMass()>0.f))
-		{
-			state.setAngularVelocity(vmVector3(rb->getAngularVelocity().getX(),rb->getAngularVelocity().getY(),rb->getAngularVelocity().getZ()));
-			state.setLinearVelocity(vmVector3(rb->getLinearVelocity().getX(),rb->getLinearVelocity().getY(),rb->getLinearVelocity().getZ()));
-			state.setMotionType(PfxMotionTypeActive);
-			vmMatrix3 ori(solverBody.mOrientation);
-			vmMatrix3 localInvInertia = vmMatrix3::identity();
-			localInvInertia.setCol(0,vmVector3(rb->getInvInertiaDiagLocal().getX(),0,0));
-			localInvInertia.setCol(1,vmVector3(0, rb->getInvInertiaDiagLocal().getY(),0));
-			localInvInertia.setCol(2,vmVector3(0,0, rb->getInvInertiaDiagLocal().getZ()));
-			solverBody.mMassInv = rb->getInvMass();
-			solverBody.mInertiaInv = ori * localInvInertia * transpose(ori);
-		} else
-		{
-			state.setAngularVelocity(vmVector3(0));
-			state.setLinearVelocity(vmVector3(0));
-			state.setMotionType(PfxMotionTypeFixed);
-			m_memoryCache->m_mysolverbodies[i].mMassInv = 0.f;
-			m_memoryCache->m_mysolverbodies[i].mInertiaInv = vmMatrix3(0.0f);
-		}
-	}
-	}
-	int totalPoints = 0;
-#ifndef USE_C_ARRAYS
-	m_memoryCache->m_mypairs.resize(numManifolds);
-	m_memoryCache->m_jointPairs.resize(numConstraints);
-	int actualNumManifolds= 0;
-	{
-		BT_PROFILE("convert manifolds");
-		for (int i1=0;i1<numManifolds;i1++)
-		{
-			if (manifoldPtr[i1]->getNumContacts()>0)
-			{
-				btPersistentManifold* m = manifoldPtr[i1];
-				btCollisionObject* obA = (btCollisionObject*)m->getBody0();
-				btCollisionObject* obB = (btCollisionObject*)m->getBody1();
-				bool obAisActive = !obA->isStaticOrKinematicObject() && obA->isActive();
-				bool obBisActive = !obB->isStaticOrKinematicObject() && obB->isActive();
-				if (!obAisActive && !obBisActive)
-					continue;
-				//int contactId = i1;//actualNumManifolds;
-				PfxBroadphasePair& pair = m_memoryCache->m_mypairs[actualNumManifolds];
-				//init those
-				float compFric = obA->getFriction()*obB->getFriction();//@todo
-				int idA = obA->getCompanionId();
-				int idB = obB->getCompanionId();
-				m->m_companionIdA = idA;
-				m->m_companionIdB = idB;
-			//	if ((mysolverbodies[idA].mMassInv!=0)&&(mysolverbodies[idB].mMassInv!=0))
-			//		continue;
-				int numPosPoints=0;
-				for (int p=0;p<m->getNumContacts();p++)
-				{
-					//btManifoldPoint& pt = m->getContactPoint(p);
-					//float dist = pt.getDistance();
-					//if (dist<0.001)
-						numPosPoints++;
-				}
-				numPosPoints = numPosPoints;
-				totalPoints+=numPosPoints;
-				pfxSetRigidBodyIdA(pair,idA);
-				pfxSetRigidBodyIdB(pair,idB);
-				pfxSetMotionMaskA(pair,m_memoryCache->m_mystates[idA].getMotionMask());
-				pfxSetMotionMaskB(pair,m_memoryCache->m_mystates[idB].getMotionMask());
-				pfxSetActive(pair,numPosPoints>0);
-				pfxSetBroadphaseFlag(pair,0);
-				int contactId = m-offsetContactManifolds;
-				//likely the contact pool is not contiguous, make sure to allocate large enough contact pool
-				btAssert(contactId>=0);
-				btAssert(contactId<dispatcher->getInternalManifoldPool()->getMaxCount());
-				pfxSetContactId(pair,contactId);
-				pfxSetNumConstraints(pair,numPosPoints);//manifoldPtr[i]->getNumContacts());
-				actualNumManifolds++;
-			}
-		}
-	}
-	PfxConstraintPair* jointPairs=0;
-	jointPairs = numConstraints? &m_memoryCache->m_jointPairs[0]:0;
-	int actualNumJoints=0;
-	btSolverConstraint* offsetSolverConstraints = 0;
-	//if (1)
-	{
-		{
-			BT_PROFILE("convert constraints");
-			int totalNumRows = 0;
-			int i;
-			m_tmpConstraintSizesPool.resize(numConstraints);
-			//calculate the total number of contraint rows
-			for (i=0;i<numConstraints;i++)
-			{
-				btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
-				constraints[i]->getInfo1(&info1);
-				totalNumRows += info1.m_numConstraintRows;
-			}
-			m_tmpSolverNonContactConstraintPool.resize(totalNumRows);
-			offsetSolverConstraints =totalNumRows? &m_tmpSolverNonContactConstraintPool[0]:0;
-			///setup the btSolverConstraints
-			int currentRow = 0;
-			for (i=0;i<numConstraints;i++)
-			{
-				const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
-				if (info1.m_numConstraintRows)
-				{
-					btAssert(currentRow<totalNumRows);
-					btTypedConstraint* constraint = constraints[i];
-					btSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[currentRow];
-					btRigidBody& rbA = constraint->getRigidBodyA();
-					btRigidBody& rbB = constraint->getRigidBodyB();
-					int j;
-					for ( j=0;j<info1.m_numConstraintRows;j++)
-					{
-						memset(&currentConstraintRow[j],0,sizeof(btSolverConstraint));
-						currentConstraintRow[j].m_lowerLimit = -FLT_MAX;
-						currentConstraintRow[j].m_upperLimit = FLT_MAX;
-						currentConstraintRow[j].m_appliedImpulse = 0.f;
-						currentConstraintRow[j].m_appliedPushImpulse = 0.f;
-						currentConstraintRow[j].m_solverBodyA = &rbA;
-						currentConstraintRow[j].m_solverBodyB = &rbB;
-					}
-					rbA.internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
-					rbA.internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
-					rbB.internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
-					rbB.internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
-					btTypedConstraint::btConstraintInfo2 info2;
-					info2.fps = 1.f/infoGlobal.m_timeStep;
-					info2.erp = infoGlobal.m_erp;
-					info2.m_J1linearAxis = currentConstraintRow->m_contactNormal;
-					info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
-					info2.m_J2linearAxis = 0;
-					info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
-					info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this
-					///the size of btSolverConstraint needs be a multiple of btScalar
-					btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint));
-					info2.m_constraintError = &currentConstraintRow->m_rhs;
-					currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
-					info2.cfm = &currentConstraintRow->m_cfm;
-					info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
-					info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
-					info2.m_numIterations = infoGlobal.m_numIterations;
-					constraints[i]->getInfo2(&info2);
-					int idA = constraint->getRigidBodyA().getCompanionId();
-					int idB = constraint->getRigidBodyB().getCompanionId();
-					///finalize the constraint setup
-					for ( j=0;j<info1.m_numConstraintRows;j++)
-					{
-						btSolverConstraint& solverConstraint = currentConstraintRow[j];
-						solverConstraint.m_originalContactPoint = constraint;
-						solverConstraint.m_companionIdA = idA;
-						solverConstraint.m_companionIdB = idB;
-						{
-							const btVector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
-							solverConstraint.m_angularComponentA = constraint->getRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor();
-						}
-						{
-							const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
-							solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor();
-						}
-						{
-							btVector3 iMJlA = solverConstraint.m_contactNormal*rbA.getInvMass();
-							btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal;
-							btVector3 iMJlB = solverConstraint.m_contactNormal*rbB.getInvMass();//sign of normal?
-							btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal;
-							btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal);
-							sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
-							sum += iMJlB.dot(solverConstraint.m_contactNormal);
-							sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
-							solverConstraint.m_jacDiagABInv = btScalar(1.)/sum;
-						}
-						///fix rhs
-						///todo: add force/torque accelerators
-						{
-							btScalar rel_vel;
-							btScalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.getLinearVelocity()) + solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity());
-							btScalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.getLinearVelocity()) + solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity());
-							rel_vel = vel1Dotn+vel2Dotn;
-							btScalar restitution = 0.f;
-							btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
-							btScalar	velocityError = restitution - rel_vel;// * damping;
-							btScalar	penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
-							btScalar	velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
-							solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
-							solverConstraint.m_appliedImpulse = 0.f;
-						}
-					}
-					PfxConstraintPair& pair = jointPairs[actualNumJoints];
-					int numConstraintRows= info1.m_numConstraintRows;
-					pfxSetNumConstraints(pair,numConstraintRows);
-					pfxSetRigidBodyIdA(pair,idA);
-					pfxSetRigidBodyIdB(pair,idB);
-					//is this needed?
-					if (idA>=0)
-						pfxSetMotionMaskA(pair,m_memoryCache->m_mystates[idA].getMotionMask());
-					if (idB>=0)
-						pfxSetMotionMaskB(pair,m_memoryCache->m_mystates[idB].getMotionMask());
-					pfxSetActive(pair,true);
-					int id = currentConstraintRow-offsetSolverConstraints;
-					pfxSetContactId(pair,id);
-					actualNumJoints++;
-				}
-				currentRow+=m_tmpConstraintSizesPool[i].m_numConstraintRows;
-			}
-		}
-	}
-	float separateBias=0.1;//info.m_erp;//or m_erp2?
-	float timeStep=infoGlobal.m_timeStep;
-	int iteration=infoGlobal.m_numIterations;
-	//create a pair for each constraints, copy over info etc
-	{
-		BT_PROFILE("compute num contacts");
-		int totalContacts =0;
-		for (int i=0;i<actualNumManifolds;i++)
-		{
-			PfxConstraintPair* pair = &m_memoryCache->m_mypairs[i];
-			totalContacts += pfxGetNumConstraints(*pair);
-		}
-		//printf("numManifolds = %d\n",numManifolds);
-		//printf("totalContacts=%d\n",totalContacts);
-	}
-//	printf("actualNumManifolds=%d\n",actualNumManifolds);
-	{
-		BT_PROFILE("BPE_customConstraintSolverSequentialNew");
-		if (numRigidBodies>0 && (actualNumManifolds+actualNumJoints)>0)
-		{
-//			PFX_PRINTF("num points = %d\n",totalPoints);
-//			PFX_PRINTF("num points PFX = %d\n",total);
-			BPE_customConstraintSolverSequentialNew(
-				actualNumManifolds,
-				&m_memoryCache->m_mypairs[0],
-				offsetContactManifolds,
-				&m_memoryCache->m_mystates[0],numRigidBodies,
-				&m_memoryCache->m_mysolverbodies[0],
-				jointPairs,actualNumJoints,
-				offsetSolverConstraints,
-				separateBias,timeStep,iteration,
-				m_solverThreadSupport,m_criticalSection,m_solverIO,m_barrier);
-		}
-	}
-	//copy results back to bodies
-	{
-		BT_PROFILE("copy back");
-		for (int i=0;i<numRigidBodies;i++)
-		{
-			btCollisionObject* obj = bodies1[i];
-			btRigidBody* rb = btRigidBody::upcast(obj);
-			TrbState& state = m_memoryCache->m_mystates[i];
-			if (rb && (rb->getInvMass()>0.f))
-			{
-				rb->setLinearVelocity(btVector3(state.getLinearVelocity().getX(),state.getLinearVelocity().getY(),state.getLinearVelocity().getZ()));
-				rb->setAngularVelocity(btVector3(state.getAngularVelocity().getX(),state.getAngularVelocity().getY(),state.getAngularVelocity().getZ()));
-			}
-		}
-	}
-	return 0.f;
diff --git a/src/bullet/BulletMultiThreaded/btParallelConstraintSolver.h b/src/bullet/BulletMultiThreaded/btParallelConstraintSolver.h
deleted file mode 100644
index 7c0268e7..00000000
--- a/src/bullet/BulletMultiThreaded/btParallelConstraintSolver.h
+++ /dev/null
@@ -1,285 +0,0 @@
-   Copyright (C) 2010 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
-#include "LinearMath/btScalar.h"
-#include "PlatformDefinitions.h"
-#define PFX_MAX_SOLVER_PAIRS  128
-#ifdef __CELLOS_LV2__
-ATTRIBUTE_ALIGNED128(struct) PfxParallelBatch {
-ATTRIBUTE_ALIGNED16(struct) PfxParallelBatch {
-	uint16_t pairIndices[PFX_MAX_SOLVER_PAIRS];
-#ifdef __CELLOS_LV2__
-ATTRIBUTE_ALIGNED128(struct) PfxParallelGroup {
-ATTRIBUTE_ALIGNED16(struct) PfxParallelGroup {
-	uint16_t numPhases;
-	uint16_t numBatches[PFX_MAX_SOLVER_PHASES];
-ATTRIBUTE_ALIGNED16(struct) PfxSortData16 {
-	union {
-		uint8_t   i8data[16];
-		uint16_t  i16data[8];
-		uint32_t  i32data[4];
-#ifdef __SPU__
-		vec_uint4 vdata;
-	};
-#ifdef __SPU__
-	void set8(int elem,uint8_t data)   {vdata=(vec_uint4)spu_insert(data,(vec_uchar16)vdata,elem);}
-	void set16(int elem,uint16_t data) {vdata=(vec_uint4)spu_insert(data,(vec_ushort8)vdata,elem);}
-	void set32(int elem,uint32_t data) {vdata=(vec_uint4)spu_insert(data,(vec_uint4)vdata,elem);}
-	uint8_t get8(int elem)   const {return spu_extract((vec_uchar16)vdata,elem);}
-	uint16_t get16(int elem) const {return spu_extract((vec_ushort8)vdata,elem);}
-	uint32_t get32(int elem) const {return spu_extract((vec_uint4)vdata,elem);}
-	void set8(int elem,uint8_t data)   {i8data[elem] = data;}
-	void set16(int elem,uint16_t data) {i16data[elem] = data;}
-	void set32(int elem,uint32_t data) {i32data[elem] = data;}
-	uint8_t get8(int elem)   const {return i8data[elem];}
-	uint16_t get16(int elem) const {return i16data[elem];}
-	uint32_t get32(int elem) const {return i32data[elem];}
-typedef PfxSortData16 PfxConstraintPair;
-//J	PfxBroadphasePairと共通
-SIMD_FORCE_INLINE void pfxSetConstraintId(PfxConstraintPair &pair,uint32_t i)	{pair.set32(2,i);}
-SIMD_FORCE_INLINE void pfxSetNumConstraints(PfxConstraintPair &pair,uint8_t n)	{pair.set8(7,n);}
-SIMD_FORCE_INLINE uint32_t pfxGetConstraintId1(const PfxConstraintPair &pair)	{return pair.get32(2);}
-SIMD_FORCE_INLINE uint8_t  pfxGetNumConstraints(const PfxConstraintPair &pair)	{return pair.get8(7);}
-typedef PfxSortData16 PfxBroadphasePair;
-SIMD_FORCE_INLINE void pfxSetRigidBodyIdA(PfxBroadphasePair &pair,uint16_t i)	{pair.set16(0,i);}
-SIMD_FORCE_INLINE void pfxSetRigidBodyIdB(PfxBroadphasePair &pair,uint16_t i)	{pair.set16(1,i);}
-SIMD_FORCE_INLINE void pfxSetMotionMaskA(PfxBroadphasePair &pair,uint8_t i)		{pair.set8(4,i);}
-SIMD_FORCE_INLINE void pfxSetMotionMaskB(PfxBroadphasePair &pair,uint8_t i)		{pair.set8(5,i);}
-SIMD_FORCE_INLINE void pfxSetBroadphaseFlag(PfxBroadphasePair &pair,uint8_t f)	{pair.set8(6,(pair.get8(6)&0xf0)|(f&0x0f));}
-SIMD_FORCE_INLINE void pfxSetActive(PfxBroadphasePair &pair,bool b)			{pair.set8(6,(pair.get8(6)&0x0f)|((b?1:0)<<4));}
-SIMD_FORCE_INLINE void pfxSetContactId(PfxBroadphasePair &pair,uint32_t i)		{pair.set32(2,i);}
-SIMD_FORCE_INLINE uint16_t pfxGetRigidBodyIdA(const PfxBroadphasePair &pair)	{return pair.get16(0);}
-SIMD_FORCE_INLINE uint16_t pfxGetRigidBodyIdB(const PfxBroadphasePair &pair)	{return pair.get16(1);}
-SIMD_FORCE_INLINE uint8_t  pfxGetMotionMaskA(const PfxBroadphasePair &pair)		{return pair.get8(4);}
-SIMD_FORCE_INLINE uint8_t  pfxGetMotionMaskB(const PfxBroadphasePair &pair)		{return pair.get8(5);}
-SIMD_FORCE_INLINE uint8_t  pfxGetBroadphaseFlag(const PfxBroadphasePair &pair)	{return pair.get8(6)&0x0f;}
-SIMD_FORCE_INLINE bool     pfxGetActive(const PfxBroadphasePair &pair)			{return (pair.get8(6)>>4)!=0;}
-SIMD_FORCE_INLINE uint32_t pfxGetContactId1(const PfxBroadphasePair &pair)		{return pair.get32(2);}
-#if defined(__PPU__) || defined (__SPU__)
-ATTRIBUTE_ALIGNED128(struct) PfxSolverBody {
-ATTRIBUTE_ALIGNED16(struct) PfxSolverBody {
-	vmVector3 mDeltaLinearVelocity;
-	vmVector3 mDeltaAngularVelocity;
-	vmMatrix3 mInertiaInv;
-	vmQuat    mOrientation;
-	float   mMassInv;
-	float   friction;
-	float   restitution;
-	float   unused;
-	float   unused2;
-	float   unused3;
-	float   unused4;
-	float   unused5;
-#ifdef __PPU__
-#include "SpuDispatch/BulletPE2ConstraintSolverSpursSupport.h"
-static SIMD_FORCE_INLINE vmVector3 btReadVector3(const double* p)
-	float tmp[3] = {float(p[0]),float(p[1]),float(p[2])};
-	vmVector3 v;
-	loadXYZ(v, tmp);
-	return v;
-static SIMD_FORCE_INLINE vmQuat btReadQuat(const double* p)
-	float tmp[4] = {float(p[0]),float(p[1]),float(p[2]),float(p[4])};
-	vmQuat vq;
-	loadXYZW(vq, tmp);
-	return vq;
-static SIMD_FORCE_INLINE void btStoreVector3(const vmVector3 &src, double* p)
-	float tmp[3];
-	vmVector3 v = src;
-	storeXYZ(v, tmp);
-	p[0] = tmp[0];
-	p[1] = tmp[1];
-	p[2] = tmp[2];
-static SIMD_FORCE_INLINE vmVector3 btReadVector3(const float* p)
-	vmVector3 v;
-	loadXYZ(v, p);
-	return v;
-static SIMD_FORCE_INLINE vmQuat btReadQuat(const float* p)
-	vmQuat vq;
-	loadXYZW(vq, p);
-	return vq;
-static SIMD_FORCE_INLINE void btStoreVector3(const vmVector3 &src, float* p)
-	vmVector3 v = src;
-	storeXYZ(v, p);
-class btPersistentManifold;
-enum {
-struct PfxSetupContactConstraintsIO {
-	PfxConstraintPair *offsetContactPairs;
-	uint32_t numContactPairs1;
-	btPersistentManifold*	offsetContactManifolds;
-	class TrbState *offsetRigStates;
-	struct PfxSolverBody *offsetSolverBodies;
-	uint32_t numRigidBodies;
-	float separateBias;
-	float timeStep;
-	class btCriticalSection* criticalSection;
-struct PfxSolveConstraintsIO {
-	PfxParallelGroup *contactParallelGroup;
-	PfxParallelBatch *contactParallelBatches;
-	PfxConstraintPair *contactPairs;
-	uint32_t numContactPairs;
-	btPersistentManifold *offsetContactManifolds;
-	PfxParallelGroup *jointParallelGroup;
-	PfxParallelBatch *jointParallelBatches;
-	PfxConstraintPair *jointPairs;
-	uint32_t numJointPairs;
-	struct btSolverConstraint* offsetSolverConstraints;
-	TrbState *offsetRigStates1;
-	PfxSolverBody *offsetSolverBodies;
-	uint32_t numRigidBodies;
-	uint32_t iteration;
-	uint32_t	taskId;
-	class btBarrier* barrier;
-struct PfxPostSolverIO {
-	TrbState *states;
-	PfxSolverBody *solverBodies;
-	uint32_t numRigidBodies;
-ATTRIBUTE_ALIGNED16(struct) btConstraintSolverIO {
-	uint8_t cmd;
-	union {
-		PfxSetupContactConstraintsIO setupContactConstraints;
-		PfxSolveConstraintsIO solveConstraints;
-		PfxPostSolverIO postSolver;
-	};
-	//SPU only
-	uint32_t barrierAddr2;
-	uint32_t criticalsectionAddr2;
-	uint32_t maxTasks1;
-void	SolverThreadFunc(void* userPtr,void* lsMemory);
-void*	SolverlsMemoryFunc();
-///The btParallelConstraintSolver performs computations on constraint rows in parallel
-///Using the cross-platform threading it supports Windows, Linux, Mac OSX and PlayStation 3 Cell SPUs
-class btParallelConstraintSolver : public btSequentialImpulseConstraintSolver
-	struct btParallelSolverMemoryCache*	m_memoryCache;
-	class btThreadSupportInterface*	m_solverThreadSupport;
-	struct btConstraintSolverIO* m_solverIO;
-	class btBarrier*			m_barrier;
-	class btCriticalSection*	m_criticalSection;
-	btParallelConstraintSolver(class btThreadSupportInterface* solverThreadSupport);
-	virtual ~btParallelConstraintSolver();
-	virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher);
\ No newline at end of file
diff --git a/src/bullet/BulletMultiThreaded/btThreadSupportInterface.h b/src/bullet/BulletMultiThreaded/btThreadSupportInterface.h
deleted file mode 100644
index 16850e22..00000000
--- a/src/bullet/BulletMultiThreaded/btThreadSupportInterface.h
+++ /dev/null
@@ -1,85 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <LinearMath/btScalar.h> //for ATTRIBUTE_ALIGNED16
-#include "PlatformDefinitions.h"
-#include "PpuAddressSpace.h"
-class btBarrier {
-	btBarrier() {}
-	virtual ~btBarrier() {}
-	virtual void sync() = 0;
-	virtual void setMaxCount(int n) = 0;
-	virtual int  getMaxCount() = 0;
-class btCriticalSection {
-	btCriticalSection() {}
-	virtual ~btCriticalSection() {}
-	ATTRIBUTE_ALIGNED16(unsigned int mCommonBuff[32]);
-	virtual unsigned int getSharedParam(int i) = 0;
-	virtual void setSharedParam(int i,unsigned int p) = 0;
-	virtual void lock() = 0;
-	virtual void unlock() = 0;
-class btThreadSupportInterface
-	virtual ~btThreadSupportInterface();
-///send messages to SPUs
-	virtual void sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t uiArgument1) =0;
-///check for messages from SPUs
-	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1) =0;
-	///non-blocking test if a task is completed. First implement all versions, and then enable this API
-	///virtual bool isTaskCompleted(unsigned int *puiArgument0, unsigned int *puiArgument1, int timeOutInMilliseconds)=0;
-///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
-	virtual	void startSPU() =0;
-///tell the task scheduler we are done with the SPU tasks
-	virtual	void stopSPU()=0;
-	///tell the task scheduler to use no more than numTasks tasks
-	virtual void	setNumTasks(int numTasks)=0;
-	virtual int		getNumTasks() const = 0;
-	virtual btBarrier*	createBarrier() = 0;
-	virtual btCriticalSection* createCriticalSection() = 0;
-	virtual void*	getThreadLocalMemory(int taskId) { return 0; }
diff --git a/src/bullet/BulletMultiThreaded/vectormath2bullet.h b/src/bullet/BulletMultiThreaded/vectormath2bullet.h
deleted file mode 100644
index 11ee33ad..00000000
--- a/src/bullet/BulletMultiThreaded/vectormath2bullet.h
+++ /dev/null
@@ -1,73 +0,0 @@
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-#include "PlatformDefinitions.h"
-#include "LinearMath/btVector3.h"
-#include "LinearMath/btQuaternion.h"
-#include "LinearMath/btMatrix3x3.h"
-inline Vectormath::Aos::Vector3	getVmVector3(const btVector3& bulletVec)
-	return Vectormath::Aos::Vector3(bulletVec.getX(),bulletVec.getY(),bulletVec.getZ());
-inline btVector3 getBtVector3(const Vectormath::Aos::Vector3& vmVec)
-	return btVector3(vmVec.getX(),vmVec.getY(),vmVec.getZ());
-inline btVector3 getBtVector3(const Vectormath::Aos::Point3& vmVec)
-	return btVector3(vmVec.getX(),vmVec.getY(),vmVec.getZ());
-inline Vectormath::Aos::Quat	getVmQuat(const btQuaternion& bulletQuat)
-	Vectormath::Aos::Quat vmQuat(bulletQuat.getX(),bulletQuat.getY(),bulletQuat.getZ(),bulletQuat.getW());
-	return vmQuat;
-inline btQuaternion	getBtQuat(const Vectormath::Aos::Quat& vmQuat)
-	return btQuaternion (vmQuat.getX(),vmQuat.getY(),vmQuat.getZ(),vmQuat.getW());
-inline Vectormath::Aos::Matrix3	getVmMatrix3(const btMatrix3x3& btMat)
-	Vectormath::Aos::Matrix3 mat(
-		getVmVector3(btMat.getColumn(0)),
-		getVmVector3(btMat.getColumn(1)),
-		getVmVector3(btMat.getColumn(2)));
-		return mat;
diff --git a/src/bullet/BulletSoftBody/btDefaultSoftBodySolver.cpp b/src/bullet/BulletSoftBody/btDefaultSoftBodySolver.cpp
index d1435b65..e90d24e6 100644
--- a/src/bullet/BulletSoftBody/btDefaultSoftBodySolver.cpp
+++ b/src/bullet/BulletSoftBody/btDefaultSoftBodySolver.cpp
@@ -130,9 +130,9 @@ void btDefaultSoftBodySolver::processCollision( btSoftBody* softBody, btSoftBody
 // For the default solver just leave the soft body to do its collision processing
-void btDefaultSoftBodySolver::processCollision( btSoftBody *softBody, btCollisionObject* collisionObject )
+void btDefaultSoftBodySolver::processCollision( btSoftBody *softBody, const btCollisionObjectWrapper* collisionObjectWrap )
-	softBody->defaultCollisionHandler( collisionObject );
+	softBody->defaultCollisionHandler( collisionObjectWrap );
 } // btDefaultSoftBodySolver::processCollision
diff --git a/src/bullet/BulletSoftBody/btDefaultSoftBodySolver.h b/src/bullet/BulletSoftBody/btDefaultSoftBodySolver.h
index 7d9092ce..1c17ffcb 100644
--- a/src/bullet/BulletSoftBody/btDefaultSoftBodySolver.h
+++ b/src/bullet/BulletSoftBody/btDefaultSoftBodySolver.h
@@ -19,7 +19,7 @@ subject to the following restrictions:
 #include "BulletSoftBody/btSoftBodySolvers.h"
 #include "btSoftBodySolverVertexBuffer.h"
+struct btCollisionObjectWrapper;
 class btDefaultSoftBodySolver : public btSoftBodySolver
@@ -54,7 +54,7 @@ public:
 	virtual void copySoftBodyToVertexBuffer( const btSoftBody *const softBody, btVertexBufferDescriptor *vertexBuffer );
-	virtual void processCollision( btSoftBody *, btCollisionObject* );
+	virtual void processCollision( btSoftBody *, const btCollisionObjectWrapper* );
 	virtual void processCollision( btSoftBody*, btSoftBody* );
diff --git a/src/bullet/BulletSoftBody/btSoftBody.cpp b/src/bullet/BulletSoftBody/btSoftBody.cpp
index d1b5eb43..51f4b33d 100644
--- a/src/bullet/BulletSoftBody/btSoftBody.cpp
+++ b/src/bullet/BulletSoftBody/btSoftBody.cpp
@@ -105,12 +105,12 @@ void	btSoftBody::initDefaults()
 	/* Collision shape	*/ 
 	///for now, create a collision shape internally
 	m_collisionShape = new btSoftBodyCollisionShape(this);
-	m_collisionShape->setMargin(0.25);
+	m_collisionShape->setMargin(0.25f);
 	m_windVelocity = btVector3(0,0,0);
+	m_restLengthScale = btScalar(1.0);
@@ -460,8 +460,8 @@ void			btSoftBody::addAeroForceToNode(const btVector3& windVelocity,int nodeInde
 	const btScalar dt = m_sst.sdt;
 	const btScalar kLF = m_cfg.kLF;
 	const btScalar kDG = m_cfg.kDG;
-	const btScalar kPR = m_cfg.kPR;
-	const btScalar kVC = m_cfg.kVC;
+	//const btScalar kPR = m_cfg.kPR;
+	//const btScalar kVC = m_cfg.kVC;
 	const bool as_lift = kLF>0;
 	const bool as_drag = kDG>0;
 	const bool as_aero = as_lift || as_drag;
@@ -505,6 +505,18 @@ void			btSoftBody::addAeroForceToNode(const btVector3& windVelocity,int nodeInde
 					if ( 0 < n_dot_v && n_dot_v < 0.98480f)
 						fLift = 0.5f * kLF * medium.m_density * rel_v_len * tri_area * btSqrt(1.0f-n_dot_v*n_dot_v) * (nrm.cross(rel_v_nrm).cross(rel_v_nrm));
+					// Check whether the velocity change caused by the aero drag force exceeds the node's current velocity; if so, the drag force is scaled down below.
+					btVector3 del_v_by_fDrag = fDrag*n.m_im*m_sst.sdt;										
+					btScalar del_v_by_fDrag_len2 = del_v_by_fDrag.length2();
+					btScalar v_len2 = n.m_v.length2();
+					if (del_v_by_fDrag_len2 >= v_len2 && del_v_by_fDrag_len2 > 0)
+					{
+						btScalar del_v_by_fDrag_len = del_v_by_fDrag.length();
+						btScalar v_len = n.m_v.length();
+						fDrag *= btScalar(0.8)*(v_len / del_v_by_fDrag_len);
+					}
 					n.m_f += fDrag;
 					n.m_f += fLift;
@@ -535,8 +547,8 @@ void			btSoftBody::addAeroForceToFace(const btVector3& windVelocity,int faceInde
 	const btScalar dt = m_sst.sdt;
 	const btScalar kLF = m_cfg.kLF;
 	const btScalar kDG = m_cfg.kDG;
-	const btScalar kPR = m_cfg.kPR;
-	const btScalar kVC = m_cfg.kVC;
+//	const btScalar kPR = m_cfg.kPR;
+//	const btScalar kVC = m_cfg.kVC;
 	const bool as_lift = kLF>0;
 	const bool as_drag = kDG>0;
 	const bool as_aero = as_lift || as_drag;
@@ -586,6 +598,18 @@ void			btSoftBody::addAeroForceToFace(const btVector3& windVelocity,int faceInde
 					if (f.m_n[j]->m_im>0)
+						// Check whether the velocity change caused by the aero drag force exceeds the node's current velocity; if so, the drag force is scaled down below.
+						btVector3 del_v_by_fDrag = fDrag*f.m_n[j]->m_im*m_sst.sdt;										
+						btScalar del_v_by_fDrag_len2 = del_v_by_fDrag.length2();
+						btScalar v_len2 = f.m_n[j]->m_v.length2();
+						if (del_v_by_fDrag_len2 >= v_len2 && del_v_by_fDrag_len2 > 0)
+						{
+							btScalar del_v_by_fDrag_len = del_v_by_fDrag.length();
+							btScalar v_len = f.m_n[j]->m_v.length();
+							fDrag *= btScalar(0.8)*(v_len / del_v_by_fDrag_len);
+						}
 						f.m_n[j]->m_f += fDrag; 
 						f.m_n[j]->m_f += fLift;
@@ -816,6 +840,27 @@ void			btSoftBody::scale(const btVector3& scl)
+btScalar btSoftBody::getRestLengthScale()
+	return m_restLengthScale;
+void btSoftBody::setRestLengthScale(btScalar restLengthScale)
+	for(int i=0, ni=m_links.size(); i<ni; ++i)
+	{
+		Link&		l=m_links[i];
+		l.m_rl	=	l.m_rl / m_restLengthScale * restLengthScale;
+		l.m_c1	=	l.m_rl*l.m_rl;
+	}
+	m_restLengthScale = restLengthScale;
+	if (getActivationState() == ISLAND_SLEEPING)
+		activate();
 void			btSoftBody::setPose(bool bvolume,bool bframe)
@@ -863,9 +908,20 @@ void			btSoftBody::setPose(bool bvolume,bool bframe)
+void				btSoftBody::resetLinkRestLengths()
+	for(int i=0, ni=m_links.size();i<ni;++i)
+	{
+		Link& l =	m_links[i];
+		l.m_rl	=	(l.m_n[0]->m_x-l.m_n[1]->m_x).length();
+		l.m_c1	=	l.m_rl*l.m_rl;
+	}
 btScalar		btSoftBody::getVolume() const
@@ -1388,12 +1444,12 @@ void			btSoftBody::refine(ImplicitFn* ifn,btScalar accurary,bool cut)
-						{ a.m_im/=0.5;m=1/a.m_im; }
+						{ a.m_im/=0.5f;m=1/a.m_im; }
-						{ b.m_im/=0.5;m=1/b.m_im; }
+						{ b.m_im/=0.5f;m=1/b.m_im; }
@@ -1473,7 +1529,7 @@ void			btSoftBody::refine(ImplicitFn* ifn,btScalar accurary,bool cut)
 				const btVector3	v=m_nodes[i].m_v;
 				btScalar		m=getMass(i);
-				if(m>0) { m*=0.5;m_nodes[i].m_im/=0.5; }
+				if(m>0) { m*=0.5f;m_nodes[i].m_im/=0.5f; }
@@ -1587,7 +1643,7 @@ bool			btSoftBody::cutLink(int node0,int node1,btScalar position)
 	bool			done=false;
 	int i,ni;
-	const btVector3	d=m_nodes[node0].m_x-m_nodes[node1].m_x;
+//	const btVector3	d=m_nodes[node0].m_x-m_nodes[node1].m_x;
 	const btVector3	x=Lerp(m_nodes[node0].m_x,m_nodes[node1].m_x,position);
 	const btVector3	v=Lerp(m_nodes[node0].m_v,m_nodes[node1].m_v,position);
 	const btScalar	m=1;
@@ -1711,7 +1767,23 @@ void			btSoftBody::predictMotion(btScalar dt)
 		Node&	n=m_nodes[i];
 		n.m_q	=	n.m_x;
-		n.m_v	+=	n.m_f*n.m_im*m_sst.sdt;
+		btVector3 deltaV = n.m_f*n.m_im*m_sst.sdt;
+		{
+			btScalar maxDisplacement = m_worldInfo->m_maxDisplacement;
+			btScalar clampDeltaV = maxDisplacement/m_sst.sdt;
+			for (int c=0;c<3;c++)
+			{
+				if (deltaV[c]>clampDeltaV)
+				{
+					deltaV[c] = clampDeltaV;
+				}
+				if (deltaV[c]<-clampDeltaV)
+				{
+					deltaV[c]=-clampDeltaV;
+				}
+			}
+		}
+		n.m_v	+=	deltaV;
 		n.m_x	+=	n.m_v*m_sst.sdt;
 		n.m_f	=	btVector3(0,0,0);
@@ -2171,15 +2243,18 @@ btVector3		btSoftBody::evaluateCom() const
-bool				btSoftBody::checkContact(	btCollisionObject* colObj,
+bool				btSoftBody::checkContact(	const btCollisionObjectWrapper* colObjWrap,
 											 const btVector3& x,
 											 btScalar margin,
 											 btSoftBody::sCti& cti) const
 	btVector3 nrm;
-	btCollisionShape *shp = colObj->getCollisionShape();
-	btRigidBody *tmpRigid = btRigidBody::upcast(colObj);
-	const btTransform &wtr = tmpRigid ? tmpRigid->getWorldTransform() : colObj->getWorldTransform();
+	const btCollisionShape *shp = colObjWrap->getCollisionShape();
+//	const btRigidBody *tmpRigid = btRigidBody::upcast(colObjWrap->getCollisionObject());
+	//const btTransform &wtr = tmpRigid ? tmpRigid->getWorldTransform() : colObjWrap->getWorldTransform();
+	const btTransform &wtr = colObjWrap->getWorldTransform();
+	//todo: check which transform is needed here
 	btScalar dst = 
@@ -2188,7 +2263,7 @@ bool				btSoftBody::checkContact(	btCollisionObject* colObj,
-		cti.m_colObj = colObj;
+		cti.m_colObj = colObjWrap->getCollisionObject();
 		cti.m_normal = wtr.getBasis()*nrm;
 		cti.m_offset = -btDot( cti.m_normal, x - cti.m_normal * dst );
@@ -2304,51 +2379,93 @@ void					btSoftBody::updatePose()
-void				btSoftBody::updateConstants()
+void				btSoftBody::updateArea(bool averageArea)
 	int i,ni;
-	/* Links		*/ 
-	for(i=0,ni=m_links.size();i<ni;++i)
-	{
-		Link&		l=m_links[i];
-		Material&	m=*l.m_material;
-		l.m_rl	=	(l.m_n[0]->m_x-l.m_n[1]->m_x).length();
-		l.m_c0	=	(l.m_n[0]->m_im+l.m_n[1]->m_im)/m.m_kLST;
-		l.m_c1	=	l.m_rl*l.m_rl;
-	}
-	/* Faces		*/ 
+	/* Face area		*/ 
 		Face&		f=m_faces[i];
 		f.m_ra	=	AreaOf(f.m_n[0]->m_x,f.m_n[1]->m_x,f.m_n[2]->m_x);
-	/* Area's		*/ 
-	btAlignedObjectArray<int>	counts;
-	counts.resize(m_nodes.size(),0);
-	for(i=0,ni=m_nodes.size();i<ni;++i)
+	/* Node area		*/ 
+	if (averageArea)
-		m_nodes[i].m_area	=	0;
+		btAlignedObjectArray<int>	counts;
+		counts.resize(m_nodes.size(),0);
+		for(i=0,ni=m_nodes.size();i<ni;++i)
+		{
+			m_nodes[i].m_area	=	0;
+		}
+		for(i=0,ni=m_faces.size();i<ni;++i)
+		{
+			btSoftBody::Face&	f=m_faces[i];
+			for(int j=0;j<3;++j)
+			{
+				const int index=(int)(f.m_n[j]-&m_nodes[0]);
+				counts[index]++;
+				f.m_n[j]->m_area+=btFabs(f.m_ra);
+			}
+		}
+		for(i=0,ni=m_nodes.size();i<ni;++i)
+		{
+			if(counts[i]>0)
+				m_nodes[i].m_area/=(btScalar)counts[i];
+			else
+				m_nodes[i].m_area=0;
+		}
-	for(i=0,ni=m_faces.size();i<ni;++i)
+	else
-		btSoftBody::Face&	f=m_faces[i];
-		for(int j=0;j<3;++j)
+		// initialize node area as zero
+		for(i=0,ni=m_nodes.size();i<ni;++i)
+		{
+			m_nodes[i].m_area=0;	
+		}
+		for(i=0,ni=m_faces.size();i<ni;++i)
+		{
+			btSoftBody::Face&	f=m_faces[i];
+			for(int j=0;j<3;++j)
+			{
+				f.m_n[j]->m_area += f.m_ra;
+			}
+		}
+		for(i=0,ni=m_nodes.size();i<ni;++i)
-			const int index=(int)(f.m_n[j]-&m_nodes[0]);
-			counts[index]++;
-			f.m_n[j]->m_area+=btFabs(f.m_ra);
+			m_nodes[i].m_area *= 0.3333333f;
-	for(i=0,ni=m_nodes.size();i<ni;++i)
+void				btSoftBody::updateLinkConstants()
+	int i,ni;
+	/* Links		*/ 
+	for(i=0,ni=m_links.size();i<ni;++i)
-		if(counts[i]>0)
-			m_nodes[i].m_area/=(btScalar)counts[i];
-		else
-			m_nodes[i].m_area=0;
+		Link&		l=m_links[i];
+		Material&	m=*l.m_material;
+		l.m_c0	=	(l.m_n[0]->m_im+l.m_n[1]->m_im)/m.m_kLST;
+void				btSoftBody::updateConstants()
+	resetLinkRestLengths();
+	updateLinkConstants();
+	updateArea();
 void					btSoftBody::initializeClusters()
@@ -2817,7 +2934,7 @@ void				btSoftBody::applyForces()
 	BT_PROFILE("SoftBody applyForces");
-	const btScalar					dt =			m_sst.sdt;
+//	const btScalar					dt =			m_sst.sdt;
 	const btScalar					kLF =			m_cfg.kLF;
 	const btScalar					kDG =			m_cfg.kDG;
 	const btScalar					kPR =			m_cfg.kPR;
@@ -2828,10 +2945,10 @@ void				btSoftBody::applyForces()
 	const bool						as_volume =		kVC>0;
 	const bool						as_aero =		as_lift	||
 													as_drag		;
-	const bool						as_vaero =		as_aero	&&
-													(m_cfg.aeromodel < btSoftBody::eAeroModel::F_TwoSided);
-	const bool						as_faero =		as_aero	&&
-													(m_cfg.aeromodel >= btSoftBody::eAeroModel::F_TwoSided);
+	//const bool						as_vaero =		as_aero	&&
+	//												(m_cfg.aeromodel < btSoftBody::eAeroModel::F_TwoSided);
+	//const bool						as_faero =		as_aero	&&
+	//												(m_cfg.aeromodel >= btSoftBody::eAeroModel::F_TwoSided);
 	const bool						use_medium =	as_aero;
 	const bool						use_volume =	as_pressure	||
 		as_volume	;
@@ -2874,7 +2991,7 @@ void				btSoftBody::applyForces()
 	/* Per face forces				*/ 
-		btSoftBody::Face&	f=m_faces[i];
+	//	btSoftBody::Face&	f=m_faces[i];
 		/* Aerodynamics			*/ 
 		addAeroForceToFace(m_windVelocity, i);	
@@ -2910,21 +3027,23 @@ void btSoftBody::PSolve_RContacts(btSoftBody* psb, btScalar kst, btScalar ti)
 		const RContact&		c = psb->m_rcontacts[i];
 		const sCti&			cti = c.m_cti;	
-		btRigidBody* tmpRigid = btRigidBody::upcast(cti.m_colObj);
-		const btVector3		va = tmpRigid ? tmpRigid->getVelocityInLocalPoint(c.m_c1)*dt : btVector3(0,0,0);
-		const btVector3		vb = c.m_node->m_x-c.m_node->m_q;	
-		const btVector3		vr = vb-va;
-		const btScalar		dn = btDot(vr, cti.m_normal);		
-		if(dn<=SIMD_EPSILON)
+		if (cti.m_colObj->hasContactResponse()) 
-			const btScalar		dp = btMin( (btDot(c.m_node->m_x, cti.m_normal) + cti.m_offset), mrg );
-			const btVector3		fv = vr - (cti.m_normal * dn);
-			// c0 is the impulse matrix, c3 is 1 - the friction coefficient or 0, c4 is the contact hardness coefficient
-			const btVector3		impulse = c.m_c0 * ( (vr - (fv * c.m_c3) + (cti.m_normal * (dp * c.m_c4))) * kst );
-			c.m_node->m_x -= impulse * c.m_c2;
-			if (tmpRigid)
-				tmpRigid->applyImpulse(impulse,c.m_c1);
+			btRigidBody* tmpRigid = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
+			const btVector3		va = tmpRigid ? tmpRigid->getVelocityInLocalPoint(c.m_c1)*dt : btVector3(0,0,0);
+			const btVector3		vb = c.m_node->m_x-c.m_node->m_q;	
+			const btVector3		vr = vb-va;
+			const btScalar		dn = btDot(vr, cti.m_normal);		
+			if(dn<=SIMD_EPSILON)
+			{
+				const btScalar		dp = btMin( (btDot(c.m_node->m_x, cti.m_normal) + cti.m_offset), mrg );
+				const btVector3		fv = vr - (cti.m_normal * dn);
+				// c0 is the impulse matrix, c3 is 1 - the friction coefficient or 0, c4 is the contact hardness coefficient
+				const btVector3		impulse = c.m_c0 * ( (vr - (fv * c.m_c3) + (cti.m_normal * (dp * c.m_c4))) * kst );
+				c.m_node->m_x -= impulse * c.m_c2;
+				if (tmpRigid)
+					tmpRigid->applyImpulse(impulse,c.m_c1);
+			}
@@ -3031,7 +3150,7 @@ btSoftBody::vsolver_t	btSoftBody::getSolver(eVSolver::_ solver)
-void			btSoftBody::defaultCollisionHandler(btCollisionObject* pco)
+void			btSoftBody::defaultCollisionHandler(const btCollisionObjectWrapper* pcoWrap)
@@ -3039,22 +3158,22 @@ void			btSoftBody::defaultCollisionHandler(btCollisionObject* pco)
 	case	fCollision::SDF_RS:
 			btSoftColliders::CollideSDF_RS	docollide;		
-			btRigidBody*		prb1=btRigidBody::upcast(pco);
-			btTransform	wtr=pco->getWorldTransform();
+			btRigidBody*		prb1=(btRigidBody*) btRigidBody::upcast(pcoWrap->getCollisionObject());
+			btTransform	wtr=pcoWrap->getWorldTransform();
-			const btTransform	ctr=pco->getWorldTransform();
+			const btTransform	ctr=pcoWrap->getWorldTransform();
 			const btScalar		timemargin=(wtr.getOrigin()-ctr.getOrigin()).length();
 			const btScalar		basemargin=getCollisionShape()->getMargin();
 			btVector3			mins;
 			btVector3			maxs;
 			ATTRIBUTE_ALIGNED16(btDbvtVolume)		volume;
-			pco->getCollisionShape()->getAabb(	pco->getWorldTransform(),
+			pcoWrap->getCollisionShape()->getAabb(	pcoWrap->getWorldTransform(),
 			docollide.psb		=	this;
-			docollide.m_colObj1 = pco;
+			docollide.m_colObj1Wrap = pcoWrap;
 			docollide.m_rigidBody = prb1;
 			docollide.dynmargin	=	basemargin+timemargin;
@@ -3065,7 +3184,7 @@ void			btSoftBody::defaultCollisionHandler(btCollisionObject* pco)
 	case	fCollision::CL_RS:
 			btSoftColliders::CollideCL_RS	collider;
-			collider.Process(this,pco);
+			collider.ProcessColObj(this,pcoWrap);
@@ -3084,7 +3203,7 @@ void			btSoftBody::defaultCollisionHandler(btSoftBody* psb)
 			if (this!=psb || psb->m_cfg.collisions&fCollision::CL_SELF)
 				btSoftColliders::CollideCL_SS	docollide;
-				docollide.Process(this,psb);
+				docollide.ProcessSoftSoft(this,psb);
@@ -3486,8 +3605,8 @@ const char*	btSoftBody::serialize(void* dataBuffer, class btSerializer* serializ
 			memPtr->m_cfm = m_joints[i]->m_cfm;
-			memPtr->m_erp = m_joints[i]->m_erp;
-			memPtr->m_split = m_joints[i]->m_split;
+			memPtr->m_erp = float(m_joints[i]->m_erp);
+			memPtr->m_split = float(m_joints[i]->m_split);
 			memPtr->m_delete = m_joints[i]->m_delete;
 			for (int j=0;j<4;j++)
diff --git a/src/bullet/BulletSoftBody/btSoftBody.h b/src/bullet/BulletSoftBody/btSoftBody.h
index ba589486..bd5846bf 100644
--- a/src/bullet/BulletSoftBody/btSoftBody.h
+++ b/src/bullet/BulletSoftBody/btSoftBody.h
@@ -45,6 +45,7 @@ struct	btSoftBodyWorldInfo
 	btScalar				air_density;
 	btScalar				water_density;
 	btScalar				water_offset;
+	btScalar				m_maxDisplacement;
 	btVector3				water_normal;
 	btBroadphaseInterface*	m_broadphase;
 	btDispatcher*	m_dispatcher;
@@ -55,6 +56,7 @@ struct	btSoftBodyWorldInfo
+		m_maxDisplacement(1000.f),//avoid soft body from 'exploding' so use some upper threshold of maximum motion that a node can travel per frame
@@ -69,7 +71,7 @@ struct	btSoftBodyWorldInfo
 class	btSoftBody : public btCollisionObject
-	btAlignedObjectArray<class btCollisionObject*> m_collisionDisabledObjects;
+	btAlignedObjectArray<const class btCollisionObject*> m_collisionDisabledObjects;
 	// The solver object that handles this soft body
 	btSoftBodySolver *m_softBodySolver;
@@ -169,6 +171,7 @@ public:
 	/* ImplicitFn	*/ 
 	struct	ImplicitFn
+		virtual ~ImplicitFn() {}
 		virtual btScalar	Eval(const btVector3& x)=0;
@@ -182,7 +185,7 @@ public:
 	/* sCti is Softbody contact info	*/ 
 	struct	sCti
-		btCollisionObject*	m_colObj;		/* Rigid body			*/ 
+		const btCollisionObject*	m_colObj;		/* Rigid body			*/ 
 		btVector3		m_normal;	/* Outward normal		*/ 
 		btScalar		m_offset;	/* Offset from origin	*/ 
@@ -374,13 +377,13 @@ public:
 		Cluster*			m_soft;
 		btRigidBody*		m_rigid;
-		btCollisionObject*	m_collisionObject;
+		const btCollisionObject*	m_collisionObject;
 		Body() : m_soft(0),m_rigid(0),m_collisionObject(0)				{}
 		Body(Cluster* p) : m_soft(p),m_rigid(0),m_collisionObject(0)	{}
-		Body(btCollisionObject* colObj) : m_soft(0),m_collisionObject(colObj)
+		Body(const btCollisionObject* colObj) : m_soft(0),m_collisionObject(colObj)
-			m_rigid = btRigidBody::upcast(m_collisionObject);
+			m_rigid = (btRigidBody*)btRigidBody::upcast(m_collisionObject);
 		void						activate() const
@@ -526,6 +529,7 @@ public:
 		struct IControl
+			virtual ~IControl() {}
 			virtual void			Prepare(AJoint*)				{}
 			virtual btScalar		Speed(AJoint*,btScalar current) { return(current); }
 			static IControl*		Default()						{ static IControl def;return(&def); }
@@ -671,6 +675,9 @@ public:
 	btTransform			m_initialWorldTransform;
 	btVector3			m_windVelocity;
+	btScalar        m_restLengthScale;
 	// Api
@@ -810,9 +817,15 @@ public:
 	void				rotate(	const btQuaternion& rot);
 	/* Scale																*/ 
 	void				scale(	const btVector3& scl);
+	/* Get link resting lengths scale										*/
+	btScalar			getRestLengthScale();
+	/* Scale resting length of all springs									*/
+	void				setRestLengthScale(btScalar restLength);
 	/* Set current state as pose											*/ 
 	void				setPose(		bool bvolume,
 		bool bframe);
+	/* Set current link lengths as resting lengths							*/ 
+	void				resetLinkRestLengths();
 	/* Return the volume													*/ 
 	btScalar			getVolume() const;
 	/* Cluster count														*/ 
@@ -867,7 +880,7 @@ public:
 	/* integrateMotion														*/ 
 	void				integrateMotion();
 	/* defaultCollisionHandlers												*/ 
-	void				defaultCollisionHandler(btCollisionObject* pco);
+	void				defaultCollisionHandler(const btCollisionObjectWrapper* pcoWrap);
 	void				defaultCollisionHandler(btSoftBody* psb);
@@ -949,11 +962,13 @@ public:
 		btScalar& mint,eFeature::_& feature,int& index,bool bcountonly) const;
 	void				initializeFaceTree();
 	btVector3			evaluateCom() const;
-	bool				checkContact(btCollisionObject* colObj,const btVector3& x,btScalar margin,btSoftBody::sCti& cti) const;
+	bool				checkContact(const btCollisionObjectWrapper* colObjWrap,const btVector3& x,btScalar margin,btSoftBody::sCti& cti) const;
 	void				updateNormals();
 	void				updateBounds();
 	void				updatePose();
 	void				updateConstants();
+	void				updateLinkConstants();
+	void				updateArea(bool averageArea = true);
 	void				initializeClusters();
 	void				updateClusters();
 	void				cleanupClusters();
diff --git a/src/bullet/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.cpp b/src/bullet/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.cpp
index d99be3b8..9f0d4452 100644
--- a/src/bullet/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.cpp
+++ b/src/bullet/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.cpp
@@ -25,7 +25,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "BulletCollision/CollisionShapes/btTetrahedronShape.h"
 #include "BulletCollision/CollisionShapes/btConvexHullShape.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 #include "LinearMath/btIDebugDraw.h"
@@ -34,10 +34,10 @@ subject to the following restrictions:
 #define BT_SOFTBODY_TRIANGLE_EXTRUSION btScalar(0.06)//make this configurable
-btSoftBodyConcaveCollisionAlgorithm::btSoftBodyConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1,bool isSwapped)
+btSoftBodyConcaveCollisionAlgorithm::btSoftBodyConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped)
 : btCollisionAlgorithm(ci),
@@ -49,12 +49,12 @@ btSoftBodyConcaveCollisionAlgorithm::~btSoftBodyConcaveCollisionAlgorithm()
-btSoftBodyTriangleCallback::btSoftBodyTriangleCallback(btDispatcher*  dispatcher,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped):
+btSoftBodyTriangleCallback::btSoftBodyTriangleCallback(btDispatcher*  dispatcher,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped):
-	m_softBody = (btSoftBody*) (isSwapped? body1:body0);
-	m_triBody = isSwapped? body0:body1;
+	m_softBody = (isSwapped? (btSoftBody*)body1Wrap->getCollisionObject():(btSoftBody*)body0Wrap->getCollisionObject());
+	m_triBody = isSwapped? body0Wrap->getCollisionObject():body1Wrap->getCollisionObject();
 	// create the manifold from the dispatcher 'manifold pool'
@@ -90,7 +90,7 @@ void btSoftBodyTriangleCallback::processTriangle(btVector3* triangle,int partId,
 	//just for debugging purposes
 	//printf("triangle %d",m_triangleCount++);
-	btCollisionObject* ob = static_cast<btCollisionObject*>(m_triBody);
 	btCollisionAlgorithmConstructionInfo ci;
 	ci.m_dispatcher1 = m_dispatcher;
@@ -98,7 +98,7 @@ void btSoftBodyTriangleCallback::processTriangle(btVector3* triangle,int partId,
 	if (m_dispatchInfoPtr && m_dispatchInfoPtr->m_debugDraw && (m_dispatchInfoPtr->m_debugDraw->getDebugMode() &btIDebugDraw::DBG_DrawWireframe))
 		btVector3 color(1,1,0);
-		btTransform& tr = ob->getWorldTransform();
+		const btTransform& tr = m_triBody->getWorldTransform();
@@ -115,18 +115,18 @@ void btSoftBodyTriangleCallback::processTriangle(btVector3* triangle,int partId,
 		//copy over user pointers to temporary shape
-		tm->setUserPointer(ob->getRootCollisionShape()->getUserPointer());
-		btCollisionShape* tmpShape = ob->getCollisionShape();
-		ob->internalSetTemporaryCollisionShape( tm );
+		tm->setUserPointer(m_triBody->getCollisionShape()->getUserPointer());
+		btCollisionObjectWrapper softBody(0,m_softBody->getCollisionShape(),m_softBody,m_softBody->getWorldTransform(),-1,-1);
+		//btCollisionObjectWrapper triBody(0,tm, ob, btTransform::getIdentity());//ob->getWorldTransform());//??
+		btCollisionObjectWrapper triBody(0,tm, m_triBody, m_triBody->getWorldTransform(),partId, triangleIndex);
-		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(m_softBody,m_triBody,0);//m_manifoldPtr);
+		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(&softBody,&triBody,0);//m_manifoldPtr);
-		colAlgo->processCollision(m_softBody,m_triBody,*m_dispatchInfoPtr,m_resultOut);
+		colAlgo->processCollision(&softBody,&triBody,*m_dispatchInfoPtr,m_resultOut);
-		ob->internalSetTemporaryCollisionShape( tmpShape);
@@ -158,24 +158,18 @@ void btSoftBodyTriangleCallback::processTriangle(btVector3* triangle,int partId,
 		//	tm.setMargin(m_collisionMarginTriangle);
 		//copy over user pointers to temporary shape
-		tm->setUserPointer(ob->getRootCollisionShape()->getUserPointer());
+		tm->setUserPointer(m_triBody->getCollisionShape()->getUserPointer());
-		btCollisionShape* tmpShape = ob->getCollisionShape();
-		ob->internalSetTemporaryCollisionShape( tm );
+		btCollisionObjectWrapper softBody(0,m_softBody->getCollisionShape(),m_softBody,m_softBody->getWorldTransform(),-1,-1);
+		btCollisionObjectWrapper triBody(0,tm, m_triBody, m_triBody->getWorldTransform(),partId, triangleIndex);//btTransform::getIdentity());//??
+		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(&softBody,&triBody,0);//m_manifoldPtr);
-		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(m_softBody,m_triBody,0);//m_manifoldPtr);
-		///this should use the btDispatcher, so the actual registered algorithm is used
-		//		btConvexConvexAlgorithm cvxcvxalgo(m_manifoldPtr,ci,m_convexBody,m_triBody);
-		//m_resultOut->setShapeIdentifiersB(partId,triangleIndex);
-		//		cvxcvxalgo.processCollision(m_convexBody,m_triBody,*m_dispatchInfoPtr,m_resultOut);
-		colAlgo->processCollision(m_softBody,m_triBody,*m_dispatchInfoPtr,m_resultOut);
+		colAlgo->processCollision(&softBody,&triBody,*m_dispatchInfoPtr,m_resultOut);
-		ob->internalSetTemporaryCollisionShape( tmpShape );
 		triIndex.m_childShape = tm;
@@ -187,7 +181,7 @@ void btSoftBodyTriangleCallback::processTriangle(btVector3* triangle,int partId,
-void	btSoftBodyTriangleCallback::setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void	btSoftBodyTriangleCallback::setTimeStepAndCounters(btScalar collisionMarginTriangle,const btCollisionObjectWrapper* triBodyWrap, const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 	m_dispatchInfoPtr = &dispatchInfo;
 	m_collisionMarginTriangle = collisionMarginTriangle+btScalar(BT_SOFTBODY_TRIANGLE_EXTRUSION);
@@ -204,7 +198,7 @@ void	btSoftBodyTriangleCallback::setTimeStepAndCounters(btScalar collisionMargin
 	btTransform convexInTriangleSpace;
-	convexInTriangleSpace = m_triBody->getWorldTransform().inverse() * softTransform;
+	convexInTriangleSpace = triBodyWrap->getWorldTransform().inverse() * softTransform;
@@ -214,33 +208,28 @@ void btSoftBodyConcaveCollisionAlgorithm::clearCache()
-void btSoftBodyConcaveCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btSoftBodyConcaveCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 	//btCollisionObject* convexBody = m_isSwapped ? body1 : body0;
-	btCollisionObject* triBody = m_isSwapped ? body0 : body1;
+	const btCollisionObjectWrapper* triBody = m_isSwapped ? body0Wrap : body1Wrap;
 	if (triBody->getCollisionShape()->isConcave())
-		btCollisionObject*	triOb = triBody;
-		btConcaveShape* concaveShape = static_cast<btConcaveShape*>( triOb->getCollisionShape());
+		const btCollisionObject*	triOb = triBody->getCollisionObject();
+		const btConcaveShape* concaveShape = static_cast<const btConcaveShape*>( triOb->getCollisionShape());
 		//	if (convexBody->getCollisionShape()->isConvex())
 			btScalar collisionMarginTriangle = concaveShape->getMargin();
 			//			resultOut->setPersistentManifold(m_btSoftBodyTriangleCallback.m_manifoldPtr);
-			m_btSoftBodyTriangleCallback.setTimeStepAndCounters(collisionMarginTriangle,dispatchInfo,resultOut);
-			//Disable persistency. previously, some older algorithm calculated all contacts in one go, so you can clear it here.
-			//m_dispatcher->clearManifold(m_btSoftBodyTriangleCallback.m_manifoldPtr);
-			//			m_btSoftBodyTriangleCallback.m_manifoldPtr->setBodies(convexBody,triBody);
+			m_btSoftBodyTriangleCallback.setTimeStepAndCounters(collisionMarginTriangle,triBody,dispatchInfo,resultOut);
 			concaveShape->processAllTriangles( &m_btSoftBodyTriangleCallback,m_btSoftBodyTriangleCallback.getAabbMin(),m_btSoftBodyTriangleCallback.getAabbMax());
 			//	resultOut->refreshContactPoints();
diff --git a/src/bullet/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h b/src/bullet/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h
index 11ec5b37..11c7b88f 100644
--- a/src/bullet/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h
+++ b/src/bullet/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h
@@ -45,7 +45,9 @@ struct btTriIndex
 	int	getTriangleIndex() const
 		// Get only the lower bits where the triangle index is stored
-		return (m_PartIdTriangleIndex&~((~0)<<(31-MAX_NUM_PARTS_IN_BITS)));
+		unsigned int x = 0;
+		unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
+		return (m_PartIdTriangleIndex&~(y));
 	int	getPartId() const
@@ -63,7 +65,7 @@ struct btTriIndex
 class btSoftBodyTriangleCallback : public btTriangleCallback
 	btSoftBody* m_softBody;
-	btCollisionObject* m_triBody;
+	const btCollisionObject* m_triBody;
 	btVector3	m_aabbMin;
 	btVector3	m_aabbMax ;
@@ -81,9 +83,9 @@ public:
 	//	btPersistentManifold*	m_manifoldPtr;
-	btSoftBodyTriangleCallback(btDispatcher* dispatcher,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped);
+	btSoftBodyTriangleCallback(btDispatcher* dispatcher,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
-	void	setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	void	setTimeStepAndCounters(btScalar collisionMarginTriangle,const btCollisionObjectWrapper* triObjWrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual ~btSoftBodyTriangleCallback();
@@ -115,11 +117,11 @@ class btSoftBodyConcaveCollisionAlgorithm  : public btCollisionAlgorithm
-	btSoftBodyConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped);
+	btSoftBodyConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
 	virtual ~btSoftBodyConcaveCollisionAlgorithm();
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	btScalar	calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -132,19 +134,19 @@ public:
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSoftBodyConcaveCollisionAlgorithm));
-			return new(mem) btSoftBodyConcaveCollisionAlgorithm(ci,body0,body1,false);
+			return new(mem) btSoftBodyConcaveCollisionAlgorithm(ci,body0Wrap,body1Wrap,false);
 	struct SwappedCreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSoftBodyConcaveCollisionAlgorithm));
-			return new(mem) btSoftBodyConcaveCollisionAlgorithm(ci,body0,body1,true);
+			return new(mem) btSoftBodyConcaveCollisionAlgorithm(ci,body0Wrap,body1Wrap,true);
diff --git a/src/bullet/BulletSoftBody/btSoftBodyData.h b/src/bullet/BulletSoftBody/btSoftBodyData.h
index 40dc65c3..87d8841c 100644
--- a/src/bullet/BulletSoftBody/btSoftBodyData.h
+++ b/src/bullet/BulletSoftBody/btSoftBodyData.h
@@ -17,7 +17,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
 struct	SoftBodyMaterialData
diff --git a/src/bullet/BulletSoftBody/btSoftBodyHelpers.cpp b/src/bullet/BulletSoftBody/btSoftBodyHelpers.cpp
index eac0ba06..293a393e 100644
--- a/src/bullet/BulletSoftBody/btSoftBodyHelpers.cpp
+++ b/src/bullet/BulletSoftBody/btSoftBodyHelpers.cpp
@@ -480,6 +480,168 @@ void			btSoftBodyHelpers::DrawClusterTree(	btSoftBody* psb,
+//The btSoftBody object from the BulletSDK includes an array of Nodes and Links. These links appear
+// to be first set up to connect a node to between 5 and 6 of its neighbors [480 links], 
+//and then to the rest of the nodes after the execution of the Floyd-Warshall graph algorithm 
+//[another 930 links]. 
+//The way the links are stored by default, we have a number of cases where adjacent links share a node in common
+// - this leads to the creation of a data dependency through memory. 
+//The PSolve_Links() function reads and writes nodes as it iterates over each link. 
+//So, we now have the possibility of a data dependency between iteration X 
+//that processes link L with iteration X+1 that processes link L+1 
+//because L and L+1 have one node in common, and iteration X updates the positions of that node, 
+//and iteration X+1 reads in the position of that shared node.
+//Such a memory dependency limits the ability of a modern CPU to speculate beyond 
+//a certain point because it has to respect a possible dependency 
+//- this prevents the CPU from making full use of its out-of-order resources. 
+//If we re-order the links such that we minimize the cases where a link L and L+1 share a common node, 
+//we create a temporal gap between when the node position is written, 
+//and when it is subsequently read. This in turn allows the CPU to continue execution without 
+//risking a dependency violation. Such a reordering would result in significant speedups on 
+//modern CPUs with lots of execution resources. 
+//In our testing, we see it have a tremendous impact not only on the A7, 
+//but also on all x86 cores that ship with modern Macs. 
+//The function below (ReoptimizeLinkOrder) performs this reordering. It can be called on a 
+//btSoftBody object in the solveConstraints() function before the actual solver is invoked, 
+//or right after generateBendingConstraints() once all links exist (1410 in the case described above).
+// This function takes in a list of interdependent Links and tries 
+// to maximize the distance between calculation
+// of dependent links.  This increases the amount of parallelism that can
+// be exploited by out-of-order instruction processors with large but
+// (inevitably) finite instruction windows.
+// A small structure to track lists of dependent link calculations
+class LinkDeps_t {
+	public:
+	int value;			// A link calculation that is dependent on this one
+		// Values >= 0 are a link index i depending via "input A"; negative values encode -(i+1) for "input B"
+	LinkDeps_t *next;	// Next dependence in the list
+typedef LinkDeps_t *LinkDepsPtr_t;
+// Dependency list constants
+#define REOP_NODE_COMPLETE	-2	// Must be less than REOP_NOT_DEPENDENT
+void btSoftBodyHelpers::ReoptimizeLinkOrder(btSoftBody *psb /* soft body whose m_links array is rewritten in place */)	// Re-emits psb->m_links in queue order so that a link is written back only after the prior links it depends on
+	int i, nLinks=psb->m_links.size(), nNodes=psb->m_nodes.size();
+	btSoftBody::Link *lr;
+	int ar, br;
+	btSoftBody::Node *node0 = &(psb->m_nodes[0]);
+	btSoftBody::Node *node1 = &(psb->m_nodes[1]);	// NOTE(review): element 1 is indexed unconditionally -- confirm callers never pass a body with fewer than two nodes
+	LinkDepsPtr_t linkDep;
+	int readyListHead, readyListTail, linkNum, linkDepFrees, depLink;
+	// Allocate temporary buffers (all released with delete [] at the end of the function)
+	int *nodeWrittenAt = new int[nNodes+1];	// Index of the last link (in original order) that touched this node, or REOP_NOT_DEPENDENT
+	int *linkDepA = new int[nLinks];			// Prior link that link i's m_n[0] depends on, or REOP_NOT_DEPENDENT / REOP_NODE_COMPLETE
+	int *linkDepB = new int[nLinks];			// Same as linkDepA, but for link i's m_n[1]
+	int *readyList = new int[nLinks];		// FIFO queue of ready-to-process link calculations (# of links, maximum)
+	LinkDeps_t *linkDepFreeList = new LinkDeps_t[2*nLinks];		// Pool of dependent-on-me list elements (2x# of links, maximum: one per link input)
+	LinkDepsPtr_t *linkDepListStarts = new LinkDepsPtr_t[nLinks];	// Head of the dependent-on-me list for each link
+	// Copy the original, unsorted links to a side buffer
+	btSoftBody::Link *linkBuffer = new btSoftBody::Link[nLinks];
+	memcpy(linkBuffer, &(psb->m_links[0]), sizeof(btSoftBody::Link)*nLinks);	// NOTE(review): m_links[0] is indexed even when nLinks is 0 -- confirm callers guarantee at least one link
+	// Clear out the node setup and ready list
+	for (i=0; i < nNodes+1; i++) {
+		nodeWrittenAt[i] = REOP_NOT_DEPENDENT;
+	}
+	for (i=0; i < nLinks; i++) {
+		linkDepListStarts[i] = NULL;
+	}
+	readyListHead = readyListTail = linkDepFrees = 0;
+	// Initial link analysis to set up data structures
+	for (i=0; i < nLinks; i++) {
+		// Note which prior link calculations we are dependent upon & build up dependence lists
+		lr = &(psb->m_links[i]);
+		ar = (lr->m_n[0] - node0)/(node1 - node0);	// Offset of the link's first node from node0 (node1 - node0 is one element); presumably its index in psb->m_nodes
+		br = (lr->m_n[1] - node0)/(node1 - node0);	// Same for the link's second node
+		if (nodeWrittenAt[ar] > REOP_NOT_DEPENDENT) {
+			linkDepA[i] = nodeWrittenAt[ar];
+			linkDep = &linkDepFreeList[linkDepFrees++];
+			linkDep->value = i;	// Non-negative value marks a dependence through "input A"
+			linkDep->next = linkDepListStarts[nodeWrittenAt[ar]];
+			linkDepListStarts[nodeWrittenAt[ar]] = linkDep;
+		} else {
+			linkDepA[i] = REOP_NOT_DEPENDENT;
+		}
+		if (nodeWrittenAt[br] > REOP_NOT_DEPENDENT) {
+			linkDepB[i] = nodeWrittenAt[br];
+			linkDep = &linkDepFreeList[linkDepFrees++];
+			linkDep->value = -(i+1);	// Negative value marks a dependence through "input B"; decoded below as -value - 1
+			linkDep->next = linkDepListStarts[nodeWrittenAt[br]];
+			linkDepListStarts[nodeWrittenAt[br]] = linkDep;
+		} else {
+			linkDepB[i] = REOP_NOT_DEPENDENT;
+		}
+		// Add this link to the initial ready list, if it is not dependent on any other links
+		if ((linkDepA[i] == REOP_NOT_DEPENDENT) && (linkDepB[i] == REOP_NOT_DEPENDENT)) {
+			readyList[readyListTail++] = i;
+			linkDepA[i] = linkDepB[i] = REOP_NODE_COMPLETE;	// Marks the link as already queued; probably not needed now
+		}
+		// Update the nodes to mark which ones are calculated by this link
+		nodeWrittenAt[ar] = nodeWrittenAt[br] = i;
+	}
+	// Process the ready list and create the sorted list of links
+	// -- By treating the ready list as a queue, we maximize the distance between any
+	//    inter-dependent node calculations
+	// -- All other (non-related) nodes in the ready list will automatically be inserted
+	//    in between each set of inter-dependent link calculations by this loop
+	i = 0;	// From here on i is the write position in psb->m_links
+	while (readyListHead != readyListTail) {
+		// Use ready list to select the next link to process
+		linkNum = readyList[readyListHead++];
+		// Copy the next-to-calculate link back into the original link array
+		psb->m_links[i++] = linkBuffer[linkNum];
+		// Free up any link inputs that are dependent on this one
+		linkDep = linkDepListStarts[linkNum];
+		while (linkDep) {
+			depLink = linkDep->value;
+			if (depLink >= 0) {
+				linkDepA[depLink] = REOP_NOT_DEPENDENT;
+			} else {
+				depLink = -depLink - 1;
+				linkDepB[depLink] = REOP_NOT_DEPENDENT;
+			}
+			// Add this dependent link calculation to the ready list if *both* inputs are clear
+			if ((linkDepA[depLink] == REOP_NOT_DEPENDENT) && (linkDepB[depLink] == REOP_NOT_DEPENDENT)) {
+				readyList[readyListTail++] = depLink;
+				linkDepA[depLink] = linkDepB[depLink] = REOP_NODE_COMPLETE;	// Marks the link as already queued; probably not needed now
+			}
+			linkDep = linkDep->next;
+		}
+	}
+	// Delete the temporary buffers
+	delete [] nodeWrittenAt;
+	delete [] linkDepA;
+	delete [] linkDepB;
+	delete [] readyList;
+	delete [] linkDepFreeList;
+	delete [] linkDepListStarts;
+	delete [] linkBuffer;
 void			btSoftBodyHelpers::DrawFrame(		btSoftBody* psb,
 											 btIDebugDraw* idraw)
@@ -911,11 +1073,9 @@ btSoftBody*		btSoftBodyHelpers::CreateFromConvexHull(btSoftBodyWorldInfo& worldI
 	for(int i=0;i<(int)hres.mNumFaces;++i)
-		const unsigned int idx[]={	hres.m_Indices[i*3+0],
-			hres.m_Indices[i*3+1],
-			hres.m_Indices[i*3+2]};
+		const int idx[]={	static_cast<int>(hres.m_Indices[i*3+0]),
+							static_cast<int>(hres.m_Indices[i*3+1]),
+							static_cast<int>(hres.m_Indices[i*3+2])};
 		if(idx[0]<idx[1]) psb->appendLink(	idx[0],idx[1]);
 		if(idx[1]<idx[2]) psb->appendLink(	idx[1],idx[2]);
 		if(idx[2]<idx[0]) psb->appendLink(	idx[2],idx[0]);
diff --git a/src/bullet/BulletSoftBody/btSoftBodyHelpers.h b/src/bullet/BulletSoftBody/btSoftBodyHelpers.h
index 620a52fe..72715301 100644
--- a/src/bullet/BulletSoftBody/btSoftBodyHelpers.h
+++ b/src/bullet/BulletSoftBody/btSoftBodyHelpers.h
@@ -137,7 +137,12 @@ struct	btSoftBodyHelpers
 													bool bfacelinks,
 													bool btetralinks,
 													bool bfacesfromtetras);
+	/// Sort the list of links to move link calculations that are dependent upon earlier
+	/// ones as far as possible away from the calculation of those values
+	/// This tends to make adjacent loop iterations not dependent upon one another,
+	/// so out-of-order processors can execute instructions from multiple iterations at once
+	static void ReoptimizeLinkOrder(btSoftBody *psb );
diff --git a/src/bullet/BulletSoftBody/btSoftBodyInternals.h b/src/bullet/BulletSoftBody/btSoftBodyInternals.h
index 5ef8db19..759509a1 100644
--- a/src/bullet/BulletSoftBody/btSoftBodyInternals.h
+++ b/src/bullet/BulletSoftBody/btSoftBodyInternals.h
@@ -21,6 +21,7 @@ subject to the following restrictions:
 #include "LinearMath/btQuickprof.h"
+#include "LinearMath/btPolarDecomposition.h"
 #include "BulletCollision/BroadphaseCollision/btBroadphaseInterface.h"
 #include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
 #include "BulletCollision/CollisionShapes/btConvexInternalShape.h"
@@ -160,7 +161,7 @@ public:
 	virtual btScalar	getMargin() const
-		return getMargin();
+		return btConvexInternalShape::getMargin();
@@ -614,32 +615,8 @@ private:
 static inline int			PolarDecompose(	const btMatrix3x3& m,btMatrix3x3& q,btMatrix3x3& s)
-	static const btScalar	half=(btScalar)0.5;
-	static const btScalar	accuracy=(btScalar)0.0001;
-	static const int		maxiterations=16;
-	int						i=0;
-	btScalar				det=0;
-	q	=	Mul(m,1/btVector3(m[0][0],m[1][1],m[2][2]).length());
-	det	=	q.determinant();
-	if(!btFuzzyZero(det))
-	{
-		for(;i<maxiterations;++i)
-		{
-			q=Mul(Add(q,Mul(q.adjoint(),1/det).transpose()),half);
-			const btScalar	ndet=q.determinant();
-			if(Sq(ndet-det)>accuracy) det=ndet; else break;
-		}
-		/* Final orthogonalization	*/ 
-		Orthogonalize(q);
-		/* Compute 'S'				*/ 
-		s=q.transpose()*m;
-	}
-	else
-	{
-		q.setIdentity();
-		s.setIdentity();
-	}
-	return(i);
+	static const btPolarDecomposition polar;  
+	return polar.decompose(m, q, s);
@@ -666,7 +643,7 @@ struct btSoftColliders
 			threshold	=(btScalar)0;
 		bool				SolveContact(	const btGjkEpaSolver2::sResults& res,
-			btSoftBody::Body ba,btSoftBody::Body bb,
+			btSoftBody::Body ba,const btSoftBody::Body bb,
 			btSoftBody::CJoint& joint)
@@ -702,7 +679,7 @@ struct btSoftColliders
 				joint.m_normal		=	norm;
 //				printf("normal=%f,%f,%f\n",res.normal.getX(),res.normal.getY(),res.normal.getZ());
 				joint.m_delete		=	false;
-				joint.m_friction	=	fv.length2()<(-rvac*friction)?1:friction;
+				joint.m_friction	=	fv.length2()<(rvac*friction*rvac*friction)?1:friction;
 				joint.m_massmatrix	=	ImpulseMatrix(	ba.invMass(),ba.invWorldInertia(),joint.m_rpos[0],
@@ -717,30 +694,30 @@ struct btSoftColliders
 	struct	CollideCL_RS : ClusterBase
 		btSoftBody*		psb;
-		btCollisionObject*	m_colObj;
+		const btCollisionObjectWrapper*	m_colObjWrap;
 		void		Process(const btDbvtNode* leaf)
 			btSoftBody::Cluster*		cluster=(btSoftBody::Cluster*)leaf->data;
 			btSoftClusterCollisionShape	cshape(cluster);
-			const btConvexShape*		rshape=(const btConvexShape*)m_colObj->getCollisionShape();
+			const btConvexShape*		rshape=(const btConvexShape*)m_colObjWrap->getCollisionShape();
 			///don't collide an anchored cluster with a static/kinematic object
-			if(m_colObj->isStaticOrKinematicObject() && cluster->m_containsAnchor)
+			if(m_colObjWrap->getCollisionObject()->isStaticOrKinematicObject() && cluster->m_containsAnchor)
 			btGjkEpaSolver2::sResults	res;		
 			if(btGjkEpaSolver2::SignedDistance(	&cshape,btTransform::getIdentity(),
-				rshape,m_colObj->getWorldTransform(),
+				rshape,m_colObjWrap->getWorldTransform(),
 				btSoftBody::CJoint	joint;
-				if(SolveContact(res,cluster,m_colObj,joint))//prb,joint))
+				if(SolveContact(res,cluster,m_colObjWrap->getCollisionObject(),joint))//prb,joint))
 					btSoftBody::CJoint*	pj=new(btAlignedAlloc(sizeof(btSoftBody::CJoint),16)) btSoftBody::CJoint();
-					if(m_colObj->isStaticOrKinematicObject())
+					if(m_colObjWrap->getCollisionObject()->isStaticOrKinematicObject())
 						pj->m_erp	*=	psb->m_cfg.kSKHR_CL;
 						pj->m_split	*=	psb->m_cfg.kSK_SPLT_CL;
@@ -753,19 +730,19 @@ struct btSoftColliders
-		void		Process(btSoftBody* ps,btCollisionObject* colOb)
+		void		ProcessColObj(btSoftBody* ps,const btCollisionObjectWrapper* colObWrap)
 			psb			=	ps;
-			m_colObj			=	colOb;
+			m_colObjWrap			=	colObWrap;
 			idt			=	ps->m_sst.isdt;
-			m_margin		=	m_colObj->getCollisionShape()->getMargin()+psb->getCollisionShape()->getMargin();
+			m_margin		=	m_colObjWrap->getCollisionShape()->getMargin()+psb->getCollisionShape()->getMargin();
 			///Bullet rigid body uses multiply instead of minimum to determine combined friction. Some customization would be useful.
-			friction	=	btMin(psb->m_cfg.kDF,m_colObj->getFriction());
+			friction	=	btMin(psb->m_cfg.kDF,m_colObjWrap->getCollisionObject()->getFriction());
 			btVector3			mins;
 			btVector3			maxs;
 			ATTRIBUTE_ALIGNED16(btDbvtVolume)		volume;
-			colOb->getCollisionShape()->getAabb(colOb->getWorldTransform(),mins,maxs);
+			colObWrap->getCollisionShape()->getAabb(colObWrap->getWorldTransform(),mins,maxs);
@@ -815,7 +792,7 @@ struct btSoftColliders
-		void		Process(btSoftBody* psa,btSoftBody* psb)
+		void		ProcessSoftSoft(btSoftBody* psa,btSoftBody* psb)
 			idt			=	psa->m_sst.isdt;
 			//m_margin		=	(psa->getCollisionShape()->getMargin()+psb->getCollisionShape()->getMargin())/2;
@@ -840,15 +817,16 @@ struct btSoftColliders
 			const btScalar			m=n.m_im>0?dynmargin:stamargin;
 			btSoftBody::RContact	c;
 			if(	(!n.m_battach)&&
-				psb->checkContact(m_colObj1,n.m_x,m,c.m_cti))
+				psb->checkContact(m_colObj1Wrap,n.m_x,m,c.m_cti))
 				const btScalar	ima=n.m_im;
 				const btScalar	imb= m_rigidBody? m_rigidBody->getInvMass() : 0.f;
 				const btScalar	ms=ima+imb;
-					const btTransform&	wtr=m_rigidBody?m_rigidBody->getWorldTransform() : m_colObj1->getWorldTransform();
+					const btTransform&	wtr=m_rigidBody?m_rigidBody->getWorldTransform() : m_colObj1Wrap->getCollisionObject()->getWorldTransform();
 					static const btMatrix3x3	iwiStatic(0,0,0,0,0,0,0,0,0);
 					const btMatrix3x3&	iwi=m_rigidBody?m_rigidBody->getInvInertiaTensorWorld() : iwiStatic;
 					const btVector3		ra=n.m_x-wtr.getOrigin();
@@ -857,13 +835,13 @@ struct btSoftColliders
 					const btVector3		vr=vb-va;
 					const btScalar		dn=btDot(vr,c.m_cti.m_normal);
 					const btVector3		fv=vr-c.m_cti.m_normal*dn;
-					const btScalar		fc=psb->m_cfg.kDF*m_colObj1->getFriction();
+					const btScalar		fc=psb->m_cfg.kDF*m_colObj1Wrap->getCollisionObject()->getFriction();
 					c.m_node	=	&n;
 					c.m_c0		=	ImpulseMatrix(psb->m_sst.sdt,ima,imb,iwi,ra);
 					c.m_c1		=	ra;
 					c.m_c2		=	ima*psb->m_sst.sdt;
-					c.m_c3		=	fv.length2()<(btFabs(dn)*fc)?0:1-fc;
-					c.m_c4		=	m_colObj1->isStaticOrKinematicObject()?psb->m_cfg.kKHR:psb->m_cfg.kCHR;
+			        c.m_c3		=	fv.length2()<(dn*fc*dn*fc)?0:1-fc;
+					c.m_c4		=	m_colObj1Wrap->getCollisionObject()->isStaticOrKinematicObject()?psb->m_cfg.kKHR:psb->m_cfg.kCHR;
 					if (m_rigidBody)
@@ -871,7 +849,7 @@ struct btSoftColliders
 		btSoftBody*		psb;
-		btCollisionObject*	m_colObj1;
+		const btCollisionObjectWrapper*	m_colObj1Wrap;
 		btRigidBody*	m_rigidBody;
 		btScalar		dynmargin;
 		btScalar		stamargin;
diff --git a/src/bullet/BulletSoftBody/btSoftBodySolvers.h b/src/bullet/BulletSoftBody/btSoftBodySolvers.h
index 2fcd8b67..6947bc27 100644
--- a/src/bullet/BulletSoftBody/btSoftBodySolvers.h
+++ b/src/bullet/BulletSoftBody/btSoftBodySolvers.h
@@ -85,7 +85,7 @@ public:
 	virtual void updateSoftBodies() = 0;
 	/** Process a collision between one of the world's soft bodies and another collision object */
-	virtual void processCollision( btSoftBody *, btCollisionObject* ) = 0;
+	virtual void processCollision( btSoftBody *, const struct btCollisionObjectWrapper* ) = 0;
 	/** Process a collision between two soft bodies */
 	virtual void processCollision( btSoftBody*, btSoftBody* ) = 0;
diff --git a/src/bullet/BulletSoftBody/btSoftRigidCollisionAlgorithm.cpp b/src/bullet/BulletSoftBody/btSoftRigidCollisionAlgorithm.cpp
index bc374c80..01c148a2 100644
--- a/src/bullet/BulletSoftBody/btSoftRigidCollisionAlgorithm.cpp
+++ b/src/bullet/BulletSoftBody/btSoftRigidCollisionAlgorithm.cpp
@@ -20,13 +20,14 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "btSoftBody.h"
 #include "BulletSoftBody/btSoftBodySolvers.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 ///TODO: include all the shapes that the softbody can collide with
 ///alternatively, implement special case collision algorithms (just like for rigid collision shapes)
 //#include <stdio.h>
-btSoftRigidCollisionAlgorithm::btSoftRigidCollisionAlgorithm(btPersistentManifold* /*mf*/,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* /*col0*/,btCollisionObject* /*col1*/, bool isSwapped)
+btSoftRigidCollisionAlgorithm::btSoftRigidCollisionAlgorithm(btPersistentManifold* /*mf*/,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* ,const btCollisionObjectWrapper* , bool isSwapped)
 : btCollisionAlgorithm(ci),
@@ -52,18 +53,19 @@ btSoftRigidCollisionAlgorithm::~btSoftRigidCollisionAlgorithm()
 #include <stdio.h>
-void btSoftRigidCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btSoftRigidCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
-	btSoftBody* softBody =  m_isSwapped? (btSoftBody*)body1 : (btSoftBody*)body0;
-	btCollisionObject* rigidCollisionObject = m_isSwapped? body0 : body1;
+//	const btCollisionObjectWrapper* softWrap = m_isSwapped?body1Wrap:body0Wrap;
+//	const btCollisionObjectWrapper* rigidWrap = m_isSwapped?body0Wrap:body1Wrap;
+	btSoftBody* softBody =  m_isSwapped? (btSoftBody*)body1Wrap->getCollisionObject() : (btSoftBody*)body0Wrap->getCollisionObject();
+	const btCollisionObjectWrapper* rigidCollisionObjectWrap = m_isSwapped? body0Wrap : body1Wrap;
-	if (softBody->m_collisionDisabledObjects.findLinearSearch(rigidCollisionObject)==softBody->m_collisionDisabledObjects.size())
+	if (softBody->m_collisionDisabledObjects.findLinearSearch(rigidCollisionObjectWrap->getCollisionObject())==softBody->m_collisionDisabledObjects.size())
-		softBody->getSoftBodySolver()->processCollision(softBody, rigidCollisionObject);
+		softBody->getSoftBodySolver()->processCollision(softBody, rigidCollisionObjectWrap);
diff --git a/src/bullet/BulletSoftBody/btSoftRigidCollisionAlgorithm.h b/src/bullet/BulletSoftBody/btSoftRigidCollisionAlgorithm.h
index 7658e3c2..a9b513e3 100644
--- a/src/bullet/BulletSoftBody/btSoftRigidCollisionAlgorithm.h
+++ b/src/bullet/BulletSoftBody/btSoftRigidCollisionAlgorithm.h
@@ -39,11 +39,11 @@ class btSoftRigidCollisionAlgorithm : public btCollisionAlgorithm
-	btSoftRigidCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped);
+	btSoftRigidCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0,const btCollisionObjectWrapper* col1Wrap, bool isSwapped);
 	virtual ~btSoftRigidCollisionAlgorithm();
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -55,15 +55,15 @@ public:
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSoftRigidCollisionAlgorithm));
 			if (!m_swapped)
-				return new(mem) btSoftRigidCollisionAlgorithm(0,ci,body0,body1,false);
+				return new(mem) btSoftRigidCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,false);
 			} else
-				return new(mem) btSoftRigidCollisionAlgorithm(0,ci,body0,body1,true);
+				return new(mem) btSoftRigidCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,true);
diff --git a/src/bullet/BulletSoftBody/btSoftRigidDynamicsWorld.cpp b/src/bullet/BulletSoftBody/btSoftRigidDynamicsWorld.cpp
index 8f4be231..653d5a06 100644
--- a/src/bullet/BulletSoftBody/btSoftRigidDynamicsWorld.cpp
+++ b/src/bullet/BulletSoftBody/btSoftRigidDynamicsWorld.cpp
@@ -76,7 +76,7 @@ void	btSoftRigidDynamicsWorld::predictUnconstraintMotion(btScalar timeStep)
 	btDiscreteDynamicsWorld::predictUnconstraintMotion( timeStep );
-		m_softBodySolver->predictMotion( timeStep );
+		m_softBodySolver->predictMotion( float(timeStep) );
@@ -353,6 +353,8 @@ void	btSoftRigidDynamicsWorld::serialize(btSerializer* serializer)
+	serializeDynamicsWorldInfo( serializer);
diff --git a/src/bullet/BulletSoftBody/btSoftSoftCollisionAlgorithm.cpp b/src/bullet/BulletSoftBody/btSoftSoftCollisionAlgorithm.cpp
index 1b8cfa72..72043e69 100644
--- a/src/bullet/BulletSoftBody/btSoftSoftCollisionAlgorithm.cpp
+++ b/src/bullet/BulletSoftBody/btSoftSoftCollisionAlgorithm.cpp
@@ -19,10 +19,11 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletSoftBody/btSoftBodySolvers.h"
 #include "btSoftBody.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
-btSoftSoftCollisionAlgorithm::btSoftSoftCollisionAlgorithm(btPersistentManifold* /*mf*/,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* /*obj0*/,btCollisionObject* /*obj1*/)
+btSoftSoftCollisionAlgorithm::btSoftSoftCollisionAlgorithm(btPersistentManifold* /*mf*/,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* /*obj0*/,const btCollisionObjectWrapper* /*obj1*/)
 : btCollisionAlgorithm(ci)
@@ -33,10 +34,10 @@ btSoftSoftCollisionAlgorithm::~btSoftSoftCollisionAlgorithm()
-void btSoftSoftCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& /*dispatchInfo*/,btManifoldResult* /*resultOut*/)
+void btSoftSoftCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& /*dispatchInfo*/,btManifoldResult* /*resultOut*/)
-	btSoftBody* soft0 =	(btSoftBody*)body0;
-	btSoftBody* soft1 =	(btSoftBody*)body1;
+	btSoftBody* soft0 =	(btSoftBody*)body0Wrap->getCollisionObject();
+	btSoftBody* soft1 =	(btSoftBody*)body1Wrap->getCollisionObject();
 	soft0->getSoftBodySolver()->processCollision(soft0, soft1);
diff --git a/src/bullet/BulletSoftBody/btSoftSoftCollisionAlgorithm.h b/src/bullet/BulletSoftBody/btSoftSoftCollisionAlgorithm.h
index 92d683c1..43b1439c 100644
--- a/src/bullet/BulletSoftBody/btSoftSoftCollisionAlgorithm.h
+++ b/src/bullet/BulletSoftBody/btSoftSoftCollisionAlgorithm.h
@@ -38,7 +38,7 @@ public:
 	btSoftSoftCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci)
 		: btCollisionAlgorithm(ci) {}
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
@@ -48,17 +48,17 @@ public:
-	btSoftSoftCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	btSoftSoftCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 	virtual ~btSoftSoftCollisionAlgorithm();
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 			int bbsize = sizeof(btSoftSoftCollisionAlgorithm);
 			void* ptr = ci.m_dispatcher1->allocateCollisionAlgorithm(bbsize);
-			return new(ptr) btSoftSoftCollisionAlgorithm(0,ci,body0,body1);
+			return new(ptr) btSoftSoftCollisionAlgorithm(0,ci,body0Wrap,body1Wrap);
diff --git a/src/bullet/BulletSoftBody/btSparseSDF.h b/src/bullet/BulletSoftBody/btSparseSDF.h
index 90a26cdf..bcf0c798 100644
--- a/src/bullet/BulletSoftBody/btSparseSDF.h
+++ b/src/bullet/BulletSoftBody/btSparseSDF.h
@@ -58,7 +58,7 @@ struct	btSparseSdf
 		int					c[3];
 		int					puid;
 		unsigned			hash;
-		btCollisionShape*	pclient;
+		const btCollisionShape*	pclient;
 		Cell*				next;
@@ -69,6 +69,7 @@ struct	btSparseSdf
 	btScalar						voxelsz;
 	int								puid;
 	int								ncells;
+	int								m_clampCells;
 	int								nprobes;
 	int								nqueries;	
@@ -77,10 +78,13 @@ struct	btSparseSdf
-	void					Initialize(int hashsize=2383)
+	void					Initialize(int hashsize=2383, int clampCells = 256*1024)
+		//avoid a crash due to running out of memory, so clamp the maximum number of cells allocated
+		//if this limit is reached, the SDF is reset (at the cost of some performance during the reset)
+		m_clampCells = clampCells;
-		Reset();		
+		Reset();
 	void					Reset()
@@ -152,7 +156,7 @@ struct	btSparseSdf
 	btScalar				Evaluate(	const btVector3& x,
-		btCollisionShape* shape,
+		const btCollisionShape* shape,
 		btVector3& normal,
 		btScalar margin)
@@ -181,6 +185,15 @@ struct	btSparseSdf
+			int sz = sizeof(Cell);
+			if (ncells>m_clampCells)
+			{
+				static int numResets=0;
+				numResets++;
+//				printf("numResets=%d\n",numResets);
+				Reset();
+			}
 			c=new Cell();
@@ -248,14 +261,14 @@ struct	btSparseSdf
 	static inline btScalar	DistanceToShape(const btVector3& x,
-		btCollisionShape* shape)
+		const btCollisionShape* shape)
 		btTransform	unit;
 			btGjkEpaSolver2::sResults	res;
-			btConvexShape*				csh=static_cast<btConvexShape*>(shape);
+			const btConvexShape*				csh=static_cast<const btConvexShape*>(shape);
@@ -282,7 +295,7 @@ struct	btSparseSdf
-	static inline unsigned int	Hash(int x,int y,int z,btCollisionShape* shape)
+	static inline unsigned int	Hash(int x,int y,int z,const btCollisionShape* shape)
 		struct btS
@@ -292,7 +305,7 @@ struct	btSparseSdf
 		btS myset;
-		myset.x=x;myset.y=y;myset.z=z;myset.p=shape;
+		myset.x=x;myset.y=y;myset.z=z;myset.p=(void*)shape;
 		const void* ptr = &myset;
 		unsigned int result = HsiehHash<sizeof(btS)/4> (ptr);
diff --git a/src/bullet/LinearMath/btAabbUtil2.h b/src/bullet/LinearMath/btAabbUtil2.h
index 42b721de..d2997b4e 100644
--- a/src/bullet/LinearMath/btAabbUtil2.h
+++ b/src/bullet/LinearMath/btAabbUtil2.h
@@ -184,9 +184,7 @@ SIMD_FORCE_INLINE	void btTransformAabb(const btVector3& halfExtents, btScalar ma
 	btVector3 halfExtentsWithMargin = halfExtents+btVector3(margin,margin,margin);
 	btMatrix3x3 abs_b = t.getBasis().absolute();  
 	btVector3 center = t.getOrigin();
-	btVector3 extent = btVector3(abs_b[0].dot(halfExtentsWithMargin),
-		   abs_b[1].dot(halfExtentsWithMargin),
-		  abs_b[2].dot(halfExtentsWithMargin));
+    btVector3 extent = halfExtentsWithMargin.dot3( abs_b[0], abs_b[1], abs_b[2] );
 	aabbMinOut = center - extent;
 	aabbMaxOut = center + extent;
@@ -203,9 +201,7 @@ SIMD_FORCE_INLINE	void btTransformAabb(const btVector3& localAabbMin,const btVec
 		btVector3 localCenter = btScalar(0.5)*(localAabbMax+localAabbMin);
 		btMatrix3x3 abs_b = trans.getBasis().absolute();  
 		btVector3 center = trans(localCenter);
-		btVector3 extent = btVector3(abs_b[0].dot(localHalfExtents),
-			   abs_b[1].dot(localHalfExtents),
-			  abs_b[2].dot(localHalfExtents));
+        btVector3 extent = localHalfExtents.dot3( abs_b[0], abs_b[1], abs_b[2] );
 		aabbMinOut = center-extent;
 		aabbMaxOut = center+extent;
diff --git a/src/bullet/LinearMath/btAlignedAllocator.cpp b/src/bullet/LinearMath/btAlignedAllocator.cpp
index c4c0ceb2..e5f6040c 100644
--- a/src/bullet/LinearMath/btAlignedAllocator.cpp
+++ b/src/bullet/LinearMath/btAlignedAllocator.cpp
@@ -105,30 +105,94 @@ void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc)
+static int allocations_id[10241024];
+static int allocations_bytes[10241024];
+static int mynumallocs = 0;
+#include <stdio.h>
+int btDumpMemoryLeaks()
+	int totalLeak = 0;
+	for (int i=0;i<mynumallocs;i++)
+	{
+		printf("Error: leaked memory of allocation #%d (%d bytes)\n", allocations_id[i], allocations_bytes[i]);
+		totalLeak+=allocations_bytes[i];
+	}
+	if (totalLeak)
+	{
+		printf("Error: memory leaks: %d allocations were not freed and leaked together %d bytes\n",mynumallocs,totalLeak);
+	}
+	return totalLeak;
 //this generic allocator provides the total allocated number of bytes
 #include <stdio.h>
+struct btDebugPtrMagic
+	union
+	{
+		void** vptrptr;
+		void* vptr;
+		int* iptr;
+		char* cptr;
+	};
 void*   btAlignedAllocInternal  (size_t size, int alignment,int line,char* filename)
+	if (size==0)
+	{
+		printf("Whaat? size==0");
+		return 0;
+	}
+	static int allocId = 0;
  void *ret;
  char *real;
+// to find some particular memory leak, you could do something like this:
+//	if (allocId==172)
+//	{
+//		printf("catch me!\n");
+//	}
+//	if (size>1024*1024)
+//	{
+//		printf("big alloc!%d\n", size);
+//	}
  gTotalBytesAlignedAllocs += size;
- real = (char *)sAllocFunc(size + 2*sizeof(void *) + (alignment-1));
+int sz4prt = 4*sizeof(void *);
+ real = (char *)sAllocFunc(size + sz4prt + (alignment-1));
  if (real) {
-   ret = (void*) btAlignPointer((real + 2*sizeof(void *), alignment);
-   *((void **)(ret)-1) = (void *)(real);
-       *((int*)(ret)-2) = size;
+   ret = (void*) btAlignPointer(real + sz4prt, alignment);
+	 btDebugPtrMagic p;
+	 p.vptr = ret;
+	 p.cptr-=sizeof(void*);
+	 *p.vptrptr = (void*)real;
+	 p.cptr-=sizeof(void*);
+	 *p.iptr = size;
+	 p.cptr-=sizeof(void*);
+	 *p.iptr = allocId;
+	 allocations_id[mynumallocs] = allocId;
+	 allocations_bytes[mynumallocs] = size;
+	 mynumallocs++;
  } else {
    ret = (void *)(real);//??
- printf("allocation#%d at address %x, from %s,line %d, size %d\n",gNumAlignedAllocs,real, filename,line,size);
+ printf("allocation %d at address %x, from %s,line %d, size %d (total allocated = %d)\n",allocId,real, filename,line,size,gTotalBytesAlignedAllocs);
+	allocId++;
  int* ptr = (int*)ret;
  *ptr = 12;
  return (ret);
@@ -138,19 +202,43 @@ void    btAlignedFreeInternal   (void* ptr,int line,char* filename)
  void* real;
- gNumAlignedFree++;
  if (ptr) {
-   real = *((void **)(ptr)-1);
-       int size = *((int*)(ptr)-2);
-       gTotalBytesAlignedAllocs -= size;
-	   printf("free #%d at address %x, from %s,line %d, size %d\n",gNumAlignedFree,real, filename,line,size);
+	 gNumAlignedFree++;
+	 btDebugPtrMagic p;
+	 p.vptr = ptr;
+	 p.cptr-=sizeof(void*);
+	 real = *p.vptrptr;
+	 p.cptr-=sizeof(void*);
+	 int size = *p.iptr;
+	 p.cptr-=sizeof(void*);
+	 int allocId = *p.iptr;
+	 bool found = false;
+	 for (int i=0;i<mynumallocs;i++)
+	 {
+		 if ( allocations_id[i] == allocId)
+		 {
+			 allocations_id[i] = allocations_id[mynumallocs-1];
+			 allocations_bytes[i] = allocations_bytes[mynumallocs-1];
+			 mynumallocs--;
+			 found = true;
+			 break;
+		 }
+	 }
+	gTotalBytesAlignedAllocs -= size;
+	 int diff = gNumAlignedAllocs-gNumAlignedFree;
+	printf("free %d at address %x, from %s,line %d, size %d (total remain = %d in %d non-freed allocations)\n",allocId,real, filename,line,size, gTotalBytesAlignedAllocs, diff);
  } else
-	 printf("NULL ptr\n");
+	 //printf("deleting a NULL ptr, no effect\n");
diff --git a/src/bullet/LinearMath/btAlignedAllocator.h b/src/bullet/LinearMath/btAlignedAllocator.h
index f168f3c6..9873b338 100644
--- a/src/bullet/LinearMath/btAlignedAllocator.h
+++ b/src/bullet/LinearMath/btAlignedAllocator.h
@@ -21,9 +21,15 @@ subject to the following restrictions:
 ///that is better portable and more predictable
 #include "btScalar.h"
+///BT_DEBUG_MEMORY_ALLOCATIONS preprocessor can be set in build system
+///for regression tests to detect memory leaks
+int btDumpMemoryLeaks();
 #define btAlignedAlloc(a,b) \
diff --git a/src/bullet/LinearMath/btAlignedObjectArray.h b/src/bullet/LinearMath/btAlignedObjectArray.h
index 47e65926..6193ef7f 100644
--- a/src/bullet/LinearMath/btAlignedObjectArray.h
+++ b/src/bullet/LinearMath/btAlignedObjectArray.h
@@ -39,6 +39,12 @@ subject to the following restrictions:
 #include <new> //for placement new
+// The register keyword is deprecated in C++11 so don't use it.
+#if __cplusplus > 199711L
+#define BT_REGISTER
+#define BT_REGISTER register
 ///The btAlignedObjectArray template class uses a subset of the stl::vector interface for its methods
 ///It is developed to replace stl::vector to avoid portability issues, including STL alignment issues to add SIMD/SSE data
@@ -67,12 +73,10 @@ private:
-		SIMD_FORCE_INLINE	int	allocSize(int _size)
+		SIMD_FORCE_INLINE	int	allocSize(int size)
-			return (_size ? _size*2 : 1);
+			return (size ? size*2 : 1);
 		SIMD_FORCE_INLINE	void	copy(int start,int end, T* dest) const
 			int i;
@@ -101,14 +105,12 @@ protected:
-		SIMD_FORCE_INLINE	void* allocate(int _size)
+		SIMD_FORCE_INLINE	void* allocate(int size)
-			if (_size)
-				return m_allocator.allocate(_size);
+			if (size)
+				return m_allocator.allocate(size);
 			return 0;
 		SIMD_FORCE_INLINE	void	deallocate()
@@ -201,11 +203,21 @@ protected:
 		///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument.
 		///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations.
+		SIMD_FORCE_INLINE	void	resizeNoInitialize(int newsize)
+		{
+			if (newsize > size())
+			{
+				reserve(newsize);
+			}
+			m_size = newsize;
+		}
 		SIMD_FORCE_INLINE	void	resize(int newsize, const T& fillData=T())
-			int curSize = size();
+			const BT_REGISTER int curSize = size();
 			if (newsize < curSize)
@@ -215,7 +227,7 @@ protected:
 			} else
-				if (newsize > size())
+				if (newsize > curSize)
@@ -230,10 +242,9 @@ protected:
 			m_size = newsize;
 		SIMD_FORCE_INLINE	T&  expandNonInitializing( )
-			int sz = size();
+			const BT_REGISTER int sz = size();
 			if( sz == capacity() )
 				reserve( allocSize(size()) );
@@ -246,7 +257,7 @@ protected:
 		SIMD_FORCE_INLINE	T&  expand( const T& fillValue=T())
-			int sz = size();
+			const BT_REGISTER int sz = size();
 			if( sz == capacity() )
 				reserve( allocSize(size()) );
@@ -262,7 +273,7 @@ protected:
 		SIMD_FORCE_INLINE	void push_back(const T& _Val)
-			int sz = size();
+			const BT_REGISTER int sz = size();
 			if( sz == capacity() )
 				reserve( allocSize(size()) );
@@ -477,16 +488,14 @@ protected:
 	//PCK: whole function
-	void initializeFromBuffer(void *buffer, int _size, int _capacity)
+	void initializeFromBuffer(void *buffer, int size, int capacity)
 		m_ownsMemory = false;
 		m_data = (T*)buffer;
-		m_size = _size;
-		m_capacity = _capacity;
+		m_size = size;
+		m_capacity = capacity;
 	void copyFromArray(const btAlignedObjectArray& otherArray)
diff --git a/src/bullet/LinearMath/btConvexHull.cpp b/src/bullet/LinearMath/btConvexHull.cpp
index 35d1aa20..2ae855db 100644
--- a/src/bullet/LinearMath/btConvexHull.cpp
+++ b/src/bullet/LinearMath/btConvexHull.cpp
@@ -22,13 +22,6 @@ subject to the following restrictions:
-template <class T>
-void Swap(T &a,T &b)
-	T tmp = a;
-	a=b;
-	b=tmp;
@@ -275,11 +268,9 @@ int maxdirsterid(const T *p,int count,const T &dir,btAlignedObjectArray<int> &al
 				int mc = ma;
 				for(btScalar xx = x-btScalar(40.0) ; xx <= x ; xx+= btScalar(5.0))
-					btScalar ss = btSin(SIMD_RADS_PER_DEG*(xx));
-					btScalar cc = btCos(SIMD_RADS_PER_DEG*(xx));
-					int md = maxdirfiltered(p,count,dir+(u*ss+v*cc)*btScalar(0.025),allow);
+					btScalar s = btSin(SIMD_RADS_PER_DEG*(xx));
+					btScalar c = btCos(SIMD_RADS_PER_DEG*(xx));
+					int md = maxdirfiltered(p,count,dir+(u*s+v*c)*btScalar(0.025),allow);
 					if(mc==m && md==m)
@@ -311,10 +302,8 @@ int operator ==(const int3 &a,const int3 &b)
-int above(btVector3 const* vertices,const int3& t, const btVector3 &p, btScalar epsilon);
-int above(btVector3 const* vertices,const int3& t, const btVector3 &p, btScalar epsilon) 
+int above(btVector3* vertices,const int3& t, const btVector3 &p, btScalar epsilon);
+int above(btVector3* vertices,const int3& t, const btVector3 &p, btScalar epsilon) 
 	btVector3 n=TriNormal(vertices[t[0]],vertices[t[1]],vertices[t[2]]);
 	return (btDot(n,p-vertices[t[0]]) > epsilon); // EPSILON???
@@ -490,9 +479,7 @@ btHullTriangle* HullLibrary::extrudable(btScalar epsilon)
-int4 HullLibrary::FindSimplex(btVector3 const *verts,int verts_count,btAlignedObjectArray<int> &allow)
+int4 HullLibrary::FindSimplex(btVector3 *verts,int verts_count,btAlignedObjectArray<int> &allow)
 	btVector3 basis[3];
 	basis[0] = btVector3( btScalar(0.01), btScalar(0.02), btScalar(1.0) );      
@@ -524,13 +511,11 @@ int4 HullLibrary::FindSimplex(btVector3 const *verts,int verts_count,btAlignedOb
 		return int4(-1,-1,-1,-1);
-	if(btDot(verts[p3]-verts[p0],btCross(verts[p1]-verts[p0],verts[p2]-verts[p0])) <0) {Swap(p2,p3);}
+	if(btDot(verts[p3]-verts[p0],btCross(verts[p1]-verts[p0],verts[p2]-verts[p0])) <0) {btSwap(p2,p3);}
 	return int4(p0,p1,p2,p3);
-int HullLibrary::calchullgen(btVector3 const *verts,int verts_count, int vlimit)
+int HullLibrary::calchullgen(btVector3 *verts,int verts_count, int vlimit)
 	if(verts_count <4) return 0;
 	if(vlimit==0) vlimit=1000000000;
@@ -578,9 +563,7 @@ int HullLibrary::calchullgen(btVector3 const *verts,int verts_count, int vlimit)
 	while(vlimit >0 && ((te=extrudable(epsilon)) != 0))
 		//int3 ti=*te;
 		int v=te->vmax;
 		btAssert(v != -1);
 		btAssert(!isextreme[v]);  // wtf we've already done this vertex
@@ -632,9 +615,7 @@ int HullLibrary::calchullgen(btVector3 const *verts,int verts_count, int vlimit)
 	return 1;
-int HullLibrary::calchull(btVector3 const *verts,int verts_count, TUIntArray& tris_out, int &tris_count,int vlimit) 
+int HullLibrary::calchull(btVector3 *verts,int verts_count, TUIntArray& tris_out, int &tris_count,int vlimit) 
 	int rc=calchullgen(verts,verts_count,  vlimit) ;
 	if(!rc) return 0;
@@ -670,9 +651,7 @@ bool HullLibrary::ComputeHull(unsigned int vcount,const btVector3 *vertices,PHul
 	int    tris_count;
-	int ret = calchull( vertices, (int) vcount, result.m_Indices, tris_count, static_cast<int>(vlimit) );
+	int ret = calchull( (btVector3 *) vertices, (int) vcount, result.m_Indices, tris_count, static_cast<int>(vlimit) );
 	if(!ret) return false;
 	result.mIndexCount = (unsigned int) (tris_count*3);
 	result.mFaceCount  = (unsigned int) tris_count;
diff --git a/src/bullet/LinearMath/btConvexHull.h b/src/bullet/LinearMath/btConvexHull.h
index 8f8264c1..69c52bc6 100644
--- a/src/bullet/LinearMath/btConvexHull.h
+++ b/src/bullet/LinearMath/btConvexHull.h
@@ -209,13 +209,11 @@ private:
 	btHullTriangle* extrudable(btScalar epsilon);
-	int calchull(btVector3 const *verts,int verts_count, TUIntArray& tris_out, int &tris_count,int vlimit);
+	int calchull(btVector3 *verts,int verts_count, TUIntArray& tris_out, int &tris_count,int vlimit);
-	int calchullgen(btVector3 const *verts,int verts_count, int vlimit);
+	int calchullgen(btVector3 *verts,int verts_count, int vlimit);
-	int4 FindSimplex(btVector3 const *verts,int verts_count,btAlignedObjectArray<int> &allow);
+	int4 FindSimplex(btVector3 *verts,int verts_count,btAlignedObjectArray<int> &allow);
 	class ConvexH* ConvexHCrop(ConvexH& convex,const btPlane& slice);
diff --git a/src/bullet/LinearMath/btConvexHullComputer.cpp b/src/bullet/LinearMath/btConvexHullComputer.cpp
index c03c901c..d58ac955 100644
--- a/src/bullet/LinearMath/btConvexHullComputer.cpp
+++ b/src/bullet/LinearMath/btConvexHullComputer.cpp
@@ -1931,11 +1931,15 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1)
-static bool pointCmp(const btConvexHullInternal::Point32& p, const btConvexHullInternal::Point32& q)
+class pointCmp
-	return (p.y < q.y) || ((p.y == q.y) && ((p.x < q.x) || ((p.x == q.x) && (p.z < q.z))));
+	public:
+    bool operator() ( const btConvexHullInternal::Point32& p, const btConvexHullInternal::Point32& q ) const
+		{
+			return (p.y < q.y) || ((p.y == q.y) && ((p.x < q.x) || ((p.x == q.x) && (p.z < q.z))));
+		}
 void btConvexHullInternal::compute(const void* coords, bool doubleCoords, int stride, int count)
@@ -2026,7 +2030,7 @@ void btConvexHullInternal::compute(const void* coords, bool doubleCoords, int st
 			points[i].index = i;
-	points.quickSort(pointCmp);
+	points.quickSort(pointCmp());
diff --git a/src/bullet/LinearMath/btCpuFeatureUtility.h b/src/bullet/LinearMath/btCpuFeatureUtility.h
new file mode 100644
index 00000000..d2cab52d
--- /dev/null
+++ b/src/bullet/LinearMath/btCpuFeatureUtility.h
@@ -0,0 +1,92 @@
+#include "LinearMath/btScalar.h"
+#include <string.h>//memset
+#ifdef  USE_SIMD
+#include <emmintrin.h>
+#ifdef BT_ALLOW_SSE4
+#include <intrin.h>
+#endif //BT_ALLOW_SSE4
+#endif //USE_SIMD
+#if defined BT_USE_NEON
+#include <arm_neon.h>
+#include <sys/types.h>
+#include <sys/sysctl.h> //for sysctlbyname
+#endif //BT_USE_NEON
+///Rudimentary btCpuFeatureUtility for CPU features: only report the features that Bullet actually uses (SSE4/FMA3, NEON_HPFP)
+///We assume SSE2 in case BT_USE_SSE2 is defined in LinearMath/btScalar.h
+class btCpuFeatureUtility
+	enum btCpuFeature
+	{
+	};
+	static int getCpuFeatures()
+	{
+		static int capabilities = 0;
+		static bool testedCapabilities = false;
+		if (0 != testedCapabilities)
+		{
+			return capabilities;
+		}
+#ifdef BT_USE_NEON
+		{
+			uint32_t hasFeature = 0;
+			size_t featureSize = sizeof(hasFeature);
+			int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);
+			if (0 == err && hasFeature)
+				capabilities |= CPU_FEATURE_NEON_HPFP;
+		}
+#endif //BT_USE_NEON
+#ifdef  BT_ALLOW_SSE4
+		{
+			int					cpuInfo[4];
+			memset(cpuInfo, 0, sizeof(cpuInfo));
+			unsigned long long	sseExt = 0;
+			__cpuid(cpuInfo, 1);
+			bool osUsesXSAVE_XRSTORE = cpuInfo[2] & (1 << 27) || false;
+			bool cpuAVXSuport = cpuInfo[2] & (1 << 28) || false;
+			if (osUsesXSAVE_XRSTORE && cpuAVXSuport)
+			{
+				sseExt = _xgetbv(0);
+			}
+			const int OSXSAVEFlag = (1UL << 27);
+			const int AVXFlag = ((1UL << 28) | OSXSAVEFlag);
+			const int FMAFlag = ((1UL << 12) | AVXFlag | OSXSAVEFlag);
+			if ((cpuInfo[2] & FMAFlag) == FMAFlag && (sseExt & 6) == 6)
+			{
+				capabilities |= btCpuFeatureUtility::CPU_FEATURE_FMA3;
+			}
+			const int SSE41Flag = (1 << 19);
+			if (cpuInfo[2] & SSE41Flag)
+			{
+				capabilities |= btCpuFeatureUtility::CPU_FEATURE_SSE4_1;
+			}
+		}
+		testedCapabilities = true;
+		return capabilities;
+	}
+#endif //BT_CPU_UTILITY_H
diff --git a/src/bullet/LinearMath/btDefaultMotionState.h b/src/bullet/LinearMath/btDefaultMotionState.h
index a6b7ef15..01c5f8d9 100644
--- a/src/bullet/LinearMath/btDefaultMotionState.h
+++ b/src/bullet/LinearMath/btDefaultMotionState.h
@@ -4,13 +4,15 @@
 #include "btMotionState.h"
 ///The btDefaultMotionState provides a common implementation to synchronize world transforms with offsets.
-struct	btDefaultMotionState : public btMotionState
+ATTRIBUTE_ALIGNED16(struct)	btDefaultMotionState : public btMotionState
 	btTransform m_graphicsWorldTrans;
 	btTransform	m_centerOfMassOffset;
 	btTransform m_startWorldTrans;
 	void*		m_userPointer;
 	btDefaultMotionState(const btTransform& startTrans = btTransform::getIdentity(),const btTransform& centerOfMassOffset = btTransform::getIdentity())
 		: m_graphicsWorldTrans(startTrans),
@@ -23,14 +25,14 @@ struct	btDefaultMotionState : public btMotionState
 	///synchronizes world transform from user to physics
 	virtual void	getWorldTransform(btTransform& centerOfMassWorldTrans ) const 
-			centerOfMassWorldTrans = 	m_centerOfMassOffset.inverse() * m_graphicsWorldTrans ;
+			centerOfMassWorldTrans = m_graphicsWorldTrans * m_centerOfMassOffset.inverse() ;
 	///synchronizes world transform from physics to user
 	///Bullet only calls the update of worldtransform for active objects
 	virtual void	setWorldTransform(const btTransform& centerOfMassWorldTrans)
-			m_graphicsWorldTrans = centerOfMassWorldTrans * m_centerOfMassOffset ;
+			m_graphicsWorldTrans = centerOfMassWorldTrans * m_centerOfMassOffset;
diff --git a/src/bullet/LinearMath/btGrahamScan2dConvexHull.h b/src/bullet/LinearMath/btGrahamScan2dConvexHull.h
index d7bd3eb8..13a79aa5 100644
--- a/src/bullet/LinearMath/btGrahamScan2dConvexHull.h
+++ b/src/bullet/LinearMath/btGrahamScan2dConvexHull.h
@@ -21,9 +21,9 @@ subject to the following restrictions:
 #include "btVector3.h"
 #include "btAlignedObjectArray.h"
-struct GrahamVector2 : public btVector3
+struct GrahamVector3 : public btVector3
-	GrahamVector2(const btVector3& org, int orgIndex)
+	GrahamVector3(const btVector3& org, int orgIndex)
@@ -39,7 +39,7 @@ struct btAngleCompareFunc {
 	: m_anchor(anchor) 
-	bool operator()(const GrahamVector2& a, const GrahamVector2& b) const {
+	bool operator()(const GrahamVector3& a, const GrahamVector3& b) const {
 		if (a.m_angle != b.m_angle)
 			return a.m_angle < b.m_angle;
@@ -56,31 +56,46 @@ struct btAngleCompareFunc {
-inline void GrahamScanConvexHull2D(btAlignedObjectArray<GrahamVector2>& originalPoints, btAlignedObjectArray<GrahamVector2>& hull)
+inline void GrahamScanConvexHull2D(btAlignedObjectArray<GrahamVector3>& originalPoints, btAlignedObjectArray<GrahamVector3>& hull, const btVector3& normalAxis)
+	btVector3 axis0,axis1;
+	btPlaneSpace1(normalAxis,axis0,axis1);
 	if (originalPoints.size()<=1)
 		for (int i=0;i<originalPoints.size();i++)
-	//step1 : find anchor point with smallest x/y and move it to first location
-	//also precompute angles
+	//step1 : find anchor point with smallest projection on axis0 and move it to first location
 	for (int i=0;i<originalPoints.size();i++)
-		const btVector3& left = originalPoints[i];
-		const btVector3& right = originalPoints[0];
-		if (left.x() < right.x() || !(right.x() < left.x()) && left.y() < right.y())
+//		const btVector3& left = originalPoints[i];
+//		const btVector3& right = originalPoints[0];
+		btScalar projL = originalPoints[i].dot(axis0);
+		btScalar projR = originalPoints[0].dot(axis0);
+		if (projL < projR)
-	for (int i=0;i<originalPoints.size();i++)
+	//also precompute angles
+	originalPoints[0].m_angle = -1e30f;
+	for (int i=1;i<originalPoints.size();i++)
-		btVector3 xvec(1,0,0);
-		btVector3 ar = originalPoints[i]-originalPoints[0];
-		originalPoints[i].m_angle = btCross(xvec, ar).dot(btVector3(0,0,1)) / ar.length();
+	    btVector3 ar = originalPoints[i]-originalPoints[0];
+	    btScalar ar1 = axis1.dot(ar);
+	    btScalar ar0 = axis0.dot(ar);
+	    if( ar1*ar1+ar0*ar0 < FLT_EPSILON ) 
+	    {
+	      originalPoints[i].m_angle = 0.0f;
+	    }
+	    else
+	    {
+	      originalPoints[i].m_angle = btAtan2Fast(ar1, ar0);
+	    }
 	//step 2: sort all points, based on 'angle' with this anchor
@@ -98,12 +113,17 @@ inline void GrahamScanConvexHull2D(btAlignedObjectArray<GrahamVector2>& original
 		while (!isConvex&& hull.size()>1) {
 			btVector3& a = hull[hull.size()-2];
 			btVector3& b = hull[hull.size()-1];
-			isConvex = btCross(a-b,a-originalPoints[i]).dot(btVector3(0,0,1))> 0;
+			isConvex = btCross(a-b,a-originalPoints[i]).dot(normalAxis)> 0;
 			if (!isConvex)
+	    if( hull.size() == 1 )
+	    {
+	      hull.push_back( originalPoints[i] );
+	    }
diff --git a/src/bullet/LinearMath/btHashMap.h b/src/bullet/LinearMath/btHashMap.h
index dbe8abc3..ca6f326b 100644
--- a/src/bullet/LinearMath/btHashMap.h
+++ b/src/bullet/LinearMath/btHashMap.h
@@ -52,10 +52,8 @@ struct btHashString
 			int ret = 0 ;
-			while( ! (ret = *src - *dst) && *dst)
+			while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst)
 					++src, ++dst;
 			if ( ret < 0 )
 					ret = -1 ;
@@ -397,10 +395,27 @@ protected:
 		return &m_valueArray[index];
+    Key getKeyAtIndex(int index)
+    {
+        btAssert(index < m_keyArray.size());
+        return m_keyArray[index];
+    }
+    const Key getKeyAtIndex(int index) const
+    {
+        btAssert(index < m_keyArray.size());
+        return m_keyArray[index];
+    }
 	Value* operator[](const Key& key) {
 		return find(key);
+	const Value* operator[](const Key& key) const {
+		return find(key);
+	}
 	const Value*	find(const Key& key) const
 		int index = findIndex(key);
diff --git a/src/bullet/LinearMath/btIDebugDraw.h b/src/bullet/LinearMath/btIDebugDraw.h
index 935502f8..a020c3f4 100644
--- a/src/bullet/LinearMath/btIDebugDraw.h
+++ b/src/bullet/LinearMath/btIDebugDraw.h
@@ -21,6 +21,7 @@ subject to the following restrictions:
 #include "btTransform.h"
 ///The btIDebugDraw interface class allows hooking up a debug renderer to visually debug simulations.
 ///Typical use case: create a debug drawer object, and assign it to a btCollisionWorld or btDynamicsWorld using setDebugDrawer and call debugDrawWorld.
 ///A class that implements the btIDebugDraw interface has to implement the drawLine method at a minimum.
@@ -29,6 +30,29 @@ class	btIDebugDraw
+	ATTRIBUTE_ALIGNED16(struct) DefaultColors
+	{
+		btVector3	m_activeObject;
+		btVector3	m_deactivatedObject;
+		btVector3	m_wantsDeactivationObject;
+		btVector3	m_disabledDeactivationObject;
+		btVector3	m_disabledSimulationObject;
+		btVector3	m_aabb;
+		btVector3 m_contactPoint;
+		DefaultColors()
+		:	m_activeObject(1,1,1),
+			m_deactivatedObject(0,1,0),
+			m_wantsDeactivationObject(0,1,1),
+			m_disabledDeactivationObject(1,0,0),
+			m_disabledSimulationObject(1,1,0),
+			m_aabb(1,0,0),
+			m_contactPoint(1,1,0)
+		{
+		}
+	};
 	enum	DebugDrawModes
@@ -46,12 +70,18 @@ class	btIDebugDraw
 		DBG_DrawConstraints = (1 << 11),
 		DBG_DrawConstraintLimits = (1 << 12),
 		DBG_FastWireframe = (1<<13),
-        DBG_DrawNormals = (1<<14),
+		DBG_DrawNormals = (1<<14),
+		DBG_DrawFrames = (1<<15),
 	virtual ~btIDebugDraw() {};
+	virtual DefaultColors	getDefaultColors() const	{	DefaultColors colors;	return colors;	}
+	///the default implementation for setDefaultColors has no effect. A derived class can implement it and store the colors.
+	virtual void setDefaultColors(const DefaultColors& /*colors*/) {}
 	virtual void	drawLine(const btVector3& from,const btVector3& to,const btVector3& color)=0;
 	virtual void    drawLine(const btVector3& from,const btVector3& to, const btVector3& fromColor, const btVector3& toColor)
@@ -62,29 +92,17 @@ class	btIDebugDraw
 	virtual void	drawSphere(btScalar radius, const btTransform& transform, const btVector3& color)
-		btVector3 start = transform.getOrigin();
-		const btVector3 xoffs = transform.getBasis() * btVector3(radius,0,0);
-		const btVector3 yoffs = transform.getBasis() * btVector3(0,radius,0);
-		const btVector3 zoffs = transform.getBasis() * btVector3(0,0,radius);
-		// XY 
-		drawLine(start-xoffs, start+yoffs, color);
-		drawLine(start+yoffs, start+xoffs, color);
-		drawLine(start+xoffs, start-yoffs, color);
-		drawLine(start-yoffs, start-xoffs, color);
-		// XZ
-		drawLine(start-xoffs, start+zoffs, color);
-		drawLine(start+zoffs, start+xoffs, color);
-		drawLine(start+xoffs, start-zoffs, color);
-		drawLine(start-zoffs, start-xoffs, color);
-		// YZ
-		drawLine(start-yoffs, start+zoffs, color);
-		drawLine(start+zoffs, start+yoffs, color);
-		drawLine(start+yoffs, start-zoffs, color);
-		drawLine(start-zoffs, start-yoffs, color);
+		btVector3 center = transform.getOrigin();
+		btVector3 up = transform.getBasis().getColumn(1);
+		btVector3 axis = transform.getBasis().getColumn(0);
+		btScalar minTh = -SIMD_HALF_PI;
+		btScalar maxTh = SIMD_HALF_PI;
+		btScalar minPs = -SIMD_HALF_PI;
+		btScalar maxPs = SIMD_HALF_PI;
+		btScalar stepDegrees = 30.f;
+		drawSpherePatch(center, up, axis, radius,minTh, maxTh, minPs, maxPs, color, stepDegrees ,false);
+		drawSpherePatch(center, up, -axis, radius,minTh, maxTh, minPs, maxPs, color, stepDegrees,false );
 	virtual void	drawSphere (const btVector3& p, btScalar radius, const btVector3& color)
@@ -148,9 +166,9 @@ class	btIDebugDraw
 	virtual void drawTransform(const btTransform& transform, btScalar orthoLen)
 		btVector3 start = transform.getOrigin();
-		drawLine(start, start+transform.getBasis() * btVector3(orthoLen, 0, 0), btVector3(0.7f,0,0));
-		drawLine(start, start+transform.getBasis() * btVector3(0, orthoLen, 0), btVector3(0,0.7f,0));
-		drawLine(start, start+transform.getBasis() * btVector3(0, 0, orthoLen), btVector3(0,0,0.7f));
+		drawLine(start, start+transform.getBasis() * btVector3(orthoLen, 0, 0), btVector3(1.f,0.3,0.3));
+		drawLine(start, start+transform.getBasis() * btVector3(0, orthoLen, 0), btVector3(0.3,1.f, 0.3));
+		drawLine(start, start+transform.getBasis() * btVector3(0, 0, orthoLen), btVector3(0.3, 0.3,1.f));
 	virtual void drawArc(const btVector3& center, const btVector3& normal, const btVector3& axis, btScalar radiusA, btScalar radiusB, btScalar minAngle, btScalar maxAngle, 
@@ -159,7 +177,7 @@ class	btIDebugDraw
 		const btVector3& vx = axis;
 		btVector3 vy = normal.cross(axis);
 		btScalar step = stepDegrees * SIMD_RADS_PER_DEG;
-		int nSteps = (int)((maxAngle - minAngle) / step);
+		int nSteps = (int)btFabs((maxAngle - minAngle) / step);
 		if(!nSteps) nSteps = 1;
 		btVector3 prev = center + radiusA * vx * btCos(minAngle) + radiusB * vy * btSin(minAngle);
@@ -179,7 +197,7 @@ class	btIDebugDraw
 	virtual void drawSpherePatch(const btVector3& center, const btVector3& up, const btVector3& axis, btScalar radius, 
-		btScalar minTh, btScalar maxTh, btScalar minPs, btScalar maxPs, const btVector3& color, btScalar stepDegrees = btScalar(10.f))
+		btScalar minTh, btScalar maxTh, btScalar minPs, btScalar maxPs, const btVector3& color, btScalar stepDegrees = btScalar(10.f),bool drawCenter = true)
 		btVector3 vA[74];
 		btVector3 vB[74];
@@ -261,18 +279,22 @@ class	btIDebugDraw
 					drawLine(npole, pvB[j], color);
-				if(isClosed)
+				if (drawCenter)
-					if(j == (n_vert-1))
+					if(isClosed)
-						drawLine(arcStart, pvB[j], color);
+						if(j == (n_vert-1))
+						{
+							drawLine(arcStart, pvB[j], color);
+						}
-				}
-				else
-				{
-					if(((!i) || (i == (n_hor-1))) && ((!j) || (j == (n_vert-1))))
+					else
-						drawLine(center, pvB[j], color);
+						if(((!i) || (i == (n_hor-1))) && ((!j) || (j == (n_vert-1))))
+						{
+							drawLine(center, pvB[j], color);
+						}
@@ -280,6 +302,7 @@ class	btIDebugDraw
 	virtual void drawBox(const btVector3& bbMin, const btVector3& bbMax, const btVector3& color)
 		drawLine(btVector3(bbMin[0], bbMin[1], bbMin[2]), btVector3(bbMax[0], bbMin[1], bbMin[2]), color);
@@ -313,6 +336,8 @@ class	btIDebugDraw
 	virtual void drawCapsule(btScalar radius, btScalar halfHeight, int upAxis, const btTransform& transform, const btVector3& color)
+		int stepDegrees = 30;
 		btVector3 capStart(0.f,0.f,0.f);
 		capStart[upAxis] = -halfHeight;
@@ -324,34 +349,47 @@ class	btIDebugDraw
 			btTransform childTransform = transform;
 			childTransform.getOrigin() = transform * capStart;
-			drawSphere(radius, childTransform, color);
+			{
+				btVector3 center = childTransform.getOrigin();
+				btVector3 up = childTransform.getBasis().getColumn((upAxis+1)%3);
+				btVector3 axis = -childTransform.getBasis().getColumn(upAxis);
+				btScalar minTh = -SIMD_HALF_PI;
+				btScalar maxTh = SIMD_HALF_PI;
+				btScalar minPs = -SIMD_HALF_PI;
+				btScalar maxPs = SIMD_HALF_PI;
+				drawSpherePatch(center, up, axis, radius,minTh, maxTh, minPs, maxPs, color, btScalar(stepDegrees) ,false);
+			}
 			btTransform childTransform = transform;
 			childTransform.getOrigin() = transform * capEnd;
-			drawSphere(radius, childTransform, color);
+			{
+				btVector3 center = childTransform.getOrigin();
+				btVector3 up = childTransform.getBasis().getColumn((upAxis+1)%3);
+				btVector3 axis = childTransform.getBasis().getColumn(upAxis);
+				btScalar minTh = -SIMD_HALF_PI;
+				btScalar maxTh = SIMD_HALF_PI;
+				btScalar minPs = -SIMD_HALF_PI;
+				btScalar maxPs = SIMD_HALF_PI;
+				drawSpherePatch(center, up, axis, radius,minTh, maxTh, minPs, maxPs, color, btScalar(stepDegrees) ,false);
+			}
 		// Draw some additional lines
 		btVector3 start = transform.getOrigin();
-		capStart[(upAxis+1)%3] = radius;
-		capEnd[(upAxis+1)%3] = radius;
-		drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
-		capStart[(upAxis+1)%3] = -radius;
-		capEnd[(upAxis+1)%3] = -radius;
-		drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
-		capStart[(upAxis+1)%3] = 0.f;
-		capEnd[(upAxis+1)%3] = 0.f;
-		capStart[(upAxis+2)%3] = radius;
-		capEnd[(upAxis+2)%3] = radius;
-		drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
-		capStart[(upAxis+2)%3] = -radius;
-		capEnd[(upAxis+2)%3] = -radius;
-		drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
+		for (int i=0;i<360;i+=stepDegrees)
+		{
+			capEnd[(upAxis+1)%3] = capStart[(upAxis+1)%3] = btSin(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
+			capEnd[(upAxis+2)%3] = capStart[(upAxis+2)%3]  = btCos(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
+			drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
+		}
 	virtual void drawCylinder(btScalar radius, btScalar halfHeight, int upAxis, const btTransform& transform, const btVector3& color)
@@ -359,11 +397,18 @@ class	btIDebugDraw
 		btVector3 start = transform.getOrigin();
 		btVector3	offsetHeight(0,0,0);
 		offsetHeight[upAxis] = halfHeight;
-		btVector3	offsetRadius(0,0,0);
-		offsetRadius[(upAxis+1)%3] = radius;
-		drawLine(start+transform.getBasis() * (offsetHeight+offsetRadius),start+transform.getBasis() * (-offsetHeight+offsetRadius),color);
-		drawLine(start+transform.getBasis() * (offsetHeight-offsetRadius),start+transform.getBasis() * (-offsetHeight-offsetRadius),color);
+		int stepDegrees=30;
+		btVector3 capStart(0.f,0.f,0.f);
+		capStart[upAxis] = -halfHeight;
+		btVector3 capEnd(0.f,0.f,0.f);
+		capEnd[upAxis] = halfHeight;
+		for (int i=0;i<360;i+=stepDegrees)
+		{
+			capEnd[(upAxis+1)%3] = capStart[(upAxis+1)%3] = btSin(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
+			capEnd[(upAxis+2)%3] = capStart[(upAxis+2)%3]  = btCos(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
+			drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
+		}
 		// Drawing top and bottom caps of the cylinder
 		btVector3 yaxis(0,0,0);
 		yaxis[upAxis] = btScalar(1.0);
@@ -375,16 +420,28 @@ class	btIDebugDraw
 	virtual void drawCone(btScalar radius, btScalar height, int upAxis, const btTransform& transform, const btVector3& color)
+		int stepDegrees = 30;
 		btVector3 start = transform.getOrigin();
 		btVector3	offsetHeight(0,0,0);
-		offsetHeight[upAxis] = height * btScalar(0.5);
+		btScalar halfHeight = height * btScalar(0.5);
+		offsetHeight[upAxis] = halfHeight;
 		btVector3	offsetRadius(0,0,0);
 		offsetRadius[(upAxis+1)%3] = radius;
 		btVector3	offset2Radius(0,0,0);
 		offset2Radius[(upAxis+2)%3] = radius;
+		btVector3 capEnd(0.f,0.f,0.f);
+		capEnd[upAxis] = -halfHeight;
+		for (int i=0;i<360;i+=stepDegrees)
+		{
+			capEnd[(upAxis+1)%3] = btSin(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
+			capEnd[(upAxis+2)%3] = btCos(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
+			drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * capEnd, color);
+		}
 		drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * (-offsetHeight+offsetRadius),color);
 		drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * (-offsetHeight-offsetRadius),color);
 		drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * (-offsetHeight+offset2Radius),color);
@@ -411,6 +468,10 @@ class	btIDebugDraw
+	virtual void flushLines()
+	{
+	}
diff --git a/src/bullet/LinearMath/btMatrix3x3.h b/src/bullet/LinearMath/btMatrix3x3.h
index d0234a04..963c5db9 100644
--- a/src/bullet/LinearMath/btMatrix3x3.h
+++ b/src/bullet/LinearMath/btMatrix3x3.h
@@ -18,6 +18,23 @@ subject to the following restrictions:
 #include "btVector3.h"
 #include "btQuaternion.h"
+#include <stdio.h>
+#ifdef BT_USE_SSE
+//const __m128 ATTRIBUTE_ALIGNED16(v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
+//const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
+#define vMPPP (_mm_set_ps (+0.0f, +0.0f, +0.0f, -0.0f))
+#if defined(BT_USE_SSE)
+#define v1000 (_mm_set_ps(0.0f,0.0f,0.0f,1.0f))
+#define v0100 (_mm_set_ps(0.0f,0.0f,1.0f,0.0f))
+#define v0010 (_mm_set_ps(0.0f,1.0f,0.0f,0.0f))
+#elif defined(BT_USE_NEON)
+const btSimdFloat4 ATTRIBUTE_ALIGNED16(v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
+const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
+const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
 #define btMatrix3x3Data	btMatrix3x3DoubleData 
@@ -28,7 +45,7 @@ subject to the following restrictions:
 /**@brief The btMatrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with btQuaternion, btTransform and btVector3.
 * Make sure to only include a pure orthogonal matrix without scaling. */
-class btMatrix3x3 {
+ATTRIBUTE_ALIGNED16(class) btMatrix3x3 {
 	///Data storage for the matrix, each vector is a row of the matrix
 	btVector3 m_el[3];
@@ -57,6 +74,42 @@ public:
 			yx, yy, yz, 
 			zx, zy, zz);
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+	SIMD_FORCE_INLINE btMatrix3x3 (const btSimdFloat4 v0, const btSimdFloat4 v1, const btSimdFloat4 v2 ) 
+	{
+        m_el[0].mVec128 = v0;
+        m_el[1].mVec128 = v1;
+        m_el[2].mVec128 = v2;
+	}
+	SIMD_FORCE_INLINE btMatrix3x3 (const btVector3& v0, const btVector3& v1, const btVector3& v2 ) 
+	{
+        m_el[0] = v0;
+        m_el[1] = v1;
+        m_el[2] = v2;
+	}
+	// Copy constructor
+	SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& rhs)
+	{
+		m_el[0].mVec128 = rhs.m_el[0].mVec128;
+		m_el[1].mVec128 = rhs.m_el[1].mVec128;
+		m_el[2].mVec128 = rhs.m_el[2].mVec128;
+	}
+	// Assignment Operator
+	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& m) 
+	{
+		m_el[0].mVec128 = m.m_el[0].mVec128;
+		m_el[1].mVec128 = m.m_el[1].mVec128;
+		m_el[2].mVec128 = m.m_el[2].mVec128;
+		return *this;
+	}
 	/** @brief Copy constructor */
 	SIMD_FORCE_INLINE btMatrix3x3 (const btMatrix3x3& other)
@@ -64,6 +117,7 @@ public:
 		m_el[1] = other.m_el[1];
 		m_el[2] = other.m_el[2];
 	/** @brief Assignment Operator */
 	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& other)
@@ -73,6 +127,8 @@ public:
 		return *this;
 	/** @brief Get a column of the matrix as a vector 
 	*  @param i Column number 0 indexed */
 	SIMD_FORCE_INLINE btVector3 getColumn(int i) const
@@ -155,14 +211,69 @@ public:
 		btScalar d = q.length2();
 		btFullAssert(d != btScalar(0.0));
 		btScalar s = btScalar(2.0) / d;
+    #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+        __m128	vs, Q = q.get128();
+		__m128i Qi = btCastfTo128i(Q);
+        __m128	Y, Z;
+        __m128	V1, V2, V3;
+        __m128	V11, V21, V31;
+        __m128	NQ = _mm_xor_ps(Q, btvMzeroMask);
+		__m128i NQi = btCastfTo128i(NQ);
+        V1 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,2,3)));	// Y X Z W
+		V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0,0,1,3));     // -X -X  Y  W
+        V3 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(2,1,0,3)));	// Z Y X W
+        V1 = _mm_xor_ps(V1, vMPPP);	//	change the sign of the first element
+        V11	= btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,1,0,3)));	// Y Y X W
+		V21 = _mm_unpackhi_ps(Q, Q);                    //  Z  Z  W  W
+		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0,2,0,3));	//  X  Z -X -W
+		V2 = V2 * V1;	//
+		V1 = V1 * V11;	//
+		V3 = V3 * V31;	//
+        V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2,3,1,3));	//	-Z -W  Y  W
+		V11 = V11 * V21;	//
+        V21 = _mm_xor_ps(V21, vMPPP);	//	change the sign of the first element
+		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3,3,1,3));	//	 W  W -Y -W
+        V31 = _mm_xor_ps(V31, vMPPP);	//	change the sign of the first element
+		Y = btCastiTo128f(_mm_shuffle_epi32 (NQi, BT_SHUFFLE(3,2,0,3)));	// -W -Z -X -W
+		Z = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,1,3)));	//  Y  X  Y  W
+		vs = _mm_load_ss(&s);
+		V21 = V21 * Y;
+		V31 = V31 * Z;
+		V1 = V1 + V11;
+        V2 = V2 + V21;
+        V3 = V3 + V31;
+        vs = bt_splat3_ps(vs, 0);
+            //	s ready
+        V1 = V1 * vs;
+        V2 = V2 * vs;
+        V3 = V3 * vs;
+        V1 = V1 + v1000;
+        V2 = V2 + v0100;
+        V3 = V3 + v0010;
+        m_el[0] = V1; 
+        m_el[1] = V2;
+        m_el[2] = V3;
+    #else    
 		btScalar xs = q.x() * s,   ys = q.y() * s,   zs = q.z() * s;
 		btScalar wx = q.w() * xs,  wy = q.w() * ys,  wz = q.w() * zs;
 		btScalar xx = q.x() * xs,  xy = q.x() * ys,  xz = q.x() * zs;
 		btScalar yy = q.y() * ys,  yz = q.y() * zs,  zz = q.z() * zs;
-		setValue(btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
+		setValue(
+            btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
 			xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
 			xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
-	}
+	#endif
+    }
 	/** @brief Set the matrix from euler angles using YPR around YXZ respectively
@@ -205,16 +316,29 @@ public:
 	/**@brief Set the matrix to the identity */
 	void setIdentity()
+#if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
+			m_el[0] = v1000; 
+			m_el[1] = v0100;
+			m_el[2] = v0010;
 		setValue(btScalar(1.0), btScalar(0.0), btScalar(0.0), 
 			btScalar(0.0), btScalar(1.0), btScalar(0.0), 
 			btScalar(0.0), btScalar(0.0), btScalar(1.0)); 
 	static const btMatrix3x3&	getIdentity()
-		static const btMatrix3x3 identityMatrix(btScalar(1.0), btScalar(0.0), btScalar(0.0), 
+#if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
+        static const btMatrix3x3 
+        identityMatrix(v1000, v0100, v0010);
+		static const btMatrix3x3 
+        identityMatrix(
+            btScalar(1.0), btScalar(0.0), btScalar(0.0), 
 			btScalar(0.0), btScalar(1.0), btScalar(0.0), 
 			btScalar(0.0), btScalar(0.0), btScalar(1.0));
 		return identityMatrix;
@@ -222,6 +346,40 @@ public:
 	* @param m The array to be filled */
 	void getOpenGLSubMatrix(btScalar *m) const 
+#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+        __m128 v0 = m_el[0].mVec128;
+        __m128 v1 = m_el[1].mVec128;
+        __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
+        __m128 *vm = (__m128 *)m;
+        __m128 vT;
+        v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0
+        vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
+        v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1
+        v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
+        v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
+        v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));	// z0 z1 z2 0
+        vm[0] = v0;
+        vm[1] = v1;
+        vm[2] = v2;
+#elif defined(BT_USE_NEON)
+        // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
+        static const uint32x2_t zMask = (const uint32x2_t) {static_cast<uint32_t>(-1), 0 };
+        float32x4_t *vm = (float32x4_t *)m;
+        float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
+        float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
+        float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
+        float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
+        float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
+        float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
+        vm[0] = v0;
+        vm[1] = v1;
+        vm[2] = v2;
 		m[0]  = btScalar(m_el[0].x()); 
 		m[1]  = btScalar(m_el[1].x());
 		m[2]  = btScalar(m_el[2].x());
@@ -234,13 +392,67 @@ public:
 		m[9]  = btScalar(m_el[1].z());
 		m[10] = btScalar(m_el[2].z());
 		m[11] = btScalar(0.0); 
 	/**@brief Get the matrix represented as a quaternion 
 	* @param q The quaternion which will be set */
 	void getRotation(btQuaternion& q) const
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+        btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
+        btScalar s, x;
+        union {
+            btSimdFloat4 vec;
+            btScalar f[4];
+        } temp;
+        if (trace > btScalar(0.0)) 
+        {
+            x = trace + btScalar(1.0);
+            temp.f[0]=m_el[2].y() - m_el[1].z();
+            temp.f[1]=m_el[0].z() - m_el[2].x();
+            temp.f[2]=m_el[1].x() - m_el[0].y();
+            temp.f[3]=x;
+            //temp.f[3]= s * btScalar(0.5);
+        } 
+        else 
+        {
+            int i, j, k;
+            if(m_el[0].x() < m_el[1].y()) 
+            { 
+                if( m_el[1].y() < m_el[2].z() )
+                    { i = 2; j = 0; k = 1; }
+                else
+                    { i = 1; j = 2; k = 0; }
+            }
+            else
+            {
+                if( m_el[0].x() < m_el[2].z())
+                    { i = 2; j = 0; k = 1; }
+                else
+                    { i = 0; j = 1; k = 2; }
+            }
+            x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);
+            temp.f[3] = (m_el[k][j] - m_el[j][k]);
+            temp.f[j] = (m_el[j][i] + m_el[i][j]);
+            temp.f[k] = (m_el[k][i] + m_el[i][k]);
+            temp.f[i] = x;
+            //temp.f[i] = s * btScalar(0.5);
+        }
+        s = btSqrt(x);
+        q.set128(temp.vec);
+        s = btScalar(0.5) / s;
+        q *= s;
 		btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
 		btScalar temp[4];
 		if (trace > btScalar(0.0)) 
@@ -270,6 +482,7 @@ public:
 			temp[k] = (m_el[k][i] + m_el[i][k]) * s;
 	/**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR
@@ -376,9 +589,14 @@ public:
 	btMatrix3x3 scaled(const btVector3& s) const
-		return btMatrix3x3(m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+		return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
+		return btMatrix3x3(
+            m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
 			m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
 			m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());
 	/**@brief Return the determinant of the matrix */
@@ -392,6 +610,27 @@ public:
 	/**@brief Return the inverse of the matrix */
 	btMatrix3x3 inverse() const; 
+	/// Solve A * x = b for x with Cramer's rule, where b is a column vector.
+	/// Cheaper than building the full inverse when only one right-hand side is needed.
+	/// Solve33 is from Box2d, thanks to Erin Catto.
+	/// @param b Right-hand side column vector
+	/// @return The solution x. When |det(A)| <= SIMD_EPSILON the determinant is not
+	///         inverted, so the numerators are scaled by det itself instead of 1/det.
+	btVector3 solve33(const btVector3& b) const
+	{
+		const btVector3 c0 = getColumn(0);
+		const btVector3 c1 = getColumn(1);
+		const btVector3 c2 = getColumn(2);
+		// c1 x c2 feeds both the determinant and the first solution component
+		const btVector3 c1xc2 = btCross(c1, c2);
+		btScalar scale = btDot(c0, c1xc2);
+		if (btFabs(scale)>SIMD_EPSILON)
+		{
+			scale = 1.0f / scale;
+		}
+		btVector3 x;
+		x[0] = scale * btDot(b, c1xc2);
+		x[1] = scale * btDot(c0, btCross(b, c2));
+		x[2] = scale * btDot(c0, btCross(c1, b));
+		return x;
+	}
 	btMatrix3x3 transposeTimes(const btMatrix3x3& m) const;
 	btMatrix3x3 timesTranspose(const btMatrix3x3& m) const;
@@ -527,15 +766,101 @@ public:
 SIMD_FORCE_INLINE btMatrix3x3& 
 btMatrix3x3::operator*=(const btMatrix3x3& m)
-	setValue(m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
+#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+    __m128 rv00, rv01, rv02;
+    __m128 rv10, rv11, rv12;
+    __m128 rv20, rv21, rv22;
+    __m128 mv0, mv1, mv2;
+    rv02 = m_el[0].mVec128;
+    rv12 = m_el[1].mVec128;
+    rv22 = m_el[2].mVec128;
+    mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask); 
+    mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask); 
+    mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask); 
+    // rv0
+    rv00 = bt_splat_ps(rv02, 0);
+    rv01 = bt_splat_ps(rv02, 1);
+    rv02 = bt_splat_ps(rv02, 2);
+    rv00 = _mm_mul_ps(rv00, mv0);
+    rv01 = _mm_mul_ps(rv01, mv1);
+    rv02 = _mm_mul_ps(rv02, mv2);
+    // rv1
+    rv10 = bt_splat_ps(rv12, 0);
+    rv11 = bt_splat_ps(rv12, 1);
+    rv12 = bt_splat_ps(rv12, 2);
+    rv10 = _mm_mul_ps(rv10, mv0);
+    rv11 = _mm_mul_ps(rv11, mv1);
+    rv12 = _mm_mul_ps(rv12, mv2);
+    // rv2
+    rv20 = bt_splat_ps(rv22, 0);
+    rv21 = bt_splat_ps(rv22, 1);
+    rv22 = bt_splat_ps(rv22, 2);
+    rv20 = _mm_mul_ps(rv20, mv0);
+    rv21 = _mm_mul_ps(rv21, mv1);
+    rv22 = _mm_mul_ps(rv22, mv2);
+    rv00 = _mm_add_ps(rv00, rv01);
+    rv10 = _mm_add_ps(rv10, rv11);
+    rv20 = _mm_add_ps(rv20, rv21);
+    m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
+    m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
+    m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
+#elif defined(BT_USE_NEON)
+    float32x4_t rv0, rv1, rv2;
+    float32x4_t v0, v1, v2;
+    float32x4_t mv0, mv1, mv2;
+    v0 = m_el[0].mVec128;
+    v1 = m_el[1].mVec128;
+    v2 = m_el[2].mVec128;
+    mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask); 
+    mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask); 
+    mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask); 
+    rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
+    rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
+    rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
+    rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
+    rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
+    rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
+    rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
+    rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
+    rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
+    m_el[0].mVec128 = rv0;
+    m_el[1].mVec128 = rv1;
+    m_el[2].mVec128 = rv2;
+	setValue(
+        m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
 		m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
 		m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
 	return *this;
 SIMD_FORCE_INLINE btMatrix3x3& 
 btMatrix3x3::operator+=(const btMatrix3x3& m)
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+    m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
+    m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
+    m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
@@ -546,52 +871,89 @@ btMatrix3x3::operator+=(const btMatrix3x3& m)
 	return *this;
 operator*(const btMatrix3x3& m, const btScalar & k)
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    __m128 vk = bt_splat_ps(_mm_load_ss((float *)&k), 0x80);
+    return btMatrix3x3(
+                _mm_mul_ps(m[0].mVec128, vk), 
+                _mm_mul_ps(m[1].mVec128, vk), 
+                _mm_mul_ps(m[2].mVec128, vk)); 
+#elif defined(BT_USE_NEON)
+    return btMatrix3x3(
+                vmulq_n_f32(m[0].mVec128, k),
+                vmulq_n_f32(m[1].mVec128, k),
+                vmulq_n_f32(m[2].mVec128, k)); 
 	return btMatrix3x3(
- SIMD_FORCE_INLINE btMatrix3x3 
 operator+(const btMatrix3x3& m1, const btMatrix3x3& m2)
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+	return btMatrix3x3(
+        m1[0].mVec128 + m2[0].mVec128,
+        m1[1].mVec128 + m2[1].mVec128,
+        m1[2].mVec128 + m2[2].mVec128);
 	return btMatrix3x3(
-	m1[0][0]+m2[0][0], 
-	m1[0][1]+m2[0][1],
-	m1[0][2]+m2[0][2],
-	m1[1][0]+m2[1][0], 
-	m1[1][1]+m2[1][1],
-	m1[1][2]+m2[1][2],
-	m1[2][0]+m2[2][0], 
-	m1[2][1]+m2[2][1],
-	m1[2][2]+m2[2][2]);
+        m1[0][0]+m2[0][0], 
+        m1[0][1]+m2[0][1],
+        m1[0][2]+m2[0][2],
+        m1[1][0]+m2[1][0], 
+        m1[1][1]+m2[1][1],
+        m1[1][2]+m2[1][2],
+        m1[2][0]+m2[2][0], 
+        m1[2][1]+m2[2][1],
+        m1[2][2]+m2[2][2]);
 operator-(const btMatrix3x3& m1, const btMatrix3x3& m2)
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
 	return btMatrix3x3(
-	m1[0][0]-m2[0][0], 
-	m1[0][1]-m2[0][1],
-	m1[0][2]-m2[0][2],
-	m1[1][0]-m2[1][0], 
-	m1[1][1]-m2[1][1],
-	m1[1][2]-m2[1][2],
-	m1[2][0]-m2[2][0], 
-	m1[2][1]-m2[2][1],
-	m1[2][2]-m2[2][2]);
+        m1[0].mVec128 - m2[0].mVec128,
+        m1[1].mVec128 - m2[1].mVec128,
+        m1[2].mVec128 - m2[2].mVec128);
+	return btMatrix3x3(
+        m1[0][0]-m2[0][0], 
+        m1[0][1]-m2[0][1],
+        m1[0][2]-m2[0][2],
+        m1[1][0]-m2[1][0], 
+        m1[1][1]-m2[1][1],
+        m1[1][2]-m2[1][2],
+        m1[2][0]-m2[2][0], 
+        m1[2][1]-m2[2][1],
+        m1[2][2]-m2[2][2]);
 SIMD_FORCE_INLINE btMatrix3x3& 
 btMatrix3x3::operator-=(const btMatrix3x3& m)
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+    m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
+    m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
+    m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
@@ -602,6 +964,7 @@ btMatrix3x3::operator-=(const btMatrix3x3& m)
 	return *this;
@@ -616,18 +979,59 @@ btMatrix3x3::determinant() const
 btMatrix3x3::absolute() const
+#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    return btMatrix3x3(
+            _mm_and_ps(m_el[0].mVec128, btvAbsfMask),
+            _mm_and_ps(m_el[1].mVec128, btvAbsfMask),
+            _mm_and_ps(m_el[2].mVec128, btvAbsfMask));
+#elif defined(BT_USE_NEON)
+    return btMatrix3x3(
+            (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
+            (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
+            (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
 	return btMatrix3x3(
-		btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
-		btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
-		btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
+            btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
+            btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
+            btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
 btMatrix3x3::transpose() const 
-	return btMatrix3x3(m_el[0].x(), m_el[1].x(), m_el[2].x(),
-		m_el[0].y(), m_el[1].y(), m_el[2].y(),
-		m_el[0].z(), m_el[1].z(), m_el[2].z());
+#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    __m128 v0 = m_el[0].mVec128;
+    __m128 v1 = m_el[1].mVec128;
+    __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
+    __m128 vT;
+    v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0
+    vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
+    v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1
+    v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
+    v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
+    v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));	// z0 z1 z2 0
+    return btMatrix3x3( v0, v1, v2 );
+#elif defined(BT_USE_NEON)
+    // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
+    static const uint32x2_t zMask = (const uint32x2_t) {static_cast<uint32_t>(-1), 0 };
+    float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
+    float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
+    float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
+    float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
+    float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
+    float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
+    return btMatrix3x3( v0, v1, v2 ); 
+	return btMatrix3x3( m_el[0].x(), m_el[1].x(), m_el[2].x(),
+                        m_el[0].y(), m_el[1].y(), m_el[2].y(),
+                        m_el[0].z(), m_el[1].z(), m_el[2].z());
@@ -653,7 +1057,47 @@ btMatrix3x3::inverse() const
 btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
-	return btMatrix3x3(
+#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    // zeros w
+//    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
+    __m128 row = m_el[0].mVec128;
+    __m128 m0 = _mm_and_ps( m.getRow(0).mVec128, btvFFF0fMask );
+    __m128 m1 = _mm_and_ps( m.getRow(1).mVec128, btvFFF0fMask);
+    __m128 m2 = _mm_and_ps( m.getRow(2).mVec128, btvFFF0fMask );
+    __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
+    __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
+    __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
+    row = m_el[1].mVec128;
+    r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
+    r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
+    r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
+    row = m_el[2].mVec128;
+    r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
+    r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
+    r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
+    return btMatrix3x3( r0, r1, r2 );
+#elif defined BT_USE_NEON
+    // zeros w
+    static const uint32x4_t xyzMask = (const uint32x4_t){ static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0 };
+    float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask );
+    float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask );
+    float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask );
+    float32x4_t row = m_el[0].mVec128;
+    float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0);
+    float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1);
+    float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0);
+    row = m_el[1].mVec128;
+    r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0);
+    r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1);
+    r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0);
+    row = m_el[2].mVec128;
+    r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0);
+    r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1);
+    r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0);
+    return btMatrix3x3( r0, r1, r2 );
+    return btMatrix3x3(
 		m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
 		m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
 		m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
@@ -663,38 +1107,196 @@ btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
 		m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
 		m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
 		m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
 btMatrix3x3::timesTranspose(const btMatrix3x3& m) const
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    __m128 a0 = m_el[0].mVec128;
+    __m128 a1 = m_el[1].mVec128;
+    __m128 a2 = m_el[2].mVec128;
+    btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
+    __m128 mx = mT[0].mVec128;
+    __m128 my = mT[1].mVec128;
+    __m128 mz = mT[2].mVec128;
+    __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
+    __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
+    __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
+    r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
+    r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
+    r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
+    r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
+    r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
+    r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
+    return btMatrix3x3( r0, r1, r2);
+#elif defined BT_USE_NEON
+    float32x4_t a0 = m_el[0].mVec128;
+    float32x4_t a1 = m_el[1].mVec128;
+    float32x4_t a2 = m_el[2].mVec128;
+    btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
+    float32x4_t mx = mT[0].mVec128;
+    float32x4_t my = mT[1].mVec128;
+    float32x4_t mz = mT[2].mVec128;
+    float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0);
+    float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0);
+    float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0);
+    r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1);
+    r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1);
+    r2 = vmlaq_lane_f32( r2, my, vget_low_f32(a2), 1);
+    r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0);
+    r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0);
+    r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0);
+    return btMatrix3x3( r0, r1, r2 );
 	return btMatrix3x3(
 		m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
 		m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
 		m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
 operator*(const btMatrix3x3& m, const btVector3& v) 
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+    return v.dot3(m[0], m[1], m[2]);
 	return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
 operator*(const btVector3& v, const btMatrix3x3& m)
+#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    const __m128 vv = v.mVec128;
+    __m128 c0 = bt_splat_ps( vv, 0);
+    __m128 c1 = bt_splat_ps( vv, 1);
+    __m128 c2 = bt_splat_ps( vv, 2);
+    c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask) );
+    c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask) );
+    c0 = _mm_add_ps(c0, c1);
+    c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask) );
+    return btVector3(_mm_add_ps(c0, c2));
+#elif defined(BT_USE_NEON)
+    const float32x4_t vv = v.mVec128;
+    const float32x2_t vlo = vget_low_f32(vv);
+    const float32x2_t vhi = vget_high_f32(vv);
+    float32x4_t c0, c1, c2;
+    c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
+    c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
+    c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);
+    c0 = vmulq_lane_f32(c0, vlo, 0);
+    c1 = vmulq_lane_f32(c1, vlo, 1);
+    c2 = vmulq_lane_f32(c2, vhi, 0);
+    c0 = vaddq_f32(c0, c1);
+    c0 = vaddq_f32(c0, c2);
+    return btVector3(c0);
 	return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
 operator*(const btMatrix3x3& m1, const btMatrix3x3& m2)
+#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    __m128 m10 = m1[0].mVec128;  
+    __m128 m11 = m1[1].mVec128;
+    __m128 m12 = m1[2].mVec128;
+    __m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);
+    __m128 c0 = bt_splat_ps( m10, 0);
+    __m128 c1 = bt_splat_ps( m11, 0);
+    __m128 c2 = bt_splat_ps( m12, 0);
+    c0 = _mm_mul_ps(c0, m2v);
+    c1 = _mm_mul_ps(c1, m2v);
+    c2 = _mm_mul_ps(c2, m2v);
+    m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);
+    __m128 c0_1 = bt_splat_ps( m10, 1);
+    __m128 c1_1 = bt_splat_ps( m11, 1);
+    __m128 c2_1 = bt_splat_ps( m12, 1);
+    c0_1 = _mm_mul_ps(c0_1, m2v);
+    c1_1 = _mm_mul_ps(c1_1, m2v);
+    c2_1 = _mm_mul_ps(c2_1, m2v);
+    m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);
+    c0 = _mm_add_ps(c0, c0_1);
+    c1 = _mm_add_ps(c1, c1_1);
+    c2 = _mm_add_ps(c2, c2_1);
+    m10 = bt_splat_ps( m10, 2);
+    m11 = bt_splat_ps( m11, 2);
+    m12 = bt_splat_ps( m12, 2);
+    m10 = _mm_mul_ps(m10, m2v);
+    m11 = _mm_mul_ps(m11, m2v);
+    m12 = _mm_mul_ps(m12, m2v);
+    c0 = _mm_add_ps(c0, m10);
+    c1 = _mm_add_ps(c1, m11);
+    c2 = _mm_add_ps(c2, m12);
+    return btMatrix3x3(c0, c1, c2);
+#elif defined(BT_USE_NEON)
+    float32x4_t rv0, rv1, rv2;
+    float32x4_t v0, v1, v2;
+    float32x4_t mv0, mv1, mv2;
+    v0 = m1[0].mVec128;
+    v1 = m1[1].mVec128;
+    v2 = m1[2].mVec128;
+    mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask); 
+    mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask); 
+    mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask); 
+    rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
+    rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
+    rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
+    rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
+    rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
+    rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
+    rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
+    rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
+    rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
+	return btMatrix3x3(rv0, rv1, rv2);
 	return btMatrix3x3(
 		m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]),
 		m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]),
 		m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2]));
@@ -716,9 +1318,26 @@ m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
 * It will test all elements are equal.  */
 SIMD_FORCE_INLINE bool operator==(const btMatrix3x3& m1, const btMatrix3x3& m2)
-	return ( m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    __m128 c0, c1, c2;
+    c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
+    c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
+    c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
+    c0 = _mm_and_ps(c0, c1);
+    c0 = _mm_and_ps(c0, c2);
+	int m = _mm_movemask_ps((__m128)c0);
+	return (0x7 == (m & 0x7));
+	return 
+    (   m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
 		m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
 		m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2] );
 ///for serialization
diff --git a/src/bullet/LinearMath/btMatrixX.h b/src/bullet/LinearMath/btMatrixX.h
new file mode 100644
index 00000000..42caed42
--- /dev/null
+++ b/src/bullet/LinearMath/btMatrixX.h
@@ -0,0 +1,554 @@
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///original version written by Erwin Coumans, October 2013
+#ifndef BT_MATRIX_X_H
+#define BT_MATRIX_X_H
+#include "LinearMath/btQuickprof.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include <stdio.h>
+#include <iostream>
+#include <iomanip>      // std::setw
+class btIntSortPredicate
+	public:
+		bool operator() ( const int& a, const int& b ) const
+		{
+			 return a < b;
+		}
+template <typename T>
+struct btVectorX
+	btAlignedObjectArray<T>	m_storage;
+	/// Default constructor; m_storage is left default-constructed.
+	btVectorX()
+	{
+	}
+	/// Construct a vector and resize its storage to numRows elements.
+	btVectorX(int numRows)
+	{
+		m_storage.resize(numRows);
+	}
+	/// Resize the storage to hold `rows` elements.
+	void resize(int rows)
+	{
+		m_storage.resize(rows);
+	}
+	/// Column count; always 1 for a btVectorX.
+	int cols() const
+	{
+		return 1;
+	}
+	/// Row count, i.e. the number of elements in m_storage.
+	int rows() const
+	{
+		return m_storage.size();
+	}
+	/// Number of elements; identical to rows().
+	int size() const
+	{
+		return rows();
+	}
+	/// Euclidean (2-) norm of the vector, accumulated as scale * sqrt(ssq) with the
+	/// largest magnitude seen so far used as scale (the LAPACK SLASSQ update scheme).
+	/// @return btFabs of the only element for a one-element vector; otherwise the
+	///         scaled norm, which is zero when no element is non-zero.
+	T nrm2() const
+	{
+		const int count = rows();
+		if (count == 1)
+		{
+			return btFabs((*this)[0]);
+		}
+		// Invariant: sum of squares so far == largest * largest * sumSq
+		T largest = 0.0;
+		T sumSq = 1.0;
+		for (int idx = 0; idx < count; idx++)
+		{
+			if ((*this)[idx] == 0.0)
+			{
+				continue;	// contributes nothing; also avoids 0/0 while largest is still zero
+			}
+			const T magnitude = btFabs((*this)[idx]);
+			if (largest < magnitude)
+			{
+				// New maximum: re-express the accumulated sum relative to it
+				const T ratio = largest / magnitude;
+				sumSq = sumSq * (ratio * ratio) + BT_ONE;
+				largest = magnitude;
+			}
+			else
+			{
+				const T ratio = magnitude / largest;
+				sumSq += ratio * ratio;
+			}
+		}
+		return largest * sqrt(sumSq);
+	}
+	/// Zero every stored element; does nothing when the vector is empty.
+	void	setZero()
+	{
+		const int count = m_storage.size();
+		if (!count)
+		{
+			return;	// &m_storage[0] is only taken when an element exists
+		}
+		btSetZero(&m_storage[0],count);
+	}
+	/// Read-only access to element `index`; forwards directly to m_storage[index].
+	const T& operator[] (int index) const
+	{
+		return m_storage[index];
+	}
+	/// Mutable access to element `index`; forwards directly to m_storage[index].
+	T& operator[] (int index)
+	{
+		return m_storage[index];
+	}
+	/// Writable pointer to the first element, or 0 when the vector is empty.
+	T* getBufferPointerWritable()
+	{
+		return m_storage.size() ? &m_storage[0] : 0;
+	}
+	/// Read-only pointer to the first element, or 0 when the vector is empty.
+	const T* getBufferPointer() const
+	{
+		return m_storage.size() ? &m_storage[0] : 0;
+	}
+ template <typename T>
+ void setElem(btMatrixX<T>& mat, int row, int col, T val)
+ {
+ mat.setElem(row,col,val);
+ }
+ */
+template <typename T> 
+struct btMatrixX
+	int m_rows;
+	int m_cols;
+	int m_operations;
+	int m_resizeOperations;
+	int m_setElemOperations;
+	btAlignedObjectArray<T>	m_storage;
+	mutable btAlignedObjectArray< btAlignedObjectArray<int> > m_rowNonZeroElements1;
+	/// Writable pointer to the first stored element, or 0 when the storage is empty.
+	T* getBufferPointerWritable() 
+	{
+		return m_storage.size() ? &m_storage[0] : 0;
+	}
+	/// Read-only pointer to the first stored element, or 0 when the storage is empty.
+	const T* getBufferPointer() const
+	{
+		return m_storage.size() ? &m_storage[0] : 0;
+	}
+	/// Construct a 0 x 0 matrix with all operation counters set to zero.
+	btMatrixX()
+		:m_rows(0),
+		m_cols(0),
+		m_operations(0),
+		m_resizeOperations(0),
+		m_setElemOperations(0)
+	{
+	}
+	/// Construct a rows x cols matrix. Storage is sized through resize(), so
+	/// m_resizeOperations ends up at 1 after construction.
+	btMatrixX(int rows,int cols)
+		:m_rows(rows),
+		m_cols(cols),
+		m_operations(0),
+		m_resizeOperations(0),
+		m_setElemOperations(0)
+	{
+		resize(rows,cols);
+	}
+	/// Set the dimensions to rows x cols and resize the backing storage to
+	/// rows*cols elements. Each call also bumps m_resizeOperations.
+	void resize(int rows, int cols)
+	{
+		++m_resizeOperations;
+		m_cols = cols;
+		m_rows = rows;
+		{
+			BT_PROFILE("m_storage.resize");
+			m_storage.resize(m_rows*m_cols);
+		}
+	}
+	/// Number of columns (m_cols).
+	int cols() const
+	{
+		return m_cols;
+	}
+	/// Number of rows (m_rows).
+	int rows() const
+	{
+		return m_rows;
+	}
+	///we don't want this read/write operator(), because we cannot keep track of non-zero elements, use setElem instead
+	/*T& operator() (int row,int col)
+	{
+		return m_storage[col*m_rows+row];
+	}
+	*/
+	/// Add val to element (row,col). A zero val is ignored. If the element currently
+	/// equals zero the value is written through setElem(); otherwise it is
+	/// accumulated in place.
+	void addElem(int row,int col, T val)
+	{
+		if (!val)
+		{
+			return;
+		}
+		T& slot = m_storage[row*m_cols+col];
+		if (slot==0.f)
+		{
+			setElem(row,col,val);
+			return;
+		}
+		slot += val;
+	}
+	/// Overwrite element (row,col) of the row-major storage with val and count
+	/// the write in m_setElemOperations.
+	void setElem(int row,int col, T val)
+	{
+		const int offset = col + row*m_cols;
+		++m_setElemOperations;
+		m_storage[offset] = val;
+	}
+	/// Multiply element (row,col) in place by val and count the write in
+	/// m_setElemOperations.
+	void mulElem(int row,int col, T val)
+	{
+		const int offset = col + row*m_cols;
+		++m_setElemOperations;
+		// mul doesn't change sparsity info
+		m_storage[offset] *= val;
+	}
+	/// Mirror the strictly-lower triangle into the upper triangle: element
+	/// (col,row) receives the value of (row,col) for every col < row.
+	void copyLowerToUpperTriangle()
+	{
+		const int numRows = rows();
+		// Row 0 has no entries below the diagonal, so start at row 1
+		for (int r = 1; r < numRows; ++r)
+		{
+			for (int c = 0; c < r; ++c)
+			{
+				setElem(c, r, (*this)(r, c));
+			}
+		}
+	}
+	/// Read-only access to element (row,col); storage is row-major, so the
+	/// element lives at index col + row*m_cols.
+	const T& operator() (int row,int col) const
+	{
+		return m_storage[col+row*m_cols];
+	}
+	/// Set every element of the matrix to zero.
+	/// An empty matrix (no storage) is left untouched, mirroring btVectorX::setZero().
+	void setZero()
+	{
+		{
+			BT_PROFILE("storage=0");
+			// Guard: m_storage[0] is out of range while nothing has been allocated
+			if (m_storage.size())
+			{
+				btSetZero(&m_storage[0],m_storage.size());
+			}
+		}
+	}
+	/// Turn a square matrix into the identity: clear everything, then write 1
+	/// on each diagonal element. Asserts rows() == cols().
+	void setIdentity()
+	{
+		btAssert(rows() == cols());
+		setZero();
+		const int dim = rows();
+		for (int d = 0; d < dim; ++d)
+		{
+			setElem(d,d,1);
+		}
+	}
+	/// Dump the matrix to stdout via printf: a header line containing msg, one
+	/// tab-separated line per row ("%2.1f" per element), then a footer line.
+	void	printMatrix(const char* msg)
+	{
+		printf("%s ---------------------\n",msg);
+		const int numRows = rows();
+		const int numCols = cols();
+		for (int r = 0; r < numRows; ++r)
+		{
+			printf("\n");
+			for (int c = 0; c < numCols; ++c)
+			{
+				printf("%2.1f\t",(*this)(r,c));
+			}
+		}
+		printf("\n---------------------\n");
+	}
+	/// Rebuild m_rowNonZeroElements1: for each row, the ascending list of column
+	/// indices whose element is non-zero. Callable on const matrices because the
+	/// cache member is mutable.
+	void rowComputeNonZeroElements() const
+	{
+		const int numRows = rows();
+		const int numCols = cols();
+		m_rowNonZeroElements1.resize(numRows);
+		for (int r = 0; r < numRows; ++r)
+		{
+			btAlignedObjectArray<int>& nonZeroCols = m_rowNonZeroElements1[r];
+			nonZeroCols.resize(0);
+			for (int c = 0; c < numCols; ++c)
+			{
+				if ((*this)(r,c)==0.f)
+				{
+					continue;
+				}
+				nonZeroCols.push_back(c);
+			}
+		}
+	}
+	/// Return the transpose as a new m_cols x m_rows matrix.
+	/// The result is zero-filled first and only non-zero source elements are
+	/// written, which keeps the setElem traffic low for sparse matrices.
+	btMatrixX transpose() const
+	{
+		btMatrixX result(m_cols,m_rows);
+		result.setZero();
+		for (int c = 0; c < m_cols; ++c)
+		{
+			for (int r = 0; r < m_rows; ++r)
+			{
+				const T value = (*this)(r,c);
+				if (!value)
+				{
+					continue;
+				}
+				result.setElem(c,r,value);
+			}
+		}
+		return result;
+	}
+	/// Matrix product (*this) * other, brute force.
+	/// Requires cols() == other.rows(); the result is rows() x other.cols().
+	/// Terms whose right-hand factor is zero are skipped, and a result entry is
+	/// only written through setElem() when its accumulated dot product is non-zero.
+	btMatrixX operator*(const btMatrixX& other)
+	{
+		btAssert(cols() == other.rows());
+		btMatrixX res(rows(),other.cols());
+		res.setZero();
+		// The dot product runs over the shared inner dimension, i.e. the columns of
+		// *this (== rows of other). Using rows() here is only right for square matrices.
+		const int inner = cols();
+		for (int j=0; j < res.cols(); ++j)
+		{
+			for (int i=0; i < res.rows(); ++i)
+			{
+				T dotProd=0;
+				for (int v=0;v<inner;v++)
+				{
+					const T rhs = other(v,j);
+					if (rhs!=0.f)
+					{
+						dotProd+=(*this)(i,v)*rhs;
+					}
+				}
+				if (dotProd)
+					res.setElem(i,j,dotProd);
+			}
+		}
+		return res;
+	}
+	/// Accumulate B * C^T into the block whose top-left corner is (row,col), using addElem().
+	/// B holds numRows rows and C holds numRowsOther rows, each 8 scalars wide. Only
+	/// entries 0,1,2 and 4,5,6 of a row enter the dot product, i.e. this assumes
+	/// entries 3 and 7 of every row of B and C are zero.
+	void multiplyAdd2_p8r (const btScalar *B, const btScalar *C,  int numRows,  int numRowsOther ,int row, int col)
+	{
+		for (int i = 0; i < numRows; ++i)
+		{
+			const btScalar* bRow = B + 8*i;
+			for (int j = 0; j < numRowsOther; ++j)
+			{
+				const btScalar* cRow = C + 8*j;
+				btScalar sum = bRow[0]*cRow[0];
+				sum += bRow[1]*cRow[1];
+				sum += bRow[2]*cRow[2];
+				sum += bRow[4]*cRow[4];
+				sum += bRow[5]*cRow[5];
+				sum += bRow[6]*cRow[6];
+				addElem(row+i,col+j,sum);
+			}
+		}
+	}
+	/// Write B * C^T into the block whose top-left corner is (row,col), using setElem().
+	/// B holds numRows rows and C holds numRowsOther rows, each 8 scalars wide; only
+	/// entries 0,1,2 and 4,5,6 of a row enter the dot product (entries 3 and 7 are skipped).
+	/// Asserts that both row counts are positive and both pointers are non-null.
+	void multiply2_p8r (const btScalar *B, const btScalar *C,  int numRows,  int numRowsOther, int row, int col)
+	{
+		btAssert (numRows>0 && numRowsOther>0 && B && C);
+		for (int i = 0; i < numRows; ++i)
+		{
+			const btScalar* bRow = B + 8*i;
+			for (int j = 0; j < numRowsOther; ++j)
+			{
+				const btScalar* cRow = C + 8*j;
+				btScalar sum = bRow[0]*cRow[0];
+				sum += bRow[1]*cRow[1];
+				sum += bRow[2]*cRow[2];
+				sum += bRow[4]*cRow[4];
+				sum += bRow[5]*cRow[5];
+				sum += bRow[6]*cRow[6];
+				setElem(row+i,col+j,sum);
+			}
+		}
+	}
+	/// Fill the inclusive block [rowstart..rowend] x [colstart..colend] with a
+	/// constant value, one setElem() call per element.
+	void setSubMatrix(int rowstart,int colstart,int rowend,int colend,const T value)
+	{
+		for (int r = rowstart; r <= rowend; ++r)
+		{
+			for (int c = colstart; c <= colend; ++c)
+			{
+				setElem(r,c,value);
+			}
+		}
+	}
+	/// Copy `block` into this matrix so that block(0,0) lands at (rowstart,colstart).
+	/// rowend/colend are inclusive and are only used to assert that the target
+	/// range matches the block's dimensions.
+	void setSubMatrix(int rowstart,int colstart,int rowend,int colend,const btMatrixX& block)
+	{
+		btAssert(rowend+1-rowstart == block.rows());
+		btAssert(colend+1-colstart == block.cols());
+		const int blockRows = block.rows();
+		const int blockCols = block.cols();
+		for (int r = 0; r < blockRows; ++r)
+		{
+			for (int c = 0; c < blockCols; ++c)
+			{
+				setElem(rowstart+r,colstart+c,block(r,c));
+			}
+		}
+	}
+	/// Copy the vector `block` into this matrix starting at (rowstart,colstart);
+	/// element r of the vector is written to every column the block spans in row
+	/// rowstart+r. rowend/colend are inclusive and only used in the size assertions.
+	void setSubMatrix(int rowstart,int colstart,int rowend,int colend,const btVectorX<T>& block)
+	{
+		btAssert(rowend+1-rowstart == block.rows());
+		btAssert(colend+1-colstart == block.cols());
+		const int blockRows = block.rows();
+		const int blockCols = block.cols();
+		for (int r = 0; r < blockRows; ++r)
+		{
+			for (int c = 0; c < blockCols; ++c)
+			{
+				setElem(rowstart+r,colstart+c,block[r]);
+			}
+		}
+	}
+	/// Return a new matrix of the same dimensions in which every element is the
+	/// negation of the corresponding element of this matrix.
+	btMatrixX negative()
+	{
+		const int numRows = rows();
+		const int numCols = cols();
+		btMatrixX result(numRows,numCols);
+		for (int r = 0; r < numRows; ++r)
+		{
+			for (int c = 0; c < numCols; ++c)
+			{
+				result.setElem(r,c,-(*this)(r,c));
+			}
+		}
+		return result;
+	}
+typedef btMatrixX<float> btMatrixXf;
+typedef btVectorX<float> btVectorXf;
+typedef btMatrixX<double> btMatrixXd;
+typedef btVectorX<double> btVectorXd;
+/// Stream a matrix as " [" ... " ]": every element is written with std::setw(12)
+/// and rows are separated by std::endl followed by two spaces.
+template <typename T> 
+std::ostream& operator<< (std::ostream& os, const btMatrixX<T>& mat)
+	{
+		os << " [";
+		const int lastRow = mat.rows()-1;
+		for (int r = 0; r <= lastRow; ++r)
+		{
+			for (int c = 0; c < mat.cols(); ++c)
+			{
+				os << std::setw(12) << mat(r,c);
+			}
+			if (r != lastRow)
+			{
+				os << std::endl << "  ";
+			}
+		}
+		os << " ]";
+		return os;
+	}
+/// Stream a vector as " [" ... " ]": one element per line, each written with
+/// std::setw(12); lines are separated by std::endl followed by two spaces.
+template <typename T> 
+std::ostream& operator<< (std::ostream& os, const btVectorX<T>& mat)
+	{
+		os << " [";
+		const int lastRow = mat.rows()-1;
+		for (int r = 0; r <= lastRow; ++r)
+		{
+			os << std::setw(12) << mat[r];
+			if (r != lastRow)
+			{
+				os << std::endl << "  ";
+			}
+		}
+		os << " ]";
+		return os;
+	}
+inline void setElem(btMatrixXd& mat, int row, int col, double val)
+	mat.setElem(row,col,val);
+inline void setElem(btMatrixXf& mat, int row, int col, float val)
+	mat.setElem(row,col,val);
+	#define btVectorXu btVectorXd
+	#define btMatrixXu btMatrixXd
+	#define btVectorXu btVectorXf
+	#define btMatrixXu btMatrixXf
diff --git a/src/bullet/LinearMath/btPolarDecomposition.cpp b/src/bullet/LinearMath/btPolarDecomposition.cpp
new file mode 100644
index 00000000..a4dca7fd
--- /dev/null
+++ b/src/bullet/LinearMath/btPolarDecomposition.cpp
@@ -0,0 +1,99 @@
+#include "btPolarDecomposition.h"
+#include "btMinMax.h"
+  btScalar abs_column_sum(const btMatrix3x3& a, int i)	// sum of |a[r][i]| over the three rows r, i.e. the absolute sum of column i
+  {
+    return btFabs(a[0][i]) + btFabs(a[1][i]) + btFabs(a[2][i]);
+  }
+  btScalar abs_row_sum(const btMatrix3x3& a, int i)	// sum of |a[i][c]| over the three columns c, i.e. the absolute sum of row i
+  {
+    return btFabs(a[i][0]) + btFabs(a[i][1]) + btFabs(a[i][2]);
+  }
+  // Returns the largest of the three absolute column sums of 'a'.
+  btScalar p1_norm(const btMatrix3x3& a)
+  {
+    btScalar largest = abs_column_sum(a,0);
+    for (int col = 1; col < 3; ++col)
+      largest = btMax(largest, abs_column_sum(a,col));
+    return largest;
+  }
+  // Returns the largest of the three absolute row sums of 'a'.
+  btScalar pinf_norm(const btMatrix3x3& a)
+  {
+    btScalar largest = abs_row_sum(a,0);
+    for (int row = 1; row < 3; ++row)
+      largest = btMax(largest, abs_row_sum(a,row));
+    return largest;
+  }
+const btScalar btPolarDecomposition::DEFAULT_TOLERANCE = btScalar(0.0001);	// default for the constructor's 'tolerance' argument
+const unsigned int btPolarDecomposition::DEFAULT_MAX_ITERATIONS = 16;	// default for the constructor's 'maxIterations' argument
+btPolarDecomposition::btPolarDecomposition(btScalar tolerance, unsigned int maxIterations)	// stores the two settings that decompose() reads
+: m_tolerance(tolerance)	// relative convergence threshold
+, m_maxIterations(maxIterations)	// upper bound on the iteration loop
+unsigned int btPolarDecomposition::decompose(const btMatrix3x3& a, btMatrix3x3& u, btMatrix3x3& h) const	// iteratively refines 'u' starting from 'a', then derives 'h' from the final 'u'
+  // Use the 'u' and 'h' matrices for intermediate calculations
+  u = a;	// the iteration starts from the input matrix
+  h = a.inverse();	// inside the loop 'h' always holds the inverse of the current 'u'
+  for (unsigned int i = 0; i < m_maxIterations; ++i)
+  {
+    const btScalar h_1 = p1_norm(h);
+    const btScalar h_inf = pinf_norm(h);
+    const btScalar u_1 = p1_norm(u);
+    const btScalar u_inf = pinf_norm(u);
+    const btScalar h_norm = h_1 * h_inf;	// product of the two norms of the inverse
+    const btScalar u_norm = u_1 * u_inf;	// product of the two norms of 'u'
+    // Either norm product is (fuzzy) zero, so the scaling ratio below cannot be formed; stop iterating
+    if (btFuzzyZero(h_norm) || btFuzzyZero(u_norm))
+      break;
+    const btScalar gamma = btPow(h_norm / u_norm, 0.25f);	// scaling factor: fourth root of the ratio of the norm products
+    const btScalar inv_gamma = btScalar(1.0) / gamma;
+    // Determine the delta to 'u': u + delta == 0.5 * (gamma * u + inv_gamma * h^T)
+    const btMatrix3x3 delta = (u * (gamma - btScalar(2.0)) + h.transpose() * inv_gamma) * btScalar(0.5);
+    // Update the matrices
+    u += delta;
+    h = u.inverse();
+    // Check for convergence: the step is compared against the 1-norm 'u' had before this update
+    if (p1_norm(delta) <= m_tolerance * u_1)
+    {
+      h = u.transpose() * a;	// h = u^T * a
+      h = (h + h.transpose()) * 0.5;	// average with the transpose so that 'h' is symmetric
+      return i;	// zero-based index of the iteration that met the tolerance
+    }
+  }
+  // Reached when the tolerance was not met within m_maxIterations iterations, or when the loop was
+  // left early through the break above; either way make sure the matrices returned are in the right form.
+  h = u.transpose() * a;	// h = u^T * a
+  h = (h + h.transpose()) * 0.5;	// average with the transpose so that 'h' is symmetric
+  return m_maxIterations;	// tells the caller that convergence was not detected
+unsigned int btPolarDecomposition::maxIterations() const	// accessor for the iteration cap given to the constructor
+  return m_maxIterations;
+unsigned int polarDecompose(const btMatrix3x3& a, btMatrix3x3& u, btMatrix3x3& h)	// convenience wrapper around btPolarDecomposition::decompose
+  static btPolarDecomposition polar;	// one function-local instance, default-constructed (DEFAULT_TOLERANCE, DEFAULT_MAX_ITERATIONS)
+  return polar.decompose(a, u, h);
diff --git a/src/bullet/LinearMath/btPolarDecomposition.h b/src/bullet/LinearMath/btPolarDecomposition.h
new file mode 100644
index 00000000..56156676
--- /dev/null
+++ b/src/bullet/LinearMath/btPolarDecomposition.h
@@ -0,0 +1,73 @@
+#include "btMatrix3x3.h"
+ * This class is used to compute the polar decomposition of a matrix. In
+ * general, the polar decomposition factorizes a matrix, A, into two parts: a
+ * unitary matrix (U) and a positive, semi-definite Hermitian matrix (H).
+ * However, in this particular implementation the original matrix, A, is
+ * required to be a square 3x3 matrix with real elements. This means that U will
+ * be an orthogonal matrix and H will be a positive-definite, symmetric matrix.
+ */
+class btPolarDecomposition
+  public:
+    static const btScalar DEFAULT_TOLERANCE;	// used when the constructor's 'tolerance' argument is omitted
+    static const unsigned int DEFAULT_MAX_ITERATIONS;	// used when the constructor's 'maxIterations' argument is omitted
+    /**
+     * Creates an instance with optional parameters.
+     *
+     * @param tolerance     - the tolerance used to determine convergence of the
+     *                        algorithm
+     * @param maxIterations - the maximum number of iterations used to achieve
+     *                        convergence
+     */
+    btPolarDecomposition(btScalar tolerance = DEFAULT_TOLERANCE, 
+      unsigned int maxIterations = DEFAULT_MAX_ITERATIONS);
+    /**
+     * Decomposes a matrix into orthogonal and symmetric, positive-definite
+     * parts. If the number of iterations returned by this function is equal to
+     * the maximum number of iterations, the algorithm has failed to converge.
+     *
+     * @param a - the original matrix
+     * @param u - the resulting orthogonal matrix
+     * @param h - the resulting symmetric matrix
+     *
+     * @return the zero-based index of the iteration on which convergence was
+     *         detected (0 if the first iteration already converged), or
+     *         maxIterations() if convergence was not detected.
+     */
+    unsigned int decompose(const btMatrix3x3& a, btMatrix3x3& u, btMatrix3x3& h) const; 
+    /**
+     * Returns the maximum number of iterations that this algorithm will perform
+     * to achieve convergence.
+     *
+     * @return maximum number of iterations
+     */
+    unsigned int maxIterations() const;
+  private:
+    btScalar m_tolerance;	// convergence tolerance supplied to the constructor
+    unsigned int m_maxIterations;	// iteration cap supplied to the constructor
+ * This function decomposes the matrix 'a' into two parts: an orthogonal matrix
+ * 'u' and a symmetric, positive-definite matrix 'h'. If the number of
+ * iterations returned by this function is equal to
+ * btPolarDecomposition::DEFAULT_MAX_ITERATIONS, the algorithm has failed to
+ * converge.
+ *
+ * @param a - the original matrix
+ * @param u - the resulting orthogonal matrix
+ * @param h - the resulting symmetric matrix
+ *
+ * @return the zero-based index of the iteration on which convergence was detected, or btPolarDecomposition::DEFAULT_MAX_ITERATIONS if it was not.
+ */
+unsigned int polarDecompose(const btMatrix3x3& a, btMatrix3x3& u, btMatrix3x3& h); 
diff --git a/src/bullet/LinearMath/btQuadWord.h b/src/bullet/LinearMath/btQuadWord.h
index 68c7b63c..fcfb3be4 100644
--- a/src/bullet/LinearMath/btQuadWord.h
+++ b/src/bullet/LinearMath/btQuadWord.h
@@ -20,6 +20,9 @@ subject to the following restrictions:
 #include "btMinMax.h"
 #if defined (__CELLOS_LV2) && defined (__SPU__)
 #include <altivec.h>
@@ -47,11 +50,53 @@ public:
 #else //__CELLOS_LV2__ __SPU__
+#if defined(BT_USE_SSE) || defined(BT_USE_NEON) 
+	union {	// the SIMD value and the four scalars are two views of the same storage
+		btSimdFloat4 mVec128;
+		btScalar	m_floats[4];
+	};
+	SIMD_FORCE_INLINE	btSimdFloat4	get128() const	// returns all four components as one SIMD value
+	{
+		return mVec128;
+	}
+	SIMD_FORCE_INLINE	void	set128(btSimdFloat4 v128)	// overwrites all four components from one SIMD value
+	{
+		mVec128 = v128;
+	}
 	btScalar	m_floats[4];
+#endif // BT_USE_SSE || BT_USE_NEON
 #endif //__CELLOS_LV2__ __SPU__
+#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
+	// Construct from a SIMD value: all four components are taken from 'vec'
+	SIMD_FORCE_INLINE btQuadWord(const btSimdFloat4 vec)
+	{
+		mVec128 = vec;
+	}
+	// Copy constructor: copies the SIMD member in a single assignment
+	SIMD_FORCE_INLINE btQuadWord(const btQuadWord& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+	// Assignment operator: copies the SIMD member in a single assignment and returns *this
+	operator=(const btQuadWord& v) 
+	{
+		mVec128 = v.mVec128;
+		return *this;
+	}
   /**@brief Return the x value */
 		SIMD_FORCE_INLINE const btScalar& getX() const { return m_floats[0]; }
@@ -59,7 +104,6 @@ protected:
 		SIMD_FORCE_INLINE const btScalar& getY() const { return m_floats[1]; }
   /**@brief Return the z value */
 		SIMD_FORCE_INLINE const btScalar& getZ() const { return m_floats[2]; }
   /**@brief Set the x value */
 		SIMD_FORCE_INLINE void	setX(btScalar _x) { m_floats[0] = _x;};
   /**@brief Set the y value */
@@ -68,7 +112,6 @@ protected:
 		SIMD_FORCE_INLINE void	setZ(btScalar _z) { m_floats[2] = _z;};
   /**@brief Set the w value */
 		SIMD_FORCE_INLINE void	setW(btScalar _w) { m_floats[3] = _w;};
   /**@brief Return the x value */
 		SIMD_FORCE_INLINE const btScalar& x() const { return m_floats[0]; }
   /**@brief Return the y value */
@@ -86,7 +129,14 @@ protected:
 	SIMD_FORCE_INLINE	bool	operator==(const btQuadWord& other) const
-		return ((m_floats[3]==other.m_floats[3]) && (m_floats[2]==other.m_floats[2]) && (m_floats[1]==other.m_floats[1]) && (m_floats[0]==other.m_floats[0]));
+#ifdef BT_USE_SSE
+        return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));	// 0xf: the compare result of each of the four lanes is set
+		return ((m_floats[3]==other.m_floats[3]) && 	// scalar path: all four components must compare equal
+                (m_floats[2]==other.m_floats[2]) && 
+                (m_floats[1]==other.m_floats[1]) && 
+                (m_floats[0]==other.m_floats[0]));
 	SIMD_FORCE_INLINE	bool	operator!=(const btQuadWord& other) const
@@ -99,7 +149,6 @@ protected:
    * @param y Value of y
    * @param z Value of z
 		SIMD_FORCE_INLINE void 	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z)
@@ -107,7 +156,6 @@ protected:
 			m_floats[3] = 0.f;
 /*		void getValue(btScalar *m) const 
@@ -122,7 +170,6 @@ protected:
    * @param z Value of z
    * @param w Value of w
 		SIMD_FORCE_INLINE void	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w)
@@ -130,7 +177,6 @@ protected:
   /**@brief No initialization constructor */
 		//	:m_floats[0](btScalar(0.)),m_floats[1](btScalar(0.)),m_floats[2](btScalar(0.)),m_floats[3](btScalar(0.))
@@ -142,12 +188,10 @@ protected:
    * @param y Value of y
    * @param z Value of z
 		SIMD_FORCE_INLINE btQuadWord(const btScalar& _x, const btScalar& _y, const btScalar& _z)		
 			m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = 0.0f;
 /**@brief Initializing constructor
    * @param x Value of x
@@ -155,33 +199,43 @@ protected:
    * @param z Value of z
    * @param w Value of w
 		SIMD_FORCE_INLINE btQuadWord(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w) 
 			m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = _w;
   /**@brief Set each element to the max of the current values and the values of another btQuadWord
    * @param other The other btQuadWord to compare with 
 		SIMD_FORCE_INLINE void	setMax(const btQuadWord& other)
-			btSetMax(m_floats[0], other.m_floats[0]);
+        #ifdef BT_USE_SSE
+            mVec128 = _mm_max_ps(mVec128, other.mVec128);	// SSE: component-wise maximum of all four lanes
+        #elif defined(BT_USE_NEON)
+            mVec128 = vmaxq_f32(mVec128, other.mVec128);	// NEON: component-wise maximum of all four lanes
+        #else
+        	btSetMax(m_floats[0], other.m_floats[0]);	// scalar fallback, one component per call
 			btSetMax(m_floats[1], other.m_floats[1]);
 			btSetMax(m_floats[2], other.m_floats[2]);
 			btSetMax(m_floats[3], other.m_floats[3]);
-		}
+		#endif
+        }
   /**@brief Set each element to the min of the current values and the values of another btQuadWord
    * @param other The other btQuadWord to compare with 
 		SIMD_FORCE_INLINE void	setMin(const btQuadWord& other)
-			btSetMin(m_floats[0], other.m_floats[0]);
+        #ifdef BT_USE_SSE
+            mVec128 = _mm_min_ps(mVec128, other.mVec128);	// SSE: component-wise minimum of all four lanes
+        #elif defined(BT_USE_NEON)
+            mVec128 = vminq_f32(mVec128, other.mVec128);	// NEON: component-wise minimum of all four lanes
+        #else
+        	btSetMin(m_floats[0], other.m_floats[0]);	// scalar fallback, one component per call
 			btSetMin(m_floats[1], other.m_floats[1]);
 			btSetMin(m_floats[2], other.m_floats[2]);
 			btSetMin(m_floats[3], other.m_floats[3]);
-		}
+		#endif
+        }
diff --git a/src/bullet/LinearMath/btQuaternion.h b/src/bullet/LinearMath/btQuaternion.h
index 34d1012a..f7dafcc8 100644
--- a/src/bullet/LinearMath/btQuaternion.h
+++ b/src/bullet/LinearMath/btQuaternion.h
@@ -21,29 +21,79 @@ subject to the following restrictions:
 #include "btVector3.h"
 #include "btQuadWord.h"
+#define btQuaternionData btQuaternionDoubleData	// serialization struct alias, double variant
+#define btQuaternionDataName "btQuaternionDoubleData"
+#define btQuaternionData btQuaternionFloatData	// serialization struct alias, float variant; NOTE(review): the two pairs are presumably selected by a precision #ifdef that is not visible here - confirm
+#define btQuaternionDataName "btQuaternionFloatData"
+#ifdef BT_USE_SSE
+//const __m128 ATTRIBUTE_ALIGNED16(vOnes) = {1.0f, 1.0f, 1.0f, 1.0f};
+#define vOnes (_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f))	// 1.0f in every lane; numerator of the reciprocal in normalize()
+#if defined(BT_USE_SSE) 
+#define vQInv (_mm_set_ps(+0.0f, -0.0f, -0.0f, -0.0f))	// -0.0f (sign bit) in x, y, z and +0.0f in w; _mm_set_ps lists lanes from w down to x
+#define vPPPM (_mm_set_ps(-0.0f, +0.0f, +0.0f, +0.0f))	// "plus plus plus minus": sign bit in w only
+#elif defined(BT_USE_NEON)
+const btSimdFloat4 ATTRIBUTE_ALIGNED16(vQInv) = {-0.0f, -0.0f, -0.0f, +0.0f};	// same mask as the SSE vQInv, listed from x up to w
+const btSimdFloat4 ATTRIBUTE_ALIGNED16(vPPPM) = {+0.0f, +0.0f, +0.0f, -0.0f};	// same mask as the SSE vPPPM, listed from x up to w
 /**@brief The btQuaternion implements quaternion to perform linear algebra rotations in combination with btMatrix3x3, btVector3 and btTransform. */
 class btQuaternion : public btQuadWord {
   /**@brief No initialization constructor */
 	btQuaternion() {}
+#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))|| defined(BT_USE_NEON) 
+	// Construct from a SIMD value: all four components are taken from 'vec'
+	SIMD_FORCE_INLINE btQuaternion(const btSimdFloat4 vec)
+	{
+		mVec128 = vec;
+	}
+	// Copy constructor: copies the SIMD member in a single assignment
+	SIMD_FORCE_INLINE btQuaternion(const btQuaternion& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+	// Assignment operator: copies the SIMD member in a single assignment and returns *this
+	SIMD_FORCE_INLINE btQuaternion& 
+	operator=(const btQuaternion& v) 
+	{
+		mVec128 = v.mVec128;
+		return *this;
+	}
 	//		template <typename btScalar>
 	//		explicit Quaternion(const btScalar *v) : Tuple4<btScalar>(v) {}
   /**@brief Constructor from scalars */
 	btQuaternion(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w) 
 		: btQuadWord(_x, _y, _z, _w) 
   /**@brief Axis angle Constructor
    * @param axis The axis which the rotation is around
    * @param angle The magnitude of the rotation around the angle (Radians) */
-	btQuaternion(const btVector3& axis, const btScalar& _angle) 
+	btQuaternion(const btVector3& _axis, const btScalar& _angle) 
-		setRotation(axis, _angle); 
+		setRotation(_axis, _angle); 
   /**@brief Constructor from Euler angles
    * @param yaw Angle around Y unless BT_EULER_DEFAULT_ZYX defined then Z
    * @param pitch Angle around X unless BT_EULER_DEFAULT_ZYX defined then Y
@@ -59,7 +109,6 @@ public:
   /**@brief Set the rotation using axis angle notation 
    * @param axis The axis around which to rotate
    * @param angle The magnitude of the rotation in Radians */
 	void setRotation(const btVector3& axis, const btScalar& _angle)
 		btScalar d = axis.length();
@@ -68,7 +117,6 @@ public:
 		setValue(axis.x() * s, axis.y() * s, axis.z() * s, 
 			btCos(_angle * btScalar(0.5)));
   /**@brief Set the quaternion using Euler angles
    * @param yaw Angle around Y
    * @param pitch Angle around X
@@ -113,7 +161,16 @@ public:
    * @param q The quaternion to add to this one */
 	SIMD_FORCE_INLINE	btQuaternion& operator+=(const btQuaternion& q)
-		m_floats[0] += q.x(); m_floats[1] += q.y(); m_floats[2] += q.z(); m_floats[3] += q.m_floats[3];
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_add_ps(mVec128, q.mVec128);	// SSE: add all four components at once
+#elif defined(BT_USE_NEON)
+		mVec128 = vaddq_f32(mVec128, q.mVec128);	// NEON: add all four components at once
+		m_floats[0] += q.x(); 	// scalar fallback, one component per statement
+        m_floats[1] += q.y(); 
+        m_floats[2] += q.z(); 
+        m_floats[3] += q.m_floats[3];
 		return *this;
@@ -121,15 +178,35 @@ public:
    * @param q The quaternion to subtract from this one */
 	btQuaternion& operator-=(const btQuaternion& q) 
-		m_floats[0] -= q.x(); m_floats[1] -= q.y(); m_floats[2] -= q.z(); m_floats[3] -= q.m_floats[3];
-		return *this;
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_sub_ps(mVec128, q.mVec128);	// SSE: subtract all four components at once
+#elif defined(BT_USE_NEON)
+		mVec128 = vsubq_f32(mVec128, q.mVec128);	// NEON: subtract all four components at once
+		m_floats[0] -= q.x(); 	// scalar fallback, one component per statement
+        m_floats[1] -= q.y(); 
+        m_floats[2] -= q.z(); 
+        m_floats[3] -= q.m_floats[3];
+        return *this;
   /**@brief Scale this quaternion
    * @param s The scalar to scale by */
 	btQuaternion& operator*=(const btScalar& s)
-		m_floats[0] *= s; m_floats[1] *= s; m_floats[2] *= s; m_floats[3] *= s;
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = bt_pshufd_ps(vs, 0);	//	(S S S S)
+		mVec128 = _mm_mul_ps(mVec128, vs);	// SSE: scale all four components at once
+#elif defined(BT_USE_NEON)
+		mVec128 = vmulq_n_f32(mVec128, s);	// NEON: multiply every lane by the scalar
+		m_floats[0] *= s; 	// scalar fallback, one component per statement
+        m_floats[1] *= s; 
+        m_floats[2] *= s; 
+        m_floats[3] *= s;
 		return *this;
@@ -138,17 +215,111 @@ public:
    * Equivilant to this = this * q */
 	btQuaternion& operator*=(const btQuaternion& q)
-		setValue(m_floats[3] * q.x() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.z() - m_floats[2] * q.y(),
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128 vQ2 = q.get128();	// right-hand operand; mVec128 is the left-hand one
+		__m128 A1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(0,1,2,0));	// this: X Y Z X
+		__m128 B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));	// q:    W W W X
+		A1 = A1 * B1;
+		__m128 A2 = bt_pshufd_ps(mVec128, BT_SHUFFLE(1,2,0,1));	// this: Y Z X Y
+		__m128 B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));	// q:    Z X Y Y
+		A2 = A2 * B2;
+		B1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(2,0,1,2));	// this: Z X Y Z (B1 is reused as "A3")
+		B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));	// q:    Y Z X Z (B2 is reused as "B3")
+		B1 = B1 * B2;	//	A3 *= B3
+		mVec128 = bt_splat_ps(mVec128, 3);	//	A0
+		mVec128 = mVec128 * vQ2;	//	A0 * B0
+		A1 = A1 + A2;	//	AB12
+		mVec128 = mVec128 - B1;	//	AB03 = AB0 - AB3 
+		A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
+		mVec128 = mVec128+ A1;	//	AB03 + AB12
+#elif defined(BT_USE_NEON)     
+        float32x4_t vQ1 = mVec128;
+        float32x4_t vQ2 = q.get128();
+        float32x4_t A0, A1, B1, A2, B2, A3, B3;
+        float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+        {
+        float32x2x2_t tmp;
+        tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+        vQ1zx = tmp.val[0];
+        tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+        vQ2zx = tmp.val[0];
+        }
+        vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+        vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+        vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+        vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+        A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
+        B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+        A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));	// Y Z  X Y
+        B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));	// z x  Y Y
+        A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+        B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+        A1 = vmulq_f32(A1, B1);
+        A2 = vmulq_f32(A2, B2);
+        A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+        A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); //	A0 * B0
+        A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+        A0 = vsubq_f32(A0, A3);	//	AB03 = AB0 - AB3 
+        //	change the sign of the last element
+        A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);	
+        A0 = vaddq_f32(A0, A1);	//	AB03 + AB12
+        mVec128 = A0;
+		setValue(	// scalar fallback: the product this * q written out per component (x, y, z, w)
+            m_floats[3] * q.x() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.z() - m_floats[2] * q.y(),
 			m_floats[3] * q.y() + m_floats[1] * q.m_floats[3] + m_floats[2] * q.x() - m_floats[0] * q.z(),
 			m_floats[3] * q.z() + m_floats[2] * q.m_floats[3] + m_floats[0] * q.y() - m_floats[1] * q.x(),
 			m_floats[3] * q.m_floats[3] - m_floats[0] * q.x() - m_floats[1] * q.y() - m_floats[2] * q.z());
 		return *this;
   /**@brief Return the dot product between this quaternion and another
    * @param q The other quaternion */
 	btScalar dot(const btQuaternion& q) const
-		return m_floats[0] * q.x() + m_floats[1] * q.y() + m_floats[2] * q.z() + m_floats[3] * q.m_floats[3];
+#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vd;
+		vd = _mm_mul_ps(mVec128, q.mVec128);	// four lane-wise products
+        __m128 t = _mm_movehl_ps(vd, vd);	// upper two products moved into the lower two lanes
+		vd = _mm_add_ps(vd, t);	// lanes 0 and 1 now hold two partial sums
+		t = _mm_shuffle_ps(vd, vd, 0x55);	// broadcast lane 1
+		vd = _mm_add_ss(vd, t);	// lane 0 = sum of all four products
+        return _mm_cvtss_f32(vd);
+#elif defined(BT_USE_NEON)
+		float32x4_t vd = vmulq_f32(mVec128, q.mVec128);	// four lane-wise products
+		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));  	// pairwise add: two partial sums
+		x = vpadd_f32(x, x);	// pairwise add again: full sum
+		return vget_lane_f32(x, 0);
+		return  m_floats[0] * q.x() + 	// scalar fallback
+                m_floats[1] * q.y() + 
+                m_floats[2] * q.z() + 
+                m_floats[3] * q.m_floats[3];
   /**@brief Return the length squared of the quaternion */
@@ -167,7 +338,25 @@ public:
    * Such that x^2 + y^2 + z^2 +w^2 = 1 */
 	btQuaternion& normalize() 
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vd;
+		vd = _mm_mul_ps(mVec128, mVec128);	// squares of the four components
+        __m128 t = _mm_movehl_ps(vd, vd);
+		vd = _mm_add_ps(vd, t);
+		t = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, t);	// lane 0 = sum of the four squares
+		vd = _mm_sqrt_ss(vd);	// lane 0 = square root of that sum
+		vd = _mm_div_ss(vOnes, vd);	// lane 0 = 1 / that square root
+        vd = bt_pshufd_ps(vd, 0); // splat
+		mVec128 = _mm_mul_ps(mVec128, vd);	// scale every component by the reciprocal
+		return *this;
 		return *this /= length();
   /**@brief Return a scaled version of this quaternion
@@ -175,10 +364,18 @@ public:
 	SIMD_FORCE_INLINE btQuaternion
 	operator*(const btScalar& s) const
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = bt_pshufd_ps(vs, 0x00);	//	(S S S S)
+		return btQuaternion(_mm_mul_ps(mVec128, vs));	// SSE: all four components scaled at once
+#elif defined(BT_USE_NEON)
+		return btQuaternion(vmulq_n_f32(mVec128, s));	// NEON: every lane multiplied by the scalar
 		return btQuaternion(x() * s, y() * s, z() * s, m_floats[3] * s);
   /**@brief Return an inversely scaled versionof this quaternion
    * @param s The inverse scale factor */
 	btQuaternion operator/(const btScalar& s) const
@@ -200,7 +397,7 @@ public:
 		return *this / length();
-  /**@brief Return the angle between this quaternion and the other 
+	/**@brief Return the ***half*** angle between this quaternion and the other
    * @param q The other quaternion */
 	btScalar angle(const btQuaternion& q) const 
@@ -208,13 +405,38 @@ public:
 		btAssert(s != btScalar(0.0));
 		return btAcos(dot(q) / s);
-  /**@brief Return the angle of rotation represented by this quaternion */
+	/**@brief Return the angle between this quaternion and the other along the shortest path
+	* The sign of the dot product is discarded, so q and -q give the same result.
+	* @param q The other quaternion; the product of both squared lengths must not be zero (asserted)
+	* @return Twice the arc cosine of the absolute normalized dot product */
+	btScalar angleShortestPath(const btQuaternion& q) const 
+	{
+		const btScalar s = btSqrt(length2() * q.length2());
+		btAssert(s != btScalar(0.0));
+		// Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
+		// dot(-q) equals -dot(q), so a single dot product and its absolute value cover both cases.
+		return btAcos(btFabs(dot(q)) / s) * btScalar(2.0);
+	}
+	/**@brief Return the angle [0, 2Pi] of rotation represented by this quaternion */
 	btScalar getAngle() const 
 		btScalar s = btScalar(2.) * btAcos(m_floats[3]);
 		return s;
+	/**@brief Return the angle [0, Pi] of rotation represented by this quaternion along the shortest path
+	* A negative w component is negated before the arc cosine is taken. */
+	btScalar getAngleShortestPath() const 
+	{
+		const btScalar w = m_floats[3];
+		return btScalar(2.) * btAcos((w >= 0) ? w : -w);
+	}
 	/**@brief Return the axis of the rotation represented by this quaternion */
 	btVector3 getAxis() const
@@ -229,7 +451,13 @@ public:
 	/**@brief Return the inverse of this quaternion */
 	btQuaternion inverse() const
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		return btQuaternion(_mm_xor_ps(mVec128, vQInv));	// XOR with the vQInv sign mask: negates x, y, z and keeps w
+#elif defined(BT_USE_NEON)
+        return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)vQInv));	// same sign flip through an integer XOR
 		return btQuaternion(-m_floats[0], -m_floats[1], -m_floats[2], m_floats[3]);
   /**@brief Return the sum of this quaternion and the other 
@@ -237,8 +465,14 @@ public:
 	SIMD_FORCE_INLINE btQuaternion
 	operator+(const btQuaternion& q2) const
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		return btQuaternion(_mm_add_ps(mVec128, q2.mVec128));	// SSE: component-wise sum in one instruction
+#elif defined(BT_USE_NEON)
+        return btQuaternion(vaddq_f32(mVec128, q2.mVec128));	// NEON: component-wise sum in one instruction
 		const btQuaternion& q1 = *this;
 		return btQuaternion(q1.x() + q2.x(), q1.y() + q2.y(), q1.z() + q2.z(), q1.m_floats[3] + q2.m_floats[3]);
   /**@brief Return the difference between this quaternion and the other 
@@ -246,16 +480,28 @@ public:
 	SIMD_FORCE_INLINE btQuaternion
 	operator-(const btQuaternion& q2) const
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		return btQuaternion(_mm_sub_ps(mVec128, q2.mVec128));	// SSE: component-wise difference in one instruction
+#elif defined(BT_USE_NEON)
+        return btQuaternion(vsubq_f32(mVec128, q2.mVec128));	// NEON: component-wise difference in one instruction
 		const btQuaternion& q1 = *this;
 		return btQuaternion(q1.x() - q2.x(), q1.y() - q2.y(), q1.z() - q2.z(), q1.m_floats[3] - q2.m_floats[3]);
   /**@brief Return the negative of this quaternion 
    * This simply negates each element */
 	SIMD_FORCE_INLINE btQuaternion operator-() const
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		return btQuaternion(_mm_xor_ps(mVec128, btvMzeroMask));	// NOTE(review): btvMzeroMask presumably holds -0.0f in every lane so the XOR flips all four signs - confirm where it is defined
+#elif defined(BT_USE_NEON)
+		return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)btvMzeroMask) );	// same XOR through the integer view
 		const btQuaternion& q2 = *this;
 		return btQuaternion( - q2.x(), - q2.y(),  - q2.z(),  - q2.m_floats[3]);
   /**@todo document this and it's use */
 	SIMD_FORCE_INLINE btQuaternion farthest( const btQuaternion& qd) const 
@@ -290,7 +536,7 @@ public:
 	  btAssert(magnitude > btScalar(0));
     btScalar product = dot(q) / magnitude;
-    if (btFabs(product) != btScalar(1))
+    if (btFabs(product) < btScalar(1))
       // Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
       const btScalar sign = (product < 0) ? btScalar(-1) : btScalar(1);
@@ -320,7 +566,18 @@ public:
 	SIMD_FORCE_INLINE const btScalar& getW() const { return m_floats[3]; }
+	SIMD_FORCE_INLINE	void	serialize(struct	btQuaternionData& dataOut) const;	// btQuaternionData is a macro alias for the float or double struct
+	SIMD_FORCE_INLINE	void	deSerialize(const struct	btQuaternionData& dataIn);
+	SIMD_FORCE_INLINE	void	serializeFloat(struct	btQuaternionFloatData& dataOut) const;	// always writes float components
+	SIMD_FORCE_INLINE	void	deSerializeFloat(const struct	btQuaternionFloatData& dataIn);
+	SIMD_FORCE_INLINE	void	serializeDouble(struct	btQuaternionDoubleData& dataOut) const;	// always writes double components
+	SIMD_FORCE_INLINE	void	deSerializeDouble(const struct	btQuaternionDoubleData& dataIn);
@@ -329,29 +586,257 @@ public:
 /**@brief Return the product of two quaternions */
-operator*(const btQuaternion& q1, const btQuaternion& q2) {
-	return btQuaternion(q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
+operator*(const btQuaternion& q1, const btQuaternion& q2) 
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	__m128 vQ1 = q1.get128();	// left operand
+	__m128 vQ2 = q2.get128();	// right operand
+	__m128 A0, A1, B1, A2, B2;
+	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0)); // X Y  z x     //      vtrn
+	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0)); // W W  W X     // vdup vext
+	A1 = A1 * B1;
+	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1)); // Y Z  X Y     // vext 
+	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1)); // z x  Y Y     // vtrn vdup
+	A2 = A2 * B2;
+	B1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2)); // z x Y Z      // vtrn vext
+	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2)); // Y Z x z      // vext vtrn
+	B1 = B1 * B2;	//	A3 *= B3
+	A0 = bt_splat_ps(vQ1, 3);	//	A0
+	A0 = A0 * vQ2;	//	A0 * B0
+	A1 = A1 + A2;	//	AB12
+	A0 =  A0 - B1;	//	AB03 = AB0 - AB3 
+    A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
+	A0 = A0 + A1;	//	AB03 + AB12
+	return btQuaternion(A0);
+#elif defined(BT_USE_NEON)     
+	float32x4_t vQ1 = q1.get128();	// left operand
+	float32x4_t vQ2 = q2.get128();	// right operand
+	float32x4_t A0, A1, B1, A2, B2, A3, B3;
+    float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+    {
+    float32x2x2_t tmp;
+    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+    vQ1zx = tmp.val[0];
+    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+    vQ2zx = tmp.val[0];
+    }
+    vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+    A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
+    B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));	// Y Z  X Y
+    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));	// z x  Y Y
+    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); //	A0 * B0
+	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+	A0 = vsubq_f32(A0, A3);	//	AB03 = AB0 - AB3 
+    //	change the sign of the last element
+    A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);	
+	A0 = vaddq_f32(A0, A1);	//	AB03 + AB12
+	return btQuaternion(A0);
+	return btQuaternion(	// scalar fallback: the product q1 * q2 written out per component (x, y, z, w)
+        q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
 		q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z(),
 		q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x(),
 		q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z()); 
 operator*(const btQuaternion& q, const btVector3& w)
-	return btQuaternion( q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
-		q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
-		q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	__m128 vQ1 = q.get128();	// quaternion operand
+	__m128 vQ2 = w.get128();	// vector operand
+	__m128 A1, B1, A2, B2, A3, B3;
+	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(3,3,3,0));	// q: W W W X
+	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(0,1,2,0));	// w: X Y Z X
+	A1 = A1 * B1;
+	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));	// q: Y Z X Y
+	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));	// w: Z X Y Y
+	A2 = A2 * B2;
+	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));	// q: Z X Y Z
+	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));	// w: Y Z X Z
+	A3 = A3 * B3;	//	A3 *= B3
+	A1 = A1 + A2;	//	AB12
+	A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
+    A1 = A1 - A3;	//	AB123 = AB12 - AB3 
+	return btQuaternion(A1);
+#elif defined(BT_USE_NEON)     
+	float32x4_t vQ1 = q.get128();	// quaternion operand
+	float32x4_t vQ2 = w.get128();	// vector operand
+	float32x4_t A1, B1, A2, B2, A3, B3;
+    float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;
+    vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1); 
+    {
+    float32x2x2_t tmp;
+    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+    vQ2zx = tmp.val[0];
+    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+    vQ1zx = tmp.val[0];
+    }
+    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+    A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx); // W W  W X 
+    B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);                    // X Y  z x 
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));	// Y Z  X Y
+    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));	// z x  Y Y
+    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+    //	change the sign of the last element
+    A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);	
+    A1 = vsubq_f32(A1, A3);	//	AB123 = AB12 - AB3
+	return btQuaternion(A1);
+	return btQuaternion( 	// scalar fallback: the quaternion product formula with no w term for the vector
+         q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
+		 q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
+		 q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
 		-q.x() * w.x() - q.y() * w.y() - q.z() * w.z()); 
 operator*(const btVector3& w, const btQuaternion& q)
-	return btQuaternion( w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
-		w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
-		w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	__m128 vQ1 = w.get128();	// vector operand
+	__m128 vQ2 = q.get128();	// quaternion operand
+	__m128 A1, B1, A2, B2, A3, B3;
+	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0));  // X Y  z x
+	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));  // W W  W X 
+	A1 = A1 * B1;
+	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));	// w: Y Z X Y
+	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));	// q: Z X Y Y
+	A2 = A2 *B2;
+	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));	// w: Z X Y Z
+	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));	// q: Y Z X Z
+	A3 = A3 * B3;	//	A3 *= B3
+	A1 = A1 + A2;	//	AB12
+	A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
+	A1 = A1 - A3;	//	AB123 = AB12 - AB3 
+	return btQuaternion(A1);
+#elif defined(BT_USE_NEON)     
+	float32x4_t vQ1 = w.get128();	// vector operand
+	float32x4_t vQ2 = q.get128();	// quaternion operand
+	float32x4_t  A1, B1, A2, B2, A3, B3;
+    float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+    {
+    float32x2x2_t tmp;
+    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+    vQ1zx = tmp.val[0];
+    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+    vQ2zx = tmp.val[0];
+    }
+    vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+    A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
+    B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));	// Y Z  X Y
+    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));	// z x  Y Y
+    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+    //	change the sign of the last element
+    A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);	
+    A1 = vsubq_f32(A1, A3);	//	AB123 = AB12 - AB3
+	return btQuaternion(A1);
+	return btQuaternion( 	// scalar fallback: the quaternion product formula with no w term for the vector
+        +w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
+		+w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
+		+w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
 		-w.x() * q.x() - w.y() * q.y() - w.z() * q.z()); 
 /**@brief Calculate the dot product between two quaternions */
@@ -371,7 +856,7 @@ length(const btQuaternion& q)
 /**@brief Return the angle between two quaternions*/
-angle(const btQuaternion& q1, const btQuaternion& q2) 
+btAngle(const btQuaternion& q1, const btQuaternion& q2) 
 	return q1.angle(q2); 
@@ -399,7 +884,13 @@ quatRotate(const btQuaternion& rotation, const btVector3& v)
 	btQuaternion q = rotation * v;
 	q *= rotation.inverse();
+#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	return btVector3(_mm_and_ps(q.get128(), btvFFF0fMask));
+#elif defined(BT_USE_NEON)
+    return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask));
 	return btVector3(q.getX(),q.getY(),q.getZ());
 SIMD_FORCE_INLINE btQuaternion 
@@ -429,8 +920,63 @@ shortestArcQuatNormalize2(btVector3& v0,btVector3& v1)
 	return shortestArcQuat(v0,v1);
+struct	btQuaternionFloatData //single-precision record used by btQuaternion::serializeFloat / deSerializeFloat
+	float	m_floats[4]; //the four quaternion components, stored in the same order as btQuaternion's m_floats
+struct	btQuaternionDoubleData //double-precision record used by btQuaternion::serializeDouble / deSerializeDouble
+	double	m_floats[4]; //the four quaternion components, stored in the same order as btQuaternion's m_floats
+SIMD_FORCE_INLINE	void	btQuaternion::serializeFloat(struct	btQuaternionFloatData& dataOut) const
+	///Writes the four components to dataOut.m_floats, each converted to float (could also do a memcpy, check if it is worth it)
+	for (int i=0;i<4;i++)
+		dataOut.m_floats[i] = float(m_floats[i]);
+SIMD_FORCE_INLINE void	btQuaternion::deSerializeFloat(const struct	btQuaternionFloatData& dataIn) ///Loads the four components from dataIn.m_floats
+	for (int i=0;i<4;i++)
+		m_floats[i] = btScalar(dataIn.m_floats[i]); //each stored float is converted to btScalar
+SIMD_FORCE_INLINE	void	btQuaternion::serializeDouble(struct	btQuaternionDoubleData& dataOut) const
+	///Writes the four components to dataOut.m_floats, each converted to double (could also do a memcpy, check if it is worth it)
+	for (int i=0;i<4;i++)
+		dataOut.m_floats[i] = double(m_floats[i]);
+SIMD_FORCE_INLINE void	btQuaternion::deSerializeDouble(const struct	btQuaternionDoubleData& dataIn) ///Loads the four components from dataIn.m_floats
+	for (int i=0;i<4;i++)
+		m_floats[i] = btScalar(dataIn.m_floats[i]); //each stored double is converted to btScalar
+SIMD_FORCE_INLINE	void	btQuaternion::serialize(struct	btQuaternionData& dataOut) const
+	///Copies the four components to dataOut.m_floats without an explicit conversion (could also do a memcpy, check if it is worth it)
+	for (int i=0;i<4;i++)
+		dataOut.m_floats[i] = m_floats[i];
+SIMD_FORCE_INLINE void	btQuaternion::deSerialize(const struct	btQuaternionData& dataIn) ///Loads the four components from dataIn.m_floats
+	for (int i=0;i<4;i++)
+		m_floats[i] = dataIn.m_floats[i]; //plain element-wise copy, no explicit conversion
diff --git a/src/bullet/LinearMath/btQuickprof.cpp b/src/bullet/LinearMath/btQuickprof.cpp
index 6c2b4814..cfbda362 100644
--- a/src/bullet/LinearMath/btQuickprof.cpp
+++ b/src/bullet/LinearMath/btQuickprof.cpp
@@ -10,15 +10,12 @@
-// Credits: The Clock class was inspired by the Timer classes in 
+// Credits: The Clock class was inspired by the Timer classes in
 // Ogre (www.ogre3d.org).
 #include "btQuickprof.h"
-#ifndef BT_NO_PROFILE
-static btClock gProfileClock;
 #ifdef __CELLOS_LV2__
@@ -27,8 +24,8 @@ static btClock gProfileClock;
 #include <stdio.h>
-#if defined (SUNOS) || defined (__SUNOS__) 
-#include <stdio.h> 
+#if defined (SUNOS) || defined (__SUNOS__)
+#include <stdio.h>
 #if defined(WIN32) || defined(_WIN32)
@@ -37,12 +34,17 @@ static btClock gProfileClock;
 #define WIN32_LEAN_AND_MEAN
 #define NOWINRES
 #define NOMCX
-#define NOIME 
+#define NOIME
 #ifdef _XBOX
 	#include <Xtl.h>
 #else //_XBOX
 	#include <windows.h>
+#if WINVER <0x0602
+#define GetTickCount64 GetTickCount
 #endif //_XBOX
 #include <time.h>
@@ -59,7 +61,7 @@ struct btClockData
 	LARGE_INTEGER mClockFrequency;
-	DWORD mStartTick;
+	LONGLONG mStartTick;
 	LONGLONG mPrevElapsedTime;
@@ -105,7 +107,7 @@ void btClock::reset()
-	m_data->mStartTick = GetTickCount();
+	m_data->mStartTick = GetTickCount64();
 	m_data->mPrevElapsedTime = 0;
 #ifdef __CELLOS_LV2__
@@ -121,34 +123,34 @@ void btClock::reset()
-/// Returns the time in ms since the last call to reset or since 
+/// Returns the time in ms since the last call to reset or since
 /// the btClock was created.
 unsigned long int btClock::getTimeMilliseconds()
 	LARGE_INTEGER currentTime;
-	LONGLONG elapsedTime = currentTime.QuadPart - 
+	LONGLONG elapsedTime = currentTime.QuadPart -
 		// Compute the number of millisecond ticks elapsed.
-	unsigned long msecTicks = (unsigned long)(1000 * elapsedTime / 
+	unsigned long msecTicks = (unsigned long)(1000 * elapsedTime /
-		// Check for unexpected leaps in the Win32 performance counter.  
-	// (This is caused by unexpected data across the PCI to ISA 
+		// Check for unexpected leaps in the Win32 performance counter.
+		// (This is caused by unexpected data across the PCI to ISA
 		// bridge, aka south bridge.  See Microsoft KB274323.)
-		unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
+		unsigned long elapsedTicks = (unsigned long)(GetTickCount64() - m_data->mStartTick);
 		signed long msecOff = (signed long)(msecTicks - elapsedTicks);
 		if (msecOff < -100 || msecOff > 100)
 			// Adjust the starting time forwards.
-			LONGLONG msecAdjustment = mymin(msecOff * 
-				m_data->mClockFrequency.QuadPart / 1000, elapsedTime - 
+			LONGLONG msecAdjustment = mymin(msecOff *
+				m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
 			m_data->mStartTime.QuadPart += msecAdjustment;
 			elapsedTime -= msecAdjustment;
 			// Recompute the number of millisecond ticks elapsed.
-			msecTicks = (unsigned long)(1000 * elapsedTime / 
+			msecTicks = (unsigned long)(1000 * elapsedTime /
@@ -171,36 +173,36 @@ unsigned long int btClock::getTimeMilliseconds()
 		struct timeval currentTime;
 		gettimeofday(&currentTime, 0);
-		return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000 + 
+		return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000 +
 			(currentTime.tv_usec - m_data->mStartTime.tv_usec) / 1000;
 #endif //__CELLOS_LV2__
-	/// Returns the time in us since the last call to reset or since 
+	/// Returns the time in us since the last call to reset or since
 	/// the Clock was created.
 unsigned long int btClock::getTimeMicroseconds()
 		LARGE_INTEGER currentTime;
-		LONGLONG elapsedTime = currentTime.QuadPart - 
+		LONGLONG elapsedTime = currentTime.QuadPart -
 		// Compute the number of millisecond ticks elapsed.
-		unsigned long msecTicks = (unsigned long)(1000 * elapsedTime / 
+		unsigned long msecTicks = (unsigned long)(1000 * elapsedTime /
-		// Check for unexpected leaps in the Win32 performance counter.  
-		// (This is caused by unexpected data across the PCI to ISA 
+		// Check for unexpected leaps in the Win32 performance counter.
+		// (This is caused by unexpected data across the PCI to ISA
 		// bridge, aka south bridge.  See Microsoft KB274323.)
-		unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
+		unsigned long elapsedTicks = (unsigned long)(GetTickCount64() - m_data->mStartTick);
 		signed long msecOff = (signed long)(msecTicks - elapsedTicks);
 		if (msecOff < -100 || msecOff > 100)
 			// Adjust the starting time forwards.
-			LONGLONG msecAdjustment = mymin(msecOff * 
-				m_data->mClockFrequency.QuadPart / 1000, elapsedTime - 
+			LONGLONG msecAdjustment = mymin(msecOff *
+				m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
 			m_data->mStartTime.QuadPart += msecAdjustment;
 			elapsedTime -= msecAdjustment;
@@ -210,7 +212,7 @@ unsigned long int btClock::getTimeMicroseconds()
 		m_data->mPrevElapsedTime = elapsedTime;
 		// Convert to microseconds.
-		unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime / 
+		unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime /
 		return usecTicks;
@@ -229,14 +231,26 @@ unsigned long int btClock::getTimeMicroseconds()
 		struct timeval currentTime;
 		gettimeofday(&currentTime, 0);
-		return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 + 
+		return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 +
 			(currentTime.tv_usec - m_data->mStartTime.tv_usec);
+/// Returns the time in seconds since the last call to reset or since
+/// the Clock was created, computed by scaling getTimeMicroseconds().
+btScalar btClock::getTimeSeconds()
+	static const btScalar microseconds_to_seconds = btScalar(0.000001); // scale factor: one microsecond is 1e-6 seconds
+	return btScalar(getTimeMicroseconds()) * microseconds_to_seconds; // the microsecond count is an unsigned long int, so its range limits this value too
+#ifndef BT_NO_PROFILE
+static btClock gProfileClock;
 inline void Profile_Get_Ticks(unsigned long int * ticks)
@@ -252,7 +266,6 @@ inline float Profile_Get_Tick_Rate(void)
 ** CProfileNode
@@ -293,8 +306,7 @@ void	CProfileNode::CleanupMemory()
 CProfileNode::~CProfileNode( void )
-	delete ( Child);
-	delete ( Sibling);
+	CleanupMemory();
@@ -318,7 +330,7 @@ CProfileNode * CProfileNode::Get_Sub_Node( const char * name )
 	// We didn't find it, so add it
 	CProfileNode * node = new CProfileNode( name, this );
 	node->Sibling = Child;
 	Child = node;
@@ -330,7 +342,7 @@ void	CProfileNode::Reset( void )
 	TotalCalls = 0;
 	TotalTime = 0.0f;
 	if ( Child ) {
@@ -352,7 +364,7 @@ void	CProfileNode::Call( void )
 bool	CProfileNode::Return( void )
-	if ( --RecursionCounter == 0 && TotalCalls != 0 ) { 
+	if ( --RecursionCounter == 0 && TotalCalls != 0 ) {
 		unsigned long int time;
@@ -445,8 +457,8 @@ void	CProfileManager::Start_Profile( const char * name )
 	if (name != CurrentNode->Get_Name()) {
 		CurrentNode = CurrentNode->Get_Sub_Node( name );
-	} 
+	}
@@ -470,7 +482,7 @@ void	CProfileManager::Stop_Profile( void )
  *    This resets everything except for the tree structure.  All of the timing data is reset.  *
 void	CProfileManager::Reset( void )
@@ -516,9 +528,9 @@ void	CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spaci
 	printf("Profiling: %s (total running time: %.3f ms) ---\n",	profileIterator->Get_Current_Parent_Name(), parent_time );
 	float totalTime = 0.f;
 	int numChildren = 0;
 	for (i = 0; !profileIterator->Is_Done(); i++,profileIterator->Next())
@@ -526,9 +538,7 @@ void	CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spaci
 		accumulated_time += current_total_time;
 		float fraction = parent_time > SIMD_EPSILON ? (current_total_time / parent_time) * 100 : 0.f;
-			int j;	for (j=0;j<spacing;j++)	printf(".");
+			int i;	for (i=0;i<spacing;i++)	printf(".");
 		printf("%d -- %s (%.2f %%) :: %.3f ms / frame (%d calls)\n",i, profileIterator->Get_Current_Name(), fraction,(current_total_time / (double)frames_since_reset),profileIterator->Get_Current_Total_Calls());
 		totalTime += current_total_time;
@@ -537,11 +547,11 @@ void	CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spaci
 	if (parent_time < accumulated_time)
-		printf("what's wrong\n");
+		//printf("what's wrong\n");
 	for (i=0;i<spacing;i++)	printf(".");
 	printf("%s (%.3f %%) :: %.3f ms\n", "Unaccounted:",parent_time > SIMD_EPSILON ? ((parent_time - accumulated_time) / parent_time) * 100 : 0.f, parent_time - accumulated_time);
 	for (i=0;i<numChildren;i++)
diff --git a/src/bullet/LinearMath/btQuickprof.h b/src/bullet/LinearMath/btQuickprof.h
index 93f3f4a6..49545713 100644
--- a/src/bullet/LinearMath/btQuickprof.h
+++ b/src/bullet/LinearMath/btQuickprof.h
@@ -15,18 +15,7 @@
 #ifndef BT_QUICK_PROF_H
 #define BT_QUICK_PROF_H
-//To disable built-in profiling, please comment out next line
-//#define BT_NO_PROFILE 1
-#ifndef BT_NO_PROFILE
-#include <stdio.h>//@todo remove this, backwards compatibility
 #include "btScalar.h"
-#include "btAlignedAllocator.h"
-#include <new>
 #define USE_BT_CLOCK 1
 #ifdef USE_BT_CLOCK
@@ -52,6 +41,11 @@ public:
 	/// Returns the time in us since the last call to reset or since 
 	/// the Clock was created.
 	unsigned long int getTimeMicroseconds();
+	/// Returns the time in s since the last call to reset or since 
+	/// the Clock was created.
+	btScalar getTimeSeconds();
 	struct btClockData* m_data;
@@ -59,6 +53,20 @@ private:
 #endif //USE_BT_CLOCK
+//Built-in profiling is disabled by the next line; comment it out to enable profiling
+#define BT_NO_PROFILE 1
+#ifndef BT_NO_PROFILE
+#include <stdio.h>//@todo remove this, backwards compatibility
+#include "btAlignedAllocator.h"
+#include <new>
 ///A node in the Profile Hierarchy Tree
diff --git a/src/bullet/LinearMath/btScalar.h b/src/bullet/LinearMath/btScalar.h
index 2684014b..011fa139 100644
--- a/src/bullet/LinearMath/btScalar.h
+++ b/src/bullet/LinearMath/btScalar.h
@@ -17,6 +17,7 @@ subject to the following restrictions:
 #ifndef BT_SCALAR_H
 #define BT_SCALAR_H
 //Aligned data types not supported in managed code
 #pragma unmanaged
@@ -28,7 +29,7 @@ subject to the following restrictions:
 #include <float.h>
 /* SVN $Revision$ on $Date$ from http://bullet.googlecode.com*/
-#define BT_BULLET_VERSION 280
+#define BT_BULLET_VERSION 285
 inline int	btGetVersion()
@@ -48,6 +49,11 @@ inline int	btGetVersion()
 			#define ATTRIBUTE_ALIGNED16(a) a
 			#define ATTRIBUTE_ALIGNED64(a) a
 			#define ATTRIBUTE_ALIGNED128(a) a
+		#elif (_M_ARM)
+			#define SIMD_FORCE_INLINE __forceinline
+			#define ATTRIBUTE_ALIGNED16(a) __declspec() a
+			#define ATTRIBUTE_ALIGNED64(a) __declspec() a
+			#define ATTRIBUTE_ALIGNED128(a) __declspec () a
 			#pragma warning(disable : 4324) // disable padding warning
@@ -67,8 +73,28 @@ inline int	btGetVersion()
  			#define btFsel(a,b,c) __fsel((a),(b),(c))
-#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (BT_USE_DOUBLE_PRECISION))
+#if defined (_M_ARM)
+            //Do not turn SSE on for ARM (may want to turn on BT_USE_NEON however)
+#elif (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (BT_USE_DOUBLE_PRECISION))
+			#if _MSC_VER>1400
+				#define BT_USE_SIMD_VECTOR3
+			#endif
 			#define BT_USE_SSE
+			#ifdef BT_USE_SSE
+#if (_MSC_FULL_VER >= 170050727)//Visual Studio 2012 can compile SSE4/FMA3 (but SSE4/FMA3 is not enabled by default)
+			#define BT_ALLOW_SSE4
+#endif //(_MSC_FULL_VER >= 170050727)
+			//BT_USE_SSE_IN_API is disabled under Windows by default, because
+			//it makes it harder to integrate Bullet into your application under Windows
+			//(structures embedding Bullet structs/classes need to be 16-byte aligned)
+			//with relatively little performance gain.
+			//If you are not embedding Bullet data in your classes, or you make sure that those classes are aligned on 16-byte boundaries,
+			//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
+			//#define BT_USE_SSE_IN_API
+			#endif //BT_USE_SSE
 			#include <emmintrin.h>
@@ -76,9 +102,14 @@ inline int	btGetVersion()
 		#endif //__MINGW32__
-		#include <assert.h>
 #ifdef BT_DEBUG
+	#ifdef _MSC_VER
+		#include <stdio.h>
+		//Debug assert for MSVC builds: prints file, line and the text of the failed expression, then calls __debugbreak().
+		//The space between "Assert " and __FILE__ is required: C++11 lexes "..."__FILE__ as a string literal with a user-defined-literal suffix.
+		#define btAssert(x) { if(!(x)){printf("Assert " __FILE__ ":%u (%s)\n", __LINE__, #x);__debugbreak();	}}
+	#else//_MSC_VER
+		#include <assert.h>
 		#define btAssert assert
+	#endif//_MSC_VER
 		#define btAssert(x)
@@ -102,9 +133,7 @@ inline int	btGetVersion()
 #ifdef __SPU__
 #include <spu_printf.h>
 #define printf spu_printf
-	#define btAssert(x) {if(!(x)){printf("Assert " __FILE__ ":%u ("#x")\n", __LINE__);spu_hcmpeq(0,0);}}
+	//SPU assert: prints file, line and the text of the failed expression, then calls spu_hcmpeq(0,0).
+	//The space before __FILE__ is kept because C++11 lexes "..."__FILE__ as a literal with a user-defined suffix.
+	//The expression text is passed through %s so that a '%' inside the asserted expression is not treated as a conversion specifier.
+	#define btAssert(x) {if(!(x)){printf("Assert " __FILE__ ":%u (%s)\n", __LINE__, #x);spu_hcmpeq(0,0);}}
 	#define btAssert assert
@@ -145,11 +174,37 @@ inline int	btGetVersion()
 	//non-windows systems
-#if (defined (__APPLE__) && defined (__i386__) && (!defined (BT_USE_DOUBLE_PRECISION)))
-	#define BT_USE_SSE
-	#include <emmintrin.h>
+#if (defined (__APPLE__) && (!defined (BT_USE_DOUBLE_PRECISION)))
+    #if defined (__i386__) || defined (__x86_64__)
+		#define BT_USE_SIMD_VECTOR3
+		#define BT_USE_SSE
+		//BT_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
+		//if apps run into issues, we will disable the next line
+		#define BT_USE_SSE_IN_API
+        #ifdef BT_USE_SSE
+            // include appropriate SSE level
+            #if defined (__SSE4_1__)
+                #include <smmintrin.h>
+            #elif defined (__SSSE3__)
+                #include <tmmintrin.h>
+            #elif defined (__SSE3__)
+                #include <pmmintrin.h>
+            #else
+                #include <emmintrin.h>
+            #endif
+        #endif //BT_USE_SSE
+    #elif defined( __ARM_NEON__ )
+        #ifdef __clang__
+            #define BT_USE_NEON 1
+			#define BT_USE_SIMD_VECTOR3
+            #if defined BT_USE_NEON && defined (__clang__)
+                #include <arm_neon.h>
+            #endif//BT_USE_NEON
+       #endif //__clang__
+    #endif//__arm__
-	#define SIMD_FORCE_INLINE inline
+	#define SIMD_FORCE_INLINE inline __attribute__ ((always_inline))
 ///@todo: check out alignment methods for other platforms/compilers
 	#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
 	#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
@@ -159,10 +214,22 @@ inline int	btGetVersion()
 	#if defined(DEBUG) || defined (_DEBUG)
+	 #if defined (__i386__) || defined (__x86_64__)
+	#include <stdio.h>
+	 #define btAssert(x)\
+	{\
+	if(!(x))\
+	{\
+		printf("Assert %s in line %d, file %s\n",#x, __LINE__, __FILE__);\
+		asm volatile ("int3");\
+	}\
+	}
+	#else//defined (__i386__) || defined (__x86_64__)
 		#define btAssert assert
-	#else
+	#endif//defined (__i386__) || defined (__x86_64__)
+	#else//defined(DEBUG) || defined (_DEBUG)
 		#define btAssert(x)
-	#endif
+	#endif//defined(DEBUG) || defined (_DEBUG)
 	//btFullAssert is optional, slows down a lot
 	#define btFullAssert(x)
@@ -203,15 +270,120 @@ inline int	btGetVersion()
 ///The btScalar type abstracts floating point numbers, to easily switch between double and single floating point precision.
 typedef double btScalar;
 //this number could be bigger in double precision
 #define BT_LARGE_FLOAT 1e30
 typedef float btScalar;
 #define BT_LARGE_FLOAT 1e18f
+#ifdef BT_USE_SSE
+typedef __m128 btSimdFloat4;
+#if defined (BT_USE_SSE)
+//#if defined BT_USE_SSE_IN_API && defined (BT_USE_SSE)
+#ifdef _WIN32
+#ifndef BT_NAN
+static int btNanMask = 0x7F800001;
+#define BT_NAN (*(float*)&btNanMask)
+#ifndef BT_INFINITY
+static  int btInfinityMask = 0x7F800000;
+#define BT_INFINITY (*(float*)&btInfinityMask)
+inline int btGetInfinityMask()//returns the raw integer bit pattern held in btInfinityMask; exists to suppress a compiler warning
+	return btInfinityMask;
+//use this, in case there are clashes (such as xnamath.h)
+inline __m128 operator + (const __m128 A, const __m128 B) //lets A + B be written for two __m128 values
+    return _mm_add_ps(A, B); //forwards to the SSE packed single-precision add
+inline __m128 operator - (const __m128 A, const __m128 B) //lets A - B be written for two __m128 values
+    return _mm_sub_ps(A, B); //forwards to the SSE packed single-precision subtract
+inline __m128 operator * (const __m128 A, const __m128 B) //lets A * B be written for two __m128 values
+    return _mm_mul_ps(A, B); //forwards to the SSE packed single-precision multiply (element-wise)
+#define btCastfTo128i(a) (_mm_castps_si128(a))
+#define btCastfTo128d(a) (_mm_castps_pd(a))
+#define btCastiTo128f(a) (_mm_castsi128_ps(a))
+#define btCastdTo128f(a) (_mm_castpd_ps(a))
+#define btCastdTo128i(a) (_mm_castpd_si128(a))
+#define btAssign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3)
+#define btCastfTo128i(a) ((__m128i)(a))
+#define btCastfTo128d(a) ((__m128d)(a))
+#define btCastiTo128f(a)  ((__m128) (a))
+#define btCastdTo128f(a) ((__m128) (a))
+#define btCastdTo128i(a) ((__m128i)(a))
+#define btAssign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3}
+#define BT_NAN NAN
+#ifdef BT_USE_NEON
+	#include <arm_neon.h>
+	typedef float32x4_t btSimdFloat4;
+	#define BT_NAN NAN
+	#define btAssign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}
+	#ifndef BT_INFINITY
+		struct btInfMaskConverter //holds one value that can be accessed either as an int bit pattern or as a float
+		{
+		        union {
+		                float mask; //float view of the stored bits
+		                int intmask; //integer view; this is the member the constructor initializes
+		        };
+		        btInfMaskConverter(int mask=0x7F800000) //default pattern 0x7F800000 (presumably the IEEE-754 single-precision +infinity encoding - confirm)
+		        :intmask(mask)
+		        {
+		        }
+		};
+		static btInfMaskConverter btInfinityMask = 0x7F800000;
+		#define BT_INFINITY (btInfinityMask.mask)
+		inline int btGetInfinityMask()//returns the integer view of btInfinityMask; exists to suppress a compiler warning
+		{
+		        return btInfinityMask.intmask;
+		}
+	#endif
+#endif //BT_USE_SSE
+#ifdef BT_USE_NEON
+#include <arm_neon.h>
+typedef float32x4_t btSimdFloat4;
+#define BT_NAN NAN
+#define btAssign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}
@@ -247,19 +419,30 @@ SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y) { return fmod(x,y); }
 SIMD_FORCE_INLINE btScalar btSqrt(btScalar y) 
+#ifdef __LP64__
+    float xhalf = 0.5f*y;
+    int i = *(int*)&y;
+    i = 0x5f375a86 - (i>>1);
+    y = *(float*)&i;
+    y = y*(1.5f - xhalf*y*y);
+    y = y*(1.5f - xhalf*y*y);
+    y = y*(1.5f - xhalf*y*y);
+    y=1/y;
+    return y;
     double x, z, tempf;
     unsigned long *tfptr = ((unsigned long *)&tempf) + 1;
-	tempf = y;
-	*tfptr = (0xbfcdd90a - *tfptr)>>1; /* estimate of 1/sqrt(y) */
-	x =  tempf;
-	z =  y*btScalar(0.5);
-	x = (btScalar(1.5)*x)-(x*x)*(x*z);         /* iteration formula     */
-	x = (btScalar(1.5)*x)-(x*x)*(x*z);
-	x = (btScalar(1.5)*x)-(x*x)*(x*z);
-	x = (btScalar(1.5)*x)-(x*x)*(x*z);
-	x = (btScalar(1.5)*x)-(x*x)*(x*z);
-	return x*y;
+    tempf = y;
+    *tfptr = (0xbfcdd90a - *tfptr)>>1; /* estimate of 1/sqrt(y) */
+    x =  tempf;
+    z =  y*btScalar(0.5);
+    x = (btScalar(1.5)*x)-(x*x)*(x*z);         /* iteration formula     */
+    x = (btScalar(1.5)*x)-(x*x)*(x*z);
+    x = (btScalar(1.5)*x)-(x*x)*(x*z);
+    x = (btScalar(1.5)*x)-(x*x)*(x*z);
+    x = (btScalar(1.5)*x)-(x*x)*(x*z);
+    return x*y;
 	return sqrtf(y); 
@@ -291,22 +474,30 @@ SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y) { return fmodf(x,y); }
-#define SIMD_2_PI         btScalar(6.283185307179586232)
-#define SIMD_PI           (SIMD_2_PI * btScalar(0.5))
-#define SIMD_HALF_PI      (SIMD_2_PI * btScalar(0.25))
+#define SIMD_PI           btScalar(3.1415926535897932384626433832795029)
+#define SIMD_2_PI         (btScalar(2.0) * SIMD_PI)
+#define SIMD_HALF_PI      (SIMD_PI * btScalar(0.5))
 #define SIMD_RADS_PER_DEG (SIMD_2_PI / btScalar(360.0))
 #define SIMD_DEGS_PER_RAD  (btScalar(360.0) / SIMD_2_PI)
 #define SIMDSQRT12 btScalar(0.7071067811865475244008443621048490)
 #define btRecipSqrt(x) ((btScalar)(btScalar(1.0)/btSqrt(btScalar(x))))		/* reciprocal square root */
+#define btRecip(x) (btScalar(1.0)/btScalar(x))
+#define BT_ONE			1.0
+#define BT_ZERO			0.0
+#define BT_TWO			2.0
+#define BT_HALF			0.5
+#define BT_ONE			1.0f
+#define BT_ZERO			0.0f
+#define BT_TWO			2.0f
+#define BT_HALF			0.5f
 SIMD_FORCE_INLINE btScalar btAtan2Fast(btScalar y, btScalar x) 
@@ -490,6 +681,46 @@ SIMD_FORCE_INLINE double btUnswapEndianDouble(const unsigned char *src)
 	return d;
+template<typename T>
+SIMD_FORCE_INLINE void btSetZero(T* a, int n)
+  T* acurr = a;
+  size_t ncurr = n;
+  while (ncurr > 0) 
+  {
+    *(acurr++) = 0;
+    --ncurr;
+  }
+SIMD_FORCE_INLINE btScalar btLargeDot(const btScalar *a, const btScalar *b, int n) //returns the dot product of the first n elements of a and b
+  btScalar p0,q0,m0,p1,q1,m1,sum;
+  sum = 0;
+  n -= 2; //bias the count so that "n >= 0" below means at least two elements remain
+  while (n >= 0) { //main loop: consumes two elements of each array per iteration
+    p0 = a[0]; q0 = b[0];
+    m0 = p0 * q0;
+    p1 = a[1]; q1 = b[1];
+    m1 = p1 * q1;
+    sum += m0; //the two products are added one after the other, in element order
+    sum += m1;
+    a += 2;
+    b += 2;
+    n -= 2;
+  }
+  n += 2; //undo the bias: n is now the number of elements the main loop left over
+  while (n > 0) { //tail loop: handles the remaining element when the count was odd
+    sum += (*a) * (*b);
+    a++;
+    b++;
+    n--;
+  }
+  return sum;
 // returns normalized value in range [-SIMD_PI, SIMD_PI]
 SIMD_FORCE_INLINE btScalar btNormalizeAngle(btScalar angleInRadians) 
@@ -508,6 +739,8 @@ SIMD_FORCE_INLINE btScalar btNormalizeAngle(btScalar angleInRadians)
 ///rudimentary class to provide type info
 struct btTypedObject
@@ -523,19 +756,28 @@ struct btTypedObject
 ///align a pointer to the provided alignment, upwards
 template <typename T>T* btAlignPointer(T* unalignedPtr, size_t alignment)
-        union
-        {
-                T* ptr;
-                size_t integer;
-        };
-        const size_t bit_mask = ~(alignment - 1);
-        ptr = unalignedPtr;
-		integer += alignment-1;
-        integer &= bit_mask;
-        return ptr;
+	struct btConvertPointerSizeT
+	{
+		union 
+		{
+				T* ptr;
+				size_t integer;
+		};
+	};
+    btConvertPointerSizeT converter;
+	const size_t bit_mask = ~(alignment - 1);
+    converter.ptr = unalignedPtr;
+	converter.integer += alignment-1;
+	converter.integer &= bit_mask;
+	return converter.ptr;
 #endif //BT_SCALAR_H
diff --git a/src/bullet/LinearMath/btSerializer.cpp b/src/bullet/LinearMath/btSerializer.cpp
index 49c25b7e..9838e6c0 100644
--- a/src/bullet/LinearMath/btSerializer.cpp
+++ b/src/bullet/LinearMath/btSerializer.cpp
@@ -1,841 +1,1184 @@
 char sBulletDNAstr[]= {
 int sBulletDNAlen= sizeof(sBulletDNAstr);
- char sBulletDNAstr64[]= {
+char sBulletDNAstr64[]= {
 int sBulletDNAlen64= sizeof(sBulletDNAstr64);
diff --git a/src/bullet/LinearMath/btSerializer.h b/src/bullet/LinearMath/btSerializer.h
index 81da2052..454068d6 100644
--- a/src/bullet/LinearMath/btSerializer.h
+++ b/src/bullet/LinearMath/btSerializer.h
@@ -4,8 +4,8 @@ Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
@@ -17,12 +17,9 @@ subject to the following restrictions:
 #include "btScalar.h" // has definitions like SIMD_FORCE_INLINE
-#include "btStackAlloc.h"
 #include "btHashMap.h"
-#if !defined( __CELLOS_LV2__) && !defined(__MWERKS__) && !defined(__native_client__)
+#if !defined( __CELLOS_LV2__) && !defined(__MWERKS__)
 #include <memory.h>
 #include <string.h>
@@ -35,12 +32,12 @@ extern int sBulletDNAlen;
 extern char sBulletDNAstr64[];
 extern int sBulletDNAlen64;
-SIMD_FORCE_INLINE	int btStrLen(const char* str) 
+SIMD_FORCE_INLINE	int btStrLen(const char* str)
-    if (!str) 
+    if (!str)
 	int len = 0;
 	while (*str != 0)
@@ -88,7 +85,7 @@ public:
 	virtual	void*	getUniquePointer(void*oldPtr) = 0;
 	virtual	void	startSerialization() = 0;
 	virtual	void	finishSerialization() = 0;
 	virtual	const char*	findNameForPointer(const void* ptr) const = 0;
@@ -101,6 +98,9 @@ public:
 	virtual void	setSerializationFlags(int flags) = 0;
+	virtual int getNumChunks() const = 0;
+	virtual const btChunk* getChunk(int chunkIndex) const = 0;
@@ -108,23 +108,26 @@ public:
 #define BT_HEADER_LENGTH 12
 #if defined(__sgi) || defined (__sparc) || defined (__sparc__) || defined (__PPC__) || defined (__ppc__) || defined (__BIG_ENDIAN__)
-#	define MAKE_ID(a,b,c,d) ( (int)(a)<<24 | (int)(b)<<16 | (c)<<8 | (d) )
+#	define BT_MAKE_ID(a,b,c,d) ( (int)(a)<<24 | (int)(b)<<16 | (c)<<8 | (d) )
-#	define MAKE_ID(a,b,c,d) ( (int)(d)<<24 | (int)(c)<<16 | (b)<<8 | (a) )
+#	define BT_MAKE_ID(a,b,c,d) ( (int)(d)<<24 | (int)(c)<<16 | (b)<<8 | (a) )
-#define BT_SOFTBODY_CODE		MAKE_ID('S','B','D','Y')
-#define BT_RIGIDBODY_CODE		MAKE_ID('R','B','D','Y')
-#define BT_CONSTRAINT_CODE		MAKE_ID('C','O','N','S')
-#define BT_BOXSHAPE_CODE		MAKE_ID('B','O','X','S')
-#define BT_TRIANLGE_INFO_MAP	MAKE_ID('T','M','A','P')
-#define BT_SHAPE_CODE			MAKE_ID('S','H','A','P')
-#define BT_ARRAY_CODE			MAKE_ID('A','R','A','Y')
-#define BT_SBMATERIAL_CODE		MAKE_ID('S','B','M','T')
-#define BT_SBNODE_CODE			MAKE_ID('S','B','N','D')
-#define BT_DNA_CODE				MAKE_ID('D','N','A','1')
+#define BT_MULTIBODY_CODE       BT_MAKE_ID('M','B','D','Y')
+#define BT_SOFTBODY_CODE		BT_MAKE_ID('S','B','D','Y')
+#define BT_RIGIDBODY_CODE		BT_MAKE_ID('R','B','D','Y')
+#define BT_CONSTRAINT_CODE		BT_MAKE_ID('C','O','N','S')
+#define BT_BOXSHAPE_CODE		BT_MAKE_ID('B','O','X','S')
+#define BT_SHAPE_CODE			BT_MAKE_ID('S','H','A','P')
+#define BT_ARRAY_CODE			BT_MAKE_ID('A','R','A','Y')
+#define BT_SBMATERIAL_CODE		BT_MAKE_ID('S','B','M','T')
+#define BT_SBNODE_CODE			BT_MAKE_ID('S','B','N','D')
+#define BT_DNA_CODE				BT_MAKE_ID('D','N','A','1')
 struct	btPointerUid
@@ -136,21 +139,46 @@ struct	btPointerUid
+struct btBulletSerializedArrays //groups arrays of pointers to serialized Bullet data records, with separate arrays for the float and double variants
+	btBulletSerializedArrays()
+	{
+	}
+	btAlignedObjectArray<struct btQuantizedBvhDoubleData*>	m_bvhsDouble;
+	btAlignedObjectArray<struct btQuantizedBvhFloatData*>	m_bvhsFloat;
+	btAlignedObjectArray<struct btCollisionShapeData*> m_colShapeData;
+	btAlignedObjectArray<struct btDynamicsWorldDoubleData*> m_dynamicWorldInfoDataDouble;
+	btAlignedObjectArray<struct btDynamicsWorldFloatData*> m_dynamicWorldInfoDataFloat;
+	btAlignedObjectArray<struct btRigidBodyDoubleData*> m_rigidBodyDataDouble;
+	btAlignedObjectArray<struct btRigidBodyFloatData*> m_rigidBodyDataFloat;
+	btAlignedObjectArray<struct btCollisionObjectDoubleData*> m_collisionObjectDataDouble;
+	btAlignedObjectArray<struct btCollisionObjectFloatData*> m_collisionObjectDataFloat;
+	btAlignedObjectArray<struct btTypedConstraintFloatData*> m_constraintDataFloat;
+	btAlignedObjectArray<struct btTypedConstraintDoubleData*> m_constraintDataDouble;
+	btAlignedObjectArray<struct btTypedConstraintData*> m_constraintData;//kept for backwards compatibility
+	btAlignedObjectArray<struct btSoftBodyFloatData*> m_softBodyFloatData;
+	btAlignedObjectArray<struct btSoftBodyDoubleData*> m_softBodyDoubleData;
 ///The btDefaultSerializer is the main Bullet serialization class.
 ///The constructor takes an optional argument for backwards compatibility, it is recommended to leave this empty/zero.
 class btDefaultSerializer	:	public btSerializer
 	btAlignedObjectArray<char*>			mTypes;
 	btAlignedObjectArray<short*>			mStructs;
 	btAlignedObjectArray<short>			mTlens;
 	btHashMap<btHashInt, int>			mStructReverse;
 	btHashMap<btHashString,int>	mTypeLookup;
 	btHashMap<btHashPtr,void*>	m_chunkP;
 	btHashMap<btHashPtr,const char*>	m_nameMap;
 	btHashMap<btHashPtr,btPointerUid>	m_uniquePointers;
@@ -158,6 +186,7 @@ class btDefaultSerializer	:	public btSerializer
 	int					m_totalSize;
 	unsigned char*		m_buffer;
+	bool                m_ownsBuffer;
 	int					m_currentSize;
 	void*				m_dna;
 	int					m_dnaLength;
@@ -166,10 +195,11 @@ class btDefaultSerializer	:	public btSerializer
 	btAlignedObjectArray<btChunk*>	m_chunkPtrs;
-	virtual	void*	findPointer(void* oldPtr) 
+	virtual	void*	findPointer(void* oldPtr)
 		void** ptr = m_chunkP.find(oldPtr);
 		if (ptr && *ptr)
@@ -177,11 +207,11 @@ protected:
 		return 0;
-		void	writeDNA()
+		virtual void	writeDNA()
 			btChunk* dnaChunk = allocate(m_dnaLength,1);
@@ -195,7 +225,7 @@ protected:
 			const int* valuePtr = mTypeLookup.find(key);
 			if (valuePtr)
 				return *valuePtr;
 			return -1;
@@ -207,7 +237,7 @@ protected:
 			int littleEndian= 1;
 			littleEndian= ((char*)&littleEndian)[0];
 			m_dna = btAlignedAlloc(dnalen,16);
@@ -235,16 +265,16 @@ protected:
 			// Parse names
 			if (!littleEndian)
 				*intPtr = btSwapEndian(*intPtr);
 			dataLen = *intPtr;
 			cp = (char*)intPtr;
 			int i;
 			for ( i=0; i<dataLen; i++)
 				while (*cp)cp++;
@@ -258,15 +288,15 @@ protected:
 			intPtr = (int*)cp;
-			assert(strncmp(cp, "TYPE", 4)==0); intPtr++;
+			btAssert(strncmp(cp, "TYPE", 4)==0); intPtr++;
 			if (!littleEndian)
 				*intPtr =  btSwapEndian(*intPtr);
 			dataLen = *intPtr;
 			cp = (char*)intPtr;
 			for (i=0; i<dataLen; i++)
@@ -286,7 +316,7 @@ protected:
 			// Parse type lens
 			intPtr = (int*)cp;
-			assert(strncmp(cp, "TLEN", 4)==0); intPtr++;
+			btAssert(strncmp(cp, "TLEN", 4)==0); intPtr++;
 			dataLen = (int)mTypes.size();
@@ -313,11 +343,11 @@ protected:
 			intPtr = (int*)shtPtr;
 			cp = (char*)intPtr;
-			assert(strncmp(cp, "STRC", 4)==0); intPtr++;
+			btAssert(strncmp(cp, "STRC", 4)==0); intPtr++;
 			if (!littleEndian)
 				*intPtr = btSwapEndian(*intPtr);
-			dataLen = *intPtr ; 
+			dataLen = *intPtr ;
@@ -325,7 +355,7 @@ protected:
 			for (i=0; i<dataLen; i++)
 				mStructs.push_back (shtPtr);
 				if (!littleEndian)
 					shtPtr[0]= btSwapEndian(shtPtr[0]);
@@ -355,20 +385,28 @@ protected:
+	btHashMap<btHashPtr,void*> m_skipPointers;
-		btDefaultSerializer(int totalSize=0)
+		btDefaultSerializer(int totalSize=0, unsigned char*	buffer=0)
-			m_buffer = m_totalSize?(unsigned char*)btAlignedAlloc(totalSize,16):0;
+		    if (buffer==0)
+            {
+                m_buffer = m_totalSize?(unsigned char*)btAlignedAlloc(totalSize,16):0;
+                m_ownsBuffer = true;
+            } else
+            {
+                m_buffer = buffer;
+                m_ownsBuffer = false;
+            }
 			const bool VOID_IS_8 = ((sizeof(void*)==8));
@@ -387,7 +425,7 @@ public:
 			if (VOID_IS_8)
@@ -397,27 +435,33 @@ public:
 				initDNA((const char*)sBulletDNAstr,sBulletDNAlen);
-		virtual ~btDefaultSerializer() 
+		virtual ~btDefaultSerializer()
-			if (m_buffer)
+			if (m_buffer && m_ownsBuffer)
 			if (m_dna)
+		/// Writes the file header into m_buffer (via writeHeader) and then bumps
+		/// m_currentSize by BT_HEADER_LENGTH.
+		/// NOTE(review): presumably intended to be called before any chunk is allocated, since
+		/// writeHeader() is handed the start of the buffer - confirm against callers.
+		void	insertHeader()
+		{
+			writeHeader(m_buffer);
+			m_currentSize += BT_HEADER_LENGTH;
+		}
 		void	writeHeader(unsigned char* buffer) const
 			memcpy(buffer, "BULLETd", 7);
 			memcpy(buffer, "BULLETf", 7);
 			int littleEndian= 1;
 			littleEndian= ((char*)&littleEndian)[0];
@@ -431,7 +475,7 @@ public:
 			if (littleEndian)
-				buffer[8]='v';				
+				buffer[8]='v';
 			} else
@@ -440,7 +484,7 @@ public:
 			buffer[9] = '2';
 			buffer[10] = '8';
-			buffer[11] = '0';
+			buffer[11] = '5';
@@ -452,7 +496,7 @@ public:
 				unsigned char* buffer = internalAlloc(BT_HEADER_LENGTH);
 		virtual	void	finishSerialization()
@@ -488,6 +532,7 @@ public:
+			m_skipPointers.clear();
@@ -504,8 +549,15 @@ public:
 				return uptr->m_ptr;
+			void** ptr2 = m_skipPointers[oldPtr];
+            if (ptr2)
+			{
+				return 0;
+			}
 			btPointerUid uid;
 			uid.m_uniqueIds[0] = m_uniqueIdGenerator;
 			uid.m_uniqueIds[1] = m_uniqueIdGenerator;
@@ -532,17 +584,17 @@ public:
 			chunk->m_dna_nr = getReverseType(structType);
 			chunk->m_chunkCode = chunkCode;
 			void* uniquePtr = getUniquePointer(oldPtr);
 			chunk->m_oldPtr = uniquePtr;//oldPtr;
 		virtual unsigned char* internalAlloc(size_t size)
 			unsigned char* ptr = 0;
@@ -560,7 +612,7 @@ public:
 			return ptr;
 		virtual	btChunk*	allocate(size_t size, int numElements)
@@ -568,15 +620,15 @@ public:
 			unsigned char* ptr = internalAlloc(int(size)*numElements+sizeof(btChunk));
 			unsigned char* data = ptr + sizeof(btChunk);
 			btChunk* chunk = (btChunk*)ptr;
 			chunk->m_chunkCode = 0;
 			chunk->m_oldPtr = data;
 			chunk->m_length = int(size)*numElements;
 			chunk->m_number = numElements;
 			return chunk;
@@ -633,9 +685,202 @@ public:
 			m_serializationFlags = flags;
+		/// Returns the number of chunk pointers currently stored in m_chunkPtrs.
+		int getNumChunks() const
+		{
+			return m_chunkPtrs.size();
+		}
+		/// Returns the chunk pointer stored at position chunkIndex of m_chunkPtrs.
+		/// No range check is done here; chunkIndex is passed straight to the array's operator[].
+		const btChunk* getChunk(int chunkIndex) const
+		{
+			return m_chunkPtrs[chunkIndex];
+		}
+///In general it is best to use btDefaultSerializer,
+///in particular when writing the data to disk or sending it over the network.
+///The btInMemorySerializer is experimental and only suitable in a few cases.
+///The btInMemorySerializer takes a shortcut and can be useful to create a deep-copy
+///of objects. There will be a demo on how to use the btInMemorySerializer.
+struct btInMemorySerializer : public btDefaultSerializer
+    btHashMap<btHashPtr,btChunk*> m_uid2ChunkPtr;
+    btHashMap<btHashPtr,void*> m_orgPtr2UniqueDataPtr;
+    btHashMap<btHashString,const void*> m_names2Ptr;
+    btBulletSerializedArrays    m_arrays;
+    /// Forwards totalSize and buffer unchanged to the btDefaultSerializer constructor;
+    /// the members declared by this struct are left default-constructed.
+    btInMemorySerializer(int totalSize=0, unsigned char*	buffer=0)
+    :btDefaultSerializer(totalSize,buffer)
+    {
+    }
+    /// Clears the uid -> chunk lookup table and then runs the base-class startSerialization().
+    /// m_arrays is not reset here (see the todo below); m_orgPtr2UniqueDataPtr and m_names2Ptr
+    /// are not touched by this function either.
+    virtual void startSerialization()
+    {
+        m_uid2ChunkPtr.clear();
+        //todo: m_arrays.clear();
+        btDefaultSerializer::startSerialization();
+    }
+    /// Looks up uniquePointer in m_uid2ChunkPtr.
+    /// Returns the chunk stored for it, or 0 when the table has no entry for that pointer.
+    btChunk* findChunkFromUniquePointer(void* uniquePointer)
+    {
+        btChunk** entry = m_uid2ChunkPtr[uniquePointer];
+        return entry ? *entry : 0;
+    }
+	/// Registers the name with the base class first, then additionally records the
+	/// name -> ptr association in m_names2Ptr (consulted by getUniquePointer()).
+	virtual	void	registerNameForPointer(const void* ptr, const char* name)
+    {
+       btDefaultSerializer::registerNameForPointer(ptr,name);
+       m_names2Ptr.insert(name,ptr);
+    }
+    /// Empty override: performs no work and does not call btDefaultSerializer::finishSerialization().
+    virtual void finishSerialization()
+    {
+    }
+    /// Maps an original pointer to the pointer that is written into the serialized data:
+    ///  - a null pointer stays null;
+    ///  - a pointer for which findChunkFromUniquePointer() finds a chunk maps to that chunk's m_oldPtr;
+    ///  - a pointer that has an entry in m_names2Ptr is returned unchanged;
+    ///  - anything else maps to 0, and asserts unless the pointer is listed in m_skipPointers.
+    virtual void* getUniquePointer(void*oldPtr)
+    {
+        if (oldPtr==0)
+            return 0;
+        btChunk* chunk = findChunkFromUniquePointer(oldPtr);
+        if (chunk)
+            return chunk->m_oldPtr;
+        // m_names2Ptr is keyed by string, so the pointer is viewed as a C string for this lookup.
+        const char* asName = (const char*) oldPtr;
+        const void** namedEntry = m_names2Ptr[asName];
+        if (namedEntry)
+            return oldPtr;
+        void** skipEntry = m_skipPointers[oldPtr];
+        if (!skipEntry)
+        {
+            //If this assert hit, serialization happened in the wrong order
+            // 'getUniquePointer'
+            btAssert(0);
+        }
+        return 0;
+    }
+    /// Finalizes a chunk after its payload has been filled in.
+    /// Steps, in order:
+    ///  1. unless BT_SERIALIZE_NO_DUPLICATE_ASSERT is set in m_serializationFlags, asserts that
+    ///     findPointer(oldPtr) does not already return an entry;
+    ///  2. stores the reverse-type index of structType and the chunk code in the chunk;
+    ///  3. registers oldPtr in m_chunkP, mapped to itself (chunk->m_oldPtr is left as it was);
+    ///  4. indexes the chunk in m_uid2ChunkPtr under the value findPointer(oldPtr) now returns;
+    ///  5. appends chunk->m_oldPtr to the typed array of m_arrays selected by the chunk code.
+    virtual void finalizeChunk(btChunk* chunk, const char* structType, int chunkCode,void* oldPtr)
+    {
+        if (!(m_serializationFlags&BT_SERIALIZE_NO_DUPLICATE_ASSERT))
+        {
+            btAssert(!findPointer(oldPtr));
+        }
+        chunk->m_dna_nr = getReverseType(structType);
+        chunk->m_chunkCode = chunkCode;
+        //void* uniquePtr = getUniquePointer(oldPtr);
+        m_chunkP.insert(oldPtr,oldPtr);//chunk->m_oldPtr);
+        // chunk->m_oldPtr = uniquePtr;//oldPtr;
+        // Look the pointer up again now that it is registered, and use the result as the chunk's uid.
+        void* uid = findPointer(oldPtr);
+        m_uid2ChunkPtr.insert(uid,chunk);
+        // Sort the chunk payload into the per-type arrays.
+        switch (chunk->m_chunkCode)
+			{
+			{
+					m_arrays.m_softBodyDoubleData.push_back((btSoftBodyDoubleData*) chunk->m_oldPtr);
+	#else
+					m_arrays.m_softBodyFloatData.push_back((btSoftBodyFloatData*) chunk->m_oldPtr);
+	#endif
+					break;
+				}
+				{
+					m_arrays.m_collisionObjectDataDouble.push_back((btCollisionObjectDoubleData*)chunk->m_oldPtr);
+					m_arrays.m_collisionObjectDataFloat.push_back((btCollisionObjectFloatData*)chunk->m_oldPtr);
+					break;
+				}
+				{
+					m_arrays.m_rigidBodyDataDouble.push_back((btRigidBodyDoubleData*)chunk->m_oldPtr);
+	#else
+					m_arrays.m_rigidBodyDataFloat.push_back((btRigidBodyFloatData*)chunk->m_oldPtr);
+					break;
+				};
+				{
+					m_arrays.m_constraintDataDouble.push_back((btTypedConstraintDoubleData*)chunk->m_oldPtr);
+	#else
+					m_arrays.m_constraintDataFloat.push_back((btTypedConstraintFloatData*)chunk->m_oldPtr);
+	#endif
+					break;
+				}
+				{
+					m_arrays.m_bvhsDouble.push_back((btQuantizedBvhDoubleData*) chunk->m_oldPtr);
+	#else
+					m_arrays.m_bvhsFloat.push_back((btQuantizedBvhFloatData*) chunk->m_oldPtr);
+	#endif
+					break;
+				}
+			case BT_SHAPE_CODE:
+				{
+					btCollisionShapeData* shapeData = (btCollisionShapeData*) chunk->m_oldPtr;
+					m_arrays.m_colShapeData.push_back(shapeData);
+					break;
+				}
+			// These chunk codes are recognised but not collected into any array.
+			case BT_ARRAY_CODE:
+			case BT_SBNODE_CODE:
+			case BT_DNA_CODE:
+				{
+					break;
+				}
+			default:
+				{
+				}
+			};
+    }
+    /// Returns the number of entries in m_uid2ChunkPtr, i.e. the chunks indexed by finalizeChunk().
+    int getNumChunks() const
+    {
+        return m_uid2ChunkPtr.size();
+    }
+    /// Returns the chunk stored at position chunkIndex of m_uid2ChunkPtr.
+    /// The result of getAtIndex() is dereferenced without a null check here.
+    /// NOTE(review): chunkIndex is presumably expected to be in [0, getNumChunks()) - confirm.
+    const btChunk* getChunk(int chunkIndex) const
+    {
+        return *m_uid2ChunkPtr.getAtIndex(chunkIndex);
+    }
diff --git a/src/bullet/LinearMath/btSpatialAlgebra.h b/src/bullet/LinearMath/btSpatialAlgebra.h
new file mode 100644
index 00000000..8e59658b
--- /dev/null
+++ b/src/bullet/LinearMath/btSpatialAlgebra.h
@@ -0,0 +1,331 @@
+Copyright (c) 2003-2015 Erwin Coumans, Jakub Stepien
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+///These spatial algebra classes are used for btMultiBody, 
+///see BulletDynamics/Featherstone
+#include "btMatrix3x3.h"
+struct btSpatialForceVector
+	btVector3 m_topVec, m_bottomVec;	
+	//
+	/// Default constructor: zeroes both parts.
+	btSpatialForceVector() { setZero(); }
+	/// Builds the vector from an angular and a linear part.
+	/// Storage order is the reverse of the argument order: linear goes to m_topVec, angular to m_bottomVec.
+	btSpatialForceVector(const btVector3 &angular, const btVector3 &linear) : m_topVec(linear), m_bottomVec(angular) {}
+	/// Component-wise constructor: (ax, ay, az) is the angular part, (lx, ly, lz) the linear part.
+	btSpatialForceVector(const btScalar &ax, const btScalar &ay, const btScalar &az, const btScalar &lx, const btScalar &ly, const btScalar &lz)
+	{
+		setValue(ax, ay, az, lx, ly, lz);
+	}
+	//
+	/// Overwrites both parts: linear is stored in m_topVec, angular in m_bottomVec.
+	void setVector(const btVector3 &angular, const btVector3 &linear) { m_topVec = linear; m_bottomVec = angular; }
+	/// Component-wise variant of setVector(): (ax, ay, az) is the angular part, (lx, ly, lz) the linear part.
+	void setValue(const btScalar &ax, const btScalar &ay, const btScalar &az, const btScalar &lx, const btScalar &ly, const btScalar &lz)
+	{
+		m_bottomVec.setValue(ax, ay, az); m_topVec.setValue(lx, ly, lz);
+	}
+	//
+	/// Accumulates into both parts: linear is added to m_topVec, angular to m_bottomVec.
+	void addVector(const btVector3 &angular, const btVector3 &linear) { m_topVec += linear; m_bottomVec += angular; }
+	/// Component-wise variant of addVector(): (ax, ay, az) is added to the angular part, (lx, ly, lz) to the linear part.
+	void addValue(const btScalar &ax, const btScalar &ay, const btScalar &az, const btScalar &lx, const btScalar &ly, const btScalar &lz)
+	{
+		m_bottomVec[0] += ax; m_bottomVec[1] += ay; m_bottomVec[2] += az;
+		m_topVec[0] += lx; m_topVec[1] += ly; m_topVec[2] += lz;			
+	}
+	//
+	/// Read access: for a force vector the linear part is m_topVec and the angular part is m_bottomVec.
+	const btVector3 & getLinear()  const { return m_topVec; }
+	const btVector3 & getAngular() const { return m_bottomVec; }
+	//
+	/// Overwrite a single part, leaving the other one unchanged.
+	void setLinear(const btVector3 &linear) { m_topVec = linear; }
+	void setAngular(const btVector3 &angular) { m_bottomVec = angular; }
+	//
+	/// Accumulate into a single part, leaving the other one unchanged.
+	void addAngular(const btVector3 &angular) { m_bottomVec += angular; }
+	void addLinear(const btVector3 &linear) { m_topVec += linear; }
+	//
+	/// Resets both parts to zero.
+	void setZero() { m_topVec.setZero(); m_bottomVec.setZero(); }
+	//
+	/// In-place component-wise addition / subtraction; both return *this.
+	btSpatialForceVector & operator += (const btSpatialForceVector &vec) { m_topVec += vec.m_topVec; m_bottomVec += vec.m_bottomVec; return *this; }
+	btSpatialForceVector & operator -= (const btSpatialForceVector &vec) { m_topVec -= vec.m_topVec; m_bottomVec -= vec.m_bottomVec; return *this; }
+	/// Value-returning operators. The (angular, linear) constructor is used, hence m_bottomVec is passed first.
+	btSpatialForceVector operator - (const btSpatialForceVector &vec) const { return btSpatialForceVector(m_bottomVec - vec.m_bottomVec, m_topVec - vec.m_topVec); }
+	btSpatialForceVector operator + (const btSpatialForceVector &vec) const { return btSpatialForceVector(m_bottomVec + vec.m_bottomVec, m_topVec + vec.m_topVec); }
+	/// Negation of both parts.
+	btSpatialForceVector operator - () const { return btSpatialForceVector(-m_bottomVec, -m_topVec); }
+	/// Scales both parts by s.
+	btSpatialForceVector operator * (const btScalar &s) const { return btSpatialForceVector(s * m_bottomVec, s * m_topVec); }		
+	//btSpatialForceVector & operator = (const btSpatialForceVector &vec) { m_topVec = vec.m_topVec; m_bottomVec = vec.m_bottomVec; return *this; }
+struct btSpatialMotionVector
+	btVector3 m_topVec, m_bottomVec;
+	//
+	/// Default constructor: zeroes both parts.
+	btSpatialMotionVector() { setZero(); }
+	/// Builds the vector from an angular and a linear part: angular goes to m_topVec, linear to m_bottomVec.
+	btSpatialMotionVector(const btVector3 &angular, const btVector3 &linear) : m_topVec(angular), m_bottomVec(linear) {}		
+	//
+	/// Overwrites both parts: angular is stored in m_topVec, linear in m_bottomVec.
+	void setVector(const btVector3 &angular, const btVector3 &linear) { m_topVec = angular; m_bottomVec = linear; }
+	/// Component-wise variant of setVector(): (ax, ay, az) is the angular part, (lx, ly, lz) the linear part.
+	void setValue(const btScalar &ax, const btScalar &ay, const btScalar &az, const btScalar &lx, const btScalar &ly, const btScalar &lz)
+	{
+		m_topVec.setValue(ax, ay, az); m_bottomVec.setValue(lx, ly, lz);
+	}
+	//
+	/// Accumulates an angular and a linear contribution into this motion vector.
+	/// For a motion vector the angular part lives in m_topVec and the linear part in m_bottomVec
+	/// (same layout as setVector(), addValue() and the getters), so angular must be added to the top.
+	void addVector(const btVector3 &angular, const btVector3 &linear) { m_topVec += angular; m_bottomVec += linear; }
+	/// Component-wise accumulation: (ax, ay, az) is added to the angular part (m_topVec),
+	/// (lx, ly, lz) to the linear part (m_bottomVec).
+	void addValue(const btScalar &ax, const btScalar &ay, const btScalar &az, const btScalar &lx, const btScalar &ly, const btScalar &lz)
+	{
+		m_topVec[0] += ax; m_topVec[1] += ay; m_topVec[2] += az;
+		m_bottomVec[0] += lx; m_bottomVec[1] += ly; m_bottomVec[2] += lz;			
+	}
+	//	
+	/// Read access: for a motion vector the angular part is m_topVec and the linear part is m_bottomVec.
+	const btVector3 & getAngular() const { return m_topVec; }
+	const btVector3 & getLinear() const { return m_bottomVec; }
+	//
+	/// Overwrite a single part, leaving the other one unchanged.
+	void setAngular(const btVector3 &angular) { m_topVec = angular; }
+	void setLinear(const btVector3 &linear) { m_bottomVec = linear; }
+	//
+	/// Accumulate into a single part, leaving the other one unchanged.
+	void addAngular(const btVector3 &angular) { m_topVec += angular; }
+	void addLinear(const btVector3 &linear) { m_bottomVec += linear; }
+	//
+	/// Resets both parts to zero.
+	void setZero() { m_topVec.setZero(); m_bottomVec.setZero(); }
+	//
+	/// Scalar product of this motion vector with a force vector.
+	/// The two types store their parts in opposite order (motion: top = angular, bottom = linear;
+	/// force: top = linear, bottom = angular), so pairing bottom with top and top with bottom
+	/// yields linear.dot(linear) + angular.dot(angular).
+	btScalar dot(const btSpatialForceVector &b) const
+	{
+		return m_bottomVec.dot(b.m_topVec) + m_topVec.dot(b.m_bottomVec);
+	}
+	//
+	/// Spatial cross product of this motion vector with b, written to out:
+	///   out.top    = top x b.top
+	///   out.bottom = bottom x b.top + top x b.bottom
+	/// out.m_topVec is written before b.m_topVec is read for the bottom part, so b and out
+	/// must not be the same object.
+	template<typename SpatialVectorType>
+	void cross(const SpatialVectorType &b, SpatialVectorType &out) const
+	{
+		out.m_topVec = m_topVec.cross(b.m_topVec);
+		out.m_bottomVec = m_bottomVec.cross(b.m_topVec) + m_topVec.cross(b.m_bottomVec);
+	}
+	/// Value-returning form of the spatial cross product:
+	///   result.top    = top x b.top
+	///   result.bottom = bottom x b.top + top x b.bottom
+	template<typename SpatialVectorType>
+	SpatialVectorType cross(const SpatialVectorType &b) const
+	{
+		// Reuse the out-parameter overload; the local result can never alias b.
+		SpatialVectorType result;
+		cross(b, result);
+		return result;
+	}
+	//
+	/// In-place component-wise addition, subtraction and scaling; all return *this.
+	btSpatialMotionVector & operator += (const btSpatialMotionVector &vec) { m_topVec += vec.m_topVec; m_bottomVec += vec.m_bottomVec; return *this; }
+	btSpatialMotionVector & operator -= (const btSpatialMotionVector &vec) { m_topVec -= vec.m_topVec; m_bottomVec -= vec.m_bottomVec; return *this; }
+	btSpatialMotionVector & operator *= (const btScalar &s) { m_topVec *= s; m_bottomVec *= s; return *this; }
+	/// Value-returning operators, built with the (angular, linear) = (top, bottom) constructor.
+	btSpatialMotionVector operator - (const btSpatialMotionVector &vec) const { return btSpatialMotionVector(m_topVec - vec.m_topVec, m_bottomVec - vec.m_bottomVec); }
+	btSpatialMotionVector operator + (const btSpatialMotionVector &vec) const { return btSpatialMotionVector(m_topVec + vec.m_topVec, m_bottomVec + vec.m_bottomVec); }
+	/// Negation of both parts.
+	btSpatialMotionVector operator - () const { return btSpatialMotionVector(-m_topVec, -m_bottomVec); }
+	/// Scales both parts by s.
+	btSpatialMotionVector operator * (const btScalar &s) const { return btSpatialMotionVector(s * m_topVec, s * m_bottomVec); }
+struct btSymmetricSpatialDyad
+	btMatrix3x3 m_topLeftMat, m_topRightMat, m_bottomLeftMat;
+	//		
+	/// Default constructor: sets all three stored blocks to the 3x3 identity (see setIdentity()).
+	btSymmetricSpatialDyad() { setIdentity(); }
+	/// Builds the dyad from its three stored 3x3 blocks.
+	btSymmetricSpatialDyad(const btMatrix3x3 &topLeftMat, const btMatrix3x3 &topRightMat, const btMatrix3x3 &bottomLeftMat) { setMatrix(topLeftMat, topRightMat, bottomLeftMat); }			
+	//
+	/// Overwrites the three stored blocks with the given matrices.
+	void setMatrix(const btMatrix3x3 &topLeftMat, const btMatrix3x3 &topRightMat, const btMatrix3x3 &bottomLeftMat)
+	{
+		m_topLeftMat = topLeftMat;
+		m_topRightMat = topRightMat;
+		m_bottomLeftMat = bottomLeftMat;
+	}
+	//
+	/// Adds the given matrices block-wise to the three stored blocks.
+	void addMatrix(const btMatrix3x3 &topLeftMat, const btMatrix3x3 &topRightMat, const btMatrix3x3 &bottomLeftMat)
+	{
+		m_topLeftMat += topLeftMat;
+		m_topRightMat += topRightMat;
+		m_bottomLeftMat += bottomLeftMat;
+	}
+	//
+	/// Sets each of the three stored 3x3 blocks to the 3x3 identity matrix.
+	/// Note that this includes the off-diagonal blocks m_topRightMat and m_bottomLeftMat.
+	void setIdentity() { m_topLeftMat.setIdentity(); m_topRightMat.setIdentity(); m_bottomLeftMat.setIdentity();  }
+	//
+	/// Subtracts mat block-wise from this dyad and returns *this.
+	btSymmetricSpatialDyad & operator -= (const btSymmetricSpatialDyad &mat)
+	{
+		m_topLeftMat -= mat.m_topLeftMat;
+		m_topRightMat -= mat.m_topRightMat;
+		m_bottomLeftMat -= mat.m_bottomLeftMat;
+		return *this; 
+	}
+	//
+	/// Applies the dyad to a motion vector and returns the resulting force vector.
+	/// With vec = (top, bottom) the result is
+	///   linear  (force top)    = topLeft * top + topRight * bottom
+	///   angular (force bottom) = bottomLeft * top + topLeft^T * bottom
+	/// i.e. the fourth (bottom-right) block is taken to be the transpose of m_topLeftMat.
+	/// The operator does not modify the dyad, so it is const and usable on const objects.
+	btSpatialForceVector operator * (const btSpatialMotionVector &vec) const
+	{
+		// btSpatialForceVector's constructor takes (angular, linear), hence the bottom row comes first.
+		return btSpatialForceVector(m_bottomLeftMat * vec.m_topVec + m_topLeftMat.transpose() * vec.m_bottomVec, m_topLeftMat * vec.m_topVec + m_topRightMat * vec.m_bottomVec);
+	}
+struct btSpatialTransformationMatrix
+	btMatrix3x3 m_rotMat; //btMatrix3x3 m_trnCrossMat;
+	btVector3 m_trnVec;
+	//
+	/// Selects how the transform*() functions combine their result with the output argument.
+	enum eOutputOperation
+	{
+		None = 0,		///< output = result (overwrite)
+		Add = 1,		///< output += result
+		Subtract = 2	///< output -= result
+	};
+	//
+	/// Applies this spatial transform to inVec and stores or accumulates the result in outVec.
+	/// With R = m_rotMat and r = m_trnVec the transformed vector is
+	///   top    = R * inVec.top
+	///   bottom = -r x top + R * inVec.bottom
+	/// @param inVec  vector to transform
+	/// @param outVec overwritten (None), incremented (Add) or decremented (Subtract) by the transformed vector
+	/// @param outOp  how the result is combined with outVec; any other value leaves outVec untouched
+	template<typename SpatialVectorType>
+	void transform(	const SpatialVectorType &inVec,
+                      SpatialVectorType &outVec,
+					eOutputOperation outOp = None)
+	{
+		// Build the transformed vector from inVec alone before touching outVec. The cross-product term
+		// needs the freshly rotated top part; reading it back from outVec.m_topVec is only valid for
+		// None, because for Add/Subtract outVec.m_topVec already holds the accumulated value.
+		const btVector3 rotatedTop = m_rotMat * inVec.m_topVec;
+		const btVector3 transformedBottom = -m_trnVec.cross(rotatedTop) + m_rotMat * inVec.m_bottomVec;
+		if(outOp == None)
+		{
+			outVec.m_topVec = rotatedTop;
+			outVec.m_bottomVec = transformedBottom;
+		}
+		else if(outOp == Add)
+		{
+			outVec.m_topVec += rotatedTop;
+			outVec.m_bottomVec += transformedBottom;
+		}
+		else if(outOp == Subtract)
+		{
+			outVec.m_topVec -= rotatedTop;
+			outVec.m_bottomVec -= transformedBottom;
+		}
+	}
+	/// Rotates both parts of inVec by m_rotMat, ignoring the translation m_trnVec, and
+	/// overwrites / adds to / subtracts from outVec according to outOp.
+	template<typename SpatialVectorType>
+	void transformRotationOnly(	const SpatialVectorType &inVec,
+								SpatialVectorType &outVec,
+								eOutputOperation outOp = None)
+	{
+		switch(outOp)
+		{
+		case None:
+			outVec.m_topVec = m_rotMat * inVec.m_topVec;
+			outVec.m_bottomVec = m_rotMat * inVec.m_bottomVec;
+			break;
+		case Add:
+			outVec.m_topVec += m_rotMat * inVec.m_topVec;
+			outVec.m_bottomVec += m_rotMat * inVec.m_bottomVec;
+			break;
+		case Subtract:
+			outVec.m_topVec -= m_rotMat * inVec.m_topVec;
+			outVec.m_bottomVec -= m_rotMat * inVec.m_bottomVec;
+			break;
+		default:
+			// unrecognised operation: outVec is left untouched
+			break;
+		}
+	}
+	/// Applies the inverse of this spatial transform to inVec. With R = m_rotMat and r = m_trnVec:
+	///   top    = R^T * inVec.top
+	///   bottom = R^T * (inVec.bottom + r x inVec.top)
+	/// The result overwrites / is added to / is subtracted from outVec according to outOp.
+	template<typename SpatialVectorType>
+	void transformInverse(	const SpatialVectorType &inVec,
+							SpatialVectorType &outVec,
+							eOutputOperation outOp = None)
+	{
+		const btMatrix3x3 rotT = m_rotMat.transpose();
+		switch(outOp)
+		{
+		case None:
+			outVec.m_topVec = rotT * inVec.m_topVec;
+			outVec.m_bottomVec = rotT * (inVec.m_bottomVec + m_trnVec.cross(inVec.m_topVec));
+			break;
+		case Add:
+			outVec.m_topVec += rotT * inVec.m_topVec;
+			outVec.m_bottomVec += rotT * (inVec.m_bottomVec + m_trnVec.cross(inVec.m_topVec));
+			break;
+		case Subtract:
+			outVec.m_topVec -= rotT * inVec.m_topVec;
+			outVec.m_bottomVec -= rotT * (inVec.m_bottomVec + m_trnVec.cross(inVec.m_topVec));
+			break;
+		default:
+			// unrecognised operation: outVec is left untouched
+			break;
+		}
+	}
+	/// Rotates both parts of inVec by the transpose of m_rotMat, ignoring the translation m_trnVec,
+	/// and overwrites / adds to / subtracts from outVec according to outOp.
+	template<typename SpatialVectorType>
+	void transformInverseRotationOnly(	const SpatialVectorType &inVec,
+										SpatialVectorType &outVec,
+										eOutputOperation outOp = None)
+	{
+		const btMatrix3x3 rotT = m_rotMat.transpose();
+		switch(outOp)
+		{
+		case None:
+			outVec.m_topVec = rotT * inVec.m_topVec;
+			outVec.m_bottomVec = rotT * inVec.m_bottomVec;
+			break;
+		case Add:
+			outVec.m_topVec += rotT * inVec.m_topVec;
+			outVec.m_bottomVec += rotT * inVec.m_bottomVec;
+			break;
+		case Subtract:
+			outVec.m_topVec -= rotT * inVec.m_topVec;
+			outVec.m_bottomVec -= rotT * inVec.m_bottomVec;
+			break;
+		default:
+			// unrecognised operation: outVec is left untouched
+			break;
+		}
+	}
+	/// Applies the inverse of this spatial transform to a symmetric spatial dyad.
+	/// With R = m_rotMat, rx = the cross-product matrix of m_trnVec and S = inMat.topLeft - inMat.topRight * rx:
+	///   topLeft    = R^T * S * R
+	///   topRight   = R^T * inMat.topRight * R
+	///   bottomLeft = R^T * (rx * S + inMat.bottomLeft - inMat.topLeft^T * rx) * R
+	/// All three blocks are computed from inMat before outMat is written, so the result is also
+	/// correct when inMat and outMat refer to the same object.
+	/// @param inMat  dyad to transform
+	/// @param outMat overwritten (None), incremented (Add) or decremented (Subtract) by the transformed dyad
+	/// @param outOp  how the result is combined with outMat; any other value leaves outMat untouched
+	void transformInverse(	const btSymmetricSpatialDyad &inMat,
+							btSymmetricSpatialDyad &outMat,
+							eOutputOperation outOp = None)
+	{
+		// Skew-symmetric matrix such that r_cross * v == m_trnVec.cross(v).
+		const btMatrix3x3 r_cross(	0, -m_trnVec[2], m_trnVec[1],
+								m_trnVec[2], 0, -m_trnVec[0],
+								-m_trnVec[1], m_trnVec[0], 0);
+		const btMatrix3x3 rotT = m_rotMat.transpose();
+		// Shared sub-expression of the top-left and bottom-left blocks.
+		const btMatrix3x3 shiftedTopLeft = inMat.m_topLeftMat - inMat.m_topRightMat * r_cross;
+		const btMatrix3x3 topLeft = rotT * shiftedTopLeft * m_rotMat;
+		const btMatrix3x3 topRight = rotT * inMat.m_topRightMat * m_rotMat;
+		const btMatrix3x3 bottomLeft = rotT * (r_cross * shiftedTopLeft + inMat.m_bottomLeftMat - inMat.m_topLeftMat.transpose() * r_cross) * m_rotMat;
+		if(outOp == None)
+		{
+			outMat.m_topLeftMat = topLeft;
+			outMat.m_topRightMat = topRight;
+			outMat.m_bottomLeftMat = bottomLeft;
+		}
+		else if(outOp == Add)
+		{
+			outMat.m_topLeftMat += topLeft;
+			outMat.m_topRightMat += topRight;
+			outMat.m_bottomLeftMat += bottomLeft;
+		}
+		else if(outOp == Subtract)
+		{
+			outMat.m_topLeftMat -= topLeft;
+			outMat.m_topRightMat -= topRight;
+			outMat.m_bottomLeftMat -= bottomLeft;
+		}
+	}
+	/// Convenience operator: returns transform(vec) by value (output operation None).
+	/// SpatialVectorType must be default-constructible.
+	template<typename SpatialVectorType>
+	SpatialVectorType operator * (const SpatialVectorType &vec)
+	{
+		SpatialVectorType out;
+		transform(vec, out);
+		return out;
+	}
+///Writes the three stored blocks of the spatial outer product of a and b into out:
+///  topLeft    = outerProduct(a.top,    b.bottom)
+///  topRight   = outerProduct(a.top,    b.top)
+///  bottomLeft = outerProduct(a.bottom, b.bottom)
+///This is the same block assignment the value-returning overload uses.
+template<typename SpatialVectorType>
+void symmetricSpatialOuterProduct(const SpatialVectorType &a, const SpatialVectorType &b, btSymmetricSpatialDyad &out)
+	//output op maybe?
+	out.m_topLeftMat = outerProduct(a.m_topVec, b.m_bottomVec);
+	out.m_topRightMat = outerProduct(a.m_topVec, b.m_topVec);
+	out.m_bottomLeftMat = outerProduct(a.m_bottomVec, b.m_bottomVec);
+	//maybe simple a*spatTranspose(a) would be nicer?
+///Value-returning form of the spatial outer product of a and b. The returned dyad holds
+///  topLeft    = outerProduct(a.top,    b.bottom)
+///  topRight   = outerProduct(a.top,    b.top)
+///  bottomLeft = outerProduct(a.bottom, b.bottom)
+template<typename SpatialVectorType>
+btSymmetricSpatialDyad symmetricSpatialOuterProduct(const SpatialVectorType &a, const SpatialVectorType &b)
+	btSymmetricSpatialDyad out;
+	out.m_topLeftMat = outerProduct(a.m_topVec, b.m_bottomVec);
+	out.m_topRightMat = outerProduct(a.m_topVec, b.m_topVec);
+	out.m_bottomLeftMat = outerProduct(a.m_bottomVec, b.m_bottomVec);
+	return out;
+	//maybe simple a*spatTranspose(a) would be nicer?
diff --git a/src/bullet/LinearMath/btTransform.h b/src/bullet/LinearMath/btTransform.h
index 5e52d183..d4f939a5 100644
--- a/src/bullet/LinearMath/btTransform.h
+++ b/src/bullet/LinearMath/btTransform.h
@@ -31,7 +31,7 @@ subject to the following restrictions:
 /**@brief The btTransform class supports rigid transforms with only translation and rotation and no scaling/shear.
  *It can be used in combination with btVector3, btQuaternion and btMatrix3x3 linear algebra classes. */
-class btTransform {
+ATTRIBUTE_ALIGNED16(class) btTransform {
   ///Storage for the rotation
 	btMatrix3x3 m_basis;
@@ -93,9 +93,7 @@ public:
 /**@brief Return the transform of the vector */
 	SIMD_FORCE_INLINE btVector3 operator()(const btVector3& x) const
-		return btVector3(m_basis[0].dot(x) + m_origin.x(), 
-			m_basis[1].dot(x) + m_origin.y(), 
-			m_basis[2].dot(x) + m_origin.z());
+        return x.dot3(m_basis[0], m_basis[1], m_basis[2]) + m_origin;
   /**@brief Return the transform of the vector */
@@ -129,7 +127,7 @@ public:
   /**@brief Set from an array 
-   * @param m A pointer to a 15 element array (12 rotation(row major padded on the right by 1), and 3 translation */
+   * @param m A pointer to a 16 element array (12 rotation(row major padded on the right by 1), and 3 translation */
 	void setFromOpenGLMatrix(const btScalar *m)
@@ -137,7 +135,7 @@ public:
   /**@brief Fill an array representation
-   * @param m A pointer to a 15 element array (12 rotation(row major padded on the right by 1), and 3 translation */
+   * @param m A pointer to a 16 element array (12 rotation(row major padded on the right by 1), and 3 translation */
 	void getOpenGLMatrix(btScalar *m) const 
diff --git a/src/bullet/LinearMath/btVector3.cpp b/src/bullet/LinearMath/btVector3.cpp
new file mode 100644
index 00000000..e05bdccd
--- /dev/null
+++ b/src/bullet/LinearMath/btVector3.cpp
@@ -0,0 +1,1670 @@
+ Copyright (c) 2011 Apple Inc.
+ http://continuousphysics.com/Bullet/
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from the use of this software.
+ Permission is granted to anyone to use this software for any purpose, 
+ including commercial applications, and to alter it and redistribute it freely, 
+ subject to the following restrictions:
+ 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+ This source version has been altered.
+ */
+#if defined (_WIN32) || defined (__i386__)
+#define BT_USE_SSE_IN_API
+#include "btVector3.h"
+#if defined BT_USE_SIMD_VECTOR3
+#if DEBUG
+#include <string.h>//for memset
+#ifdef __APPLE__
+#include <stdint.h>
+typedef  float float4 __attribute__ ((vector_size(16)));
+#define float4 __m128
+//typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
+#if defined BT_USE_SSE || defined _WIN32
+#define LOG2_ARRAY_SIZE     6
+#include <emmintrin.h>
+long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
+long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
+    const float4 *vertices = (const float4*) vv;
+    static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
+    float4 dotMax = btAssign128( -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY );
+    float4 vvec = _mm_loadu_ps( vec );
+    float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          /// zzzz
+    float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
+    long maxIndex = -1L;
+    size_t segment = 0;
+    float4 stack_array[ STACK_ARRAY_COUNT ];
+#if DEBUG
+    //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+    size_t index;
+    float4 max;
+    // Faster loop without cleanup code for full tiles
+    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) 
+    {
+        max = dotMax;
+        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )   
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
+        }
+        // If we found a new max
+        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
+        { 
+            // copy the new max across all lanes of our max accumulator
+            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
+            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
+            dotMax = max;
+            // find first occurrence of that max  
+            size_t test;
+            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
+            {}
+            // record where it is.
+            maxIndex = 4*index + segment + indexTable[test];
+        }
+    }
+    // account for work we've already done
+    count -= segment;
+    // Deal with the last < STACK_ARRAY_COUNT vectors
+    max = dotMax;
+    index = 0;
+    if( btUnlikely( count > 16) )
+    {
+        for( ; index + 4 <= count / 4; index+=4 )   
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
+        }
+    }
+    size_t localCount = (count & -4L) - 4*index;
+    if( localCount )
+    {
+#ifdef __APPLE__
+        float4 t0, t1, t2, t3, t4;
+        float4 * sap = &stack_array[index + localCount / 4];
+          vertices += localCount;      // counter the offset
+         size_t byteIndex = -(localCount) * sizeof(float);
+        //AT&T Code style assembly
+        asm volatile
+        (   ".align 4                                                                   \n\
+             0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
+          movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
+          movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
+          movaps  %[t0], %[max]                               // vertices[0]      \n\
+          movlhps %[t1], %[max]                               // x0y0x1y1         \n\
+         movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
+         movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
+          mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
+         movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
+         movaps  %[t3], %[t0]                                // vertices[2]      \n\
+         movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
+         mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
+          movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
+          shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
+          mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
+         movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
+         shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
+         shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
+         addps   %[t3], %[max]                               // x + y            \n\
+         addps   %[t1], %[max]                               // x + y + z        \n\
+         movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
+         maxps   %[t2], %[max]                               // record max, restore max   \n\
+         add     $16, %[byteIndex]                           // advance loop counter\n\
+         jnz     0b                                          \n\
+     "
+         : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
+         : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
+         : "memory", "cc"
+         );
+        index += localCount/4;
+        {
+            for( unsigned int i=0; i<localCount/4; i++,index++)   
+            { // do four dot products at a time. Carefully avoid touching the w element.
+                float4 v0 = vertices[0];
+                float4 v1 = vertices[1];
+                float4 v2 = vertices[2];
+                float4 v3 = vertices[3];            
+                vertices += 4;
+                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+                lo0 = lo0*vLo;
+                lo1 = lo1*vLo;
+                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+                z = z*vHi;
+                x = x+y;
+                x = x+z;
+                stack_array[index] = x;
+                max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            }
+        }
+#endif //__APPLE__
+    }
+    // process the last few points
+    if( count & 3 )
+    {
+        float4 v0, v1, v2, x, y, z;
+        switch( count & 3 )
+        {
+            case 3:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                v2 = vertices[2];
+                // Calculate 3 dot products, transpose, duplicate v2
+                float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
+                float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
+                lo0 = lo0*vLo;
+                z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
+                z = z*vHi;
+                float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
+                lo1 = lo1*vLo;
+                x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            }
+                break;
+            case 2:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                float4 xy = _mm_movelh_ps(v0, v1);
+                z = _mm_movehl_ps(v1, v0);
+                xy = xy*vLo;
+                z = _mm_shuffle_ps( z, z,  0xa8);
+                x = _mm_shuffle_ps( xy, xy, 0xa8);
+                y = _mm_shuffle_ps( xy, xy, 0xfd);
+                z = z*vHi;
+            }
+                break;
+            case 1:
+            {
+                float4 xy = vertices[0];
+                z =  _mm_shuffle_ps( xy, xy, 0xaa);
+                xy = xy*vLo;
+                z = z*vHi;
+                x = _mm_shuffle_ps(xy, xy, 0);
+                y = _mm_shuffle_ps(xy, xy, 0x55);
+            }
+                break;
+        }
+        x = x+y;
+        x = x+z;
+        stack_array[index] = x;
+        max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+        index++;
+    }
+    // if we found a new max. 
+    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
+    { // we found a new max. Search for it
+      // find max across the max vector, place in all elements of max -- big latency hit here
+        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
+        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
+        // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+        // this where it actually makes a difference is handled in the early out at the top of the function, 
+        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced 
+        // complexity, and removed it.
+        dotMax = max;
+        // scan for the first occurence of max in the array  
+        size_t test;
+        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
+        {}
+        maxIndex = 4*index + segment + indexTable[test];
+    }
+    _mm_store_ss( dotResult, dotMax);
+    return maxIndex;
+long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );   // forward declaration; the definition follows immediately
+long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )   // Finds the vertex whose xyz has the smallest dot product with vec: returns its index and stores that dot value in *dotResult
+    const float4 *vertices = (const float4*) vv;   // vertices are stepped one float4 (4 floats) at a time; w is never multiplied in. Assumes vv is 16-byte aligned (movaps is used on it below) -- TODO confirm with callers
+    static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };   // maps a 4-bit compare mask to the lane of its lowest set bit; entry 0 (no bit set) holds 255
+    float4 dotmin = btAssign128( BT_INFINITY,  BT_INFINITY,  BT_INFINITY,  BT_INFINITY );   // best value found by completed work so far, seeded with BT_INFINITY in every lane
+    float4 vvec = _mm_loadu_ps( vec );   // unaligned load of four floats starting at vec
+    float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          /// zzzz: element 2 of vec copied to all four lanes
+    float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy: low half of vec repeated twice
+    long minIndex = -1L;   // stays -1 unless one of the searches below records an index
+    size_t segment = 0;   // index of the first vertex of the tile being processed
+    float4 stack_array[ STACK_ARRAY_COUNT ];   // scratch: each entry holds the dot products of four consecutive vertices
+#if DEBUG
+    //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+    size_t index;   // position within stack_array (one entry per four vertices)
+    float4 min;   // per-lane minimum accumulator for the tile being processed
+    // Faster loop without cleanup code for full tiles (STACK_ARRAY_COUNT*4 vertices per tile)
+    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) 
+    {
+        min = dotmin;
+        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )   // body is unrolled 4x: each pass consumes 16 vertices and fills four stack_array entries
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
+        }
+        // If we found a new min
+        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))   // at least one lane of min no longer equals dotmin
+        { 
+            // copy the new min across all lanes of our min accumulator
+            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
+            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
+            dotmin = min;
+            // find first occurrence of that min  
+            size_t test;
+            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
+            {}
+            // record where it is.
+            minIndex = 4*index + segment + indexTable[test];   // 4 vertices per entry + tile offset + lane within the entry
+        }
+    }
+    // account for work we've already done
+    count -= segment;
+    // Deal with the last < STACK_ARRAY_COUNT vectors
+    min = dotmin;
+    index = 0;
+    if(btUnlikely( count > 16) )   // the 4x-unrolled loop is only entered when more than 16 vertices remain
+    {
+        for( ; index + 4 <= count / 4; index+=4 )   
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
+        }
+    }
+    size_t localCount = (count & -4L) - 4*index;   // count rounded down to a multiple of 4, minus the vertices the loop above already consumed
+    if( localCount )
+    {
+#ifdef __APPLE__
+        vertices += localCount;      // advance the pointer up front; the asm compensates by indexing with the negative byteIndex
+        float4 t0, t1, t2, t3, t4;
+        size_t byteIndex = -(localCount) * sizeof(float);   // negative offset (wraps in size_t): used directly on sap and scaled by 4 on vertices; stepped by 16 until it reaches zero
+        float4 * sap = &stack_array[index + localCount / 4];   // one past the last stack_array entry the asm loop writes
+        asm volatile   // loop over groups of four vertices; each pass stores one result vector at sap + byteIndex and folds it into min
+        (   ".align 4                                                                   \n\
+             0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
+             movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
+             movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
+             movaps  %[t0], %[min]                               // vertices[0]      \n\
+             movlhps %[t1], %[min]                               // x0y0x1y1         \n\
+             movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
+             movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
+             mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
+             movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
+             movaps  %[t3], %[t0]                                // vertices[2]      \n\
+             movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
+             movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
+             mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
+             shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
+             mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
+             movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
+             shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
+             shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
+             addps   %[t3], %[min]                               // x + y            \n\
+             addps   %[t1], %[min]                               // x + y + z        \n\
+             movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
+             minps   %[t2], %[min]                               // record min, restore min   \n\
+             add     $16, %[byteIndex]                           // advance loop counter\n\
+             jnz     0b                                          \n\
+             "
+         : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
+         : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
+         : "memory", "cc"
+         );
+        index += localCount/4;   // skip past the stack_array entries the asm loop filled
+        {   // NOTE(review): no #else/#endif is visible in this excerpt between the asm above and this loop; presumably this is the non-Apple alternative -- confirm in the full file
+            for( unsigned int i=0; i<localCount/4; i++,index++)   
+            { // do four dot products at a time. Carefully avoid touching the w element.
+                float4 v0 = vertices[0];
+                float4 v1 = vertices[1];
+                float4 v2 = vertices[2];
+                float4 v3 = vertices[3];            
+                vertices += 4;
+                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+                lo0 = lo0*vLo;
+                lo1 = lo1*vLo;
+                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+                z = z*vHi;
+                x = x+y;
+                x = x+z;
+                stack_array[index] = x;
+                min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            }
+        }
+    }
+    // process the last few points
+    if( count & 3 )   // one to three vertices are left over
+    {
+        float4 v0, v1, v2, x, y, z;
+        switch( count & 3 )
+        {
+            case 3:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                v2 = vertices[2];
+                // Calculate 3 dot products, transpose, duplicate v2
+                float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
+                float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
+                lo0 = lo0*vLo;
+                z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
+                z = z*vHi;
+                float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
+                lo1 = lo1*vLo;
+                x = _mm_shuffle_ps(lo0, lo1, 0x88);   // x0x1x2x2 (already scaled by vLo)
+                y = _mm_shuffle_ps(lo0, lo1, 0xdd);   // y0y1y2y2 (already scaled by vLo)
+            }
+                break;
+            case 2:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                float4 xy = _mm_movelh_ps(v0, v1);
+                z = _mm_movehl_ps(v1, v0);
+                xy = xy*vLo;
+                z = _mm_shuffle_ps( z, z,  0xa8);   // z0z1z1z1
+                x = _mm_shuffle_ps( xy, xy, 0xa8);   // x0x1x1x1 (already scaled by vLo)
+                y = _mm_shuffle_ps( xy, xy, 0xfd);   // y0y1y1y1 (already scaled by vLo)
+                z = z*vHi;
+            }
+                break;
+            case 1:
+            {
+                float4 xy = vertices[0];
+                z =  _mm_shuffle_ps( xy, xy, 0xaa);   // z0 in every lane
+                xy = xy*vLo;
+                z = z*vHi;
+                x = _mm_shuffle_ps(xy, xy, 0);   // x0 (scaled) in every lane
+                y = _mm_shuffle_ps(xy, xy, 0x55);   // y0 (scaled) in every lane
+            }
+                break;
+        }
+        x = x+y;
+        x = x+z;
+        stack_array[index] = x;   // the surplus lanes repeat the last real vertex, so they only duplicate an existing value
+        min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+        index++;
+    }
+    // if we found a new min. 
+    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))   // segment is 0 when the full-tile loop never ran; the search is then done unconditionally
+    { // we found a new min. Search for it
+      // find min across the min vector, place in all elements of min -- big latency hit here
+        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));   // combine with the copy whose halves are swapped
+        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));   // combine with the copy whose neighbouring lanes are swapped
+        // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+        // this where it actually makes a difference is handled in the early out at the top of the function, 
+        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced 
+        // complexity, and removed it.
+        dotmin = min;
+        // scan for the first occurrence of min in the array  
+        size_t test;
+        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
+        {}
+        minIndex = 4*index + segment + indexTable[test];   // 4 vertices per entry + offset of this tail segment + lane within the entry
+    }
+    _mm_store_ss( dotResult, dotmin);   // write lane 0 of dotmin to *dotResult
+    return minIndex;
+#elif defined BT_USE_NEON
+#include <arm_neon.h>
+#include <sys/types.h>
+#include <sys/sysctl.h> //for sysctlbyname
+static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );   // variant installed by _maxdot_large_sel when capability bit 0x2000 is clear
+static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );   // variant installed by _maxdot_large_sel when capability bit 0x2000 is set
+static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );   // selector: rebinds _maxdot_large to _v0 or _v1 and forwards the call
+static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );   // variant installed by _mindot_large_sel when capability bit 0x2000 is clear
+static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );   // variant installed by _mindot_large_sel when capability bit 0x2000 is set
+static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );   // selector: rebinds _mindot_large to _v0 or _v1 and forwards the call
+long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;   // entry point used by callers; starts at the selector, which overwrites this pointer when it runs
+long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;   // entry point used by callers; starts at the selector, which overwrites this pointer when it runs
+// Returns the CPU capability bits, querying the system only on the first call and
+// reusing the cached answer afterwards. Bit 0x2000 is set when the
+// "hw.optional.neon_hpfp" sysctl query succeeds and reports a non-zero value.
+static inline uint32_t btGetCpuCapabilities( void )
+    static uint32_t capabilities = 0;
+    static bool testedCapabilities = false;
+    if( testedCapabilities )
+        return capabilities;    // already probed: hand back the cached bits
+    uint32_t hasFeature = 0;
+    size_t featureSize = sizeof( hasFeature );
+    const bool querySucceeded = ( 0 == sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 ) );
+    if( querySucceeded && hasFeature )
+        capabilities |= 0x2000;
+    testedCapabilities = true;    // remember that the probe ran, whatever its outcome
+    return capabilities;
+// Initial target of the _maxdot_large pointer: chooses the implementation from the
+// 0x2000 capability bit, installs it in _maxdot_large, then forwards this call to it.
+static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
+    const bool useV1 = ( btGetCpuCapabilities() & 0x2000 ) != 0;
+    _maxdot_large = useV1 ? _maxdot_large_v1 : _maxdot_large_v0;
+    return _maxdot_large(vv, vec, count, dotResult);
+// Initial target of the _mindot_large pointer: chooses the implementation from the
+// 0x2000 capability bit, installs it in _mindot_large, then forwards this call to it.
+static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
+    const bool useV1 = ( btGetCpuCapabilities() & 0x2000 ) != 0;
+    _mindot_large = useV1 ? _mindot_large_v1 : _mindot_large_v0;
+    return _mindot_large(vv, vec, count, dotResult);
+#if defined __arm__   // when __arm__ is defined: load four floats via inline asm (vld1.f32 with a :128 alignment qualifier and '!' writeback on the pointer register)
+# define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
+// Support 64-bit ARM: same macro without inline asm -- read one float32x4_t through _ptr, then advance _ptr by 16 bytes. NOTE(review): no #else/#endif is visible in this excerpt between the two definitions; confirm in the full file.
+# define vld1q_f32_aligned_postincrement( _ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
+long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
+    unsigned long i = 0;
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
+    float32x2_t vLo = vget_low_f32(vvec);
+    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
+    float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
+    float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
+    uint32x2_t indexLo = (uint32x2_t) {0, 1};
+    uint32x2_t indexHi = (uint32x2_t) {2, 3};
+    uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
+    uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
+    const uint32x2_t four = (uint32x2_t) {4,4};
+    for( ; i+8 <= count; i+= 8 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
+        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four); 
+        indexHi = vadd_u32(indexHi, four);
+        v0 = vld1q_f32_aligned_postincrement( vv );
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        zLo = vmul_f32( z0.val[0], vHi);
+        zHi = vmul_f32( z1.val[0], vHi);
+        rLo = vpadd_f32( xy0, xy1);
+        rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        maskLo = vcgt_f32( rLo, dotMaxLo );
+        maskHi = vcgt_f32( rHi, dotMaxHi );
+        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+    for( ; i+4 <= count; i+= 4 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
+        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+    switch( count & 3 )
+    {
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            float32x2_t rHi = vpadd_f32( xy2, xy2);
+            rLo = vadd_f32(rLo, zLo);
+            rHi = vadd_f32(rHi, zHi);
+            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+            uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
+            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+            dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+            iHi = vbsl_u32(maskHi, indexHi, iHi);
+        }
+            break;
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            rLo = vadd_f32(rLo, zLo);
+            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+            float32x2_t zLo = vmul_f32( z0, vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy0);
+            rLo = vadd_f32(rLo, zLo);
+            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+        default:
+            break;
+    }
+    // select best answer between hi and lo results
+    uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
+    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    // select best answer between even and odd results
+    dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
+    iHi = vdup_lane_u32(iLo, 1);
+    mask = vcgt_f32( dotMaxHi, dotMaxLo );
+    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    *dotResult = vget_lane_f32( dotMaxLo, 0);
+    return vget_lane_u32(iLo, 0);
+long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult ) // Returns the index of the vertex in vv whose dot product with vec is largest and stores that dot in *dotResult; four vertices are evaluated at a time in 128-bit NEON registers.
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); // the load helper is defined outside this view; presumably it loads 4 floats and advances its pointer argument — confirm
+    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec)); // (x, y, x, y) of vec
+    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0); // z of vec in every lane; the fourth float of vec is not used
+    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
+    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3}; // vertex index each lane is about to test
+    uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) }; // best index seen per lane; a lane that is never updated keeps this all-ones value
+    float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY }; // best dot seen per lane
+    unsigned long i = 0;
+    for( ; i + 8 <= count; i += 8 ) // unrolled: two groups of four vertices per pass, one 4-float load per vertex
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); // (x0, y0, x1, y1)
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); // (x2, y2, x3, y3)
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); // (z0, w0, z1, w1)
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); // (z2, w2, z3, w3)
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        float32x4x2_t zb = vuzpq_f32( z0, z1); // val[0] = (z0, z1, z2, z3); the fourth floats land in val[1] and are ignored
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1); // val[0] = x products, val[1] = y products, one lane per vertex
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); // x*vx + y*vy
+        x = vaddq_f32(x, z); // + z*vz: the four dot products
+        uint32x4_t mask = vcgtq_f32(x, maxDot); // strict '>': a tie keeps the index already stored in that lane
+        maxDot = vbslq_f32( mask, x, maxDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four); // move on to the next group's indices
+        v0 = vld1q_f32_aligned_postincrement( vv ); // second group of four: same steps as above
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        zb = vuzpq_f32( z0, z1);
+        z = vmulq_f32( zb.val[0], vHi);
+        xy = vuzpq_f32( xy0, xy1);
+        x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        mask = vcgtq_f32(x, maxDot);
+        maxDot = vbslq_f32( mask, x, maxDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    for( ; i + 4 <= count; i += 4 ) // at most one remaining full group of four
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        float32x4x2_t zb = vuzpq_f32( z0, z1);
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        uint32x4_t mask = vcgtq_f32(x, maxDot);
+        maxDot = vbslq_f32( mask, x, maxDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    switch (count & 3) { // 1-3 leftover vertices; the unused lanes are filled with copies of loaded data
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2)); // (x2, y2, x2, y2): lane 3 repeats vertex 2
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
+            xy0 = vmulq_f32(xy0, vLo);
+            xy1 = vmulq_f32(xy1, vLo);
+            float32x4x2_t zb = vuzpq_f32( z0, z1);
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcgtq_f32(x, maxDot);
+            maxDot = vbslq_f32( mask, x, maxDot);
+            index = vbslq_u32(mask, local_index, index); // lane 3 may record an index one past the last vertex; lane 2 then holds a dot at least as large, so the strict '>' selects below never return it
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            xy0 = vmulq_f32(xy0, vLo);
+            float32x4x2_t zb = vuzpq_f32( z0, z0); // val[0] = (z0, z1, z0, z1)
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0); // lanes 2-3 repeat lanes 0-1
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcgtq_f32(x, maxDot);
+            maxDot = vbslq_f32( mask, x, maxDot);
+            index = vbslq_u32(mask, local_index, index); // the repeated lanes 2-3 carry indices past the last vertex; they cannot beat lanes 0-1 under the strict '>' reduction below
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            // duplicate the x/y pair of the single vertex into both halves
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
+            // broadcast the single z to all four lanes
+            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); 
+            xy0 = vmulq_f32(xy0, vLo);
+            z = vmulq_f32( z, vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); // the same dot product in all four lanes
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcgtq_f32(x, maxDot);
+            maxDot = vbslq_f32( mask, x, maxDot);
+            index = vbslq_u32(mask, local_index, index); // only lane 0's index is a real vertex; lane 0 holds a dot at least as large as lanes 1-3, so the strict '>' reduction below never returns theirs
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        default:
+            break;
+    }
+    // select best answer between hi and lo results
+    uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot)); // lanes (2,3) against lanes (0,1); ties keep the low half
+    float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
+    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+    // select best answer between even and odd results
+    float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
+    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+    mask = vcgt_f32( maxDotO, maxDot2 ); // only lane 0 of this comparison is consumed below
+    maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
+    index2 = vbsl_u32(mask, indexHi, index2);
+    *dotResult = vget_lane_f32( maxDot2, 0);
+    return vget_lane_u32(index2, 0); // with count == 0 no lane was ever updated: the lane still holds static_cast<uint32_t>(-1) and *dotResult is -BT_INFINITY
+long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult ) // Returns the index of the vertex in vv whose dot product with vec is smallest and stores that dot in *dotResult; works on 64-bit NEON halves, one pair of lanes for vertices 0-1 and one for vertices 2-3 of each group of four.
+    unsigned long i = 0;
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); // the load helper is defined outside this view; presumably it loads 4 floats and advances its pointer argument — confirm
+    float32x2_t vLo = vget_low_f32(vvec); // (x, y) of vec
+    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0); // (z, z) of vec; the fourth float of vec is not used
+    float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY }; // running minima for vertices 0 and 1 of each group
+    float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY }; // running minima for vertices 2 and 3 of each group
+    uint32x2_t indexLo = (uint32x2_t) {0, 1}; // vertex indices the Lo lanes are about to test
+    uint32x2_t indexHi = (uint32x2_t) {2, 3}; // vertex indices the Hi lanes are about to test
+    uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)}; // best index per Lo lane; stays all-ones until a lane is updated
+    uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)}; // best index per Hi lane
+    const uint32x2_t four = (uint32x2_t) {4,4};
+    for( ; i+8 <= count; i+= 8 ) // unrolled: two groups of four vertices per pass, one 4-float load per vertex
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); // (x0*vx, y0*vy)
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); // val[0] = (z0, z1); the fourth floats land in val[1] and are ignored
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); // val[0] = (z2, z3)
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        float32x2_t rLo = vpadd_f32( xy0, xy1); // pairwise add: (x0*vx + y0*vy, x1*vx + y1*vy)
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo); // dot products of vertices 0 and 1
+        rHi = vadd_f32(rHi, zHi); // dot products of vertices 2 and 3
+        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); // strict '<': a tie keeps the index already stored in that lane
+        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
+        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four); // move on to the next group's indices
+        indexHi = vadd_u32(indexHi, four);
+        v0 = vld1q_f32_aligned_postincrement( vv ); // second group of four: same steps as above
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        zLo = vmul_f32( z0.val[0], vHi);
+        zHi = vmul_f32( z1.val[0], vHi);
+        rLo = vpadd_f32( xy0, xy1);
+        rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        maskLo = vclt_f32( rLo, dotMinLo );
+        maskHi = vclt_f32( rHi, dotMinHi );
+        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+    for( ; i+4 <= count; i+= 4 ) // at most one remaining full group of four
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
+        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+    switch( count & 3 ) // 1-3 leftover vertices; the index vectors are not advanced again because no further group follows
+    {
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi); // z2 in both lanes
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            float32x2_t rHi = vpadd_f32( xy2, xy2); // both Hi lanes hold vertex 2
+            rLo = vadd_f32(rLo, zLo);
+            rHi = vadd_f32(rHi, zHi);
+            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+            uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
+            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+            dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+            iHi = vbsl_u32(maskHi, indexHi, iHi); // Hi lane 1 may record an index one past the last vertex; Hi lane 0 then holds a dot at least as small, so the strict '<' selects below never return it
+        }
+            break;
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            rLo = vadd_f32(rLo, zLo);
+            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); // only the Lo pair is touched; the Hi accumulators keep their previous values
+            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+            float32x2_t zLo = vmul_f32( z0, vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy0); // both Lo lanes hold the single leftover vertex
+            rLo = vadd_f32(rLo, zLo);
+            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo); // Lo lane 1 may record an index one past the last vertex; Lo lane 0 holds a dot at least as small, so the strict '<' selects below never return it
+        }
+            break;
+        default:
+            break;
+    }
+    // select best answer between hi and lo results
+    uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo ); // ties keep the Lo entry
+    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    // select best answer between even and odd results
+    dotMinHi = vdup_lane_f32(dotMinLo, 1);
+    iHi = vdup_lane_u32(iLo, 1);
+    mask = vclt_f32( dotMinHi, dotMinLo ); // only lane 0 of this comparison is consumed below
+    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    *dotResult = vget_lane_f32( dotMinLo, 0);
+    return vget_lane_u32(iLo, 0); // with count == 0 no lane was ever updated: the lane still holds static_cast<uint32_t>(-1) and *dotResult is BT_INFINITY
+long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult ) // Returns the index of the vertex in vv whose dot product with vec is smallest and stores that dot in *dotResult; four vertices are evaluated at a time in 128-bit NEON registers.
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); // the load helper is defined outside this view; presumably it loads 4 floats and advances its pointer argument — confirm
+    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec)); // (x, y, x, y) of vec
+    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0); // z of vec in every lane; the fourth float of vec is not used
+    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
+    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3}; // vertex index each lane is about to test
+    uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) }; // best index seen per lane; a lane that is never updated keeps this all-ones value
+    float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY }; // best dot seen per lane
+    unsigned long i = 0;
+    for( ; i + 8 <= count; i += 8 ) // unrolled: two groups of four vertices per pass, one 4-float load per vertex
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); // (x0, y0, x1, y1)
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); // (x2, y2, x3, y3)
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); // (z0, w0, z1, w1)
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); // (z2, w2, z3, w3)
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        float32x4x2_t zb = vuzpq_f32( z0, z1); // val[0] = (z0, z1, z2, z3); the fourth floats land in val[1] and are ignored
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1); // val[0] = x products, val[1] = y products, one lane per vertex
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); // x*vx + y*vy
+        x = vaddq_f32(x, z); // + z*vz: the four dot products
+        uint32x4_t mask = vcltq_f32(x, minDot); // strict '<': a tie keeps the index already stored in that lane
+        minDot = vbslq_f32( mask, x, minDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four); // move on to the next group's indices
+        v0 = vld1q_f32_aligned_postincrement( vv ); // second group of four: same steps as above
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        zb = vuzpq_f32( z0, z1);
+        z = vmulq_f32( zb.val[0], vHi);
+        xy = vuzpq_f32( xy0, xy1);
+        x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        mask = vcltq_f32(x, minDot);
+        minDot = vbslq_f32( mask, x, minDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    for( ; i + 4 <= count; i += 4 ) // at most one remaining full group of four
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        float32x4x2_t zb = vuzpq_f32( z0, z1);
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        uint32x4_t mask = vcltq_f32(x, minDot);
+        minDot = vbslq_f32( mask, x, minDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    switch (count & 3) { // 1-3 leftover vertices; the unused lanes are filled with copies of loaded data
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2)); // (x2, y2, x2, y2): lane 3 repeats vertex 2
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
+            xy0 = vmulq_f32(xy0, vLo);
+            xy1 = vmulq_f32(xy1, vLo);
+            float32x4x2_t zb = vuzpq_f32( z0, z1);
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcltq_f32(x, minDot);
+            minDot = vbslq_f32( mask, x, minDot);
+            index = vbslq_u32(mask, local_index, index); // lane 3 may record an index one past the last vertex; lane 2 then holds a dot at least as small, so the strict '<' selects below never return it
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            xy0 = vmulq_f32(xy0, vLo);
+            float32x4x2_t zb = vuzpq_f32( z0, z0); // val[0] = (z0, z1, z0, z1)
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0); // lanes 2-3 repeat lanes 0-1
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcltq_f32(x, minDot);
+            minDot = vbslq_f32( mask, x, minDot);
+            index = vbslq_u32(mask, local_index, index); // the repeated lanes 2-3 carry indices past the last vertex; they cannot beat lanes 0-1 under the strict '<' reduction below
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            // duplicate the x/y pair of the single vertex into both halves
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
+            // broadcast the single z to all four lanes
+            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); 
+            xy0 = vmulq_f32(xy0, vLo);
+            z = vmulq_f32( z, vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); // the same dot product in all four lanes
+            x = vaddq_f32(x, z);
+            uint32x4_t mask = vcltq_f32(x, minDot);
+            minDot = vbslq_f32( mask, x, minDot);
+            index = vbslq_u32(mask, local_index, index); // only lane 0's index is a real vertex; lane 0 holds a dot at least as small as lanes 1-3, so the strict '<' reduction below never returns theirs
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+        default:
+            break;
+    }
+    // select best answer between hi and lo results
+    uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot)); // lanes (2,3) against lanes (0,1); ties keep the low half
+    float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
+    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+    // select best answer between even and odd results
+    float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
+    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+    mask = vclt_f32( minDotO, minDot2 ); // only lane 0 of this comparison is consumed below
+    minDot2 = vbsl_f32(mask, minDotO, minDot2);
+    index2 = vbsl_u32(mask, indexHi, index2);
+    *dotResult = vget_lane_f32( minDot2, 0);
+    return vget_lane_u32(index2, 0); // with count == 0 no lane was ever updated: the lane still holds static_cast<uint32_t>(-1) and *dotResult is BT_INFINITY
+    #error Unhandled __APPLE__ arch
+#endif  /* __APPLE__ */
diff --git a/src/bullet/LinearMath/btVector3.h b/src/bullet/LinearMath/btVector3.h
index 749e7680..839b19c1 100644
--- a/src/bullet/LinearMath/btVector3.h
+++ b/src/bullet/LinearMath/btVector3.h
@@ -17,9 +17,10 @@ subject to the following restrictions:
 #ifndef BT_VECTOR3_H
 #define BT_VECTOR3_H
+//#include <stdint.h>
 #include "btScalar.h"
 #include "btMinMax.h"
+#include "btAlignedAllocator.h"
 #define btVector3Data btVector3DoubleData
@@ -29,8 +30,51 @@ subject to the following restrictions:
 #define btVector3DataName "btVector3FloatData"
+#if defined BT_USE_SSE // NOTE(review): the #else/#endif lines closing the conditionals in this section are not visible in this extract — confirm against the full header
+//typedef  uint32_t __m128i __attribute__ ((vector_size(16)));
+#ifdef _MSC_VER
+#pragma warning(disable: 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
+#define BT_SHUFFLE(x,y,z,w) ((w)<<6 | (z)<<4 | (y)<<2 | (x)) // packs four lane selectors, two bits apart, into one shuffle immediate (x in the lowest bits)
+//#define bt_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) )
+#define bt_pshufd_ps( _a, _mask ) _mm_shuffle_ps((_a), (_a), (_mask) ) // permutes the lanes of _a by shuffling it with itself
+#define bt_splat3_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i, 3) ) // lane _i copied into lanes 0-2; lane 3 keeps its own value
+#define bt_splat_ps( _a, _i )  bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i,_i) ) // lane _i copied into all four lanes
+#define btv3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)) // arguments run from lane 3 down to lane 0: 0x7FFFFFFF in lanes 0-2, zero in lane 3
+#define btvAbsMask (_mm_set_epi32( 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)) // 0x7FFFFFFF (every bit except the sign bit) in all four lanes
+#define btvFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)) // all ones in lanes 0-2, zero in lane 3
+#define btv3AbsfMask btCastiTo128f(btv3AbsiMask) // float-typed views of the integer masks above
+#define btvFFF0fMask btCastiTo128f(btvFFF0Mask)
+#define btvxyzMaskf btvFFF0fMask
+#define btvAbsfMask btCastiTo128f(btvAbsMask)
+//there is an issue with XCode 3.2 (LCx errors)
+#define btvMzeroMask (_mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f)) // -0.0f in every lane
+#define v1110		 (_mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f)) // lanes 0-2 = 1.0f, lane 3 = 0.0f (same lane layout as the commented-out initializer below)
+#define vHalf		 (_mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f))
+#define v1_5		 (_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f))
+//const __m128 ATTRIBUTE_ALIGNED16(btvMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f};
+//const __m128 ATTRIBUTE_ALIGNED16(v1110) = {1.0f, 1.0f, 1.0f, 0.0f};
+//const __m128 ATTRIBUTE_ALIGNED16(vHalf) = {0.5f, 0.5f, 0.5f, 0.5f};
+//const __m128 ATTRIBUTE_ALIGNED16(v1_5)  = {1.5f, 1.5f, 1.5f, 1.5f};
+#ifdef BT_USE_NEON // NEON counterparts, written in lane order 0..3
+const float32x4_t ATTRIBUTE_ALIGNED16(btvMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
+const int32x4_t ATTRIBUTE_ALIGNED16(btvFFF0Mask) = (int32x4_t){static_cast<int32_t>(0xFFFFFFFF),
+	static_cast<int32_t>(0xFFFFFFFF), static_cast<int32_t>(0xFFFFFFFF), 0x0}; // all ones in lanes 0-2, zero in lane 3
+const int32x4_t ATTRIBUTE_ALIGNED16(btvAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; // 0x7FFFFFFF in all four lanes
+const int32x4_t ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0}; // 0x7FFFFFFF in lanes 0-2, zero in lane 3
 /**@brief btVector3 can be used to represent 3D points and vectors.
  * It has an un-used w component to suit 16-byte alignment when btVector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by user
@@ -40,6 +84,8 @@ ATTRIBUTE_ALIGNED16(class) btVector3
 #if defined (__SPU__) && defined (__CELLOS_LV2__)
 		btScalar	m_floats[4];
@@ -49,28 +95,31 @@ public:
 #else //__CELLOS_LV2__ __SPU__
-#ifdef BT_USE_SSE // _WIN32
-	union {
-		__m128 mVec128;
-		btScalar	m_floats[4];
-	};
-	SIMD_FORCE_INLINE	__m128	get128() const
-	{
-		return mVec128;
-	}
-	SIMD_FORCE_INLINE	void	set128(__m128 v128)
-	{
-		mVec128 = v128;
-	}
-	btScalar	m_floats[4];
+    #if defined (BT_USE_SSE) || defined(BT_USE_NEON) // _WIN32 || ARM
+        union { // the SIMD value and the four scalars share the same storage
+            btSimdFloat4      mVec128;
+            btScalar	m_floats[4];
+        };
+        SIMD_FORCE_INLINE	btSimdFloat4	get128() const // returns the whole vector as one SIMD value
+        {
+            return mVec128;
+        }
+        SIMD_FORCE_INLINE	void	set128(btSimdFloat4 v128) // overwrites all four components at once
+        {
+            mVec128 = v128;
+        }
+    #else
+        btScalar	m_floats[4]; // builds without SSE/NEON: plain array of four components
+    #endif
 #endif //__CELLOS_LV2__ __SPU__
   /**@brief No initialization constructor */
-	SIMD_FORCE_INLINE btVector3() {}
+	SIMD_FORCE_INLINE btVector3() 
+	{ // intentionally empty: no member is initialized here
+	}
@@ -79,23 +128,50 @@ public:
    * @param y Y value 
    * @param z Z value 
 	SIMD_FORCE_INLINE btVector3(const btScalar& _x, const btScalar& _y, const btScalar& _z)
 		m_floats[0] = _x;
 		m_floats[1] = _y;
 		m_floats[2] = _z;
-		m_floats[3] = btScalar(0.);
+		m_floats[3] = btScalar(0.f); // the fourth component is always zeroed by this constructor
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) )|| defined (BT_USE_NEON)
+	// Build a vector straight from a SIMD value; all four lanes are taken as they are.
+	SIMD_FORCE_INLINE btVector3( btSimdFloat4 v)
+		: mVec128(v)
+	{
+	}
+	// Copy constructor: duplicates the other vector's SIMD value, i.e. all four lanes.
+	SIMD_FORCE_INLINE btVector3(const btVector3& rhs)
+		: mVec128(rhs.mVec128)
+	{
+	}
+	// Assignment operator: copies all four lanes through the SIMD member and returns *this.
+	SIMD_FORCE_INLINE btVector3& operator=(const btVector3& v)
+	{
+		mVec128 = v.mVec128;
+		return *this;
+	}
+#endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON) 
 /**@brief Add a vector to this one 
  * @param The vector to add to this one */
 	SIMD_FORCE_INLINE btVector3& operator+=(const btVector3& v)
-		m_floats[0] += v.m_floats[0]; m_floats[1] += v.m_floats[1];m_floats[2] += v.m_floats[2];
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_add_ps(mVec128, v.mVec128); // adds all four lanes, the fourth included
+#elif defined(BT_USE_NEON)
+		mVec128 = vaddq_f32(mVec128, v.mVec128); // adds all four lanes, the fourth included
+		m_floats[0] += v.m_floats[0]; // scalar form touches only the first three components. NOTE(review): no #else/#endif is visible in this extract between the NEON line and these scalar lines; as shown they would also be compiled for NEON — confirm against the full header
+		m_floats[1] += v.m_floats[1];
+		m_floats[2] += v.m_floats[2];
 		return *this;
@@ -104,14 +180,33 @@ public:
    * @param The vector to subtract */
 	SIMD_FORCE_INLINE btVector3& operator-=(const btVector3& v) 
-		m_floats[0] -= v.m_floats[0]; m_floats[1] -= v.m_floats[1];m_floats[2] -= v.m_floats[2];
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_sub_ps(mVec128, v.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vsubq_f32(mVec128, v.mVec128);
+		m_floats[0] -= v.m_floats[0]; 
+		m_floats[1] -= v.m_floats[1];
+		m_floats[2] -= v.m_floats[2];
 		return *this;
   /**@brief Scale the vector
    * @param s Scale factor */
 	SIMD_FORCE_INLINE btVector3& operator*=(const btScalar& s)
-		m_floats[0] *= s; m_floats[1] *= s;m_floats[2] *= s;
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = bt_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+		mVec128 = _mm_mul_ps(mVec128, vs);
+#elif defined(BT_USE_NEON)
+		mVec128 = vmulq_n_f32(mVec128, s);
+		m_floats[0] *= s; 
+		m_floats[1] *= s;
+		m_floats[2] *= s;
 		return *this;
@@ -120,14 +215,42 @@ public:
 	SIMD_FORCE_INLINE btVector3& operator/=(const btScalar& s) 
 		btFullAssert(s != btScalar(0.0));
+#if 0 //defined(BT_USE_SSE_IN_API)
+// this code is not faster !
+		__m128 vs = _mm_load_ss(&s);
+		vs = _mm_div_ss(v1110, vs);
+		vs = bt_pshufd_ps(vs, 0x00);	//	(S S S S)
+		mVec128 = _mm_mul_ps(mVec128, vs);
+		return *this;
 		return *this *= btScalar(1.0) / s;
   /**@brief Return the dot product
    * @param v The other vector in the dot product */
 	SIMD_FORCE_INLINE btScalar dot(const btVector3& v) const
-		return m_floats[0] * v.m_floats[0] + m_floats[1] * v.m_floats[1] +m_floats[2] * v.m_floats[2];
+#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128 vd = _mm_mul_ps(mVec128, v.mVec128);
+		__m128 z = _mm_movehl_ps(vd, vd);
+		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, y);
+		vd = _mm_add_ss(vd, z);
+		return _mm_cvtss_f32(vd);
+#elif defined(BT_USE_NEON)
+		float32x4_t vd = vmulq_f32(mVec128, v.mVec128);
+		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd));  
+		x = vadd_f32(x, vget_high_f32(vd));
+		return vget_lane_f32(x, 0);
+		return	m_floats[0] * v.m_floats[0] + 
+				m_floats[1] * v.m_floats[1] + 
+				m_floats[2] * v.m_floats[2];
   /**@brief Return the length of the vector squared */
@@ -142,6 +265,12 @@ public:
 		return btSqrt(length2());
+	/**@brief Return the norm (length) of the vector */
+	SIMD_FORCE_INLINE btScalar norm() const
+	{
+		return length();
+	}
   /**@brief Return the distance squared between the ends of this and another vector
    * This is symantically treating the vector like a point */
 	SIMD_FORCE_INLINE btScalar distance2(const btVector3& v) const;
@@ -167,7 +296,47 @@ public:
    * x^2 + y^2 + z^2 = 1 */
 	SIMD_FORCE_INLINE btVector3& normalize() 
+		btAssert(!fuzzyZero());
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)		
+        // dot product first
+		__m128 vd = _mm_mul_ps(mVec128, mVec128);
+		__m128 z = _mm_movehl_ps(vd, vd);
+		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, y);
+		vd = _mm_add_ss(vd, z);
+        #if 0
+        vd = _mm_sqrt_ss(vd);
+		vd = _mm_div_ss(v1110, vd);
+		vd = bt_splat_ps(vd, 0x80);
+		mVec128 = _mm_mul_ps(mVec128, vd);
+        #else
+        // NR step 1/sqrt(x) - vd is x, y is output 
+        y = _mm_rsqrt_ss(vd); // estimate 
+        //  one step NR 
+        z = v1_5;
+        vd = _mm_mul_ss(vd, vHalf); // vd * 0.5	
+        //x2 = vd;
+        vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0
+        vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 * y0
+        z = _mm_sub_ss(z, vd);  // 1.5 - vd * 0.5 * y0 * y0 
+        y = _mm_mul_ss(y, z);   // y0 * (1.5 - vd * 0.5 * y0 * y0)
+		y = bt_splat_ps(y, 0x80);
+		mVec128 = _mm_mul_ps(mVec128, y);
+        #endif
+		return *this;
 		return *this /= length();
   /**@brief Return a normalized version of this vector */
@@ -186,29 +355,112 @@ public:
 		btFullAssert(s != btScalar(0.0));
 		return btAcos(dot(v) / s);
   /**@brief Return a vector will the absolute values of each element */
 	SIMD_FORCE_INLINE btVector3 absolute() const 
+#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) 
+		return btVector3(_mm_and_ps(mVec128, btv3AbsfMask));
+#elif defined(BT_USE_NEON)
+		return btVector3(vabsq_f32(mVec128));
 		return btVector3(
   /**@brief Return the cross product between this and another vector 
    * @param v The other vector */
 	SIMD_FORCE_INLINE btVector3 cross(const btVector3& v) const
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	T, V;
+		T = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		V = bt_pshufd_ps(v.mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		V = _mm_mul_ps(V, mVec128);
+		T = _mm_mul_ps(T, v.mVec128);
+		V = _mm_sub_ps(V, T);
+		V = bt_pshufd_ps(V, BT_SHUFFLE(1, 2, 0, 3));
+		return btVector3(V);
+#elif defined(BT_USE_NEON)
+		float32x4_t T, V;
+		// form (Y, Z, X, _) of mVec128 and v.mVec128
+		float32x2_t Tlow = vget_low_f32(mVec128);
+		float32x2_t Vlow = vget_low_f32(v.mVec128);
+		T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);
+		V = vmulq_f32(V, mVec128);
+		T = vmulq_f32(T, v.mVec128);
+		V = vsubq_f32(V, T);
+		Vlow = vget_low_f32(V);
+		// form (Y, Z, X, _);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
+		V = (float32x4_t)vandq_s32((int32x4_t)V, btvFFF0Mask);
+		return btVector3(V);
 		return btVector3(
-			m_floats[1] * v.m_floats[2] -m_floats[2] * v.m_floats[1],
+			m_floats[1] * v.m_floats[2] - m_floats[2] * v.m_floats[1],
 			m_floats[2] * v.m_floats[0] - m_floats[0] * v.m_floats[2],
 			m_floats[0] * v.m_floats[1] - m_floats[1] * v.m_floats[0]);
 	SIMD_FORCE_INLINE btScalar triple(const btVector3& v1, const btVector3& v2) const
-		return m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) + 
+#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		// cross:
+		__m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		__m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		V = _mm_mul_ps(V, v1.mVec128);
+		T = _mm_mul_ps(T, v2.mVec128);
+		V = _mm_sub_ps(V, T);
+		V = _mm_shuffle_ps(V, V, BT_SHUFFLE(1, 2, 0, 3));
+		// dot: 
+		V = _mm_mul_ps(V, mVec128);
+		__m128 z = _mm_movehl_ps(V, V);
+		__m128 y = _mm_shuffle_ps(V, V, 0x55);
+		V = _mm_add_ss(V, y);
+		V = _mm_add_ss(V, z);
+		return _mm_cvtss_f32(V);
+#elif defined(BT_USE_NEON)
+		// cross:
+		float32x4_t T, V;
+		// form (Y, Z, X, _) of mVec128 and v.mVec128
+		float32x2_t Tlow = vget_low_f32(v1.mVec128);
+		float32x2_t Vlow = vget_low_f32(v2.mVec128);
+		T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);
+		V = vmulq_f32(V, v1.mVec128);
+		T = vmulq_f32(T, v2.mVec128);
+		V = vsubq_f32(V, T);
+		Vlow = vget_low_f32(V);
+		// form (Y, Z, X, _);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
+		// dot: 
+		V = vmulq_f32(mVec128, V);
+		float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));  
+		x = vadd_f32(x, vget_high_f32(V));
+		return vget_lane_f32(x, 0);
+		return 
+			m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) + 
 			m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) + 
 			m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
   /**@brief Return the axis with the smallest value 
@@ -235,14 +487,31 @@ public:
 		return absolute().maxAxis();
 	SIMD_FORCE_INLINE void setInterpolate3(const btVector3& v0, const btVector3& v1, btScalar rt)
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vrt = _mm_load_ss(&rt);	//	(rt 0 0 0)
+		btScalar s = btScalar(1.0) - rt;
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = bt_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+		__m128 r0 = _mm_mul_ps(v0.mVec128, vs);
+		vrt = bt_pshufd_ps(vrt, 0x80);	//	(rt rt rt 0.0)
+		__m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
+		__m128 tmp3 = _mm_add_ps(r0,r1);
+		mVec128 = tmp3;
+#elif defined(BT_USE_NEON)
+		float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128);
+		vl = vmulq_n_f32(vl, rt);
+		mVec128 = vaddq_f32(vl, v0.mVec128);
 		btScalar s = btScalar(1.0) - rt;
 		m_floats[0] = s * v0.m_floats[0] + rt * v1.m_floats[0];
 		m_floats[1] = s * v0.m_floats[1] + rt * v1.m_floats[1];
 		m_floats[2] = s * v0.m_floats[2] + rt * v1.m_floats[2];
 		//don't do the unused w component
 		//		m_co[3] = s * v0[3] + rt * v1[3];
   /**@brief Return the linear interpolation between this and another vector 
@@ -250,16 +519,41 @@ public:
    * @param t The ration of this to v (t = 0 => return this, t=1 => return other) */
 	SIMD_FORCE_INLINE btVector3 lerp(const btVector3& v, const btScalar& t) const 
-		return btVector3(m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
-			m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
-			m_floats[2] + (v.m_floats[2] -m_floats[2]) * t);
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vt = _mm_load_ss(&t);	//	(t 0 0 0)
+		vt = bt_pshufd_ps(vt, 0x80);	//	(rt rt rt 0.0)
+		__m128 vl = _mm_sub_ps(v.mVec128, mVec128);
+		vl = _mm_mul_ps(vl, vt);
+		vl = _mm_add_ps(vl, mVec128);
+		return btVector3(vl);
+#elif defined(BT_USE_NEON)
+		float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
+		vl = vmulq_n_f32(vl, t);
+		vl = vaddq_f32(vl, mVec128);
+		return btVector3(vl);
+		return 
+			btVector3(	m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
+						m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
+						m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
   /**@brief Elementwise multiply this vector by the other 
    * @param v The other vector */
 	SIMD_FORCE_INLINE btVector3& operator*=(const btVector3& v)
-		m_floats[0] *= v.m_floats[0]; m_floats[1] *= v.m_floats[1];m_floats[2] *= v.m_floats[2];
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_mul_ps(mVec128, v.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vmulq_f32(mVec128, v.mVec128);
+		m_floats[0] *= v.m_floats[0]; 
+		m_floats[1] *= v.m_floats[1];
+		m_floats[2] *= v.m_floats[2];
 		return *this;
@@ -269,16 +563,14 @@ public:
 		SIMD_FORCE_INLINE const btScalar& getY() const { return m_floats[1]; }
   /**@brief Return the z value */
 		SIMD_FORCE_INLINE const btScalar& getZ() const { return m_floats[2]; }
   /**@brief Set the x value */
 		SIMD_FORCE_INLINE void	setX(btScalar _x) { m_floats[0] = _x;};
   /**@brief Set the y value */
 		SIMD_FORCE_INLINE void	setY(btScalar _y) { m_floats[1] = _y;};
   /**@brief Set the z value */
-		SIMD_FORCE_INLINE void	setZ(btScalar _z) {m_floats[2] = _z;};
+		SIMD_FORCE_INLINE void	setZ(btScalar _z) { m_floats[2] = _z;};
   /**@brief Set the w value */
 		SIMD_FORCE_INLINE void	setW(btScalar _w) { m_floats[3] = _w;};
   /**@brief Return the x value */
 		SIMD_FORCE_INLINE const btScalar& x() const { return m_floats[0]; }
   /**@brief Return the y value */
@@ -296,7 +588,14 @@ public:
 	SIMD_FORCE_INLINE	bool	operator==(const btVector3& other) const
-		return ((m_floats[3]==other.m_floats[3]) && (m_floats[2]==other.m_floats[2]) && (m_floats[1]==other.m_floats[1]) && (m_floats[0]==other.m_floats[0]));
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+        return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
+		return ((m_floats[3]==other.m_floats[3]) && 
+                (m_floats[2]==other.m_floats[2]) && 
+                (m_floats[1]==other.m_floats[1]) && 
+                (m_floats[0]==other.m_floats[0]));
 	SIMD_FORCE_INLINE	bool	operator!=(const btVector3& other) const
@@ -304,105 +603,231 @@ public:
 		return !(*this == other);
-	 /**@brief Set each element to the max of the current values and the values of another btVector3
+  /**@brief Set each element to the max of the current values and the values of another btVector3
    * @param other The other btVector3 to compare with 
-		SIMD_FORCE_INLINE void	setMax(const btVector3& other)
-		{
-			btSetMax(m_floats[0], other.m_floats[0]);
-			btSetMax(m_floats[1], other.m_floats[1]);
-			btSetMax(m_floats[2], other.m_floats[2]);
-			btSetMax(m_floats[3], other.w());
-		}
+	SIMD_FORCE_INLINE void	setMax(const btVector3& other)
+	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_max_ps(mVec128, other.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vmaxq_f32(mVec128, other.mVec128);
+		btSetMax(m_floats[0], other.m_floats[0]);
+		btSetMax(m_floats[1], other.m_floats[1]);
+		btSetMax(m_floats[2], other.m_floats[2]);
+		btSetMax(m_floats[3], other.w());
+	}
   /**@brief Set each element to the min of the current values and the values of another btVector3
    * @param other The other btVector3 to compare with 
-		SIMD_FORCE_INLINE void	setMin(const btVector3& other)
-		{
-			btSetMin(m_floats[0], other.m_floats[0]);
-			btSetMin(m_floats[1], other.m_floats[1]);
-			btSetMin(m_floats[2], other.m_floats[2]);
-			btSetMin(m_floats[3], other.w());
-		}
-		SIMD_FORCE_INLINE void 	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z)
-		{
-			m_floats[0]=_x;
-			m_floats[1]=_y;
-			m_floats[2]=_z;
-			m_floats[3] = btScalar(0.);
-		}
-		void	getSkewSymmetricMatrix(btVector3* v0,btVector3* v1,btVector3* v2) const
-		{
-			v0->setValue(0.		,-z()		,y());
-			v1->setValue(z()	,0.			,-x());
-			v2->setValue(-y()	,x()	,0.);
-		}
-		void	setZero()
-		{
-			setValue(btScalar(0.),btScalar(0.),btScalar(0.));
-		}
-		SIMD_FORCE_INLINE bool isZero() const 
-		{
-			return m_floats[0] == btScalar(0) && m_floats[1] == btScalar(0) && m_floats[2] == btScalar(0);
-		}
-		SIMD_FORCE_INLINE bool fuzzyZero() const 
-		{
-			return length2() < SIMD_EPSILON;
-		}
+	SIMD_FORCE_INLINE void	setMin(const btVector3& other)
+	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_min_ps(mVec128, other.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vminq_f32(mVec128, other.mVec128);
+		btSetMin(m_floats[0], other.m_floats[0]);
+		btSetMin(m_floats[1], other.m_floats[1]);
+		btSetMin(m_floats[2], other.m_floats[2]);
+		btSetMin(m_floats[3], other.w());
+	}
-		SIMD_FORCE_INLINE	void	serialize(struct	btVector3Data& dataOut) const;
+	SIMD_FORCE_INLINE void 	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z)
+	{
+		m_floats[0]=_x;
+		m_floats[1]=_y;
+		m_floats[2]=_z;
+		m_floats[3] = btScalar(0.f);
+	}
-		SIMD_FORCE_INLINE	void	deSerialize(const struct	btVector3Data& dataIn);
+	void	getSkewSymmetricMatrix(btVector3* v0,btVector3* v1,btVector3* v2) const
+	{
+#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128 V  = _mm_and_ps(mVec128, btvFFF0fMask);
+		__m128 V0 = _mm_xor_ps(btvMzeroMask, V);
+		__m128 V2 = _mm_movelh_ps(V0, V);
+		__m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
+        V0 = _mm_shuffle_ps(V0, V, 0xDB);
+		V2 = _mm_shuffle_ps(V2, V, 0xF9);
+		v0->mVec128 = V0;
+		v1->mVec128 = V1;
+		v2->mVec128 = V2;
+		v0->setValue(0.		,-z()		,y());
+		v1->setValue(z()	,0.			,-x());
+		v2->setValue(-y()	,x()	,0.);
+	}
-		SIMD_FORCE_INLINE	void	serializeFloat(struct	btVector3FloatData& dataOut) const;
+	void setZero()
+	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
+#elif defined(BT_USE_NEON)
+		int32x4_t vi = vdupq_n_s32(0); 
+		mVec128 = vreinterpretq_f32_s32(vi);
+		setValue(btScalar(0.),btScalar(0.),btScalar(0.));
+	}
-		SIMD_FORCE_INLINE	void	deSerializeFloat(const struct	btVector3FloatData& dataIn);
+	SIMD_FORCE_INLINE bool isZero() const 
+	{
+		return m_floats[0] == btScalar(0) && m_floats[1] == btScalar(0) && m_floats[2] == btScalar(0);
+	}
-		SIMD_FORCE_INLINE	void	serializeDouble(struct	btVector3DoubleData& dataOut) const;
-		SIMD_FORCE_INLINE	void	deSerializeDouble(const struct	btVector3DoubleData& dataIn);
+	SIMD_FORCE_INLINE bool fuzzyZero() const 
+	{
+		return length2() < SIMD_EPSILON*SIMD_EPSILON;
+	}
+	SIMD_FORCE_INLINE	void	serialize(struct	btVector3Data& dataOut) const;
+	SIMD_FORCE_INLINE	void	deSerialize(const struct	btVector3Data& dataIn);
+	SIMD_FORCE_INLINE	void	serializeFloat(struct	btVector3FloatData& dataOut) const;
+	SIMD_FORCE_INLINE	void	deSerializeFloat(const struct	btVector3FloatData& dataIn);
+	SIMD_FORCE_INLINE	void	serializeDouble(struct	btVector3DoubleData& dataOut) const;
+	SIMD_FORCE_INLINE	void	deSerializeDouble(const struct	btVector3DoubleData& dataIn);
+        /**@brief returns index of maximum dot product between this and vectors in array[]
+         * @param array The other vectors 
+         * @param array_count The number of other vectors 
+         * @param dotOut The maximum dot product */
+        SIMD_FORCE_INLINE   long    maxDot( const btVector3 *array, long array_count, btScalar &dotOut ) const; 
+        /**@brief returns index of minimum dot product between this and vectors in array[]
+         * @param array The other vectors 
+         * @param array_count The number of other vectors 
+         * @param dotOut The minimum dot product */    
+        SIMD_FORCE_INLINE   long    minDot( const btVector3 *array, long array_count, btScalar &dotOut ) const; 
+    /* create a vector as  btVector3( this->dot( btVector3 v0 ), this->dot( btVector3 v1), this->dot( btVector3 v2 ))  */
+    SIMD_FORCE_INLINE btVector3  dot3( const btVector3 &v0, const btVector3 &v1, const btVector3 &v2 ) const
+    {
+#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+        __m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 );
+        __m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 );
+        __m128 a2 = _mm_mul_ps( v2.mVec128, this->mVec128 );
+        __m128 b0 = _mm_unpacklo_ps( a0, a1 );
+        __m128 b1 = _mm_unpackhi_ps( a0, a1 );
+        __m128 b2 = _mm_unpacklo_ps( a2, _mm_setzero_ps() );
+        __m128 r = _mm_movelh_ps( b0, b2 );
+        r = _mm_add_ps( r, _mm_movehl_ps( b2, b0 ));
+        a2 = _mm_and_ps( a2, btvxyzMaskf);
+        r = _mm_add_ps( r, btCastdTo128f (_mm_move_sd( btCastfTo128d(a2), btCastfTo128d(b1) )));
+        return btVector3(r);
+#elif defined(BT_USE_NEON)
+        static const uint32x4_t xyzMask = (const uint32x4_t){ static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0 };
+        float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128);
+        float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128);
+        float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128);
+        float32x2x2_t zLo = vtrn_f32( vget_high_f32(a0), vget_high_f32(a1));
+        a2 = (float32x4_t) vandq_u32((uint32x4_t) a2, xyzMask );
+        float32x2_t b0 = vadd_f32( vpadd_f32( vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0] );
+        float32x2_t b1 = vpadd_f32( vpadd_f32( vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
+        return btVector3( vcombine_f32(b0, b1) );
+		return btVector3( dot(v0), dot(v1), dot(v2));
+    }
 /**@brief Return the sum of two vectors (Point symantics)*/
 operator+(const btVector3& v1, const btVector3& v2) 
-	return btVector3(v1.m_floats[0] + v2.m_floats[0], v1.m_floats[1] + v2.m_floats[1], v1.m_floats[2] + v2.m_floats[2]);
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	return btVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
+#elif defined(BT_USE_NEON)
+	return btVector3(vaddq_f32(v1.mVec128, v2.mVec128));
+	return btVector3(
+			v1.m_floats[0] + v2.m_floats[0], 
+			v1.m_floats[1] + v2.m_floats[1], 
+			v1.m_floats[2] + v2.m_floats[2]);
 /**@brief Return the elementwise product of two vectors */
 operator*(const btVector3& v1, const btVector3& v2) 
-	return btVector3(v1.m_floats[0] * v2.m_floats[0], v1.m_floats[1] * v2.m_floats[1], v1.m_floats[2] * v2.m_floats[2]);
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	return btVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
+#elif defined(BT_USE_NEON)
+	return btVector3(vmulq_f32(v1.mVec128, v2.mVec128));
+	return btVector3(
+			v1.m_floats[0] * v2.m_floats[0], 
+			v1.m_floats[1] * v2.m_floats[1], 
+			v1.m_floats[2] * v2.m_floats[2]);
 /**@brief Return the difference between two vectors */
 operator-(const btVector3& v1, const btVector3& v2)
-	return btVector3(v1.m_floats[0] - v2.m_floats[0], v1.m_floats[1] - v2.m_floats[1], v1.m_floats[2] - v2.m_floats[2]);
+#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API)  && defined(BT_USE_SSE))
+	//	without _mm_and_ps this code causes slowdown in Concave moving
+	__m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
+	return btVector3(_mm_and_ps(r, btvFFF0fMask));
+#elif defined(BT_USE_NEON)
+	float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
+	return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
+	return btVector3(
+			v1.m_floats[0] - v2.m_floats[0], 
+			v1.m_floats[1] - v2.m_floats[1], 
+			v1.m_floats[2] - v2.m_floats[2]);
 /**@brief Return the negative of the vector */
 operator-(const btVector3& v)
+#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+	__m128 r = _mm_xor_ps(v.mVec128, btvMzeroMask);
+	return btVector3(_mm_and_ps(r, btvFFF0fMask)); 
+#elif defined(BT_USE_NEON)
+	return btVector3((btSimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)btvMzeroMask));
 	return btVector3(-v.m_floats[0], -v.m_floats[1], -v.m_floats[2]);
 /**@brief Return the vector scaled by s */
 operator*(const btVector3& v, const btScalar& s)
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+	vs = bt_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+	return btVector3(_mm_mul_ps(v.mVec128, vs));
+#elif defined(BT_USE_NEON)
+	float32x4_t r = vmulq_n_f32(v.mVec128, s);
+	return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
 	return btVector3(v.m_floats[0] * s, v.m_floats[1] * s, v.m_floats[2] * s);
 /**@brief Return the vector scaled by s */
@@ -417,14 +842,46 @@ SIMD_FORCE_INLINE btVector3
 operator/(const btVector3& v, const btScalar& s)
 	btFullAssert(s != btScalar(0.0));
+#if 0 //defined(BT_USE_SSE_IN_API)
+// this code is not faster !
+	__m128 vs = _mm_load_ss(&s);
+    vs = _mm_div_ss(v1110, vs);
+	vs = bt_pshufd_ps(vs, 0x00);	//	(S S S S)
+	return btVector3(_mm_mul_ps(v.mVec128, vs));
 	return v * (btScalar(1.0) / s);
 /**@brief Return the vector inversely scaled by s */
 operator/(const btVector3& v1, const btVector3& v2)
-	return btVector3(v1.m_floats[0] / v2.m_floats[0],v1.m_floats[1] / v2.m_floats[1],v1.m_floats[2] / v2.m_floats[2]);
+#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE))
+	__m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
+	vec = _mm_and_ps(vec, btvFFF0fMask);
+	return btVector3(vec); 
+#elif defined(BT_USE_NEON)
+	float32x4_t x, y, v, m;
+	x = v1.mVec128;
+	y = v2.mVec128;
+	v = vrecpeq_f32(y);			// v ~ 1/y
+	m = vrecpsq_f32(y, v);		// m = (2-v*y)
+	v = vmulq_f32(v, m);		// vv = v*m ~~ 1/y
+	m = vrecpsq_f32(y, v);		// mm = (2-vv*y)
+	v = vmulq_f32(v, x);		// x*vv
+	v = vmulq_f32(v, m);		// (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y
+	return btVector3(v);
+	return btVector3(
+			v1.m_floats[0] / v2.m_floats[0], 
+			v1.m_floats[1] / v2.m_floats[1],
+			v1.m_floats[2] / v2.m_floats[2]);
 /**@brief Return the dot product between two vectors */
@@ -494,14 +951,43 @@ SIMD_FORCE_INLINE btScalar btVector3::distance(const btVector3& v) const
 SIMD_FORCE_INLINE btVector3 btVector3::normalized() const
-	return *this / length();
+	btVector3 nrm = *this;
+	return nrm.normalize();
 SIMD_FORCE_INLINE btVector3 btVector3::rotate( const btVector3& wAxis, const btScalar _angle ) const
 	// wAxis must be a unit lenght vector
+#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+    __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
+	btScalar ssin = btSin( _angle );
+    __m128 C = wAxis.cross( mVec128 ).mVec128;
+	O = _mm_and_ps(O, btvFFF0fMask);
+    btScalar scos = btCos( _angle );
+	__m128 vsin = _mm_load_ss(&ssin);	//	(S 0 0 0)
+    __m128 vcos = _mm_load_ss(&scos);	//	(S 0 0 0)
+	__m128 Y = bt_pshufd_ps(O, 0xC9);	//	(Y Z X 0)
+	__m128 Z = bt_pshufd_ps(O, 0xD2);	//	(Z X Y 0)
+	O = _mm_add_ps(O, Y);
+	vsin = bt_pshufd_ps(vsin, 0x80);	//	(S S S 0)
+	O = _mm_add_ps(O, Z);
+    vcos = bt_pshufd_ps(vcos, 0x80);	//	(S S S 0)
+    vsin = vsin * C; 
+	O = O * wAxis.mVec128; 
+	__m128 X = mVec128 - O; 
+    O = O + vsin;
+	vcos = vcos * X;
+	O = O + vcos;	
+	return btVector3(O);
 	btVector3 o = wAxis * wAxis.dot( *this );
 	btVector3 _x = *this - o;
 	btVector3 _y;
@@ -509,8 +995,84 @@ SIMD_FORCE_INLINE btVector3 btVector3::rotate( const btVector3& wAxis, const btS
 	_y = wAxis.cross( *this );
 	return ( o + _x * btCos( _angle ) + _y * btSin( _angle ) );
+SIMD_FORCE_INLINE   long    btVector3::maxDot( const btVector3 *array, long array_count, btScalar &dotOut ) const
+#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+    #if defined _WIN32 || defined (BT_USE_SSE)
+        const long scalar_cutoff = 10;
+        long _maxdot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #elif defined BT_USE_NEON
+        const long scalar_cutoff = 4;
+        extern long (*_maxdot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #endif
+    if( array_count < scalar_cutoff )	
+    {
+        btScalar maxDot1 = -SIMD_INFINITY;
+        int i = 0;
+        int ptIndex = -1;
+        for( i = 0; i < array_count; i++ )
+        {
+            btScalar dot = array[i].dot(*this);
+            if( dot > maxDot1 )
+            {
+                maxDot1 = dot;
+                ptIndex = i;
+            }
+        }
+        dotOut = maxDot1;
+        return ptIndex;
+    }
+#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+    return _maxdot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
+SIMD_FORCE_INLINE   long    btVector3::minDot( const btVector3 *array, long array_count, btScalar &dotOut ) const
+#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+    #if defined BT_USE_SSE
+        const long scalar_cutoff = 10;
+        long _mindot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #elif defined BT_USE_NEON
+        const long scalar_cutoff = 4;
+        extern long (*_mindot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #else
+        #error unhandled arch!
+    #endif
+    if( array_count < scalar_cutoff )
+    {
+        btScalar  minDot = SIMD_INFINITY;
+        int i = 0;
+        int ptIndex = -1;
+        for( i = 0; i < array_count; i++ )
+        {
+            btScalar dot = array[i].dot(*this);
+            if( dot < minDot )
+            {
+                minDot = dot;
+                ptIndex = i;
+            }
+        }
+        dotOut = minDot;
+        return ptIndex;
+    }
+#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+    return _mindot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
 class btVector4 : public btVector3
@@ -519,26 +1081,47 @@ public:
 	SIMD_FORCE_INLINE btVector4() {}
 	SIMD_FORCE_INLINE btVector4(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w) 
 		: btVector3(_x,_y,_z)
 		m_floats[3] = _w;
+#if (defined (BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined (BT_USE_NEON) 
+	SIMD_FORCE_INLINE btVector4(const btSimdFloat4 vec)
+	{
+		mVec128 = vec;
+	}
+	SIMD_FORCE_INLINE btVector4(const btVector3& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+	operator=(const btVector4& v) 
+	{
+		mVec128 = v.mVec128;
+		return *this;
+	}
+#endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON) 
 	SIMD_FORCE_INLINE btVector4 absolute4() const 
+#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) 
+		return btVector4(_mm_and_ps(mVec128, btvAbsfMask));
+#elif defined(BT_USE_NEON)
+		return btVector4(vabsq_f32(mVec128));
 		return btVector4(
 	btScalar	getW() const { return m_floats[3];}
@@ -566,12 +1149,8 @@ public:
 			maxIndex = 3;
 			maxVal = m_floats[3];
 		return maxIndex;
@@ -601,7 +1180,6 @@ public:
 		return minIndex;
@@ -633,7 +1211,6 @@ public:
    * @param z Value of z
    * @param w Value of w
 		SIMD_FORCE_INLINE void	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w)
@@ -641,7 +1218,6 @@ public:
@@ -652,9 +1228,7 @@ SIMD_FORCE_INLINE void	btSwapScalarEndian(const btScalar& sourceVal, btScalar& d
 	unsigned char* dest = (unsigned char*) &destVal;
-	unsigned char const* src  = (unsigned char const*) &sourceVal;
+	unsigned char* src  = (unsigned char*) &sourceVal;
 	dest[0] = src[7];
     dest[1] = src[6];
     dest[2] = src[5];
@@ -665,9 +1239,7 @@ SIMD_FORCE_INLINE void	btSwapScalarEndian(const btScalar& sourceVal, btScalar& d
     dest[7] = src[0];
 	unsigned char* dest = (unsigned char*) &destVal;
-	unsigned char const* src  = (unsigned char const*) &sourceVal;
+	unsigned char* src  = (unsigned char*) &sourceVal;
 	dest[0] = src[3];
     dest[1] = src[2];
     dest[2] = src[1];
@@ -778,5 +1350,4 @@ SIMD_FORCE_INLINE void	btVector3::deSerialize(const struct	btVector3Data& dataIn
 		m_floats[i] = dataIn.m_floats[i];
 #endif //BT_VECTOR3_H
diff --git a/src/bullet/Makefile.am b/src/bullet/Makefile.am
index 00e6c0a6..60ef4eac 100644
--- a/src/bullet/Makefile.am
+++ b/src/bullet/Makefile.am
@@ -11,557 +11,656 @@ disable_cflags = $(filter $(AM_CPPFLAGS:-W%=-Wno-%), \
 liblolbullet_a_SOURCES = $(bullet_sources)
-liblolbullet_a_CPPFLAGS = $(AM_CPPFLAGS) -I$(srcdir) $(disable_cflags)
+liblolbullet_a_CPPFLAGS = -DB3_USE_CLEW $(AM_CPPFLAGS) -I$(srcdir) $(disable_cflags)
 bullet_sources =
+libBulletMultiThreaded_la_CXXFLAGS = -I./BulletMultiThreaded/vectormath/scalar/cpp
+bullet_sources +=
 bullet_sources += \
-    BulletMultiThreaded/PosixThreadSupport.h \
-    BulletMultiThreaded/vectormath/scalar/cpp/mat_aos.h \
-    BulletMultiThreaded/vectormath/scalar/cpp/vec_aos.h \
-    BulletMultiThreaded/vectormath/scalar/cpp/quat_aos.h \
-    BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h \
-    BulletMultiThreaded/PpuAddressSpace.h \
-    BulletMultiThreaded/SpuCollisionTaskProcess.h \
-    BulletMultiThreaded/PlatformDefinitions.h \
-    BulletMultiThreaded/vectormath2bullet.h \
-    BulletMultiThreaded/SpuGatheringCollisionDispatcher.h \
-    BulletMultiThreaded/SpuCollisionObjectWrapper.h \
-    BulletMultiThreaded/SpuSampleTaskProcess.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h \
-    BulletMultiThreaded/SpuSync.h \
-    BulletMultiThreaded/btThreadSupportInterface.h \
-    BulletMultiThreaded/SpuLibspe2Support.h \
-    BulletMultiThreaded/SpuSampleTask/SpuSampleTask.h \
-    BulletMultiThreaded/SpuFakeDma.h \
-    BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h \
-    BulletMultiThreaded/SpuDoubleBuffer.h \
-    BulletMultiThreaded/Win32ThreadSupport.h \
-    BulletMultiThreaded/SequentialThreadSupport.h
+	Bullet3Collision/BroadPhaseCollision/b3BroadphaseCallback.h \
+	Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.cpp \
+	Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h \
+	Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.cpp \
+	Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.h \
+	Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.cpp \
+	Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.h \
+	Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h \
+	Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h \
+	Bullet3Collision/NarrowPhaseCollision/b3Config.h \
+	Bullet3Collision/NarrowPhaseCollision/b3Contact4.h \
+	Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.cpp \
+	Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h \
+	Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.cpp \
+	Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.h \
+	Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h \
+	Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3BvhTraversal.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3ClipFaces.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3ContactConvexConvexSAT.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3ContactSphereSphere.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3FindSeparatingAxis.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3NewContactReduction.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3ReduceContacts.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h \
+	Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h \
+	$(NULL)
-libBulletMultiThreaded_la_CXXFLAGS = -I./BulletMultiThreaded/vectormath/scalar/cpp
 bullet_sources += \
-    BulletMultiThreaded/SpuCollisionObjectWrapper.cpp \
-    BulletMultiThreaded/SpuSampleTask/SpuSampleTask.cpp \
-    BulletMultiThreaded/SpuLibspe2Support.cpp \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp \
-    BulletMultiThreaded/btThreadSupportInterface.cpp \
-    BulletMultiThreaded/SequentialThreadSupport.cpp \
-    BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp \
-    BulletMultiThreaded/Win32ThreadSupport.cpp \
-    BulletMultiThreaded/SpuFakeDma.cpp \
-    BulletMultiThreaded/PosixThreadSupport.cpp \
-    BulletMultiThreaded/SpuCollisionTaskProcess.cpp \
-    BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp \
-    BulletMultiThreaded/SpuSampleTaskProcess.cpp \
-    BulletMultiThreaded/SpuSampleTask/SpuSampleTask.h \
-    BulletMultiThreaded/PpuAddressSpace.h \
-    BulletMultiThreaded/SpuSampleTaskProcess.h \
-    BulletMultiThreaded/SequentialThreadSupport.h \
-    BulletMultiThreaded/PlatformDefinitions.h \
-    BulletMultiThreaded/Win32ThreadSupport.h \
-    BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h \
-    BulletMultiThreaded/btThreadSupportInterface.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h \
-    BulletMultiThreaded/SpuGatheringCollisionDispatcher.h \
-    BulletMultiThreaded/SpuFakeDma.h \
-    BulletMultiThreaded/SpuSync.h \
-    BulletMultiThreaded/SpuCollisionObjectWrapper.h \
-    BulletMultiThreaded/SpuDoubleBuffer.h \
-    BulletMultiThreaded/SpuCollisionTaskProcess.h \
-    BulletMultiThreaded/PosixThreadSupport.h \
-    BulletMultiThreaded/SpuLibspe2Support.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h \
-    BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
+	Bullet3Common/b3AlignedAllocator.cpp \
+	Bullet3Common/b3AlignedAllocator.h \
+	Bullet3Common/b3AlignedObjectArray.h \
+	Bullet3Common/b3CommandLineArgs.h \
+	Bullet3Common/b3FileUtils.h \
+	Bullet3Common/b3HashMap.h \
+	Bullet3Common/b3Logging.cpp \
+	Bullet3Common/b3Logging.h \
+	Bullet3Common/b3Matrix3x3.h \
+	Bullet3Common/b3MinMax.h \
+	Bullet3Common/b3PoolAllocator.h \
+	Bullet3Common/b3QuadWord.h \
+	Bullet3Common/b3Quaternion.h \
+	Bullet3Common/b3Random.h \
+	Bullet3Common/b3Scalar.h \
+	Bullet3Common/b3StackAlloc.h \
+	Bullet3Common/b3Transform.h \
+	Bullet3Common/b3TransformUtil.h \
+	Bullet3Common/b3Vector3.cpp \
+	Bullet3Common/b3Vector3.h \
+	Bullet3Common/shared/b3Float4.h \
+	Bullet3Common/shared/b3Int2.h \
+	Bullet3Common/shared/b3Int4.h \
+	Bullet3Common/shared/b3Mat3x3.h \
+	Bullet3Common/shared/b3PlatformDefinitions.h \
+	Bullet3Common/shared/b3Quat.h \
+	$(NULL)
+bullet_sources += \
+	Bullet3Dynamics/b3CpuRigidBodyPipeline.cpp \
+	Bullet3Dynamics/b3CpuRigidBodyPipeline.h \
+	Bullet3Dynamics/ConstraintSolver/b3ContactSolverInfo.h \
+	Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.cpp \
+	Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.h \
+	Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.cpp \
+	Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.h \
+	Bullet3Dynamics/ConstraintSolver/b3JacobianEntry.h \
+	Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.cpp \
+	Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h \
+	Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.cpp \
+	Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.h \
+	Bullet3Dynamics/ConstraintSolver/b3SolverBody.h \
+	Bullet3Dynamics/ConstraintSolver/b3SolverConstraint.h \
+	Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.cpp \
+	Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h \
+	Bullet3Dynamics/shared/b3ContactConstraint4.h \
+	Bullet3Dynamics/shared/b3ConvertConstraint4.h \
+	Bullet3Dynamics/shared/b3Inertia.h \
+	Bullet3Dynamics/shared/b3IntegrateTransforms.h \
+	$(NULL)
+bullet_sources += \
+	Bullet3Geometry/b3AabbUtil.h \
+	Bullet3Geometry/b3ConvexHullComputer.cpp \
+	Bullet3Geometry/b3ConvexHullComputer.h \
+	Bullet3Geometry/b3GeometryUtil.cpp \
+	Bullet3Geometry/b3GeometryUtil.h \
+	Bullet3Geometry/b3GrahamScan2dConvexHull.h \
+	$(NULL)
+bullet_sources += \
+	Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h \
+	Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp \
+	Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h \
+	Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp \
+	Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h \
+	Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp \
+	Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h \
+	Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp \
+	Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h \
+	Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h \
+	Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl \
+	Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h \
+	Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl \
+	Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h \
+	Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl \
+	Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h \
+	Bullet3OpenCL/Initialize/b3OpenCLInclude.h \
+	Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp \
+	Bullet3OpenCL/Initialize/b3OpenCLUtils.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp \
+	Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp \
+	Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp \
+	Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3GjkPairDetector.cpp \
+	Bullet3OpenCL/NarrowphaseCollision/b3GjkPairDetector.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp \
+	Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp \
+	Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp \
+	Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp \
+	Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp \
+	Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h \
+	Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp \
+	Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h \
+	Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h \
+	Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp \
+	Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h \
+	Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h \
+	Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp \
+	Bullet3OpenCL/ParallelPrimitives/b3FillCL.h \
+	Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp \
+	Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h \
+	Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h \
+	Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp \
+	Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h \
+	Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp \
+	Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h \
+	Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp \
+	Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h \
+	Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl \
+	Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h \
+	Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl \
+	Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h \
+	Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl \
+	Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h \
+	Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl \
+	Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl \
+	Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h \
+	Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h \
+	Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl \
+	Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h \
+	Bullet3OpenCL/Raycast/b3GpuRaycast.cpp \
+	Bullet3OpenCL/Raycast/b3GpuRaycast.h \
+	Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl \
+	Bullet3OpenCL/Raycast/kernels/rayCastKernels.h \
+	Bullet3OpenCL/RigidBody/b3GpuConstraint4.h \
+	Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp \
+	Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h \
+	Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp \
+	Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h \
+	Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp \
+	Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h \
+	Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h \
+	Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp \
+	Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h \
+	Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp \
+	Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h \
+	Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp \
+	Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h \
+	Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h \
+	Bullet3OpenCL/RigidBody/b3GpuSolverBody.h \
+	Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h \
+	Bullet3OpenCL/RigidBody/b3Solver.cpp \
+	Bullet3OpenCL/RigidBody/b3Solver.h \
+	Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl \
+	Bullet3OpenCL/RigidBody/kernels/batchingKernels.h \
+	Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl \
+	Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h \
+	Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl \
+	Bullet3OpenCL/RigidBody/kernels/integrateKernel.h \
+	Bullet3OpenCL/RigidBody/kernels/jointSolver.cl \
+	Bullet3OpenCL/RigidBody/kernels/jointSolver.h \
+	Bullet3OpenCL/RigidBody/kernels/solveContact.cl \
+	Bullet3OpenCL/RigidBody/kernels/solveContact.h \
+	Bullet3OpenCL/RigidBody/kernels/solveFriction.cl \
+	Bullet3OpenCL/RigidBody/kernels/solveFriction.h \
+	Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl \
+	Bullet3OpenCL/RigidBody/kernels/solverSetup2.h \
+	Bullet3OpenCL/RigidBody/kernels/solverSetup.cl \
+	Bullet3OpenCL/RigidBody/kernels/solverSetup.h \
+	Bullet3OpenCL/RigidBody/kernels/solverUtils.cl \
+	Bullet3OpenCL/RigidBody/kernels/solverUtils.h \
+	Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl \
+	Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h \
+	$(NULL)
+bullet_sources += \
+	Bullet3Serialize/Bullet2FileLoader/autogenerated/bullet2.h \
+	Bullet3Serialize/Bullet2FileLoader/b3BulletFile.cpp \
+	Bullet3Serialize/Bullet2FileLoader/b3BulletFile.h \
+	Bullet3Serialize/Bullet2FileLoader/b3Chunk.cpp \
+	Bullet3Serialize/Bullet2FileLoader/b3Chunk.h \
+	Bullet3Serialize/Bullet2FileLoader/b3Common.h \
+	Bullet3Serialize/Bullet2FileLoader/b3Defines.h \
+	Bullet3Serialize/Bullet2FileLoader/b3DNA.cpp \
+	Bullet3Serialize/Bullet2FileLoader/b3DNA.h \
+	Bullet3Serialize/Bullet2FileLoader/b3File.cpp \
+	Bullet3Serialize/Bullet2FileLoader/b3File.h \
+	Bullet3Serialize/Bullet2FileLoader/b3Serializer.cpp \
+	Bullet3Serialize/Bullet2FileLoader/b3Serializer.h \
+	$(NULL)
+bullet_sources += \
+	BulletCollision/BroadphaseCollision/btAxisSweep3.cpp \
+	BulletCollision/BroadphaseCollision/btAxisSweep3.h \
+	BulletCollision/BroadphaseCollision/btBroadphaseInterface.h \
+	BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp \
+	BulletCollision/BroadphaseCollision/btBroadphaseProxy.h \
+	BulletCollision/BroadphaseCollision/btCollisionAlgorithm.cpp \
+	BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h \
+	BulletCollision/BroadphaseCollision/btDbvtBroadphase.cpp \
+	BulletCollision/BroadphaseCollision/btDbvtBroadphase.h \
+	BulletCollision/BroadphaseCollision/btDbvt.cpp \
+	BulletCollision/BroadphaseCollision/btDbvt.h \
+	BulletCollision/BroadphaseCollision/btDispatcher.cpp \
+	BulletCollision/BroadphaseCollision/btDispatcher.h \
+	BulletCollision/BroadphaseCollision/btMultiSapBroadphase.cpp \
+	BulletCollision/BroadphaseCollision/btMultiSapBroadphase.h \
+	BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp \
+	BulletCollision/BroadphaseCollision/btOverlappingPairCache.h \
+	BulletCollision/BroadphaseCollision/btOverlappingPairCallback.h \
+	BulletCollision/BroadphaseCollision/btQuantizedBvh.cpp \
+	BulletCollision/BroadphaseCollision/btQuantizedBvh.h \
+	BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp \
+	BulletCollision/BroadphaseCollision/btSimpleBroadphase.h \
+	BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btBoxBoxDetector.cpp \
+	BulletCollision/CollisionDispatch/btBoxBoxDetector.h \
+	BulletCollision/CollisionDispatch/btCollisionConfiguration.h \
+	BulletCollision/CollisionDispatch/btCollisionCreateFunc.h \
+	BulletCollision/CollisionDispatch/btCollisionDispatcher.cpp \
+	BulletCollision/CollisionDispatch/btCollisionDispatcher.h \
+	BulletCollision/CollisionDispatch/btCollisionObject.cpp \
+	BulletCollision/CollisionDispatch/btCollisionObject.h \
+	BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h \
+	BulletCollision/CollisionDispatch/btCollisionWorld.cpp \
+	BulletCollision/CollisionDispatch/btCollisionWorld.h \
+	BulletCollision/CollisionDispatch/btCollisionWorldImporter.cpp \
+	BulletCollision/CollisionDispatch/btCollisionWorldImporter.h \
+	BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btCompoundCompoundCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btCompoundCompoundCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h \
+	BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h \
+	BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.cpp \
+	BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.h \
+	BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btGhostObject.cpp \
+	BulletCollision/CollisionDispatch/btGhostObject.h \
+	BulletCollision/CollisionDispatch/btHashedSimplePairCache.cpp \
+	BulletCollision/CollisionDispatch/btHashedSimplePairCache.h \
+	BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp \
+	BulletCollision/CollisionDispatch/btInternalEdgeUtility.h \
+	BulletCollision/CollisionDispatch/btManifoldResult.cpp \
+	BulletCollision/CollisionDispatch/btManifoldResult.h \
+	BulletCollision/CollisionDispatch/btSimulationIslandManager.cpp \
+	BulletCollision/CollisionDispatch/btSimulationIslandManager.h \
+	BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.cpp \
+	BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h \
+	BulletCollision/CollisionDispatch/btUnionFind.cpp \
+	BulletCollision/CollisionDispatch/btUnionFind.h \
+	BulletCollision/CollisionDispatch/SphereTriangleDetector.cpp \
+	BulletCollision/CollisionDispatch/SphereTriangleDetector.h \
+	BulletCollision/CollisionShapes/btBox2dShape.cpp \
+	BulletCollision/CollisionShapes/btBox2dShape.h \
+	BulletCollision/CollisionShapes/btBoxShape.cpp \
+	BulletCollision/CollisionShapes/btBoxShape.h \
+	BulletCollision/CollisionShapes/btBvhTriangleMeshShape.cpp \
+	BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h \
+	BulletCollision/CollisionShapes/btCapsuleShape.cpp \
+	BulletCollision/CollisionShapes/btCapsuleShape.h \
+	BulletCollision/CollisionShapes/btCollisionMargin.h \
+	BulletCollision/CollisionShapes/btCollisionShape.cpp \
+	BulletCollision/CollisionShapes/btCollisionShape.h \
+	BulletCollision/CollisionShapes/btCompoundShape.cpp \
+	BulletCollision/CollisionShapes/btCompoundShape.h \
+	BulletCollision/CollisionShapes/btConcaveShape.cpp \
+	BulletCollision/CollisionShapes/btConcaveShape.h \
+	BulletCollision/CollisionShapes/btConeShape.cpp \
+	BulletCollision/CollisionShapes/btConeShape.h \
+	BulletCollision/CollisionShapes/btConvex2dShape.cpp \
+	BulletCollision/CollisionShapes/btConvex2dShape.h \
+	BulletCollision/CollisionShapes/btConvexHullShape.cpp \
+	BulletCollision/CollisionShapes/btConvexHullShape.h \
+	BulletCollision/CollisionShapes/btConvexInternalShape.cpp \
+	BulletCollision/CollisionShapes/btConvexInternalShape.h \
+	BulletCollision/CollisionShapes/btConvexPointCloudShape.cpp \
+	BulletCollision/CollisionShapes/btConvexPointCloudShape.h \
+	BulletCollision/CollisionShapes/btConvexPolyhedron.cpp \
+	BulletCollision/CollisionShapes/btConvexPolyhedron.h \
+	BulletCollision/CollisionShapes/btConvexShape.cpp \
+	BulletCollision/CollisionShapes/btConvexShape.h \
+	BulletCollision/CollisionShapes/btConvexTriangleMeshShape.cpp \
+	BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h \
+	BulletCollision/CollisionShapes/btCylinderShape.cpp \
+	BulletCollision/CollisionShapes/btCylinderShape.h \
+	BulletCollision/CollisionShapes/btEmptyShape.cpp \
+	BulletCollision/CollisionShapes/btEmptyShape.h \
+	BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp \
+	BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h \
+	BulletCollision/CollisionShapes/btMaterial.h \
+	BulletCollision/CollisionShapes/btMinkowskiSumShape.cpp \
+	BulletCollision/CollisionShapes/btMinkowskiSumShape.h \
+	BulletCollision/CollisionShapes/btMultimaterialTriangleMeshShape.cpp \
+	BulletCollision/CollisionShapes/btMultimaterialTriangleMeshShape.h \
+	BulletCollision/CollisionShapes/btMultiSphereShape.cpp \
+	BulletCollision/CollisionShapes/btMultiSphereShape.h \
+	BulletCollision/CollisionShapes/btOptimizedBvh.cpp \
+	BulletCollision/CollisionShapes/btOptimizedBvh.h \
+	BulletCollision/CollisionShapes/btPolyhedralConvexShape.cpp \
+	BulletCollision/CollisionShapes/btPolyhedralConvexShape.h \
+	BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.cpp \
+	BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h \
+	BulletCollision/CollisionShapes/btShapeHull.cpp \
+	BulletCollision/CollisionShapes/btShapeHull.h \
+	BulletCollision/CollisionShapes/btSphereShape.cpp \
+	BulletCollision/CollisionShapes/btSphereShape.h \
+	BulletCollision/CollisionShapes/btStaticPlaneShape.cpp \
+	BulletCollision/CollisionShapes/btStaticPlaneShape.h \
+	BulletCollision/CollisionShapes/btStridingMeshInterface.cpp \
+	BulletCollision/CollisionShapes/btStridingMeshInterface.h \
+	BulletCollision/CollisionShapes/btTetrahedronShape.cpp \
+	BulletCollision/CollisionShapes/btTetrahedronShape.h \
+	BulletCollision/CollisionShapes/btTriangleBuffer.cpp \
+	BulletCollision/CollisionShapes/btTriangleBuffer.h \
+	BulletCollision/CollisionShapes/btTriangleCallback.cpp \
+	BulletCollision/CollisionShapes/btTriangleCallback.h \
+	BulletCollision/CollisionShapes/btTriangleIndexVertexArray.cpp \
+	BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h \
+	BulletCollision/CollisionShapes/btTriangleIndexVertexMaterialArray.cpp \
+	BulletCollision/CollisionShapes/btTriangleIndexVertexMaterialArray.h \
+	BulletCollision/CollisionShapes/btTriangleInfoMap.h \
+	BulletCollision/CollisionShapes/btTriangleMesh.cpp \
+	BulletCollision/CollisionShapes/btTriangleMesh.h \
+	BulletCollision/CollisionShapes/btTriangleMeshShape.cpp \
+	BulletCollision/CollisionShapes/btTriangleMeshShape.h \
+	BulletCollision/CollisionShapes/btTriangleShape.h \
+	BulletCollision/CollisionShapes/btUniformScalingShape.cpp \
+	BulletCollision/CollisionShapes/btUniformScalingShape.h \
+	BulletCollision/Gimpact/btBoxCollision.h \
+	BulletCollision/Gimpact/btClipPolygon.h \
+	BulletCollision/Gimpact/btCompoundFromGimpact.h \
+	BulletCollision/Gimpact/btContactProcessing.cpp \
+	BulletCollision/Gimpact/btContactProcessing.h \
+	BulletCollision/Gimpact/btGenericPoolAllocator.cpp \
+	BulletCollision/Gimpact/btGenericPoolAllocator.h \
+	BulletCollision/Gimpact/btGeometryOperations.h \
+	BulletCollision/Gimpact/btGImpactBvh.cpp \
+	BulletCollision/Gimpact/btGImpactBvh.h \
+	BulletCollision/Gimpact/btGImpactCollisionAlgorithm.cpp \
+	BulletCollision/Gimpact/btGImpactCollisionAlgorithm.h \
+	BulletCollision/Gimpact/btGImpactMassUtil.h \
+	BulletCollision/Gimpact/btGImpactQuantizedBvh.cpp \
+	BulletCollision/Gimpact/btGImpactQuantizedBvh.h \
+	BulletCollision/Gimpact/btGImpactShape.cpp \
+	BulletCollision/Gimpact/btGImpactShape.h \
+	BulletCollision/Gimpact/btQuantization.h \
+	BulletCollision/Gimpact/btTriangleShapeEx.cpp \
+	BulletCollision/Gimpact/btTriangleShapeEx.h \
+	BulletCollision/Gimpact/gim_array.h \
+	BulletCollision/Gimpact/gim_basic_geometry_operations.h \
+	BulletCollision/Gimpact/gim_bitset.h \
+	BulletCollision/Gimpact/gim_box_collision.h \
+	BulletCollision/Gimpact/gim_box_set.cpp \
+	BulletCollision/Gimpact/gim_box_set.h \
+	BulletCollision/Gimpact/gim_clip_polygon.h \
+	BulletCollision/Gimpact/gim_contact.cpp \
+	BulletCollision/Gimpact/gim_contact.h \
+	BulletCollision/Gimpact/gim_geometry.h \
+	BulletCollision/Gimpact/gim_geom_types.h \
+	BulletCollision/Gimpact/gim_hash_table.h \
+	BulletCollision/Gimpact/gim_linear_math.h \
+	BulletCollision/Gimpact/gim_math.h \
+	BulletCollision/Gimpact/gim_memory.cpp \
+	BulletCollision/Gimpact/gim_memory.h \
+	BulletCollision/Gimpact/gim_radixsort.h \
+	BulletCollision/Gimpact/gim_tri_collision.cpp \
+	BulletCollision/Gimpact/gim_tri_collision.h \
+	BulletCollision/NarrowPhaseCollision/btComputeGjkEpaPenetration.h \
+	BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.cpp \
+	BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.h \
+	BulletCollision/NarrowPhaseCollision/btConvexCast.cpp \
+	BulletCollision/NarrowPhaseCollision/btConvexCast.h \
+	BulletCollision/NarrowPhaseCollision/btConvexPenetrationDepthSolver.h \
+	BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h \
+	BulletCollision/NarrowPhaseCollision/btGjkCollisionDescription.h \
+	BulletCollision/NarrowPhaseCollision/btGjkConvexCast.cpp \
+	BulletCollision/NarrowPhaseCollision/btGjkConvexCast.h \
+	BulletCollision/NarrowPhaseCollision/btGjkEpa2.cpp \
+	BulletCollision/NarrowPhaseCollision/btGjkEpa2.h \
+	BulletCollision/NarrowPhaseCollision/btGjkEpa3.h \
+	BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.cpp \
+	BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h \
+	BulletCollision/NarrowPhaseCollision/btGjkPairDetector.cpp \
+	BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h \
+	BulletCollision/NarrowPhaseCollision/btManifoldPoint.h \
+	BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.cpp \
+	BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.h \
+	BulletCollision/NarrowPhaseCollision/btMprPenetration.h \
+	BulletCollision/NarrowPhaseCollision/btPersistentManifold.cpp \
+	BulletCollision/NarrowPhaseCollision/btPersistentManifold.h \
+	BulletCollision/NarrowPhaseCollision/btPointCollector.h \
+	BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.cpp \
+	BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h \
+	BulletCollision/NarrowPhaseCollision/btRaycastCallback.cpp \
+	BulletCollision/NarrowPhaseCollision/btRaycastCallback.h \
+	BulletCollision/NarrowPhaseCollision/btSimplexSolverInterface.h \
+	BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.cpp \
+	BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.h \
+	BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.cpp \
+	BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h \
+	$(NULL)
 bullet_sources += \
-    LinearMath/btQuickprof.cpp \
-    LinearMath/btGeometryUtil.cpp \
-    LinearMath/btAlignedAllocator.cpp \
-    LinearMath/btSerializer.cpp \
-    LinearMath/btConvexHull.cpp \
-    LinearMath/btConvexHullComputer.cpp \
-    LinearMath/btConvexHullComputer.h \
-    LinearMath/btGrahamScan2dConvexHull.h \
-    LinearMath/btHashMap.h \
-    LinearMath/btConvexHull.h \
-    LinearMath/btAabbUtil2.h \
-    LinearMath/btGeometryUtil.h \
-    LinearMath/btQuadWord.h \
-    LinearMath/btPoolAllocator.h \
-    LinearMath/btScalar.h \
-    LinearMath/btMinMax.h \
-    LinearMath/btVector3.h \
-    LinearMath/btList.h \
-    LinearMath/btStackAlloc.h \
-    LinearMath/btMatrix3x3.h \
-    LinearMath/btMotionState.h \
-    LinearMath/btAlignedAllocator.h \
-    LinearMath/btQuaternion.h \
-    LinearMath/btAlignedObjectArray.h \
-    LinearMath/btQuickprof.h \
-    LinearMath/btSerializer.h \
-    LinearMath/btTransformUtil.h \
-    LinearMath/btTransform.h \
-    LinearMath/btDefaultMotionState.h \
-    LinearMath/btIDebugDraw.h \
-    LinearMath/btRandom.h
+	BulletDynamics/Character/btCharacterControllerInterface.h \
+	BulletDynamics/Character/btKinematicCharacterController.cpp \
+	BulletDynamics/Character/btKinematicCharacterController.h \
+	BulletDynamics/ConstraintSolver/btConeTwistConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btConeTwistConstraint.h \
+	BulletDynamics/ConstraintSolver/btConstraintSolver.h \
+	BulletDynamics/ConstraintSolver/btContactConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btContactConstraint.h \
+	BulletDynamics/ConstraintSolver/btContactSolverInfo.h \
+	BulletDynamics/ConstraintSolver/btFixedConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btFixedConstraint.h \
+	BulletDynamics/ConstraintSolver/btGearConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btGearConstraint.h \
+	BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h \
+	BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.cpp \
+	BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.h \
+	BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h \
+	BulletDynamics/ConstraintSolver/btHinge2Constraint.cpp \
+	BulletDynamics/ConstraintSolver/btHinge2Constraint.h \
+	BulletDynamics/ConstraintSolver/btHingeConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btHingeConstraint.h \
+	BulletDynamics/ConstraintSolver/btJacobianEntry.h \
+	BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.cpp \
+	BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.h \
+	BulletDynamics/ConstraintSolver/btPoint2PointConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h \
+	BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp \
+	BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h \
+	BulletDynamics/ConstraintSolver/btSliderConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btSliderConstraint.h \
+	BulletDynamics/ConstraintSolver/btSolve2LinearConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btSolve2LinearConstraint.h \
+	BulletDynamics/ConstraintSolver/btSolverBody.h \
+	BulletDynamics/ConstraintSolver/btSolverConstraint.h \
+	BulletDynamics/ConstraintSolver/btTypedConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btTypedConstraint.h \
+	BulletDynamics/ConstraintSolver/btUniversalConstraint.cpp \
+	BulletDynamics/ConstraintSolver/btUniversalConstraint.h \
+	BulletDynamics/Dynamics/btActionInterface.h \
+	BulletDynamics/Dynamics/btDiscreteDynamicsWorld.cpp \
+	BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h \
+	BulletDynamics/Dynamics/btDynamicsWorld.h \
+	BulletDynamics/Dynamics/btRigidBody.cpp \
+	BulletDynamics/Dynamics/btRigidBody.h \
+	BulletDynamics/Dynamics/btSimpleDynamicsWorld.cpp \
+	BulletDynamics/Dynamics/btSimpleDynamicsWorld.h \
+	BulletDynamics/Featherstone/btMultiBodyConstraint.cpp \
+	BulletDynamics/Featherstone/btMultiBodyConstraint.h \
+	BulletDynamics/Featherstone/btMultiBodyConstraintSolver.cpp \
+	BulletDynamics/Featherstone/btMultiBodyConstraintSolver.h \
+	BulletDynamics/Featherstone/btMultiBody.cpp \
+	BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.cpp \
+	BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.h \
+	BulletDynamics/Featherstone/btMultiBodyFixedConstraint.cpp \
+	BulletDynamics/Featherstone/btMultiBodyFixedConstraint.h \
+	BulletDynamics/Featherstone/btMultiBody.h \
+	BulletDynamics/Featherstone/btMultiBodyJointFeedback.h \
+	BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.cpp \
+	BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.h \
+	BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp \
+	BulletDynamics/Featherstone/btMultiBodyJointMotor.h \
+	BulletDynamics/Featherstone/btMultiBodyLinkCollider.h \
+	BulletDynamics/Featherstone/btMultiBodyLink.h \
+	BulletDynamics/Featherstone/btMultiBodyPoint2Point.cpp \
+	BulletDynamics/Featherstone/btMultiBodyPoint2Point.h \
+	BulletDynamics/Featherstone/btMultiBodySliderConstraint.cpp \
+	BulletDynamics/Featherstone/btMultiBodySliderConstraint.h \
+	BulletDynamics/Featherstone/btMultiBodySolverConstraint.h \
+	BulletDynamics/MLCPSolvers/btDantzigLCP.cpp \
+	BulletDynamics/MLCPSolvers/btDantzigLCP.h \
+	BulletDynamics/MLCPSolvers/btDantzigSolver.h \
+	BulletDynamics/MLCPSolvers/btLemkeAlgorithm.cpp \
+	BulletDynamics/MLCPSolvers/btLemkeAlgorithm.h \
+	BulletDynamics/MLCPSolvers/btLemkeSolver.h \
+	BulletDynamics/MLCPSolvers/btMLCPSolver.cpp \
+	BulletDynamics/MLCPSolvers/btMLCPSolver.h \
+	BulletDynamics/MLCPSolvers/btMLCPSolverInterface.h \
+	BulletDynamics/MLCPSolvers/btPATHSolver.h \
+	BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h \
+	BulletDynamics/Vehicle/btRaycastVehicle.cpp \
+	BulletDynamics/Vehicle/btRaycastVehicle.h \
+	BulletDynamics/Vehicle/btVehicleRaycaster.h \
+	BulletDynamics/Vehicle/btWheelInfo.cpp \
+	BulletDynamics/Vehicle/btWheelInfo.h \
+	$(NULL)
 bullet_sources += \
-    BulletCollision/NarrowPhaseCollision/btRaycastCallback.cpp \
-    BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.cpp \
-    BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.cpp \
-    BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.cpp \
-    BulletCollision/NarrowPhaseCollision/btGjkConvexCast.cpp \
-    BulletCollision/NarrowPhaseCollision/btPersistentManifold.cpp \
-    BulletCollision/NarrowPhaseCollision/btConvexCast.cpp \
-    BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.cpp \
-    BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h \
-    BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.cpp \
-    BulletCollision/NarrowPhaseCollision/btGjkPairDetector.cpp \
-    BulletCollision/NarrowPhaseCollision/btGjkEpa2.cpp \
-    BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.cpp \
-    BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/btCollisionObject.cpp \
-    BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/btGhostObject.cpp \
-    BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/btCollisionDispatcher.cpp \
-    BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.cpp \
-    BulletCollision/CollisionDispatch/btSimulationIslandManager.cpp \
-    BulletCollision/CollisionDispatch/btBoxBoxDetector.cpp \
-    BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/SphereTriangleDetector.cpp \
-    BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp \
-    BulletCollision/CollisionDispatch/btManifoldResult.cpp \
-    BulletCollision/CollisionDispatch/btCollisionWorld.cpp \
-    BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.cpp \
-    BulletCollision/CollisionDispatch/btUnionFind.cpp \
-    BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp \
-    BulletCollision/CollisionShapes/btTetrahedronShape.cpp \
-    BulletCollision/CollisionShapes/btShapeHull.cpp \
-    BulletCollision/CollisionShapes/btMinkowskiSumShape.cpp \
-    BulletCollision/CollisionShapes/btCompoundShape.cpp \
-    BulletCollision/CollisionShapes/btConeShape.cpp \
-    BulletCollision/CollisionShapes/btConvexPolyhedron.cpp \
-    BulletCollision/CollisionShapes/btConvexPolyhedron.h \
-    BulletCollision/CollisionShapes/btMultiSphereShape.cpp \
-    BulletCollision/CollisionShapes/btUniformScalingShape.cpp \
-    BulletCollision/CollisionShapes/btSphereShape.cpp \
-    BulletCollision/CollisionShapes/btTriangleIndexVertexArray.cpp \
-    BulletCollision/CollisionShapes/btBvhTriangleMeshShape.cpp \
-    BulletCollision/CollisionShapes/btTriangleMeshShape.cpp \
-    BulletCollision/CollisionShapes/btTriangleBuffer.cpp \
-    BulletCollision/CollisionShapes/btStaticPlaneShape.cpp \
-    BulletCollision/CollisionShapes/btPolyhedralConvexShape.cpp \
-    BulletCollision/CollisionShapes/btEmptyShape.cpp \
-    BulletCollision/CollisionShapes/btCollisionShape.cpp \
-    BulletCollision/CollisionShapes/btConvexShape.cpp \
-    BulletCollision/CollisionShapes/btConvex2dShape.cpp \
-    BulletCollision/CollisionShapes/btConvexInternalShape.cpp \
-    BulletCollision/CollisionShapes/btConvexHullShape.cpp \
-    BulletCollision/CollisionShapes/btTriangleCallback.cpp \
-    BulletCollision/CollisionShapes/btCapsuleShape.cpp \
-    BulletCollision/CollisionShapes/btConvexTriangleMeshShape.cpp \
-    BulletCollision/CollisionShapes/btConcaveShape.cpp \
-    BulletCollision/CollisionShapes/btConvexPointCloudShape.cpp \
-    BulletCollision/CollisionShapes/btBoxShape.cpp \
-    BulletCollision/CollisionShapes/btBox2dShape.cpp \
-    BulletCollision/CollisionShapes/btOptimizedBvh.cpp \
-    BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp \
-    BulletCollision/CollisionShapes/btMultimaterialTriangleMeshShape.cpp \
-    BulletCollision/CollisionShapes/btCylinderShape.cpp \
-    BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.cpp \
-    BulletCollision/CollisionShapes/btStridingMeshInterface.cpp \
-    BulletCollision/CollisionShapes/btTriangleIndexVertexMaterialArray.cpp \
-    BulletCollision/CollisionShapes/btTriangleMesh.cpp \
-    BulletCollision/BroadphaseCollision/btAxisSweep3.cpp \
-    BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp \
-    BulletCollision/BroadphaseCollision/btDbvtBroadphase.cpp \
-    BulletCollision/BroadphaseCollision/btMultiSapBroadphase.cpp \
-    BulletCollision/BroadphaseCollision/btDispatcher.cpp \
-    BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp \
-    BulletCollision/BroadphaseCollision/btQuantizedBvh.cpp \
-    BulletCollision/BroadphaseCollision/btCollisionAlgorithm.cpp \
-    BulletCollision/BroadphaseCollision/btDbvt.cpp \
-    BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp \
-    BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h \
-    BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h \
-    BulletCollision/NarrowPhaseCollision/btConvexCast.h \
-    BulletCollision/NarrowPhaseCollision/btGjkEpa2.h \
-    BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h \
-    BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.h \
-    BulletCollision/NarrowPhaseCollision/btPointCollector.h \
-    BulletCollision/NarrowPhaseCollision/btConvexPenetrationDepthSolver.h \
-    BulletCollision/NarrowPhaseCollision/btSimplexSolverInterface.h \
-    BulletCollision/NarrowPhaseCollision/btRaycastCallback.h \
-    BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.h \
-    BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.h \
-    BulletCollision/NarrowPhaseCollision/btPersistentManifold.h \
-    BulletCollision/NarrowPhaseCollision/btGjkConvexCast.h \
-    BulletCollision/NarrowPhaseCollision/btManifoldPoint.h \
-    BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h \
-    BulletCollision/CollisionDispatch/btCollisionObject.h \
-    BulletCollision/CollisionDispatch/btGhostObject.h \
-    BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btCollisionCreateFunc.h \
-    BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h \
-    BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h \
-    BulletCollision/CollisionDispatch/btBoxBoxDetector.h \
-    BulletCollision/CollisionDispatch/btCollisionDispatcher.h \
-    BulletCollision/CollisionDispatch/SphereTriangleDetector.h \
-    BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btUnionFind.h \
-    BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btSimulationIslandManager.h \
-    BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.h \
-    BulletCollision/CollisionDispatch/btCollisionWorld.h \
-    BulletCollision/CollisionDispatch/btInternalEdgeUtility.h \
-    BulletCollision/CollisionDispatch/btManifoldResult.h \
-    BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btCollisionConfiguration.h \
-    BulletCollision/CollisionShapes/btConvexShape.h \
-    BulletCollision/CollisionShapes/btConvex2dShape.h \
-    BulletCollision/CollisionShapes/btTriangleCallback.h \
-    BulletCollision/CollisionShapes/btPolyhedralConvexShape.h \
-    BulletCollision/CollisionShapes/btMultimaterialTriangleMeshShape.h \
-    BulletCollision/CollisionShapes/btCompoundShape.h \
-    BulletCollision/CollisionShapes/btBoxShape.h \
-    BulletCollision/CollisionShapes/btBox2dShape.h \
-    BulletCollision/CollisionShapes/btMultiSphereShape.h \
-    BulletCollision/CollisionShapes/btCollisionMargin.h \
-    BulletCollision/CollisionShapes/btConcaveShape.h \
-    BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h \
-    BulletCollision/CollisionShapes/btEmptyShape.h \
-    BulletCollision/CollisionShapes/btUniformScalingShape.h \
-    BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h \
-    BulletCollision/CollisionShapes/btMaterial.h \
-    BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h \
-    BulletCollision/CollisionShapes/btTriangleInfoMap.h \
-    BulletCollision/CollisionShapes/btSphereShape.h \
-    BulletCollision/CollisionShapes/btConvexPointCloudShape.h \
-    BulletCollision/CollisionShapes/btCapsuleShape.h \
-    BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h \
-    BulletCollision/CollisionShapes/btCollisionShape.h \
-    BulletCollision/CollisionShapes/btStaticPlaneShape.h \
-    BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h \
-    BulletCollision/CollisionShapes/btTriangleMeshShape.h \
-    BulletCollision/CollisionShapes/btStridingMeshInterface.h \
-    BulletCollision/CollisionShapes/btTriangleMesh.h \
-    BulletCollision/CollisionShapes/btTriangleBuffer.h \
-    BulletCollision/CollisionShapes/btShapeHull.h \
-    BulletCollision/CollisionShapes/btMinkowskiSumShape.h \
-    BulletCollision/CollisionShapes/btOptimizedBvh.h \
-    BulletCollision/CollisionShapes/btTriangleShape.h \
-    BulletCollision/CollisionShapes/btTriangleIndexVertexMaterialArray.h \
-    BulletCollision/CollisionShapes/btCylinderShape.h \
-    BulletCollision/CollisionShapes/btTetrahedronShape.h \
-    BulletCollision/CollisionShapes/btConvexInternalShape.h \
-    BulletCollision/CollisionShapes/btConeShape.h \
-    BulletCollision/CollisionShapes/btConvexHullShape.h \
-    BulletCollision/BroadphaseCollision/btAxisSweep3.h \
-    BulletCollision/BroadphaseCollision/btDbvtBroadphase.h \
-    BulletCollision/BroadphaseCollision/btSimpleBroadphase.h \
-    BulletCollision/BroadphaseCollision/btMultiSapBroadphase.h \
-    BulletCollision/BroadphaseCollision/btDbvt.h \
-    BulletCollision/BroadphaseCollision/btOverlappingPairCallback.h \
-    BulletCollision/BroadphaseCollision/btDispatcher.h \
-    BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h \
-    BulletCollision/BroadphaseCollision/btBroadphaseProxy.h \
-    BulletCollision/BroadphaseCollision/btOverlappingPairCache.h \
-    BulletCollision/BroadphaseCollision/btBroadphaseInterface.h \
-    BulletCollision/BroadphaseCollision/btQuantizedBvh.h \
-    BulletCollision/Gimpact/btGImpactBvh.cpp\
-    BulletCollision/Gimpact/btGImpactQuantizedBvh.cpp\
-    BulletCollision/Gimpact/btTriangleShapeEx.cpp\
-    BulletCollision/Gimpact/btGImpactCollisionAlgorithm.cpp\
-    BulletCollision/Gimpact/btGImpactShape.cpp\
-    BulletCollision/Gimpact/gim_box_set.cpp\
-    BulletCollision/Gimpact/gim_contact.cpp\
-    BulletCollision/Gimpact/gim_memory.cpp\
-    BulletCollision/Gimpact/gim_tri_collision.cpp
+	BulletInverseDynamics/details/IDEigenInterface.hpp \
+	BulletInverseDynamics/details/IDLinearMathInterface.hpp \
+	BulletInverseDynamics/details/IDMatVec.hpp \
+	BulletInverseDynamics/details/MultiBodyTreeImpl.cpp \
+	BulletInverseDynamics/details/MultiBodyTreeImpl.hpp \
+	BulletInverseDynamics/details/MultiBodyTreeInitCache.cpp \
+	BulletInverseDynamics/details/MultiBodyTreeInitCache.hpp \
+	BulletInverseDynamics/IDConfigBuiltin.hpp \
+	BulletInverseDynamics/IDConfigEigen.hpp \
+	BulletInverseDynamics/IDConfig.hpp \
+	BulletInverseDynamics/IDErrorMessages.hpp \
+	BulletInverseDynamics/IDMath.cpp \
+	BulletInverseDynamics/IDMath.hpp \
+	BulletInverseDynamics/MultiBodyTree.cpp \
+	BulletInverseDynamics/MultiBodyTree.hpp \
+	$(NULL)
 bullet_sources += \
-    BulletDynamics/Dynamics/btRigidBody.cpp \
-    BulletDynamics/Dynamics/btSimpleDynamicsWorld.cpp \
-    BulletDynamics/Dynamics/Bullet-C-API.cpp \
-    BulletDynamics/Dynamics/btDiscreteDynamicsWorld.cpp \
-    BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.cpp \
-    BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.cpp \
-    BulletDynamics/ConstraintSolver/btSolve2LinearConstraint.cpp \
-    BulletDynamics/ConstraintSolver/btSolve2LinearConstraint.h \
-    BulletDynamics/ConstraintSolver/btPoint2PointConstraint.cpp \
-    BulletDynamics/ConstraintSolver/btTypedConstraint.cpp \
-    BulletDynamics/ConstraintSolver/btContactConstraint.cpp \
-    BulletDynamics/ConstraintSolver/btSliderConstraint.cpp \
-    BulletDynamics/ConstraintSolver/btConeTwistConstraint.cpp \
-    BulletDynamics/ConstraintSolver/btHingeConstraint.cpp \
-    BulletDynamics/ConstraintSolver/btHinge2Constraint.cpp \
-    BulletDynamics/ConstraintSolver/btUniversalConstraint.cpp \
-    BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp \
-    BulletDynamics/Vehicle/btWheelInfo.cpp \
-    BulletDynamics/Vehicle/btRaycastVehicle.cpp \
-    BulletDynamics/Character/btKinematicCharacterController.cpp \
-    BulletDynamics/Character/btKinematicCharacterController.h \
-    BulletDynamics/Character/btCharacterControllerInterface.h \
-    BulletDynamics/Dynamics/btActionInterface.h \
-    BulletDynamics/Dynamics/btSimpleDynamicsWorld.h \
-    BulletDynamics/Dynamics/btRigidBody.h \
-    BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h \
-    BulletDynamics/Dynamics/btDynamicsWorld.h \
-    BulletDynamics/ConstraintSolver/btSolverBody.h \
-    BulletDynamics/ConstraintSolver/btConstraintSolver.h \
-    BulletDynamics/ConstraintSolver/btConeTwistConstraint.h \
-    BulletDynamics/ConstraintSolver/btTypedConstraint.h \
-    BulletDynamics/ConstraintSolver/btContactSolverInfo.h \
-    BulletDynamics/ConstraintSolver/btContactConstraint.h \
-    BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h \
-    BulletDynamics/ConstraintSolver/btJacobianEntry.h \
-    BulletDynamics/ConstraintSolver/btSolverConstraint.h \
-    BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h \
-    BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h \
-    BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h \
-    BulletDynamics/ConstraintSolver/btSliderConstraint.h \
-    BulletDynamics/ConstraintSolver/btHingeConstraint.h \
-    BulletDynamics/ConstraintSolver/btHinge2Constraint.h \
-    BulletDynamics/ConstraintSolver/btUniversalConstraint.h \
-    BulletDynamics/Vehicle/btVehicleRaycaster.h \
-    BulletDynamics/Vehicle/btRaycastVehicle.h \
-    BulletDynamics/Vehicle/btWheelInfo.h
+	BulletSoftBody/btDefaultSoftBodySolver.cpp \
+	BulletSoftBody/btDefaultSoftBodySolver.h \
+	BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.cpp \
+	BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h \
+	BulletSoftBody/btSoftBody.cpp \
+	BulletSoftBody/btSoftBodyData.h \
+	BulletSoftBody/btSoftBody.h \
+	BulletSoftBody/btSoftBodyHelpers.cpp \
+	BulletSoftBody/btSoftBodyHelpers.h \
+	BulletSoftBody/btSoftBodyInternals.h \
+	BulletSoftBody/btSoftBodyRigidBodyCollisionConfiguration.cpp \
+	BulletSoftBody/btSoftBodyRigidBodyCollisionConfiguration.h \
+	BulletSoftBody/btSoftBodySolvers.h \
+	BulletSoftBody/btSoftBodySolverVertexBuffer.h \
+	BulletSoftBody/btSoftRigidCollisionAlgorithm.cpp \
+	BulletSoftBody/btSoftRigidCollisionAlgorithm.h \
+	BulletSoftBody/btSoftRigidDynamicsWorld.cpp \
+	BulletSoftBody/btSoftRigidDynamicsWorld.h \
+	BulletSoftBody/btSoftSoftCollisionAlgorithm.cpp \
+	BulletSoftBody/btSoftSoftCollisionAlgorithm.h \
+	BulletSoftBody/btSparseSDF.h \
+	$(NULL)
 bullet_sources += \
-    BulletSoftBody/btDefaultSoftBodySolver.cpp \
-    BulletSoftBody/btDefaultSoftBodySolver.h \
-    BulletSoftBody/btSoftBodyRigidBodyCollisionConfiguration.cpp \
-    BulletSoftBody/btSoftBody.cpp \
-    BulletSoftBody/btSoftBodySolvers.h \
-    BulletSoftBody/btSoftBodySolverVertexBuffer.h \
-    BulletSoftBody/btSoftBodyData.h \
-    BulletSoftBody/btSoftRigidCollisionAlgorithm.cpp \
-    BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.cpp \
-    BulletSoftBody/btSoftRigidDynamicsWorld.cpp \
-    BulletSoftBody/btSoftBodyHelpers.cpp \
-    BulletSoftBody/btSoftSoftCollisionAlgorithm.cpp \
-    BulletSoftBody/btSparseSDF.h \
-    BulletSoftBody/btSoftRigidCollisionAlgorithm.h \
-    BulletSoftBody/btSoftBodyRigidBodyCollisionConfiguration.h \
-    BulletSoftBody/btSoftBody.h \
-    BulletSoftBody/btSoftSoftCollisionAlgorithm.h \
-    BulletSoftBody/btSoftBodyInternals.h \
-    BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h \
-    BulletSoftBody/btSoftRigidDynamicsWorld.h \
-    BulletSoftBody/btSoftBodyHelpers.h
+	LinearMath/btAabbUtil2.h \
+	LinearMath/btAlignedAllocator.cpp \
+	LinearMath/btAlignedAllocator.h \
+	LinearMath/btAlignedObjectArray.h \
+	LinearMath/btConvexHullComputer.cpp \
+	LinearMath/btConvexHullComputer.h \
+	LinearMath/btConvexHull.cpp \
+	LinearMath/btConvexHull.h \
+	LinearMath/btCpuFeatureUtility.h \
+	LinearMath/btDefaultMotionState.h \
+	LinearMath/btGeometryUtil.cpp \
+	LinearMath/btGeometryUtil.h \
+	LinearMath/btGrahamScan2dConvexHull.h \
+	LinearMath/btHashMap.h \
+	LinearMath/btIDebugDraw.h \
+	LinearMath/btList.h \
+	LinearMath/btMatrix3x3.h \
+	LinearMath/btMatrixX.h \
+	LinearMath/btMinMax.h \
+	LinearMath/btMotionState.h \
+	LinearMath/btPolarDecomposition.cpp \
+	LinearMath/btPolarDecomposition.h \
+	LinearMath/btPoolAllocator.h \
+	LinearMath/btQuadWord.h \
+	LinearMath/btQuaternion.h \
+	LinearMath/btQuickprof.cpp \
+	LinearMath/btQuickprof.h \
+	LinearMath/btRandom.h \
+	LinearMath/btScalar.h \
+	LinearMath/btSerializer.cpp \
+	LinearMath/btSerializer.h \
+	LinearMath/btSpatialAlgebra.h \
+	LinearMath/btStackAlloc.h \
+	LinearMath/btTransform.h \
+	LinearMath/btTransformUtil.h \
+	LinearMath/btVector3.cpp \
+	LinearMath/btVector3.h \
+	$(NULL)
 bullet_sources += \
-    BulletSoftBody/btSoftBodyRigidBodyCollisionConfiguration.h \
-    BulletSoftBody/btSoftBodyInternals.h \
-    BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h \
-    BulletSoftBody/btSoftSoftCollisionAlgorithm.h \
-    BulletSoftBody/btSoftBody.h \
-    BulletSoftBody/btSoftBodyHelpers.h \
-    BulletSoftBody/btSparseSDF.h \
-    BulletSoftBody/btSoftRigidCollisionAlgorithm.h \
-    BulletSoftBody/btSoftRigidDynamicsWorld.h \
-    BulletDynamics/Vehicle/btRaycastVehicle.h \
-    BulletDynamics/Vehicle/btWheelInfo.h \
-    BulletDynamics/Vehicle/btVehicleRaycaster.h \
-    BulletDynamics/Dynamics/btActionInterface.h \
-    BulletDynamics/Dynamics/btRigidBody.h \
-    BulletDynamics/Dynamics/btDynamicsWorld.h \
-    BulletDynamics/Dynamics/btSimpleDynamicsWorld.h \
-    BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h \
-    BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h \
-    BulletDynamics/ConstraintSolver/btSolverConstraint.h \
-    BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h \
-    BulletDynamics/ConstraintSolver/btTypedConstraint.h \
-    BulletDynamics/ConstraintSolver/btSliderConstraint.h \
-    BulletDynamics/ConstraintSolver/btConstraintSolver.h \
-    BulletDynamics/ConstraintSolver/btContactConstraint.h \
-    BulletDynamics/ConstraintSolver/btContactSolverInfo.h \
-    BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h \
-    BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h \
-    BulletDynamics/ConstraintSolver/btJacobianEntry.h \
-    BulletDynamics/ConstraintSolver/btConeTwistConstraint.h \
-    BulletDynamics/ConstraintSolver/btHingeConstraint.h \
-    BulletDynamics/ConstraintSolver/btHinge2Constraint.h \
-    BulletDynamics/ConstraintSolver/btUniversalConstraint.h \
-    BulletDynamics/ConstraintSolver/btSolverBody.h \
-    BulletDynamics/Character/btCharacterControllerInterface.h \
-    BulletDynamics/Character/btKinematicCharacterController.h \
-    BulletCollision/CollisionShapes/btShapeHull.h \
-    BulletCollision/CollisionShapes/btConcaveShape.h \
-    BulletCollision/CollisionShapes/btCollisionMargin.h \
-    BulletCollision/CollisionShapes/btCompoundShape.h \
-    BulletCollision/CollisionShapes/btConvexHullShape.h \
-    BulletCollision/CollisionShapes/btCylinderShape.h \
-    BulletCollision/CollisionShapes/btTriangleMesh.h \
-    BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h \
-    BulletCollision/CollisionShapes/btUniformScalingShape.h \
-    BulletCollision/CollisionShapes/btConvexPointCloudShape.h \
-    BulletCollision/CollisionShapes/btTetrahedronShape.h \
-    BulletCollision/CollisionShapes/btCapsuleShape.h \
-    BulletCollision/CollisionShapes/btSphereShape.h \
-    BulletCollision/CollisionShapes/btMultiSphereShape.h \
-    BulletCollision/CollisionShapes/btConvexInternalShape.h \
-    BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h \
-    BulletCollision/CollisionShapes/btStridingMeshInterface.h \
-    BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h \
-    BulletCollision/CollisionShapes/btEmptyShape.h \
-    BulletCollision/CollisionShapes/btOptimizedBvh.h \
-    BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h \
-    BulletCollision/CollisionShapes/btTriangleCallback.h \
-    BulletCollision/CollisionShapes/btTriangleIndexVertexMaterialArray.h \
-    BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h \
-    BulletCollision/CollisionShapes/btTriangleInfoMap.h \
-    BulletCollision/CollisionShapes/btTriangleBuffer.h \
-    BulletCollision/CollisionShapes/btConvexShape.h \
-    BulletCollision/CollisionShapes/btConvex2dShape.h \
-    BulletCollision/CollisionShapes/btStaticPlaneShape.h \
-    BulletCollision/CollisionShapes/btConeShape.h \
-    BulletCollision/CollisionShapes/btCollisionShape.h \
-    BulletCollision/CollisionShapes/btTriangleShape.h \
-    BulletCollision/CollisionShapes/btBoxShape.h \
-    BulletCollision/CollisionShapes/btBox2dShape.h \
-    BulletCollision/CollisionShapes/btMinkowskiSumShape.h \
-    BulletCollision/CollisionShapes/btTriangleMeshShape.h \
-    BulletCollision/CollisionShapes/btMaterial.h \
-    BulletCollision/CollisionShapes/btMultimaterialTriangleMeshShape.h \
-    BulletCollision/CollisionShapes/btPolyhedralConvexShape.h \
-    BulletCollision/NarrowPhaseCollision/btConvexCast.h \
-    BulletCollision/NarrowPhaseCollision/btGjkEpa2.h \
-    BulletCollision/NarrowPhaseCollision/btSimplexSolverInterface.h \
-    BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.h \
-    BulletCollision/NarrowPhaseCollision/btConvexPenetrationDepthSolver.h \
-    BulletCollision/NarrowPhaseCollision/btGjkConvexCast.h \
-    BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h \
-    BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h \
-    BulletCollision/NarrowPhaseCollision/btPersistentManifold.h \
-    BulletCollision/NarrowPhaseCollision/btManifoldPoint.h \
-    BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h \
-    BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h \
-    BulletCollision/NarrowPhaseCollision/btRaycastCallback.h \
-    BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.h \
-    BulletCollision/NarrowPhaseCollision/btPointCollector.h \
-    BulletCollision/NarrowPhaseCollision/btMinkowskiPenetrationDepthSolver.h \
-    BulletCollision/BroadphaseCollision/btDbvt.h \
-    BulletCollision/BroadphaseCollision/btDispatcher.h \
-    BulletCollision/BroadphaseCollision/btDbvtBroadphase.h \
-    BulletCollision/BroadphaseCollision/btSimpleBroadphase.h \
-    BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h \
-    BulletCollision/BroadphaseCollision/btOverlappingPairCallback.h \
-    BulletCollision/BroadphaseCollision/btMultiSapBroadphase.h \
-    BulletCollision/BroadphaseCollision/btQuantizedBvh.h \
-    BulletCollision/BroadphaseCollision/btAxisSweep3.h \
-    BulletCollision/BroadphaseCollision/btBroadphaseInterface.h \
-    BulletCollision/BroadphaseCollision/btOverlappingPairCache.h \
-    BulletCollision/BroadphaseCollision/btBroadphaseProxy.h \
-    BulletCollision/CollisionDispatch/btUnionFind.h \
-    BulletCollision/CollisionDispatch/btCollisionConfiguration.h \
-    BulletCollision/CollisionDispatch/btCollisionDispatcher.h \
-    BulletCollision/CollisionDispatch/SphereTriangleDetector.h \
-    BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btCollisionWorld.h \
-    BulletCollision/CollisionDispatch/btCollisionCreateFunc.h \
-    BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h \
-    BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h \
-    BulletCollision/CollisionDispatch/btCollisionObject.h \
-    BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.h \
-    BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btGhostObject.h \
-    BulletCollision/CollisionDispatch/btSimulationIslandManager.h \
-    BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btBoxBoxDetector.h \
-    BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h \
-    BulletCollision/CollisionDispatch/btInternalEdgeUtility.h \
-    BulletCollision/CollisionDispatch/btManifoldResult.h \
-    BulletCollision/Gimpact/gim_memory.h \
-    BulletCollision/Gimpact/gim_clip_polygon.h \
-    BulletCollision/Gimpact/gim_bitset.h \
-    BulletCollision/Gimpact/gim_linear_math.h \
-    BulletCollision/Gimpact/btGeometryOperations.h \
-    BulletCollision/Gimpact/btGImpactCollisionAlgorithm.h \
-    BulletCollision/Gimpact/btGImpactBvh.h \
-    BulletCollision/Gimpact/gim_box_set.h \
-    BulletCollision/Gimpact/gim_array.h \
-    BulletCollision/Gimpact/btGImpactShape.h \
-    BulletCollision/Gimpact/btTriangleShapeEx.h \
-    BulletCollision/Gimpact/btClipPolygon.h \
-    BulletCollision/Gimpact/gim_box_collision.h \
-    BulletCollision/Gimpact/gim_tri_collision.h \
-    BulletCollision/Gimpact/gim_geometry.h \
-    BulletCollision/Gimpact/gim_math.h \
-    BulletCollision/Gimpact/btQuantization.h \
-    BulletCollision/Gimpact/btGImpactQuantizedBvh.h \
-    BulletCollision/Gimpact/gim_geom_types.h \
-    BulletCollision/Gimpact/gim_basic_geometry_operations.h \
-    BulletCollision/Gimpact/gim_contact.h \
-    BulletCollision/Gimpact/gim_hash_table.h \
-    BulletCollision/Gimpact/gim_radixsort.h \
-    BulletCollision/Gimpact/btGImpactMassUtil.h \
-    BulletCollision/Gimpact/btGenericPoolAllocator.h \
-    BulletCollision/Gimpact/btBoxCollision.h \
-    BulletCollision/Gimpact/btContactProcessing.h \
-    LinearMath/btGeometryUtil.h \
-    LinearMath/btConvexHull.h \
-    LinearMath/btList.h \
-    LinearMath/btMatrix3x3.h \
-    LinearMath/btVector3.h \
-    LinearMath/btPoolAllocator.h \
-    LinearMath/btScalar.h \
-    LinearMath/btDefaultMotionState.h \
-    LinearMath/btTransform.h \
-    LinearMath/btQuadWord.h \
-    LinearMath/btAabbUtil2.h \
-    LinearMath/btTransformUtil.h \
-    LinearMath/btRandom.h \
-    LinearMath/btQuaternion.h \
-    LinearMath/btMinMax.h \
-    LinearMath/btMotionState.h \
-    LinearMath/btIDebugDraw.h \
-    LinearMath/btAlignedAllocator.h \
-    LinearMath/btStackAlloc.h \
-    LinearMath/btAlignedObjectArray.h \
-    LinearMath/btHashMap.h \
-    LinearMath/btQuickprof.h\
-    LinearMath/btSerializer.h \
-    Bullet-C-Api.h \
-    btBulletDynamicsCommon.h \
-    btBulletCollisionCommon.h
+	clew/clew.c \
+	clew/clew.h \
+	$(NULL)
diff --git a/src/bullet/MiniCL/MiniCL.cpp b/src/bullet/MiniCL/MiniCL.cpp
deleted file mode 100644
index 24f6751f..00000000
--- a/src/bullet/MiniCL/MiniCL.cpp
+++ /dev/null
@@ -1,784 +0,0 @@
-   Copyright (C) 2010 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "MiniCL/cl.h"
-#define __PHYSICS_COMMON_H__ 1
-#ifdef _WIN32
-#include "BulletMultiThreaded/Win32ThreadSupport.h"
-#include "BulletMultiThreaded/PlatformDefinitions.h"
-#include "BulletMultiThreaded/PosixThreadSupport.h"
-#include "BulletMultiThreaded/SequentialThreadSupport.h"
-#include "MiniCLTaskScheduler.h"
-#include "MiniCLTask/MiniCLTask.h"
-#include "LinearMath/btMinMax.h"
-#include <stdio.h>
-static const char* spPlatformID = "MiniCL, SCEA";
-static const char* spDriverVersion= "1.0";
-CL_API_ENTRY cl_int CL_API_CALL clGetPlatformIDs(
-	cl_uint           num_entries,
-    cl_platform_id *  platforms,
-    cl_uint *         num_platforms ) CL_API_SUFFIX__VERSION_1_0
-	if(platforms != NULL)
-	{
-		if(num_entries <= 0)
-		{
-			return CL_INVALID_VALUE; 
-		}
-		*((const char**)platforms) = spPlatformID;
-	}
-	if(num_platforms != NULL)
-	{
-		*num_platforms = 1;
-	}
-	return CL_SUCCESS;
-CL_API_ENTRY cl_int CL_API_CALL clGetPlatformInfo(
-	cl_platform_id   platform, 
-	cl_platform_info param_name,
-	size_t           param_value_size, 
-	void *           param_value,
-	size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
-	char* pId = (char*)platform;
-	if(strcmp(pId, spPlatformID))
-	{
-	}
-	switch(param_name)
-	{
-		{
-			if(param_value_size < (strlen(spDriverVersion) + 1))
-			{
-				return CL_INVALID_VALUE; 
-			}
-			strcpy((char*)param_value, spDriverVersion);
-			if(param_value_size_ret != NULL)
-			{
-				*param_value_size_ret = strlen(spDriverVersion) + 1;
-			}
-			break;
-		}
-			if(param_value_size < (strlen(spPlatformID) + 1))
-			{
-				return CL_INVALID_VALUE; 
-			}
-			strcpy((char*)param_value, spPlatformID);
-			if(param_value_size_ret != NULL)
-			{
-				*param_value_size_ret = strlen(spPlatformID) + 1;
-			}
-			break;
-		default : 
-			return CL_INVALID_VALUE; 
-	}
-	return CL_SUCCESS;
-CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
-	cl_device_id            device ,
-	cl_device_info          param_name ,
-	size_t                  param_value_size ,
-	void *                  param_value ,
-	size_t *                param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
-	switch (param_name)
-	{
-		{
-			char deviceName[] = "MiniCL CPU";
-			unsigned int nameLen = (unsigned int)strlen(deviceName)+1;
-			btAssert(param_value_size>strlen(deviceName));
-			if (nameLen < param_value_size)
-			{
-				const char* cpuName = "MiniCL CPU";
-				sprintf((char*)param_value,"%s",cpuName);
-			} else
-			{
-				printf("error: param_value_size should be at least %d, but it is %d\n",nameLen,param_value_size);
-				return CL_INVALID_VALUE; 
-			}
-			break;
-		}
-		{
-			if (param_value_size>=sizeof(cl_device_type))
-			{
-				cl_device_type* deviceType = (cl_device_type*)param_value;
-				*deviceType = CL_DEVICE_TYPE_CPU;
-			} else
-			{
-				printf("error: param_value_size should be at least %d\n",sizeof(cl_device_type));
-				return CL_INVALID_VALUE; 
-			}
-			break;
-		}
-		{
-			if (param_value_size>=sizeof(cl_uint))
-			{
-				cl_uint* numUnits = (cl_uint*)param_value;
-				*numUnits= 4;
-			} else
-			{
-				printf("error: param_value_size should be at least %d\n",sizeof(cl_uint));
-				return CL_INVALID_VALUE; 
-			}
-			break;
-		}
-		{
-			size_t workitem_size[3];
-			if (param_value_size>=sizeof(workitem_size))
-			{
-				size_t* workItemSize = (size_t*)param_value;
-				workItemSize[0] = 64;
-				workItemSize[1] = 24;
-				workItemSize[2] = 16;
-			} else
-			{
-				printf("error: param_value_size should be at least %d\n",sizeof(cl_uint));
-				return CL_INVALID_VALUE; 
-			}
-			break;
-		}
-		{
-			 cl_uint* clock_frequency = (cl_uint*)param_value;
-			 *clock_frequency = 3*1024;
-			break;
-		}
-		{
-			if(param_value_size < (strlen(spPlatformID) + 1))
-			{
-				return CL_INVALID_VALUE; 
-			}
-			strcpy((char*)param_value, spPlatformID);
-			if(param_value_size_ret != NULL)
-			{
-				*param_value_size_ret = strlen(spPlatformID) + 1;
-			}
-			break;
-		}
-		{
-			if(param_value_size < (strlen(spDriverVersion) + 1))
-			{
-				return CL_INVALID_VALUE; 
-			}
-			strcpy((char*)param_value, spDriverVersion);
-			if(param_value_size_ret != NULL)
-			{
-				*param_value_size_ret = strlen(spDriverVersion) + 1;
-			}
-			break;
-		}
-		{
-			 cl_uint* maxDimensions = (cl_uint*)param_value;
-			 *maxDimensions = 1;
-			 break;
-		}
-		{
-			 cl_uint* maxWorkGroupSize = (cl_uint*)param_value;
-			 *maxWorkGroupSize = 128;//1;
-			 break;
-		}
-		{
-			 cl_uint* addressBits = (cl_uint*)param_value;
-			 *addressBits= 32; //@todo: should this be 64 for 64bit builds?
-			 break;
-		}
-			{
-				cl_ulong* maxMemAlloc = (cl_ulong*)param_value;
-				*maxMemAlloc= 512*1024*1024; //this "should be enough for everyone" ?
-			 break;
-			}
-			{
-				cl_ulong* maxMemAlloc = (cl_ulong*)param_value;
-				*maxMemAlloc= 1024*1024*1024; //this "should be enough for everyone" ?
-			 break;
-			}
-			{
-			cl_bool* error_correction_support = (cl_bool*)param_value;
-			*error_correction_support = CL_FALSE;
-			break;
-			}
-			{
-			cl_device_local_mem_type* local_mem_type = (cl_device_local_mem_type*)param_value;
-			*local_mem_type = CL_GLOBAL;
-			break;
-			}
-			{
-				cl_ulong* localmem = (cl_ulong*) param_value;
-				*localmem = 32*1024;
-				break;
-			}
-			{
-				cl_ulong* localmem = (cl_ulong*) param_value;
-				*localmem = 64*1024;
-				break;
-			}
-			{
-				cl_command_queue_properties* queueProp = (cl_command_queue_properties*) param_value;
-				memset(queueProp,0,param_value_size);
-				break;
-			}
-			{
-				cl_bool* imageSupport = (cl_bool*) param_value;
-				*imageSupport = CL_FALSE;
-				break;
-			}
-			{
-				cl_uint* imageArgs = (cl_uint*) param_value;
-				*imageArgs = 0;
-				break;
-			}
-			{
-				size_t* maxSize = (size_t*) param_value;
-				*maxSize = 0;
-				break;
-			}
-			{
-				char* extensions = (char*) param_value;
-				*extensions = 0;
-				break;
-			}
-			{
-				cl_uint* width  = (cl_uint*) param_value;
-				*width = 1;
-				break;
-			}
-	default:
-		{
-			printf("error: unsupported param_name:%d\n",param_name);
-		}
-	}
-	return 0;
-CL_API_ENTRY cl_int CL_API_CALL clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0
-	return 0;
-CL_API_ENTRY cl_int CL_API_CALL clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0
-	return 0;
-CL_API_ENTRY cl_int CL_API_CALL clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0
-	return 0;
-CL_API_ENTRY cl_int CL_API_CALL clReleaseKernel(cl_kernel   /* kernel */) CL_API_SUFFIX__VERSION_1_0
-	return 0;
-// Enqueued Commands APIs
-CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBuffer(cl_command_queue     command_queue ,
-                    cl_mem               buffer ,
-                    cl_bool             /* blocking_read */,
-                    size_t               offset ,
-                    size_t               cb , 
-                    void *               ptr ,
-                    cl_uint             /* num_events_in_wait_list */,
-                    const cl_event *    /* event_wait_list */,
-                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0
-	MiniCLTaskScheduler* scheduler = (MiniCLTaskScheduler*) command_queue;
-	///wait for all work items to be completed
-	scheduler->flush();
-	memcpy(ptr,(char*)buffer + offset,cb);
-	return 0;
-CL_API_ENTRY cl_int clGetProgramBuildInfo(cl_program            /* program */,
-                      cl_device_id          /* device */,
-                      cl_program_build_info /* param_name */,
-                      size_t                /* param_value_size */,
-                      void *                /* param_value */,
-                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0
-	return 0;
-// Program Object APIs
-CL_API_ENTRY cl_program
-clCreateProgramWithSource(cl_context         context ,
-                          cl_uint           /* count */,
-                          const char **     /* strings */,
-                          const size_t *    /* lengths */,
-                          cl_int *          errcode_ret ) CL_API_SUFFIX__VERSION_1_0
-	*errcode_ret = CL_SUCCESS;
-	return (cl_program)context;
-CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBuffer(cl_command_queue     command_queue ,
-                    cl_mem               buffer ,
-                    cl_bool             /* blocking_read */,
-                    size_t              offset,
-                    size_t               cb , 
-                    const void *         ptr ,
-                    cl_uint             /* num_events_in_wait_list */,
-                    const cl_event *    /* event_wait_list */,
-                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0
-	MiniCLTaskScheduler* scheduler = (MiniCLTaskScheduler*) command_queue;
-	///wait for all work items to be completed
-	scheduler->flush();
-	memcpy((char*)buffer + offset, ptr,cb);
-	return 0;
-CL_API_ENTRY cl_int CL_API_CALL clFlush(cl_command_queue  command_queue)
-	MiniCLTaskScheduler* scheduler = (MiniCLTaskScheduler*) command_queue;
-	///wait for all work items to be completed
-	scheduler->flush();
-	return 0;
-CL_API_ENTRY cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
-                       cl_kernel         clKernel ,
-                       cl_uint           work_dim ,
-                       const size_t *   /* global_work_offset */,
-                       const size_t *    global_work_size ,
-                       const size_t *   /* local_work_size */,
-                       cl_uint          /* num_events_in_wait_list */,
-                       const cl_event * /* event_wait_list */,
-                       cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0
-	MiniCLKernel* kernel = (MiniCLKernel*) clKernel;
-	for (unsigned int ii=0;ii<work_dim;ii++)
-	{
-		int maxTask = kernel->m_scheduler->getMaxNumOutstandingTasks();
-		int numWorkItems = global_work_size[ii];
-//		//at minimum 64 work items per task
-//		int numWorkItemsPerTask = btMax(64,numWorkItems / maxTask);
-		int numWorkItemsPerTask = numWorkItems / maxTask;
-		if (!numWorkItemsPerTask) numWorkItemsPerTask = 1;
-		for (int t=0;t<numWorkItems;)
-		{
-			//Performance Hint: tweak this number during benchmarking
-			int endIndex = (t+numWorkItemsPerTask) < numWorkItems ? t+numWorkItemsPerTask : numWorkItems;
-			kernel->m_scheduler->issueTask(t, endIndex, kernel);
-			t = endIndex;
-		}
-	}
-	void* bla = 0;
-	scheduler->issueTask(bla,2,3);
-	scheduler->flush();
-	*/
-	return 0;
-#define LOCAL_BUF_SIZE 32768
-static int sLocalMemBuf[LOCAL_BUF_SIZE * 4 + 16];
-static int* spLocalBufCurr = NULL;
-static int sLocalBufUsed = LOCAL_BUF_SIZE; // so it will be reset at the first call
-static void* localBufMalloc(int size)
-	int size16 = (size + 15) >> 4; // in 16-byte units
-	if((sLocalBufUsed + size16) > LOCAL_BUF_SIZE)
-	{ // reset
-		spLocalBufCurr = sLocalMemBuf;
-		while((unsigned long)spLocalBufCurr & 0x0F) spLocalBufCurr++; // align to 16 bytes
-		sLocalBufUsed = 0;
-	}
-	void* ret = spLocalBufCurr;
-	spLocalBufCurr += size16 * 4;
-	sLocalBufUsed += size;
-	return ret;
-CL_API_ENTRY cl_int CL_API_CALL clSetKernelArg(cl_kernel    clKernel ,
-               cl_uint      arg_index ,
-               size_t       arg_size ,
-               const void *  arg_value ) CL_API_SUFFIX__VERSION_1_0
-	MiniCLKernel* kernel = (MiniCLKernel* ) clKernel;
-	btAssert(arg_size <= MINICL_MAX_ARGLENGTH);
-	if (arg_index>MINI_CL_MAX_ARG)
-	{
-		printf("error: clSetKernelArg arg_index (%u) exceeds %u\n",arg_index,MINI_CL_MAX_ARG);
-	} else
-	{
-		if (arg_size>MINICL_MAX_ARGLENGTH)
-		//if (arg_size != MINICL_MAX_ARGLENGTH)
-		{
-			printf("error: clSetKernelArg argdata too large: %zu (maximum is %zu)\n",arg_size,MINICL_MAX_ARGLENGTH);
-		} 
-		else
-		{
-			if(arg_value == NULL)
-			{	// this is only for __local memory qualifier
-				void* ptr = localBufMalloc(arg_size);
-				kernel->m_argData[arg_index] = ptr;
-			}
-			else
-			{
-				memcpy(&(kernel->m_argData[arg_index]), arg_value, arg_size);
-			}
-			kernel->m_argSizes[arg_index] = arg_size;
-			if(arg_index >= kernel->m_numArgs)
-			{
-				kernel->m_numArgs = arg_index + 1;
-				kernel->updateLauncher();
-			}
-		}
-	}
-	return 0;
-// Kernel Object APIs
-CL_API_ENTRY cl_kernel CL_API_CALL clCreateKernel(cl_program       program ,
-               const char *     kernel_name ,
-               cl_int *         errcode_ret ) CL_API_SUFFIX__VERSION_1_0
-	MiniCLTaskScheduler* scheduler = (MiniCLTaskScheduler*) program;
-	MiniCLKernel* kernel = new MiniCLKernel();
-	int nameLen = strlen(kernel_name);
-	if(nameLen >= MINI_CL_MAX_KERNEL_NAME)
-	{
-		*errcode_ret = CL_INVALID_KERNEL_NAME;
-		return NULL;
-	}
-	strcpy(kernel->m_name, kernel_name);
-	kernel->m_numArgs = 0;
-	//kernel->m_kernelProgramCommandId = scheduler->findProgramCommandIdByName(kernel_name);
-	//if (kernel->m_kernelProgramCommandId>=0)
-	//{
-	//	*errcode_ret = CL_SUCCESS;
-	//} else
-	//{
-	//	*errcode_ret = CL_INVALID_KERNEL_NAME;
-	//}
-	kernel->m_scheduler = scheduler;
-	if(kernel->registerSelf() == NULL)
-	{
-		*errcode_ret = CL_INVALID_KERNEL_NAME;
-		return NULL;
-	}
-	else
-	{
-		*errcode_ret = CL_SUCCESS;
-	}
-	return (cl_kernel)kernel;
-CL_API_ENTRY cl_int CL_API_CALL clBuildProgram(cl_program           /* program */,
-               cl_uint              /* num_devices */,
-               const cl_device_id * /* device_list */,
-               const char *         /* options */, 
-               void (*pfn_notify)(cl_program /* program */, void * /* user_data */),
-               void *               /* user_data */) CL_API_SUFFIX__VERSION_1_0
-	return CL_SUCCESS;
-CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithBinary(cl_context                     context ,
-                          cl_uint                        /* num_devices */,
-                          const cl_device_id *           /* device_list */,
-                          const size_t *                 /* lengths */,
-                          const unsigned char **         /* binaries */,
-                          cl_int *                       /* binary_status */,
-                          cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0
-	return (cl_program)context;
-// Memory Object APIs
-CL_API_ENTRY cl_mem CL_API_CALL clCreateBuffer(cl_context   /* context */,
-               cl_mem_flags flags ,
-               size_t       size,
-               void *       host_ptr ,
-               cl_int *     errcode_ret ) CL_API_SUFFIX__VERSION_1_0
-	cl_mem buf = (cl_mem)malloc(size);
-	if ((flags&CL_MEM_COPY_HOST_PTR) && host_ptr)
-	{
-		memcpy(buf,host_ptr,size);
-	}
-	*errcode_ret = 0;
-	return buf;
-// Command Queue APIs
-CL_API_ENTRY cl_command_queue CL_API_CALL clCreateCommandQueue(cl_context                      context , 
-                     cl_device_id                   /* device */, 
-                     cl_command_queue_properties    /* properties */,
-                     cl_int *                        errcode_ret ) CL_API_SUFFIX__VERSION_1_0
-	*errcode_ret = 0;
-	return (cl_command_queue) context;
-extern CL_API_ENTRY cl_int CL_API_CALL clGetContextInfo(cl_context         /* context */, 
-                 cl_context_info    param_name , 
-                 size_t             param_value_size , 
-                 void *             param_value, 
-                 size_t *           param_value_size_ret ) CL_API_SUFFIX__VERSION_1_0
-	switch (param_name)
-	{
-		{
-			if (!param_value_size)
-			{
-				*param_value_size_ret = 13;
-			} else
-			{
-				const char* testName = "MiniCL_Test.";
-				sprintf((char*)param_value,"%s",testName);
-			}
-			break;
-		};
-	default:
-		{
-			printf("unsupported\n");
-		}
-	}
-	return 0;
-CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromType(const cl_context_properties * /* properties */,
-                        cl_device_type           device_type ,
-                        void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
-                        void *                  /* user_data */,
-                        cl_int *                 errcode_ret ) CL_API_SUFFIX__VERSION_1_0
-	int maxNumOutstandingTasks = 4;
-//	int maxNumOutstandingTasks = 2;
-//	int maxNumOutstandingTasks = 1;
-	gMiniCLNumOutstandingTasks = maxNumOutstandingTasks;
-	const int maxNumOfThreadSupports = 8;
-	static int sUniqueThreadSupportIndex = 0;
-	static const char* sUniqueThreadSupportName[maxNumOfThreadSupports] = 
-	{
-		"MiniCL_0", "MiniCL_1", "MiniCL_2", "MiniCL_3", "MiniCL_4", "MiniCL_5", "MiniCL_6", "MiniCL_7" 
-	};
-	btThreadSupportInterface* threadSupport = 0;
-	if (device_type==CL_DEVICE_TYPE_DEBUG)
-	{
-		SequentialThreadSupport::SequentialThreadConstructionInfo stc("MiniCL",processMiniCLTask,createMiniCLLocalStoreMemory);
-		threadSupport = new SequentialThreadSupport(stc);
-	} else
-	{
-#if _WIN32
-	btAssert(sUniqueThreadSupportIndex < maxNumOfThreadSupports);
-	const char* bla = "MiniCL";
-	threadSupport = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo(
-//								bla,
-								sUniqueThreadSupportName[sUniqueThreadSupportIndex++],
-								processMiniCLTask, //processCollisionTask,
-								createMiniCLLocalStoreMemory,//createCollisionLocalStoreMemory,
-								maxNumOutstandingTasks));
-		PosixThreadSupport::ThreadConstructionInfo constructionInfo("PosixThreads",
-																	processMiniCLTask,
-																	createMiniCLLocalStoreMemory,
-																	maxNumOutstandingTasks);
-		threadSupport = new PosixThreadSupport(constructionInfo);
-	///todo: add posix thread support for other platforms
-	SequentialThreadSupport::SequentialThreadConstructionInfo stc("MiniCL",processMiniCLTask,createMiniCLLocalStoreMemory);
-	threadSupport = new SequentialThreadSupport(stc);
-#endif //USE_PTHREADS
-	}
-	MiniCLTaskScheduler* scheduler = new MiniCLTaskScheduler(threadSupport,maxNumOutstandingTasks);
-	*errcode_ret = 0;
-	return (cl_context)scheduler;
-clGetDeviceIDs(cl_platform_id   /* platform */,
-               cl_device_type   /* device_type */, 
-               cl_uint          /* num_entries */, 
-               cl_device_id *   /* devices */, 
-               cl_uint *        /* num_devices */) CL_API_SUFFIX__VERSION_1_0
-	return 0;
-clCreateContext(const cl_context_properties *  properties ,
-                cl_uint                        num_devices ,
-                const cl_device_id *           devices ,
-                 void (*pfn_notify)(const char *, const void *, size_t, void *),
-                void *                         user_data ,
-                cl_int *                       errcode_ret ) CL_API_SUFFIX__VERSION_1_0
-	return	clCreateContextFromType(properties,CL_DEVICE_TYPE_ALL,pfn_notify,user_data,errcode_ret);
-CL_API_ENTRY cl_int CL_API_CALL clReleaseContext(cl_context  context ) CL_API_SUFFIX__VERSION_1_0
-	MiniCLTaskScheduler* scheduler = (MiniCLTaskScheduler*) context;
-	btThreadSupportInterface* threadSupport = scheduler->getThreadSupportInterface();
-	delete scheduler;
-	delete threadSupport;
-	return 0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clFinish(cl_command_queue command_queue ) CL_API_SUFFIX__VERSION_1_0
-	MiniCLTaskScheduler* scheduler = (MiniCLTaskScheduler*) command_queue;
-	///wait for all work items to be completed
-	scheduler->flush();
-	return CL_SUCCESS;
-extern CL_API_ENTRY cl_int CL_API_CALL 
-clGetProgramInfo(cl_program         /* program */,
-                 cl_program_info    /* param_name */,
-                 size_t             /* param_value_size */,
-                 void *             /* param_value */,
-                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0
-   return 0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetKernelWorkGroupInfo(cl_kernel                   kernel ,
-                         cl_device_id               /* device */,
-                         cl_kernel_work_group_info  wgi/* param_name */,
-                         size_t   sz                  /* param_value_size */,
-                         void *     ptr                /* param_value */,
-                         size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0
-	 &&(sz == sizeof(size_t))
-	 &&(ptr != NULL))
-	{
-		MiniCLKernel* miniCLKernel = (MiniCLKernel*)kernel;
-		MiniCLTaskScheduler* scheduler = miniCLKernel->m_scheduler;
-		*((size_t*)ptr) = scheduler->getMaxNumOutstandingTasks();
-		return CL_SUCCESS;
-	}
-	else
-	{
-	}
diff --git a/src/bullet/MiniCL/MiniCLTask/MiniCLTask.cpp b/src/bullet/MiniCL/MiniCLTask/MiniCLTask.cpp
deleted file mode 100644
index a56e96a0..00000000
--- a/src/bullet/MiniCL/MiniCLTask/MiniCLTask.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include "MiniCLTask.h"
-#include "BulletMultiThreaded/PlatformDefinitions.h"
-#include "BulletMultiThreaded/SpuFakeDma.h"
-#include "LinearMath/btMinMax.h"
-#include "MiniCLTask.h"
-#include "MiniCL/MiniCLTaskScheduler.h"
-#ifdef __SPU__
-#include <spu_printf.h>
-#include <stdio.h>
-#define spu_printf printf
-int gMiniCLNumOutstandingTasks = 0;
-struct MiniCLTask_LocalStoreMemory
-void processMiniCLTask(void* userPtr, void* lsMemory)
-	//	BT_PROFILE("processSampleTask");
-	MiniCLTask_LocalStoreMemory* localMemory = (MiniCLTask_LocalStoreMemory*)lsMemory;
-	MiniCLTaskDesc* taskDescPtr = (MiniCLTaskDesc*)userPtr;
-	MiniCLTaskDesc& taskDesc = *taskDescPtr;
-	for (unsigned int i=taskDesc.m_firstWorkUnit;i<taskDesc.m_lastWorkUnit;i++)
-	{
-		taskDesc.m_kernel->m_launcher(&taskDesc, i);
-	}
-//	printf("Compute Unit[%d] executed kernel %d work items [%d..%d)\n",taskDesc.m_taskId,taskDesc.m_kernelProgramId,taskDesc.m_firstWorkUnit,taskDesc.m_lastWorkUnit);
-#if defined(__CELLOS_LV2__) || defined (LIBSPE2)
-ATTRIBUTE_ALIGNED16(MiniCLTask_LocalStoreMemory	gLocalStoreMemory);
-void* createMiniCLLocalStoreMemory()
-	return &gLocalStoreMemory;
-void* createMiniCLLocalStoreMemory()
-	return new MiniCLTask_LocalStoreMemory;
diff --git a/src/bullet/MiniCL/MiniCLTask/MiniCLTask.h b/src/bullet/MiniCL/MiniCLTask/MiniCLTask.h
deleted file mode 100644
index 7e78be08..00000000
--- a/src/bullet/MiniCL/MiniCLTask/MiniCLTask.h
+++ /dev/null
@@ -1,62 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#ifndef MINICL__TASK_H
-#define MINICL__TASK_H
-#include "BulletMultiThreaded/PlatformDefinitions.h"
-#include "LinearMath/btScalar.h"
-#include "LinearMath/btAlignedAllocator.h"
-#define MINICL_MAX_ARGLENGTH (sizeof(void*))
-#define MINI_CL_MAX_ARG 16
-struct MiniCLKernel;
-ATTRIBUTE_ALIGNED16(struct) MiniCLTaskDesc
-	MiniCLTaskDesc()
-	{
-		for (int i=0;i<MINI_CL_MAX_ARG;i++)
-		{
-			m_argSizes[i]=0;
-		}
-	}
-	uint32_t		m_taskId;
-	uint32_t		m_firstWorkUnit;
-	uint32_t		m_lastWorkUnit;
-	MiniCLKernel*	m_kernel;
-	void*			m_argData[MINI_CL_MAX_ARG];
-	int				m_argSizes[MINI_CL_MAX_ARG];
-extern "C" int gMiniCLNumOutstandingTasks;
-void	processMiniCLTask(void* userPtr, void* lsMemory);
-void*	createMiniCLLocalStoreMemory();
-#endif //MINICL__TASK_H
diff --git a/src/bullet/MiniCL/MiniCLTaskScheduler.cpp b/src/bullet/MiniCL/MiniCLTaskScheduler.cpp
deleted file mode 100644
index 18cf6457..00000000
--- a/src/bullet/MiniCL/MiniCLTaskScheduler.cpp
+++ /dev/null
@@ -1,519 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-//#define __CELLOS_LV2__ 1
-#define __BT_SKIP_UINT64_H 1
-#include "MiniCLTaskScheduler.h"
-#include <stdio.h>
-#ifdef __SPU__
-void	SampleThreadFunc(void* userPtr,void* lsMemory)
-	//do nothing
-	printf("hello world\n");
-void*	SamplelsMemoryFunc()
-	//don't create local store memory, just return 0
-	return 0;
-#include "BulletMultiThreaded/btThreadSupportInterface.h"
-//#	include "SPUAssert.h"
-#include <string.h>
-#include "MiniCL/cl_platform.h"
-extern "C" {
-	extern char SPU_SAMPLE_ELF_SYMBOL[];
-MiniCLTaskScheduler::MiniCLTaskScheduler(btThreadSupportInterface*	threadInterface,  int maxNumOutstandingTasks)
-	m_taskBusy.resize(m_maxNumOutstandingTasks);
-	m_spuSampleTaskDesc.resize(m_maxNumOutstandingTasks);
-	m_kernels.resize(0);
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		m_taskBusy[i] = false;
-	}
-	m_numBusyTasks = 0;
-	m_currentTask = 0;
-	m_initialized = false;
-	m_threadInterface->startSPU();
-	m_threadInterface->stopSPU();
-void	MiniCLTaskScheduler::initialize()
-	printf("MiniCLTaskScheduler::initialize()\n");
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		m_taskBusy[i] = false;
-	}
-	m_numBusyTasks = 0;
-	m_currentTask = 0;
-	m_initialized = true;
-void MiniCLTaskScheduler::issueTask(int firstWorkUnit, int lastWorkUnit, MiniCLKernel* kernel)
-	printf("MiniCLTaskScheduler::issueTask (m_currentTask= %d\)n", m_currentTask);
-	m_taskBusy[m_currentTask] = true;
-	m_numBusyTasks++;
-	MiniCLTaskDesc& taskDesc = m_spuSampleTaskDesc[m_currentTask];
-	{
-		// send task description in event message
-		taskDesc.m_firstWorkUnit = firstWorkUnit;
-		taskDesc.m_lastWorkUnit = lastWorkUnit;
-		taskDesc.m_kernel = kernel;
-		//some bookkeeping to recognize finished tasks
-		taskDesc.m_taskId = m_currentTask;
-//		for (int i=0;i<MINI_CL_MAX_ARG;i++)
-		for (unsigned int i=0; i < kernel->m_numArgs; i++)
-		{
-			taskDesc.m_argSizes[i] = kernel->m_argSizes[i];
-			if (taskDesc.m_argSizes[i])
-			{
-				taskDesc.m_argData[i] = kernel->m_argData[i];
-//				memcpy(&taskDesc.m_argData[i],&argData[MINICL_MAX_ARGLENGTH*i],taskDesc.m_argSizes[i]);
-			}
-		}
-	}
-	m_threadInterface->sendRequest(1, (ppu_address_t) &taskDesc, m_currentTask);
-	// if all tasks busy, wait for spu event to clear the task.
-	if (m_numBusyTasks >= m_maxNumOutstandingTasks)
-	{
-		unsigned int taskId;
-		unsigned int outputSize;
-		for (int i=0;i<m_maxNumOutstandingTasks;i++)
-	  {
-		  if (m_taskBusy[i])
-		  {
-			  taskId = i;
-			  break;
-		  }
-	  }
-		m_threadInterface->waitForResponse(&taskId, &outputSize);
-		//printf("PPU: after issue, received event: %u %d\n", taskId, outputSize);
-		postProcess(taskId, outputSize);
-		m_taskBusy[taskId] = false;
-		m_numBusyTasks--;
-	}
-	// find new task buffer
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		if (!m_taskBusy[i])
-		{
-			m_currentTask = i;
-			break;
-		}
-	}
-///Optional PPU-size post processing for each task
-void MiniCLTaskScheduler::postProcess(int taskId, int outputSize)
-void MiniCLTaskScheduler::flush()
-	printf("\nSpuCollisionTaskProcess::flush()\n");
-	// all tasks are issued, wait for all tasks to be complete
-	while(m_numBusyTasks > 0)
-	{
-// Consolidating SPU code
-	  unsigned int taskId;
-	  unsigned int outputSize;
-	  for (int i=0;i<m_maxNumOutstandingTasks;i++)
-	  {
-		  if (m_taskBusy[i])
-		  {
-			  taskId = i;
-			  break;
-		  }
-	  }
-	  {
-		  m_threadInterface->waitForResponse(&taskId, &outputSize);
-	  }
-		//printf("PPU: flushing, received event: %u %d\n", taskId, outputSize);
-		postProcess(taskId, outputSize);
-		m_taskBusy[taskId] = false;
-		m_numBusyTasks--;
-	}
-typedef void (*MiniCLKernelLauncher0)(int);
-typedef void (*MiniCLKernelLauncher1)(void*, int);
-typedef void (*MiniCLKernelLauncher2)(void*, void*, int);
-typedef void (*MiniCLKernelLauncher3)(void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher4)(void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher5)(void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher6)(void*, void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher7)(void*, void*, void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher8)(void*, void*, void*, void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher9)(void*, void*, void*, void*, void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher10)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher11)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher12)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher13)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher14)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher15)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, int);
-typedef void (*MiniCLKernelLauncher16)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, int);
-static void kernelLauncher0(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher0)(taskDesc->m_kernel->m_launcher))(guid);
-static void kernelLauncher1(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher1)(taskDesc->m_kernel->m_pCode))(	taskDesc->m_argData[0], 
-												guid);
-static void kernelLauncher2(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher2)(taskDesc->m_kernel->m_pCode))(	taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												guid);
-static void kernelLauncher3(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher3)(taskDesc->m_kernel->m_pCode))(	taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												guid);
-static void kernelLauncher4(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher4)(taskDesc->m_kernel->m_pCode))(	taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												guid);
-static void kernelLauncher5(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher5)(taskDesc->m_kernel->m_pCode))(	taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												guid);
-static void kernelLauncher6(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher6)(taskDesc->m_kernel->m_pCode))(	taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												guid);
-static void kernelLauncher7(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher7)(taskDesc->m_kernel->m_pCode))(	taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												taskDesc->m_argData[6], 
-												guid);
-static void kernelLauncher8(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher8)(taskDesc->m_kernel->m_pCode))(	taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												taskDesc->m_argData[6], 
-												taskDesc->m_argData[7], 
-												guid);
-static void kernelLauncher9(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher9)(taskDesc->m_kernel->m_pCode))(	taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												taskDesc->m_argData[6], 
-												taskDesc->m_argData[7], 
-												taskDesc->m_argData[8], 
-												guid);
-static void kernelLauncher10(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher10)(taskDesc->m_kernel->m_pCode))(taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												taskDesc->m_argData[6], 
-												taskDesc->m_argData[7], 
-												taskDesc->m_argData[8], 
-												taskDesc->m_argData[9], 
-												guid);
-static void kernelLauncher11(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher11)(taskDesc->m_kernel->m_pCode))(taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												taskDesc->m_argData[6], 
-												taskDesc->m_argData[7], 
-												taskDesc->m_argData[8], 
-												taskDesc->m_argData[9], 
-												taskDesc->m_argData[10], 
-												guid);
-static void kernelLauncher12(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher12)(taskDesc->m_kernel->m_pCode))(taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												taskDesc->m_argData[6], 
-												taskDesc->m_argData[7], 
-												taskDesc->m_argData[8], 
-												taskDesc->m_argData[9], 
-												taskDesc->m_argData[10], 
-												taskDesc->m_argData[11], 
-												guid);
-static void kernelLauncher13(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher13)(taskDesc->m_kernel->m_pCode))(taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												taskDesc->m_argData[6], 
-												taskDesc->m_argData[7], 
-												taskDesc->m_argData[8], 
-												taskDesc->m_argData[9], 
-												taskDesc->m_argData[10], 
-												taskDesc->m_argData[11], 
-												taskDesc->m_argData[12], 
-												guid);
-static void kernelLauncher14(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher14)(taskDesc->m_kernel->m_pCode))(taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												taskDesc->m_argData[6], 
-												taskDesc->m_argData[7], 
-												taskDesc->m_argData[8], 
-												taskDesc->m_argData[9], 
-												taskDesc->m_argData[10], 
-												taskDesc->m_argData[11], 
-												taskDesc->m_argData[12], 
-												taskDesc->m_argData[13], 
-												guid);
-static void kernelLauncher15(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher15)(taskDesc->m_kernel->m_pCode))(taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												taskDesc->m_argData[6], 
-												taskDesc->m_argData[7], 
-												taskDesc->m_argData[8], 
-												taskDesc->m_argData[9], 
-												taskDesc->m_argData[10], 
-												taskDesc->m_argData[11], 
-												taskDesc->m_argData[12], 
-												taskDesc->m_argData[13], 
-												taskDesc->m_argData[14], 
-												guid);
-static void kernelLauncher16(MiniCLTaskDesc* taskDesc, int guid)
-	((MiniCLKernelLauncher16)(taskDesc->m_kernel->m_pCode))(taskDesc->m_argData[0], 
-												taskDesc->m_argData[1], 
-												taskDesc->m_argData[2], 
-												taskDesc->m_argData[3], 
-												taskDesc->m_argData[4], 
-												taskDesc->m_argData[5], 
-												taskDesc->m_argData[6], 
-												taskDesc->m_argData[7], 
-												taskDesc->m_argData[8], 
-												taskDesc->m_argData[9], 
-												taskDesc->m_argData[10], 
-												taskDesc->m_argData[11], 
-												taskDesc->m_argData[12], 
-												taskDesc->m_argData[13], 
-												taskDesc->m_argData[14], 
-												taskDesc->m_argData[15], 
-												guid);
-static kernelLauncherCB spLauncherList[MINI_CL_MAX_ARG+1] = 
-	kernelLauncher0,
-	kernelLauncher1,
-	kernelLauncher2,
-	kernelLauncher3,
-	kernelLauncher4,
-	kernelLauncher5,
-	kernelLauncher6,
-	kernelLauncher7,
-	kernelLauncher8,
-	kernelLauncher9,
-	kernelLauncher10,
-	kernelLauncher11,
-	kernelLauncher12,
-	kernelLauncher13,
-	kernelLauncher14,
-	kernelLauncher15,
-	kernelLauncher16
-void MiniCLKernel::updateLauncher()
-	m_launcher = spLauncherList[m_numArgs];
-struct MiniCLKernelDescEntry
-	void* pCode;
-	const char* pName;
-static MiniCLKernelDescEntry spKernelDesc[256];
-static int sNumKernelDesc = 0;
-MiniCLKernelDesc::MiniCLKernelDesc(void* pCode, const char* pName)
-	for(int i = 0; i < sNumKernelDesc; i++)
-	{
-		if(!strcmp(pName, spKernelDesc[i].pName))
-		{	// already registered
-			btAssert(spKernelDesc[i].pCode == pCode);
-			return; 
-		}
-	}
-	spKernelDesc[sNumKernelDesc].pCode = pCode;
-	spKernelDesc[sNumKernelDesc].pName = pName;
-	sNumKernelDesc++;
-MiniCLKernel* MiniCLKernel::registerSelf()
-	m_scheduler->registerKernel(this);
-	for(int i = 0; i < sNumKernelDesc; i++)
-	{
-		if(!strcmp(m_name, spKernelDesc[i].pName))
-		{
-			m_pCode = spKernelDesc[i].pCode;
-			return this;
-		}
-	}
-	return NULL;
diff --git a/src/bullet/MiniCL/MiniCLTaskScheduler.h b/src/bullet/MiniCL/MiniCLTaskScheduler.h
deleted file mode 100644
index 3061a713..00000000
--- a/src/bullet/MiniCL/MiniCLTaskScheduler.h
+++ /dev/null
@@ -1,194 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <assert.h>
-#include "BulletMultiThreaded/PlatformDefinitions.h"
-#include <stdlib.h>
-#include "LinearMath/btAlignedObjectArray.h"
-#include "MiniCLTask/MiniCLTask.h"
-//just add your commands here, try to keep them globally unique for debugging purposes
-struct MiniCLKernel;
-/// MiniCLTaskScheduler handles SPU processing of collision pairs.
-/// When PPU issues a task, it will look for completed task buffers
-/// PPU will do postprocessing, dependent on workunit output (not likely)
-class MiniCLTaskScheduler
-	// track task buffers that are being used, and total busy tasks
-	btAlignedObjectArray<bool>	m_taskBusy;
-	btAlignedObjectArray<MiniCLTaskDesc>	m_spuSampleTaskDesc;
-	btAlignedObjectArray<const MiniCLKernel*>	m_kernels;
-	int   m_numBusyTasks;
-	// the current task and the current entry to insert a new work unit
-	int   m_currentTask;
-	bool m_initialized;
-	void postProcess(int taskId, int outputSize);
-	class	btThreadSupportInterface*	m_threadInterface;
-	int	m_maxNumOutstandingTasks;
-	MiniCLTaskScheduler(btThreadSupportInterface*	threadInterface, int maxNumOutstandingTasks);
-	~MiniCLTaskScheduler();
-	///call initialize in the beginning of the frame, before addCollisionPairToTask
-	void initialize();
-	void issueTask(int firstWorkUnit, int lastWorkUnit, MiniCLKernel* kernel);
-	///call flush to submit potential outstanding work to SPUs and wait for all involved SPUs to be finished
-	void flush();
-	class	btThreadSupportInterface*	getThreadSupportInterface()
-	{
-		return m_threadInterface;
-	}
-	int	findProgramCommandIdByName(const char* programName) const;
-	int getMaxNumOutstandingTasks() const
-	{
-		return m_maxNumOutstandingTasks;
-	}
-	void registerKernel(MiniCLKernel* kernel)
-	{
-		m_kernels.push_back(kernel);
-	}
-typedef void (*kernelLauncherCB)(MiniCLTaskDesc* taskDesc, int guid);
-struct	MiniCLKernel
-	MiniCLTaskScheduler* m_scheduler;
-//	int	m_kernelProgramCommandId;
-	char	m_name[MINI_CL_MAX_KERNEL_NAME];
-	unsigned int	m_numArgs;
-	kernelLauncherCB	m_launcher;
-	void* m_pCode;
-	void updateLauncher();
-	MiniCLKernel* registerSelf();
-	void*	m_argData[MINI_CL_MAX_ARG];
-	int				m_argSizes[MINI_CL_MAX_ARG];
-#if defined(USE_LIBSPE2) && defined(__SPU__)
-#include "../SpuLibspe2Support.h"
-#include <spu_intrinsics.h>
-#include <spu_mfcio.h>
-#include <SpuFakeDma.h>
-void * SamplelsMemoryFunc();
-void SampleThreadFunc(void* userPtr,void* lsMemory);
-int main(unsigned long long speid, addr64 argp, addr64 envp)
-	printf("SPU is up \n");
-	ATTRIBUTE_ALIGNED128(btSpuStatus status);
-	ATTRIBUTE_ALIGNED16( SpuSampleTaskDesc taskDesc ) ;
-	unsigned int received_message = Spu_Mailbox_Event_Nothing;
-        bool shutdown = false;
-	cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-	cellDmaWaitTagStatusAll(DMA_MASK(3));
-	status.m_status = Spu_Status_Free;
-	status.m_lsMemory.p = SamplelsMemoryFunc();
-	cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-	cellDmaWaitTagStatusAll(DMA_MASK(3));
-	while (!shutdown)
-	{
-		received_message = spu_read_in_mbox();
-		switch(received_message)
-		{
-		case Spu_Mailbox_Event_Shutdown:
-			shutdown = true;
-			break; 
-		case Spu_Mailbox_Event_Task:
-			// refresh the status
-			printf("SPU recieved Task \n");
-			cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(3));
-			btAssert(status.m_status==Spu_Status_Occupied);
-			cellDmaGet(&taskDesc, status.m_taskDesc.p, sizeof(SpuSampleTaskDesc), DMA_TAG(3), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(3));
-			SampleThreadFunc((void*)&taskDesc, reinterpret_cast<void*> (taskDesc.m_mainMemoryPtr) );
-			break;
-		case Spu_Mailbox_Event_Nothing:
-		default:
-			break;
-		}
-		// set to status free and wait for next task
-		status.m_status = Spu_Status_Free;
-		cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-		cellDmaWaitTagStatusAll(DMA_MASK(3));		
-  	}
-  	return 0;
diff --git a/src/bullet/MiniCL/cl.h b/src/bullet/MiniCL/cl.h
deleted file mode 100644
index 35282988..00000000
--- a/src/bullet/MiniCL/cl.h
+++ /dev/null
@@ -1,867 +0,0 @@
- * Copyright (c) 2008-2009 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- ******************************************************************************/
-#ifndef __OPENCL_CL_H
-#define __OPENCL_CL_H
-#ifdef __APPLE__
-#include <MiniCL/cl_platform.h>
-#include <MiniCL/cl_platform.h>
-#ifdef __cplusplus
-extern "C" {
-typedef struct _cl_platform_id *    cl_platform_id;
-typedef struct _cl_device_id *      cl_device_id;
-typedef struct _cl_context *        cl_context;
-typedef struct _cl_command_queue *  cl_command_queue;
-typedef struct _cl_mem *            cl_mem;
-typedef struct _cl_program *        cl_program;
-typedef struct _cl_kernel *         cl_kernel;
-typedef struct _cl_event *          cl_event;
-typedef struct _cl_sampler *        cl_sampler;
-typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ 
-typedef cl_ulong            cl_bitfield;
-typedef cl_bitfield         cl_device_type;
-typedef cl_uint             cl_platform_info;
-typedef cl_uint             cl_device_info;
-typedef cl_bitfield         cl_device_address_info;
-typedef cl_bitfield         cl_device_fp_config;
-typedef cl_uint             cl_device_mem_cache_type;
-typedef cl_uint             cl_device_local_mem_type;
-typedef cl_bitfield         cl_device_exec_capabilities;
-typedef cl_bitfield         cl_command_queue_properties;
-typedef intptr_t			cl_context_properties;
-typedef cl_uint             cl_context_info;
-typedef cl_uint             cl_command_queue_info;
-typedef cl_uint             cl_channel_order;
-typedef cl_uint             cl_channel_type;
-typedef cl_bitfield         cl_mem_flags;
-typedef cl_uint             cl_mem_object_type;
-typedef cl_uint             cl_mem_info;
-typedef cl_uint             cl_image_info;
-typedef cl_uint             cl_addressing_mode;
-typedef cl_uint             cl_filter_mode;
-typedef cl_uint             cl_sampler_info;
-typedef cl_bitfield         cl_map_flags;
-typedef cl_uint             cl_program_info;
-typedef cl_uint             cl_program_build_info;
-typedef cl_int              cl_build_status;
-typedef cl_uint             cl_kernel_info;
-typedef cl_uint             cl_kernel_work_group_info;
-typedef cl_uint             cl_event_info;
-typedef cl_uint             cl_command_type;
-typedef cl_uint             cl_profiling_info;
-typedef struct _cl_image_format {
-    cl_channel_order        image_channel_order;
-    cl_channel_type         image_channel_data_type;
-} cl_image_format;
-// Error Codes
-#define CL_SUCCESS                                  0
-#define CL_DEVICE_NOT_FOUND                         -1
-#define CL_DEVICE_NOT_AVAILABLE                     -2
-#define CL_OUT_OF_RESOURCES                         -5
-#define CL_OUT_OF_HOST_MEMORY                       -6
-#define CL_PROFILING_INFO_NOT_AVAILABLE             -7
-#define CL_MEM_COPY_OVERLAP                         -8
-#define CL_IMAGE_FORMAT_MISMATCH                    -9
-#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
-#define CL_BUILD_PROGRAM_FAILURE                    -11
-#define CL_MAP_FAILURE                              -12
-#define CL_INVALID_VALUE                            -30
-#define CL_INVALID_DEVICE_TYPE                      -31
-#define CL_INVALID_PLATFORM                         -32
-#define CL_INVALID_DEVICE                           -33
-#define CL_INVALID_CONTEXT                          -34
-#define CL_INVALID_QUEUE_PROPERTIES                 -35
-#define CL_INVALID_COMMAND_QUEUE                    -36
-#define CL_INVALID_HOST_PTR                         -37
-#define CL_INVALID_MEM_OBJECT                       -38
-#define CL_INVALID_IMAGE_SIZE                       -40
-#define CL_INVALID_SAMPLER                          -41
-#define CL_INVALID_BINARY                           -42
-#define CL_INVALID_BUILD_OPTIONS                    -43
-#define CL_INVALID_PROGRAM                          -44
-#define CL_INVALID_PROGRAM_EXECUTABLE               -45
-#define CL_INVALID_KERNEL_NAME                      -46
-#define CL_INVALID_KERNEL_DEFINITION                -47
-#define CL_INVALID_KERNEL                           -48
-#define CL_INVALID_ARG_INDEX                        -49
-#define CL_INVALID_ARG_VALUE                        -50
-#define CL_INVALID_ARG_SIZE                         -51
-#define CL_INVALID_KERNEL_ARGS                      -52
-#define CL_INVALID_WORK_DIMENSION                   -53
-#define CL_INVALID_WORK_GROUP_SIZE                  -54
-#define CL_INVALID_WORK_ITEM_SIZE                   -55
-#define CL_INVALID_GLOBAL_OFFSET                    -56
-#define CL_INVALID_EVENT_WAIT_LIST                  -57
-#define CL_INVALID_EVENT                            -58
-#define CL_INVALID_OPERATION                        -59
-#define CL_INVALID_GL_OBJECT                        -60
-#define CL_INVALID_BUFFER_SIZE                      -61
-#define CL_INVALID_MIP_LEVEL                        -62
-// OpenCL Version
-#define CL_VERSION_1_0                              1
-// cl_bool
-#define CL_FALSE                                    0
-#define CL_TRUE                                     1
-// cl_platform_info
-#define CL_PLATFORM_PROFILE                         0x0900
-#define CL_PLATFORM_VERSION                         0x0901
-#define CL_PLATFORM_NAME                            0x0902
-#define CL_PLATFORM_VENDOR                          0x0903
-#define CL_PLATFORM_EXTENSIONS                      0x0904
-// cl_device_type - bitfield
-#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
-#define CL_DEVICE_TYPE_CPU                          (1 << 1)
-#define CL_DEVICE_TYPE_GPU                          (1 << 2)
-#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
-#define CL_DEVICE_TYPE_DEBUG						(1 << 4)
-#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
-// cl_device_info
-#define CL_DEVICE_TYPE                              0x1000
-#define CL_DEVICE_VENDOR_ID                         0x1001
-#define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
-#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
-#define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
-#define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
-#define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
-#define CL_DEVICE_ADDRESS_BITS                      0x100D
-#define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
-#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
-#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
-#define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
-#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
-#define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
-#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
-#define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
-#define CL_DEVICE_IMAGE_SUPPORT                     0x1016
-#define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
-#define CL_DEVICE_MAX_SAMPLERS                      0x1018
-#define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
-#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
-#define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
-#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
-#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
-#define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
-#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
-#define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
-#define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
-#define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
-#define CL_DEVICE_ENDIAN_LITTLE                     0x1026
-#define CL_DEVICE_AVAILABLE                         0x1027
-#define CL_DEVICE_COMPILER_AVAILABLE                0x1028
-#define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
-#define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
-#define CL_DEVICE_NAME                              0x102B
-#define CL_DEVICE_VENDOR                            0x102C
-#define CL_DRIVER_VERSION                           0x102D
-#define CL_DEVICE_PROFILE                           0x102E
-#define CL_DEVICE_VERSION                           0x102F
-#define CL_DEVICE_EXTENSIONS                        0x1030
-#define CL_DEVICE_PLATFORM                          0x1031
-// cl_device_address_info - bitfield
-#define CL_DEVICE_ADDRESS_32_BITS                   (1 << 0)
-#define CL_DEVICE_ADDRESS_64_BITS                   (1 << 1)
-// cl_device_fp_config - bitfield
-#define CL_FP_DENORM                                (1 << 0)
-#define CL_FP_INF_NAN                               (1 << 1)
-#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
-#define CL_FP_ROUND_TO_ZERO                         (1 << 3)
-#define CL_FP_ROUND_TO_INF                          (1 << 4)
-#define CL_FP_FMA                                   (1 << 5)
-// cl_device_mem_cache_type
-#define CL_NONE                                     0x0
-#define CL_READ_ONLY_CACHE                          0x1
-#define CL_READ_WRITE_CACHE                         0x2
-// cl_device_local_mem_type
-#define CL_LOCAL                                    0x1
-#define CL_GLOBAL                                   0x2
-// cl_device_exec_capabilities - bitfield
-#define CL_EXEC_KERNEL                              (1 << 0)
-#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
-// cl_command_queue_properties - bitfield
-#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
-// cl_context_info
-#define CL_CONTEXT_REFERENCE_COUNT                  0x1080
-#define CL_CONTEXT_NUM_DEVICES                      0x1081
-#define CL_CONTEXT_DEVICES                          0x1082
-#define CL_CONTEXT_PROPERTIES                       0x1083
-#define CL_CONTEXT_PLATFORM                         0x1084
-// cl_command_queue_info
-#define CL_QUEUE_CONTEXT                            0x1090
-#define CL_QUEUE_DEVICE                             0x1091
-#define CL_QUEUE_REFERENCE_COUNT                    0x1092
-#define CL_QUEUE_PROPERTIES                         0x1093
-// cl_mem_flags - bitfield
-#define CL_MEM_READ_WRITE                           (1 << 0)
-#define CL_MEM_WRITE_ONLY                           (1 << 1)
-#define CL_MEM_READ_ONLY                            (1 << 2)
-#define CL_MEM_USE_HOST_PTR                         (1 << 3)
-#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
-#define CL_MEM_COPY_HOST_PTR                        (1 << 5)
-// cl_channel_order
-#define CL_R                                        0x10B0
-#define CL_A                                        0x10B1
-#define CL_RG                                       0x10B2
-#define CL_RA                                       0x10B3
-#define CL_RGB                                      0x10B4
-#define CL_RGBA                                     0x10B5
-#define CL_BGRA                                     0x10B6
-#define CL_ARGB                                     0x10B7
-#define CL_INTENSITY                                0x10B8
-#define CL_LUMINANCE                                0x10B9
-// cl_channel_type
-#define CL_SNORM_INT8                               0x10D0
-#define CL_SNORM_INT16                              0x10D1
-#define CL_UNORM_INT8                               0x10D2
-#define CL_UNORM_INT16                              0x10D3
-#define CL_UNORM_SHORT_565                          0x10D4
-#define CL_UNORM_SHORT_555                          0x10D5
-#define CL_UNORM_INT_101010                         0x10D6
-#define CL_SIGNED_INT8                              0x10D7
-#define CL_SIGNED_INT16                             0x10D8
-#define CL_SIGNED_INT32                             0x10D9
-#define CL_UNSIGNED_INT8                            0x10DA
-#define CL_UNSIGNED_INT16                           0x10DB
-#define CL_UNSIGNED_INT32                           0x10DC
-#define CL_HALF_FLOAT                               0x10DD
-#define CL_FLOAT                                    0x10DE
-// cl_mem_object_type
-#define CL_MEM_OBJECT_BUFFER                        0x10F0
-#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
-#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
-// cl_mem_info
-#define CL_MEM_TYPE                                 0x1100
-#define CL_MEM_FLAGS                                0x1101
-#define CL_MEM_SIZE                                 0x1102
-#define CL_MEM_HOST_PTR                             0x1103
-#define CL_MEM_MAP_COUNT                            0x1104
-#define CL_MEM_REFERENCE_COUNT                      0x1105
-#define CL_MEM_CONTEXT                              0x1106
-// cl_image_info
-#define CL_IMAGE_FORMAT                             0x1110
-#define CL_IMAGE_ELEMENT_SIZE                       0x1111
-#define CL_IMAGE_ROW_PITCH                          0x1112
-#define CL_IMAGE_SLICE_PITCH                        0x1113
-#define CL_IMAGE_WIDTH                              0x1114
-#define CL_IMAGE_HEIGHT                             0x1115
-#define CL_IMAGE_DEPTH                              0x1116
-// cl_addressing_mode
-#define CL_ADDRESS_NONE                             0x1130
-#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
-#define CL_ADDRESS_CLAMP                            0x1132
-#define CL_ADDRESS_REPEAT                           0x1133
-// cl_filter_mode
-#define CL_FILTER_NEAREST                           0x1140
-#define CL_FILTER_LINEAR                            0x1141
-// cl_sampler_info
-#define CL_SAMPLER_REFERENCE_COUNT                  0x1150
-#define CL_SAMPLER_CONTEXT                          0x1151
-#define CL_SAMPLER_NORMALIZED_COORDS                0x1152
-#define CL_SAMPLER_ADDRESSING_MODE                  0x1153
-#define CL_SAMPLER_FILTER_MODE                      0x1154
-// cl_map_flags - bitfield
-#define CL_MAP_READ                                 (1 << 0)
-#define CL_MAP_WRITE                                (1 << 1)
-// cl_program_info
-#define CL_PROGRAM_REFERENCE_COUNT                  0x1160
-#define CL_PROGRAM_CONTEXT                          0x1161
-#define CL_PROGRAM_NUM_DEVICES                      0x1162
-#define CL_PROGRAM_DEVICES                          0x1163
-#define CL_PROGRAM_SOURCE                           0x1164
-#define CL_PROGRAM_BINARY_SIZES                     0x1165
-#define CL_PROGRAM_BINARIES                         0x1166
-// cl_program_build_info
-#define CL_PROGRAM_BUILD_STATUS                     0x1181
-#define CL_PROGRAM_BUILD_OPTIONS                    0x1182
-#define CL_PROGRAM_BUILD_LOG                        0x1183
-// cl_build_status
-#define CL_BUILD_SUCCESS                            0
-#define CL_BUILD_NONE                               -1
-#define CL_BUILD_ERROR                              -2
-#define CL_BUILD_IN_PROGRESS                        -3
-// cl_kernel_info
-#define CL_KERNEL_FUNCTION_NAME                     0x1190
-#define CL_KERNEL_NUM_ARGS                          0x1191
-#define CL_KERNEL_REFERENCE_COUNT                   0x1192
-#define CL_KERNEL_CONTEXT                           0x1193
-#define CL_KERNEL_PROGRAM                           0x1194
-// cl_kernel_work_group_info
-#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
-#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
-#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
-// cl_event_info
-#define CL_EVENT_COMMAND_QUEUE                      0x11D0
-#define CL_EVENT_COMMAND_TYPE                       0x11D1
-#define CL_EVENT_REFERENCE_COUNT                    0x11D2
-// cl_command_type
-#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
-#define CL_COMMAND_TASK                             0x11F1
-#define CL_COMMAND_NATIVE_KERNEL                    0x11F2
-#define CL_COMMAND_READ_BUFFER                      0x11F3
-#define CL_COMMAND_WRITE_BUFFER                     0x11F4
-#define CL_COMMAND_COPY_BUFFER                      0x11F5
-#define CL_COMMAND_READ_IMAGE                       0x11F6
-#define CL_COMMAND_WRITE_IMAGE                      0x11F7
-#define CL_COMMAND_COPY_IMAGE                       0x11F8
-#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
-#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
-#define CL_COMMAND_MAP_BUFFER                       0x11FB
-#define CL_COMMAND_MAP_IMAGE                        0x11FC
-#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
-#define CL_COMMAND_MARKER                           0x11FE
-#define CL_COMMAND_WAIT_FOR_EVENTS                  0x11FF
-#define CL_COMMAND_BARRIER                          0x1200
-#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x1201
-#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1202
-// command execution status
-#define CL_COMPLETE                                 0x0
-#define CL_RUNNING                                  0x1
-#define CL_SUBMITTED                                0x2
-#define CL_QUEUED                                   0x3
-// cl_profiling_info
-#define CL_PROFILING_COMMAND_QUEUED                 0x1280
-#define CL_PROFILING_COMMAND_SUBMIT                 0x1281
-#define CL_PROFILING_COMMAND_START                  0x1282
-#define CL_PROFILING_COMMAND_END                    0x1283
-// Platform API
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetPlatformIDs(cl_uint          /* num_entries */,
-                 cl_platform_id * /* platforms */,
-                 cl_uint *        /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL 
-clGetPlatformInfo(cl_platform_id   /* platform */, 
-                  cl_platform_info /* param_name */,
-                  size_t           /* param_value_size */, 
-                  void *           /* param_value */,
-                  size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-// Device APIs
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetDeviceIDs(cl_platform_id   /* platform */,
-               cl_device_type   /* device_type */, 
-               cl_uint          /* num_entries */, 
-               cl_device_id *   /* devices */, 
-               cl_uint *        /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetDeviceInfo(cl_device_id    /* device */,
-                cl_device_info  /* param_name */, 
-                size_t          /* param_value_size */, 
-                void *          /* param_value */,
-                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-// Context APIs  
-extern CL_API_ENTRY cl_context CL_API_CALL
-clCreateContext(const cl_context_properties * /* properties */,
-                cl_uint                 /* num_devices */,
-                const cl_device_id *    /* devices */,
-                void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
-                void *                  /* user_data */,
-                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_context CL_API_CALL
-clCreateContextFromType(const cl_context_properties * /* properties */,
-                        cl_device_type          /* device_type */,
-                        void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
-                        void *                  /* user_data */,
-                        cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetContextInfo(cl_context         /* context */, 
-                 cl_context_info    /* param_name */, 
-                 size_t             /* param_value_size */, 
-                 void *             /* param_value */, 
-                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-// Command Queue APIs
-extern CL_API_ENTRY cl_command_queue CL_API_CALL
-clCreateCommandQueue(cl_context                     /* context */, 
-                     cl_device_id                   /* device */, 
-                     cl_command_queue_properties    /* properties */,
-                     cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetCommandQueueInfo(cl_command_queue      /* command_queue */,
-                      cl_command_queue_info /* param_name */,
-                      size_t                /* param_value_size */,
-                      void *                /* param_value */,
-                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clSetCommandQueueProperty(cl_command_queue              /* command_queue */,
-                          cl_command_queue_properties   /* properties */, 
-                          cl_bool                        /* enable */,
-                          cl_command_queue_properties * /* old_properties */) CL_API_SUFFIX__VERSION_1_0;
-// Memory Object APIs
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateBuffer(cl_context   /* context */,
-               cl_mem_flags /* flags */,
-               size_t       /* size */,
-               void *       /* host_ptr */,
-               cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateImage2D(cl_context              /* context */,
-                cl_mem_flags            /* flags */,
-                const cl_image_format * /* image_format */,
-                size_t                  /* image_width */,
-                size_t                  /* image_height */,
-                size_t                  /* image_row_pitch */, 
-                void *                  /* host_ptr */,
-                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateImage3D(cl_context              /* context */,
-                cl_mem_flags            /* flags */,
-                const cl_image_format * /* image_format */,
-                size_t                  /* image_width */, 
-                size_t                  /* image_height */,
-                size_t                  /* image_depth */, 
-                size_t                  /* image_row_pitch */, 
-                size_t                  /* image_slice_pitch */, 
-                void *                  /* host_ptr */,
-                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetSupportedImageFormats(cl_context           /* context */,
-                           cl_mem_flags         /* flags */,
-                           cl_mem_object_type   /* image_type */,
-                           cl_uint              /* num_entries */,
-                           cl_image_format *    /* image_formats */,
-                           cl_uint *            /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetMemObjectInfo(cl_mem           /* memobj */,
-                   cl_mem_info      /* param_name */, 
-                   size_t           /* param_value_size */,
-                   void *           /* param_value */,
-                   size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetImageInfo(cl_mem           /* image */,
-               cl_image_info    /* param_name */, 
-               size_t           /* param_value_size */,
-               void *           /* param_value */,
-               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-// Sampler APIs
-extern CL_API_ENTRY cl_sampler CL_API_CALL
-clCreateSampler(cl_context          /* context */,
-                cl_bool             /* normalized_coords */, 
-                cl_addressing_mode  /* addressing_mode */, 
-                cl_filter_mode      /* filter_mode */,
-                cl_int *            /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetSamplerInfo(cl_sampler         /* sampler */,
-                 cl_sampler_info    /* param_name */,
-                 size_t             /* param_value_size */,
-                 void *             /* param_value */,
-                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-// Program Object APIs
-extern CL_API_ENTRY cl_program CL_API_CALL
-clCreateProgramWithSource(cl_context        /* context */,
-                          cl_uint           /* count */,
-                          const char **     /* strings */,
-                          const size_t *    /* lengths */,
-                          cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_program CL_API_CALL
-clCreateProgramWithBinary(cl_context                     /* context */,
-                          cl_uint                        /* num_devices */,
-                          const cl_device_id *           /* device_list */,
-                          const size_t *                 /* lengths */,
-                          const unsigned char **         /* binaries */,
-                          cl_int *                       /* binary_status */,
-                          cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clBuildProgram(cl_program           /* program */,
-               cl_uint              /* num_devices */,
-               const cl_device_id * /* device_list */,
-               const char *         /* options */, 
-               void (*pfn_notify)(cl_program /* program */, void * /* user_data */),
-               void *               /* user_data */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetProgramInfo(cl_program         /* program */,
-                 cl_program_info    /* param_name */,
-                 size_t             /* param_value_size */,
-                 void *             /* param_value */,
-                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetProgramBuildInfo(cl_program            /* program */,
-                      cl_device_id          /* device */,
-                      cl_program_build_info /* param_name */,
-                      size_t                /* param_value_size */,
-                      void *                /* param_value */,
-                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-// Kernel Object APIs
-extern CL_API_ENTRY cl_kernel CL_API_CALL
-clCreateKernel(cl_program      /* program */,
-               const char *    /* kernel_name */,
-               cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clCreateKernelsInProgram(cl_program     /* program */,
-                         cl_uint        /* num_kernels */,
-                         cl_kernel *    /* kernels */,
-                         cl_uint *      /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainKernel(cl_kernel    /* kernel */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseKernel(cl_kernel   /* kernel */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clSetKernelArg(cl_kernel    /* kernel */,
-               cl_uint      /* arg_index */,
-               size_t       /* arg_size */,
-               const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetKernelInfo(cl_kernel       /* kernel */,
-                cl_kernel_info  /* param_name */,
-                size_t          /* param_value_size */,
-                void *          /* param_value */,
-                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetKernelWorkGroupInfo(cl_kernel                  /* kernel */,
-                         cl_device_id               /* device */,
-                         cl_kernel_work_group_info  /* param_name */,
-                         size_t                     /* param_value_size */,
-                         void *                     /* param_value */,
-                         size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-// Event Object APIs
-extern CL_API_ENTRY cl_int CL_API_CALL
-clWaitForEvents(cl_uint             /* num_events */,
-                const cl_event *    /* event_list */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetEventInfo(cl_event         /* event */,
-               cl_event_info    /* param_name */,
-               size_t           /* param_value_size */,
-               void *           /* param_value */,
-               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
-// Profiling APIs
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetEventProfilingInfo(cl_event            /* event */,
-                        cl_profiling_info   /* param_name */,
-                        size_t              /* param_value_size */,
-                        void *              /* param_value */,
-                        size_t *            /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-// Flush and Finish APIs
-extern CL_API_ENTRY cl_int CL_API_CALL
-clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-// Enqueued Commands APIs
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueReadBuffer(cl_command_queue    /* command_queue */,
-                    cl_mem              /* buffer */,
-                    cl_bool             /* blocking_read */,
-                    size_t              /* offset */,
-                    size_t              /* cb */, 
-                    void *              /* ptr */,
-                    cl_uint             /* num_events_in_wait_list */,
-                    const cl_event *    /* event_wait_list */,
-                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueWriteBuffer(cl_command_queue   /* command_queue */, 
-                     cl_mem             /* buffer */, 
-                     cl_bool            /* blocking_write */, 
-                     size_t             /* offset */, 
-                     size_t             /* cb */, 
-                     const void *       /* ptr */, 
-                     cl_uint            /* num_events_in_wait_list */, 
-                     const cl_event *   /* event_wait_list */, 
-                     cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyBuffer(cl_command_queue    /* command_queue */, 
-                    cl_mem              /* src_buffer */,
-                    cl_mem              /* dst_buffer */, 
-                    size_t              /* src_offset */,
-                    size_t              /* dst_offset */,
-                    size_t              /* cb */, 
-                    cl_uint             /* num_events_in_wait_list */,
-                    const cl_event *    /* event_wait_list */,
-                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueReadImage(cl_command_queue     /* command_queue */,
-                   cl_mem               /* image */,
-                   cl_bool              /* blocking_read */, 
-                   const size_t *       /* origin[3] */,
-                   const size_t *       /* region[3] */,
-                   size_t               /* row_pitch */,
-                   size_t               /* slice_pitch */, 
-                   void *               /* ptr */,
-                   cl_uint              /* num_events_in_wait_list */,
-                   const cl_event *     /* event_wait_list */,
-                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueWriteImage(cl_command_queue    /* command_queue */,
-                    cl_mem              /* image */,
-                    cl_bool             /* blocking_write */, 
-                    const size_t *      /* origin[3] */,
-                    const size_t *      /* region[3] */,
-                    size_t              /* input_row_pitch */,
-                    size_t              /* input_slice_pitch */, 
-                    const void *        /* ptr */,
-                    cl_uint             /* num_events_in_wait_list */,
-                    const cl_event *    /* event_wait_list */,
-                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyImage(cl_command_queue     /* command_queue */,
-                   cl_mem               /* src_image */,
-                   cl_mem               /* dst_image */, 
-                   const size_t *       /* src_origin[3] */,
-                   const size_t *       /* dst_origin[3] */,
-                   const size_t *       /* region[3] */, 
-                   cl_uint              /* num_events_in_wait_list */,
-                   const cl_event *     /* event_wait_list */,
-                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
-                           cl_mem           /* src_image */,
-                           cl_mem           /* dst_buffer */, 
-                           const size_t *   /* src_origin[3] */,
-                           const size_t *   /* region[3] */, 
-                           size_t           /* dst_offset */,
-                           cl_uint          /* num_events_in_wait_list */,
-                           const cl_event * /* event_wait_list */,
-                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
-                           cl_mem           /* src_buffer */,
-                           cl_mem           /* dst_image */, 
-                           size_t           /* src_offset */,
-                           const size_t *   /* dst_origin[3] */,
-                           const size_t *   /* region[3] */, 
-                           cl_uint          /* num_events_in_wait_list */,
-                           const cl_event * /* event_wait_list */,
-                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY void * CL_API_CALL
-clEnqueueMapBuffer(cl_command_queue /* command_queue */,
-                   cl_mem           /* buffer */,
-                   cl_bool          /* blocking_map */, 
-                   cl_map_flags     /* map_flags */,
-                   size_t           /* offset */,
-                   size_t           /* cb */,
-                   cl_uint          /* num_events_in_wait_list */,
-                   const cl_event * /* event_wait_list */,
-                   cl_event *       /* event */,
-                   cl_int *         /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY void * CL_API_CALL
-clEnqueueMapImage(cl_command_queue  /* command_queue */,
-                  cl_mem            /* image */, 
-                  cl_bool           /* blocking_map */, 
-                  cl_map_flags      /* map_flags */, 
-                  const size_t *    /* origin[3] */,
-                  const size_t *    /* region[3] */,
-                  size_t *          /* image_row_pitch */,
-                  size_t *          /* image_slice_pitch */,
-                  cl_uint           /* num_events_in_wait_list */,
-                  const cl_event *  /* event_wait_list */,
-                  cl_event *        /* event */,
-                  cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
-                        cl_mem           /* memobj */,
-                        void *           /* mapped_ptr */,
-                        cl_uint          /* num_events_in_wait_list */,
-                        const cl_event *  /* event_wait_list */,
-                        cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
-                       cl_kernel        /* kernel */,
-                       cl_uint          /* work_dim */,
-                       const size_t *   /* global_work_offset */,
-                       const size_t *   /* global_work_size */,
-                       const size_t *   /* local_work_size */,
-                       cl_uint          /* num_events_in_wait_list */,
-                       const cl_event * /* event_wait_list */,
-                       cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueTask(cl_command_queue  /* command_queue */,
-              cl_kernel         /* kernel */,
-              cl_uint           /* num_events_in_wait_list */,
-              const cl_event *  /* event_wait_list */,
-              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
-					  void (*user_func)(void *), 
-                      void *            /* args */,
-                      size_t            /* cb_args */, 
-                      cl_uint           /* num_mem_objects */,
-                      const cl_mem *    /* mem_list */,
-                      const void **     /* args_mem_loc */,
-                      cl_uint           /* num_events_in_wait_list */,
-                      const cl_event *  /* event_wait_list */,
-                      cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMarker(cl_command_queue    /* command_queue */,
-                cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
-                       cl_uint          /* num_events */,
-                       const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-#ifdef __cplusplus
-#endif  // __OPENCL_CL_H
diff --git a/src/bullet/MiniCL/cl_MiniCL_Defs.h b/src/bullet/MiniCL/cl_MiniCL_Defs.h
deleted file mode 100644
index 73fd3c7d..00000000
--- a/src/bullet/MiniCL/cl_MiniCL_Defs.h
+++ /dev/null
@@ -1,439 +0,0 @@
-Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <float.h>
-#include <math.h>
-#include "LinearMath/btScalar.h"
-#include "MiniCL/cl.h"
-#define __kernel
-#define __global
-#define __local
-#define get_global_id(a)	__guid_arg
-#define get_local_id(a)		((__guid_arg) % gMiniCLNumOutstandingTasks)
-#define get_local_size(a)	(gMiniCLNumOutstandingTasks)
-#define get_group_id(a)		((__guid_arg) / gMiniCLNumOutstandingTasks)
-static unsigned int as_uint(float val) { return *((unsigned int*)&val); }
-#define CLK_LOCAL_MEM_FENCE		0x01
-#define CLK_GLOBAL_MEM_FENCE	0x02
-static void barrier(unsigned int a)
-	// TODO : implement
-//ATTRIBUTE_ALIGNED16(struct) float8
-struct float8
-	float s0;
-	float s1;
-	float s2;
-	float s3;
-	float s4;
-	float s5;
-	float s6;
-	float s7;
-	float8(float scalar)
-	{
-		s0=s1=s2=s3=s4=s5=s6=s7=scalar;
-	}
-float select( float arg0, float arg1, bool select)
-	if (select)
-		return arg0;
-	return arg1;
-#define __constant
-struct float3
-	float x,y,z;
-	float3& operator+=(const float3& other)
-	{
-		x += other.x;
-		y += other.y;
-		z += other.z;
-		return *this;
-	}
-	float3& operator-=(const float3& other)
-	{
-		x -= other.x;
-		y -= other.y;
-		z -= other.z;
-		return *this;
-	}
-static float dot(const float3&a ,const float3& b)
-	float3 tmp;
-	tmp.x = a.x*b.x;
-	tmp.y = a.y*b.y;
-	tmp.z = a.z*b.z;
-	return tmp.x+tmp.y+tmp.z;
-static float3 operator-(const float3& a,const float3& b)
-	float3 tmp;
-	tmp.x = a.x - b.x;
-	tmp.y = a.y - b.y;
-	tmp.z = a.z - b.z;
-	return tmp;
-static float3 operator*(const float& scalar,const float3& b)
-	float3 tmp;
-	tmp.x = scalar * b.x;
-	tmp.y = scalar * b.y;
-	tmp.z = scalar * b.z;
-	return tmp;
-static float3 operator*(const float3& a,const float& scalar)
-	float3 tmp;
-	tmp.x = a.x * scalar;
-	tmp.y = a.y * scalar;
-	tmp.z = a.z * scalar;
-	return tmp;
-static float3 operator*(const float3& a,const float3& b)
-	float3 tmp;
-	tmp.x = a.x * b.x;
-	tmp.y = a.y * b.y;
-	tmp.z = a.z * b.z;
-	return tmp;
-//ATTRIBUTE_ALIGNED16(struct) float4
-struct float4
-	union
-	{
-		struct {
-			float x;
-			float y;
-			float z;
-		};
-		float3 xyz;
-	};
-	float w;
-	float4() {}
-	float4(float v0, float v1, float v2, float v3)
-	{
-		x=v0;
-		y=v1;
-		z=v2;
-		w=v3;
-	}
-	float4(float3 xyz, float scalarW) 
-	{
-		x = xyz.x;
-		y = xyz.y;
-		z = xyz.z;
-		w = scalarW;
-	}
-	float4(float v) 
-	{
-		x = y = z = w = v; 
-	}
-	float4 operator*(const float4& other)
-	{
-		float4 tmp;
-		tmp.x = x*other.x;
-		tmp.y = y*other.y;
-		tmp.z = z*other.z;
-		tmp.w = w*other.w;
-		return tmp;
-	}
-	float4 operator*(const float& other)
-	{
-		float4 tmp;
-		tmp.x = x*other;
-		tmp.y = y*other;
-		tmp.z = z*other;
-		tmp.w = w*other;
-		return tmp;
-	}
-	float4& operator+=(const float4& other)
-	{
-		x += other.x;
-		y += other.y;
-		z += other.z;
-		w += other.w;
-		return *this;
-	}
-	float4& operator-=(const float4& other)
-	{
-		x -= other.x;
-		y -= other.y;
-		z -= other.z;
-		w -= other.w;
-		return *this;
-	}
-	float4& operator *=(float scalar)
-	{
-		x *= scalar;
-		y *= scalar;
-		z *= scalar;
-		w *= scalar;
-		return (*this);
-	}
-static float4 fabs(const float4& a)
-	float4 tmp;
-	tmp.x = a.x < 0.f ? 0.f  : a.x;
-	tmp.y = a.y < 0.f ? 0.f  : a.y;
-	tmp.z = a.z < 0.f ? 0.f  : a.z;
-	tmp.w = a.w < 0.f ? 0.f  : a.w;
-	return tmp;
-static float4 operator+(const float4& a,const float4& b)
-	float4 tmp;
-	tmp.x = a.x + b.x;
-	tmp.y = a.y + b.y;
-	tmp.z = a.z + b.z;
-	tmp.w = a.w + b.w;
-	return tmp;
-static float8 operator+(const float8& a,const float8& b)
-	float8 tmp(0);
-	tmp.s0  = a.s0 + b.s0;
-	tmp.s1  = a.s1 + b.s1;
-	tmp.s2  = a.s2 + b.s2;
-	tmp.s3  = a.s3 + b.s3;
-	tmp.s4  = a.s4 + b.s4;
-	tmp.s5  = a.s5 + b.s5;
-	tmp.s6  = a.s6 + b.s6;
-	tmp.s7  = a.s7 + b.s7;
-	return tmp;
-static float4 operator-(const float4& a,const float4& b)
-	float4 tmp;
-	tmp.x = a.x - b.x;
-	tmp.y = a.y - b.y;
-	tmp.z = a.z - b.z;
-	tmp.w = a.w - b.w;
-	return tmp;
-static float8 operator-(const float8& a,const float8& b)
-	float8 tmp(0);
-	tmp.s0  = a.s0 - b.s0;
-	tmp.s1  = a.s1 - b.s1;
-	tmp.s2  = a.s2 - b.s2;
-	tmp.s3  = a.s3 - b.s3;
-	tmp.s4  = a.s4 - b.s4;
-	tmp.s5  = a.s5 - b.s5;
-	tmp.s6  = a.s6 - b.s6;
-	tmp.s7  = a.s7 - b.s7;
-	return tmp;
-static float4 operator*(float a,const float4& b)
-	float4 tmp;
-	tmp.x = a * b.x;
-	tmp.y = a * b.y;
-	tmp.z = a * b.z;
-	tmp.w = a * b.w;
-	return tmp;
-static float4 operator/(const float4& b,float a)
-	float4 tmp;
-	tmp.x = b.x/a;
-	tmp.y = b.y/a;
-	tmp.z = b.z/a;
-	tmp.w = b.w/a;
-	return tmp;
-static float dot(const float4&a ,const float4& b)
-	float4 tmp;
-	tmp.x = a.x*b.x;
-	tmp.y = a.y*b.y;
-	tmp.z = a.z*b.z;
-	tmp.w = a.w*b.w;
-	return tmp.x+tmp.y+tmp.z+tmp.w;
-static float length(const float4&a)
-	float l = sqrtf(a.x*a.x+a.y*a.y+a.z*a.z);
-	return l;
-static float4 normalize(const float4&a)
-	float4 tmp;
-	float l = length(a);
-	tmp = 1.f/l*a;
-	return tmp;
-static float4 cross(const float4&a ,const float4& b)
-	float4 tmp;
-	tmp.x =  a.y*b.z - a.z*b.y;
-	tmp.y = -a.x*b.z + a.z*b.x;
-	tmp.z =  a.x*b.y - a.y*b.x;
-	tmp.w = 0.f;
-	return tmp;
-static float max(float a, float b) 
-	return (a >= b) ? a : b;
-static float min(float a, float b) 
-	return (a <= b) ? a : b;
-static float fmax(float a, float b) 
-	return (a >= b) ? a : b;
-static float fmin(float a, float b) 
-	return (a <= b) ? a : b;
-struct int2
-	int x,y;
-struct uint2
-	unsigned int x,y;
-//typedef int2 uint2;
-typedef unsigned int uint;
-struct int4
-	int x,y,z,w;
-struct uint4
-	unsigned int x,y,z,w;
-	uint4() {}
-	uint4(uint val) { x = y = z = w = val; }
-	uint4& operator+=(const uint4& other)
-	{
-		x += other.x;
-		y += other.y;
-		z += other.z;
-		w += other.w;
-		return *this;
-	}
-static uint4 operator+(const uint4& a,const uint4& b)
-	uint4 tmp;
-	tmp.x = a.x + b.x;
-	tmp.y = a.y + b.y;
-	tmp.z = a.z + b.z;
-	tmp.w = a.w + b.w;
-	return tmp;
-static uint4 operator-(const uint4& a,const uint4& b)
-	uint4 tmp;
-	tmp.x = a.x - b.x;
-	tmp.y = a.y - b.y;
-	tmp.z = a.z - b.z;
-	tmp.w = a.w - b.w;
-	return tmp;
-#define native_sqrt sqrtf
-#define native_sin sinf
-#define native_cos cosf
-#define native_powr powf
-#define GUID_ARG ,int __guid_arg
-#define GUID_ARG_VAL ,__guid_arg
-#define as_int(a) (*((int*)&(a)))
-extern "C" int gMiniCLNumOutstandingTasks;
-//	extern "C" void __kernel_func();
diff --git a/src/bullet/MiniCL/cl_gl.h b/src/bullet/MiniCL/cl_gl.h
deleted file mode 100644
index 0a69d6ec..00000000
--- a/src/bullet/MiniCL/cl_gl.h
+++ /dev/null
@@ -1,113 +0,0 @@
- * Copyright (c) 2008-2009 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- **********************************************************************************/
-#ifndef __OPENCL_CL_GL_H
-#define __OPENCL_CL_GL_H
-#ifdef __APPLE__
-#include <OpenCL/cl_platform.h>
-#include <MiniCL/cl_platform.h>
-#ifdef __cplusplus
-extern "C" {
-// NOTE:  Make sure that appropriate GL header file is included separately
-typedef cl_uint     cl_gl_object_type;
-typedef cl_uint     cl_gl_texture_info;
-typedef cl_uint     cl_gl_platform_info;
-// cl_gl_object_type
-#define CL_GL_OBJECT_BUFFER             0x2000
-#define CL_GL_OBJECT_TEXTURE2D          0x2001
-#define CL_GL_OBJECT_TEXTURE3D          0x2002
-#define CL_GL_OBJECT_RENDERBUFFER       0x2003
-// cl_gl_texture_info
-#define CL_GL_TEXTURE_TARGET            0x2004
-#define CL_GL_MIPMAP_LEVEL              0x2005
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateFromGLBuffer(cl_context     /* context */,
-                     cl_mem_flags   /* flags */,
-                     GLuint         /* bufobj */,
-                     int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateFromGLTexture2D(cl_context      /* context */,
-                        cl_mem_flags    /* flags */,
-                        GLenum          /* target */,
-                        GLint           /* miplevel */,
-                        GLuint          /* texture */,
-                        cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateFromGLTexture3D(cl_context      /* context */,
-                        cl_mem_flags    /* flags */,
-                        GLenum          /* target */,
-                        GLint           /* miplevel */,
-                        GLuint          /* texture */,
-                        cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateFromGLRenderbuffer(cl_context   /* context */,
-                           cl_mem_flags /* flags */,
-                           GLuint       /* renderbuffer */,
-                           cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetGLObjectInfo(cl_mem                /* memobj */,
-                  cl_gl_object_type *   /* gl_object_type */,
-                  GLuint *              /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetGLTextureInfo(cl_mem               /* memobj */,
-                   cl_gl_texture_info   /* param_name */,
-                   size_t               /* param_value_size */,
-                   void *               /* param_value */,
-                   size_t *             /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueAcquireGLObjects(cl_command_queue      /* command_queue */,
-                          cl_uint               /* num_objects */,
-                          const cl_mem *        /* mem_objects */,
-                          cl_uint               /* num_events_in_wait_list */,
-                          const cl_event *      /* event_wait_list */,
-                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
-                          cl_uint               /* num_objects */,
-                          const cl_mem *        /* mem_objects */,
-                          cl_uint               /* num_events_in_wait_list */,
-                          const cl_event *      /* event_wait_list */,
-                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
-#ifdef __cplusplus
-#endif  // __OPENCL_CL_GL_H
diff --git a/src/bullet/MiniCL/cl_platform.h b/src/bullet/MiniCL/cl_platform.h
deleted file mode 100644
index 43219e14..00000000
--- a/src/bullet/MiniCL/cl_platform.h
+++ /dev/null
@@ -1,254 +0,0 @@
- * Copyright (c) 2008-2009 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- **********************************************************************************/
-#ifndef __CL_PLATFORM_H
-#define __CL_PLATFORM_H
-#define CL_PLATFORM_MINI_CL  0x12345
-struct MiniCLKernelDesc
-	MiniCLKernelDesc(void* pCode, const char* pName);
-#define MINICL_REGISTER(__kernel_func) static MiniCLKernelDesc __kernel_func##Desc((void*)__kernel_func, #__kernel_func);
-#ifdef __APPLE__
-    /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
-    #include <AvailabilityMacros.h>
-#ifdef __cplusplus
-extern "C" {
-#define CL_API_ENTRY
-#define CL_API_CALL
-#ifdef __APPLE__
-#define CL_EXTENSION_WEAK_LINK       __attribute__((weak_import))       
-#define CL_EXTENSION_WEAK_LINK                         
-#if defined (_WIN32) && ! defined (__MINGW32__)
-typedef signed   __int8  int8_t;
-typedef unsigned __int8  uint8_t;
-typedef signed   __int16 int16_t;
-typedef unsigned __int16 uint16_t;
-typedef signed   __int32 int32_t;
-typedef unsigned __int32 uint32_t;
-typedef signed   __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-typedef int8_t          cl_char;
-typedef uint8_t         cl_uchar;
-typedef int16_t         cl_short    ;
-typedef uint16_t        cl_ushort   ;
-typedef int32_t         cl_int      ;
-typedef uint32_t        cl_uint     ;
-typedef int64_t         cl_long     ;
-typedef uint64_t        cl_ulong    ;
-typedef uint16_t        cl_half     ;
-typedef float           cl_float    ;
-typedef double          cl_double   ;
-typedef int8_t          cl_char2[2]     ;
-typedef int8_t          cl_char4[4]     ;
-typedef int8_t          cl_char8[8]     ;
-typedef int8_t          cl_char16[16]   ;
-typedef uint8_t         cl_uchar2[2]    ;
-typedef uint8_t         cl_uchar4[4]    ;
-typedef uint8_t         cl_uchar8[8]    ;
-typedef uint8_t         cl_uchar16[16]  ;
-typedef int16_t         cl_short2[2]     ;
-typedef int16_t         cl_short4[4]     ;
-typedef int16_t         cl_short8[8]     ;
-typedef int16_t         cl_short16[16]   ;
-typedef uint16_t        cl_ushort2[2]    ;
-typedef uint16_t        cl_ushort4[4]    ;
-typedef uint16_t        cl_ushort8[8]    ;
-typedef uint16_t        cl_ushort16[16]  ;
-typedef int32_t         cl_int2[2]     ;
-typedef int32_t         cl_int4[4]     ;
-typedef int32_t         cl_int8[8]     ;
-typedef int32_t         cl_int16[16]    ;
-typedef uint32_t        cl_uint2[2]     ;
-typedef uint32_t        cl_uint4[4]     ;
-typedef uint32_t        cl_uint8[8]     ;
-typedef uint32_t        cl_uint16[16]   ;
-typedef int64_t         cl_long2[2]     ;
-typedef int64_t         cl_long4[4]     ;
-typedef int64_t         cl_long8[8]     ;
-typedef int64_t         cl_long16[16]   ;
-typedef uint64_t        cl_ulong2[2]    ;
-typedef uint64_t        cl_ulong4[4]    ;
-typedef uint64_t        cl_ulong8[8]    ;
-typedef uint64_t        cl_ulong16[16]  ;
-typedef float           cl_float2[2]    ;
-typedef float           cl_float4[4]    ;
-typedef float           cl_float8[8]    ;
-typedef float           cl_float16[16]  ;
-typedef double          cl_double2[2]   ;
-typedef double          cl_double4[4]   ;
-typedef double          cl_double8[8]   ;
-typedef double          cl_double16[16] ;
-#include <stdint.h>
-/* scalar types  */
-typedef int8_t          cl_char;
-typedef uint8_t         cl_uchar;
-typedef int16_t         cl_short    __attribute__((aligned(2)));
-typedef uint16_t        cl_ushort   __attribute__((aligned(2)));
-typedef int32_t         cl_int      __attribute__((aligned(4)));
-typedef uint32_t        cl_uint     __attribute__((aligned(4)));
-typedef int64_t         cl_long     __attribute__((aligned(8)));
-typedef uint64_t        cl_ulong    __attribute__((aligned(8)));
-typedef uint16_t        cl_half     __attribute__((aligned(2)));
-typedef float           cl_float    __attribute__((aligned(4)));
-typedef double          cl_double   __attribute__((aligned(8)));
- * Vector types 
- *
- *  Note:   OpenCL requires that all types be naturally aligned. 
- *          This means that vector types must be naturally aligned.
- *          For example, a vector of four floats must be aligned to
- *          a 16 byte boundary (calculated as 4 * the natural 4-byte 
- *          alignment of the float).  The alignment qualifiers here
- *          will only function properly if your compiler supports them
- *          and if you don't actively work to defeat them.  For example,
- *          in order for a cl_float4 to be 16 byte aligned in a struct,
- *          the start of the struct must itself be 16-byte aligned. 
- *
- *          Maintaining proper alignment is the user's responsibility.
- */
-typedef int8_t          cl_char2[2]     __attribute__((aligned(2)));
-typedef int8_t          cl_char4[4]     __attribute__((aligned(4)));
-typedef int8_t          cl_char8[8]     __attribute__((aligned(8)));
-typedef int8_t          cl_char16[16]   __attribute__((aligned(16)));
-typedef uint8_t         cl_uchar2[2]    __attribute__((aligned(2)));
-typedef uint8_t         cl_uchar4[4]    __attribute__((aligned(4)));
-typedef uint8_t         cl_uchar8[8]    __attribute__((aligned(8)));
-typedef uint8_t         cl_uchar16[16]  __attribute__((aligned(16)));
-typedef int16_t         cl_short2[2]     __attribute__((aligned(4)));
-typedef int16_t         cl_short4[4]     __attribute__((aligned(8)));
-typedef int16_t         cl_short8[8]     __attribute__((aligned(16)));
-typedef int16_t         cl_short16[16]   __attribute__((aligned(32)));
-typedef uint16_t        cl_ushort2[2]    __attribute__((aligned(4)));
-typedef uint16_t        cl_ushort4[4]    __attribute__((aligned(8)));
-typedef uint16_t        cl_ushort8[8]    __attribute__((aligned(16)));
-typedef uint16_t        cl_ushort16[16]  __attribute__((aligned(32)));
-typedef int32_t         cl_int2[2]      __attribute__((aligned(8)));
-typedef int32_t         cl_int4[4]      __attribute__((aligned(16)));
-typedef int32_t         cl_int8[8]      __attribute__((aligned(32)));
-typedef int32_t         cl_int16[16]    __attribute__((aligned(64)));
-typedef uint32_t        cl_uint2[2]     __attribute__((aligned(8)));
-typedef uint32_t        cl_uint4[4]     __attribute__((aligned(16)));
-typedef uint32_t        cl_uint8[8]     __attribute__((aligned(32)));
-typedef uint32_t        cl_uint16[16]   __attribute__((aligned(64)));
-typedef int64_t         cl_long2[2]     __attribute__((aligned(16)));
-typedef int64_t         cl_long4[4]     __attribute__((aligned(32)));
-typedef int64_t         cl_long8[8]     __attribute__((aligned(64)));
-typedef int64_t         cl_long16[16]   __attribute__((aligned(128)));
-typedef uint64_t        cl_ulong2[2]    __attribute__((aligned(16)));
-typedef uint64_t        cl_ulong4[4]    __attribute__((aligned(32)));
-typedef uint64_t        cl_ulong8[8]    __attribute__((aligned(64)));
-typedef uint64_t        cl_ulong16[16]  __attribute__((aligned(128)));
-typedef float           cl_float2[2]    __attribute__((aligned(8)));
-typedef float           cl_float4[4]    __attribute__((aligned(16)));
-typedef float           cl_float8[8]    __attribute__((aligned(32)));
-typedef float           cl_float16[16]  __attribute__((aligned(64)));
-typedef double          cl_double2[2]   __attribute__((aligned(16)));
-typedef double          cl_double4[4]   __attribute__((aligned(32)));
-typedef double          cl_double8[8]   __attribute__((aligned(64)));
-typedef double          cl_double16[16] __attribute__((aligned(128)));
-#include <stddef.h>
-/* and a few goodies to go with them */
-#define CL_CHAR_BIT         8
-#define CL_SCHAR_MAX        127
-#define CL_SCHAR_MIN        (-127-1)
-#define CL_CHAR_MAX         CL_SCHAR_MAX
-#define CL_CHAR_MIN         CL_SCHAR_MIN
-#define CL_UCHAR_MAX        255
-#define CL_SHRT_MAX         32767
-#define CL_SHRT_MIN         (-32767-1)
-#define CL_USHRT_MAX        65535
-#define CL_INT_MAX          2147483647
-#define CL_INT_MIN          (-2147483647-1)
-#define CL_UINT_MAX         0xffffffffU
-#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
-#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
-#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
-#define CL_FLT_DIG          6
-#define CL_FLT_MANT_DIG     24
-#define CL_FLT_MAX_10_EXP   +38
-#define CL_FLT_MAX_EXP      +128
-#define CL_FLT_MIN_10_EXP   -37
-#define CL_FLT_MIN_EXP      -125
-#define CL_FLT_RADIX        2
-#define CL_FLT_MAX          0x1.fffffep127f
-#define CL_FLT_MIN          0x1.0p-126f
-#define CL_FLT_EPSILON      0x1.0p-23f
-#define CL_DBL_DIG          15
-#define CL_DBL_MANT_DIG     53
-#define CL_DBL_MAX_10_EXP   +308
-#define CL_DBL_MAX_EXP      +1024
-#define CL_DBL_MIN_10_EXP   -307
-#define CL_DBL_MIN_EXP      -1021
-#define CL_DBL_RADIX        2
-#define CL_DBL_MAX          0x1.fffffffffffffp1023
-#define CL_DBL_MIN          0x1.0p-1022
-#define CL_DBL_EPSILON      0x1.0p-52
-/* There are no vector types for half */
-#ifdef __cplusplus
-#endif  // __CL_PLATFORM_H
diff --git a/src/bullet/btBulletCollisionCommon.h b/src/bullet/btBulletCollisionCommon.h
index 472690c1..af981b5d 100644
--- a/src/bullet/btBulletCollisionCommon.h
+++ b/src/bullet/btBulletCollisionCommon.h
@@ -45,7 +45,6 @@ subject to the following restrictions:
 ///Narrowphase Collision Detector
 #include "BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h"
-//btSphereBoxCollisionAlgorithm is broken, use gjk for now
 //#include "BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h"
 #include "BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.h"
diff --git a/src/bullet/btBulletDynamicsCommon.h b/src/bullet/btBulletDynamicsCommon.h
index ccfad19b..50282bf2 100644
--- a/src/bullet/btBulletDynamicsCommon.h
+++ b/src/bullet/btBulletDynamicsCommon.h
@@ -32,6 +32,9 @@ subject to the following restrictions:
 #include "BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h"
 #include "BulletDynamics/ConstraintSolver/btUniversalConstraint.h"
 #include "BulletDynamics/ConstraintSolver/btHinge2Constraint.h"
+#include "BulletDynamics/ConstraintSolver/btGearConstraint.h"
+#include "BulletDynamics/ConstraintSolver/btFixedConstraint.h"
 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
diff --git a/src/bullet/clew/clew.c b/src/bullet/clew/clew.c
new file mode 100644
index 00000000..a07b0aad
--- /dev/null
+++ b/src/bullet/clew/clew.c
@@ -0,0 +1,312 @@
+//  Copyright (c) 2009 Organic Vectory B.V.
+//  Written by George van Venrooij
+//  Distributed under the Boost Software License, Version 1.0.
+//  (See accompanying file license.txt)
+#include "clew.h"
+//  Platform abstraction for run-time library loading: Win32 loader API first, dlfcn (dlopen/dlsym) otherwise
+#ifdef _WIN32
+    #define WIN32_LEAN_AND_MEAN
+    #define VC_EXTRALEAN
+    #include <windows.h>
+    typedef HMODULE             CLEW_DYNLIB_HANDLE;
+    //  Name the ANSI entry point explicitly: clewInit() passes a narrow (const char*) path, and plain LoadLibrary maps to LoadLibraryW when UNICODE is defined
+    #define CLEW_DYNLIB_OPEN    LoadLibraryA
+    #define CLEW_DYNLIB_CLOSE   FreeLibrary
+    #define CLEW_DYNLIB_IMPORT  GetProcAddress
+    #include <dlfcn.h>
+    typedef void*                   CLEW_DYNLIB_HANDLE;
+    #define CLEW_DYNLIB_OPEN(path)  dlopen(path, RTLD_NOW | RTLD_GLOBAL)
+    #define CLEW_DYNLIB_CLOSE       dlclose
+    #define CLEW_DYNLIB_IMPORT      dlsym
+#include <stdlib.h>
+//! \brief module handle
+static CLEW_DYNLIB_HANDLE module = NULL;
+//  Variables holding function entry points.
+//  All start out NULL and are filled in by clewInit(); every pointer assigned there needs a definition here.
+PFNCLGETPLATFORMIDS                 __clewGetPlatformIDs                = NULL;
+PFNCLGETPLATFORMINFO                __clewGetPlatformInfo               = NULL;
+PFNCLGETDEVICEIDS                   __clewGetDeviceIDs                  = NULL;
+PFNCLGETDEVICEINFO                  __clewGetDeviceInfo                 = NULL;
+PFNCLCREATECONTEXT                  __clewCreateContext                 = NULL;
+PFNCLCREATECONTEXTFROMTYPE          __clewCreateContextFromType         = NULL;
+PFNCLRETAINCONTEXT                  __clewRetainContext                 = NULL;
+PFNCLRELEASECONTEXT                 __clewReleaseContext                = NULL;
+PFNCLGETCONTEXTINFO                 __clewGetContextInfo                = NULL;
+PFNCLCREATECOMMANDQUEUE             __clewCreateCommandQueue            = NULL;
+PFNCLRETAINCOMMANDQUEUE             __clewRetainCommandQueue            = NULL;
+PFNCLRELEASECOMMANDQUEUE            __clewReleaseCommandQueue           = NULL;
+PFNCLGETCOMMANDQUEUEINFO            __clewGetCommandQueueInfo           = NULL;
+PFNCLSETCOMMANDQUEUEPROPERTY        __clewSetCommandQueueProperty       = NULL;
+PFNCLCREATEBUFFER                   __clewCreateBuffer                  = NULL;
+PFNCLCREATESUBBUFFER                __clewCreateSubBuffer               = NULL;
+PFNCLCREATEIMAGE2D                  __clewCreateImage2D                 = NULL;
+PFNCLCREATEIMAGE3D                  __clewCreateImage3D                 = NULL;
+PFNCLRETAINMEMOBJECT                __clewRetainMemObject               = NULL;
+PFNCLRELEASEMEMOBJECT               __clewReleaseMemObject              = NULL;
+PFNCLGETSUPPORTEDIMAGEFORMATS       __clewGetSupportedImageFormats      = NULL;
+PFNCLGETMEMOBJECTINFO               __clewGetMemObjectInfo              = NULL;
+PFNCLGETIMAGEINFO                   __clewGetImageInfo                  = NULL;
+PFNCLSETMEMOBJECTDESTRUCTORCALLBACK __clewSetMemObjectDestructorCallback = NULL;
+PFNCLCREATESAMPLER                  __clewCreateSampler                 = NULL;
+PFNCLRETAINSAMPLER                  __clewRetainSampler                 = NULL;
+PFNCLRELEASESAMPLER                 __clewReleaseSampler                = NULL;
+PFNCLGETSAMPLERINFO                 __clewGetSamplerInfo                = NULL;
+PFNCLCREATEPROGRAMWITHSOURCE        __clewCreateProgramWithSource       = NULL;
+PFNCLCREATEPROGRAMWITHBINARY        __clewCreateProgramWithBinary       = NULL;
+PFNCLRETAINPROGRAM                  __clewRetainProgram                 = NULL;
+PFNCLRELEASEPROGRAM                 __clewReleaseProgram                = NULL;
+PFNCLBUILDPROGRAM                   __clewBuildProgram                  = NULL;
+PFNCLUNLOADCOMPILER                 __clewUnloadCompiler                = NULL;
+PFNCLGETPROGRAMINFO                 __clewGetProgramInfo                = NULL;
+PFNCLGETPROGRAMBUILDINFO            __clewGetProgramBuildInfo           = NULL;
+PFNCLCREATEKERNEL                   __clewCreateKernel                  = NULL;
+PFNCLCREATEKERNELSINPROGRAM         __clewCreateKernelsInProgram        = NULL;
+PFNCLRETAINKERNEL                   __clewRetainKernel                  = NULL;
+PFNCLRELEASEKERNEL                  __clewReleaseKernel                 = NULL;
+PFNCLSETKERNELARG                   __clewSetKernelArg                  = NULL;
+PFNCLGETKERNELINFO                  __clewGetKernelInfo                 = NULL;
+PFNCLGETKERNELWORKGROUPINFO         __clewGetKernelWorkGroupInfo        = NULL;
+PFNCLWAITFOREVENTS                  __clewWaitForEvents                 = NULL;
+PFNCLGETEVENTINFO                   __clewGetEventInfo                  = NULL;
+PFNCLCREATEUSEREVENT                __clewCreateUserEvent               = NULL;
+PFNCLRETAINEVENT                    __clewRetainEvent                   = NULL;
+PFNCLRELEASEEVENT                   __clewReleaseEvent                  = NULL;
+PFNCLSETUSEREVENTSTATUS             __clewSetUserEventStatus            = NULL;
+PFNCLSETEVENTCALLBACK               __clewSetEventCallback              = NULL;
+PFNCLGETEVENTPROFILINGINFO          __clewGetEventProfilingInfo         = NULL;
+PFNCLFLUSH                          __clewFlush                         = NULL;
+PFNCLFINISH                         __clewFinish                        = NULL;
+PFNCLENQUEUEREADBUFFER              __clewEnqueueReadBuffer             = NULL;
+PFNCLENQUEUEREADBUFFERRECT          __clewEnqueueReadBufferRect         = NULL;
+PFNCLENQUEUEWRITEBUFFER             __clewEnqueueWriteBuffer            = NULL;
+PFNCLENQUEUEWRITEBUFFERRECT         __clewEnqueueWriteBufferRect        = NULL;
+PFNCLENQUEUECOPYBUFFER              __clewEnqueueCopyBuffer             = NULL;
+PFNCLENQUEUEREADIMAGE               __clewEnqueueReadImage              = NULL;
+PFNCLENQUEUEWRITEIMAGE              __clewEnqueueWriteImage             = NULL;
+PFNCLENQUEUECOPYIMAGE               __clewEnqueueCopyImage              = NULL;
+PFNCLENQUEUECOPYBUFFERRECT          __clewEnqueueCopyBufferRect         = NULL;
+PFNCLENQUEUECOPYIMAGETOBUFFER       __clewEnqueueCopyImageToBuffer      = NULL;
+PFNCLENQUEUECOPYBUFFERTOIMAGE       __clewEnqueueCopyBufferToImage      = NULL;
+PFNCLENQUEUEMAPBUFFER               __clewEnqueueMapBuffer              = NULL;
+PFNCLENQUEUEMAPIMAGE                __clewEnqueueMapImage               = NULL;
+PFNCLENQUEUEUNMAPMEMOBJECT          __clewEnqueueUnmapMemObject         = NULL;
+PFNCLENQUEUENDRANGEKERNEL           __clewEnqueueNDRangeKernel          = NULL;
+PFNCLENQUEUETASK                    __clewEnqueueTask                   = NULL;
+PFNCLENQUEUENATIVEKERNEL            __clewEnqueueNativeKernel           = NULL;
+PFNCLENQUEUEMARKER                  __clewEnqueueMarker                 = NULL;
+PFNCLENQUEUEWAITFOREVENTS           __clewEnqueueWaitForEvents          = NULL;
+PFNCLENQUEUEBARRIER                 __clewEnqueueBarrier                = NULL;
+PFNCLGETEXTENSIONFUNCTIONADDRESS    __clewGetExtensionFunctionAddress   = NULL;
+//! \brief Unload the library opened by clewInit(), if any, and clear the module handle
+void clewExit(void)
+    //  No library is currently loaded: nothing to release
+    if (module == NULL)
+    {
+        return;
+    }
+    //  Best-effort unload; the result of the close call is deliberately not checked
+    CLEW_DYNLIB_CLOSE(module);
+    module = NULL;
+//! \brief Load the OpenCL dynamic library at \a path and resolve the OpenCL entry points from it
+//! \param path library name or path handed to the platform loader (CLEW_DYNLIB_OPEN)
+//! \return CLEW_SUCCESS when the library is loaded (or was already loaded),
+//!         CLEW_ERROR_OPEN_FAILED when it cannot be opened,
+//!         CLEW_ERROR_ATEXIT_FAILED when the unload hook cannot be registered
+//! Individual symbol lookups are not checked: an entry point the library does not export is left NULL.
+int clewInit(const char* path)
+    int error = 0;
+    //  Check if already initialized
+    if (module != NULL)
+    {
+        return CLEW_SUCCESS;
+    }
+    //  Load library
+    module = CLEW_DYNLIB_OPEN(path);
+    //  Check for errors
+    if (module == NULL)
+    {
+        return CLEW_ERROR_OPEN_FAILED;
+    }
+    //  Set unloading
+    error = atexit(clewExit);
+    if (error)
+    {
+        //  Failure queuing atexit, shutdown with error
+        CLEW_DYNLIB_CLOSE(module);
+        module = NULL;
+        //  Stop here: the handle is gone, so nothing may be imported from it and success must not be reported
+        return CLEW_ERROR_ATEXIT_FAILED;
+    }
+    //  Determine function entry-points
+    __clewGetPlatformIDs                = (PFNCLGETPLATFORMIDS              )CLEW_DYNLIB_IMPORT(module, "clGetPlatformIDs");
+    __clewGetPlatformInfo               = (PFNCLGETPLATFORMINFO             )CLEW_DYNLIB_IMPORT(module, "clGetPlatformInfo");
+    __clewGetDeviceIDs                  = (PFNCLGETDEVICEIDS                )CLEW_DYNLIB_IMPORT(module, "clGetDeviceIDs");
+    __clewGetDeviceInfo                 = (PFNCLGETDEVICEINFO               )CLEW_DYNLIB_IMPORT(module, "clGetDeviceInfo");
+    __clewCreateContext                 = (PFNCLCREATECONTEXT               )CLEW_DYNLIB_IMPORT(module, "clCreateContext");
+    __clewCreateContextFromType         = (PFNCLCREATECONTEXTFROMTYPE       )CLEW_DYNLIB_IMPORT(module, "clCreateContextFromType");
+    __clewRetainContext                 = (PFNCLRETAINCONTEXT               )CLEW_DYNLIB_IMPORT(module, "clRetainContext");
+    __clewReleaseContext                = (PFNCLRELEASECONTEXT              )CLEW_DYNLIB_IMPORT(module, "clReleaseContext");
+    __clewGetContextInfo                = (PFNCLGETCONTEXTINFO              )CLEW_DYNLIB_IMPORT(module, "clGetContextInfo");
+    __clewCreateCommandQueue            = (PFNCLCREATECOMMANDQUEUE          )CLEW_DYNLIB_IMPORT(module, "clCreateCommandQueue");
+    __clewRetainCommandQueue            = (PFNCLRETAINCOMMANDQUEUE          )CLEW_DYNLIB_IMPORT(module, "clRetainCommandQueue");
+    __clewReleaseCommandQueue           = (PFNCLRELEASECOMMANDQUEUE         )CLEW_DYNLIB_IMPORT(module, "clReleaseCommandQueue");
+    __clewGetCommandQueueInfo           = (PFNCLGETCOMMANDQUEUEINFO         )CLEW_DYNLIB_IMPORT(module, "clGetCommandQueueInfo");
+    __clewSetCommandQueueProperty       = (PFNCLSETCOMMANDQUEUEPROPERTY     )CLEW_DYNLIB_IMPORT(module, "clSetCommandQueueProperty");
+    __clewCreateBuffer                  = (PFNCLCREATEBUFFER                )CLEW_DYNLIB_IMPORT(module, "clCreateBuffer");
+    //  Must resolve its own symbol; looking up "clCreateBuffer" here would bind the sub-buffer pointer to the wrong function
+    __clewCreateSubBuffer               = (PFNCLCREATESUBBUFFER             )CLEW_DYNLIB_IMPORT(module, "clCreateSubBuffer");
+    __clewCreateImage2D                 = (PFNCLCREATEIMAGE2D               )CLEW_DYNLIB_IMPORT(module, "clCreateImage2D");
+    __clewCreateImage3D                 = (PFNCLCREATEIMAGE3D               )CLEW_DYNLIB_IMPORT(module, "clCreateImage3D");
+    __clewRetainMemObject               = (PFNCLRETAINMEMOBJECT             )CLEW_DYNLIB_IMPORT(module, "clRetainMemObject");
+    __clewReleaseMemObject              = (PFNCLRELEASEMEMOBJECT            )CLEW_DYNLIB_IMPORT(module, "clReleaseMemObject");
+    __clewGetSupportedImageFormats      = (PFNCLGETSUPPORTEDIMAGEFORMATS    )CLEW_DYNLIB_IMPORT(module, "clGetSupportedImageFormats");
+    __clewGetMemObjectInfo              = (PFNCLGETMEMOBJECTINFO            )CLEW_DYNLIB_IMPORT(module, "clGetMemObjectInfo");
+    __clewGetImageInfo                  = (PFNCLGETIMAGEINFO                )CLEW_DYNLIB_IMPORT(module, "clGetImageInfo");
+    __clewSetMemObjectDestructorCallback = (PFNCLSETMEMOBJECTDESTRUCTORCALLBACK)CLEW_DYNLIB_IMPORT(module, "clSetMemObjectDestructorCallback");
+    __clewCreateSampler                 = (PFNCLCREATESAMPLER               )CLEW_DYNLIB_IMPORT(module, "clCreateSampler");
+    __clewRetainSampler                 = (PFNCLRETAINSAMPLER               )CLEW_DYNLIB_IMPORT(module, "clRetainSampler");
+    __clewReleaseSampler                = (PFNCLRELEASESAMPLER              )CLEW_DYNLIB_IMPORT(module, "clReleaseSampler");
+    __clewGetSamplerInfo                = (PFNCLGETSAMPLERINFO              )CLEW_DYNLIB_IMPORT(module, "clGetSamplerInfo");
+    __clewCreateProgramWithSource       = (PFNCLCREATEPROGRAMWITHSOURCE     )CLEW_DYNLIB_IMPORT(module, "clCreateProgramWithSource");
+    __clewCreateProgramWithBinary       = (PFNCLCREATEPROGRAMWITHBINARY     )CLEW_DYNLIB_IMPORT(module, "clCreateProgramWithBinary");
+    __clewRetainProgram                 = (PFNCLRETAINPROGRAM               )CLEW_DYNLIB_IMPORT(module, "clRetainProgram");
+    __clewReleaseProgram                = (PFNCLRELEASEPROGRAM              )CLEW_DYNLIB_IMPORT(module, "clReleaseProgram");
+    __clewBuildProgram                  = (PFNCLBUILDPROGRAM                )CLEW_DYNLIB_IMPORT(module, "clBuildProgram");
+    __clewUnloadCompiler                = (PFNCLUNLOADCOMPILER              )CLEW_DYNLIB_IMPORT(module, "clUnloadCompiler");
+    __clewGetProgramInfo                = (PFNCLGETPROGRAMINFO              )CLEW_DYNLIB_IMPORT(module, "clGetProgramInfo");
+    __clewGetProgramBuildInfo           = (PFNCLGETPROGRAMBUILDINFO         )CLEW_DYNLIB_IMPORT(module, "clGetProgramBuildInfo");
+    __clewCreateKernel                  = (PFNCLCREATEKERNEL                )CLEW_DYNLIB_IMPORT(module, "clCreateKernel");
+    __clewCreateKernelsInProgram        = (PFNCLCREATEKERNELSINPROGRAM      )CLEW_DYNLIB_IMPORT(module, "clCreateKernelsInProgram");
+    __clewRetainKernel                  = (PFNCLRETAINKERNEL                )CLEW_DYNLIB_IMPORT(module, "clRetainKernel");
+    __clewReleaseKernel                 = (PFNCLRELEASEKERNEL               )CLEW_DYNLIB_IMPORT(module, "clReleaseKernel");
+    __clewSetKernelArg                  = (PFNCLSETKERNELARG                )CLEW_DYNLIB_IMPORT(module, "clSetKernelArg");
+    __clewGetKernelInfo                 = (PFNCLGETKERNELINFO               )CLEW_DYNLIB_IMPORT(module, "clGetKernelInfo");
+    __clewGetKernelWorkGroupInfo        = (PFNCLGETKERNELWORKGROUPINFO      )CLEW_DYNLIB_IMPORT(module, "clGetKernelWorkGroupInfo");
+    __clewWaitForEvents                 = (PFNCLWAITFOREVENTS               )CLEW_DYNLIB_IMPORT(module, "clWaitForEvents");
+    __clewGetEventInfo                  = (PFNCLGETEVENTINFO                )CLEW_DYNLIB_IMPORT(module, "clGetEventInfo");
+    __clewCreateUserEvent               = (PFNCLCREATEUSEREVENT             )CLEW_DYNLIB_IMPORT(module, "clCreateUserEvent");
+    __clewRetainEvent                   = (PFNCLRETAINEVENT                 )CLEW_DYNLIB_IMPORT(module, "clRetainEvent");
+    __clewReleaseEvent                  = (PFNCLRELEASEEVENT                )CLEW_DYNLIB_IMPORT(module, "clReleaseEvent");
+    __clewSetUserEventStatus            = (PFNCLSETUSEREVENTSTATUS          )CLEW_DYNLIB_IMPORT(module, "clSetUserEventStatus");
+    __clewSetEventCallback              = (PFNCLSETEVENTCALLBACK            )CLEW_DYNLIB_IMPORT(module, "clSetEventCallback");
+    __clewGetEventProfilingInfo         = (PFNCLGETEVENTPROFILINGINFO       )CLEW_DYNLIB_IMPORT(module, "clGetEventProfilingInfo");
+    __clewFlush                         = (PFNCLFLUSH                       )CLEW_DYNLIB_IMPORT(module, "clFlush");
+    __clewFinish                        = (PFNCLFINISH                      )CLEW_DYNLIB_IMPORT(module, "clFinish");
+    __clewEnqueueReadBuffer             = (PFNCLENQUEUEREADBUFFER           )CLEW_DYNLIB_IMPORT(module, "clEnqueueReadBuffer");
+    __clewEnqueueReadBufferRect         = (PFNCLENQUEUEREADBUFFERRECT       )CLEW_DYNLIB_IMPORT(module, "clEnqueueReadBufferRect");
+    __clewEnqueueWriteBuffer            = (PFNCLENQUEUEWRITEBUFFER          )CLEW_DYNLIB_IMPORT(module, "clEnqueueWriteBuffer");
+    __clewEnqueueWriteBufferRect        = (PFNCLENQUEUEWRITEBUFFERRECT      )CLEW_DYNLIB_IMPORT(module, "clEnqueueWriteBufferRect");
+    __clewEnqueueCopyBuffer             = (PFNCLENQUEUECOPYBUFFER           )CLEW_DYNLIB_IMPORT(module, "clEnqueueCopyBuffer");
+    __clewEnqueueCopyBufferRect         = (PFNCLENQUEUECOPYBUFFERRECT       )CLEW_DYNLIB_IMPORT(module, "clEnqueueCopyBufferRect");
+    __clewEnqueueReadImage              = (PFNCLENQUEUEREADIMAGE            )CLEW_DYNLIB_IMPORT(module, "clEnqueueReadImage");
+    __clewEnqueueWriteImage             = (PFNCLENQUEUEWRITEIMAGE           )CLEW_DYNLIB_IMPORT(module, "clEnqueueWriteImage");
+    __clewEnqueueCopyImage              = (PFNCLENQUEUECOPYIMAGE            )CLEW_DYNLIB_IMPORT(module, "clEnqueueCopyImage");
+    __clewEnqueueCopyImageToBuffer      = (PFNCLENQUEUECOPYIMAGETOBUFFER    )CLEW_DYNLIB_IMPORT(module, "clEnqueueCopyImageToBuffer");
+    __clewEnqueueCopyBufferToImage      = (PFNCLENQUEUECOPYBUFFERTOIMAGE    )CLEW_DYNLIB_IMPORT(module, "clEnqueueCopyBufferToImage");
+    __clewEnqueueMapBuffer              = (PFNCLENQUEUEMAPBUFFER            )CLEW_DYNLIB_IMPORT(module, "clEnqueueMapBuffer");
+    __clewEnqueueMapImage               = (PFNCLENQUEUEMAPIMAGE             )CLEW_DYNLIB_IMPORT(module, "clEnqueueMapImage");
+    __clewEnqueueUnmapMemObject         = (PFNCLENQUEUEUNMAPMEMOBJECT       )CLEW_DYNLIB_IMPORT(module, "clEnqueueUnmapMemObject");
+    __clewEnqueueNDRangeKernel          = (PFNCLENQUEUENDRANGEKERNEL        )CLEW_DYNLIB_IMPORT(module, "clEnqueueNDRangeKernel");
+    __clewEnqueueTask                   = (PFNCLENQUEUETASK                 )CLEW_DYNLIB_IMPORT(module, "clEnqueueTask");
+    __clewEnqueueNativeKernel           = (PFNCLENQUEUENATIVEKERNEL         )CLEW_DYNLIB_IMPORT(module, "clEnqueueNativeKernel");
+    __clewEnqueueMarker                 = (PFNCLENQUEUEMARKER               )CLEW_DYNLIB_IMPORT(module, "clEnqueueMarker");
+    __clewEnqueueWaitForEvents          = (PFNCLENQUEUEWAITFOREVENTS        )CLEW_DYNLIB_IMPORT(module, "clEnqueueWaitForEvents");
+    __clewEnqueueBarrier                = (PFNCLENQUEUEBARRIER              )CLEW_DYNLIB_IMPORT(module, "clEnqueueBarrier");
+    __clewGetExtensionFunctionAddress   = (PFNCLGETEXTENSIONFUNCTIONADDRESS )CLEW_DYNLIB_IMPORT(module, "clGetExtensionFunctionAddress");
+    return CLEW_SUCCESS;
+//! \brief Translate an OpenCL status code into the name of its CL_* constant
+//! \param error OpenCL status code (0 for CL_SUCCESS, negative for errors)
+//! \return the constant's name; "" for codes with no name in the table, including codes outside its range
+const char* clewErrorString(cl_int error)
+    static const char* strings[] =
+    {
+        // Error Codes
+          "CL_SUCCESS"                                  //   0
+        , "CL_DEVICE_NOT_FOUND"                         //  -1
+        , "CL_DEVICE_NOT_AVAILABLE"                     //  -2
+        , "CL_COMPILER_NOT_AVAILABLE"                   //  -3
+        , "CL_MEM_OBJECT_ALLOCATION_FAILURE"            //  -4
+        , "CL_OUT_OF_RESOURCES"                         //  -5
+        , "CL_OUT_OF_HOST_MEMORY"                       //  -6
+        , "CL_PROFILING_INFO_NOT_AVAILABLE"             //  -7
+        , "CL_MEM_COPY_OVERLAP"                         //  -8
+        , "CL_IMAGE_FORMAT_MISMATCH"                    //  -9
+        , "CL_IMAGE_FORMAT_NOT_SUPPORTED"               //  -10
+        , "CL_BUILD_PROGRAM_FAILURE"                    //  -11
+        , "CL_MAP_FAILURE"                              //  -12
+        , ""    //  -13
+        , ""    //  -14
+        , ""    //  -15
+        , ""    //  -16
+        , ""    //  -17
+        , ""    //  -18
+        , ""    //  -19
+        , ""    //  -20
+        , ""    //  -21
+        , ""    //  -22
+        , ""    //  -23
+        , ""    //  -24
+        , ""    //  -25
+        , ""    //  -26
+        , ""    //  -27
+        , ""    //  -28
+        , ""    //  -29
+        , "CL_INVALID_VALUE"                            //  -30
+        , "CL_INVALID_DEVICE_TYPE"                      //  -31
+        , "CL_INVALID_PLATFORM"                         //  -32
+        , "CL_INVALID_DEVICE"                           //  -33
+        , "CL_INVALID_CONTEXT"                          //  -34
+        , "CL_INVALID_QUEUE_PROPERTIES"                 //  -35
+        , "CL_INVALID_COMMAND_QUEUE"                    //  -36
+        , "CL_INVALID_HOST_PTR"                         //  -37
+        , "CL_INVALID_MEM_OBJECT"                       //  -38
+        , "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"          //  -39
+        , "CL_INVALID_IMAGE_SIZE"                       //  -40
+        , "CL_INVALID_SAMPLER"                          //  -41
+        , "CL_INVALID_BINARY"                           //  -42
+        , "CL_INVALID_BUILD_OPTIONS"                    //  -43
+        , "CL_INVALID_PROGRAM"                          //  -44
+        , "CL_INVALID_PROGRAM_EXECUTABLE"               //  -45
+        , "CL_INVALID_KERNEL_NAME"                      //  -46
+        , "CL_INVALID_KERNEL_DEFINITION"                //  -47
+        , "CL_INVALID_KERNEL"                           //  -48
+        , "CL_INVALID_ARG_INDEX"                        //  -49
+        , "CL_INVALID_ARG_VALUE"                        //  -50
+        , "CL_INVALID_ARG_SIZE"                         //  -51
+        , "CL_INVALID_KERNEL_ARGS"                      //  -52
+        , "CL_INVALID_WORK_DIMENSION"                   //  -53
+        , "CL_INVALID_WORK_GROUP_SIZE"                  //  -54
+        , "CL_INVALID_WORK_ITEM_SIZE"                   //  -55
+        , "CL_INVALID_GLOBAL_OFFSET"                    //  -56
+        , "CL_INVALID_EVENT_WAIT_LIST"                  //  -57
+        , "CL_INVALID_EVENT"                            //  -58
+        , "CL_INVALID_OPERATION"                        //  -59
+        , "CL_INVALID_GL_OBJECT"                        //  -60
+        , "CL_INVALID_BUFFER_SIZE"                      //  -61
+        , "CL_INVALID_MIP_LEVEL"                        //  -62
+        , "CL_INVALID_GLOBAL_WORK_SIZE"                 //  -63
+    };
+    //  Number of table entries; valid codes are 0 down to -(count - 1)
+    const int count = (int)(sizeof(strings) / sizeof(strings[0]));
+    //  The table is indexed by -error. Range-check before negating so that a positive code or one
+    //  below the last entry can neither index outside the table nor overflow on negation.
+    if (error > 0 || error <= -count)
+    {
+        return "";
+    }
+    return strings[-error];
diff --git a/src/bullet/clew/clew.h b/src/bullet/clew/clew.h
new file mode 100644
index 00000000..ee0fef18
--- /dev/null
+++ b/src/bullet/clew/clew.h
@@ -0,0 +1,2397 @@
+//  Copyright (c) 2009-2011 Organic Vectory B.V., KindDragon
+//  Written by George van Venrooij
+//  Distributed under the MIT License.
+//! \file clew.h
+//! \brief OpenCL run-time loader header
+//! This file contains a copy of the contents of CL.H and CL_PLATFORM.H from the 
+//! official OpenCL spec. The purpose of this code is to load the OpenCL dynamic
+//! library at run-time and thus allow the executable to function on many
+//! platforms regardless of the vendor of the OpenCL driver actually installed.
+//! Some of the techniques used here were inspired by work done in the GLEW
+//! library (http://glew.sourceforge.net/)
+//  Run-time dynamic linking functionality based on concepts used in GLEW
+#ifdef  __OPENCL_CL_H
+#error cl.h included before clew.h
+#error cl_platform.h included before clew.h
+//  Prevent cl.h inclusion
+#define __OPENCL_CL_H
+//  Prevent cl_platform.h inclusion
+#define __CL_PLATFORM_H
+* Copyright (c) 2008-2010 The Khronos Group Inc.
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and/or associated documentation files (the
+* "Materials"), to deal in the Materials without restriction, including
+* without limitation the rights to use, copy, modify, merge, publish,
+* distribute, sublicense, and/or sell copies of the Materials, and to
+* permit persons to whom the Materials are furnished to do so, subject to
+* the following conditions:
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Materials.
+#ifdef __APPLE__
+    /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+    #include <AvailabilityMacros.h>
+#ifdef __cplusplus
+extern "C" {
+#if defined(_WIN32)
+    #define CL_API_ENTRY
+    #define CL_API_CALL     __stdcall
+    #define CL_CALLBACK     __stdcall
+    #define CL_API_ENTRY
+    #define CL_API_CALL
+    #define CL_CALLBACK
+//disabled the APPLE thing, don't know why it is there, it just causes tons of warnings
+#ifdef __APPLE1__
+    #define CL_EXTENSION_WEAK_LINK                  __attribute__((weak_import))       
+    #define CL_API_SUFFIX__VERSION_1_1              CL_EXTENSION_WEAK_LINK
+    #define CL_EXT_SUFFIX__VERSION_1_1              CL_EXTENSION_WEAK_LINK
+    #define CL_EXTENSION_WEAK_LINK                         
+    #define CL_API_SUFFIX__VERSION_1_0
+    #define CL_EXT_SUFFIX__VERSION_1_0
+    #define CL_API_SUFFIX__VERSION_1_1
+    #define CL_EXT_SUFFIX__VERSION_1_1
+#if (defined (_WIN32) && defined(_MSC_VER))
+/* scalar types  */
+typedef signed   __int8         cl_char;
+typedef unsigned __int8         cl_uchar;
+typedef signed   __int16        cl_short;
+typedef unsigned __int16        cl_ushort;
+typedef signed   __int32        cl_int;
+typedef unsigned __int32        cl_uint;
+typedef signed   __int64        cl_long;
+typedef unsigned __int64        cl_ulong;
+typedef unsigned __int16        cl_half;
+typedef float                   cl_float;
+typedef double                  cl_double;
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+/* Single-precision limits for the MSVC branch, written as decimal literals */
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN          1.175494350822287507969e-38f
+/* 2^-23 spelled in decimal: hexadecimal floating literals are not accepted by every MSVC version/language mode */
+#define CL_FLT_EPSILON      1.1920928955078125e-7f
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN          2.225073858507201383090e-308
+#define CL_DBL_EPSILON      2.220446049250313080847e-16
+#define  CL_M_E             2.718281828459045090796
+#define  CL_M_LOG2E         1.442695040888963387005
+#define  CL_M_LOG10E        0.434294481903251816668
+#define  CL_M_LN2           0.693147180559945286227
+#define  CL_M_LN10          2.302585092994045901094
+#define  CL_M_PI            3.141592653589793115998
+#define  CL_M_PI_2          1.570796326794896557999
+#define  CL_M_PI_4          0.785398163397448278999
+#define  CL_M_1_PI          0.318309886183790691216
+#define  CL_M_2_PI          0.636619772367581382433
+#define  CL_M_2_SQRTPI      1.128379167095512558561
+#define  CL_M_SQRT2         1.414213562373095145475
+#define  CL_M_SQRT1_2       0.707106781186547572737
+#define  CL_M_E_F           2.71828174591064f
+#define  CL_M_LOG2E_F       1.44269502162933f
+#define  CL_M_LOG10E_F      0.43429449200630f
+#define  CL_M_LN2_F         0.69314718246460f
+#define  CL_M_LN10_F        2.30258512496948f
+#define  CL_M_PI_F          3.14159274101257f
+#define  CL_M_PI_2_F        1.57079637050629f
+#define  CL_M_PI_4_F        0.78539818525314f
+#define  CL_M_1_PI_F        0.31830987334251f
+#define  CL_M_2_PI_F        0.63661974668503f
+#define  CL_M_2_SQRTPI_F    1.12837922573090f
+#define  CL_M_SQRT2_F       1.41421353816986f
+#define  CL_M_SQRT1_2_F     0.70710676908493f
+#define CL_NAN              (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF        ((cl_float) 1e50)
+#define CL_HUGE_VAL         ((cl_double) 1e500)
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+#include <stdint.h>
+/* scalar types  */
+typedef int8_t          cl_char;
+typedef uint8_t         cl_uchar;
+typedef int16_t         cl_short    __attribute__((aligned(2)));
+typedef uint16_t        cl_ushort   __attribute__((aligned(2)));
+typedef int32_t         cl_int      __attribute__((aligned(4)));
+typedef uint32_t        cl_uint     __attribute__((aligned(4)));
+typedef int64_t         cl_long     __attribute__((aligned(8)));
+typedef uint64_t        cl_ulong    __attribute__((aligned(8)));
+typedef uint16_t        cl_half     __attribute__((aligned(2)));
+typedef float           cl_float    __attribute__((aligned(4)));
+typedef double          cl_double   __attribute__((aligned(8)));
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          0x1.fffffep127f
+#define CL_FLT_MIN          0x1.0p-126f
+#define CL_FLT_EPSILON      0x1.0p-23f
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          0x1.fffffffffffffp1023
+#define CL_DBL_MIN          0x1.0p-1022
+#define CL_DBL_EPSILON      0x1.0p-52
+#define  CL_M_E             2.718281828459045090796
+#define  CL_M_LOG2E         1.442695040888963387005
+#define  CL_M_LOG10E        0.434294481903251816668
+#define  CL_M_LN2           0.693147180559945286227
+#define  CL_M_LN10          2.302585092994045901094
+#define  CL_M_PI            3.141592653589793115998
+#define  CL_M_PI_2          1.570796326794896557999
+#define  CL_M_PI_4          0.785398163397448278999
+#define  CL_M_1_PI          0.318309886183790691216
+#define  CL_M_2_PI          0.636619772367581382433
+#define  CL_M_2_SQRTPI      1.128379167095512558561
+#define  CL_M_SQRT2         1.414213562373095145475
+#define  CL_M_SQRT1_2       0.707106781186547572737
+#define  CL_M_E_F           2.71828174591064f
+#define  CL_M_LOG2E_F       1.44269502162933f
+#define  CL_M_LOG10E_F      0.43429449200630f
+#define  CL_M_LN2_F         0.69314718246460f
+#define  CL_M_LN10_F        2.30258512496948f
+#define  CL_M_PI_F          3.14159274101257f
+#define  CL_M_PI_2_F        1.57079637050629f
+#define  CL_M_PI_4_F        0.78539818525314f
+#define  CL_M_1_PI_F        0.31830987334251f
+#define  CL_M_2_PI_F        0.63661974668503f
+#define  CL_M_2_SQRTPI_F    1.12837922573090f
+#define  CL_M_SQRT2_F       1.41421353816986f
+#define  CL_M_SQRT1_2_F     0.70710676908493f
+#if defined( __GNUC__ )
+   #define CL_HUGE_VALF     __builtin_huge_valf()
+   #define CL_HUGE_VAL      __builtin_huge_val()
+   #define CL_NAN           __builtin_nanf( "" )
+   #define CL_HUGE_VALF     ((cl_float) 1e50)
+   #define CL_HUGE_VAL      ((cl_double) 1e500)
+   float nanf( const char * );
+   #define CL_NAN           nanf( "" )  
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+#include <stddef.h>
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int          cl_GLint;
+typedef unsigned int cl_GLenum;
+ * Vector types 
+ *
+ *  Note:   OpenCL requires that all types be naturally aligned. 
+ *          This means that vector types must be naturally aligned.
+ *          For example, a vector of four floats must be aligned to
+ *          a 16 byte boundary (calculated as 4 * the natural 4-byte 
+ *          alignment of the float).  The alignment qualifiers here
+ *          will only function properly if your compiler supports them
+ *          and if you don't actively work to defeat them.  For example,
+ *          in order for a cl_float4 to be 16 byte aligned in a struct,
+ *          the start of the struct must itself be 16-byte aligned. 
+ *
+ *          Maintaining proper alignment is the user's responsibility.
+ */
+#ifdef _MSC_VER  /* MSVC: synthesize the GCC-style __SSE__ / __SSE2__ macros that the vector-type section below tests */
+#if defined(_M_IX86)  /* 32-bit x86 target */
+#if _M_IX86_FP >= 0  /* NOTE(review): MSVC documents _M_IX86_FP as 0 when no SSE /arch option is in effect, so this test is always true - confirm that is intended */
+#define __SSE__
+#if _M_IX86_FP >= 1  /* NOTE(review): 1 corresponds to /arch:SSE in the MSVC docs, yet __SSE2__ is enabled here - confirm */
+#define __SSE2__
+#elif defined(_M_X64)  /* x64 target: both macros are defined unconditionally */
+#define __SSE__
+#define __SSE2__
+/* Define basic vector types: each __cl_<type><n> typedef is paired with a __CL_<TYPE><N>__ flag that the cl_<type>n unions test before adding a native vN member */
+#if defined( __VEC__ )  /* AltiVec */
+   #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+   typedef vector unsigned char     __cl_uchar16;
+   typedef vector signed char       __cl_char16;
+   typedef vector unsigned short    __cl_ushort8;
+   typedef vector signed short      __cl_short8;
+   typedef vector unsigned int      __cl_uint4;
+   typedef vector signed int        __cl_int4;
+   typedef vector float             __cl_float4;
+   #define  __CL_UCHAR16__  1  /* one flag per typedef above */
+   #define  __CL_CHAR16__   1
+   #define  __CL_USHORT8__  1
+   #define  __CL_SHORT8__   1
+   #define  __CL_UINT4__    1
+   #define  __CL_INT4__     1
+   #define  __CL_FLOAT4__   1
+#if defined( __SSE__ )  /* SSE: provides the 16-byte float vector only */
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <xmmintrin.h>
+    #endif
+    #if defined( __GNUC__ ) && !defined( __ICC )
+        typedef float __cl_float4   __attribute__((vector_size(16)));  /* GCC vector extension type */
+    #else
+        typedef __m128 __cl_float4;  /* other compilers (including ICC): use the intrinsic type */
+    #endif
+    #define __CL_FLOAT4__   1
+#if defined( __SSE2__ )  /* SSE2: 16-byte integer vectors and the two-element double vector */
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <emmintrin.h>
+    #endif
+    #if defined( __GNUC__ ) && !defined( __ICC )
+        typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));  /* GCC vector extension: element type is preserved */
+        typedef cl_char     __cl_char16     __attribute__((vector_size(16)));
+        typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));
+        typedef cl_short    __cl_short8     __attribute__((vector_size(16)));
+        typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));
+        typedef cl_int      __cl_int4       __attribute__((vector_size(16)));
+        typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));
+        typedef cl_long     __cl_long2      __attribute__((vector_size(16)));
+        typedef cl_double   __cl_double2    __attribute__((vector_size(16)));
+    #else
+        typedef __m128i __cl_uchar16;  /* intrinsic types: every integer vector shares __m128i */
+        typedef __m128i __cl_char16;
+        typedef __m128i __cl_ushort8;
+        typedef __m128i __cl_short8;
+        typedef __m128i __cl_uint4;
+        typedef __m128i __cl_int4;
+        typedef __m128i __cl_ulong2;
+        typedef __m128i __cl_long2;
+        typedef __m128d __cl_double2;
+    #endif
+    #define __CL_UCHAR16__  1
+    #define __CL_CHAR16__   1
+    #define __CL_USHORT8__  1
+    #define __CL_SHORT8__   1
+    #define __CL_INT4__     1
+    #define __CL_UINT4__    1
+    #define __CL_ULONG2__   1
+    #define __CL_LONG2__    1
+    #define __CL_DOUBLE2__  1
+#if defined( __MMX__ )  /* MMX: 8-byte vectors */
+    #include <mmintrin.h>
+    #if defined( __GNUC__ ) && !defined( __ICC )
+        typedef cl_uchar    __cl_uchar8     __attribute__((vector_size(8)));  /* GCC vector extension: element type is preserved */
+        typedef cl_char     __cl_char8      __attribute__((vector_size(8)));
+        typedef cl_ushort   __cl_ushort4    __attribute__((vector_size(8)));
+        typedef cl_short    __cl_short4     __attribute__((vector_size(8)));
+        typedef cl_uint     __cl_uint2      __attribute__((vector_size(8)));
+        typedef cl_int      __cl_int2       __attribute__((vector_size(8)));
+        typedef cl_ulong    __cl_ulong1     __attribute__((vector_size(8)));
+        typedef cl_long     __cl_long1      __attribute__((vector_size(8)));
+        typedef cl_float    __cl_float2     __attribute__((vector_size(8)));
+    #else
+        typedef __m64       __cl_uchar8;  /* intrinsic type: every 8-byte vector, including the float pair, shares __m64 */
+        typedef __m64       __cl_char8;
+        typedef __m64       __cl_ushort4;
+        typedef __m64       __cl_short4;
+        typedef __m64       __cl_uint2;
+        typedef __m64       __cl_int2;
+        typedef __m64       __cl_ulong1;
+        typedef __m64       __cl_long1;
+        typedef __m64       __cl_float2;
+    #endif
+    #define __CL_UCHAR8__   1
+    #define __CL_CHAR8__    1
+    #define __CL_USHORT4__  1
+    #define __CL_SHORT4__   1
+    #define __CL_INT2__     1
+    #define __CL_UINT2__    1
+    #define __CL_ULONG1__   1
+    #define __CL_LONG1__    1
+    #define __CL_FLOAT2__   1
+#if defined( __AVX__ )  /* AVX: 32-byte float and double vectors */
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <immintrin.h> 
+    #endif
+    #if defined( __GNUC__ ) && !defined( __ICC )
+        typedef cl_float    __cl_float8     __attribute__((vector_size(32)));  /* GCC vector extension type */
+        typedef cl_double   __cl_double4    __attribute__((vector_size(32)));
+    #else
+        typedef __m256      __cl_float8;  /* intrinsic types */
+        typedef __m256d     __cl_double4;
+    #endif
+    #define __CL_FLOAT8__   1
+    #define __CL_DOUBLE4__  1
+/* Define alignment keys: CL_ALIGNED(n) is attached to the s[] array member of every cl_<type>n union */
+#if defined( __GNUC__ )
+    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))  /* GCC-style aligned attribute */
+#elif defined( _WIN32) && (_MSC_VER)
+    /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */
+    /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */
+    /* #include <crtdefs.h>                                                                                             */
+    /* #define CL_ALIGNED(_x)          _CRT_ALIGN(_x)                                                                   */
+    #define CL_ALIGNED(_x)  /* expands to nothing: no alignment is requested on MSVC */
+   #warning  Need to implement some method to align data here
+   #define  CL_ALIGNED(_x)  /* expands to nothing: alignment is left entirely to the user */
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
+#if (defined( __GNUC__) && ! defined( __STRICT_ANSI__ )) || (defined( _MSC_VER ) && ! defined( __STDC__ ))  /* i.e. GCC or MSVC with language extensions enabled */
+    /* .xyzw and .s0123...{f|F} are supported */
+    /* .hi and .lo are supported */
+#if defined( CL_NAMED_STRUCT_SUPPORTED) && defined( _MSC_VER )
+#define __extension__ __pragma(warning(suppress:4201))  /* MSVC: make the __extension__ prefix on the anonymous structs below suppress warning C4201 instead */
+/* Define cl_vector types */
+/* ---- cl_charn: cl_char vectors of 2, 4 (cl_char3 aliases cl_char4), 8 and 16 elements; s[] is the array view, the anonymous structs are named views of the same storage ---- */
+typedef union
+    cl_char  CL_ALIGNED(2) s[2];
+   __extension__ struct{ cl_char  x, y; };
+   __extension__ struct{ cl_char  s0, s1; };
+   __extension__ struct{ cl_char  lo, hi; };
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2;
+typedef union
+    cl_char  CL_ALIGNED(4) s[4];
+   __extension__ struct{ cl_char  x, y, z, w; };
+   __extension__ struct{ cl_char  s0, s1, s2, s3; };
+   __extension__ struct{ cl_char2 lo, hi; };
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[2];
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4;
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
+typedef  cl_char4  cl_char3;
+typedef union
+    cl_char   CL_ALIGNED(8) s[8];
+   __extension__ struct{ cl_char  x, y, z, w; };
+   __extension__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_char4 lo, hi; };
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[4];
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4[2];
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8;
+typedef union
+    cl_char  CL_ALIGNED(16) s[16];
+   __extension__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };  /* spacers occupy elements 4..9 so that sa..sf land on elements 10..15, like sA..sF */
+   __extension__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_char8 lo, hi; };
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[8];
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4[4];
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8[2];
+#if defined( __CL_CHAR16__ )
+    __cl_char16    v16;
+/* ---- cl_ucharn: cl_uchar vectors of 2, 4 (cl_uchar3 aliases cl_uchar4), 8 and 16 elements; s[] is the array view, the anonymous structs are named views of the same storage ---- */
+typedef union
+    cl_uchar  CL_ALIGNED(2) s[2];
+   __extension__ struct{ cl_uchar  x, y; };
+   __extension__ struct{ cl_uchar  s0, s1; };
+   __extension__ struct{ cl_uchar  lo, hi; };
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2;
+typedef union
+    cl_uchar  CL_ALIGNED(4) s[4];
+   __extension__ struct{ cl_uchar  x, y, z, w; };
+   __extension__ struct{ cl_uchar  s0, s1, s2, s3; };
+   __extension__ struct{ cl_uchar2 lo, hi; };
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[2];
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4;
+/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
+typedef  cl_uchar4  cl_uchar3;
+typedef union
+    cl_uchar   CL_ALIGNED(8) s[8];
+   __extension__ struct{ cl_uchar  x, y, z, w; };
+   __extension__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_uchar4 lo, hi; };
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[4];
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4[2];
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8;
+typedef union
+    cl_uchar  CL_ALIGNED(16) s[16];
+   __extension__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };  /* spacers occupy elements 4..9 so that sa..sf land on elements 10..15, like sA..sF */
+   __extension__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_uchar8 lo, hi; };
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[8];
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4[4];
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8[2];
+#if defined( __CL_UCHAR16__ )
+    __cl_uchar16    v16;
+/* ---- cl_shortn: cl_short vectors of 2, 4 (cl_short3 aliases cl_short4), 8 and 16 elements; s[] is the array view, the anonymous structs are named views of the same storage ---- */
+typedef union
+    cl_short  CL_ALIGNED(4) s[2];
+   __extension__ struct{ cl_short  x, y; };
+   __extension__ struct{ cl_short  s0, s1; };
+   __extension__ struct{ cl_short  lo, hi; };
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2;
+typedef union
+    cl_short  CL_ALIGNED(8) s[4];
+   __extension__ struct{ cl_short  x, y, z, w; };
+   __extension__ struct{ cl_short  s0, s1, s2, s3; };
+   __extension__ struct{ cl_short2 lo, hi; };
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[2];
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4;
+/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
+typedef  cl_short4  cl_short3;
+typedef union
+    cl_short   CL_ALIGNED(16) s[8];
+   __extension__ struct{ cl_short  x, y, z, w; };
+   __extension__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_short4 lo, hi; };
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[4];
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4[2];
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8;
+typedef union
+    cl_short  CL_ALIGNED(32) s[16];
+   __extension__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };  /* spacers occupy elements 4..9 so that sa..sf land on elements 10..15, like sA..sF */
+   __extension__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_short8 lo, hi; };
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[8];
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4[4];
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8[2];
+#if defined( __CL_SHORT16__ )
+    __cl_short16    v16;
+/* ---- cl_ushortn: cl_ushort vectors of 2, 4 (cl_ushort3 aliases cl_ushort4), 8 and 16 elements; s[] is the array view, the anonymous structs are named views of the same storage ---- */
+typedef union
+    cl_ushort  CL_ALIGNED(4) s[2];
+   __extension__ struct{ cl_ushort  x, y; };
+   __extension__ struct{ cl_ushort  s0, s1; };
+   __extension__ struct{ cl_ushort  lo, hi; };
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2;
+typedef union
+    cl_ushort  CL_ALIGNED(8) s[4];
+   __extension__ struct{ cl_ushort  x, y, z, w; };
+   __extension__ struct{ cl_ushort  s0, s1, s2, s3; };
+   __extension__ struct{ cl_ushort2 lo, hi; };
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[2];
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4;
+/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
+typedef  cl_ushort4  cl_ushort3;
+typedef union
+    cl_ushort   CL_ALIGNED(16) s[8];
+   __extension__ struct{ cl_ushort  x, y, z, w; };
+   __extension__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_ushort4 lo, hi; };
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[4];
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4[2];
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8;
+typedef union
+    cl_ushort  CL_ALIGNED(32) s[16];
+   __extension__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };  /* spacers occupy elements 4..9 so that sa..sf land on elements 10..15, like sA..sF */
+   __extension__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_ushort8 lo, hi; };
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[8];
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4[4];
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8[2];
+#if defined( __CL_USHORT16__ )
+    __cl_ushort16    v16;
+/* ---- cl_intn: cl_int vectors of 2, 4 (cl_int3 aliases cl_int4), 8 and 16 elements; s[] is the array view, the anonymous structs are named views of the same storage ---- */
+typedef union
+    cl_int  CL_ALIGNED(8) s[2];
+   __extension__ struct{ cl_int  x, y; };
+   __extension__ struct{ cl_int  s0, s1; };
+   __extension__ struct{ cl_int  lo, hi; };
+#if defined( __CL_INT2__) 
+    __cl_int2     v2;
+typedef union
+    cl_int  CL_ALIGNED(16) s[4];
+   __extension__ struct{ cl_int  x, y, z, w; };
+   __extension__ struct{ cl_int  s0, s1, s2, s3; };
+   __extension__ struct{ cl_int2 lo, hi; };
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[2];
+#if defined( __CL_INT4__) 
+    __cl_int4     v4;
+/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
+typedef  cl_int4  cl_int3;
+typedef union
+    cl_int   CL_ALIGNED(32) s[8];
+   __extension__ struct{ cl_int  x, y, z, w; };
+   __extension__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_int4 lo, hi; };
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[4];
+#if defined( __CL_INT4__) 
+    __cl_int4     v4[2];
+#if defined( __CL_INT8__ )
+    __cl_int8     v8;
+typedef union
+    cl_int  CL_ALIGNED(64) s[16];
+   __extension__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };  /* spacers occupy elements 4..9 so that sa..sf land on elements 10..15, like sA..sF */
+   __extension__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_int8 lo, hi; };
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[8];
+#if defined( __CL_INT4__) 
+    __cl_int4     v4[4];
+#if defined( __CL_INT8__ )
+    __cl_int8     v8[2];
+#if defined( __CL_INT16__ )
+    __cl_int16    v16;
+/* ---- cl_uintn: cl_uint vectors of 2, 4 (cl_uint3 aliases cl_uint4), 8 and 16 elements; s[] is the array view, the anonymous structs are named views of the same storage ---- */
+typedef union
+    cl_uint  CL_ALIGNED(8) s[2];
+   __extension__ struct{ cl_uint  x, y; };
+   __extension__ struct{ cl_uint  s0, s1; };
+   __extension__ struct{ cl_uint  lo, hi; };
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2;
+typedef union
+    cl_uint  CL_ALIGNED(16) s[4];
+   __extension__ struct{ cl_uint  x, y, z, w; };
+   __extension__ struct{ cl_uint  s0, s1, s2, s3; };
+   __extension__ struct{ cl_uint2 lo, hi; };
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[2];
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4;
+/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
+typedef  cl_uint4  cl_uint3;
+typedef union
+    cl_uint   CL_ALIGNED(32) s[8];
+   __extension__ struct{ cl_uint  x, y, z, w; };
+   __extension__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_uint4 lo, hi; };
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[4];
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4[2];
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8;
+typedef union
+    cl_uint  CL_ALIGNED(64) s[16];
+   __extension__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };  /* spacers occupy elements 4..9 so that sa..sf land on elements 10..15, like sA..sF */
+   __extension__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_uint8 lo, hi; };
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[8];
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4[4];
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8[2];
+#if defined( __CL_UINT16__ )
+    __cl_uint16    v16;
+/* ---- cl_longn: cl_long vectors of 2, 4 (cl_long3 aliases cl_long4), 8 and 16 elements; s[] is the array view, the anonymous structs are named views of the same storage ---- */
+typedef union
+    cl_long  CL_ALIGNED(16) s[2];
+   __extension__ struct{ cl_long  x, y; };
+   __extension__ struct{ cl_long  s0, s1; };
+   __extension__ struct{ cl_long  lo, hi; };
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2;
+typedef union
+    cl_long  CL_ALIGNED(32) s[4];
+   __extension__ struct{ cl_long  x, y, z, w; };
+   __extension__ struct{ cl_long  s0, s1, s2, s3; };
+   __extension__ struct{ cl_long2 lo, hi; };
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[2];
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4;
+/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
+typedef  cl_long4  cl_long3;
+typedef union
+    cl_long   CL_ALIGNED(64) s[8];
+   __extension__ struct{ cl_long  x, y, z, w; };
+   __extension__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_long4 lo, hi; };
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[4];
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4[2];
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8;
+typedef union
+    cl_long  CL_ALIGNED(128) s[16];
+   __extension__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };  /* spacers occupy elements 4..9 so that sa..sf land on elements 10..15, like sA..sF */
+   __extension__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_long8 lo, hi; };
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[8];
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4[4];
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8[2];
+#if defined( __CL_LONG16__ )
+    __cl_long16    v16;
+/* ---- cl_ulongn: cl_ulong vectors of 2, 4 (cl_ulong3 aliases cl_ulong4), 8 and 16 elements; s[] is the array view, the anonymous structs are named views of the same storage ---- */
+typedef union
+    cl_ulong  CL_ALIGNED(16) s[2];
+   __extension__ struct{ cl_ulong  x, y; };
+   __extension__ struct{ cl_ulong  s0, s1; };
+   __extension__ struct{ cl_ulong  lo, hi; };
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2;
+typedef union
+    cl_ulong  CL_ALIGNED(32) s[4];
+   __extension__ struct{ cl_ulong  x, y, z, w; };
+   __extension__ struct{ cl_ulong  s0, s1, s2, s3; };
+   __extension__ struct{ cl_ulong2 lo, hi; };
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[2];
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4;
+/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
+typedef  cl_ulong4  cl_ulong3;
+typedef union
+    cl_ulong   CL_ALIGNED(64) s[8];
+   __extension__ struct{ cl_ulong  x, y, z, w; };
+   __extension__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_ulong4 lo, hi; };
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[4];
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4[2];
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8;
+typedef union
+    cl_ulong  CL_ALIGNED(128) s[16];
+   __extension__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };  /* spacers occupy elements 4..9 so that sa..sf land on elements 10..15, like sA..sF */
+   __extension__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_ulong8 lo, hi; };
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[8];
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4[4];
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8[2];
+#if defined( __CL_ULONG16__ )
+    __cl_ulong16    v16;
+/* ---- cl_floatn: cl_float vectors of 2, 4 (cl_float3 aliases cl_float4), 8 and 16 elements; s[] is the array view, the anonymous structs are named views of the same storage ---- */
+typedef union
+    cl_float  CL_ALIGNED(8) s[2];
+   __extension__ struct{ cl_float  x, y; };
+   __extension__ struct{ cl_float  s0, s1; };
+   __extension__ struct{ cl_float  lo, hi; };
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2;
+typedef union
+    cl_float  CL_ALIGNED(16) s[4];
+   __extension__ struct{ cl_float   x, y, z, w; };
+   __extension__ struct{ cl_float   s0, s1, s2, s3; };
+   __extension__ struct{ cl_float2  lo, hi; };
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[2];
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4;
+/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
+typedef  cl_float4  cl_float3;
+typedef union
+    cl_float   CL_ALIGNED(32) s[8];
+   __extension__ struct{ cl_float   x, y, z, w; };
+   __extension__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_float4  lo, hi; };
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[4];
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4[2];
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8;
+typedef union
+    cl_float  CL_ALIGNED(64) s[16];
+   __extension__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };  /* spacers occupy elements 4..9 so that sa..sf land on elements 10..15, like sA..sF */
+   __extension__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_float8 lo, hi; };
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[8];
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4[4];
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8[2];
+#if defined( __CL_FLOAT16__ )
+    __cl_float16    v16;
+/* ---- cl_doublen: cl_double vectors of 2, 4 (cl_double3 aliases cl_double4), 8 and 16 elements; s[] is the array view, the anonymous structs are named views of the same storage ---- */
+typedef union
+    cl_double  CL_ALIGNED(16) s[2];
+   __extension__ struct{ cl_double  x, y; };
+   __extension__ struct{ cl_double s0, s1; };
+   __extension__ struct{ cl_double lo, hi; };
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2;
+typedef union
+    cl_double  CL_ALIGNED(32) s[4];
+   __extension__ struct{ cl_double  x, y, z, w; };
+   __extension__ struct{ cl_double  s0, s1, s2, s3; };
+   __extension__ struct{ cl_double2 lo, hi; };
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[2];
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4;
+/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
+typedef  cl_double4  cl_double3;
+typedef union
+    cl_double   CL_ALIGNED(64) s[8];
+   __extension__ struct{ cl_double  x, y, z, w; };
+   __extension__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_double4 lo, hi; };
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[4];
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4[2];
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8;
+typedef union
+    cl_double  CL_ALIGNED(128) s[16];
+   __extension__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };  /* spacers occupy elements 4..9 so that sa..sf land on elements 10..15, like sA..sF */
+   __extension__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_double8 lo, hi; };
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[8];
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4[4];
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8[2];
+#if defined( __CL_DOUBLE16__ )
+    __cl_double16    v16;
+/* Macro to facilitate debugging 
+ * Usage:
+ *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
+ *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO \"
+ *   Each line thereafter of OpenCL C source must end with: \n\
+ *   The last line ends in ";
+ *
+ *   Example:
+ *
+ *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ *   kernel void foo( int a, float * b )             \n\
+ *   {                                               \n\
+ *      // my comment                                \n\
+ *      *b[ get_global_id(0)] = a;                   \n\
+ *   }                                               \n\
+ *   ";
+ *
+ * This should correctly set up the line, (column) and file information for your source 
+ * string so you can do source level debugging.
+ */
+#define  __CL_STRINGIFY( _x )               # _x  /* stringizes the argument exactly as written */
+#define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x )  /* extra level so a macro argument such as __LINE__ is expanded before it is stringized */
+#define  CL_PROGRAM_STRING_DEBUG_INFO       "#line "  _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" /* string literal holding a #line directive built from the host file's current line and file name */
+//  CL.h contents
+typedef struct _cl_platform_id *    cl_platform_id;  /* handle types: each is a pointer to a struct that is only named, not defined, in this section */
+typedef struct _cl_device_id *      cl_device_id;
+typedef struct _cl_context *        cl_context;
+typedef struct _cl_command_queue *  cl_command_queue;
+typedef struct _cl_mem *            cl_mem;
+typedef struct _cl_program *        cl_program;
+typedef struct _cl_kernel *         cl_kernel;
+typedef struct _cl_event *          cl_event;
+typedef struct _cl_sampler *        cl_sampler;
+typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ 
+typedef cl_ulong            cl_bitfield;  /* base type of every flag-set typedef below (those whose value lists are labelled "bitfield") */
+typedef cl_bitfield         cl_device_type;
+typedef cl_uint             cl_platform_info;
+typedef cl_uint             cl_device_info;
+typedef cl_bitfield         cl_device_fp_config;
+typedef cl_uint             cl_device_mem_cache_type;
+typedef cl_uint             cl_device_local_mem_type;
+typedef cl_bitfield         cl_device_exec_capabilities;
+typedef cl_bitfield         cl_command_queue_properties;
+typedef intptr_t			cl_context_properties;  /* pointer-sized signed integer, unlike the cl_uint selectors around it */
+typedef cl_uint             cl_context_info;
+typedef cl_uint             cl_command_queue_info;
+typedef cl_uint             cl_channel_order;
+typedef cl_uint             cl_channel_type;
+typedef cl_bitfield         cl_mem_flags;
+typedef cl_uint             cl_mem_object_type;
+typedef cl_uint             cl_mem_info;
+typedef cl_uint             cl_image_info;
+typedef cl_uint             cl_buffer_create_type;
+typedef cl_uint             cl_addressing_mode;
+typedef cl_uint             cl_filter_mode;
+typedef cl_uint             cl_sampler_info;
+typedef cl_bitfield         cl_map_flags;
+typedef cl_uint             cl_program_info;
+typedef cl_uint             cl_program_build_info;
+typedef cl_int              cl_build_status;  /* the only signed selector type in this list */
+typedef cl_uint             cl_kernel_info;
+typedef cl_uint             cl_kernel_work_group_info;
+typedef cl_uint             cl_event_info;
+typedef cl_uint             cl_command_type;
+typedef cl_uint             cl_profiling_info;
+typedef struct _cl_image_format {  /* pairs a channel order with a channel data type to describe an image format */
+    cl_channel_order        image_channel_order;      /* a cl_channel_order value */
+    cl_channel_type         image_channel_data_type;  /* a cl_channel_type value */
+} cl_image_format;
+typedef struct _cl_buffer_region {  /* describes a region as an (origin, size) pair */
+    size_t                  origin;  /* NOTE(review): presumably a byte offset into a parent buffer - confirm against the OpenCL specification */
+    size_t                  size;    /* NOTE(review): presumably a length in bytes - confirm */
+} cl_buffer_region;
+/* Error Codes: CL_SUCCESS is 0 and every error listed here is a negative value */
+#define CL_SUCCESS                                  0
+#define CL_DEVICE_NOT_FOUND                         -1
+#define CL_DEVICE_NOT_AVAILABLE                     -2
+#define CL_COMPILER_NOT_AVAILABLE                   -3
+#define CL_OUT_OF_RESOURCES                         -5
+#define CL_OUT_OF_HOST_MEMORY                       -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE             -7
+#define CL_MEM_COPY_OVERLAP                         -8
+#define CL_IMAGE_FORMAT_MISMATCH                    -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
+#define CL_BUILD_PROGRAM_FAILURE                    -11
+#define CL_MAP_FAILURE                              -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
+#define CL_INVALID_VALUE                            -30  /* the CL_INVALID_* codes start at -30 */
+#define CL_INVALID_DEVICE_TYPE                      -31
+#define CL_INVALID_PLATFORM                         -32
+#define CL_INVALID_DEVICE                           -33
+#define CL_INVALID_CONTEXT                          -34
+#define CL_INVALID_QUEUE_PROPERTIES                 -35
+#define CL_INVALID_COMMAND_QUEUE                    -36
+#define CL_INVALID_HOST_PTR                         -37
+#define CL_INVALID_MEM_OBJECT                       -38
+#define CL_INVALID_IMAGE_SIZE                       -40
+#define CL_INVALID_SAMPLER                          -41
+#define CL_INVALID_BINARY                           -42
+#define CL_INVALID_BUILD_OPTIONS                    -43
+#define CL_INVALID_PROGRAM                          -44
+#define CL_INVALID_PROGRAM_EXECUTABLE               -45
+#define CL_INVALID_KERNEL_NAME                      -46
+#define CL_INVALID_KERNEL_DEFINITION                -47
+#define CL_INVALID_KERNEL                           -48
+#define CL_INVALID_ARG_INDEX                        -49
+#define CL_INVALID_ARG_VALUE                        -50
+#define CL_INVALID_ARG_SIZE                         -51
+#define CL_INVALID_KERNEL_ARGS                      -52
+#define CL_INVALID_WORK_DIMENSION                   -53
+#define CL_INVALID_WORK_GROUP_SIZE                  -54
+#define CL_INVALID_WORK_ITEM_SIZE                   -55
+#define CL_INVALID_GLOBAL_OFFSET                    -56
+#define CL_INVALID_EVENT_WAIT_LIST                  -57
+#define CL_INVALID_EVENT                            -58
+#define CL_INVALID_OPERATION                        -59
+#define CL_INVALID_GL_OBJECT                        -60
+#define CL_INVALID_BUFFER_SIZE                      -61
+#define CL_INVALID_MIP_LEVEL                        -62
+#define CL_INVALID_GLOBAL_WORK_SIZE                 -63
+#define CL_INVALID_PROPERTY                         -64
+/* OpenCL Version: both macros are defined (to 1), so they work as presence flags rather than as version numbers */
+#define CL_VERSION_1_0                              1
+#define CL_VERSION_1_1                              1
+/* cl_bool values */
+#define CL_FALSE                                    0
+#define CL_TRUE                                     1
+/* cl_platform_info values (consecutive from 0x0900) */
+#define CL_PLATFORM_PROFILE                         0x0900
+#define CL_PLATFORM_VERSION                         0x0901
+#define CL_PLATFORM_NAME                            0x0902
+#define CL_PLATFORM_VENDOR                          0x0903
+#define CL_PLATFORM_EXTENSIONS                      0x0904
+/* cl_device_type - bitfield (one bit per device class; values may be OR-ed together) */
+#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
+#define CL_DEVICE_TYPE_CPU                          (1 << 1)
+#define CL_DEVICE_TYPE_GPU                          (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
+#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF  /* low 32 bits all set, so it includes every type bit above */
+/* cl_device_info values (0x1000 range) */
+#define CL_DEVICE_TYPE                              0x1000
+#define CL_DEVICE_VENDOR_ID                         0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
+#define CL_DEVICE_ADDRESS_BITS                      0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
+#define CL_DEVICE_IMAGE_SUPPORT                     0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
+#define CL_DEVICE_MAX_SAMPLERS                      0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
+#define CL_DEVICE_ENDIAN_LITTLE                     0x1026
+#define CL_DEVICE_AVAILABLE                         0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE                0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
+#define CL_DEVICE_NAME                              0x102B
+#define CL_DEVICE_VENDOR                            0x102C
+#define CL_DRIVER_VERSION                           0x102D  /* note: named CL_DRIVER_*, not CL_DEVICE_*, although it sits in the same value range */
+#define CL_DEVICE_PROFILE                           0x102E
+#define CL_DEVICE_VERSION                           0x102F
+#define CL_DEVICE_EXTENSIONS                        0x1030
+#define CL_DEVICE_PLATFORM                          0x1031
+/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
+#define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
+#define CL_DEVICE_OPENCL_C_VERSION                  0x103D
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM                                (1 << 0)
+#define CL_FP_INF_NAN                               (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
+#define CL_FP_ROUND_TO_ZERO                         (1 << 3)
+#define CL_FP_ROUND_TO_INF                          (1 << 4)
+#define CL_FP_FMA                                   (1 << 5)
+#define CL_FP_SOFT_FLOAT                            (1 << 6)
+/* cl_device_mem_cache_type */
+#define CL_NONE                                     0x0
+#define CL_READ_ONLY_CACHE                          0x1
+#define CL_READ_WRITE_CACHE                         0x2
+/* cl_device_local_mem_type */
+#define CL_LOCAL                                    0x1
+#define CL_GLOBAL                                   0x2
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL                              (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
+/* cl_command_queue_properties - bitfield (bit 0: out-of-order execution, bit 1: profiling) */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
+/* cl_context_info  */
+#define CL_CONTEXT_REFERENCE_COUNT                  0x1080
+#define CL_CONTEXT_DEVICES                          0x1081
+#define CL_CONTEXT_PROPERTIES                       0x1082
+#define CL_CONTEXT_NUM_DEVICES                      0x1083
+/* cl_context_info + cl_context_properties */
+#define CL_CONTEXT_PLATFORM                         0x1084
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT                            0x1090
+#define CL_QUEUE_DEVICE                             0x1091
+#define CL_QUEUE_REFERENCE_COUNT                    0x1092
+#define CL_QUEUE_PROPERTIES                         0x1093
+/* cl_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE                           (1 << 0)
+#define CL_MEM_WRITE_ONLY                           (1 << 1)
+#define CL_MEM_READ_ONLY                            (1 << 2)
+#define CL_MEM_USE_HOST_PTR                         (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
+#define CL_MEM_COPY_HOST_PTR                        (1 << 5)
+/* cl_channel_order */
+#define CL_R                                        0x10B0
+#define CL_A                                        0x10B1
+#define CL_RG                                       0x10B2
+#define CL_RA                                       0x10B3
+#define CL_RGB                                      0x10B4
+#define CL_RGBA                                     0x10B5
+#define CL_BGRA                                     0x10B6
+#define CL_ARGB                                     0x10B7
+#define CL_INTENSITY                                0x10B8
+#define CL_LUMINANCE                                0x10B9
+#define CL_Rx                                       0x10BA
+#define CL_RGx                                      0x10BB
+#define CL_RGBx                                     0x10BC
+/* cl_channel_type */
+#define CL_SNORM_INT8                               0x10D0
+#define CL_SNORM_INT16                              0x10D1
+#define CL_UNORM_INT8                               0x10D2
+#define CL_UNORM_INT16                              0x10D3
+#define CL_UNORM_SHORT_565                          0x10D4
+#define CL_UNORM_SHORT_555                          0x10D5
+#define CL_UNORM_INT_101010                         0x10D6
+#define CL_SIGNED_INT8                              0x10D7
+#define CL_SIGNED_INT16                             0x10D8
+#define CL_SIGNED_INT32                             0x10D9
+#define CL_UNSIGNED_INT8                            0x10DA
+#define CL_UNSIGNED_INT16                           0x10DB
+#define CL_UNSIGNED_INT32                           0x10DC
+#define CL_HALF_FLOAT                               0x10DD
+#define CL_FLOAT                                    0x10DE
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER                        0x10F0
+#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
+#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
+/* cl_mem_info */
+#define CL_MEM_TYPE                                 0x1100
+#define CL_MEM_FLAGS                                0x1101
+#define CL_MEM_SIZE                                 0x1102
+#define CL_MEM_HOST_PTR                             0x1103
+#define CL_MEM_MAP_COUNT                            0x1104
+#define CL_MEM_REFERENCE_COUNT                      0x1105
+#define CL_MEM_CONTEXT                              0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
+#define CL_MEM_OFFSET                               0x1108
+/* cl_image_info */
+#define CL_IMAGE_FORMAT                             0x1110
+#define CL_IMAGE_ELEMENT_SIZE                       0x1111
+#define CL_IMAGE_ROW_PITCH                          0x1112
+#define CL_IMAGE_SLICE_PITCH                        0x1113
+#define CL_IMAGE_WIDTH                              0x1114
+#define CL_IMAGE_HEIGHT                             0x1115
+#define CL_IMAGE_DEPTH                              0x1116
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE                             0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
+#define CL_ADDRESS_CLAMP                            0x1132
+#define CL_ADDRESS_REPEAT                           0x1133
+#define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST                           0x1140
+#define CL_FILTER_LINEAR                            0x1141
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT                  0x1150
+#define CL_SAMPLER_CONTEXT                          0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS                0x1152
+#define CL_SAMPLER_ADDRESSING_MODE                  0x1153
+#define CL_SAMPLER_FILTER_MODE                      0x1154
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ                                 (1 << 0)
+#define CL_MAP_WRITE                                (1 << 1)
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT                  0x1160
+#define CL_PROGRAM_CONTEXT                          0x1161
+#define CL_PROGRAM_NUM_DEVICES                      0x1162
+#define CL_PROGRAM_DEVICES                          0x1163
+#define CL_PROGRAM_SOURCE                           0x1164
+#define CL_PROGRAM_BINARY_SIZES                     0x1165
+#define CL_PROGRAM_BINARIES                         0x1166
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS                     0x1181
+#define CL_PROGRAM_BUILD_OPTIONS                    0x1182
+#define CL_PROGRAM_BUILD_LOG                        0x1183
+/* cl_build_status */
+#define CL_BUILD_SUCCESS                            0
+#define CL_BUILD_NONE                               -1
+#define CL_BUILD_ERROR                              -2
+#define CL_BUILD_IN_PROGRESS                        -3
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME                     0x1190
+#define CL_KERNEL_NUM_ARGS                          0x1191
+#define CL_KERNEL_REFERENCE_COUNT                   0x1192
+#define CL_KERNEL_CONTEXT                           0x1193
+#define CL_KERNEL_PROGRAM                           0x1194
+/* cl_kernel_work_group_info (contiguous range 0x11B0..0x11B4) */
+#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
+/* cl_event_info (contiguous range 0x11D0..0x11D4) */
+#define CL_EVENT_COMMAND_QUEUE                      0x11D0
+#define CL_EVENT_COMMAND_TYPE                       0x11D1
+#define CL_EVENT_REFERENCE_COUNT                    0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
+#define CL_EVENT_CONTEXT                            0x11D4
+/* cl_command_type */
+#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
+#define CL_COMMAND_TASK                             0x11F1
+#define CL_COMMAND_NATIVE_KERNEL                    0x11F2
+#define CL_COMMAND_READ_BUFFER                      0x11F3
+#define CL_COMMAND_WRITE_BUFFER                     0x11F4
+#define CL_COMMAND_COPY_BUFFER                      0x11F5
+#define CL_COMMAND_READ_IMAGE                       0x11F6
+#define CL_COMMAND_WRITE_IMAGE                      0x11F7
+#define CL_COMMAND_COPY_IMAGE                       0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
+#define CL_COMMAND_MAP_BUFFER                       0x11FB
+#define CL_COMMAND_MAP_IMAGE                        0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
+#define CL_COMMAND_MARKER                           0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
+#define CL_COMMAND_READ_BUFFER_RECT                 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
+#define CL_COMMAND_USER                             0x1204
+/* command execution status */
+#define CL_COMPLETE                                 0x0
+#define CL_RUNNING                                  0x1
+#define CL_SUBMITTED                                0x2
+#define CL_QUEUED                                   0x3
+/* cl_buffer_create_type  */
+#define CL_BUFFER_CREATE_TYPE_REGION                0x1220
+/* cl_profiling_info  */
+#define CL_PROFILING_COMMAND_QUEUED                 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT                 0x1281
+#define CL_PROFILING_COMMAND_START                  0x1282
+#define CL_PROFILING_COMMAND_END                    0x1283
+/* Function signature typedefs */
+/* Platform API */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETPLATFORMIDS)(cl_uint          /* num_entries */,
+                 cl_platform_id * /* platforms */,
+                 cl_uint *        /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL * 
+PFNCLGETPLATFORMINFO)(cl_platform_id   /* platform */, 
+                  cl_platform_info /* param_name */,
+                  size_t           /* param_value_size */, 
+                  void *           /* param_value */,
+                  size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Device APIs */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETDEVICEIDS)(cl_platform_id   /* platform */,
+               cl_device_type   /* device_type */, 
+               cl_uint          /* num_entries */, 
+               cl_device_id *   /* devices */, 
+               cl_uint *        /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETDEVICEINFO)(cl_device_id    /* device */,
+                cl_device_info  /* param_name */, 
+                size_t          /* param_value_size */, 
+                void *          /* param_value */,
+                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Context APIs */
+typedef CL_API_ENTRY cl_context (CL_API_CALL *
+PFNCLCREATECONTEXT)(const cl_context_properties * /* properties */,
+                cl_uint                       /* num_devices */,
+                const cl_device_id *          /* devices */,
+                void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+                void *                        /* user_data */,
+                cl_int *                      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_context (CL_API_CALL *
+PFNCLCREATECONTEXTFROMTYPE)(const cl_context_properties * /* properties */,
+                        cl_device_type                /* device_type */,
+                        void (CL_CALLBACK *     /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+                        void *                        /* user_data */,
+                        cl_int *                      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRETAINCONTEXT)(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRELEASECONTEXT)(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETCONTEXTINFO)(cl_context         /* context */, 
+                 cl_context_info    /* param_name */, 
+                 size_t             /* param_value_size */, 
+                 void *             /* param_value */, 
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Command Queue APIs */
+typedef CL_API_ENTRY cl_command_queue (CL_API_CALL *
+PFNCLCREATECOMMANDQUEUE)(cl_context                     /* context */, 
+                     cl_device_id                   /* device */, 
+                     cl_command_queue_properties    /* properties */,
+                     cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRETAINCOMMANDQUEUE)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRELEASECOMMANDQUEUE)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETCOMMANDQUEUEINFO)(cl_command_queue      /* command_queue */,
+                      cl_command_queue_info /* param_name */,
+                      size_t                /* param_value_size */,
+                      void *                /* param_value */,
+                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLSETCOMMANDQUEUEPROPERTY)(cl_command_queue              /* command_queue */,
+                          cl_command_queue_properties   /* properties */, 
+                          cl_bool                        /* enable */,
+                          cl_command_queue_properties * /* old_properties */) CL_API_SUFFIX__VERSION_1_0;
+/* Memory Object APIs */
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *
+PFNCLCREATEBUFFER)(cl_context   /* context */,
+               cl_mem_flags /* flags */,
+               size_t       /* size */,
+               void *       /* host_ptr */,
+               cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *
+PFNCLCREATESUBBUFFER)(cl_mem   			/* buffer */,
+               cl_mem_flags 			/* flags */,
+               cl_buffer_create_type    /* buffer_create_type */,
+               const void *       		/* buffer_create_info */,
+               cl_int *     			/* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *
+PFNCLCREATEIMAGE2D)(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */,
+                size_t                  /* image_height */,
+                size_t                  /* image_row_pitch */, 
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *
+PFNCLCREATEIMAGE3D)(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */, 
+                size_t                  /* image_height */,
+                size_t                  /* image_depth */, 
+                size_t                  /* image_row_pitch */, 
+                size_t                  /* image_slice_pitch */, 
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Function-pointer type matching clRetainMemObject (OpenCL 1.0). */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRETAINMEMOBJECT)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+/* Function-pointer type matching clReleaseMemObject (OpenCL 1.0). */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRELEASEMEMOBJECT)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETSUPPORTEDIMAGEFORMATS)(cl_context           /* context */,
+                           cl_mem_flags         /* flags */,
+                           cl_mem_object_type   /* image_type */,
+                           cl_uint              /* num_entries */,
+                           cl_image_format *    /* image_formats */,
+                           cl_uint *            /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETMEMOBJECTINFO)(cl_mem           /* memobj */,
+                   cl_mem_info      /* param_name */, 
+                   size_t           /* param_value_size */,
+                   void *           /* param_value */,
+                   size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETIMAGEINFO)(cl_mem           /* image */,
+               cl_image_info    /* param_name */, 
+               size_t           /* param_value_size */,
+               void *           /* param_value */,
+               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Function-pointer type matching clSetMemObjectDestructorCallback (OpenCL 1.1).
+ * pfn_notify receives the memory object and the caller-supplied user_data. */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLSETMEMOBJECTDESTRUCTORCALLBACK)(cl_mem /* memobj */,
+                                    void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+                                    void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;
+/* Sampler APIs  */
+typedef CL_API_ENTRY cl_sampler (CL_API_CALL *
+PFNCLCREATESAMPLER)(cl_context          /* context */,
+                cl_bool             /* normalized_coords */, 
+                cl_addressing_mode  /* addressing_mode */, 
+                cl_filter_mode      /* filter_mode */,
+                cl_int *            /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRETAINSAMPLER)(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRELEASESAMPLER)(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETSAMPLERINFO)(cl_sampler         /* sampler */,
+                 cl_sampler_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Program Object APIs  */
+typedef CL_API_ENTRY cl_program (CL_API_CALL *
+PFNCLCREATEPROGRAMWITHSOURCE)(cl_context        /* context */,
+                          cl_uint           /* count */,
+                          const char **     /* strings */,
+                          const size_t *    /* lengths */,
+                          cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_program (CL_API_CALL *
+PFNCLCREATEPROGRAMWITHBINARY)(cl_context                     /* context */,
+                          cl_uint                        /* num_devices */,
+                          const cl_device_id *           /* device_list */,
+                          const size_t *                 /* lengths */,
+                          const unsigned char **         /* binaries */,
+                          cl_int *                       /* binary_status */,
+                          cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRETAINPROGRAM)(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRELEASEPROGRAM)(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLBUILDPROGRAM)(cl_program           /* program */,
+               cl_uint              /* num_devices */,
+               const cl_device_id * /* device_list */,
+               const char *         /* options */, 
+               void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+               void *               /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+/* Function-pointer type matching clUnloadCompiler (OpenCL 1.0); takes no arguments. */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLUNLOADCOMPILER)(void) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETPROGRAMINFO)(cl_program         /* program */,
+                 cl_program_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETPROGRAMBUILDINFO)(cl_program            /* program */,
+                      cl_device_id          /* device */,
+                      cl_program_build_info /* param_name */,
+                      size_t                /* param_value_size */,
+                      void *                /* param_value */,
+                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Kernel Object APIs */
+typedef CL_API_ENTRY cl_kernel (CL_API_CALL *
+PFNCLCREATEKERNEL)(cl_program      /* program */,
+               const char *    /* kernel_name */,
+               cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLCREATEKERNELSINPROGRAM)(cl_program     /* program */,
+                         cl_uint        /* num_kernels */,
+                         cl_kernel *    /* kernels */,
+                         cl_uint *      /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRETAINKERNEL)(cl_kernel    /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRELEASEKERNEL)(cl_kernel   /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLSETKERNELARG)(cl_kernel    /* kernel */,
+               cl_uint      /* arg_index */,
+               size_t       /* arg_size */,
+               const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETKERNELINFO)(cl_kernel       /* kernel */,
+                cl_kernel_info  /* param_name */,
+                size_t          /* param_value_size */,
+                void *          /* param_value */,
+                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETKERNELWORKGROUPINFO)(cl_kernel                  /* kernel */,
+                         cl_device_id               /* device */,
+                         cl_kernel_work_group_info  /* param_name */,
+                         size_t                     /* param_value_size */,
+                         void *                     /* param_value */,
+                         size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Event Object APIs */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLWAITFOREVENTS)(cl_uint             /* num_events */,
+                const cl_event *    /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETEVENTINFO)(cl_event         /* event */,
+               cl_event_info    /* param_name */,
+               size_t           /* param_value_size */,
+               void *           /* param_value */,
+               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_event (CL_API_CALL *
+PFNCLCREATEUSEREVENT)(cl_context    /* context */,
+                  	  cl_int *      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;               
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRETAINEVENT)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+/* Function-pointer type matching clReleaseEvent (OpenCL 1.0). */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLRELEASEEVENT)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLSETUSEREVENTSTATUS)(cl_event   /* event */,
+                     cl_int     /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLSETEVENTCALLBACK)( cl_event    /* event */,
+                    cl_int      /* command_exec_callback_type */,
+                    void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+                    void *      /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+/* Profiling APIs  */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLGETEVENTPROFILINGINFO)(cl_event            /* event */,
+                        cl_profiling_info   /* param_name */,
+                        size_t              /* param_value_size */,
+                        void *              /* param_value */,
+                        size_t *            /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Flush and Finish APIs */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLFLUSH)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLFINISH)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+/* Enqueued Commands APIs */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUEREADBUFFER)(cl_command_queue    /* command_queue */,
+                    cl_mem              /* buffer */,
+                    cl_bool             /* blocking_read */,
+                    size_t              /* offset */,
+                    size_t              /* cb */, 
+                    void *              /* ptr */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUEREADBUFFERRECT)(cl_command_queue    /* command_queue */,
+                        cl_mem              /* buffer */,
+                        cl_bool             /* blocking_read */,
+                        const size_t *      /* buffer_origin */,
+                        const size_t *      /* host_origin */, 
+                        const size_t *      /* region */,
+                        size_t              /* buffer_row_pitch */,
+                        size_t              /* buffer_slice_pitch */,
+                        size_t              /* host_row_pitch */,
+                        size_t              /* host_slice_pitch */,                        
+                        void *              /* ptr */,
+                        cl_uint             /* num_events_in_wait_list */,
+                        const cl_event *    /* event_wait_list */,
+                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUEWRITEBUFFER)(cl_command_queue   /* command_queue */, 
+                     cl_mem             /* buffer */, 
+                     cl_bool            /* blocking_write */, 
+                     size_t             /* offset */, 
+                     size_t             /* cb */, 
+                     const void *       /* ptr */, 
+                     cl_uint            /* num_events_in_wait_list */, 
+                     const cl_event *   /* event_wait_list */, 
+                     cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUEWRITEBUFFERRECT)(cl_command_queue    /* command_queue */,
+                         cl_mem              /* buffer */,
+                         cl_bool             /* blocking_write */,
+                         const size_t *      /* buffer_origin */,
+                         const size_t *      /* host_origin */, 
+                         const size_t *      /* region */,
+                         size_t              /* buffer_row_pitch */,
+                         size_t              /* buffer_slice_pitch */,
+                         size_t              /* host_row_pitch */,
+                         size_t              /* host_slice_pitch */,                        
+                         const void *        /* ptr */,
+                         cl_uint             /* num_events_in_wait_list */,
+                         const cl_event *    /* event_wait_list */,
+                         cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUECOPYBUFFER)(cl_command_queue    /* command_queue */, 
+                    cl_mem              /* src_buffer */,
+                    cl_mem              /* dst_buffer */, 
+                    size_t              /* src_offset */,
+                    size_t              /* dst_offset */,
+                    size_t              /* cb */, 
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUECOPYBUFFERRECT)(cl_command_queue    /* command_queue */, 
+                        cl_mem              /* src_buffer */,
+                        cl_mem              /* dst_buffer */, 
+                        const size_t *      /* src_origin */,
+                        const size_t *      /* dst_origin */,
+                        const size_t *      /* region */, 
+                        size_t              /* src_row_pitch */,
+                        size_t              /* src_slice_pitch */,
+                        size_t              /* dst_row_pitch */,
+                        size_t              /* dst_slice_pitch */,
+                        cl_uint             /* num_events_in_wait_list */,
+                        const cl_event *    /* event_wait_list */,
+                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUEREADIMAGE)(cl_command_queue     /* command_queue */,
+                   cl_mem               /* image */,
+                   cl_bool              /* blocking_read */, 
+                   const size_t *       /* origin[3] */,
+                   const size_t *       /* region[3] */,
+                   size_t               /* row_pitch */,
+                   size_t               /* slice_pitch */, 
+                   void *               /* ptr */,
+                   cl_uint              /* num_events_in_wait_list */,
+                   const cl_event *     /* event_wait_list */,
+                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUEWRITEIMAGE)(cl_command_queue    /* command_queue */,
+                    cl_mem              /* image */,
+                    cl_bool             /* blocking_write */, 
+                    const size_t *      /* origin[3] */,
+                    const size_t *      /* region[3] */,
+                    size_t              /* input_row_pitch */,
+                    size_t              /* input_slice_pitch */, 
+                    const void *        /* ptr */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUECOPYIMAGE)(cl_command_queue     /* command_queue */,
+                   cl_mem               /* src_image */,
+                   cl_mem               /* dst_image */, 
+                   const size_t *       /* src_origin[3] */,
+                   const size_t *       /* dst_origin[3] */,
+                   const size_t *       /* region[3] */, 
+                   cl_uint              /* num_events_in_wait_list */,
+                   const cl_event *     /* event_wait_list */,
+                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUECOPYIMAGETOBUFFER)(cl_command_queue /* command_queue */,
+                           cl_mem           /* src_image */,
+                           cl_mem           /* dst_buffer */, 
+                           const size_t *   /* src_origin[3] */,
+                           const size_t *   /* region[3] */, 
+                           size_t           /* dst_offset */,
+                           cl_uint          /* num_events_in_wait_list */,
+                           const cl_event * /* event_wait_list */,
+                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUECOPYBUFFERTOIMAGE)(cl_command_queue /* command_queue */,
+                           cl_mem           /* src_buffer */,
+                           cl_mem           /* dst_image */, 
+                           size_t           /* src_offset */,
+                           const size_t *   /* dst_origin[3] */,
+                           const size_t *   /* region[3] */, 
+                           cl_uint          /* num_events_in_wait_list */,
+                           const cl_event * /* event_wait_list */,
+                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY void * (CL_API_CALL *
+PFNCLENQUEUEMAPBUFFER)(cl_command_queue /* command_queue */,
+                   cl_mem           /* buffer */,
+                   cl_bool          /* blocking_map */, 
+                   cl_map_flags     /* map_flags */,
+                   size_t           /* offset */,
+                   size_t           /* cb */,
+                   cl_uint          /* num_events_in_wait_list */,
+                   const cl_event * /* event_wait_list */,
+                   cl_event *       /* event */,
+                   cl_int *         /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY void * (CL_API_CALL *
+PFNCLENQUEUEMAPIMAGE)(cl_command_queue  /* command_queue */,
+                  cl_mem            /* image */, 
+                  cl_bool           /* blocking_map */, 
+                  cl_map_flags      /* map_flags */, 
+                  const size_t *    /* origin[3] */,
+                  const size_t *    /* region[3] */,
+                  size_t *          /* image_row_pitch */,
+                  size_t *          /* image_slice_pitch */,
+                  cl_uint           /* num_events_in_wait_list */,
+                  const cl_event *  /* event_wait_list */,
+                  cl_event *        /* event */,
+                  cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUEUNMAPMEMOBJECT)(cl_command_queue /* command_queue */,
+                        cl_mem           /* memobj */,
+                        void *           /* mapped_ptr */,
+                        cl_uint          /* num_events_in_wait_list */,
+                        const cl_event *  /* event_wait_list */,
+                        cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUENDRANGEKERNEL)(cl_command_queue /* command_queue */,
+                       cl_kernel        /* kernel */,
+                       cl_uint          /* work_dim */,
+                       const size_t *   /* global_work_offset */,
+                       const size_t *   /* global_work_size */,
+                       const size_t *   /* local_work_size */,
+                       cl_uint          /* num_events_in_wait_list */,
+                       const cl_event * /* event_wait_list */,
+                       cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUETASK)(cl_command_queue  /* command_queue */,
+              cl_kernel         /* kernel */,
+              cl_uint           /* num_events_in_wait_list */,
+              const cl_event *  /* event_wait_list */,
+              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUENATIVEKERNEL)(cl_command_queue  /* command_queue */,
+                      void (*user_func)(void *), 
+                      void *            /* args */,
+                      size_t            /* cb_args */, 
+                      cl_uint           /* num_mem_objects */,
+                      const cl_mem *    /* mem_list */,
+                      const void **     /* args_mem_loc */,
+                      cl_uint           /* num_events_in_wait_list */,
+                      const cl_event *  /* event_wait_list */,
+                      cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUEMARKER)(cl_command_queue    /* command_queue */,
+                cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUEWAITFOREVENTS)(cl_command_queue /* command_queue */,
+                       cl_uint          /* num_events */,
+                       const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+PFNCLENQUEUEBARRIER)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+// Extension function access
+// Returns the extension function address for the given function name,
+// or NULL if a valid function cannot be found.  The client must
+// check that the address is not NULL before using or
+// calling the returned function address.
+#define CLEW_STATIC
+#  define CLEWAPI extern
+#  ifdef CLEW_BUILD
+#    define CLEWAPI extern __declspec(dllexport)
+#  else
+#    define CLEWAPI extern __declspec(dllimport)
+#  endif
+#if defined(_WIN32)
+#define CLEW_FUN_EXPORT extern
+#define CLEW_GET_FUN(x) x
+//  Variables holding function entry points
+CLEW_FUN_EXPORT     PFNCLGETPLATFORMIDS                 __clewGetPlatformIDs                ;
+CLEW_FUN_EXPORT     PFNCLGETPLATFORMINFO                __clewGetPlatformInfo               ;
+CLEW_FUN_EXPORT     PFNCLGETDEVICEIDS                   __clewGetDeviceIDs                  ;
+CLEW_FUN_EXPORT     PFNCLGETDEVICEINFO                  __clewGetDeviceInfo                 ;
+CLEW_FUN_EXPORT     PFNCLCREATECONTEXT                  __clewCreateContext                 ;
+CLEW_FUN_EXPORT     PFNCLCREATECONTEXTFROMTYPE          __clewCreateContextFromType         ;
+CLEW_FUN_EXPORT     PFNCLRETAINCONTEXT                  __clewRetainContext                 ;
+CLEW_FUN_EXPORT     PFNCLRELEASECONTEXT                 __clewReleaseContext                ;
+CLEW_FUN_EXPORT     PFNCLGETCONTEXTINFO                 __clewGetContextInfo                ;
+CLEW_FUN_EXPORT     PFNCLCREATECOMMANDQUEUE             __clewCreateCommandQueue            ;
+CLEW_FUN_EXPORT     PFNCLRETAINCOMMANDQUEUE             __clewRetainCommandQueue            ;
+CLEW_FUN_EXPORT     PFNCLRELEASECOMMANDQUEUE            __clewReleaseCommandQueue           ;
+CLEW_FUN_EXPORT     PFNCLGETCOMMANDQUEUEINFO            __clewGetCommandQueueInfo           ;
+CLEW_FUN_EXPORT     PFNCLSETCOMMANDQUEUEPROPERTY        __clewSetCommandQueueProperty       ;
+CLEW_FUN_EXPORT     PFNCLCREATEBUFFER                   __clewCreateBuffer                  ;
+CLEW_FUN_EXPORT     PFNCLCREATESUBBUFFER                __clewCreateSubBuffer               ;
+CLEW_FUN_EXPORT     PFNCLCREATEIMAGE2D                  __clewCreateImage2D                 ;
+CLEW_FUN_EXPORT     PFNCLCREATEIMAGE3D                  __clewCreateImage3D                 ;
+CLEW_FUN_EXPORT     PFNCLRETAINMEMOBJECT                __clewRetainMemObject               ;
+CLEW_FUN_EXPORT     PFNCLRELEASEMEMOBJECT               __clewReleaseMemObject              ;
+CLEW_FUN_EXPORT     PFNCLGETSUPPORTEDIMAGEFORMATS       __clewGetSupportedImageFormats      ;
+CLEW_FUN_EXPORT     PFNCLGETMEMOBJECTINFO               __clewGetMemObjectInfo              ;
+CLEW_FUN_EXPORT     PFNCLGETIMAGEINFO                   __clewGetImageInfo                  ;
+CLEW_FUN_EXPORT     PFNCLCREATESAMPLER                  __clewCreateSampler                 ;
+CLEW_FUN_EXPORT     PFNCLRETAINSAMPLER                  __clewRetainSampler                 ;
+CLEW_FUN_EXPORT     PFNCLRELEASESAMPLER                 __clewReleaseSampler                ;
+CLEW_FUN_EXPORT     PFNCLGETSAMPLERINFO                 __clewGetSamplerInfo                ;
+CLEW_FUN_EXPORT     PFNCLCREATEPROGRAMWITHSOURCE        __clewCreateProgramWithSource       ;
+CLEW_FUN_EXPORT     PFNCLCREATEPROGRAMWITHBINARY        __clewCreateProgramWithBinary       ;
+CLEW_FUN_EXPORT     PFNCLRETAINPROGRAM                  __clewRetainProgram                 ;
+CLEW_FUN_EXPORT     PFNCLRELEASEPROGRAM                 __clewReleaseProgram                ;
+CLEW_FUN_EXPORT     PFNCLBUILDPROGRAM                   __clewBuildProgram                  ;
+CLEW_FUN_EXPORT     PFNCLUNLOADCOMPILER                 __clewUnloadCompiler                ;
+CLEW_FUN_EXPORT     PFNCLGETPROGRAMINFO                 __clewGetProgramInfo                ;
+CLEW_FUN_EXPORT     PFNCLGETPROGRAMBUILDINFO            __clewGetProgramBuildInfo           ;
+CLEW_FUN_EXPORT     PFNCLCREATEKERNEL                   __clewCreateKernel                  ;
+CLEW_FUN_EXPORT     PFNCLCREATEKERNELSINPROGRAM         __clewCreateKernelsInProgram        ;
+CLEW_FUN_EXPORT     PFNCLRETAINKERNEL                   __clewRetainKernel                  ;
+CLEW_FUN_EXPORT     PFNCLRELEASEKERNEL                  __clewReleaseKernel                 ;
+CLEW_FUN_EXPORT     PFNCLSETKERNELARG                   __clewSetKernelArg                  ;
+CLEW_FUN_EXPORT     PFNCLGETKERNELINFO                  __clewGetKernelInfo                 ;
+CLEW_FUN_EXPORT     PFNCLGETKERNELWORKGROUPINFO         __clewGetKernelWorkGroupInfo        ;
+CLEW_FUN_EXPORT     PFNCLWAITFOREVENTS                  __clewWaitForEvents                 ;
+CLEW_FUN_EXPORT     PFNCLGETEVENTINFO                   __clewGetEventInfo                  ;
+CLEW_FUN_EXPORT     PFNCLCREATEUSEREVENT                __clewCreateUserEvent               ;
+CLEW_FUN_EXPORT     PFNCLRETAINEVENT                    __clewRetainEvent                   ;
+CLEW_FUN_EXPORT     PFNCLRELEASEEVENT                   __clewReleaseEvent                  ;
+CLEW_FUN_EXPORT     PFNCLSETUSEREVENTSTATUS             __clewSetUserEventStatus            ;
+CLEW_FUN_EXPORT     PFNCLSETEVENTCALLBACK               __clewSetEventCallback              ;
+CLEW_FUN_EXPORT     PFNCLGETEVENTPROFILINGINFO          __clewGetEventProfilingInfo         ;
+CLEW_FUN_EXPORT     PFNCLFLUSH                          __clewFlush                         ;
+CLEW_FUN_EXPORT     PFNCLFINISH                         __clewFinish                        ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEREADBUFFER              __clewEnqueueReadBuffer             ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEREADBUFFERRECT          __clewEnqueueReadBufferRect         ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEWRITEBUFFER             __clewEnqueueWriteBuffer            ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEWRITEBUFFERRECT         __clewEnqueueWriteBufferRect        ;
+CLEW_FUN_EXPORT     PFNCLENQUEUECOPYBUFFER              __clewEnqueueCopyBuffer             ;
+CLEW_FUN_EXPORT     PFNCLENQUEUECOPYBUFFERRECT          __clewEnqueueCopyBufferRect         ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEREADIMAGE               __clewEnqueueReadImage              ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEWRITEIMAGE              __clewEnqueueWriteImage             ;
+CLEW_FUN_EXPORT     PFNCLENQUEUECOPYIMAGE               __clewEnqueueCopyImage              ;
+CLEW_FUN_EXPORT     PFNCLENQUEUECOPYIMAGETOBUFFER       __clewEnqueueCopyImageToBuffer      ;
+CLEW_FUN_EXPORT     PFNCLENQUEUECOPYBUFFERTOIMAGE       __clewEnqueueCopyBufferToImage      ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEMAPBUFFER               __clewEnqueueMapBuffer              ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEMAPIMAGE                __clewEnqueueMapImage               ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEUNMAPMEMOBJECT          __clewEnqueueUnmapMemObject         ;
+CLEW_FUN_EXPORT     PFNCLENQUEUENDRANGEKERNEL           __clewEnqueueNDRangeKernel          ;
+CLEW_FUN_EXPORT     PFNCLENQUEUETASK                    __clewEnqueueTask                   ;
+CLEW_FUN_EXPORT     PFNCLENQUEUENATIVEKERNEL            __clewEnqueueNativeKernel           ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEMARKER                  __clewEnqueueMarker                 ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEWAITFOREVENTS           __clewEnqueueWaitForEvents          ;
+CLEW_FUN_EXPORT     PFNCLENQUEUEBARRIER                 __clewEnqueueBarrier                ;
+#define	clGetPlatformIDs                CLEW_GET_FUN(__clewGetPlatformIDs                )
+#define	clGetPlatformInfo               CLEW_GET_FUN(__clewGetPlatformInfo               )
+#define	clGetDeviceIDs                  CLEW_GET_FUN(__clewGetDeviceIDs                  )
+#define	clGetDeviceInfo                 CLEW_GET_FUN(__clewGetDeviceInfo                 )
+#define	clCreateContext                 CLEW_GET_FUN(__clewCreateContext                 )
+#define	clCreateContextFromType         CLEW_GET_FUN(__clewCreateContextFromType         )
+#define	clRetainContext                 CLEW_GET_FUN(__clewRetainContext                 )
+#define	clReleaseContext                CLEW_GET_FUN(__clewReleaseContext                )
+#define	clGetContextInfo                CLEW_GET_FUN(__clewGetContextInfo                )
+#define	clCreateCommandQueue            CLEW_GET_FUN(__clewCreateCommandQueue            )
+#define	clRetainCommandQueue            CLEW_GET_FUN(__clewRetainCommandQueue            )
+#define	clReleaseCommandQueue           CLEW_GET_FUN(__clewReleaseCommandQueue           )
+#define	clGetCommandQueueInfo           CLEW_GET_FUN(__clewGetCommandQueueInfo           )
+#warning CL_USE_DEPRECATED_OPENCL_1_0_APIS is defined. These APIs are unsupported and untested in OpenCL 1.1!
+ *     This API introduces mutable state into the OpenCL implementation. It has been REMOVED
+ *  to better facilitate thread safety.  The 1.0 API is not thread safe. It is not tested by the
+ *  OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.
+ *  It is likely to be non-performant. Use of this API is not advised. Use at your own risk.
+ *
+ *  Software developers previously relying on this API are instructed to set the command queue 
+ *  properties when creating the queue, instead. 
+ */
+#define	clSetCommandQueueProperty       CLEW_GET_FUN(__clewSetCommandQueueProperty       )
+#define	clCreateBuffer                  CLEW_GET_FUN(__clewCreateBuffer                  )
+#define	clCreateSubBuffer               CLEW_GET_FUN(__clewCreateSubBuffer               )
+#define	clCreateImage2D                 CLEW_GET_FUN(__clewCreateImage2D                 )
+#define	clCreateImage3D                 CLEW_GET_FUN(__clewCreateImage3D                 )
+#define	clRetainMemObject               CLEW_GET_FUN(__clewRetainMemObject               )
+#define	clReleaseMemObject              CLEW_GET_FUN(__clewReleaseMemObject              )
+#define	clGetSupportedImageFormats      CLEW_GET_FUN(__clewGetSupportedImageFormats      )
+#define	clGetMemObjectInfo              CLEW_GET_FUN(__clewGetMemObjectInfo              )
+#define	clGetImageInfo                  CLEW_GET_FUN(__clewGetImageInfo                  )
+#define	clSetMemObjectDestructorCallback CLEW_GET_FUN(__clewSetMemObjectDestructorCallback)
+#define	clCreateSampler                 CLEW_GET_FUN(__clewCreateSampler                 )
+#define	clRetainSampler                 CLEW_GET_FUN(__clewRetainSampler                 )
+#define	clReleaseSampler                CLEW_GET_FUN(__clewReleaseSampler                )
+#define	clGetSamplerInfo                CLEW_GET_FUN(__clewGetSamplerInfo                )
+#define	clCreateProgramWithSource       CLEW_GET_FUN(__clewCreateProgramWithSource       )
+#define	clCreateProgramWithBinary       CLEW_GET_FUN(__clewCreateProgramWithBinary       )
+#define	clRetainProgram                 CLEW_GET_FUN(__clewRetainProgram                 )
+#define	clReleaseProgram                CLEW_GET_FUN(__clewReleaseProgram                )
+#define	clBuildProgram                  CLEW_GET_FUN(__clewBuildProgram                  )
+#define	clUnloadCompiler                CLEW_GET_FUN(__clewUnloadCompiler                )
+#define	clGetProgramInfo                CLEW_GET_FUN(__clewGetProgramInfo                )
+#define	clGetProgramBuildInfo           CLEW_GET_FUN(__clewGetProgramBuildInfo           )
+#define	clCreateKernel                  CLEW_GET_FUN(__clewCreateKernel                  )
+#define	clCreateKernelsInProgram        CLEW_GET_FUN(__clewCreateKernelsInProgram        )
+#define	clRetainKernel                  CLEW_GET_FUN(__clewRetainKernel                  )
+#define	clReleaseKernel                 CLEW_GET_FUN(__clewReleaseKernel                 )
+#define	clSetKernelArg                  CLEW_GET_FUN(__clewSetKernelArg                  )
+#define	clGetKernelInfo                 CLEW_GET_FUN(__clewGetKernelInfo                 )
+#define	clGetKernelWorkGroupInfo        CLEW_GET_FUN(__clewGetKernelWorkGroupInfo        )
+#define	clWaitForEvents                 CLEW_GET_FUN(__clewWaitForEvents                 )
+#define	clGetEventInfo                  CLEW_GET_FUN(__clewGetEventInfo                  )
+#define	clCreateUserEvent               CLEW_GET_FUN(__clewCreateUserEvent               )
+#define	clRetainEvent                   CLEW_GET_FUN(__clewRetainEvent                   )
+#define	clReleaseEvent                  CLEW_GET_FUN(__clewReleaseEvent                  )
+#define	clSetUserEventStatus            CLEW_GET_FUN(__clewSetUserEventStatus            )
+#define	clSetEventCallback              CLEW_GET_FUN(__clewSetEventCallback              )
+#define	clGetEventProfilingInfo         CLEW_GET_FUN(__clewGetEventProfilingInfo         )
+#define	clFlush                         CLEW_GET_FUN(__clewFlush                         )
+#define	clFinish                        CLEW_GET_FUN(__clewFinish                        )
+#define	clEnqueueReadBuffer             CLEW_GET_FUN(__clewEnqueueReadBuffer             )
+#define	clEnqueueReadBufferRect         CLEW_GET_FUN(__clewEnqueueReadBufferRect         )
+#define	clEnqueueWriteBuffer            CLEW_GET_FUN(__clewEnqueueWriteBuffer            )
+#define	clEnqueueWriteBufferRect        CLEW_GET_FUN(__clewEnqueueWriteBufferRect        )
+#define	clEnqueueCopyBuffer             CLEW_GET_FUN(__clewEnqueueCopyBuffer             )
+#define	clEnqueueCopyBufferRect         CLEW_GET_FUN(__clewEnqueueCopyBufferRect         )
+#define	clEnqueueReadImage              CLEW_GET_FUN(__clewEnqueueReadImage              )
+#define	clEnqueueWriteImage             CLEW_GET_FUN(__clewEnqueueWriteImage             )
+#define	clEnqueueCopyImage              CLEW_GET_FUN(__clewEnqueueCopyImage              )
+#define	clEnqueueCopyImageToBuffer      CLEW_GET_FUN(__clewEnqueueCopyImageToBuffer      )
+#define	clEnqueueCopyBufferToImage      CLEW_GET_FUN(__clewEnqueueCopyBufferToImage      )
+#define	clEnqueueMapBuffer              CLEW_GET_FUN(__clewEnqueueMapBuffer              )
+#define	clEnqueueMapImage               CLEW_GET_FUN(__clewEnqueueMapImage               )
+#define	clEnqueueUnmapMemObject         CLEW_GET_FUN(__clewEnqueueUnmapMemObject         )
+#define	clEnqueueNDRangeKernel          CLEW_GET_FUN(__clewEnqueueNDRangeKernel          )
+#define	clEnqueueTask                   CLEW_GET_FUN(__clewEnqueueTask                   )
+#define	clEnqueueNativeKernel           CLEW_GET_FUN(__clewEnqueueNativeKernel           )
+#define	clEnqueueMarker                 CLEW_GET_FUN(__clewEnqueueMarker                 )
+#define	clEnqueueWaitForEvents          CLEW_GET_FUN(__clewEnqueueWaitForEvents          )
+#define	clEnqueueBarrier                CLEW_GET_FUN(__clewEnqueueBarrier                )
+#define	clGetExtensionFunctionAddress   CLEW_GET_FUN(__clewGetExtensionFunctionAddress   )
+#define CLEW_SUCCESS                0       //!<    Success error code
+#define CLEW_ERROR_OPEN_FAILED      -1      //!<    Error code for failing to open the dynamic library
+#define CLEW_ERROR_ATEXIT_FAILED    -2      //!<    Error code for failing to queue the closing of the dynamic library to atexit()
+//! \brief Load OpenCL dynamic library and set function entry points
+int         clewInit        (const char*);
+//! \brief Exit clew and unload OpenCL dynamic library
+void		clewExit();
+//! \brief Convert an OpenCL error code to its string equivalent
+const char* clewErrorString (cl_int error);
+#ifdef __cplusplus
+#endif  //  CLEW_HPP_INCLUDED
diff --git a/src/bullet/vectormath/scalar/boolInVec.h b/src/bullet/vectormath/scalar/boolInVec.h
deleted file mode 100644
index c5eeeebd..00000000
--- a/src/bullet/vectormath/scalar/boolInVec.h
+++ /dev/null
@@ -1,225 +0,0 @@
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#ifndef _BOOLINVEC_H
-#define _BOOLINVEC_H
-#include <math.h>
-namespace Vectormath {
-class floatInVec;
-// boolInVec class
-class boolInVec
-    unsigned int mData;
-    // Default constructor; does no initialization
-    //
-    inline boolInVec( ) { };
-    // Construct from a value converted from float
-    //
-    inline boolInVec(floatInVec vec);
-    // Explicit cast from bool
-    //
-    explicit inline boolInVec(bool scalar);
-    // Explicit cast to bool
-    //
-    inline bool getAsBool() const;
-    // Implicit cast to bool
-    //
-    inline operator bool() const;
-    // Boolean negation operator
-    //
-    inline const boolInVec operator ! () const;
-    // Assignment operator
-    //
-    inline boolInVec& operator = (boolInVec vec);
-    // Boolean and assignment operator
-    //
-    inline boolInVec& operator &= (boolInVec vec);
-    // Boolean exclusive or assignment operator
-    //
-    inline boolInVec& operator ^= (boolInVec vec);
-    // Boolean or assignment operator
-    //
-    inline boolInVec& operator |= (boolInVec vec);
-// Equal operator
-inline const boolInVec operator == (boolInVec vec0, boolInVec vec1);
-// Not equal operator
-inline const boolInVec operator != (boolInVec vec0, boolInVec vec1);
-// And operator
-inline const boolInVec operator & (boolInVec vec0, boolInVec vec1);
-// Exclusive or operator
-inline const boolInVec operator ^ (boolInVec vec0, boolInVec vec1);
-// Or operator
-inline const boolInVec operator | (boolInVec vec0, boolInVec vec1);
-// Conditionally select between two values
-inline const boolInVec select(boolInVec vec0, boolInVec vec1, boolInVec select_vec1);
-} // namespace Vectormath
-// boolInVec implementation
-#include "floatInVec.h"
-namespace Vectormath {
-boolInVec::boolInVec(floatInVec vec)
-    *this = (vec != floatInVec(0.0f));
-boolInVec::boolInVec(bool scalar)
-    mData = -(int)scalar;
-boolInVec::getAsBool() const
-    return (mData > 0);
-boolInVec::operator bool() const
-    return getAsBool();
-const boolInVec
-boolInVec::operator ! () const
-    return boolInVec(!mData);
-boolInVec::operator = (boolInVec vec)
-    mData = vec.mData;
-    return *this;
-boolInVec::operator &= (boolInVec vec)
-    *this = *this & vec;
-    return *this;
-boolInVec::operator ^= (boolInVec vec)
-    *this = *this ^ vec;
-    return *this;
-boolInVec::operator |= (boolInVec vec)
-    *this = *this | vec;
-    return *this;
-const boolInVec
-operator == (boolInVec vec0, boolInVec vec1)
-    return boolInVec(vec0.getAsBool() == vec1.getAsBool());
-const boolInVec
-operator != (boolInVec vec0, boolInVec vec1)
-    return !(vec0 == vec1);
-const boolInVec
-operator & (boolInVec vec0, boolInVec vec1)
-    return boolInVec(vec0.getAsBool() & vec1.getAsBool());
-const boolInVec
-operator | (boolInVec vec0, boolInVec vec1)
-    return boolInVec(vec0.getAsBool() | vec1.getAsBool());
-const boolInVec
-operator ^ (boolInVec vec0, boolInVec vec1)
-    return boolInVec(vec0.getAsBool() ^ vec1.getAsBool());
-const boolInVec
-select(boolInVec vec0, boolInVec vec1, boolInVec select_vec1)
-    return (select_vec1.getAsBool() == 0) ? vec0 : vec1;
-} // namespace Vectormath
-#endif // boolInVec_h
diff --git a/src/bullet/vectormath/scalar/floatInVec.h b/src/bullet/vectormath/scalar/floatInVec.h
deleted file mode 100644
index 12d89e43..00000000
--- a/src/bullet/vectormath/scalar/floatInVec.h
+++ /dev/null
@@ -1,343 +0,0 @@
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#ifndef _FLOATINVEC_H
-#define _FLOATINVEC_H
-#include <math.h>
-namespace Vectormath {
-class boolInVec;
-// floatInVec class
-// A class representing a scalar float value contained in a vector register
-// This class does not support fastmath
-class floatInVec
-    float mData;
-    // Default constructor; does no initialization
-    //
-    inline floatInVec( ) { };
-    // Construct from a value converted from bool
-    //
-    inline floatInVec(boolInVec vec);
-    // Explicit cast from float
-    //
-    explicit inline floatInVec(float scalar);
-    // Explicit cast to float
-    //
-    inline float getAsFloat() const;
-    // Implicit cast to float
-    //
-    inline operator float() const;
-    // Post increment (add 1.0f)
-    //
-    inline const floatInVec operator ++ (int);
-    // Post decrement (subtract 1.0f)
-    //
-    inline const floatInVec operator -- (int);
-    // Pre increment (add 1.0f)
-    //
-    inline floatInVec& operator ++ ();
-    // Pre decrement (subtract 1.0f)
-    //
-    inline floatInVec& operator -- ();
-    // Negation operator
-    //
-    inline const floatInVec operator - () const;
-    // Assignment operator
-    //
-    inline floatInVec& operator = (floatInVec vec);
-    // Multiplication assignment operator
-    //
-    inline floatInVec& operator *= (floatInVec vec);
-    // Division assignment operator
-    //
-    inline floatInVec& operator /= (floatInVec vec);
-    // Addition assignment operator
-    //
-    inline floatInVec& operator += (floatInVec vec);
-    // Subtraction assignment operator
-    //
-    inline floatInVec& operator -= (floatInVec vec);
-// Multiplication operator
-inline const floatInVec operator * (floatInVec vec0, floatInVec vec1);
-// Division operator
-inline const floatInVec operator / (floatInVec vec0, floatInVec vec1);
-// Addition operator
-inline const floatInVec operator + (floatInVec vec0, floatInVec vec1);
-// Subtraction operator
-inline const floatInVec operator - (floatInVec vec0, floatInVec vec1);
-// Less than operator
-inline const boolInVec operator < (floatInVec vec0, floatInVec vec1);
-// Less than or equal operator
-inline const boolInVec operator <= (floatInVec vec0, floatInVec vec1);
-// Greater than operator
-inline const boolInVec operator > (floatInVec vec0, floatInVec vec1);
-// Greater than or equal operator
-inline const boolInVec operator >= (floatInVec vec0, floatInVec vec1);
-// Equal operator
-inline const boolInVec operator == (floatInVec vec0, floatInVec vec1);
-// Not equal operator
-inline const boolInVec operator != (floatInVec vec0, floatInVec vec1);
-// Conditionally select between two values
-inline const floatInVec select(floatInVec vec0, floatInVec vec1, boolInVec select_vec1);
-} // namespace Vectormath
-// floatInVec implementation
-#include "boolInVec.h"
-namespace Vectormath {
-floatInVec::floatInVec(boolInVec vec)
-    mData = float(vec.getAsBool());
-floatInVec::floatInVec(float scalar)
-    mData = scalar;
-floatInVec::getAsFloat() const
-    return mData;
-floatInVec::operator float() const
-    return getAsFloat();
-const floatInVec
-floatInVec::operator ++ (int)
-    float olddata = mData;
-    operator ++();
-    return floatInVec(olddata);
-const floatInVec
-floatInVec::operator -- (int)
-    float olddata = mData;
-    operator --();
-    return floatInVec(olddata);
-floatInVec::operator ++ ()
-    *this += floatInVec(1.0f);
-    return *this;
-floatInVec::operator -- ()
-    *this -= floatInVec(1.0f);
-    return *this;
-const floatInVec
-floatInVec::operator - () const
-    return floatInVec(-mData);
-floatInVec::operator = (floatInVec vec)
-    mData = vec.mData;
-    return *this;
-floatInVec::operator *= (floatInVec vec)
-    *this = *this * vec;
-    return *this;
-floatInVec::operator /= (floatInVec vec)
-    *this = *this / vec;
-    return *this;
-floatInVec::operator += (floatInVec vec)
-    *this = *this + vec;
-    return *this;
-floatInVec::operator -= (floatInVec vec)
-    *this = *this - vec;
-    return *this;
-const floatInVec
-operator * (floatInVec vec0, floatInVec vec1)
-    return floatInVec(vec0.getAsFloat() * vec1.getAsFloat());
-const floatInVec
-operator / (floatInVec num, floatInVec den)
-    return floatInVec(num.getAsFloat() / den.getAsFloat());
-const floatInVec
-operator + (floatInVec vec0, floatInVec vec1)
-    return floatInVec(vec0.getAsFloat() + vec1.getAsFloat());
-const floatInVec
-operator - (floatInVec vec0, floatInVec vec1)
-    return floatInVec(vec0.getAsFloat() - vec1.getAsFloat());
-const boolInVec
-operator < (floatInVec vec0, floatInVec vec1)
-    return boolInVec(vec0.getAsFloat() < vec1.getAsFloat());
-const boolInVec
-operator <= (floatInVec vec0, floatInVec vec1)
-    return !(vec0 > vec1);
-const boolInVec
-operator > (floatInVec vec0, floatInVec vec1)
-    return boolInVec(vec0.getAsFloat() > vec1.getAsFloat());
-const boolInVec
-operator >= (floatInVec vec0, floatInVec vec1)
-    return !(vec0 < vec1);
-const boolInVec
-operator == (floatInVec vec0, floatInVec vec1)
-    return boolInVec(vec0.getAsFloat() == vec1.getAsFloat());
-const boolInVec
-operator != (floatInVec vec0, floatInVec vec1)
-    return !(vec0 == vec1);
-const floatInVec
-select(floatInVec vec0, floatInVec vec1, boolInVec select_vec1)
-    return (select_vec1.getAsBool() == 0) ? vec0 : vec1;
-} // namespace Vectormath
-#endif // floatInVec_h
diff --git a/src/bullet/vectormath/scalar/mat_aos.h b/src/bullet/vectormath/scalar/mat_aos.h
deleted file mode 100644
index e103243d..00000000
--- a/src/bullet/vectormath/scalar/mat_aos.h
+++ /dev/null
@@ -1,1630 +0,0 @@
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-namespace Vectormath {
-namespace Aos {
-// Constants
-#define _VECTORMATH_PI_OVER_2 1.570796327f
-// Definitions
-inline Matrix3::Matrix3( const Matrix3 & mat )
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-inline Matrix3::Matrix3( float scalar )
-    mCol0 = Vector3( scalar );
-    mCol1 = Vector3( scalar );
-    mCol2 = Vector3( scalar );
-inline Matrix3::Matrix3( const Quat & unitQuat )
-    float qx, qy, qz, qw, qx2, qy2, qz2, qxqx2, qyqy2, qzqz2, qxqy2, qyqz2, qzqw2, qxqz2, qyqw2, qxqw2;
-    qx = unitQuat.getX();
-    qy = unitQuat.getY();
-    qz = unitQuat.getZ();
-    qw = unitQuat.getW();
-    qx2 = ( qx + qx );
-    qy2 = ( qy + qy );
-    qz2 = ( qz + qz );
-    qxqx2 = ( qx * qx2 );
-    qxqy2 = ( qx * qy2 );
-    qxqz2 = ( qx * qz2 );
-    qxqw2 = ( qw * qx2 );
-    qyqy2 = ( qy * qy2 );
-    qyqz2 = ( qy * qz2 );
-    qyqw2 = ( qw * qy2 );
-    qzqz2 = ( qz * qz2 );
-    qzqw2 = ( qw * qz2 );
-    mCol0 = Vector3( ( ( 1.0f - qyqy2 ) - qzqz2 ), ( qxqy2 + qzqw2 ), ( qxqz2 - qyqw2 ) );
-    mCol1 = Vector3( ( qxqy2 - qzqw2 ), ( ( 1.0f - qxqx2 ) - qzqz2 ), ( qyqz2 + qxqw2 ) );
-    mCol2 = Vector3( ( qxqz2 + qyqw2 ), ( qyqz2 - qxqw2 ), ( ( 1.0f - qxqx2 ) - qyqy2 ) );
-inline Matrix3::Matrix3( const Vector3 & _col0, const Vector3 & _col1, const Vector3 & _col2 )
-    mCol0 = _col0;
-    mCol1 = _col1;
-    mCol2 = _col2;
-inline Matrix3 & Matrix3::setCol0( const Vector3 & _col0 )
-    mCol0 = _col0;
-    return *this;
-inline Matrix3 & Matrix3::setCol1( const Vector3 & _col1 )
-    mCol1 = _col1;
-    return *this;
-inline Matrix3 & Matrix3::setCol2( const Vector3 & _col2 )
-    mCol2 = _col2;
-    return *this;
-inline Matrix3 & Matrix3::setCol( int col, const Vector3 & vec )
-    *(&mCol0 + col) = vec;
-    return *this;
-inline Matrix3 & Matrix3::setRow( int row, const Vector3 & vec )
-    mCol0.setElem( row, vec.getElem( 0 ) );
-    mCol1.setElem( row, vec.getElem( 1 ) );
-    mCol2.setElem( row, vec.getElem( 2 ) );
-    return *this;
-inline Matrix3 & Matrix3::setElem( int col, int row, float val )
-    Vector3 tmpV3_0;
-    tmpV3_0 = this->getCol( col );
-    tmpV3_0.setElem( row, val );
-    this->setCol( col, tmpV3_0 );
-    return *this;
-inline float Matrix3::getElem( int col, int row ) const
-    return this->getCol( col ).getElem( row );
-inline const Vector3 Matrix3::getCol0( ) const
-    return mCol0;
-inline const Vector3 Matrix3::getCol1( ) const
-    return mCol1;
-inline const Vector3 Matrix3::getCol2( ) const
-    return mCol2;
-inline const Vector3 Matrix3::getCol( int col ) const
-    return *(&mCol0 + col);
-inline const Vector3 Matrix3::getRow( int row ) const
-    return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) );
-inline Vector3 & Matrix3::operator []( int col )
-    return *(&mCol0 + col);
-inline const Vector3 Matrix3::operator []( int col ) const
-    return *(&mCol0 + col);
-inline Matrix3 & Matrix3::operator =( const Matrix3 & mat )
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-    return *this;
-inline const Matrix3 transpose( const Matrix3 & mat )
-    return Matrix3(
-        Vector3( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX() ),
-        Vector3( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY() ),
-        Vector3( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ() )
-    );
-inline const Matrix3 inverse( const Matrix3 & mat )
-    Vector3 tmp0, tmp1, tmp2;
-    float detinv;
-    tmp0 = cross( mat.getCol1(), mat.getCol2() );
-    tmp1 = cross( mat.getCol2(), mat.getCol0() );
-    tmp2 = cross( mat.getCol0(), mat.getCol1() );
-    detinv = ( 1.0f / dot( mat.getCol2(), tmp2 ) );
-    return Matrix3(
-        Vector3( ( tmp0.getX() * detinv ), ( tmp1.getX() * detinv ), ( tmp2.getX() * detinv ) ),
-        Vector3( ( tmp0.getY() * detinv ), ( tmp1.getY() * detinv ), ( tmp2.getY() * detinv ) ),
-        Vector3( ( tmp0.getZ() * detinv ), ( tmp1.getZ() * detinv ), ( tmp2.getZ() * detinv ) )
-    );
-inline float determinant( const Matrix3 & mat )
-    return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) );
-inline const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
-    return Matrix3(
-        ( mCol0 + mat.mCol0 ),
-        ( mCol1 + mat.mCol1 ),
-        ( mCol2 + mat.mCol2 )
-    );
-inline const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
-    return Matrix3(
-        ( mCol0 - mat.mCol0 ),
-        ( mCol1 - mat.mCol1 ),
-        ( mCol2 - mat.mCol2 )
-    );
-inline Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
-    *this = *this + mat;
-    return *this;
-inline Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
-    *this = *this - mat;
-    return *this;
-inline const Matrix3 Matrix3::operator -( ) const
-    return Matrix3(
-        ( -mCol0 ),
-        ( -mCol1 ),
-        ( -mCol2 )
-    );
-inline const Matrix3 absPerElem( const Matrix3 & mat )
-    return Matrix3(
-        absPerElem( mat.getCol0() ),
-        absPerElem( mat.getCol1() ),
-        absPerElem( mat.getCol2() )
-    );
-inline const Matrix3 Matrix3::operator *( float scalar ) const
-    return Matrix3(
-        ( mCol0 * scalar ),
-        ( mCol1 * scalar ),
-        ( mCol2 * scalar )
-    );
-inline Matrix3 & Matrix3::operator *=( float scalar )
-    *this = *this * scalar;
-    return *this;
-inline const Matrix3 operator *( float scalar, const Matrix3 & mat )
-    return mat * scalar;
-inline const Vector3 Matrix3::operator *( const Vector3 & vec ) const
-    return Vector3(
-        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
-        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
-        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
-    );
-inline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
-    return Matrix3(
-        ( *this * mat.mCol0 ),
-        ( *this * mat.mCol1 ),
-        ( *this * mat.mCol2 )
-    );
-inline Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
-    *this = *this * mat;
-    return *this;
-inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
-    return Matrix3(
-        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
-        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
-        mulPerElem( mat0.getCol2(), mat1.getCol2() )
-    );
-inline const Matrix3 Matrix3::identity( )
-    return Matrix3(
-        Vector3::xAxis( ),
-        Vector3::yAxis( ),
-        Vector3::zAxis( )
-    );
-inline const Matrix3 Matrix3::rotationX( float radians )
-    float s, c;
-    s = sinf( radians );
-    c = cosf( radians );
-    return Matrix3(
-        Vector3::xAxis( ),
-        Vector3( 0.0f, c, s ),
-        Vector3( 0.0f, -s, c )
-    );
-inline const Matrix3 Matrix3::rotationY( float radians )
-    float s, c;
-    s = sinf( radians );
-    c = cosf( radians );
-    return Matrix3(
-        Vector3( c, 0.0f, -s ),
-        Vector3::yAxis( ),
-        Vector3( s, 0.0f, c )
-    );
-inline const Matrix3 Matrix3::rotationZ( float radians )
-    float s, c;
-    s = sinf( radians );
-    c = cosf( radians );
-    return Matrix3(
-        Vector3( c, s, 0.0f ),
-        Vector3( -s, c, 0.0f ),
-        Vector3::zAxis( )
-    );
-inline const Matrix3 Matrix3::rotationZYX( const Vector3 & radiansXYZ )
-    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
-    sX = sinf( radiansXYZ.getX() );
-    cX = cosf( radiansXYZ.getX() );
-    sY = sinf( radiansXYZ.getY() );
-    cY = cosf( radiansXYZ.getY() );
-    sZ = sinf( radiansXYZ.getZ() );
-    cZ = cosf( radiansXYZ.getZ() );
-    tmp0 = ( cZ * sY );
-    tmp1 = ( sZ * sY );
-    return Matrix3(
-        Vector3( ( cZ * cY ), ( sZ * cY ), -sY ),
-        Vector3( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ) ),
-        Vector3( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ) )
-    );
-inline const Matrix3 Matrix3::rotation( float radians, const Vector3 & unitVec )
-    float x, y, z, s, c, oneMinusC, xy, yz, zx;
-    s = sinf( radians );
-    c = cosf( radians );
-    x = unitVec.getX();
-    y = unitVec.getY();
-    z = unitVec.getZ();
-    xy = ( x * y );
-    yz = ( y * z );
-    zx = ( z * x );
-    oneMinusC = ( 1.0f - c );
-    return Matrix3(
-        Vector3( ( ( ( x * x ) * oneMinusC ) + c ), ( ( xy * oneMinusC ) + ( z * s ) ), ( ( zx * oneMinusC ) - ( y * s ) ) ),
-        Vector3( ( ( xy * oneMinusC ) - ( z * s ) ), ( ( ( y * y ) * oneMinusC ) + c ), ( ( yz * oneMinusC ) + ( x * s ) ) ),
-        Vector3( ( ( zx * oneMinusC ) + ( y * s ) ), ( ( yz * oneMinusC ) - ( x * s ) ), ( ( ( z * z ) * oneMinusC ) + c ) )
-    );
-inline const Matrix3 Matrix3::rotation( const Quat & unitQuat )
-    return Matrix3( unitQuat );
-inline const Matrix3 Matrix3::scale( const Vector3 & scaleVec )
-    return Matrix3(
-        Vector3( scaleVec.getX(), 0.0f, 0.0f ),
-        Vector3( 0.0f, scaleVec.getY(), 0.0f ),
-        Vector3( 0.0f, 0.0f, scaleVec.getZ() )
-    );
-inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 & scaleVec )
-    return Matrix3(
-        ( mat.getCol0() * scaleVec.getX( ) ),
-        ( mat.getCol1() * scaleVec.getY( ) ),
-        ( mat.getCol2() * scaleVec.getZ( ) )
-    );
-inline const Matrix3 prependScale( const Vector3 & scaleVec, const Matrix3 & mat )
-    return Matrix3(
-        mulPerElem( mat.getCol0(), scaleVec ),
-        mulPerElem( mat.getCol1(), scaleVec ),
-        mulPerElem( mat.getCol2(), scaleVec )
-    );
-inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
-    return Matrix3(
-        select( mat0.getCol0(), mat1.getCol0(), select1 ),
-        select( mat0.getCol1(), mat1.getCol1(), select1 ),
-        select( mat0.getCol2(), mat1.getCol2(), select1 )
-    );
-inline void print( const Matrix3 & mat )
-    print( mat.getRow( 0 ) );
-    print( mat.getRow( 1 ) );
-    print( mat.getRow( 2 ) );
-inline void print( const Matrix3 & mat, const char * name )
-    printf("%s:\n", name);
-    print( mat );
-inline Matrix4::Matrix4( const Matrix4 & mat )
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-    mCol3 = mat.mCol3;
-inline Matrix4::Matrix4( float scalar )
-    mCol0 = Vector4( scalar );
-    mCol1 = Vector4( scalar );
-    mCol2 = Vector4( scalar );
-    mCol3 = Vector4( scalar );
-inline Matrix4::Matrix4( const Transform3 & mat )
-    mCol0 = Vector4( mat.getCol0(), 0.0f );
-    mCol1 = Vector4( mat.getCol1(), 0.0f );
-    mCol2 = Vector4( mat.getCol2(), 0.0f );
-    mCol3 = Vector4( mat.getCol3(), 1.0f );
-inline Matrix4::Matrix4( const Vector4 & _col0, const Vector4 & _col1, const Vector4 & _col2, const Vector4 & _col3 )
-    mCol0 = _col0;
-    mCol1 = _col1;
-    mCol2 = _col2;
-    mCol3 = _col3;
-inline Matrix4::Matrix4( const Matrix3 & mat, const Vector3 & translateVec )
-    mCol0 = Vector4( mat.getCol0(), 0.0f );
-    mCol1 = Vector4( mat.getCol1(), 0.0f );
-    mCol2 = Vector4( mat.getCol2(), 0.0f );
-    mCol3 = Vector4( translateVec, 1.0f );
-inline Matrix4::Matrix4( const Quat & unitQuat, const Vector3 & translateVec )
-    Matrix3 mat;
-    mat = Matrix3( unitQuat );
-    mCol0 = Vector4( mat.getCol0(), 0.0f );
-    mCol1 = Vector4( mat.getCol1(), 0.0f );
-    mCol2 = Vector4( mat.getCol2(), 0.0f );
-    mCol3 = Vector4( translateVec, 1.0f );
-inline Matrix4 & Matrix4::setCol0( const Vector4 & _col0 )
-    mCol0 = _col0;
-    return *this;
-inline Matrix4 & Matrix4::setCol1( const Vector4 & _col1 )
-    mCol1 = _col1;
-    return *this;
-inline Matrix4 & Matrix4::setCol2( const Vector4 & _col2 )
-    mCol2 = _col2;
-    return *this;
-inline Matrix4 & Matrix4::setCol3( const Vector4 & _col3 )
-    mCol3 = _col3;
-    return *this;
-inline Matrix4 & Matrix4::setCol( int col, const Vector4 & vec )
-    *(&mCol0 + col) = vec;
-    return *this;
-inline Matrix4 & Matrix4::setRow( int row, const Vector4 & vec )
-    mCol0.setElem( row, vec.getElem( 0 ) );
-    mCol1.setElem( row, vec.getElem( 1 ) );
-    mCol2.setElem( row, vec.getElem( 2 ) );
-    mCol3.setElem( row, vec.getElem( 3 ) );
-    return *this;
-inline Matrix4 & Matrix4::setElem( int col, int row, float val )
-    Vector4 tmpV3_0;
-    tmpV3_0 = this->getCol( col );
-    tmpV3_0.setElem( row, val );
-    this->setCol( col, tmpV3_0 );
-    return *this;
-inline float Matrix4::getElem( int col, int row ) const
-    return this->getCol( col ).getElem( row );
-inline const Vector4 Matrix4::getCol0( ) const
-    return mCol0;
-inline const Vector4 Matrix4::getCol1( ) const
-    return mCol1;
-inline const Vector4 Matrix4::getCol2( ) const
-    return mCol2;
-inline const Vector4 Matrix4::getCol3( ) const
-    return mCol3;
-inline const Vector4 Matrix4::getCol( int col ) const
-    return *(&mCol0 + col);
-inline const Vector4 Matrix4::getRow( int row ) const
-    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
-inline Vector4 & Matrix4::operator []( int col )
-    return *(&mCol0 + col);
-inline const Vector4 Matrix4::operator []( int col ) const
-    return *(&mCol0 + col);
-inline Matrix4 & Matrix4::operator =( const Matrix4 & mat )
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-    mCol3 = mat.mCol3;
-    return *this;
-inline const Matrix4 transpose( const Matrix4 & mat )
-    return Matrix4(
-        Vector4( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX(), mat.getCol3().getX() ),
-        Vector4( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY(), mat.getCol3().getY() ),
-        Vector4( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ(), mat.getCol3().getZ() ),
-        Vector4( mat.getCol0().getW(), mat.getCol1().getW(), mat.getCol2().getW(), mat.getCol3().getW() )
-    );
-inline const Matrix4 inverse( const Matrix4 & mat )
-    Vector4 res0, res1, res2, res3;
-    float mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, detInv;
-    mA = mat.getCol0().getX();
-    mB = mat.getCol0().getY();
-    mC = mat.getCol0().getZ();
-    mD = mat.getCol0().getW();
-    mE = mat.getCol1().getX();
-    mF = mat.getCol1().getY();
-    mG = mat.getCol1().getZ();
-    mH = mat.getCol1().getW();
-    mI = mat.getCol2().getX();
-    mJ = mat.getCol2().getY();
-    mK = mat.getCol2().getZ();
-    mL = mat.getCol2().getW();
-    mM = mat.getCol3().getX();
-    mN = mat.getCol3().getY();
-    mO = mat.getCol3().getZ();
-    mP = mat.getCol3().getW();
-    tmp0 = ( ( mK * mD ) - ( mC * mL ) );
-    tmp1 = ( ( mO * mH ) - ( mG * mP ) );
-    tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
-    tmp3 = ( ( mF * mO ) - ( mN * mG ) );
-    tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
-    tmp5 = ( ( mN * mH ) - ( mF * mP ) );
-    res0.setX( ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) ) );
-    res0.setY( ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) ) );
-    res0.setZ( ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) ) );
-    res0.setW( ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) ) );
-    detInv = ( 1.0f / ( ( ( ( mA * res0.getX() ) + ( mE * res0.getY() ) ) + ( mI * res0.getZ() ) ) + ( mM * res0.getW() ) ) );
-    res1.setX( ( mI * tmp1 ) );
-    res1.setY( ( mM * tmp0 ) );
-    res1.setZ( ( mA * tmp1 ) );
-    res1.setW( ( mE * tmp0 ) );
-    res3.setX( ( mI * tmp3 ) );
-    res3.setY( ( mM * tmp2 ) );
-    res3.setZ( ( mA * tmp3 ) );
-    res3.setW( ( mE * tmp2 ) );
-    res2.setX( ( mI * tmp5 ) );
-    res2.setY( ( mM * tmp4 ) );
-    res2.setZ( ( mA * tmp5 ) );
-    res2.setW( ( mE * tmp4 ) );
-    tmp0 = ( ( mI * mB ) - ( mA * mJ ) );
-    tmp1 = ( ( mM * mF ) - ( mE * mN ) );
-    tmp2 = ( ( mI * mD ) - ( mA * mL ) );
-    tmp3 = ( ( mM * mH ) - ( mE * mP ) );
-    tmp4 = ( ( mI * mC ) - ( mA * mK ) );
-    tmp5 = ( ( mM * mG ) - ( mE * mO ) );
-    res2.setX( ( ( ( mL * tmp1 ) - ( mJ * tmp3 ) ) + res2.getX() ) );
-    res2.setY( ( ( ( mP * tmp0 ) - ( mN * tmp2 ) ) + res2.getY() ) );
-    res2.setZ( ( ( ( mB * tmp3 ) - ( mD * tmp1 ) ) - res2.getZ() ) );
-    res2.setW( ( ( ( mF * tmp2 ) - ( mH * tmp0 ) ) - res2.getW() ) );
-    res3.setX( ( ( ( mJ * tmp5 ) - ( mK * tmp1 ) ) + res3.getX() ) );
-    res3.setY( ( ( ( mN * tmp4 ) - ( mO * tmp0 ) ) + res3.getY() ) );
-    res3.setZ( ( ( ( mC * tmp1 ) - ( mB * tmp5 ) ) - res3.getZ() ) );
-    res3.setW( ( ( ( mG * tmp0 ) - ( mF * tmp4 ) ) - res3.getW() ) );
-    res1.setX( ( ( ( mK * tmp3 ) - ( mL * tmp5 ) ) - res1.getX() ) );
-    res1.setY( ( ( ( mO * tmp2 ) - ( mP * tmp4 ) ) - res1.getY() ) );
-    res1.setZ( ( ( ( mD * tmp5 ) - ( mC * tmp3 ) ) + res1.getZ() ) );
-    res1.setW( ( ( ( mH * tmp4 ) - ( mG * tmp2 ) ) + res1.getW() ) );
-    return Matrix4(
-        ( res0 * detInv ),
-        ( res1 * detInv ),
-        ( res2 * detInv ),
-        ( res3 * detInv )
-    );
-inline const Matrix4 affineInverse( const Matrix4 & mat )
-    Transform3 affineMat;
-    affineMat.setCol0( mat.getCol0().getXYZ( ) );
-    affineMat.setCol1( mat.getCol1().getXYZ( ) );
-    affineMat.setCol2( mat.getCol2().getXYZ( ) );
-    affineMat.setCol3( mat.getCol3().getXYZ( ) );
-    return Matrix4( inverse( affineMat ) );
-inline const Matrix4 orthoInverse( const Matrix4 & mat )
-    Transform3 affineMat;
-    affineMat.setCol0( mat.getCol0().getXYZ( ) );
-    affineMat.setCol1( mat.getCol1().getXYZ( ) );
-    affineMat.setCol2( mat.getCol2().getXYZ( ) );
-    affineMat.setCol3( mat.getCol3().getXYZ( ) );
-    return Matrix4( orthoInverse( affineMat ) );
-inline float determinant( const Matrix4 & mat )
-    float dx, dy, dz, dw, mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
-    mA = mat.getCol0().getX();
-    mB = mat.getCol0().getY();
-    mC = mat.getCol0().getZ();
-    mD = mat.getCol0().getW();
-    mE = mat.getCol1().getX();
-    mF = mat.getCol1().getY();
-    mG = mat.getCol1().getZ();
-    mH = mat.getCol1().getW();
-    mI = mat.getCol2().getX();
-    mJ = mat.getCol2().getY();
-    mK = mat.getCol2().getZ();
-    mL = mat.getCol2().getW();
-    mM = mat.getCol3().getX();
-    mN = mat.getCol3().getY();
-    mO = mat.getCol3().getZ();
-    mP = mat.getCol3().getW();
-    tmp0 = ( ( mK * mD ) - ( mC * mL ) );
-    tmp1 = ( ( mO * mH ) - ( mG * mP ) );
-    tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
-    tmp3 = ( ( mF * mO ) - ( mN * mG ) );
-    tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
-    tmp5 = ( ( mN * mH ) - ( mF * mP ) );
-    dx = ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) );
-    dy = ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) );
-    dz = ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) );
-    dw = ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) );
-    return ( ( ( ( mA * dx ) + ( mE * dy ) ) + ( mI * dz ) ) + ( mM * dw ) );
-inline const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
-    return Matrix4(
-        ( mCol0 + mat.mCol0 ),
-        ( mCol1 + mat.mCol1 ),
-        ( mCol2 + mat.mCol2 ),
-        ( mCol3 + mat.mCol3 )
-    );
-inline const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
-    return Matrix4(
-        ( mCol0 - mat.mCol0 ),
-        ( mCol1 - mat.mCol1 ),
-        ( mCol2 - mat.mCol2 ),
-        ( mCol3 - mat.mCol3 )
-    );
-inline Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
-    *this = *this + mat;
-    return *this;
-inline Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
-    *this = *this - mat;
-    return *this;
-inline const Matrix4 Matrix4::operator -( ) const
-    return Matrix4(
-        ( -mCol0 ),
-        ( -mCol1 ),
-        ( -mCol2 ),
-        ( -mCol3 )
-    );
-inline const Matrix4 absPerElem( const Matrix4 & mat )
-    return Matrix4(
-        absPerElem( mat.getCol0() ),
-        absPerElem( mat.getCol1() ),
-        absPerElem( mat.getCol2() ),
-        absPerElem( mat.getCol3() )
-    );
-inline const Matrix4 Matrix4::operator *( float scalar ) const
-    return Matrix4(
-        ( mCol0 * scalar ),
-        ( mCol1 * scalar ),
-        ( mCol2 * scalar ),
-        ( mCol3 * scalar )
-    );
-inline Matrix4 & Matrix4::operator *=( float scalar )
-    *this = *this * scalar;
-    return *this;
-inline const Matrix4 operator *( float scalar, const Matrix4 & mat )
-    return mat * scalar;
-inline const Vector4 Matrix4::operator *( const Vector4 & vec ) const
-    return Vector4(
-        ( ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ) + ( mCol3.getX() * vec.getW() ) ),
-        ( ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ) + ( mCol3.getY() * vec.getW() ) ),
-        ( ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ) + ( mCol3.getZ() * vec.getW() ) ),
-        ( ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) ) + ( mCol3.getW() * vec.getW() ) )
-    );
-inline const Vector4 Matrix4::operator *( const Vector3 & vec ) const
-    return Vector4(
-        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
-        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
-        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ),
-        ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) )
-    );
-inline const Vector4 Matrix4::operator *( const Point3 & pnt ) const
-    return Vector4(
-        ( ( ( ( mCol0.getX() * pnt.getX() ) + ( mCol1.getX() * pnt.getY() ) ) + ( mCol2.getX() * pnt.getZ() ) ) + mCol3.getX() ),
-        ( ( ( ( mCol0.getY() * pnt.getX() ) + ( mCol1.getY() * pnt.getY() ) ) + ( mCol2.getY() * pnt.getZ() ) ) + mCol3.getY() ),
-        ( ( ( ( mCol0.getZ() * pnt.getX() ) + ( mCol1.getZ() * pnt.getY() ) ) + ( mCol2.getZ() * pnt.getZ() ) ) + mCol3.getZ() ),
-        ( ( ( ( mCol0.getW() * pnt.getX() ) + ( mCol1.getW() * pnt.getY() ) ) + ( mCol2.getW() * pnt.getZ() ) ) + mCol3.getW() )
-    );
-inline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
-    return Matrix4(
-        ( *this * mat.mCol0 ),
-        ( *this * mat.mCol1 ),
-        ( *this * mat.mCol2 ),
-        ( *this * mat.mCol3 )
-    );
-inline Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
-    *this = *this * mat;
-    return *this;
-inline const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
-    return Matrix4(
-        ( *this * tfrm.getCol0() ),
-        ( *this * tfrm.getCol1() ),
-        ( *this * tfrm.getCol2() ),
-        ( *this * Point3( tfrm.getCol3() ) )
-    );
-inline Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
-    *this = *this * tfrm;
-    return *this;
-inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
-    return Matrix4(
-        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
-        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
-        mulPerElem( mat0.getCol2(), mat1.getCol2() ),
-        mulPerElem( mat0.getCol3(), mat1.getCol3() )
-    );
-inline const Matrix4 Matrix4::identity( )
-    return Matrix4(
-        Vector4::xAxis( ),
-        Vector4::yAxis( ),
-        Vector4::zAxis( ),
-        Vector4::wAxis( )
-    );
-inline Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
-    mCol0.setXYZ( mat3.getCol0() );
-    mCol1.setXYZ( mat3.getCol1() );
-    mCol2.setXYZ( mat3.getCol2() );
-    return *this;
-inline const Matrix3 Matrix4::getUpper3x3( ) const
-    return Matrix3(
-        mCol0.getXYZ( ),
-        mCol1.getXYZ( ),
-        mCol2.getXYZ( )
-    );
-inline Matrix4 & Matrix4::setTranslation( const Vector3 & translateVec )
-    mCol3.setXYZ( translateVec );
-    return *this;
-inline const Vector3 Matrix4::getTranslation( ) const
-    return mCol3.getXYZ( );
-inline const Matrix4 Matrix4::rotationX( float radians )
-    float s, c;
-    s = sinf( radians );
-    c = cosf( radians );
-    return Matrix4(
-        Vector4::xAxis( ),
-        Vector4( 0.0f, c, s, 0.0f ),
-        Vector4( 0.0f, -s, c, 0.0f ),
-        Vector4::wAxis( )
-    );
-inline const Matrix4 Matrix4::rotationY( float radians )
-    float s, c;
-    s = sinf( radians );
-    c = cosf( radians );
-    return Matrix4(
-        Vector4( c, 0.0f, -s, 0.0f ),
-        Vector4::yAxis( ),
-        Vector4( s, 0.0f, c, 0.0f ),
-        Vector4::wAxis( )
-    );
-inline const Matrix4 Matrix4::rotationZ( float radians )
-    float s, c;
-    s = sinf( radians );
-    c = cosf( radians );
-    return Matrix4(
-        Vector4( c, s, 0.0f, 0.0f ),
-        Vector4( -s, c, 0.0f, 0.0f ),
-        Vector4::zAxis( ),
-        Vector4::wAxis( )
-    );
-inline const Matrix4 Matrix4::rotationZYX( const Vector3 & radiansXYZ )
-    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
-    sX = sinf( radiansXYZ.getX() );
-    cX = cosf( radiansXYZ.getX() );
-    sY = sinf( radiansXYZ.getY() );
-    cY = cosf( radiansXYZ.getY() );
-    sZ = sinf( radiansXYZ.getZ() );
-    cZ = cosf( radiansXYZ.getZ() );
-    tmp0 = ( cZ * sY );
-    tmp1 = ( sZ * sY );
-    return Matrix4(
-        Vector4( ( cZ * cY ), ( sZ * cY ), -sY, 0.0f ),
-        Vector4( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ), 0.0f ),
-        Vector4( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ), 0.0f ),
-        Vector4::wAxis( )
-    );
-inline const Matrix4 Matrix4::rotation( float radians, const Vector3 & unitVec )
-    float x, y, z, s, c, oneMinusC, xy, yz, zx;
-    s = sinf( radians );
-    c = cosf( radians );
-    x = unitVec.getX();
-    y = unitVec.getY();
-    z = unitVec.getZ();
-    xy = ( x * y );
-    yz = ( y * z );
-    zx = ( z * x );
-    oneMinusC = ( 1.0f - c );
-    return Matrix4(
-        Vector4( ( ( ( x * x ) * oneMinusC ) + c ), ( ( xy * oneMinusC ) + ( z * s ) ), ( ( zx * oneMinusC ) - ( y * s ) ), 0.0f ),
-        Vector4( ( ( xy * oneMinusC ) - ( z * s ) ), ( ( ( y * y ) * oneMinusC ) + c ), ( ( yz * oneMinusC ) + ( x * s ) ), 0.0f ),
-        Vector4( ( ( zx * oneMinusC ) + ( y * s ) ), ( ( yz * oneMinusC ) - ( x * s ) ), ( ( ( z * z ) * oneMinusC ) + c ), 0.0f ),
-        Vector4::wAxis( )
-    );
-inline const Matrix4 Matrix4::rotation( const Quat & unitQuat )
-    return Matrix4( Transform3::rotation( unitQuat ) );
-inline const Matrix4 Matrix4::scale( const Vector3 & scaleVec )
-    return Matrix4(
-        Vector4( scaleVec.getX(), 0.0f, 0.0f, 0.0f ),
-        Vector4( 0.0f, scaleVec.getY(), 0.0f, 0.0f ),
-        Vector4( 0.0f, 0.0f, scaleVec.getZ(), 0.0f ),
-        Vector4::wAxis( )
-    );
-inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 & scaleVec )
-    return Matrix4(
-        ( mat.getCol0() * scaleVec.getX( ) ),
-        ( mat.getCol1() * scaleVec.getY( ) ),
-        ( mat.getCol2() * scaleVec.getZ( ) ),
-        mat.getCol3()
-    );
-inline const Matrix4 prependScale( const Vector3 & scaleVec, const Matrix4 & mat )
-    Vector4 scale4;
-    scale4 = Vector4( scaleVec, 1.0f );
-    return Matrix4(
-        mulPerElem( mat.getCol0(), scale4 ),
-        mulPerElem( mat.getCol1(), scale4 ),
-        mulPerElem( mat.getCol2(), scale4 ),
-        mulPerElem( mat.getCol3(), scale4 )
-    );
-inline const Matrix4 Matrix4::translation( const Vector3 & translateVec )
-    return Matrix4(
-        Vector4::xAxis( ),
-        Vector4::yAxis( ),
-        Vector4::zAxis( ),
-        Vector4( translateVec, 1.0f )
-    );
-inline const Matrix4 Matrix4::lookAt( const Point3 & eyePos, const Point3 & lookAtPos, const Vector3 & upVec )
-    Matrix4 m4EyeFrame;
-    Vector3 v3X, v3Y, v3Z;
-    v3Y = normalize( upVec );
-    v3Z = normalize( ( eyePos - lookAtPos ) );
-    v3X = normalize( cross( v3Y, v3Z ) );
-    v3Y = cross( v3Z, v3X );
-    m4EyeFrame = Matrix4( Vector4( v3X ), Vector4( v3Y ), Vector4( v3Z ), Vector4( eyePos ) );
-    return orthoInverse( m4EyeFrame );
-inline const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
-    float f, rangeInv;
-    f = tanf( ( (float)( _VECTORMATH_PI_OVER_2 ) - ( 0.5f * fovyRadians ) ) );
-    rangeInv = ( 1.0f / ( zNear - zFar ) );
-    return Matrix4(
-        Vector4( ( f / aspect ), 0.0f, 0.0f, 0.0f ),
-        Vector4( 0.0f, f, 0.0f, 0.0f ),
-        Vector4( 0.0f, 0.0f, ( ( zNear + zFar ) * rangeInv ), -1.0f ),
-        Vector4( 0.0f, 0.0f, ( ( ( zNear * zFar ) * rangeInv ) * 2.0f ), 0.0f )
-    );
-inline const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
-    float sum_rl, sum_tb, sum_nf, inv_rl, inv_tb, inv_nf, n2;
-    sum_rl = ( right + left );
-    sum_tb = ( top + bottom );
-    sum_nf = ( zNear + zFar );
-    inv_rl = ( 1.0f / ( right - left ) );
-    inv_tb = ( 1.0f / ( top - bottom ) );
-    inv_nf = ( 1.0f / ( zNear - zFar ) );
-    n2 = ( zNear + zNear );
-    return Matrix4(
-        Vector4( ( n2 * inv_rl ), 0.0f, 0.0f, 0.0f ),
-        Vector4( 0.0f, ( n2 * inv_tb ), 0.0f, 0.0f ),
-        Vector4( ( sum_rl * inv_rl ), ( sum_tb * inv_tb ), ( sum_nf * inv_nf ), -1.0f ),
-        Vector4( 0.0f, 0.0f, ( ( n2 * inv_nf ) * zFar ), 0.0f )
-    );
-inline const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
-    float sum_rl, sum_tb, sum_nf, inv_rl, inv_tb, inv_nf;
-    sum_rl = ( right + left );
-    sum_tb = ( top + bottom );
-    sum_nf = ( zNear + zFar );
-    inv_rl = ( 1.0f / ( right - left ) );
-    inv_tb = ( 1.0f / ( top - bottom ) );
-    inv_nf = ( 1.0f / ( zNear - zFar ) );
-    return Matrix4(
-        Vector4( ( inv_rl + inv_rl ), 0.0f, 0.0f, 0.0f ),
-        Vector4( 0.0f, ( inv_tb + inv_tb ), 0.0f, 0.0f ),
-        Vector4( 0.0f, 0.0f, ( inv_nf + inv_nf ), 0.0f ),
-        Vector4( ( -sum_rl * inv_rl ), ( -sum_tb * inv_tb ), ( sum_nf * inv_nf ), 1.0f )
-    );
-inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
-    return Matrix4(
-        select( mat0.getCol0(), mat1.getCol0(), select1 ),
-        select( mat0.getCol1(), mat1.getCol1(), select1 ),
-        select( mat0.getCol2(), mat1.getCol2(), select1 ),
-        select( mat0.getCol3(), mat1.getCol3(), select1 )
-    );
-inline void print( const Matrix4 & mat )
-    print( mat.getRow( 0 ) );
-    print( mat.getRow( 1 ) );
-    print( mat.getRow( 2 ) );
-    print( mat.getRow( 3 ) );
-inline void print( const Matrix4 & mat, const char * name )
-    printf("%s:\n", name);
-    print( mat );
-inline Transform3::Transform3( const Transform3 & tfrm )
-    mCol0 = tfrm.mCol0;
-    mCol1 = tfrm.mCol1;
-    mCol2 = tfrm.mCol2;
-    mCol3 = tfrm.mCol3;
-inline Transform3::Transform3( float scalar )
-    mCol0 = Vector3( scalar );
-    mCol1 = Vector3( scalar );
-    mCol2 = Vector3( scalar );
-    mCol3 = Vector3( scalar );
-inline Transform3::Transform3( const Vector3 & _col0, const Vector3 & _col1, const Vector3 & _col2, const Vector3 & _col3 )
-    mCol0 = _col0;
-    mCol1 = _col1;
-    mCol2 = _col2;
-    mCol3 = _col3;
-inline Transform3::Transform3( const Matrix3 & tfrm, const Vector3 & translateVec )
-    this->setUpper3x3( tfrm );
-    this->setTranslation( translateVec );
-inline Transform3::Transform3( const Quat & unitQuat, const Vector3 & translateVec )
-    this->setUpper3x3( Matrix3( unitQuat ) );
-    this->setTranslation( translateVec );
-inline Transform3 & Transform3::setCol0( const Vector3 & _col0 )
-    mCol0 = _col0;
-    return *this;
-inline Transform3 & Transform3::setCol1( const Vector3 & _col1 )
-    mCol1 = _col1;
-    return *this;
-inline Transform3 & Transform3::setCol2( const Vector3 & _col2 )
-    mCol2 = _col2;
-    return *this;
-inline Transform3 & Transform3::setCol3( const Vector3 & _col3 )
-    mCol3 = _col3;
-    return *this;
-inline Transform3 & Transform3::setCol( int col, const Vector3 & vec )
-    *(&mCol0 + col) = vec;
-    return *this;
-inline Transform3 & Transform3::setRow( int row, const Vector4 & vec )
-    mCol0.setElem( row, vec.getElem( 0 ) );
-    mCol1.setElem( row, vec.getElem( 1 ) );
-    mCol2.setElem( row, vec.getElem( 2 ) );
-    mCol3.setElem( row, vec.getElem( 3 ) );
-    return *this;
-inline Transform3 & Transform3::setElem( int col, int row, float val )
-    Vector3 tmpV3_0;
-    tmpV3_0 = this->getCol( col );
-    tmpV3_0.setElem( row, val );
-    this->setCol( col, tmpV3_0 );
-    return *this;
-inline float Transform3::getElem( int col, int row ) const
-    return this->getCol( col ).getElem( row );
-inline const Vector3 Transform3::getCol0( ) const
-    return mCol0;
-inline const Vector3 Transform3::getCol1( ) const
-    return mCol1;
-inline const Vector3 Transform3::getCol2( ) const
-    return mCol2;
-inline const Vector3 Transform3::getCol3( ) const
-    return mCol3;
-inline const Vector3 Transform3::getCol( int col ) const
-    return *(&mCol0 + col);
-inline const Vector4 Transform3::getRow( int row ) const
-    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
-inline Vector3 & Transform3::operator []( int col )
-    return *(&mCol0 + col);
-inline const Vector3 Transform3::operator []( int col ) const
-    return *(&mCol0 + col);
-inline Transform3 & Transform3::operator =( const Transform3 & tfrm )
-    mCol0 = tfrm.mCol0;
-    mCol1 = tfrm.mCol1;
-    mCol2 = tfrm.mCol2;
-    mCol3 = tfrm.mCol3;
-    return *this;
-inline const Transform3 inverse( const Transform3 & tfrm )
-    Vector3 tmp0, tmp1, tmp2, inv0, inv1, inv2;
-    float detinv;
-    tmp0 = cross( tfrm.getCol1(), tfrm.getCol2() );
-    tmp1 = cross( tfrm.getCol2(), tfrm.getCol0() );
-    tmp2 = cross( tfrm.getCol0(), tfrm.getCol1() );
-    detinv = ( 1.0f / dot( tfrm.getCol2(), tmp2 ) );
-    inv0 = Vector3( ( tmp0.getX() * detinv ), ( tmp1.getX() * detinv ), ( tmp2.getX() * detinv ) );
-    inv1 = Vector3( ( tmp0.getY() * detinv ), ( tmp1.getY() * detinv ), ( tmp2.getY() * detinv ) );
-    inv2 = Vector3( ( tmp0.getZ() * detinv ), ( tmp1.getZ() * detinv ), ( tmp2.getZ() * detinv ) );
-    return Transform3(
-        inv0,
-        inv1,
-        inv2,
-        Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
-    );
-inline const Transform3 orthoInverse( const Transform3 & tfrm )
-    Vector3 inv0, inv1, inv2;
-    inv0 = Vector3( tfrm.getCol0().getX(), tfrm.getCol1().getX(), tfrm.getCol2().getX() );
-    inv1 = Vector3( tfrm.getCol0().getY(), tfrm.getCol1().getY(), tfrm.getCol2().getY() );
-    inv2 = Vector3( tfrm.getCol0().getZ(), tfrm.getCol1().getZ(), tfrm.getCol2().getZ() );
-    return Transform3(
-        inv0,
-        inv1,
-        inv2,
-        Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
-    );
-inline const Transform3 absPerElem( const Transform3 & tfrm )
-    return Transform3(
-        absPerElem( tfrm.getCol0() ),
-        absPerElem( tfrm.getCol1() ),
-        absPerElem( tfrm.getCol2() ),
-        absPerElem( tfrm.getCol3() )
-    );
-inline const Vector3 Transform3::operator *( const Vector3 & vec ) const
-    return Vector3(
-        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
-        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
-        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
-    );
-inline const Point3 Transform3::operator *( const Point3 & pnt ) const
-    return Point3(
-        ( ( ( ( mCol0.getX() * pnt.getX() ) + ( mCol1.getX() * pnt.getY() ) ) + ( mCol2.getX() * pnt.getZ() ) ) + mCol3.getX() ),
-        ( ( ( ( mCol0.getY() * pnt.getX() ) + ( mCol1.getY() * pnt.getY() ) ) + ( mCol2.getY() * pnt.getZ() ) ) + mCol3.getY() ),
-        ( ( ( ( mCol0.getZ() * pnt.getX() ) + ( mCol1.getZ() * pnt.getY() ) ) + ( mCol2.getZ() * pnt.getZ() ) ) + mCol3.getZ() )
-    );
-inline const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
-    return Transform3(
-        ( *this * tfrm.mCol0 ),
-        ( *this * tfrm.mCol1 ),
-        ( *this * tfrm.mCol2 ),
-        Vector3( ( *this * Point3( tfrm.mCol3 ) ) )
-    );
-inline Transform3 & Transform3::operator *=( const Transform3 & tfrm )
-    *this = *this * tfrm;
-    return *this;
-inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
-    return Transform3(
-        mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ),
-        mulPerElem( tfrm0.getCol1(), tfrm1.getCol1() ),
-        mulPerElem( tfrm0.getCol2(), tfrm1.getCol2() ),
-        mulPerElem( tfrm0.getCol3(), tfrm1.getCol3() )
-    );
-inline const Transform3 Transform3::identity( )
-    return Transform3(
-        Vector3::xAxis( ),
-        Vector3::yAxis( ),
-        Vector3::zAxis( ),
-        Vector3( 0.0f )
-    );
-inline Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
-    mCol0 = tfrm.getCol0();
-    mCol1 = tfrm.getCol1();
-    mCol2 = tfrm.getCol2();
-    return *this;
-inline const Matrix3 Transform3::getUpper3x3( ) const
-    return Matrix3( mCol0, mCol1, mCol2 );
-inline Transform3 & Transform3::setTranslation( const Vector3 & translateVec )
-    mCol3 = translateVec;
-    return *this;
-inline const Vector3 Transform3::getTranslation( ) const
-    return mCol3;
-inline const Transform3 Transform3::rotationX( float radians )
-    float s, c;
-    s = sinf( radians );
-    c = cosf( radians );
-    return Transform3(
-        Vector3::xAxis( ),
-        Vector3( 0.0f, c, s ),
-        Vector3( 0.0f, -s, c ),
-        Vector3( 0.0f )
-    );
-inline const Transform3 Transform3::rotationY( float radians )
-    float s, c;
-    s = sinf( radians );
-    c = cosf( radians );
-    return Transform3(
-        Vector3( c, 0.0f, -s ),
-        Vector3::yAxis( ),
-        Vector3( s, 0.0f, c ),
-        Vector3( 0.0f )
-    );
-inline const Transform3 Transform3::rotationZ( float radians )
-    float s, c;
-    s = sinf( radians );
-    c = cosf( radians );
-    return Transform3(
-        Vector3( c, s, 0.0f ),
-        Vector3( -s, c, 0.0f ),
-        Vector3::zAxis( ),
-        Vector3( 0.0f )
-    );
-inline const Transform3 Transform3::rotationZYX( const Vector3 & radiansXYZ )
-    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
-    sX = sinf( radiansXYZ.getX() );
-    cX = cosf( radiansXYZ.getX() );
-    sY = sinf( radiansXYZ.getY() );
-    cY = cosf( radiansXYZ.getY() );
-    sZ = sinf( radiansXYZ.getZ() );
-    cZ = cosf( radiansXYZ.getZ() );
-    tmp0 = ( cZ * sY );
-    tmp1 = ( sZ * sY );
-    return Transform3(
-        Vector3( ( cZ * cY ), ( sZ * cY ), -sY ),
-        Vector3( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ) ),
-        Vector3( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ) ),
-        Vector3( 0.0f )
-    );
-inline const Transform3 Transform3::rotation( float radians, const Vector3 & unitVec )
-    return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) );
-inline const Transform3 Transform3::rotation( const Quat & unitQuat )
-    return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) );
-inline const Transform3 Transform3::scale( const Vector3 & scaleVec )
-    return Transform3(
-        Vector3( scaleVec.getX(), 0.0f, 0.0f ),
-        Vector3( 0.0f, scaleVec.getY(), 0.0f ),
-        Vector3( 0.0f, 0.0f, scaleVec.getZ() ),
-        Vector3( 0.0f )
-    );
-inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 & scaleVec )
-    return Transform3(
-        ( tfrm.getCol0() * scaleVec.getX( ) ),
-        ( tfrm.getCol1() * scaleVec.getY( ) ),
-        ( tfrm.getCol2() * scaleVec.getZ( ) ),
-        tfrm.getCol3()
-    );
-inline const Transform3 prependScale( const Vector3 & scaleVec, const Transform3 & tfrm )
-    return Transform3(
-        mulPerElem( tfrm.getCol0(), scaleVec ),
-        mulPerElem( tfrm.getCol1(), scaleVec ),
-        mulPerElem( tfrm.getCol2(), scaleVec ),
-        mulPerElem( tfrm.getCol3(), scaleVec )
-    );
-inline const Transform3 Transform3::translation( const Vector3 & translateVec )
-    return Transform3(
-        Vector3::xAxis( ),
-        Vector3::yAxis( ),
-        Vector3::zAxis( ),
-        translateVec
-    );
-inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
-    return Transform3(
-        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
-        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
-        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
-        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
-    );
-inline void print( const Transform3 & tfrm )
-    print( tfrm.getRow( 0 ) );
-    print( tfrm.getRow( 1 ) );
-    print( tfrm.getRow( 2 ) );
-inline void print( const Transform3 & tfrm, const char * name )
-    printf("%s:\n", name);
-    print( tfrm );
-inline Quat::Quat( const Matrix3 & tfrm )
-    float trace, radicand, scale, xx, yx, zx, xy, yy, zy, xz, yz, zz, tmpx, tmpy, tmpz, tmpw, qx, qy, qz, qw;
-    int negTrace, ZgtX, ZgtY, YgtX;
-    int largestXorY, largestYorZ, largestZorX;
-    xx = tfrm.getCol0().getX();
-    yx = tfrm.getCol0().getY();
-    zx = tfrm.getCol0().getZ();
-    xy = tfrm.getCol1().getX();
-    yy = tfrm.getCol1().getY();
-    zy = tfrm.getCol1().getZ();
-    xz = tfrm.getCol2().getX();
-    yz = tfrm.getCol2().getY();
-    zz = tfrm.getCol2().getZ();
-    trace = ( ( xx + yy ) + zz );
-    negTrace = ( trace < 0.0f );
-    ZgtX = zz > xx;
-    ZgtY = zz > yy;
-    YgtX = yy > xx;
-    largestXorY = ( !ZgtX || !ZgtY ) && negTrace;
-    largestYorZ = ( YgtX || ZgtX ) && negTrace;
-    largestZorX = ( ZgtY || !YgtX ) && negTrace;
-    if ( largestXorY )
-    {
-        zz = -zz;
-        xy = -xy;
-    }
-    if ( largestYorZ )
-    {
-        xx = -xx;
-        yz = -yz;
-    }
-    if ( largestZorX )
-    {
-        yy = -yy;
-        zx = -zx;
-    }
-    radicand = ( ( ( xx + yy ) + zz ) + 1.0f );
-    scale = ( 0.5f * ( 1.0f / sqrtf( radicand ) ) );
-    tmpx = ( ( zy - yz ) * scale );
-    tmpy = ( ( xz - zx ) * scale );
-    tmpz = ( ( yx - xy ) * scale );
-    tmpw = ( radicand * scale );
-    qx = tmpx;
-    qy = tmpy;
-    qz = tmpz;
-    qw = tmpw;
-    if ( largestXorY )
-    {
-        qx = tmpw;
-        qy = tmpz;
-        qz = tmpy;
-        qw = tmpx;
-    }
-    if ( largestYorZ )
-    {
-        tmpx = qx;
-        tmpz = qz;
-        qx = qy;
-        qy = tmpx;
-        qz = qw;
-        qw = tmpz;
-    }
-    mX = qx;
-    mY = qy;
-    mZ = qz;
-    mW = qw;
-inline const Matrix3 outer( const Vector3 & tfrm0, const Vector3 & tfrm1 )
-    return Matrix3(
-        ( tfrm0 * tfrm1.getX( ) ),
-        ( tfrm0 * tfrm1.getY( ) ),
-        ( tfrm0 * tfrm1.getZ( ) )
-    );
-inline const Matrix4 outer( const Vector4 & tfrm0, const Vector4 & tfrm1 )
-    return Matrix4(
-        ( tfrm0 * tfrm1.getX( ) ),
-        ( tfrm0 * tfrm1.getY( ) ),
-        ( tfrm0 * tfrm1.getZ( ) ),
-        ( tfrm0 * tfrm1.getW( ) )
-    );
-inline const Vector3 rowMul( const Vector3 & vec, const Matrix3 & mat )
-    return Vector3(
-        ( ( ( vec.getX() * mat.getCol0().getX() ) + ( vec.getY() * mat.getCol0().getY() ) ) + ( vec.getZ() * mat.getCol0().getZ() ) ),
-        ( ( ( vec.getX() * mat.getCol1().getX() ) + ( vec.getY() * mat.getCol1().getY() ) ) + ( vec.getZ() * mat.getCol1().getZ() ) ),
-        ( ( ( vec.getX() * mat.getCol2().getX() ) + ( vec.getY() * mat.getCol2().getY() ) ) + ( vec.getZ() * mat.getCol2().getZ() ) )
-    );
-inline const Matrix3 crossMatrix( const Vector3 & vec )
-    return Matrix3(
-        Vector3( 0.0f, vec.getZ(), -vec.getY() ),
-        Vector3( -vec.getZ(), 0.0f, vec.getX() ),
-        Vector3( vec.getY(), -vec.getX(), 0.0f )
-    );
-inline const Matrix3 crossMatrixMul( const Vector3 & vec, const Matrix3 & mat )
-    return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) );
-} // namespace Aos
-} // namespace Vectormath
diff --git a/src/bullet/vectormath/scalar/quat_aos.h b/src/bullet/vectormath/scalar/quat_aos.h
deleted file mode 100644
index 764e0170..00000000
--- a/src/bullet/vectormath/scalar/quat_aos.h
+++ /dev/null
@@ -1,433 +0,0 @@
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-// Definitions
-namespace Vectormath {
-namespace Aos {
-inline Quat::Quat( const Quat & quat )
-    mX = quat.mX;
-    mY = quat.mY;
-    mZ = quat.mZ;
-    mW = quat.mW;
-inline Quat::Quat( float _x, float _y, float _z, float _w )
-    mX = _x;
-    mY = _y;
-    mZ = _z;
-    mW = _w;
-inline Quat::Quat( const Vector3 & xyz, float _w )
-    this->setXYZ( xyz );
-    this->setW( _w );
-inline Quat::Quat( const Vector4 & vec )
-    mX = vec.getX();
-    mY = vec.getY();
-    mZ = vec.getZ();
-    mW = vec.getW();
-inline Quat::Quat( float scalar )
-    mX = scalar;
-    mY = scalar;
-    mZ = scalar;
-    mW = scalar;
-inline const Quat Quat::identity( )
-    return Quat( 0.0f, 0.0f, 0.0f, 1.0f );
-inline const Quat lerp( float t, const Quat & quat0, const Quat & quat1 )
-    return ( quat0 + ( ( quat1 - quat0 ) * t ) );
-inline const Quat slerp( float t, const Quat & unitQuat0, const Quat & unitQuat1 )
-    Quat start;
-    float recipSinAngle, scale0, scale1, cosAngle, angle;
-    cosAngle = dot( unitQuat0, unitQuat1 );
-    if ( cosAngle < 0.0f ) {
-        cosAngle = -cosAngle;
-        start = ( -unitQuat0 );
-    } else {
-        start = unitQuat0;
-    }
-    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
-        angle = acosf( cosAngle );
-        recipSinAngle = ( 1.0f / sinf( angle ) );
-        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
-        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
-    } else {
-        scale0 = ( 1.0f - t );
-        scale1 = t;
-    }
-    return ( ( start * scale0 ) + ( unitQuat1 * scale1 ) );
-inline const Quat squad( float t, const Quat & unitQuat0, const Quat & unitQuat1, const Quat & unitQuat2, const Quat & unitQuat3 )
-    Quat tmp0, tmp1;
-    tmp0 = slerp( t, unitQuat0, unitQuat3 );
-    tmp1 = slerp( t, unitQuat1, unitQuat2 );
-    return slerp( ( ( 2.0f * t ) * ( 1.0f - t ) ), tmp0, tmp1 );
-inline void loadXYZW( Quat & quat, const float * fptr )
-    quat = Quat( fptr[0], fptr[1], fptr[2], fptr[3] );
-inline void storeXYZW( const Quat & quat, float * fptr )
-    fptr[0] = quat.getX();
-    fptr[1] = quat.getY();
-    fptr[2] = quat.getZ();
-    fptr[3] = quat.getW();
-inline Quat & Quat::operator =( const Quat & quat )
-    mX = quat.mX;
-    mY = quat.mY;
-    mZ = quat.mZ;
-    mW = quat.mW;
-    return *this;
-inline Quat & Quat::setXYZ( const Vector3 & vec )
-    mX = vec.getX();
-    mY = vec.getY();
-    mZ = vec.getZ();
-    return *this;
-inline const Vector3 Quat::getXYZ( ) const
-    return Vector3( mX, mY, mZ );
-inline Quat & Quat::setX( float _x )
-    mX = _x;
-    return *this;
-inline float Quat::getX( ) const
-    return mX;
-inline Quat & Quat::setY( float _y )
-    mY = _y;
-    return *this;
-inline float Quat::getY( ) const
-    return mY;
-inline Quat & Quat::setZ( float _z )
-    mZ = _z;
-    return *this;
-inline float Quat::getZ( ) const
-    return mZ;
-inline Quat & Quat::setW( float _w )
-    mW = _w;
-    return *this;
-inline float Quat::getW( ) const
-    return mW;
-inline Quat & Quat::setElem( int idx, float value )
-    *(&mX + idx) = value;
-    return *this;
-inline float Quat::getElem( int idx ) const
-    return *(&mX + idx);
-inline float & Quat::operator []( int idx )
-    return *(&mX + idx);
-inline float Quat::operator []( int idx ) const
-    return *(&mX + idx);
-inline const Quat Quat::operator +( const Quat & quat ) const
-    return Quat(
-        ( mX + quat.mX ),
-        ( mY + quat.mY ),
-        ( mZ + quat.mZ ),
-        ( mW + quat.mW )
-    );
-inline const Quat Quat::operator -( const Quat & quat ) const
-    return Quat(
-        ( mX - quat.mX ),
-        ( mY - quat.mY ),
-        ( mZ - quat.mZ ),
-        ( mW - quat.mW )
-    );
-inline const Quat Quat::operator *( float scalar ) const
-    return Quat(
-        ( mX * scalar ),
-        ( mY * scalar ),
-        ( mZ * scalar ),
-        ( mW * scalar )
-    );
-inline Quat & Quat::operator +=( const Quat & quat )
-    *this = *this + quat;
-    return *this;
-inline Quat & Quat::operator -=( const Quat & quat )
-    *this = *this - quat;
-    return *this;
-inline Quat & Quat::operator *=( float scalar )
-    *this = *this * scalar;
-    return *this;
-inline const Quat Quat::operator /( float scalar ) const
-    return Quat(
-        ( mX / scalar ),
-        ( mY / scalar ),
-        ( mZ / scalar ),
-        ( mW / scalar )
-    );
-inline Quat & Quat::operator /=( float scalar )
-    *this = *this / scalar;
-    return *this;
-inline const Quat Quat::operator -( ) const
-    return Quat(
-        -mX,
-        -mY,
-        -mZ,
-        -mW
-    );
-inline const Quat operator *( float scalar, const Quat & quat )
-    return quat * scalar;
-inline float dot( const Quat & quat0, const Quat & quat1 )
-    float result;
-    result = ( quat0.getX() * quat1.getX() );
-    result = ( result + ( quat0.getY() * quat1.getY() ) );
-    result = ( result + ( quat0.getZ() * quat1.getZ() ) );
-    result = ( result + ( quat0.getW() * quat1.getW() ) );
-    return result;
-inline float norm( const Quat & quat )
-    float result;
-    result = ( quat.getX() * quat.getX() );
-    result = ( result + ( quat.getY() * quat.getY() ) );
-    result = ( result + ( quat.getZ() * quat.getZ() ) );
-    result = ( result + ( quat.getW() * quat.getW() ) );
-    return result;
-inline float length( const Quat & quat )
-    return ::sqrtf( norm( quat ) );
-inline const Quat normalize( const Quat & quat )
-    float lenSqr, lenInv;
-    lenSqr = norm( quat );
-    lenInv = ( 1.0f / sqrtf( lenSqr ) );
-    return Quat(
-        ( quat.getX() * lenInv ),
-        ( quat.getY() * lenInv ),
-        ( quat.getZ() * lenInv ),
-        ( quat.getW() * lenInv )
-    );
-inline const Quat Quat::rotation( const Vector3 & unitVec0, const Vector3 & unitVec1 )
-    float cosHalfAngleX2, recipCosHalfAngleX2;
-    cosHalfAngleX2 = sqrtf( ( 2.0f * ( 1.0f + dot( unitVec0, unitVec1 ) ) ) );
-    recipCosHalfAngleX2 = ( 1.0f / cosHalfAngleX2 );
-    return Quat( ( cross( unitVec0, unitVec1 ) * recipCosHalfAngleX2 ), ( cosHalfAngleX2 * 0.5f ) );
-inline const Quat Quat::rotation( float radians, const Vector3 & unitVec )
-    float s, c, angle;
-    angle = ( radians * 0.5f );
-    s = sinf( angle );
-    c = cosf( angle );
-    return Quat( ( unitVec * s ), c );
-inline const Quat Quat::rotationX( float radians )
-    float s, c, angle;
-    angle = ( radians * 0.5f );
-    s = sinf( angle );
-    c = cosf( angle );
-    return Quat( s, 0.0f, 0.0f, c );
-inline const Quat Quat::rotationY( float radians )
-    float s, c, angle;
-    angle = ( radians * 0.5f );
-    s = sinf( angle );
-    c = cosf( angle );
-    return Quat( 0.0f, s, 0.0f, c );
-inline const Quat Quat::rotationZ( float radians )
-    float s, c, angle;
-    angle = ( radians * 0.5f );
-    s = sinf( angle );
-    c = cosf( angle );
-    return Quat( 0.0f, 0.0f, s, c );
-inline const Quat Quat::operator *( const Quat & quat ) const
-    return Quat(
-        ( ( ( ( mW * quat.mX ) + ( mX * quat.mW ) ) + ( mY * quat.mZ ) ) - ( mZ * quat.mY ) ),
-        ( ( ( ( mW * quat.mY ) + ( mY * quat.mW ) ) + ( mZ * quat.mX ) ) - ( mX * quat.mZ ) ),
-        ( ( ( ( mW * quat.mZ ) + ( mZ * quat.mW ) ) + ( mX * quat.mY ) ) - ( mY * quat.mX ) ),
-        ( ( ( ( mW * quat.mW ) - ( mX * quat.mX ) ) - ( mY * quat.mY ) ) - ( mZ * quat.mZ ) )
-    );
-inline Quat & Quat::operator *=( const Quat & quat )
-    *this = *this * quat;
-    return *this;
-inline const Vector3 rotate( const Quat & quat, const Vector3 & vec )
-    float tmpX, tmpY, tmpZ, tmpW;
-    tmpX = ( ( ( quat.getW() * vec.getX() ) + ( quat.getY() * vec.getZ() ) ) - ( quat.getZ() * vec.getY() ) );
-    tmpY = ( ( ( quat.getW() * vec.getY() ) + ( quat.getZ() * vec.getX() ) ) - ( quat.getX() * vec.getZ() ) );
-    tmpZ = ( ( ( quat.getW() * vec.getZ() ) + ( quat.getX() * vec.getY() ) ) - ( quat.getY() * vec.getX() ) );
-    tmpW = ( ( ( quat.getX() * vec.getX() ) + ( quat.getY() * vec.getY() ) ) + ( quat.getZ() * vec.getZ() ) );
-    return Vector3(
-        ( ( ( ( tmpW * quat.getX() ) + ( tmpX * quat.getW() ) ) - ( tmpY * quat.getZ() ) ) + ( tmpZ * quat.getY() ) ),
-        ( ( ( ( tmpW * quat.getY() ) + ( tmpY * quat.getW() ) ) - ( tmpZ * quat.getX() ) ) + ( tmpX * quat.getZ() ) ),
-        ( ( ( ( tmpW * quat.getZ() ) + ( tmpZ * quat.getW() ) ) - ( tmpX * quat.getY() ) ) + ( tmpY * quat.getX() ) )
-    );
-inline const Quat conj( const Quat & quat )
-    return Quat( -quat.getX(), -quat.getY(), -quat.getZ(), quat.getW() );
-inline const Quat select( const Quat & quat0, const Quat & quat1, bool select1 )
-    return Quat(
-        ( select1 )? quat1.getX() : quat0.getX(),
-        ( select1 )? quat1.getY() : quat0.getY(),
-        ( select1 )? quat1.getZ() : quat0.getZ(),
-        ( select1 )? quat1.getW() : quat0.getW()
-    );
-inline void print( const Quat & quat )
-    printf( "( %f %f %f %f )\n", quat.getX(), quat.getY(), quat.getZ(), quat.getW() );
-inline void print( const Quat & quat, const char * name )
-    printf( "%s: ( %f %f %f %f )\n", name, quat.getX(), quat.getY(), quat.getZ(), quat.getW() );
-} // namespace Aos
-} // namespace Vectormath
diff --git a/src/bullet/vectormath/scalar/vec_aos.h b/src/bullet/vectormath/scalar/vec_aos.h
deleted file mode 100644
index 46d4d6b3..00000000
--- a/src/bullet/vectormath/scalar/vec_aos.h
+++ /dev/null
@@ -1,1426 +0,0 @@
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-// Constants
-#define _VECTORMATH_SLERP_TOL 0.999f
-// Definitions
-namespace Vectormath {
-namespace Aos {
-inline Vector3::Vector3( const Vector3 & vec )
-    mX = vec.mX;
-    mY = vec.mY;
-    mZ = vec.mZ;
-inline Vector3::Vector3( float _x, float _y, float _z )
-    mX = _x;
-    mY = _y;
-    mZ = _z;
-inline Vector3::Vector3( const Point3 & pnt )
-    mX = pnt.getX();
-    mY = pnt.getY();
-    mZ = pnt.getZ();
-inline Vector3::Vector3( float scalar )
-    mX = scalar;
-    mY = scalar;
-    mZ = scalar;
-inline const Vector3 Vector3::xAxis( )
-    return Vector3( 1.0f, 0.0f, 0.0f );
-inline const Vector3 Vector3::yAxis( )
-    return Vector3( 0.0f, 1.0f, 0.0f );
-inline const Vector3 Vector3::zAxis( )
-    return Vector3( 0.0f, 0.0f, 1.0f );
-inline const Vector3 lerp( float t, const Vector3 & vec0, const Vector3 & vec1 )
-    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
-inline const Vector3 slerp( float t, const Vector3 & unitVec0, const Vector3 & unitVec1 )
-    float recipSinAngle, scale0, scale1, cosAngle, angle;
-    cosAngle = dot( unitVec0, unitVec1 );
-    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
-        angle = acosf( cosAngle );
-        recipSinAngle = ( 1.0f / sinf( angle ) );
-        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
-        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
-    } else {
-        scale0 = ( 1.0f - t );
-        scale1 = t;
-    }
-    return ( ( unitVec0 * scale0 ) + ( unitVec1 * scale1 ) );
-inline void loadXYZ( Vector3 & vec, const float * fptr )
-    vec = Vector3( fptr[0], fptr[1], fptr[2] );
-inline void storeXYZ( const Vector3 & vec, float * fptr )
-    fptr[0] = vec.getX();
-    fptr[1] = vec.getY();
-    fptr[2] = vec.getZ();
-inline void loadHalfFloats( Vector3 & vec, const unsigned short * hfptr )
-    union Data32 {
-        unsigned int u32;
-        float f32;
-    };
-    for (int i = 0; i < 3; i++) {
-        unsigned short fp16 = hfptr[i];
-        unsigned int sign = fp16 >> 15;
-        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
-        unsigned int mantissa = fp16 & ((1 << 10) - 1);
-        if (exponent == 0) {
-            // zero
-            mantissa = 0;
-        } else if (exponent == 31) {
-            // infinity or nan -> infinity
-            exponent = 255;
-	    mantissa = 0;
-        } else {
-            exponent += 127 - 15;
-            mantissa <<= 13;
-        }
-        Data32 d;
-        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
-        vec[i] = d.f32;
-    }
-inline void storeHalfFloats( const Vector3 & vec, unsigned short * hfptr )
-    union Data32 {
-        unsigned int u32;
-        float f32;
-    };
-    for (int i = 0; i < 3; i++) {
-        Data32 d;
-        d.f32 = vec[i];
-        unsigned int sign = d.u32 >> 31;
-        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
-        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
-        if (exponent == 0) {
-            // zero or denorm -> zero
-            mantissa = 0;
-        } else if (exponent == 255 && mantissa != 0) {
-            // nan -> infinity
-            exponent = 31;
-            mantissa = 0;
-        } else if (exponent >= 127 - 15 + 31) {
-            // overflow or infinity -> infinity
-            exponent = 31;
-            mantissa = 0;
-        } else if (exponent <= 127 - 15) {
-            // underflow -> zero
-            exponent = 0;
-            mantissa = 0;
-        } else {
-            exponent -= 127 - 15;
-            mantissa >>= 13;
-        }
-        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
-    }
-inline Vector3 & Vector3::operator =( const Vector3 & vec )
-    mX = vec.mX;
-    mY = vec.mY;
-    mZ = vec.mZ;
-    return *this;
-inline Vector3 & Vector3::setX( float _x )
-    mX = _x;
-    return *this;
-inline float Vector3::getX( ) const
-    return mX;
-inline Vector3 & Vector3::setY( float _y )
-    mY = _y;
-    return *this;
-inline float Vector3::getY( ) const
-    return mY;
-inline Vector3 & Vector3::setZ( float _z )
-    mZ = _z;
-    return *this;
-inline float Vector3::getZ( ) const
-    return mZ;
-inline Vector3 & Vector3::setElem( int idx, float value )
-    *(&mX + idx) = value;
-    return *this;
-inline float Vector3::getElem( int idx ) const
-    return *(&mX + idx);
-inline float & Vector3::operator []( int idx )
-    return *(&mX + idx);
-inline float Vector3::operator []( int idx ) const
-    return *(&mX + idx);
-inline const Vector3 Vector3::operator +( const Vector3 & vec ) const
-    return Vector3(
-        ( mX + vec.mX ),
-        ( mY + vec.mY ),
-        ( mZ + vec.mZ )
-    );
-inline const Vector3 Vector3::operator -( const Vector3 & vec ) const
-    return Vector3(
-        ( mX - vec.mX ),
-        ( mY - vec.mY ),
-        ( mZ - vec.mZ )
-    );
-inline const Point3 Vector3::operator +( const Point3 & pnt ) const
-    return Point3(
-        ( mX + pnt.getX() ),
-        ( mY + pnt.getY() ),
-        ( mZ + pnt.getZ() )
-    );
-inline const Vector3 Vector3::operator *( float scalar ) const
-    return Vector3(
-        ( mX * scalar ),
-        ( mY * scalar ),
-        ( mZ * scalar )
-    );
-inline Vector3 & Vector3::operator +=( const Vector3 & vec )
-    *this = *this + vec;
-    return *this;
-inline Vector3 & Vector3::operator -=( const Vector3 & vec )
-    *this = *this - vec;
-    return *this;
-inline Vector3 & Vector3::operator *=( float scalar )
-    *this = *this * scalar;
-    return *this;
-inline const Vector3 Vector3::operator /( float scalar ) const
-    return Vector3(
-        ( mX / scalar ),
-        ( mY / scalar ),
-        ( mZ / scalar )
-    );
-inline Vector3 & Vector3::operator /=( float scalar )
-    *this = *this / scalar;
-    return *this;
-inline const Vector3 Vector3::operator -( ) const
-    return Vector3(
-        -mX,
-        -mY,
-        -mZ
-    );
-inline const Vector3 operator *( float scalar, const Vector3 & vec )
-    return vec * scalar;
-inline const Vector3 mulPerElem( const Vector3 & vec0, const Vector3 & vec1 )
-    return Vector3(
-        ( vec0.getX() * vec1.getX() ),
-        ( vec0.getY() * vec1.getY() ),
-        ( vec0.getZ() * vec1.getZ() )
-    );
-inline const Vector3 divPerElem( const Vector3 & vec0, const Vector3 & vec1 )
-    return Vector3(
-        ( vec0.getX() / vec1.getX() ),
-        ( vec0.getY() / vec1.getY() ),
-        ( vec0.getZ() / vec1.getZ() )
-    );
-inline const Vector3 recipPerElem( const Vector3 & vec )
-    return Vector3(
-        ( 1.0f / vec.getX() ),
-        ( 1.0f / vec.getY() ),
-        ( 1.0f / vec.getZ() )
-    );
-inline const Vector3 sqrtPerElem( const Vector3 & vec )
-    return Vector3(
-        sqrtf( vec.getX() ),
-        sqrtf( vec.getY() ),
-        sqrtf( vec.getZ() )
-    );
-inline const Vector3 rsqrtPerElem( const Vector3 & vec )
-    return Vector3(
-        ( 1.0f / sqrtf( vec.getX() ) ),
-        ( 1.0f / sqrtf( vec.getY() ) ),
-        ( 1.0f / sqrtf( vec.getZ() ) )
-    );
-inline const Vector3 absPerElem( const Vector3 & vec )
-    return Vector3(
-        fabsf( vec.getX() ),
-        fabsf( vec.getY() ),
-        fabsf( vec.getZ() )
-    );
-inline const Vector3 copySignPerElem( const Vector3 & vec0, const Vector3 & vec1 )
-    return Vector3(
-        ( vec1.getX() < 0.0f )? -fabsf( vec0.getX() ) : fabsf( vec0.getX() ),
-        ( vec1.getY() < 0.0f )? -fabsf( vec0.getY() ) : fabsf( vec0.getY() ),
-        ( vec1.getZ() < 0.0f )? -fabsf( vec0.getZ() ) : fabsf( vec0.getZ() )
-    );
-inline const Vector3 maxPerElem( const Vector3 & vec0, const Vector3 & vec1 )
-    return Vector3(
-        (vec0.getX() > vec1.getX())? vec0.getX() : vec1.getX(),
-        (vec0.getY() > vec1.getY())? vec0.getY() : vec1.getY(),
-        (vec0.getZ() > vec1.getZ())? vec0.getZ() : vec1.getZ()
-    );
-inline float maxElem( const Vector3 & vec )
-    float result;
-    result = (vec.getX() > vec.getY())? vec.getX() : vec.getY();
-    result = (vec.getZ() > result)? vec.getZ() : result;
-    return result;
-inline const Vector3 minPerElem( const Vector3 & vec0, const Vector3 & vec1 )
-    return Vector3(
-        (vec0.getX() < vec1.getX())? vec0.getX() : vec1.getX(),
-        (vec0.getY() < vec1.getY())? vec0.getY() : vec1.getY(),
-        (vec0.getZ() < vec1.getZ())? vec0.getZ() : vec1.getZ()
-    );
-inline float minElem( const Vector3 & vec )
-    float result;
-    result = (vec.getX() < vec.getY())? vec.getX() : vec.getY();
-    result = (vec.getZ() < result)? vec.getZ() : result;
-    return result;
-inline float sum( const Vector3 & vec )
-    float result;
-    result = ( vec.getX() + vec.getY() );
-    result = ( result + vec.getZ() );
-    return result;
-inline float dot( const Vector3 & vec0, const Vector3 & vec1 )
-    float result;
-    result = ( vec0.getX() * vec1.getX() );
-    result = ( result + ( vec0.getY() * vec1.getY() ) );
-    result = ( result + ( vec0.getZ() * vec1.getZ() ) );
-    return result;
-inline float lengthSqr( const Vector3 & vec )
-    float result;
-    result = ( vec.getX() * vec.getX() );
-    result = ( result + ( vec.getY() * vec.getY() ) );
-    result = ( result + ( vec.getZ() * vec.getZ() ) );
-    return result;
-inline float length( const Vector3 & vec )
-    return ::sqrtf( lengthSqr( vec ) );
-inline const Vector3 normalize( const Vector3 & vec )
-    float lenSqr, lenInv;
-    lenSqr = lengthSqr( vec );
-    lenInv = ( 1.0f / sqrtf( lenSqr ) );
-    return Vector3(
-        ( vec.getX() * lenInv ),
-        ( vec.getY() * lenInv ),
-        ( vec.getZ() * lenInv )
-    );
-inline const Vector3 cross( const Vector3 & vec0, const Vector3 & vec1 )
-    return Vector3(
-        ( ( vec0.getY() * vec1.getZ() ) - ( vec0.getZ() * vec1.getY() ) ),
-        ( ( vec0.getZ() * vec1.getX() ) - ( vec0.getX() * vec1.getZ() ) ),
-        ( ( vec0.getX() * vec1.getY() ) - ( vec0.getY() * vec1.getX() ) )
-    );
-inline const Vector3 select( const Vector3 & vec0, const Vector3 & vec1, bool select1 )
-    return Vector3(
-        ( select1 )? vec1.getX() : vec0.getX(),
-        ( select1 )? vec1.getY() : vec0.getY(),
-        ( select1 )? vec1.getZ() : vec0.getZ()
-    );
-inline void print( const Vector3 & vec )
-    printf( "( %f %f %f )\n", vec.getX(), vec.getY(), vec.getZ() );
-inline void print( const Vector3 & vec, const char * name )
-    printf( "%s: ( %f %f %f )\n", name, vec.getX(), vec.getY(), vec.getZ() );
-inline Vector4::Vector4( const Vector4 & vec )
-    mX = vec.mX;
-    mY = vec.mY;
-    mZ = vec.mZ;
-    mW = vec.mW;
-inline Vector4::Vector4( float _x, float _y, float _z, float _w )
-    mX = _x;
-    mY = _y;
-    mZ = _z;
-    mW = _w;
-inline Vector4::Vector4( const Vector3 & xyz, float _w )
-    this->setXYZ( xyz );
-    this->setW( _w );
-inline Vector4::Vector4( const Vector3 & vec )
-    mX = vec.getX();
-    mY = vec.getY();
-    mZ = vec.getZ();
-    mW = 0.0f;
-inline Vector4::Vector4( const Point3 & pnt )
-    mX = pnt.getX();
-    mY = pnt.getY();
-    mZ = pnt.getZ();
-    mW = 1.0f;
-inline Vector4::Vector4( const Quat & quat )
-    mX = quat.getX();
-    mY = quat.getY();
-    mZ = quat.getZ();
-    mW = quat.getW();
-inline Vector4::Vector4( float scalar )
-    mX = scalar;
-    mY = scalar;
-    mZ = scalar;
-    mW = scalar;
-inline const Vector4 Vector4::xAxis( )
-    return Vector4( 1.0f, 0.0f, 0.0f, 0.0f );
-inline const Vector4 Vector4::yAxis( )
-    return Vector4( 0.0f, 1.0f, 0.0f, 0.0f );
-inline const Vector4 Vector4::zAxis( )
-    return Vector4( 0.0f, 0.0f, 1.0f, 0.0f );
-inline const Vector4 Vector4::wAxis( )
-    return Vector4( 0.0f, 0.0f, 0.0f, 1.0f );
-inline const Vector4 lerp( float t, const Vector4 & vec0, const Vector4 & vec1 )
-    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
-inline const Vector4 slerp( float t, const Vector4 & unitVec0, const Vector4 & unitVec1 )
-    float recipSinAngle, scale0, scale1, cosAngle, angle;
-    cosAngle = dot( unitVec0, unitVec1 );
-    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
-        angle = acosf( cosAngle );
-        recipSinAngle = ( 1.0f / sinf( angle ) );
-        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
-        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
-    } else {
-        scale0 = ( 1.0f - t );
-        scale1 = t;
-    }
-    return ( ( unitVec0 * scale0 ) + ( unitVec1 * scale1 ) );
-inline void loadXYZW( Vector4 & vec, const float * fptr )
-    vec = Vector4( fptr[0], fptr[1], fptr[2], fptr[3] );
-inline void storeXYZW( const Vector4 & vec, float * fptr )
-    fptr[0] = vec.getX();
-    fptr[1] = vec.getY();
-    fptr[2] = vec.getZ();
-    fptr[3] = vec.getW();
-inline void loadHalfFloats( Vector4 & vec, const unsigned short * hfptr )
-    union Data32 {
-        unsigned int u32;
-        float f32;
-    };
-    for (int i = 0; i < 4; i++) {
-        unsigned short fp16 = hfptr[i];
-        unsigned int sign = fp16 >> 15;
-        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
-        unsigned int mantissa = fp16 & ((1 << 10) - 1);
-        if (exponent == 0) {
-            // zero
-            mantissa = 0;
-        } else if (exponent == 31) {
-            // infinity or nan -> infinity
-            exponent = 255;
-	    mantissa = 0;
-        } else {
-            exponent += 127 - 15;
-            mantissa <<= 13;
-        }
-        Data32 d;
-        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
-        vec[i] = d.f32;
-    }
-inline void storeHalfFloats( const Vector4 & vec, unsigned short * hfptr )
-    union Data32 {
-        unsigned int u32;
-        float f32;
-    };
-    for (int i = 0; i < 4; i++) {
-        Data32 d;
-        d.f32 = vec[i];
-        unsigned int sign = d.u32 >> 31;
-        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
-        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
-        if (exponent == 0) {
-            // zero or denorm -> zero
-            mantissa = 0;
-        } else if (exponent == 255 && mantissa != 0) {
-            // nan -> infinity
-            exponent = 31;
-            mantissa = 0;
-        } else if (exponent >= 127 - 15 + 31) {
-            // overflow or infinity -> infinity
-            exponent = 31;
-            mantissa = 0;
-        } else if (exponent <= 127 - 15) {
-            // underflow -> zero
-            exponent = 0;
-            mantissa = 0;
-        } else {
-            exponent -= 127 - 15;
-            mantissa >>= 13;
-        }
-        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
-    }
-inline Vector4 & Vector4::operator =( const Vector4 & vec )
-    mX = vec.mX;
-    mY = vec.mY;
-    mZ = vec.mZ;
-    mW = vec.mW;
-    return *this;
-inline Vector4 & Vector4::setXYZ( const Vector3 & vec )
-    mX = vec.getX();
-    mY = vec.getY();
-    mZ = vec.getZ();
-    return *this;
-inline const Vector3 Vector4::getXYZ( ) const
-    return Vector3( mX, mY, mZ );
-inline Vector4 & Vector4::setX( float _x )
-    mX = _x;
-    return *this;
-inline float Vector4::getX( ) const
-    return mX;
-inline Vector4 & Vector4::setY( float _y )
-    mY = _y;
-    return *this;
-inline float Vector4::getY( ) const
-    return mY;
-inline Vector4 & Vector4::setZ( float _z )
-    mZ = _z;
-    return *this;
-inline float Vector4::getZ( ) const
-    return mZ;
-inline Vector4 & Vector4::setW( float _w )
-    mW = _w;
-    return *this;
-inline float Vector4::getW( ) const
-    return mW;
-inline Vector4 & Vector4::setElem( int idx, float value )
-    *(&mX + idx) = value;
-    return *this;
-inline float Vector4::getElem( int idx ) const
-    return *(&mX + idx);
-inline float & Vector4::operator []( int idx )
-    return *(&mX + idx);
-inline float Vector4::operator []( int idx ) const
-    return *(&mX + idx);
-inline const Vector4 Vector4::operator +( const Vector4 & vec ) const
-    return Vector4(
-        ( mX + vec.mX ),
-        ( mY + vec.mY ),
-        ( mZ + vec.mZ ),
-        ( mW + vec.mW )
-    );
-inline const Vector4 Vector4::operator -( const Vector4 & vec ) const
-    return Vector4(
-        ( mX - vec.mX ),
-        ( mY - vec.mY ),
-        ( mZ - vec.mZ ),
-        ( mW - vec.mW )
-    );
-inline const Vector4 Vector4::operator *( float scalar ) const
-    return Vector4(
-        ( mX * scalar ),
-        ( mY * scalar ),
-        ( mZ * scalar ),
-        ( mW * scalar )
-    );
-inline Vector4 & Vector4::operator +=( const Vector4 & vec )
-    *this = *this + vec;
-    return *this;
-inline Vector4 & Vector4::operator -=( const Vector4 & vec )
-    *this = *this - vec;
-    return *this;
-inline Vector4 & Vector4::operator *=( float scalar )
-    *this = *this * scalar;
-    return *this;
-inline const Vector4 Vector4::operator /( float scalar ) const
-    return Vector4(
-        ( mX / scalar ),
-        ( mY / scalar ),
-        ( mZ / scalar ),
-        ( mW / scalar )
-    );
-inline Vector4 & Vector4::operator /=( float scalar )
-    *this = *this / scalar;
-    return *this;
-inline const Vector4 Vector4::operator -( ) const
-    return Vector4(
-        -mX,
-        -mY,
-        -mZ,
-        -mW
-    );
-inline const Vector4 operator *( float scalar, const Vector4 & vec )
-    return vec * scalar;
-inline const Vector4 mulPerElem( const Vector4 & vec0, const Vector4 & vec1 )
-    return Vector4(
-        ( vec0.getX() * vec1.getX() ),
-        ( vec0.getY() * vec1.getY() ),
-        ( vec0.getZ() * vec1.getZ() ),
-        ( vec0.getW() * vec1.getW() )
-    );
-inline const Vector4 divPerElem( const Vector4 & vec0, const Vector4 & vec1 )
-    return Vector4(
-        ( vec0.getX() / vec1.getX() ),
-        ( vec0.getY() / vec1.getY() ),
-        ( vec0.getZ() / vec1.getZ() ),
-        ( vec0.getW() / vec1.getW() )
-    );
-inline const Vector4 recipPerElem( const Vector4 & vec )
-    return Vector4(
-        ( 1.0f / vec.getX() ),
-        ( 1.0f / vec.getY() ),
-        ( 1.0f / vec.getZ() ),
-        ( 1.0f / vec.getW() )
-    );
-inline const Vector4 sqrtPerElem( const Vector4 & vec )
-    return Vector4(
-        sqrtf( vec.getX() ),
-        sqrtf( vec.getY() ),
-        sqrtf( vec.getZ() ),
-        sqrtf( vec.getW() )
-    );
-inline const Vector4 rsqrtPerElem( const Vector4 & vec )
-    return Vector4(
-        ( 1.0f / sqrtf( vec.getX() ) ),
-        ( 1.0f / sqrtf( vec.getY() ) ),
-        ( 1.0f / sqrtf( vec.getZ() ) ),
-        ( 1.0f / sqrtf( vec.getW() ) )
-    );
-inline const Vector4 absPerElem( const Vector4 & vec )
-    return Vector4(
-        fabsf( vec.getX() ),
-        fabsf( vec.getY() ),
-        fabsf( vec.getZ() ),
-        fabsf( vec.getW() )
-    );
-inline const Vector4 copySignPerElem( const Vector4 & vec0, const Vector4 & vec1 )
-    return Vector4(
-        ( vec1.getX() < 0.0f )? -fabsf( vec0.getX() ) : fabsf( vec0.getX() ),
-        ( vec1.getY() < 0.0f )? -fabsf( vec0.getY() ) : fabsf( vec0.getY() ),
-        ( vec1.getZ() < 0.0f )? -fabsf( vec0.getZ() ) : fabsf( vec0.getZ() ),
-        ( vec1.getW() < 0.0f )? -fabsf( vec0.getW() ) : fabsf( vec0.getW() )
-    );
-inline const Vector4 maxPerElem( const Vector4 & vec0, const Vector4 & vec1 )
-    return Vector4(
-        (vec0.getX() > vec1.getX())? vec0.getX() : vec1.getX(),
-        (vec0.getY() > vec1.getY())? vec0.getY() : vec1.getY(),
-        (vec0.getZ() > vec1.getZ())? vec0.getZ() : vec1.getZ(),
-        (vec0.getW() > vec1.getW())? vec0.getW() : vec1.getW()
-    );
-inline float maxElem( const Vector4 & vec )
-    float result;
-    result = (vec.getX() > vec.getY())? vec.getX() : vec.getY();
-    result = (vec.getZ() > result)? vec.getZ() : result;
-    result = (vec.getW() > result)? vec.getW() : result;
-    return result;
-inline const Vector4 minPerElem( const Vector4 & vec0, const Vector4 & vec1 )
-    return Vector4(
-        (vec0.getX() < vec1.getX())? vec0.getX() : vec1.getX(),
-        (vec0.getY() < vec1.getY())? vec0.getY() : vec1.getY(),
-        (vec0.getZ() < vec1.getZ())? vec0.getZ() : vec1.getZ(),
-        (vec0.getW() < vec1.getW())? vec0.getW() : vec1.getW()
-    );
-inline float minElem( const Vector4 & vec )
-    float result;
-    result = (vec.getX() < vec.getY())? vec.getX() : vec.getY();
-    result = (vec.getZ() < result)? vec.getZ() : result;
-    result = (vec.getW() < result)? vec.getW() : result;
-    return result;
-inline float sum( const Vector4 & vec )
-    float result;
-    result = ( vec.getX() + vec.getY() );
-    result = ( result + vec.getZ() );
-    result = ( result + vec.getW() );
-    return result;
-inline float dot( const Vector4 & vec0, const Vector4 & vec1 )
-    float result;
-    result = ( vec0.getX() * vec1.getX() );
-    result = ( result + ( vec0.getY() * vec1.getY() ) );
-    result = ( result + ( vec0.getZ() * vec1.getZ() ) );
-    result = ( result + ( vec0.getW() * vec1.getW() ) );
-    return result;
-inline float lengthSqr( const Vector4 & vec )
-    float result;
-    result = ( vec.getX() * vec.getX() );
-    result = ( result + ( vec.getY() * vec.getY() ) );
-    result = ( result + ( vec.getZ() * vec.getZ() ) );
-    result = ( result + ( vec.getW() * vec.getW() ) );
-    return result;
-inline float length( const Vector4 & vec )
-    return ::sqrtf( lengthSqr( vec ) );
-inline const Vector4 normalize( const Vector4 & vec )
-    float lenSqr, lenInv;
-    lenSqr = lengthSqr( vec );
-    lenInv = ( 1.0f / sqrtf( lenSqr ) );
-    return Vector4(
-        ( vec.getX() * lenInv ),
-        ( vec.getY() * lenInv ),
-        ( vec.getZ() * lenInv ),
-        ( vec.getW() * lenInv )
-    );
-inline const Vector4 select( const Vector4 & vec0, const Vector4 & vec1, bool select1 )
-    return Vector4(
-        ( select1 )? vec1.getX() : vec0.getX(),
-        ( select1 )? vec1.getY() : vec0.getY(),
-        ( select1 )? vec1.getZ() : vec0.getZ(),
-        ( select1 )? vec1.getW() : vec0.getW()
-    );
-inline void print( const Vector4 & vec )
-    printf( "( %f %f %f %f )\n", vec.getX(), vec.getY(), vec.getZ(), vec.getW() );
-inline void print( const Vector4 & vec, const char * name )
-    printf( "%s: ( %f %f %f %f )\n", name, vec.getX(), vec.getY(), vec.getZ(), vec.getW() );
-inline Point3::Point3( const Point3 & pnt )
-    mX = pnt.mX;
-    mY = pnt.mY;
-    mZ = pnt.mZ;
-inline Point3::Point3( float _x, float _y, float _z )
-    mX = _x;
-    mY = _y;
-    mZ = _z;
-inline Point3::Point3( const Vector3 & vec )
-    mX = vec.getX();
-    mY = vec.getY();
-    mZ = vec.getZ();
-inline Point3::Point3( float scalar )
-    mX = scalar;
-    mY = scalar;
-    mZ = scalar;
-inline const Point3 lerp( float t, const Point3 & pnt0, const Point3 & pnt1 )
-    return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
-inline void loadXYZ( Point3 & pnt, const float * fptr )
-    pnt = Point3( fptr[0], fptr[1], fptr[2] );
-inline void storeXYZ( const Point3 & pnt, float * fptr )
-    fptr[0] = pnt.getX();
-    fptr[1] = pnt.getY();
-    fptr[2] = pnt.getZ();
-inline void loadHalfFloats( Point3 & vec, const unsigned short * hfptr )
-    union Data32 {
-        unsigned int u32;
-        float f32;
-    };
-    for (int i = 0; i < 3; i++) {
-        unsigned short fp16 = hfptr[i];
-        unsigned int sign = fp16 >> 15;
-        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
-        unsigned int mantissa = fp16 & ((1 << 10) - 1);
-        if (exponent == 0) {
-            // zero
-            mantissa = 0;
-        } else if (exponent == 31) {
-            // infinity or nan -> infinity
-            exponent = 255;
-	    mantissa = 0;
-        } else {
-            exponent += 127 - 15;
-            mantissa <<= 13;
-        }
-        Data32 d;
-        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
-        vec[i] = d.f32;
-    }
-inline void storeHalfFloats( const Point3 & vec, unsigned short * hfptr )
-    union Data32 {
-        unsigned int u32;
-        float f32;
-    };
-    for (int i = 0; i < 3; i++) {
-        Data32 d;
-        d.f32 = vec[i];
-        unsigned int sign = d.u32 >> 31;
-        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
-        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
-        if (exponent == 0) {
-            // zero or denorm -> zero
-            mantissa = 0;
-        } else if (exponent == 255 && mantissa != 0) {
-            // nan -> infinity
-            exponent = 31;
-            mantissa = 0;
-        } else if (exponent >= 127 - 15 + 31) {
-            // overflow or infinity -> infinity
-            exponent = 31;
-            mantissa = 0;
-        } else if (exponent <= 127 - 15) {
-            // underflow -> zero
-            exponent = 0;
-            mantissa = 0;
-        } else {
-            exponent -= 127 - 15;
-            mantissa >>= 13;
-        }
-        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
-    }
-inline Point3 & Point3::operator =( const Point3 & pnt )
-    mX = pnt.mX;
-    mY = pnt.mY;
-    mZ = pnt.mZ;
-    return *this;
-inline Point3 & Point3::setX( float _x )
-    mX = _x;
-    return *this;
-inline float Point3::getX( ) const
-    return mX;
-inline Point3 & Point3::setY( float _y )
-    mY = _y;
-    return *this;
-inline float Point3::getY( ) const
-    return mY;
-inline Point3 & Point3::setZ( float _z )
-    mZ = _z;
-    return *this;
-inline float Point3::getZ( ) const
-    return mZ;
-inline Point3 & Point3::setElem( int idx, float value )
-    *(&mX + idx) = value;
-    return *this;
-inline float Point3::getElem( int idx ) const
-    return *(&mX + idx);
-inline float & Point3::operator []( int idx )
-    return *(&mX + idx);
-inline float Point3::operator []( int idx ) const
-    return *(&mX + idx);
-inline const Vector3 Point3::operator -( const Point3 & pnt ) const
-    return Vector3(
-        ( mX - pnt.mX ),
-        ( mY - pnt.mY ),
-        ( mZ - pnt.mZ )
-    );
-inline const Point3 Point3::operator +( const Vector3 & vec ) const
-    return Point3(
-        ( mX + vec.getX() ),
-        ( mY + vec.getY() ),
-        ( mZ + vec.getZ() )
-    );
-inline const Point3 Point3::operator -( const Vector3 & vec ) const
-    return Point3(
-        ( mX - vec.getX() ),
-        ( mY - vec.getY() ),
-        ( mZ - vec.getZ() )
-    );
-inline Point3 & Point3::operator +=( const Vector3 & vec )
-    *this = *this + vec;
-    return *this;
-inline Point3 & Point3::operator -=( const Vector3 & vec )
-    *this = *this - vec;
-    return *this;
-inline const Point3 mulPerElem( const Point3 & pnt0, const Point3 & pnt1 )
-    return Point3(
-        ( pnt0.getX() * pnt1.getX() ),
-        ( pnt0.getY() * pnt1.getY() ),
-        ( pnt0.getZ() * pnt1.getZ() )
-    );
-inline const Point3 divPerElem( const Point3 & pnt0, const Point3 & pnt1 )
-    return Point3(
-        ( pnt0.getX() / pnt1.getX() ),
-        ( pnt0.getY() / pnt1.getY() ),
-        ( pnt0.getZ() / pnt1.getZ() )
-    );
-inline const Point3 recipPerElem( const Point3 & pnt )
-    return Point3(
-        ( 1.0f / pnt.getX() ),
-        ( 1.0f / pnt.getY() ),
-        ( 1.0f / pnt.getZ() )
-    );
-inline const Point3 sqrtPerElem( const Point3 & pnt )
-    return Point3(
-        sqrtf( pnt.getX() ),
-        sqrtf( pnt.getY() ),
-        sqrtf( pnt.getZ() )
-    );
-inline const Point3 rsqrtPerElem( const Point3 & pnt )
-    return Point3(
-        ( 1.0f / sqrtf( pnt.getX() ) ),
-        ( 1.0f / sqrtf( pnt.getY() ) ),
-        ( 1.0f / sqrtf( pnt.getZ() ) )
-    );
-inline const Point3 absPerElem( const Point3 & pnt )
-    return Point3(
-        fabsf( pnt.getX() ),
-        fabsf( pnt.getY() ),
-        fabsf( pnt.getZ() )
-    );
-inline const Point3 copySignPerElem( const Point3 & pnt0, const Point3 & pnt1 )
-    return Point3(
-        ( pnt1.getX() < 0.0f )? -fabsf( pnt0.getX() ) : fabsf( pnt0.getX() ),
-        ( pnt1.getY() < 0.0f )? -fabsf( pnt0.getY() ) : fabsf( pnt0.getY() ),
-        ( pnt1.getZ() < 0.0f )? -fabsf( pnt0.getZ() ) : fabsf( pnt0.getZ() )
-    );
-inline const Point3 maxPerElem( const Point3 & pnt0, const Point3 & pnt1 )
-    return Point3(
-        (pnt0.getX() > pnt1.getX())? pnt0.getX() : pnt1.getX(),
-        (pnt0.getY() > pnt1.getY())? pnt0.getY() : pnt1.getY(),
-        (pnt0.getZ() > pnt1.getZ())? pnt0.getZ() : pnt1.getZ()
-    );
-inline float maxElem( const Point3 & pnt )
-    float result;
-    result = (pnt.getX() > pnt.getY())? pnt.getX() : pnt.getY();
-    result = (pnt.getZ() > result)? pnt.getZ() : result;
-    return result;
-inline const Point3 minPerElem( const Point3 & pnt0, const Point3 & pnt1 )
-    return Point3(
-        (pnt0.getX() < pnt1.getX())? pnt0.getX() : pnt1.getX(),
-        (pnt0.getY() < pnt1.getY())? pnt0.getY() : pnt1.getY(),
-        (pnt0.getZ() < pnt1.getZ())? pnt0.getZ() : pnt1.getZ()
-    );
-inline float minElem( const Point3 & pnt )
-    float result;
-    result = (pnt.getX() < pnt.getY())? pnt.getX() : pnt.getY();
-    result = (pnt.getZ() < result)? pnt.getZ() : result;
-    return result;
-inline float sum( const Point3 & pnt )
-    float result;
-    result = ( pnt.getX() + pnt.getY() );
-    result = ( result + pnt.getZ() );
-    return result;
-inline const Point3 scale( const Point3 & pnt, float scaleVal )
-    return mulPerElem( pnt, Point3( scaleVal ) );
-inline const Point3 scale( const Point3 & pnt, const Vector3 & scaleVec )
-    return mulPerElem( pnt, Point3( scaleVec ) );
-inline float projection( const Point3 & pnt, const Vector3 & unitVec )
-    float result;
-    result = ( pnt.getX() * unitVec.getX() );
-    result = ( result + ( pnt.getY() * unitVec.getY() ) );
-    result = ( result + ( pnt.getZ() * unitVec.getZ() ) );
-    return result;
-inline float distSqrFromOrigin( const Point3 & pnt )
-    return lengthSqr( Vector3( pnt ) );
-inline float distFromOrigin( const Point3 & pnt )
-    return length( Vector3( pnt ) );
-inline float distSqr( const Point3 & pnt0, const Point3 & pnt1 )
-    return lengthSqr( ( pnt1 - pnt0 ) );
-inline float dist( const Point3 & pnt0, const Point3 & pnt1 )
-    return length( ( pnt1 - pnt0 ) );
-inline const Point3 select( const Point3 & pnt0, const Point3 & pnt1, bool select1 )
-    return Point3(
-        ( select1 )? pnt1.getX() : pnt0.getX(),
-        ( select1 )? pnt1.getY() : pnt0.getY(),
-        ( select1 )? pnt1.getZ() : pnt0.getZ()
-    );
-inline void print( const Point3 & pnt )
-    printf( "( %f %f %f )\n", pnt.getX(), pnt.getY(), pnt.getZ() );
-inline void print( const Point3 & pnt, const char * name )
-    printf( "%s: ( %f %f %f )\n", name, pnt.getX(), pnt.getY(), pnt.getZ() );
-} // namespace Aos
-} // namespace Vectormath
diff --git a/src/bullet/vectormath/scalar/vectormath_aos.h b/src/bullet/vectormath/scalar/vectormath_aos.h
deleted file mode 100644
index d00456df..00000000
--- a/src/bullet/vectormath/scalar/vectormath_aos.h
+++ /dev/null
@@ -1,1872 +0,0 @@
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-#include <math.h>
-#include <stdio.h>
-namespace Vectormath {
-namespace Aos {
-// Forward Declarations
-class Vector3;
-class Vector4;
-class Point3;
-class Quat;
-class Matrix3;
-class Matrix4;
-class Transform3;
-// A 3-D vector in array-of-structures format
-class Vector3
-    float mX;
-    float mY;
-    float mZ;
-#ifndef __GNUC__
-    float d;
-    // Default constructor; does no initialization
-    // 
-    inline Vector3( ) { };
-    // Copy a 3-D vector
-    // 
-    inline Vector3( const Vector3 & vec );
-    // Construct a 3-D vector from x, y, and z elements
-    // 
-    inline Vector3( float x, float y, float z );
-    // Copy elements from a 3-D point into a 3-D vector
-    // 
-    explicit inline Vector3( const Point3 & pnt );
-    // Set all elements of a 3-D vector to the same scalar value
-    // 
-    explicit inline Vector3( float scalar );
-    // Assign one 3-D vector to another
-    // 
-    inline Vector3 & operator =( const Vector3 & vec );
-    // Set the x element of a 3-D vector
-    // 
-    inline Vector3 & setX( float x );
-    // Set the y element of a 3-D vector
-    // 
-    inline Vector3 & setY( float y );
-    // Set the z element of a 3-D vector
-    // 
-    inline Vector3 & setZ( float z );
-    // Get the x element of a 3-D vector
-    // 
-    inline float getX( ) const;
-    // Get the y element of a 3-D vector
-    // 
-    inline float getY( ) const;
-    // Get the z element of a 3-D vector
-    // 
-    inline float getZ( ) const;
-    // Set an x, y, or z element of a 3-D vector by index
-    // 
-    inline Vector3 & setElem( int idx, float value );
-    // Get an x, y, or z element of a 3-D vector by index
-    // 
-    inline float getElem( int idx ) const;
-    // Subscripting operator to set or get an element
-    // 
-    inline float & operator []( int idx );
-    // Subscripting operator to get an element
-    // 
-    inline float operator []( int idx ) const;
-    // Add two 3-D vectors
-    // 
-    inline const Vector3 operator +( const Vector3 & vec ) const;
-    // Subtract a 3-D vector from another 3-D vector
-    // 
-    inline const Vector3 operator -( const Vector3 & vec ) const;
-    // Add a 3-D vector to a 3-D point
-    // 
-    inline const Point3 operator +( const Point3 & pnt ) const;
-    // Multiply a 3-D vector by a scalar
-    // 
-    inline const Vector3 operator *( float scalar ) const;
-    // Divide a 3-D vector by a scalar
-    // 
-    inline const Vector3 operator /( float scalar ) const;
-    // Perform compound assignment and addition with a 3-D vector
-    // 
-    inline Vector3 & operator +=( const Vector3 & vec );
-    // Perform compound assignment and subtraction by a 3-D vector
-    // 
-    inline Vector3 & operator -=( const Vector3 & vec );
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    inline Vector3 & operator *=( float scalar );
-    // Perform compound assignment and division by a scalar
-    // 
-    inline Vector3 & operator /=( float scalar );
-    // Negate all elements of a 3-D vector
-    // 
-    inline const Vector3 operator -( ) const;
-    // Construct x axis
-    // 
-    static inline const Vector3 xAxis( );
-    // Construct y axis
-    // 
-    static inline const Vector3 yAxis( );
-    // Construct z axis
-    // 
-    static inline const Vector3 zAxis( );
-#ifdef __GNUC__
-__attribute__ ((aligned(16)))
-// Multiply a 3-D vector by a scalar
-inline const Vector3 operator *( float scalar, const Vector3 & vec );
-// Multiply two 3-D vectors per element
-inline const Vector3 mulPerElem( const Vector3 & vec0, const Vector3 & vec1 );
-// Divide two 3-D vectors per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-inline const Vector3 divPerElem( const Vector3 & vec0, const Vector3 & vec1 );
-// Compute the reciprocal of a 3-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-inline const Vector3 recipPerElem( const Vector3 & vec );
-// Compute the square root of a 3-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function sqrtf4.
-inline const Vector3 sqrtPerElem( const Vector3 & vec );
-// Compute the reciprocal square root of a 3-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function rsqrtf4.
-inline const Vector3 rsqrtPerElem( const Vector3 & vec );
-// Compute the absolute value of a 3-D vector per element
-inline const Vector3 absPerElem( const Vector3 & vec );
-// Copy sign from one 3-D vector to another, per element
-inline const Vector3 copySignPerElem( const Vector3 & vec0, const Vector3 & vec1 );
-// Maximum of two 3-D vectors per element
-inline const Vector3 maxPerElem( const Vector3 & vec0, const Vector3 & vec1 );
-// Minimum of two 3-D vectors per element
-inline const Vector3 minPerElem( const Vector3 & vec0, const Vector3 & vec1 );
-// Maximum element of a 3-D vector
-inline float maxElem( const Vector3 & vec );
-// Minimum element of a 3-D vector
-inline float minElem( const Vector3 & vec );
-// Compute the sum of all elements of a 3-D vector
-inline float sum( const Vector3 & vec );
-// Compute the dot product of two 3-D vectors
-inline float dot( const Vector3 & vec0, const Vector3 & vec1 );
-// Compute the square of the length of a 3-D vector
-inline float lengthSqr( const Vector3 & vec );
-// Compute the length of a 3-D vector
-inline float length( const Vector3 & vec );
-// Normalize a 3-D vector
-// NOTE: 
-// The result is unpredictable when all elements of vec are at or near zero.
-inline const Vector3 normalize( const Vector3 & vec );
-// Compute cross product of two 3-D vectors
-inline const Vector3 cross( const Vector3 & vec0, const Vector3 & vec1 );
-// Outer product of two 3-D vectors
-inline const Matrix3 outer( const Vector3 & vec0, const Vector3 & vec1 );
-// Pre-multiply a row vector by a 3x3 matrix
-inline const Vector3 rowMul( const Vector3 & vec, const Matrix3 & mat );
-// Cross-product matrix of a 3-D vector
-inline const Matrix3 crossMatrix( const Vector3 & vec );
-// Create cross-product matrix and multiply
-// NOTE: 
-// Faster than separately creating a cross-product matrix and multiplying.
-inline const Matrix3 crossMatrixMul( const Vector3 & vec, const Matrix3 & mat );
-// Linear interpolation between two 3-D vectors
-// NOTE: 
-// Does not clamp t between 0 and 1.
-inline const Vector3 lerp( float t, const Vector3 & vec0, const Vector3 & vec1 );
-// Spherical linear interpolation between two 3-D vectors
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-inline const Vector3 slerp( float t, const Vector3 & unitVec0, const Vector3 & unitVec1 );
-// Conditionally select between two 3-D vectors
-inline const Vector3 select( const Vector3 & vec0, const Vector3 & vec1, bool select1 );
-// Load x, y, and z elements from the first three words of a float array.
-inline void loadXYZ( Vector3 & vec, const float * fptr );
-// Store x, y, and z elements of a 3-D vector in the first three words of a float array.
-// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
-inline void storeXYZ( const Vector3 & vec, float * fptr );
-// Load three-half-floats as a 3-D vector
-// NOTE: 
-// This transformation does not support either denormalized numbers or NaNs.
-inline void loadHalfFloats( Vector3 & vec, const unsigned short * hfptr );
-// Store a 3-D vector as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
-// NOTE: 
-// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
-inline void storeHalfFloats( const Vector3 & vec, unsigned short * hfptr );
-// Print a 3-D vector
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Vector3 & vec );
-// Print a 3-D vector and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Vector3 & vec, const char * name );
-// A 4-D vector in array-of-structures format
-class Vector4
-    float mX;
-    float mY;
-    float mZ;
-    float mW;
-    // Default constructor; does no initialization
-    // 
-    inline Vector4( ) { };
-    // Copy a 4-D vector
-    // 
-    inline Vector4( const Vector4 & vec );
-    // Construct a 4-D vector from x, y, z, and w elements
-    // 
-    inline Vector4( float x, float y, float z, float w );
-    // Construct a 4-D vector from a 3-D vector and a scalar
-    // 
-    inline Vector4( const Vector3 & xyz, float w );
-    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
-    // 
-    explicit inline Vector4( const Vector3 & vec );
-    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
-    // 
-    explicit inline Vector4( const Point3 & pnt );
-    // Copy elements from a quaternion into a 4-D vector
-    // 
-    explicit inline Vector4( const Quat & quat );
-    // Set all elements of a 4-D vector to the same scalar value
-    // 
-    explicit inline Vector4( float scalar );
-    // Assign one 4-D vector to another
-    // 
-    inline Vector4 & operator =( const Vector4 & vec );
-    // Set the x, y, and z elements of a 4-D vector
-    // NOTE: 
-    // This function does not change the w element.
-    // 
-    inline Vector4 & setXYZ( const Vector3 & vec );
-    // Get the x, y, and z elements of a 4-D vector
-    // 
-    inline const Vector3 getXYZ( ) const;
-    // Set the x element of a 4-D vector
-    // 
-    inline Vector4 & setX( float x );
-    // Set the y element of a 4-D vector
-    // 
-    inline Vector4 & setY( float y );
-    // Set the z element of a 4-D vector
-    // 
-    inline Vector4 & setZ( float z );
-    // Set the w element of a 4-D vector
-    // 
-    inline Vector4 & setW( float w );
-    // Get the x element of a 4-D vector
-    // 
-    inline float getX( ) const;
-    // Get the y element of a 4-D vector
-    // 
-    inline float getY( ) const;
-    // Get the z element of a 4-D vector
-    // 
-    inline float getZ( ) const;
-    // Get the w element of a 4-D vector
-    // 
-    inline float getW( ) const;
-    // Set an x, y, z, or w element of a 4-D vector by index
-    // 
-    inline Vector4 & setElem( int idx, float value );
-    // Get an x, y, z, or w element of a 4-D vector by index
-    // 
-    inline float getElem( int idx ) const;
-    // Subscripting operator to set or get an element
-    // 
-    inline float & operator []( int idx );
-    // Subscripting operator to get an element
-    // 
-    inline float operator []( int idx ) const;
-    // Add two 4-D vectors
-    // 
-    inline const Vector4 operator +( const Vector4 & vec ) const;
-    // Subtract a 4-D vector from another 4-D vector
-    // 
-    inline const Vector4 operator -( const Vector4 & vec ) const;
-    // Multiply a 4-D vector by a scalar
-    // 
-    inline const Vector4 operator *( float scalar ) const;
-    // Divide a 4-D vector by a scalar
-    // 
-    inline const Vector4 operator /( float scalar ) const;
-    // Perform compound assignment and addition with a 4-D vector
-    // 
-    inline Vector4 & operator +=( const Vector4 & vec );
-    // Perform compound assignment and subtraction by a 4-D vector
-    // 
-    inline Vector4 & operator -=( const Vector4 & vec );
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    inline Vector4 & operator *=( float scalar );
-    // Perform compound assignment and division by a scalar
-    // 
-    inline Vector4 & operator /=( float scalar );
-    // Negate all elements of a 4-D vector
-    // 
-    inline const Vector4 operator -( ) const;
-    // Construct x axis
-    // 
-    static inline const Vector4 xAxis( );
-    // Construct y axis
-    // 
-    static inline const Vector4 yAxis( );
-    // Construct z axis
-    // 
-    static inline const Vector4 zAxis( );
-    // Construct w axis
-    // 
-    static inline const Vector4 wAxis( );
-#ifdef __GNUC__
-__attribute__ ((aligned(16)))
-// Multiply a 4-D vector by a scalar
-inline const Vector4 operator *( float scalar, const Vector4 & vec );
-// Multiply two 4-D vectors per element
-inline const Vector4 mulPerElem( const Vector4 & vec0, const Vector4 & vec1 );
-// Divide two 4-D vectors per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-inline const Vector4 divPerElem( const Vector4 & vec0, const Vector4 & vec1 );
-// Compute the reciprocal of a 4-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-inline const Vector4 recipPerElem( const Vector4 & vec );
-// Compute the square root of a 4-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function sqrtf4.
-inline const Vector4 sqrtPerElem( const Vector4 & vec );
-// Compute the reciprocal square root of a 4-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function rsqrtf4.
-inline const Vector4 rsqrtPerElem( const Vector4 & vec );
-// Compute the absolute value of a 4-D vector per element
-inline const Vector4 absPerElem( const Vector4 & vec );
-// Copy sign from one 4-D vector to another, per element
-inline const Vector4 copySignPerElem( const Vector4 & vec0, const Vector4 & vec1 );
-// Maximum of two 4-D vectors per element
-inline const Vector4 maxPerElem( const Vector4 & vec0, const Vector4 & vec1 );
-// Minimum of two 4-D vectors per element
-inline const Vector4 minPerElem( const Vector4 & vec0, const Vector4 & vec1 );
-// Maximum element of a 4-D vector
-inline float maxElem( const Vector4 & vec );
-// Minimum element of a 4-D vector
-inline float minElem( const Vector4 & vec );
-// Compute the sum of all elements of a 4-D vector
-inline float sum( const Vector4 & vec );
-// Compute the dot product of two 4-D vectors
-inline float dot( const Vector4 & vec0, const Vector4 & vec1 );
-// Compute the square of the length of a 4-D vector
-inline float lengthSqr( const Vector4 & vec );
-// Compute the length of a 4-D vector
-inline float length( const Vector4 & vec );
-// Normalize a 4-D vector
-// NOTE: 
-// The result is unpredictable when all elements of vec are at or near zero.
-inline const Vector4 normalize( const Vector4 & vec );
-// Outer product of two 4-D vectors
-inline const Matrix4 outer( const Vector4 & vec0, const Vector4 & vec1 );
-// Linear interpolation between two 4-D vectors
-// NOTE: 
-// Does not clamp t between 0 and 1.
-inline const Vector4 lerp( float t, const Vector4 & vec0, const Vector4 & vec1 );
-// Spherical linear interpolation between two 4-D vectors
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-inline const Vector4 slerp( float t, const Vector4 & unitVec0, const Vector4 & unitVec1 );
-// Conditionally select between two 4-D vectors
-inline const Vector4 select( const Vector4 & vec0, const Vector4 & vec1, bool select1 );
-// Load x, y, z, and w elements from the first four words of a float array.
-inline void loadXYZW( Vector4 & vec, const float * fptr );
-// Store x, y, z, and w elements of a 4-D vector in the first four words of a float array.
-// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
-inline void storeXYZW( const Vector4 & vec, float * fptr );
-// Load four-half-floats as a 4-D vector
-// NOTE: 
-// This transformation does not support either denormalized numbers or NaNs.
-inline void loadHalfFloats( Vector4 & vec, const unsigned short * hfptr );
-// Store a 4-D vector as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
-// NOTE: 
-// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
-inline void storeHalfFloats( const Vector4 & vec, unsigned short * hfptr );
-// Print a 4-D vector
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Vector4 & vec );
-// Print a 4-D vector and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Vector4 & vec, const char * name );
-// A 3-D point in array-of-structures format
-class Point3
-    float mX;
-    float mY;
-    float mZ;
-#ifndef __GNUC__
-    float d;
-    // Default constructor; does no initialization
-    // 
-    inline Point3( ) { };
-    // Copy a 3-D point
-    // 
-    inline Point3( const Point3 & pnt );
-    // Construct a 3-D point from x, y, and z elements
-    // 
-    inline Point3( float x, float y, float z );
-    // Copy elements from a 3-D vector into a 3-D point
-    // 
-    explicit inline Point3( const Vector3 & vec );
-    // Set all elements of a 3-D point to the same scalar value
-    // 
-    explicit inline Point3( float scalar );
-    // Assign one 3-D point to another
-    // 
-    inline Point3 & operator =( const Point3 & pnt );
-    // Set the x element of a 3-D point
-    // 
-    inline Point3 & setX( float x );
-    // Set the y element of a 3-D point
-    // 
-    inline Point3 & setY( float y );
-    // Set the z element of a 3-D point
-    // 
-    inline Point3 & setZ( float z );
-    // Get the x element of a 3-D point
-    // 
-    inline float getX( ) const;
-    // Get the y element of a 3-D point
-    // 
-    inline float getY( ) const;
-    // Get the z element of a 3-D point
-    // 
-    inline float getZ( ) const;
-    // Set an x, y, or z element of a 3-D point by index
-    // 
-    inline Point3 & setElem( int idx, float value );
-    // Get an x, y, or z element of a 3-D point by index
-    // 
-    inline float getElem( int idx ) const;
-    // Subscripting operator to set or get an element
-    // 
-    inline float & operator []( int idx );
-    // Subscripting operator to get an element
-    // 
-    inline float operator []( int idx ) const;
-    // Subtract a 3-D point from another 3-D point
-    // 
-    inline const Vector3 operator -( const Point3 & pnt ) const;
-    // Add a 3-D point to a 3-D vector
-    // 
-    inline const Point3 operator +( const Vector3 & vec ) const;
-    // Subtract a 3-D vector from a 3-D point
-    // 
-    inline const Point3 operator -( const Vector3 & vec ) const;
-    // Perform compound assignment and addition with a 3-D vector
-    // 
-    inline Point3 & operator +=( const Vector3 & vec );
-    // Perform compound assignment and subtraction by a 3-D vector
-    // 
-    inline Point3 & operator -=( const Vector3 & vec );
-#ifdef __GNUC__
-__attribute__ ((aligned(16)))
-// Multiply two 3-D points per element
-inline const Point3 mulPerElem( const Point3 & pnt0, const Point3 & pnt1 );
-// Divide two 3-D points per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-inline const Point3 divPerElem( const Point3 & pnt0, const Point3 & pnt1 );
-// Compute the reciprocal of a 3-D point per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-inline const Point3 recipPerElem( const Point3 & pnt );
-// Compute the square root of a 3-D point per element
-// NOTE: 
-// Floating-point behavior matches standard library function sqrtf4.
-inline const Point3 sqrtPerElem( const Point3 & pnt );
-// Compute the reciprocal square root of a 3-D point per element
-// NOTE: 
-// Floating-point behavior matches standard library function rsqrtf4.
-inline const Point3 rsqrtPerElem( const Point3 & pnt );
-// Compute the absolute value of a 3-D point per element
-inline const Point3 absPerElem( const Point3 & pnt );
-// Copy sign from one 3-D point to another, per element
-inline const Point3 copySignPerElem( const Point3 & pnt0, const Point3 & pnt1 );
-// Maximum of two 3-D points per element
-inline const Point3 maxPerElem( const Point3 & pnt0, const Point3 & pnt1 );
-// Minimum of two 3-D points per element
-inline const Point3 minPerElem( const Point3 & pnt0, const Point3 & pnt1 );
-// Maximum element of a 3-D point
-inline float maxElem( const Point3 & pnt );
-// Minimum element of a 3-D point
-inline float minElem( const Point3 & pnt );
-// Compute the sum of all elements of a 3-D point
-inline float sum( const Point3 & pnt );
-// Apply uniform scale to a 3-D point
-inline const Point3 scale( const Point3 & pnt, float scaleVal );
-// Apply non-uniform scale to a 3-D point
-inline const Point3 scale( const Point3 & pnt, const Vector3 & scaleVec );
-// Scalar projection of a 3-D point on a unit-length 3-D vector
-inline float projection( const Point3 & pnt, const Vector3 & unitVec );
-// Compute the square of the distance of a 3-D point from the coordinate-system origin
-inline float distSqrFromOrigin( const Point3 & pnt );
-// Compute the distance of a 3-D point from the coordinate-system origin
-inline float distFromOrigin( const Point3 & pnt );
-// Compute the square of the distance between two 3-D points
-inline float distSqr( const Point3 & pnt0, const Point3 & pnt1 );
-// Compute the distance between two 3-D points
-inline float dist( const Point3 & pnt0, const Point3 & pnt1 );
-// Linear interpolation between two 3-D points
-// NOTE: 
-// Does not clamp t between 0 and 1.
-inline const Point3 lerp( float t, const Point3 & pnt0, const Point3 & pnt1 );
-// Conditionally select between two 3-D points
-inline const Point3 select( const Point3 & pnt0, const Point3 & pnt1, bool select1 );
-// Load x, y, and z elements from the first three words of a float array.
-inline void loadXYZ( Point3 & pnt, const float * fptr );
-// Store x, y, and z elements of a 3-D point in the first three words of a float array.
-// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
-inline void storeXYZ( const Point3 & pnt, float * fptr );
-// Load three-half-floats as a 3-D point
-// NOTE: 
-// This transformation does not support either denormalized numbers or NaNs.
-inline void loadHalfFloats( Point3 & pnt, const unsigned short * hfptr );
-// Store a 3-D point as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
-// NOTE: 
-// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
-inline void storeHalfFloats( const Point3 & pnt, unsigned short * hfptr );
-// Print a 3-D point
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Point3 & pnt );
-// Print a 3-D point and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Point3 & pnt, const char * name );
-// A quaternion in array-of-structures format
-class Quat
-    float mX;
-    float mY;
-    float mZ;
-    float mW;
-    // Default constructor; does no initialization
-    // 
-    inline Quat( ) { };
-    // Copy a quaternion
-    // 
-    inline Quat( const Quat & quat );
-    // Construct a quaternion from x, y, z, and w elements
-    // 
-    inline Quat( float x, float y, float z, float w );
-    // Construct a quaternion from a 3-D vector and a scalar
-    // 
-    inline Quat( const Vector3 & xyz, float w );
-    // Copy elements from a 4-D vector into a quaternion
-    // 
-    explicit inline Quat( const Vector4 & vec );
-    // Convert a rotation matrix to a unit-length quaternion
-    // 
-    explicit inline Quat( const Matrix3 & rotMat );
-    // Set all elements of a quaternion to the same scalar value
-    // 
-    explicit inline Quat( float scalar );
-    // Assign one quaternion to another
-    // 
-    inline Quat & operator =( const Quat & quat );
-    // Set the x, y, and z elements of a quaternion
-    // NOTE: 
-    // This function does not change the w element.
-    // 
-    inline Quat & setXYZ( const Vector3 & vec );
-    // Get the x, y, and z elements of a quaternion
-    // 
-    inline const Vector3 getXYZ( ) const;
-    // Set the x element of a quaternion
-    // 
-    inline Quat & setX( float x );
-    // Set the y element of a quaternion
-    // 
-    inline Quat & setY( float y );
-    // Set the z element of a quaternion
-    // 
-    inline Quat & setZ( float z );
-    // Set the w element of a quaternion
-    // 
-    inline Quat & setW( float w );
-    // Get the x element of a quaternion
-    // 
-    inline float getX( ) const;
-    // Get the y element of a quaternion
-    // 
-    inline float getY( ) const;
-    // Get the z element of a quaternion
-    // 
-    inline float getZ( ) const;
-    // Get the w element of a quaternion
-    // 
-    inline float getW( ) const;
-    // Set an x, y, z, or w element of a quaternion by index
-    // 
-    inline Quat & setElem( int idx, float value );
-    // Get an x, y, z, or w element of a quaternion by index
-    // 
-    inline float getElem( int idx ) const;
-    // Subscripting operator to set or get an element
-    // 
-    inline float & operator []( int idx );
-    // Subscripting operator to get an element
-    // 
-    inline float operator []( int idx ) const;
-    // Add two quaternions
-    // 
-    inline const Quat operator +( const Quat & quat ) const;
-    // Subtract a quaternion from another quaternion
-    // 
-    inline const Quat operator -( const Quat & quat ) const;
-    // Multiply two quaternions
-    // 
-    inline const Quat operator *( const Quat & quat ) const;
-    // Multiply a quaternion by a scalar
-    // 
-    inline const Quat operator *( float scalar ) const;
-    // Divide a quaternion by a scalar
-    // 
-    inline const Quat operator /( float scalar ) const;
-    // Perform compound assignment and addition with a quaternion
-    // 
-    inline Quat & operator +=( const Quat & quat );
-    // Perform compound assignment and subtraction by a quaternion
-    // 
-    inline Quat & operator -=( const Quat & quat );
-    // Perform compound assignment and multiplication by a quaternion
-    // 
-    inline Quat & operator *=( const Quat & quat );
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    inline Quat & operator *=( float scalar );
-    // Perform compound assignment and division by a scalar
-    // 
-    inline Quat & operator /=( float scalar );
-    // Negate all elements of a quaternion
-    // 
-    inline const Quat operator -( ) const;
-    // Construct an identity quaternion
-    // 
-    static inline const Quat identity( );
-    // Construct a quaternion to rotate between two unit-length 3-D vectors
-    // NOTE: 
-    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
-    // 
-    static inline const Quat rotation( const Vector3 & unitVec0, const Vector3 & unitVec1 );
-    // Construct a quaternion to rotate around a unit-length 3-D vector
-    // 
-    static inline const Quat rotation( float radians, const Vector3 & unitVec );
-    // Construct a quaternion to rotate around the x axis
-    // 
-    static inline const Quat rotationX( float radians );
-    // Construct a quaternion to rotate around the y axis
-    // 
-    static inline const Quat rotationY( float radians );
-    // Construct a quaternion to rotate around the z axis
-    // 
-    static inline const Quat rotationZ( float radians );
-#ifdef __GNUC__
-__attribute__ ((aligned(16)))
-// Multiply a quaternion by a scalar
-inline const Quat operator *( float scalar, const Quat & quat );
-// Compute the conjugate of a quaternion
-inline const Quat conj( const Quat & quat );
-// Use a unit-length quaternion to rotate a 3-D vector
-inline const Vector3 rotate( const Quat & unitQuat, const Vector3 & vec );
-// Compute the dot product of two quaternions
-inline float dot( const Quat & quat0, const Quat & quat1 );
-// Compute the norm of a quaternion
-inline float norm( const Quat & quat );
-// Compute the length of a quaternion
-inline float length( const Quat & quat );
-// Normalize a quaternion
-// NOTE: 
-// The result is unpredictable when all elements of quat are at or near zero.
-inline const Quat normalize( const Quat & quat );
-// Linear interpolation between two quaternions
-// NOTE: 
-// Does not clamp t between 0 and 1.
-inline const Quat lerp( float t, const Quat & quat0, const Quat & quat1 );
-// Spherical linear interpolation between two quaternions
-// NOTE: 
-// Interpolates along the shortest path between orientations.
-// Does not clamp t between 0 and 1.
-inline const Quat slerp( float t, const Quat & unitQuat0, const Quat & unitQuat1 );
-// Spherical quadrangle interpolation
-inline const Quat squad( float t, const Quat & unitQuat0, const Quat & unitQuat1, const Quat & unitQuat2, const Quat & unitQuat3 );
-// Conditionally select between two quaternions
-inline const Quat select( const Quat & quat0, const Quat & quat1, bool select1 );
-// Load x, y, z, and w elements from the first four words of a float array.
-inline void loadXYZW( Quat & quat, const float * fptr );
-// Store x, y, z, and w elements of a quaternion in the first four words of a float array.
-// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
-inline void storeXYZW( const Quat & quat, float * fptr );
-// Print a quaternion
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Quat & quat );
-// Print a quaternion and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Quat & quat, const char * name );
-// A 3x3 matrix in array-of-structures format
-class Matrix3
-    Vector3 mCol0;
-    Vector3 mCol1;
-    Vector3 mCol2;
-    // Default constructor; does no initialization
-    // 
-    inline Matrix3( ) { };
-    // Copy a 3x3 matrix
-    // 
-    inline Matrix3( const Matrix3 & mat );
-    // Construct a 3x3 matrix containing the specified columns
-    // 
-    inline Matrix3( const Vector3 & col0, const Vector3 & col1, const Vector3 & col2 );
-    // Construct a 3x3 rotation matrix from a unit-length quaternion
-    // 
-    explicit inline Matrix3( const Quat & unitQuat );
-    // Set all elements of a 3x3 matrix to the same scalar value
-    // 
-    explicit inline Matrix3( float scalar );
-    // Assign one 3x3 matrix to another
-    // 
-    inline Matrix3 & operator =( const Matrix3 & mat );
-    // Set column 0 of a 3x3 matrix
-    // 
-    inline Matrix3 & setCol0( const Vector3 & col0 );
-    // Set column 1 of a 3x3 matrix
-    // 
-    inline Matrix3 & setCol1( const Vector3 & col1 );
-    // Set column 2 of a 3x3 matrix
-    // 
-    inline Matrix3 & setCol2( const Vector3 & col2 );
-    // Get column 0 of a 3x3 matrix
-    // 
-    inline const Vector3 getCol0( ) const;
-    // Get column 1 of a 3x3 matrix
-    // 
-    inline const Vector3 getCol1( ) const;
-    // Get column 2 of a 3x3 matrix
-    // 
-    inline const Vector3 getCol2( ) const;
-    // Set the column of a 3x3 matrix referred to by the specified index
-    // 
-    inline Matrix3 & setCol( int col, const Vector3 & vec );
-    // Set the row of a 3x3 matrix referred to by the specified index
-    // 
-    inline Matrix3 & setRow( int row, const Vector3 & vec );
-    // Get the column of a 3x3 matrix referred to by the specified index
-    // 
-    inline const Vector3 getCol( int col ) const;
-    // Get the row of a 3x3 matrix referred to by the specified index
-    // 
-    inline const Vector3 getRow( int row ) const;
-    // Subscripting operator to set or get a column
-    // 
-    inline Vector3 & operator []( int col );
-    // Subscripting operator to get a column
-    // 
-    inline const Vector3 operator []( int col ) const;
-    // Set the element of a 3x3 matrix referred to by column and row indices
-    // 
-    inline Matrix3 & setElem( int col, int row, float val );
-    // Get the element of a 3x3 matrix referred to by column and row indices
-    // 
-    inline float getElem( int col, int row ) const;
-    // Add two 3x3 matrices
-    // 
-    inline const Matrix3 operator +( const Matrix3 & mat ) const;
-    // Subtract a 3x3 matrix from another 3x3 matrix
-    // 
-    inline const Matrix3 operator -( const Matrix3 & mat ) const;
-    // Negate all elements of a 3x3 matrix
-    // 
-    inline const Matrix3 operator -( ) const;
-    // Multiply a 3x3 matrix by a scalar
-    // 
-    inline const Matrix3 operator *( float scalar ) const;
-    // Multiply a 3x3 matrix by a 3-D vector
-    // 
-    inline const Vector3 operator *( const Vector3 & vec ) const;
-    // Multiply two 3x3 matrices
-    // 
-    inline const Matrix3 operator *( const Matrix3 & mat ) const;
-    // Perform compound assignment and addition with a 3x3 matrix
-    // 
-    inline Matrix3 & operator +=( const Matrix3 & mat );
-    // Perform compound assignment and subtraction by a 3x3 matrix
-    // 
-    inline Matrix3 & operator -=( const Matrix3 & mat );
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    inline Matrix3 & operator *=( float scalar );
-    // Perform compound assignment and multiplication by a 3x3 matrix
-    // 
-    inline Matrix3 & operator *=( const Matrix3 & mat );
-    // Construct an identity 3x3 matrix
-    // 
-    static inline const Matrix3 identity( );
-    // Construct a 3x3 matrix to rotate around the x axis
-    // 
-    static inline const Matrix3 rotationX( float radians );
-    // Construct a 3x3 matrix to rotate around the y axis
-    // 
-    static inline const Matrix3 rotationY( float radians );
-    // Construct a 3x3 matrix to rotate around the z axis
-    // 
-    static inline const Matrix3 rotationZ( float radians );
-    // Construct a 3x3 matrix to rotate around the x, y, and z axes
-    // 
-    static inline const Matrix3 rotationZYX( const Vector3 & radiansXYZ );
-    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
-    // 
-    static inline const Matrix3 rotation( float radians, const Vector3 & unitVec );
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static inline const Matrix3 rotation( const Quat & unitQuat );
-    // Construct a 3x3 matrix to perform scaling
-    // 
-    static inline const Matrix3 scale( const Vector3 & scaleVec );
-// Multiply a 3x3 matrix by a scalar
-inline const Matrix3 operator *( float scalar, const Matrix3 & mat );
-// Append (post-multiply) a scale transformation to a 3x3 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 & scaleVec );
-// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-inline const Matrix3 prependScale( const Vector3 & scaleVec, const Matrix3 & mat );
-// Multiply two 3x3 matrices per element
-inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
-// Compute the absolute value of a 3x3 matrix per element
-inline const Matrix3 absPerElem( const Matrix3 & mat );
-// Transpose of a 3x3 matrix
-inline const Matrix3 transpose( const Matrix3 & mat );
-// Compute the inverse of a 3x3 matrix
-// NOTE: 
-// Result is unpredictable when the determinant of mat is equal to or near 0.
-inline const Matrix3 inverse( const Matrix3 & mat );
-// Determinant of a 3x3 matrix
-inline float determinant( const Matrix3 & mat );
-// Conditionally select between two 3x3 matrices
-inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
-// Print a 3x3 matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Matrix3 & mat );
-// Print a 3x3 matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Matrix3 & mat, const char * name );
-// A 4x4 matrix in array-of-structures format
-class Matrix4
-    Vector4 mCol0;
-    Vector4 mCol1;
-    Vector4 mCol2;
-    Vector4 mCol3;
-    // Default constructor; does no initialization
-    // 
-    inline Matrix4( ) { };
-    // Copy a 4x4 matrix
-    // 
-    inline Matrix4( const Matrix4 & mat );
-    // Construct a 4x4 matrix containing the specified columns
-    // 
-    inline Matrix4( const Vector4 & col0, const Vector4 & col1, const Vector4 & col2, const Vector4 & col3 );
-    // Construct a 4x4 matrix from a 3x4 transformation matrix
-    // 
-    explicit inline Matrix4( const Transform3 & mat );
-    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
-    // 
-    inline Matrix4( const Matrix3 & mat, const Vector3 & translateVec );
-    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
-    // 
-    inline Matrix4( const Quat & unitQuat, const Vector3 & translateVec );
-    // Set all elements of a 4x4 matrix to the same scalar value
-    // 
-    explicit inline Matrix4( float scalar );
-    // Assign one 4x4 matrix to another
-    // 
-    inline Matrix4 & operator =( const Matrix4 & mat );
-    // Set the upper-left 3x3 submatrix
-    // NOTE: 
-    // This function does not change the bottom row elements.
-    // 
-    inline Matrix4 & setUpper3x3( const Matrix3 & mat3 );
-    // Get the upper-left 3x3 submatrix of a 4x4 matrix
-    // 
-    inline const Matrix3 getUpper3x3( ) const;
-    // Set translation component
-    // NOTE: 
-    // This function does not change the bottom row elements.
-    // 
-    inline Matrix4 & setTranslation( const Vector3 & translateVec );
-    // Get the translation component of a 4x4 matrix
-    // 
-    inline const Vector3 getTranslation( ) const;
-    // Set column 0 of a 4x4 matrix
-    // 
-    inline Matrix4 & setCol0( const Vector4 & col0 );
-    // Set column 1 of a 4x4 matrix
-    // 
-    inline Matrix4 & setCol1( const Vector4 & col1 );
-    // Set column 2 of a 4x4 matrix
-    // 
-    inline Matrix4 & setCol2( const Vector4 & col2 );
-    // Set column 3 of a 4x4 matrix
-    // 
-    inline Matrix4 & setCol3( const Vector4 & col3 );
-    // Get column 0 of a 4x4 matrix
-    // 
-    inline const Vector4 getCol0( ) const;
-    // Get column 1 of a 4x4 matrix
-    // 
-    inline const Vector4 getCol1( ) const;
-    // Get column 2 of a 4x4 matrix
-    // 
-    inline const Vector4 getCol2( ) const;
-    // Get column 3 of a 4x4 matrix
-    // 
-    inline const Vector4 getCol3( ) const;
-    // Set the column of a 4x4 matrix referred to by the specified index
-    // 
-    inline Matrix4 & setCol( int col, const Vector4 & vec );
-    // Set the row of a 4x4 matrix referred to by the specified index
-    // 
-    inline Matrix4 & setRow( int row, const Vector4 & vec );
-    // Get the column of a 4x4 matrix referred to by the specified index
-    // 
-    inline const Vector4 getCol( int col ) const;
-    // Get the row of a 4x4 matrix referred to by the specified index
-    // 
-    inline const Vector4 getRow( int row ) const;
-    // Subscripting operator to set or get a column
-    // 
-    inline Vector4 & operator []( int col );
-    // Subscripting operator to get a column
-    // 
-    inline const Vector4 operator []( int col ) const;
-    // Set the element of a 4x4 matrix referred to by column and row indices
-    // 
-    inline Matrix4 & setElem( int col, int row, float val );
-    // Get the element of a 4x4 matrix referred to by column and row indices
-    // 
-    inline float getElem( int col, int row ) const;
-    // Add two 4x4 matrices
-    // 
-    inline const Matrix4 operator +( const Matrix4 & mat ) const;
-    // Subtract a 4x4 matrix from another 4x4 matrix
-    // 
-    inline const Matrix4 operator -( const Matrix4 & mat ) const;
-    // Negate all elements of a 4x4 matrix
-    // 
-    inline const Matrix4 operator -( ) const;
-    // Multiply a 4x4 matrix by a scalar
-    // 
-    inline const Matrix4 operator *( float scalar ) const;
-    // Multiply a 4x4 matrix by a 4-D vector
-    // 
-    inline const Vector4 operator *( const Vector4 & vec ) const;
-    // Multiply a 4x4 matrix by a 3-D vector
-    // 
-    inline const Vector4 operator *( const Vector3 & vec ) const;
-    // Multiply a 4x4 matrix by a 3-D point
-    // 
-    inline const Vector4 operator *( const Point3 & pnt ) const;
-    // Multiply two 4x4 matrices
-    // 
-    inline const Matrix4 operator *( const Matrix4 & mat ) const;
-    // Multiply a 4x4 matrix by a 3x4 transformation matrix
-    // 
-    inline const Matrix4 operator *( const Transform3 & tfrm ) const;
-    // Perform compound assignment and addition with a 4x4 matrix
-    // 
-    inline Matrix4 & operator +=( const Matrix4 & mat );
-    // Perform compound assignment and subtraction by a 4x4 matrix
-    // 
-    inline Matrix4 & operator -=( const Matrix4 & mat );
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    inline Matrix4 & operator *=( float scalar );
-    // Perform compound assignment and multiplication by a 4x4 matrix
-    // 
-    inline Matrix4 & operator *=( const Matrix4 & mat );
-    // Perform compound assignment and multiplication by a 3x4 transformation matrix
-    // 
-    inline Matrix4 & operator *=( const Transform3 & tfrm );
-    // Construct an identity 4x4 matrix
-    // 
-    static inline const Matrix4 identity( );
-    // Construct a 4x4 matrix to rotate around the x axis
-    // 
-    static inline const Matrix4 rotationX( float radians );
-    // Construct a 4x4 matrix to rotate around the y axis
-    // 
-    static inline const Matrix4 rotationY( float radians );
-    // Construct a 4x4 matrix to rotate around the z axis
-    // 
-    static inline const Matrix4 rotationZ( float radians );
-    // Construct a 4x4 matrix to rotate around the x, y, and z axes
-    // 
-    static inline const Matrix4 rotationZYX( const Vector3 & radiansXYZ );
-    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
-    // 
-    static inline const Matrix4 rotation( float radians, const Vector3 & unitVec );
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static inline const Matrix4 rotation( const Quat & unitQuat );
-    // Construct a 4x4 matrix to perform scaling
-    // 
-    static inline const Matrix4 scale( const Vector3 & scaleVec );
-    // Construct a 4x4 matrix to perform translation
-    // 
-    static inline const Matrix4 translation( const Vector3 & translateVec );
-    // Construct viewing matrix based on eye position, position looked at, and up direction
-    // 
-    static inline const Matrix4 lookAt( const Point3 & eyePos, const Point3 & lookAtPos, const Vector3 & upVec );
-    // Construct a perspective projection matrix
-    // 
-    static inline const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
-    // Construct a perspective projection matrix based on frustum
-    // 
-    static inline const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
-    // Construct an orthographic projection matrix
-    // 
-    static inline const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
-// Multiply a 4x4 matrix by a scalar
-inline const Matrix4 operator *( float scalar, const Matrix4 & mat );
-// Append (post-multiply) a scale transformation to a 4x4 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 & scaleVec );
-// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-inline const Matrix4 prependScale( const Vector3 & scaleVec, const Matrix4 & mat );
-// Multiply two 4x4 matrices per element
-inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
-// Compute the absolute value of a 4x4 matrix per element
-inline const Matrix4 absPerElem( const Matrix4 & mat );
-// Transpose of a 4x4 matrix
-inline const Matrix4 transpose( const Matrix4 & mat );
-// Compute the inverse of a 4x4 matrix
-// NOTE: 
-// Result is unpredictable when the determinant of mat is equal to or near 0.
-inline const Matrix4 inverse( const Matrix4 & mat );
-// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
-inline const Matrix4 affineInverse( const Matrix4 & mat );
-// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
-inline const Matrix4 orthoInverse( const Matrix4 & mat );
-// Determinant of a 4x4 matrix
-inline float determinant( const Matrix4 & mat );
-// Conditionally select between two 4x4 matrices
-inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
-// Print a 4x4 matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Matrix4 & mat );
-// Print a 4x4 matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Matrix4 & mat, const char * name );
-// A 3x4 transformation matrix in array-of-structures format
-class Transform3
-    Vector3 mCol0;
-    Vector3 mCol1;
-    Vector3 mCol2;
-    Vector3 mCol3;
-    // Default constructor; does no initialization
-    // 
-    inline Transform3( ) { };
-    // Copy a 3x4 transformation matrix
-    // 
-    inline Transform3( const Transform3 & tfrm );
-    // Construct a 3x4 transformation matrix containing the specified columns
-    // 
-    inline Transform3( const Vector3 & col0, const Vector3 & col1, const Vector3 & col2, const Vector3 & col3 );
-    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
-    // 
-    inline Transform3( const Matrix3 & tfrm, const Vector3 & translateVec );
-    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
-    // 
-    inline Transform3( const Quat & unitQuat, const Vector3 & translateVec );
-    // Set all elements of a 3x4 transformation matrix to the same scalar value
-    // 
-    explicit inline Transform3( float scalar );
-    // Assign one 3x4 transformation matrix to another
-    // 
-    inline Transform3 & operator =( const Transform3 & tfrm );
-    // Set the upper-left 3x3 submatrix
-    // 
-    inline Transform3 & setUpper3x3( const Matrix3 & mat3 );
-    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
-    // 
-    inline const Matrix3 getUpper3x3( ) const;
-    // Set translation component
-    // 
-    inline Transform3 & setTranslation( const Vector3 & translateVec );
-    // Get the translation component of a 3x4 transformation matrix
-    // 
-    inline const Vector3 getTranslation( ) const;
-    // Set column 0 of a 3x4 transformation matrix
-    // 
-    inline Transform3 & setCol0( const Vector3 & col0 );
-    // Set column 1 of a 3x4 transformation matrix
-    // 
-    inline Transform3 & setCol1( const Vector3 & col1 );
-    // Set column 2 of a 3x4 transformation matrix
-    // 
-    inline Transform3 & setCol2( const Vector3 & col2 );
-    // Set column 3 of a 3x4 transformation matrix
-    // 
-    inline Transform3 & setCol3( const Vector3 & col3 );
-    // Get column 0 of a 3x4 transformation matrix
-    // 
-    inline const Vector3 getCol0( ) const;
-    // Get column 1 of a 3x4 transformation matrix
-    // 
-    inline const Vector3 getCol1( ) const;
-    // Get column 2 of a 3x4 transformation matrix
-    // 
-    inline const Vector3 getCol2( ) const;
-    // Get column 3 of a 3x4 transformation matrix
-    // 
-    inline const Vector3 getCol3( ) const;
-    // Set the column of a 3x4 transformation matrix referred to by the specified index
-    // 
-    inline Transform3 & setCol( int col, const Vector3 & vec );
-    // Set the row of a 3x4 transformation matrix referred to by the specified index
-    // 
-    inline Transform3 & setRow( int row, const Vector4 & vec );
-    // Get the column of a 3x4 transformation matrix referred to by the specified index
-    // 
-    inline const Vector3 getCol( int col ) const;
-    // Get the row of a 3x4 transformation matrix referred to by the specified index
-    // 
-    inline const Vector4 getRow( int row ) const;
-    // Subscripting operator to set or get a column
-    // 
-    inline Vector3 & operator []( int col );
-    // Subscripting operator to get a column
-    // 
-    inline const Vector3 operator []( int col ) const;
-    // Set the element of a 3x4 transformation matrix referred to by column and row indices
-    // 
-    inline Transform3 & setElem( int col, int row, float val );
-    // Get the element of a 3x4 transformation matrix referred to by column and row indices
-    // 
-    inline float getElem( int col, int row ) const;
-    // Multiply a 3x4 transformation matrix by a 3-D vector
-    // 
-    inline const Vector3 operator *( const Vector3 & vec ) const;
-    // Multiply a 3x4 transformation matrix by a 3-D point
-    // 
-    inline const Point3 operator *( const Point3 & pnt ) const;
-    // Multiply two 3x4 transformation matrices
-    // 
-    inline const Transform3 operator *( const Transform3 & tfrm ) const;
-    // Perform compound assignment and multiplication by a 3x4 transformation matrix
-    // 
-    inline Transform3 & operator *=( const Transform3 & tfrm );
-    // Construct an identity 3x4 transformation matrix
-    // 
-    static inline const Transform3 identity( );
-    // Construct a 3x4 transformation matrix to rotate around the x axis
-    // 
-    static inline const Transform3 rotationX( float radians );
-    // Construct a 3x4 transformation matrix to rotate around the y axis
-    // 
-    static inline const Transform3 rotationY( float radians );
-    // Construct a 3x4 transformation matrix to rotate around the z axis
-    // 
-    static inline const Transform3 rotationZ( float radians );
-    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
-    // 
-    static inline const Transform3 rotationZYX( const Vector3 & radiansXYZ );
-    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
-    // 
-    static inline const Transform3 rotation( float radians, const Vector3 & unitVec );
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static inline const Transform3 rotation( const Quat & unitQuat );
-    // Construct a 3x4 transformation matrix to perform scaling
-    // 
-    static inline const Transform3 scale( const Vector3 & scaleVec );
-    // Construct a 3x4 transformation matrix to perform translation
-    // 
-    static inline const Transform3 translation( const Vector3 & translateVec );
-// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 & scaleVec );
-// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-inline const Transform3 prependScale( const Vector3 & scaleVec, const Transform3 & tfrm );
-// Multiply two 3x4 transformation matrices per element
-inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
-// Compute the absolute value of a 3x4 transformation matrix per element
-inline const Transform3 absPerElem( const Transform3 & tfrm );
-// Inverse of a 3x4 transformation matrix
-// NOTE: 
-// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0.
-inline const Transform3 inverse( const Transform3 & tfrm );
-// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
-inline const Transform3 orthoInverse( const Transform3 & tfrm );
-// Conditionally select between two 3x4 transformation matrices
-inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
-// Print a 3x4 transformation matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Transform3 & tfrm );
-// Print a 3x4 transformation matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-inline void print( const Transform3 & tfrm, const char * name );
-} // namespace Aos
-} // namespace Vectormath
-#include "vec_aos.h"
-#include "quat_aos.h"
-#include "mat_aos.h"
diff --git a/src/bullet/vectormath/sse/boolInVec.h b/src/bullet/vectormath/sse/boolInVec.h
deleted file mode 100644
index d21d25cb..00000000
--- a/src/bullet/vectormath/sse/boolInVec.h
+++ /dev/null
@@ -1,247 +0,0 @@
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-#ifndef _BOOLINVEC_H
-#define _BOOLINVEC_H
-#include <math.h>
-namespace Vectormath {
-class floatInVec;
-// boolInVec class
-class boolInVec
-    private:
-        __m128 mData;
-        inline boolInVec(__m128 vec);
-    public:
-        inline boolInVec() {}
-        // matches standard type conversions
-        //
-        inline boolInVec(const floatInVec &vec);
-        // explicit cast from bool
-        //
-        explicit inline boolInVec(bool scalar);
-        // explicit cast to bool
-        // 
-        inline bool getAsBool() const;
-        // implicit cast to bool
-        // 
-        inline operator bool() const;
-        // get vector data
-        // bool value is splatted across all word slots of vector as 0 (false) or -1 (true)
-        //
-        inline __m128 get128() const;
-        // operators
-        //
-        inline const boolInVec operator ! () const;
-        inline boolInVec& operator = (const boolInVec &vec);
-        inline boolInVec& operator &= (const boolInVec &vec);
-        inline boolInVec& operator ^= (const boolInVec &vec);
-        inline boolInVec& operator |= (const boolInVec &vec);
-        // friend functions
-        //
-        friend inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
-        friend inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
-        friend inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
-        friend inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
-        friend inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
-        friend inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
-// boolInVec functions
-// operators
-inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
-inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
-inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
-inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
-inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
-// select between vec0 and vec1 using boolInVec.
-// false selects vec0, true selects vec1
-inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
-} // namespace Vectormath
-// boolInVec implementation
-#include "floatInVec.h"
-namespace Vectormath {
-boolInVec::boolInVec(__m128 vec)
-    mData = vec;
-boolInVec::boolInVec(const floatInVec &vec)
-    *this = (vec != floatInVec(0.0f));
-boolInVec::boolInVec(bool scalar)
-    unsigned int mask = -(int)scalar;
-	mData = _mm_set1_ps(*(float *)&mask); // TODO: Union
-boolInVec::getAsBool() const
-boolInVec::operator bool() const
-	return *(bool *)&mData;
-boolInVec::get128() const
-    return mData;
-const boolInVec
-boolInVec::operator ! () const
-    return boolInVec(_mm_andnot_ps(mData, _mm_cmpneq_ps(_mm_setzero_ps(),_mm_setzero_ps())));
-boolInVec::operator = (const boolInVec &vec)
-    mData = vec.mData;
-    return *this;
-boolInVec::operator &= (const boolInVec &vec)
-    *this = *this & vec;
-    return *this;
-boolInVec::operator ^= (const boolInVec &vec)
-    *this = *this ^ vec;
-    return *this;
-boolInVec::operator |= (const boolInVec &vec)
-    *this = *this | vec;
-    return *this;
-const boolInVec
-operator == (const boolInVec &vec0, const boolInVec &vec1)
-	return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
-const boolInVec
-operator != (const boolInVec &vec0, const boolInVec &vec1)
-	return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
-const boolInVec
-operator & (const boolInVec &vec0, const boolInVec &vec1)
-	return boolInVec(_mm_and_ps(vec0.get128(), vec1.get128()));
-const boolInVec
-operator | (const boolInVec &vec0, const boolInVec &vec1)
-	return boolInVec(_mm_or_ps(vec0.get128(), vec1.get128()));
-const boolInVec
-operator ^ (const boolInVec &vec0, const boolInVec &vec1)
-	return boolInVec(_mm_xor_ps(vec0.get128(), vec1.get128()));
-const boolInVec
-select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1)
-	return boolInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
-} // namespace Vectormath
-#endif // boolInVec_h
diff --git a/src/bullet/vectormath/sse/floatInVec.h b/src/bullet/vectormath/sse/floatInVec.h
deleted file mode 100644
index e8ac5959..00000000
--- a/src/bullet/vectormath/sse/floatInVec.h
+++ /dev/null
@@ -1,340 +0,0 @@
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-#ifndef _FLOATINVEC_H
-#define _FLOATINVEC_H
-#include <math.h>
-#include <xmmintrin.h>
-namespace Vectormath {
-class boolInVec;
-// floatInVec class
-class floatInVec
-    private:
-        __m128 mData;
-    public:
-        inline floatInVec(__m128 vec);
-        inline floatInVec() {}
-        // matches standard type conversions
-        //
-        inline floatInVec(const boolInVec &vec);
-        // construct from a slot of __m128
-        //
-        inline floatInVec(__m128 vec, int slot);
-        // explicit cast from float
-        //
-        explicit inline floatInVec(float scalar);
-        // explicit cast to float
-        // 
-        inline float getAsFloat() const;
-        // implicit cast to float
-        //
-        inline operator float() const;
-        // get vector data
-        // float value is splatted across all word slots of vector
-        //
-        inline __m128 get128() const;
-        // operators
-        // 
-        inline const floatInVec operator ++ (int);
-        inline const floatInVec operator -- (int);
-        inline floatInVec& operator ++ ();
-        inline floatInVec& operator -- ();
-        inline const floatInVec operator - () const;
-        inline floatInVec& operator = (const floatInVec &vec);
-        inline floatInVec& operator *= (const floatInVec &vec);
-        inline floatInVec& operator /= (const floatInVec &vec);
-        inline floatInVec& operator += (const floatInVec &vec);
-        inline floatInVec& operator -= (const floatInVec &vec);
-        // friend functions
-        //
-        friend inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, boolInVec select_vec1);
-// floatInVec functions
-// operators
-inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
-inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
-inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
-inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
-// select between vec0 and vec1 using boolInVec.
-// false selects vec0, true selects vec1
-inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1);
-} // namespace Vectormath
-// floatInVec implementation
-#include "boolInVec.h"
-namespace Vectormath {
-floatInVec::floatInVec(__m128 vec)
-    mData = vec;
-floatInVec::floatInVec(const boolInVec &vec)
-	mData = vec_sel(_mm_setzero_ps(), _mm_set1_ps(1.0f), vec.get128());
-floatInVec::floatInVec(__m128 vec, int slot)
-	SSEFloat v;
-	v.m128 = vec;
-	mData = _mm_set1_ps(v.f[slot]);
-floatInVec::floatInVec(float scalar)
-	mData = _mm_set1_ps(scalar);
-floatInVec::getAsFloat() const
-floatInVec::operator float() const
-    return *((float *)&mData);
-floatInVec::get128() const
-    return mData;
-const floatInVec
-floatInVec::operator ++ (int)
-    __m128 olddata = mData;
-    operator ++();
-    return floatInVec(olddata);
-const floatInVec
-floatInVec::operator -- (int)
-    __m128 olddata = mData;
-    operator --();
-    return floatInVec(olddata);
-floatInVec::operator ++ ()
-    *this += floatInVec(_mm_set1_ps(1.0f));
-    return *this;
-floatInVec::operator -- ()
-    *this -= floatInVec(_mm_set1_ps(1.0f));
-    return *this;
-const floatInVec
-floatInVec::operator - () const
-    return floatInVec(_mm_sub_ps(_mm_setzero_ps(), mData));
-floatInVec::operator = (const floatInVec &vec)
-    mData = vec.mData;
-    return *this;
-floatInVec::operator *= (const floatInVec &vec)
-    *this = *this * vec;
-    return *this;
-floatInVec::operator /= (const floatInVec &vec)
-    *this = *this / vec;
-    return *this;
-floatInVec::operator += (const floatInVec &vec)
-    *this = *this + vec;
-    return *this;
-floatInVec::operator -= (const floatInVec &vec)
-    *this = *this - vec;
-    return *this;
-const floatInVec
-operator * (const floatInVec &vec0, const floatInVec &vec1)
-    return floatInVec(_mm_mul_ps(vec0.get128(), vec1.get128()));
-const floatInVec
-operator / (const floatInVec &num, const floatInVec &den)
-    return floatInVec(_mm_div_ps(num.get128(), den.get128()));
-const floatInVec
-operator + (const floatInVec &vec0, const floatInVec &vec1)
-    return floatInVec(_mm_add_ps(vec0.get128(), vec1.get128()));
-const floatInVec
-operator - (const floatInVec &vec0, const floatInVec &vec1)
-    return floatInVec(_mm_sub_ps(vec0.get128(), vec1.get128()));
-const boolInVec
-operator < (const floatInVec &vec0, const floatInVec &vec1)
-    return boolInVec(_mm_cmpgt_ps(vec1.get128(), vec0.get128()));
-const boolInVec
-operator <= (const floatInVec &vec0, const floatInVec &vec1)
-    return boolInVec(_mm_cmpge_ps(vec1.get128(), vec0.get128()));
-const boolInVec
-operator > (const floatInVec &vec0, const floatInVec &vec1)
-    return boolInVec(_mm_cmpgt_ps(vec0.get128(), vec1.get128()));
-const boolInVec
-operator >= (const floatInVec &vec0, const floatInVec &vec1)
-    return boolInVec(_mm_cmpge_ps(vec0.get128(), vec1.get128()));
-const boolInVec
-operator == (const floatInVec &vec0, const floatInVec &vec1)
-    return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
-const boolInVec
-operator != (const floatInVec &vec0, const floatInVec &vec1)
-    return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
-const floatInVec
-select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1)
-    return floatInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
-} // namespace Vectormath
-#endif // floatInVec_h
diff --git a/src/bullet/vectormath/sse/mat_aos.h b/src/bullet/vectormath/sse/mat_aos.h
deleted file mode 100644
index a2c66cc5..00000000
--- a/src/bullet/vectormath/sse/mat_aos.h
+++ /dev/null
@@ -1,2190 +0,0 @@
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-namespace Vectormath {
-namespace Aos {
-// Constants
-// for shuffles, words are labeled [x,y,z,w] [a,b,c,d]
-#define _VECTORMATH_PI_OVER_2 1.570796327f
-// Definitions
-VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Matrix3 & mat )
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-VECTORMATH_FORCE_INLINE Matrix3::Matrix3( float scalar )
-    mCol0 = Vector3( scalar );
-    mCol1 = Vector3( scalar );
-    mCol2 = Vector3( scalar );
-VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const floatInVec &scalar )
-    mCol0 = Vector3( scalar );
-    mCol1 = Vector3( scalar );
-    mCol2 = Vector3( scalar );
-VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Quat &unitQuat )
-    __m128 xyzw_2, wwww, yzxw, zxyw, yzxw_2, zxyw_2;
-    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
-	VM_ATTRIBUTE_ALIGN16 unsigned int sx[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int sz[4] = {0, 0, 0xffffffff, 0};
-	__m128 select_x = _mm_load_ps((float *)sx);
-	__m128 select_z = _mm_load_ps((float *)sz);
-    xyzw_2 = _mm_add_ps( unitQuat.get128(), unitQuat.get128() );
-    wwww = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,3,3,3) );
-	yzxw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,0,2,1) );
-	zxyw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,1,0,2) );
-    yzxw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,0,2,1) );
-    zxyw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,1,0,2) );
-    tmp0 = _mm_mul_ps( yzxw_2, wwww );									// tmp0 = 2yw, 2zw, 2xw, 2w2
-	tmp1 = _mm_sub_ps( _mm_set1_ps(1.0f), _mm_mul_ps(yzxw, yzxw_2) );	// tmp1 = 1 - 2y2, 1 - 2z2, 1 - 2x2, 1 - 2w2
-    tmp2 = _mm_mul_ps( yzxw, xyzw_2 );									// tmp2 = 2xy, 2yz, 2xz, 2w2
-    tmp0 = _mm_add_ps( _mm_mul_ps(zxyw, xyzw_2), tmp0 );				// tmp0 = 2yw + 2zx, 2zw + 2xy, 2xw + 2yz, 2w2 + 2w2
-    tmp1 = _mm_sub_ps( tmp1, _mm_mul_ps(zxyw, zxyw_2) );				// tmp1 = 1 - 2y2 - 2z2, 1 - 2z2 - 2x2, 1 - 2x2 - 2y2, 1 - 2w2 - 2w2
-    tmp2 = _mm_sub_ps( tmp2, _mm_mul_ps(zxyw_2, wwww) );				// tmp2 = 2xy - 2zw, 2yz - 2xw, 2xz - 2yw, 2w2 -2w2
-    tmp3 = vec_sel( tmp0, tmp1, select_x );
-    tmp4 = vec_sel( tmp1, tmp2, select_x );
-    tmp5 = vec_sel( tmp2, tmp0, select_x );
-    mCol0 = Vector3( vec_sel( tmp3, tmp2, select_z ) );
-    mCol1 = Vector3( vec_sel( tmp4, tmp0, select_z ) );
-    mCol2 = Vector3( vec_sel( tmp5, tmp1, select_z ) );
-VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2 )
-    mCol0 = _col0;
-    mCol1 = _col1;
-    mCol2 = _col2;
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol0( const Vector3 &_col0 )
-    mCol0 = _col0;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol1( const Vector3 &_col1 )
-    mCol1 = _col1;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol2( const Vector3 &_col2 )
-    mCol2 = _col2;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol( int col, const Vector3 &vec )
-    *(&mCol0 + col) = vec;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setRow( int row, const Vector3 &vec )
-    mCol0.setElem( row, vec.getElem( 0 ) );
-    mCol1.setElem( row, vec.getElem( 1 ) );
-    mCol2.setElem( row, vec.getElem( 2 ) );
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setElem( int col, int row, float val )
-    (*this)[col].setElem(row, val);
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setElem( int col, int row, const floatInVec &val )
-    Vector3 tmpV3_0;
-    tmpV3_0 = this->getCol( col );
-    tmpV3_0.setElem( row, val );
-    this->setCol( col, tmpV3_0 );
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Matrix3::getElem( int col, int row ) const
-    return this->getCol( col ).getElem( row );
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol0( ) const
-    return mCol0;
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol1( ) const
-    return mCol1;
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol2( ) const
-    return mCol2;
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol( int col ) const
-    return *(&mCol0 + col);
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getRow( int row ) const
-    return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) );
-VECTORMATH_FORCE_INLINE Vector3 & Matrix3::operator []( int col )
-    return *(&mCol0 + col);
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::operator []( int col ) const
-    return *(&mCol0 + col);
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator =( const Matrix3 & mat )
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix3 transpose( const Matrix3 & mat )
-    __m128 tmp0, tmp1, res0, res1, res2;
-    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
-    tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
-    res0 = vec_mergeh( tmp0, mat.getCol1().get128() );
-    //res1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	res1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
-	res1 = vec_sel(res1, mat.getCol1().get128(), select_y);
-    //res2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX );
-	res2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
-	res2 = vec_sel(res2, vec_splat(mat.getCol1().get128(), 2), select_y);
-    return Matrix3(
-        Vector3( res0 ),
-        Vector3( res1 ),
-        Vector3( res2 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 inverse( const Matrix3 & mat )
-    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet, inv0, inv1, inv2;
-    tmp2 = _vmathVfCross( mat.getCol0().get128(), mat.getCol1().get128() );
-    tmp0 = _vmathVfCross( mat.getCol1().get128(), mat.getCol2().get128() );
-    tmp1 = _vmathVfCross( mat.getCol2().get128(), mat.getCol0().get128() );
-    dot = _vmathVfDot3( tmp2, mat.getCol2().get128() );
-    dot = vec_splat( dot, 0 );
-    invdet = recipf4( dot );
-    tmp3 = vec_mergeh( tmp0, tmp2 );
-    tmp4 = vec_mergel( tmp0, tmp2 );
-    inv0 = vec_mergeh( tmp3, tmp1 );
-    //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2));
-	inv1 = vec_sel(inv1, tmp1, select_y);
-    //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX );
-	inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0));
-	inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y);
-    inv0 = vec_mul( inv0, invdet );
-    inv1 = vec_mul( inv1, invdet );
-	inv2 = vec_mul( inv2, invdet );
-    return Matrix3(
-        Vector3( inv0 ),
-        Vector3( inv1 ),
-        Vector3( inv2 )
-    );
-VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix3 & mat )
-    return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
-    return Matrix3(
-        ( mCol0 + mat.mCol0 ),
-        ( mCol1 + mat.mCol1 ),
-        ( mCol2 + mat.mCol2 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
-    return Matrix3(
-        ( mCol0 - mat.mCol0 ),
-        ( mCol1 - mat.mCol1 ),
-        ( mCol2 - mat.mCol2 )
-    );
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
-    *this = *this + mat;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
-    *this = *this - mat;
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator -( ) const
-    return Matrix3(
-        ( -mCol0 ),
-        ( -mCol1 ),
-        ( -mCol2 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 absPerElem( const Matrix3 & mat )
-    return Matrix3(
-        absPerElem( mat.getCol0() ),
-        absPerElem( mat.getCol1() ),
-        absPerElem( mat.getCol2() )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( float scalar ) const
-    return *this * floatInVec(scalar);
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( const floatInVec &scalar ) const
-    return Matrix3(
-        ( mCol0 * scalar ),
-        ( mCol1 * scalar ),
-        ( mCol2 * scalar )
-    );
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( float scalar )
-    return *this *= floatInVec(scalar);
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( const floatInVec &scalar )
-    *this = *this * scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar, const Matrix3 & mat )
-    return floatInVec(scalar) * mat;
-VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat )
-    return mat * scalar;
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::operator *( const Vector3 &vec ) const
-    __m128 res;
-    __m128 xxxx, yyyy, zzzz;
-    xxxx = vec_splat( vec.get128(), 0 );
-    yyyy = vec_splat( vec.get128(), 1 );
-    zzzz = vec_splat( vec.get128(), 2 );
-    res = vec_mul( mCol0.get128(), xxxx );
-    res = vec_madd( mCol1.get128(), yyyy, res );
-    res = vec_madd( mCol2.get128(), zzzz, res );
-    return Vector3( res );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
-    return Matrix3(
-        ( *this * mat.mCol0 ),
-        ( *this * mat.mCol1 ),
-        ( *this * mat.mCol2 )
-    );
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
-    *this = *this * mat;
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
-    return Matrix3(
-        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
-        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
-        mulPerElem( mat0.getCol2(), mat1.getCol2() )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::identity( )
-    return Matrix3(
-        Vector3::xAxis( ),
-        Vector3::yAxis( ),
-        Vector3::zAxis( )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationX( float radians )
-    return rotationX( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationX( const floatInVec &radians )
-    __m128 s, c, res1, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res1 = vec_sel( zero, c, select_y );
-    res1 = vec_sel( res1, s, select_z );
-    res2 = vec_sel( zero, negatef4(s), select_y );
-    res2 = vec_sel( res2, c, select_z );
-    return Matrix3(
-        Vector3::xAxis( ),
-        Vector3( res1 ),
-        Vector3( res2 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationY( float radians )
-    return rotationY( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationY( const floatInVec &radians )
-    __m128 s, c, res0, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, negatef4(s), select_z );
-    res2 = vec_sel( zero, s, select_x );
-    res2 = vec_sel( res2, c, select_z );
-    return Matrix3(
-        Vector3( res0 ),
-        Vector3::yAxis( ),
-        Vector3( res2 )
-	);
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZ( float radians )
-    return rotationZ( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZ( const floatInVec &radians )
-    __m128 s, c, res0, res1;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, s, select_y );
-    res1 = vec_sel( zero, negatef4(s), select_x );
-    res1 = vec_sel( res1, c, select_y );
-    return Matrix3(
-        Vector3( res0 ),
-        Vector3( res1 ),
-        Vector3::zAxis( )
-	);
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZYX( const Vector3 &radiansXYZ )
-    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
-    angles = Vector4( radiansXYZ, 0.0f ).get128();
-    sincosf4( angles, &s, &c );
-    negS = negatef4( s );
-    Z0 = vec_mergel( c, s );
-    Z1 = vec_mergel( negS, c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
-    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
-	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
-	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
-    X0 = vec_splat( s, 0 );
-    X1 = vec_splat( c, 0 );
-    tmp = vec_mul( Z0, Y1 );
-    return Matrix3(
-        Vector3( vec_mul( Z0, Y0 ) ),
-        Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
-        Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( float radians, const Vector3 &unitVec )
-    return rotation( floatInVec(radians), unitVec );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( const floatInVec &radians, const Vector3 &unitVec )
-    __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
-    axis = unitVec.get128();
-    sincosf4( radians.get128(), &s, &c );
-    xxxx = vec_splat( axis, 0 );
-    yyyy = vec_splat( axis, 1 );
-    zzzz = vec_splat( axis, 2 );
-    oneMinusC = vec_sub( _mm_set1_ps(1.0f), c );
-    axisS = vec_mul( axis, s );
-    negAxisS = negatef4( axisS );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX );
-	tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) );
-	tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z);
-    //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX );
-	tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x );
-    //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX );
-	tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) );
-	tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y);
-    tmp0 = vec_sel( tmp0, c, select_x );
-    tmp1 = vec_sel( tmp1, c, select_y );
-    tmp2 = vec_sel( tmp2, c, select_z );
-    return Matrix3(
-        Vector3( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ),
-        Vector3( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ),
-        Vector3( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( const Quat &unitQuat )
-    return Matrix3( unitQuat );
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::scale( const Vector3 &scaleVec )
-    __m128 zero = _mm_setzero_ps();
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    return Matrix3(
-        Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ),
-        Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ),
-        Vector3( vec_sel( zero, scaleVec.get128(), select_z ) )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec )
-    return Matrix3(
-        ( mat.getCol0() * scaleVec.getX( ) ),
-        ( mat.getCol1() * scaleVec.getY( ) ),
-        ( mat.getCol2() * scaleVec.getZ( ) )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat )
-    return Matrix3(
-        mulPerElem( mat.getCol0(), scaleVec ),
-        mulPerElem( mat.getCol1(), scaleVec ),
-        mulPerElem( mat.getCol2(), scaleVec )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
-    return Matrix3(
-        select( mat0.getCol0(), mat1.getCol0(), select1 ),
-        select( mat0.getCol1(), mat1.getCol1(), select1 ),
-        select( mat0.getCol2(), mat1.getCol2(), select1 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 )
-    return Matrix3(
-        select( mat0.getCol0(), mat1.getCol0(), select1 ),
-        select( mat0.getCol1(), mat1.getCol1(), select1 ),
-        select( mat0.getCol2(), mat1.getCol2(), select1 )
-    );
-VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat )
-    print( mat.getRow( 0 ) );
-    print( mat.getRow( 1 ) );
-    print( mat.getRow( 2 ) );
-VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat, const char * name )
-    printf("%s:\n", name);
-    print( mat );
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Matrix4 & mat )
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-    mCol3 = mat.mCol3;
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( float scalar )
-    mCol0 = Vector4( scalar );
-    mCol1 = Vector4( scalar );
-    mCol2 = Vector4( scalar );
-    mCol3 = Vector4( scalar );
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const floatInVec &scalar )
-    mCol0 = Vector4( scalar );
-    mCol1 = Vector4( scalar );
-    mCol2 = Vector4( scalar );
-    mCol3 = Vector4( scalar );
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Transform3 & mat )
-    mCol0 = Vector4( mat.getCol0(), 0.0f );
-    mCol1 = Vector4( mat.getCol1(), 0.0f );
-    mCol2 = Vector4( mat.getCol2(), 0.0f );
-    mCol3 = Vector4( mat.getCol3(), 1.0f );
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Vector4 &_col0, const Vector4 &_col1, const Vector4 &_col2, const Vector4 &_col3 )
-    mCol0 = _col0;
-    mCol1 = _col1;
-    mCol2 = _col2;
-    mCol3 = _col3;
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Matrix3 & mat, const Vector3 &translateVec )
-    mCol0 = Vector4( mat.getCol0(), 0.0f );
-    mCol1 = Vector4( mat.getCol1(), 0.0f );
-    mCol2 = Vector4( mat.getCol2(), 0.0f );
-    mCol3 = Vector4( translateVec, 1.0f );
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Quat &unitQuat, const Vector3 &translateVec )
-    Matrix3 mat;
-    mat = Matrix3( unitQuat );
-    mCol0 = Vector4( mat.getCol0(), 0.0f );
-    mCol1 = Vector4( mat.getCol1(), 0.0f );
-    mCol2 = Vector4( mat.getCol2(), 0.0f );
-    mCol3 = Vector4( translateVec, 1.0f );
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol0( const Vector4 &_col0 )
-    mCol0 = _col0;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol1( const Vector4 &_col1 )
-    mCol1 = _col1;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol2( const Vector4 &_col2 )
-    mCol2 = _col2;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol3( const Vector4 &_col3 )
-    mCol3 = _col3;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol( int col, const Vector4 &vec )
-    *(&mCol0 + col) = vec;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setRow( int row, const Vector4 &vec )
-    mCol0.setElem( row, vec.getElem( 0 ) );
-    mCol1.setElem( row, vec.getElem( 1 ) );
-    mCol2.setElem( row, vec.getElem( 2 ) );
-    mCol3.setElem( row, vec.getElem( 3 ) );
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setElem( int col, int row, float val )
-    (*this)[col].setElem(row, val);
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setElem( int col, int row, const floatInVec &val )
-    Vector4 tmpV3_0;
-    tmpV3_0 = this->getCol( col );
-    tmpV3_0.setElem( row, val );
-    this->setCol( col, tmpV3_0 );
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Matrix4::getElem( int col, int row ) const
-    return this->getCol( col ).getElem( row );
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol0( ) const
-    return mCol0;
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol1( ) const
-    return mCol1;
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol2( ) const
-    return mCol2;
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol3( ) const
-    return mCol3;
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol( int col ) const
-    return *(&mCol0 + col);
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getRow( int row ) const
-    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
-VECTORMATH_FORCE_INLINE Vector4 & Matrix4::operator []( int col )
-    return *(&mCol0 + col);
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator []( int col ) const
-    return *(&mCol0 + col);
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator =( const Matrix4 & mat )
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-    mCol3 = mat.mCol3;
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix4 transpose( const Matrix4 & mat )
-    __m128 tmp0, tmp1, tmp2, tmp3, res0, res1, res2, res3;
-    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
-    tmp1 = vec_mergeh( mat.getCol1().get128(), mat.getCol3().get128() );
-    tmp2 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
-    tmp3 = vec_mergel( mat.getCol1().get128(), mat.getCol3().get128() );
-    res0 = vec_mergeh( tmp0, tmp1 );
-    res1 = vec_mergel( tmp0, tmp1 );
-    res2 = vec_mergeh( tmp2, tmp3 );
-    res3 = vec_mergel( tmp2, tmp3 );
-    return Matrix4(
-        Vector4( res0 ),
-        Vector4( res1 ),
-        Vector4( res2 ),
-        Vector4( res3 )
-    );
-// TODO: Tidy
-static VM_ATTRIBUTE_ALIGN16 const unsigned int _vmathPNPN[4] = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-static VM_ATTRIBUTE_ALIGN16 const unsigned int _vmathNPNP[4] = {0x80000000, 0x00000000, 0x80000000, 0x00000000};
-static VM_ATTRIBUTE_ALIGN16 const float _vmathZERONE[4] = {1.0f, 0.0f, 0.0f, 1.0f};
-VECTORMATH_FORCE_INLINE const Matrix4 inverse( const Matrix4 & mat )
-	__m128 Va,Vb,Vc;
-	__m128 r1,r2,r3,tt,tt2;
-	__m128 sum,Det,RDet;
-	__m128 trns0,trns1,trns2,trns3;
-	__m128 _L1 = mat.getCol0().get128();
-	__m128 _L2 = mat.getCol1().get128();
-	__m128 _L3 = mat.getCol2().get128();
-	__m128 _L4 = mat.getCol3().get128();
-	// Calculating the minterms for the first line.
-	// _mm_ror_ps is just a macro using _mm_shuffle_ps().
-	tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
-	Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3'dot V4
-	Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3'dot V4"
-	Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3' dot V4^
-	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3" dot V4^ - V3^ dot V4"
-	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^ dot V4' - V3' dot V4^
-	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3' dot V4" - V3" dot V4'
-	tt = _L2;
-	Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
-	Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
-	Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
-	// Calculating the determinant.
-	Det = _mm_mul_ps(sum,_L1);
-	Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
-	const __m128 Sign_PNPN = _mm_load_ps((float *)_vmathPNPN);
-	const __m128 Sign_NPNP = _mm_load_ps((float *)_vmathNPNP);
-	__m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN);
-	// Calculating the minterms of the second line (using previous results).
-	tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
-	__m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP);
-	// Testing the determinant.
-	Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
-	// Calculating the minterms of the third line.
-	tt = _mm_ror_ps(_L1,1);
-	Va = _mm_mul_ps(tt,Vb);									// V1' dot V2"
-	Vb = _mm_mul_ps(tt,Vc);									// V1' dot V2^
-	Vc = _mm_mul_ps(tt,_L2);								// V1' dot V2
-	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V1" dot V2^ - V1^ dot V2"
-	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V1^ dot V2' - V1' dot V2^
-	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V1' dot V2" - V1" dot V2'
-	tt = _mm_ror_ps(_L4,1);		sum = _mm_mul_ps(tt,r1);
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
-	__m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN);
-	// Dividing is FASTER than rcp_nr! (Because rcp_nr causes many register-memory RWs).
-	RDet = _mm_div_ss(_mm_load_ss((float *)&_vmathZERONE), Det); // TODO: just 1.0f?
-	RDet = _mm_shuffle_ps(RDet,RDet,0x00);
-	// Devide the first 12 minterms with the determinant.
-	mtL1 = _mm_mul_ps(mtL1, RDet);
-	mtL2 = _mm_mul_ps(mtL2, RDet);
-	mtL3 = _mm_mul_ps(mtL3, RDet);
-	// Calculate the minterms of the forth line and devide by the determinant.
-	tt = _mm_ror_ps(_L3,1);		sum = _mm_mul_ps(tt,r1);
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
-	__m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP);
-	mtL4 = _mm_mul_ps(mtL4, RDet);
-	// Now we just have to transpose the minterms matrix.
-	trns0 = _mm_unpacklo_ps(mtL1,mtL2);
-	trns1 = _mm_unpacklo_ps(mtL3,mtL4);
-	trns2 = _mm_unpackhi_ps(mtL1,mtL2);
-	trns3 = _mm_unpackhi_ps(mtL3,mtL4);
-	_L1 = _mm_movelh_ps(trns0,trns1);
-	_L2 = _mm_movehl_ps(trns1,trns0);
-	_L3 = _mm_movelh_ps(trns2,trns3);
-	_L4 = _mm_movehl_ps(trns3,trns2);
-    return Matrix4(
-        Vector4( _L1 ),
-        Vector4( _L2 ),
-        Vector4( _L3 ),
-        Vector4( _L4 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 affineInverse( const Matrix4 & mat )
-    Transform3 affineMat;
-    affineMat.setCol0( mat.getCol0().getXYZ( ) );
-    affineMat.setCol1( mat.getCol1().getXYZ( ) );
-    affineMat.setCol2( mat.getCol2().getXYZ( ) );
-    affineMat.setCol3( mat.getCol3().getXYZ( ) );
-    return Matrix4( inverse( affineMat ) );
-VECTORMATH_FORCE_INLINE const Matrix4 orthoInverse( const Matrix4 & mat )
-    Transform3 affineMat;
-    affineMat.setCol0( mat.getCol0().getXYZ( ) );
-    affineMat.setCol1( mat.getCol1().getXYZ( ) );
-    affineMat.setCol2( mat.getCol2().getXYZ( ) );
-    affineMat.setCol3( mat.getCol3().getXYZ( ) );
-    return Matrix4( orthoInverse( affineMat ) );
-VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix4 & mat )
-	__m128 Va,Vb,Vc;
-	__m128 r1,r2,r3,tt,tt2;
-	__m128 sum,Det;
-	__m128 _L1 = mat.getCol0().get128();
-	__m128 _L2 = mat.getCol1().get128();
-	__m128 _L3 = mat.getCol2().get128();
-	__m128 _L4 = mat.getCol3().get128();
-	// Calculating the minterms for the first line.
-	// _mm_ror_ps is just a macro using _mm_shuffle_ps().
-	tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
-	Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3' dot V4
-	Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3' dot V4"
-	Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3' dot V4^
-	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3" dot V4^ - V3^ dot V4"
-	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^ dot V4' - V3' dot V4^
-	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3' dot V4" - V3" dot V4'
-	tt = _L2;
-	Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
-	Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
-	Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
-	// Calculating the determinant.
-	Det = _mm_mul_ps(sum,_L1);
-	Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
-	// Calculating the minterms of the second line (using previous results).
-	tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
-	// Testing the determinant.
-	Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
-	return floatInVec(Det, 0);
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
-    return Matrix4(
-        ( mCol0 + mat.mCol0 ),
-        ( mCol1 + mat.mCol1 ),
-        ( mCol2 + mat.mCol2 ),
-        ( mCol3 + mat.mCol3 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
-    return Matrix4(
-        ( mCol0 - mat.mCol0 ),
-        ( mCol1 - mat.mCol1 ),
-        ( mCol2 - mat.mCol2 ),
-        ( mCol3 - mat.mCol3 )
-    );
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
-    *this = *this + mat;
-    return *this;
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
-    *this = *this - mat;
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator -( ) const
-    return Matrix4(
-        ( -mCol0 ),
-        ( -mCol1 ),
-        ( -mCol2 ),
-        ( -mCol3 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 absPerElem( const Matrix4 & mat )
-    return Matrix4(
-        absPerElem( mat.getCol0() ),
-        absPerElem( mat.getCol1() ),
-        absPerElem( mat.getCol2() ),
-        absPerElem( mat.getCol3() )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( float scalar ) const
-    return *this * floatInVec(scalar);
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const floatInVec &scalar ) const
-    return Matrix4(
-        ( mCol0 * scalar ),
-        ( mCol1 * scalar ),
-        ( mCol2 * scalar ),
-        ( mCol3 * scalar )
-    );
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( float scalar )
-    return *this *= floatInVec(scalar);
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const floatInVec &scalar )
-    *this = *this * scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar, const Matrix4 & mat )
-    return floatInVec(scalar) * mat;
-VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat )
-    return mat * scalar;
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Vector4 &vec ) const
-    return Vector4(
-		_mm_add_ps(
-			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))),
-			_mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))), _mm_mul_ps(mCol3.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(3,3,3,3)))))
-		);
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Vector3 &vec ) const
-    return Vector4(
-		_mm_add_ps(
-			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))),
-			_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))))
-		);
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Point3 &pnt ) const
-    return Vector4(
-		_mm_add_ps(
-			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(1,1,1,1)))),
-			_mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(2,2,2,2))), mCol3.get128()))
-		);
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
-    return Matrix4(
-        ( *this * mat.mCol0 ),
-        ( *this * mat.mCol1 ),
-        ( *this * mat.mCol2 ),
-        ( *this * mat.mCol3 )
-    );
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
-    *this = *this * mat;
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
-    return Matrix4(
-        ( *this * tfrm.getCol0() ),
-        ( *this * tfrm.getCol1() ),
-        ( *this * tfrm.getCol2() ),
-        ( *this * Point3( tfrm.getCol3() ) )
-    );
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
-    *this = *this * tfrm;
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
-    return Matrix4(
-        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
-        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
-        mulPerElem( mat0.getCol2(), mat1.getCol2() ),
-        mulPerElem( mat0.getCol3(), mat1.getCol3() )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::identity( )
-    return Matrix4(
-        Vector4::xAxis( ),
-        Vector4::yAxis( ),
-        Vector4::zAxis( ),
-        Vector4::wAxis( )
-    );
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
-    mCol0.setXYZ( mat3.getCol0() );
-    mCol1.setXYZ( mat3.getCol1() );
-    mCol2.setXYZ( mat3.getCol2() );
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix4::getUpper3x3( ) const
-    return Matrix3(
-        mCol0.getXYZ( ),
-        mCol1.getXYZ( ),
-        mCol2.getXYZ( )
-    );
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setTranslation( const Vector3 &translateVec )
-    mCol3.setXYZ( translateVec );
-    return *this;
-VECTORMATH_FORCE_INLINE const Vector3 Matrix4::getTranslation( ) const
-    return mCol3.getXYZ( );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationX( float radians )
-    return rotationX( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationX( const floatInVec &radians )
-    __m128 s, c, res1, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res1 = vec_sel( zero, c, select_y );
-    res1 = vec_sel( res1, s, select_z );
-    res2 = vec_sel( zero, negatef4(s), select_y );
-    res2 = vec_sel( res2, c, select_z );
-    return Matrix4(
-        Vector4::xAxis( ),
-        Vector4( res1 ),
-        Vector4( res2 ),
-        Vector4::wAxis( )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationY( float radians )
-    return rotationY( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationY( const floatInVec &radians )
-    __m128 s, c, res0, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, negatef4(s), select_z );
-    res2 = vec_sel( zero, s, select_x );
-    res2 = vec_sel( res2, c, select_z );
-    return Matrix4(
-        Vector4( res0 ),
-        Vector4::yAxis( ),
-        Vector4( res2 ),
-        Vector4::wAxis( )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZ( float radians )
-    return rotationZ( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZ( const floatInVec &radians )
-    __m128 s, c, res0, res1;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, s, select_y );
-    res1 = vec_sel( zero, negatef4(s), select_x );
-    res1 = vec_sel( res1, c, select_y );
-    return Matrix4(
-        Vector4( res0 ),
-        Vector4( res1 ),
-        Vector4::zAxis( ),
-        Vector4::wAxis( )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZYX( const Vector3 &radiansXYZ )
-    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
-    angles = Vector4( radiansXYZ, 0.0f ).get128();
-    sincosf4( angles, &s, &c );
-    negS = negatef4( s );
-    Z0 = vec_mergel( c, s );
-    Z1 = vec_mergel( negS, c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
-    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
-	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
-	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
-    X0 = vec_splat( s, 0 );
-    X1 = vec_splat( c, 0 );
-    tmp = vec_mul( Z0, Y1 );
-    return Matrix4(
-        Vector4( vec_mul( Z0, Y0 ) ),
-        Vector4( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
-        Vector4( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ),
-        Vector4::wAxis( )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( float radians, const Vector3 &unitVec )
-    return rotation( floatInVec(radians), unitVec );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( const floatInVec &radians, const Vector3 &unitVec )
-    __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
-    axis = unitVec.get128();
-    sincosf4( radians.get128(), &s, &c );
-    xxxx = vec_splat( axis, 0 );
-    yyyy = vec_splat( axis, 1 );
-    zzzz = vec_splat( axis, 2 );
-    oneMinusC = vec_sub( _mm_set1_ps(1.0f), c );
-    axisS = vec_mul( axis, s );
-    negAxisS = negatef4( axisS );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX );
-	tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) );
-	tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z);
-    //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX );
-	tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x );
-    //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX );
-	tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) );
-	tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y);
-    tmp0 = vec_sel( tmp0, c, select_x );
-    tmp1 = vec_sel( tmp1, c, select_y );
-    tmp2 = vec_sel( tmp2, c, select_z );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
-    axis = vec_and( axis, _mm_load_ps( (float *)select_xyz ) );
-    tmp0 = vec_and( tmp0, _mm_load_ps( (float *)select_xyz ) );
-    tmp1 = vec_and( tmp1, _mm_load_ps( (float *)select_xyz ) );
-    tmp2 = vec_and( tmp2, _mm_load_ps( (float *)select_xyz ) );
-    return Matrix4(
-        Vector4( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ),
-        Vector4( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ),
-        Vector4( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) ),
-        Vector4::wAxis( )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( const Quat &unitQuat )
-    return Matrix4( Transform3::rotation( unitQuat ) );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::scale( const Vector3 &scaleVec )
-    __m128 zero = _mm_setzero_ps();
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    return Matrix4(
-        Vector4( vec_sel( zero, scaleVec.get128(), select_x ) ),
-        Vector4( vec_sel( zero, scaleVec.get128(), select_y ) ),
-        Vector4( vec_sel( zero, scaleVec.get128(), select_z ) ),
-        Vector4::wAxis( )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec )
-    return Matrix4(
-        ( mat.getCol0() * scaleVec.getX( ) ),
-        ( mat.getCol1() * scaleVec.getY( ) ),
-        ( mat.getCol2() * scaleVec.getZ( ) ),
-        mat.getCol3()
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat )
-    Vector4 scale4;
-    scale4 = Vector4( scaleVec, 1.0f );
-    return Matrix4(
-        mulPerElem( mat.getCol0(), scale4 ),
-        mulPerElem( mat.getCol1(), scale4 ),
-        mulPerElem( mat.getCol2(), scale4 ),
-        mulPerElem( mat.getCol3(), scale4 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::translation( const Vector3 &translateVec )
-    return Matrix4(
-        Vector4::xAxis( ),
-        Vector4::yAxis( ),
-        Vector4::zAxis( ),
-        Vector4( translateVec, 1.0f )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec )
-    Matrix4 m4EyeFrame;
-    Vector3 v3X, v3Y, v3Z;
-    v3Y = normalize( upVec );
-    v3Z = normalize( ( eyePos - lookAtPos ) );
-    v3X = normalize( cross( v3Y, v3Z ) );
-    v3Y = cross( v3Z, v3X );
-    m4EyeFrame = Matrix4( Vector4( v3X ), Vector4( v3Y ), Vector4( v3Z ), Vector4( eyePos ) );
-    return orthoInverse( m4EyeFrame );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
-    float f, rangeInv;
-    __m128 zero, col0, col1, col2, col3;
-    union { __m128 v; float s[4]; } tmp;
-    f = tanf( _VECTORMATH_PI_OVER_2 - fovyRadians * 0.5f );
-    rangeInv = 1.0f / ( zNear - zFar );
-    zero = _mm_setzero_ps();
-    tmp.v = zero;
-    tmp.s[0] = f / aspect;
-    col0 = tmp.v;
-    tmp.v = zero;
-    tmp.s[1] = f;
-    col1 = tmp.v;
-    tmp.v = zero;
-    tmp.s[2] = ( zNear + zFar ) * rangeInv;
-    tmp.s[3] = -1.0f;
-    col2 = tmp.v;
-    tmp.v = zero;
-    tmp.s[2] = zNear * zFar * rangeInv * 2.0f;
-    col3 = tmp.v;
-    return Matrix4(
-        Vector4( col0 ),
-        Vector4( col1 ),
-        Vector4( col2 ),
-        Vector4( col3 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
-    /* function implementation based on code from STIDC SDK:           */
-    /* --------------------------------------------------------------  */
-    /* PLEASE DO NOT MODIFY THIS SECTION                               */
-    /* This prolog section is automatically generated.                 */
-    /*                                                                 */
-    /* (C)Copyright                                                    */
-    /* Sony Computer Entertainment, Inc.,                              */
-    /* Toshiba Corporation,                                            */
-    /* International Business Machines Corporation,                    */
-    /* 2001,2002.                                                      */
-    /* S/T/I Confidential Information                                  */
-    /* --------------------------------------------------------------  */
-    __m128 lbf, rtn;
-    __m128 diff, sum, inv_diff;
-    __m128 diagonal, column, near2;
-    __m128 zero = _mm_setzero_ps();
-    union { __m128 v; float s[4]; } l, f, r, n, b, t; // TODO: Union?
-    l.s[0] = left;
-    f.s[0] = zFar;
-    r.s[0] = right;
-    n.s[0] = zNear;
-    b.s[0] = bottom;
-    t.s[0] = top;
-    lbf = vec_mergeh( l.v, f.v );
-    rtn = vec_mergeh( r.v, n.v );
-    lbf = vec_mergeh( lbf, b.v );
-    rtn = vec_mergeh( rtn, t.v );
-    diff = vec_sub( rtn, lbf );
-    sum  = vec_add( rtn, lbf );
-    inv_diff = recipf4( diff );
-    near2 = vec_splat( n.v, 0 );
-    near2 = vec_add( near2, near2 );
-    diagonal = vec_mul( near2, inv_diff );
-    column = vec_mul( sum, inv_diff );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
-    return Matrix4(
-        Vector4( vec_sel( zero, diagonal, select_x ) ),
-        Vector4( vec_sel( zero, diagonal, select_y ) ),
-        Vector4( vec_sel( column, _mm_set1_ps(-1.0f), select_w ) ),
-        Vector4( vec_sel( zero, vec_mul( diagonal, vec_splat( f.v, 0 ) ), select_z ) )
-	);
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
-    /* function implementation based on code from STIDC SDK:           */
-    /* --------------------------------------------------------------  */
-    /* PLEASE DO NOT MODIFY THIS SECTION                               */
-    /* This prolog section is automatically generated.                 */
-    /*                                                                 */
-    /* (C)Copyright                                                    */
-    /* Sony Computer Entertainment, Inc.,                              */
-    /* Toshiba Corporation,                                            */
-    /* International Business Machines Corporation,                    */
-    /* 2001,2002.                                                      */
-    /* S/T/I Confidential Information                                  */
-    /* --------------------------------------------------------------  */
-    __m128 lbf, rtn;
-    __m128 diff, sum, inv_diff, neg_inv_diff;
-    __m128 diagonal, column;
-    __m128 zero = _mm_setzero_ps();
-    union { __m128 v; float s[4]; } l, f, r, n, b, t;
-    l.s[0] = left;
-    f.s[0] = zFar;
-    r.s[0] = right;
-    n.s[0] = zNear;
-    b.s[0] = bottom;
-    t.s[0] = top;
-    lbf = vec_mergeh( l.v, f.v );
-    rtn = vec_mergeh( r.v, n.v );
-    lbf = vec_mergeh( lbf, b.v );
-    rtn = vec_mergeh( rtn, t.v );
-    diff = vec_sub( rtn, lbf );
-    sum  = vec_add( rtn, lbf );
-    inv_diff = recipf4( diff );
-    neg_inv_diff = negatef4( inv_diff );
-    diagonal = vec_add( inv_diff, inv_diff );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
-    column = vec_mul( sum, vec_sel( neg_inv_diff, inv_diff, select_z ) ); // TODO: no madds with zero
-    return Matrix4(
-        Vector4( vec_sel( zero, diagonal, select_x ) ),
-        Vector4( vec_sel( zero, diagonal, select_y ) ),
-        Vector4( vec_sel( zero, diagonal, select_z ) ),
-        Vector4( vec_sel( column, _mm_set1_ps(1.0f), select_w ) )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
-    return Matrix4(
-        select( mat0.getCol0(), mat1.getCol0(), select1 ),
-        select( mat0.getCol1(), mat1.getCol1(), select1 ),
-        select( mat0.getCol2(), mat1.getCol2(), select1 ),
-        select( mat0.getCol3(), mat1.getCol3(), select1 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 )
-    return Matrix4(
-        select( mat0.getCol0(), mat1.getCol0(), select1 ),
-        select( mat0.getCol1(), mat1.getCol1(), select1 ),
-        select( mat0.getCol2(), mat1.getCol2(), select1 ),
-        select( mat0.getCol3(), mat1.getCol3(), select1 )
-    );
-VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat )
-    print( mat.getRow( 0 ) );
-    print( mat.getRow( 1 ) );
-    print( mat.getRow( 2 ) );
-    print( mat.getRow( 3 ) );
-VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat, const char * name )
-    printf("%s:\n", name);
-    print( mat );
-VECTORMATH_FORCE_INLINE Transform3::Transform3( const Transform3 & tfrm )
-    mCol0 = tfrm.mCol0;
-    mCol1 = tfrm.mCol1;
-    mCol2 = tfrm.mCol2;
-    mCol3 = tfrm.mCol3;
-VECTORMATH_FORCE_INLINE Transform3::Transform3( float scalar )
-    mCol0 = Vector3( scalar );
-    mCol1 = Vector3( scalar );
-    mCol2 = Vector3( scalar );
-    mCol3 = Vector3( scalar );
-VECTORMATH_FORCE_INLINE Transform3::Transform3( const floatInVec &scalar )
-    mCol0 = Vector3( scalar );
-    mCol1 = Vector3( scalar );
-    mCol2 = Vector3( scalar );
-    mCol3 = Vector3( scalar );
-VECTORMATH_FORCE_INLINE Transform3::Transform3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2, const Vector3 &_col3 )
-    mCol0 = _col0;
-    mCol1 = _col1;
-    mCol2 = _col2;
-    mCol3 = _col3;
-VECTORMATH_FORCE_INLINE Transform3::Transform3( const Matrix3 & tfrm, const Vector3 &translateVec )
-    this->setUpper3x3( tfrm );
-    this->setTranslation( translateVec );
-VECTORMATH_FORCE_INLINE Transform3::Transform3( const Quat &unitQuat, const Vector3 &translateVec )
-    this->setUpper3x3( Matrix3( unitQuat ) );
-    this->setTranslation( translateVec );
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol0( const Vector3 &_col0 )
-    mCol0 = _col0;
-    return *this;
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol1( const Vector3 &_col1 )
-    mCol1 = _col1;
-    return *this;
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol2( const Vector3 &_col2 )
-    mCol2 = _col2;
-    return *this;
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol3( const Vector3 &_col3 )
-    mCol3 = _col3;
-    return *this;
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol( int col, const Vector3 &vec )
-    *(&mCol0 + col) = vec;
-    return *this;
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setRow( int row, const Vector4 &vec )
-    mCol0.setElem( row, vec.getElem( 0 ) );
-    mCol1.setElem( row, vec.getElem( 1 ) );
-    mCol2.setElem( row, vec.getElem( 2 ) );
-    mCol3.setElem( row, vec.getElem( 3 ) );
-    return *this;
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setElem( int col, int row, float val )
-    (*this)[col].setElem(row, val);
-    return *this;
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setElem( int col, int row, const floatInVec &val )
-    Vector3 tmpV3_0;
-    tmpV3_0 = this->getCol( col );
-    tmpV3_0.setElem( row, val );
-    this->setCol( col, tmpV3_0 );
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Transform3::getElem( int col, int row ) const
-    return this->getCol( col ).getElem( row );
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol0( ) const
-    return mCol0;
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol1( ) const
-    return mCol1;
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol2( ) const
-    return mCol2;
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol3( ) const
-    return mCol3;
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol( int col ) const
-    return *(&mCol0 + col);
-VECTORMATH_FORCE_INLINE const Vector4 Transform3::getRow( int row ) const
-    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
-VECTORMATH_FORCE_INLINE Vector3 & Transform3::operator []( int col )
-    return *(&mCol0 + col);
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::operator []( int col ) const
-    return *(&mCol0 + col);
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::operator =( const Transform3 & tfrm )
-    mCol0 = tfrm.mCol0;
-    mCol1 = tfrm.mCol1;
-    mCol2 = tfrm.mCol2;
-    mCol3 = tfrm.mCol3;
-    return *this;
-VECTORMATH_FORCE_INLINE const Transform3 inverse( const Transform3 & tfrm )
-    __m128 inv0, inv1, inv2, inv3;
-    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet;
-    __m128 xxxx, yyyy, zzzz;
-    tmp2 = _vmathVfCross( tfrm.getCol0().get128(), tfrm.getCol1().get128() );
-    tmp0 = _vmathVfCross( tfrm.getCol1().get128(), tfrm.getCol2().get128() );
-    tmp1 = _vmathVfCross( tfrm.getCol2().get128(), tfrm.getCol0().get128() );
-    inv3 = negatef4( tfrm.getCol3().get128() );
-    dot = _vmathVfDot3( tmp2, tfrm.getCol2().get128() );
-    dot = vec_splat( dot, 0 );
-    invdet = recipf4( dot );
-    tmp3 = vec_mergeh( tmp0, tmp2 );
-    tmp4 = vec_mergel( tmp0, tmp2 );
-    inv0 = vec_mergeh( tmp3, tmp1 );
-    xxxx = vec_splat( inv3, 0 );
-    //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2));
-	inv1 = vec_sel(inv1, tmp1, select_y);
-    //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX );
-	inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0));
-	inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y);
-    yyyy = vec_splat( inv3, 1 );
-    zzzz = vec_splat( inv3, 2 );
-    inv3 = vec_mul( inv0, xxxx );
-    inv3 = vec_madd( inv1, yyyy, inv3 );
-    inv3 = vec_madd( inv2, zzzz, inv3 );
-    inv0 = vec_mul( inv0, invdet );
-    inv1 = vec_mul( inv1, invdet );
-    inv2 = vec_mul( inv2, invdet );
-    inv3 = vec_mul( inv3, invdet );
-    return Transform3(
-        Vector3( inv0 ),
-        Vector3( inv1 ),
-        Vector3( inv2 ),
-        Vector3( inv3 )
-    );
-VECTORMATH_FORCE_INLINE const Transform3 orthoInverse( const Transform3 & tfrm )
-    __m128 inv0, inv1, inv2, inv3;
-    __m128 tmp0, tmp1;
-    __m128 xxxx, yyyy, zzzz;
-    tmp0 = vec_mergeh( tfrm.getCol0().get128(), tfrm.getCol2().get128() );
-    tmp1 = vec_mergel( tfrm.getCol0().get128(), tfrm.getCol2().get128() );
-    inv3 = negatef4( tfrm.getCol3().get128() );
-    inv0 = vec_mergeh( tmp0, tfrm.getCol1().get128() );
-    xxxx = vec_splat( inv3, 0 );
-    //inv1 = vec_perm( tmp0, tfrm.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	inv1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
-	inv1 = vec_sel(inv1, tfrm.getCol1().get128(), select_y);
-    //inv2 = vec_perm( tmp1, tfrm.getCol1().get128(), _VECTORMATH_PERM_XCYX );
-	inv2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
-	inv2 = vec_sel(inv2, vec_splat(tfrm.getCol1().get128(), 2), select_y);
-    yyyy = vec_splat( inv3, 1 );
-    zzzz = vec_splat( inv3, 2 );
-    inv3 = vec_mul( inv0, xxxx );
-    inv3 = vec_madd( inv1, yyyy, inv3 );
-    inv3 = vec_madd( inv2, zzzz, inv3 );
-    return Transform3(
-        Vector3( inv0 ),
-        Vector3( inv1 ),
-        Vector3( inv2 ),
-        Vector3( inv3 )
-    );
-VECTORMATH_FORCE_INLINE const Transform3 absPerElem( const Transform3 & tfrm )
-    return Transform3(
-        absPerElem( tfrm.getCol0() ),
-        absPerElem( tfrm.getCol1() ),
-        absPerElem( tfrm.getCol2() ),
-        absPerElem( tfrm.getCol3() )
-    );
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::operator *( const Vector3 &vec ) const
-    __m128 res;
-    __m128 xxxx, yyyy, zzzz;
-    xxxx = vec_splat( vec.get128(), 0 );
-    yyyy = vec_splat( vec.get128(), 1 );
-    zzzz = vec_splat( vec.get128(), 2 );
-    res = vec_mul( mCol0.get128(), xxxx );
-    res = vec_madd( mCol1.get128(), yyyy, res );
-    res = vec_madd( mCol2.get128(), zzzz, res );
-    return Vector3( res );
-VECTORMATH_FORCE_INLINE const Point3 Transform3::operator *( const Point3 &pnt ) const
-    __m128 tmp0, tmp1, res;
-    __m128 xxxx, yyyy, zzzz;
-    xxxx = vec_splat( pnt.get128(), 0 );
-    yyyy = vec_splat( pnt.get128(), 1 );
-    zzzz = vec_splat( pnt.get128(), 2 );
-    tmp0 = vec_mul( mCol0.get128(), xxxx );
-    tmp1 = vec_mul( mCol1.get128(), yyyy );
-    tmp0 = vec_madd( mCol2.get128(), zzzz, tmp0 );
-    tmp1 = vec_add( mCol3.get128(), tmp1 );
-    res = vec_add( tmp0, tmp1 );
-    return Point3( res );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
-    return Transform3(
-        ( *this * tfrm.mCol0 ),
-        ( *this * tfrm.mCol1 ),
-        ( *this * tfrm.mCol2 ),
-        Vector3( ( *this * Point3( tfrm.mCol3 ) ) )
-    );
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::operator *=( const Transform3 & tfrm )
-    *this = *this * tfrm;
-    return *this;
-VECTORMATH_FORCE_INLINE const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
-    return Transform3(
-        mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ),
-        mulPerElem( tfrm0.getCol1(), tfrm1.getCol1() ),
-        mulPerElem( tfrm0.getCol2(), tfrm1.getCol2() ),
-        mulPerElem( tfrm0.getCol3(), tfrm1.getCol3() )
-    );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::identity( )
-    return Transform3(
-        Vector3::xAxis( ),
-        Vector3::yAxis( ),
-        Vector3::zAxis( ),
-        Vector3( 0.0f )
-    );
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
-    mCol0 = tfrm.getCol0();
-    mCol1 = tfrm.getCol1();
-    mCol2 = tfrm.getCol2();
-    return *this;
-VECTORMATH_FORCE_INLINE const Matrix3 Transform3::getUpper3x3( ) const
-    return Matrix3( mCol0, mCol1, mCol2 );
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setTranslation( const Vector3 &translateVec )
-    mCol3 = translateVec;
-    return *this;
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getTranslation( ) const
-    return mCol3;
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationX( float radians )
-    return rotationX( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationX( const floatInVec &radians )
-    __m128 s, c, res1, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res1 = vec_sel( zero, c, select_y );
-    res1 = vec_sel( res1, s, select_z );
-    res2 = vec_sel( zero, negatef4(s), select_y );
-    res2 = vec_sel( res2, c, select_z );
-    return Transform3(
-        Vector3::xAxis( ),
-        Vector3( res1 ),
-        Vector3( res2 ),
-        Vector3( _mm_setzero_ps() )
-    );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationY( float radians )
-    return rotationY( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationY( const floatInVec &radians )
-    __m128 s, c, res0, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, negatef4(s), select_z );
-    res2 = vec_sel( zero, s, select_x );
-    res2 = vec_sel( res2, c, select_z );
-    return Transform3(
-        Vector3( res0 ),
-        Vector3::yAxis( ),
-        Vector3( res2 ),
-        Vector3( 0.0f )
-    );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZ( float radians )
-    return rotationZ( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZ( const floatInVec &radians )
-    __m128 s, c, res0, res1;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-    __m128 zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, s, select_y );
-    res1 = vec_sel( zero, negatef4(s), select_x );
-    res1 = vec_sel( res1, c, select_y );
-    return Transform3(
-        Vector3( res0 ),
-        Vector3( res1 ),
-        Vector3::zAxis( ),
-        Vector3( 0.0f )
-    );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZYX( const Vector3 &radiansXYZ )
-    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
-    angles = Vector4( radiansXYZ, 0.0f ).get128();
-    sincosf4( angles, &s, &c );
-    negS = negatef4( s );
-    Z0 = vec_mergel( c, s );
-    Z1 = vec_mergel( negS, c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
-    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
-	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
-	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
-    X0 = vec_splat( s, 0 );
-    X1 = vec_splat( c, 0 );
-    tmp = vec_mul( Z0, Y1 );
-    return Transform3(
-        Vector3( vec_mul( Z0, Y0 ) ),
-        Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
-        Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ),
-        Vector3( 0.0f )
-    );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( float radians, const Vector3 &unitVec )
-    return rotation( floatInVec(radians), unitVec );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( const floatInVec &radians, const Vector3 &unitVec )
-    return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( const Quat &unitQuat )
-    return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::scale( const Vector3 &scaleVec )
-    __m128 zero = _mm_setzero_ps();
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    return Transform3(
-        Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ),
-        Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ),
-        Vector3( vec_sel( zero, scaleVec.get128(), select_z ) ),
-        Vector3( 0.0f )
-    );
-VECTORMATH_FORCE_INLINE const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec )
-    return Transform3(
-        ( tfrm.getCol0() * scaleVec.getX( ) ),
-        ( tfrm.getCol1() * scaleVec.getY( ) ),
-        ( tfrm.getCol2() * scaleVec.getZ( ) ),
-        tfrm.getCol3()
-    );
-VECTORMATH_FORCE_INLINE const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm )
-    return Transform3(
-        mulPerElem( tfrm.getCol0(), scaleVec ),
-        mulPerElem( tfrm.getCol1(), scaleVec ),
-        mulPerElem( tfrm.getCol2(), scaleVec ),
-        mulPerElem( tfrm.getCol3(), scaleVec )
-    );
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::translation( const Vector3 &translateVec )
-    return Transform3(
-        Vector3::xAxis( ),
-        Vector3::yAxis( ),
-        Vector3::zAxis( ),
-        translateVec
-    );
-VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
-    return Transform3(
-        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
-        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
-        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
-        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
-    );
-VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 )
-    return Transform3(
-        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
-        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
-        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
-        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
-    );
-VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm )
-    print( tfrm.getRow( 0 ) );
-    print( tfrm.getRow( 1 ) );
-    print( tfrm.getRow( 2 ) );
-VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm, const char * name )
-    printf("%s:\n", name);
-    print( tfrm );
-VECTORMATH_FORCE_INLINE Quat::Quat( const Matrix3 & tfrm )
-    __m128 res;
-    __m128 col0, col1, col2;
-    __m128 xx_yy, xx_yy_zz_xx, yy_zz_xx_yy, zz_xx_yy_zz, diagSum, diagDiff;
-    __m128 zy_xz_yx, yz_zx_xy, sum, diff;
-    __m128 radicand, invSqrt, scale;
-    __m128 res0, res1, res2, res3;
-    __m128 xx, yy, zz;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
-    col0 = tfrm.getCol0().get128();
-    col1 = tfrm.getCol1().get128();
-    col2 = tfrm.getCol2().get128();
-    /* four cases: */
-    /* trace > 0 */
-    /* else */
-    /*    xx largest diagonal element */
-    /*    yy largest diagonal element */
-    /*    zz largest diagonal element */
-    /* compute quaternion for each case */
-    xx_yy = vec_sel( col0, col1, select_y );
-    //xx_yy_zz_xx = vec_perm( xx_yy, col2, _VECTORMATH_PERM_XYCX );
-    //yy_zz_xx_yy = vec_perm( xx_yy, col2, _VECTORMATH_PERM_YCXY );
-    //zz_xx_yy_zz = vec_perm( xx_yy, col2, _VECTORMATH_PERM_CXYC );
-    xx_yy_zz_xx = _mm_shuffle_ps( xx_yy, xx_yy, _MM_SHUFFLE(0,0,1,0) );
-    xx_yy_zz_xx = vec_sel( xx_yy_zz_xx, col2, select_z ); // TODO: Ck
-    yy_zz_xx_yy = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(1,0,2,1) );
-    zz_xx_yy_zz = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(2,1,0,2) );
-    diagSum = vec_add( vec_add( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
-    diagDiff = vec_sub( vec_sub( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
-    radicand = vec_add( vec_sel( diagDiff, diagSum, select_w ), _mm_set1_ps(1.0f) );
- //   invSqrt = rsqrtf4( radicand );
-	invSqrt = newtonrapson_rsqrt4( radicand );
-    zy_xz_yx = vec_sel( col0, col1, select_z );									// zy_xz_yx = 00 01 12 03
-    //zy_xz_yx = vec_perm( zy_xz_yx, col2, _VECTORMATH_PERM_ZAYX );
-	zy_xz_yx = _mm_shuffle_ps( zy_xz_yx, zy_xz_yx, _MM_SHUFFLE(0,1,2,2) );		// zy_xz_yx = 12 12 01 00
-    zy_xz_yx = vec_sel( zy_xz_yx, vec_splat(col2, 0), select_y );				// zy_xz_yx = 12 20 01 00
-    yz_zx_xy = vec_sel( col0, col1, select_x );									// yz_zx_xy = 10 01 02 03
-    //yz_zx_xy = vec_perm( yz_zx_xy, col2, _VECTORMATH_PERM_BZXX );
-	yz_zx_xy = _mm_shuffle_ps( yz_zx_xy, yz_zx_xy, _MM_SHUFFLE(0,0,2,0) );		// yz_zx_xy = 10 02 10 10
-	yz_zx_xy = vec_sel( yz_zx_xy, vec_splat(col2, 1), select_x );				// yz_zx_xy = 21 02 10 10
-    sum = vec_add( zy_xz_yx, yz_zx_xy );
-    diff = vec_sub( zy_xz_yx, yz_zx_xy );
-    scale = vec_mul( invSqrt, _mm_set1_ps(0.5f) );
-    //res0 = vec_perm( sum, diff, _VECTORMATH_PERM_XZYA );
-	res0 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,1,2,0) );
-	res0 = vec_sel( res0, vec_splat(diff, 0), select_w );  // TODO: Ck
-    //res1 = vec_perm( sum, diff, _VECTORMATH_PERM_ZXXB );
-	res1 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,2) );
-	res1 = vec_sel( res1, vec_splat(diff, 1), select_w );  // TODO: Ck
-    //res2 = vec_perm( sum, diff, _VECTORMATH_PERM_YXXC );
-	res2 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,1) );
-	res2 = vec_sel( res2, vec_splat(diff, 2), select_w );  // TODO: Ck
-    res3 = diff;
-    res0 = vec_sel( res0, radicand, select_x );
-    res1 = vec_sel( res1, radicand, select_y );
-    res2 = vec_sel( res2, radicand, select_z );
-    res3 = vec_sel( res3, radicand, select_w );
-    res0 = vec_mul( res0, vec_splat( scale, 0 ) );
-    res1 = vec_mul( res1, vec_splat( scale, 1 ) );
-    res2 = vec_mul( res2, vec_splat( scale, 2 ) );
-    res3 = vec_mul( res3, vec_splat( scale, 3 ) );
-    /* determine case and select answer */
-    xx = vec_splat( col0, 0 );
-    yy = vec_splat( col1, 1 );
-    zz = vec_splat( col2, 2 );
-    res = vec_sel( res0, res1, vec_cmpgt( yy, xx ) );
-    res = vec_sel( res, res2, vec_and( vec_cmpgt( zz, xx ), vec_cmpgt( zz, yy ) ) );
-    res = vec_sel( res, res3, vec_cmpgt( vec_splat( diagSum, 0 ), _mm_setzero_ps() ) );
-    mVec128 = res;
-VECTORMATH_FORCE_INLINE const Matrix3 outer( const Vector3 &tfrm0, const Vector3 &tfrm1 )
-    return Matrix3(
-        ( tfrm0 * tfrm1.getX( ) ),
-        ( tfrm0 * tfrm1.getY( ) ),
-        ( tfrm0 * tfrm1.getZ( ) )
-    );
-VECTORMATH_FORCE_INLINE const Matrix4 outer( const Vector4 &tfrm0, const Vector4 &tfrm1 )
-    return Matrix4(
-        ( tfrm0 * tfrm1.getX( ) ),
-        ( tfrm0 * tfrm1.getY( ) ),
-        ( tfrm0 * tfrm1.getZ( ) ),
-        ( tfrm0 * tfrm1.getW( ) )
-    );
-VECTORMATH_FORCE_INLINE const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat )
-    __m128 tmp0, tmp1, mcol0, mcol1, mcol2, res;
-    __m128 xxxx, yyyy, zzzz;
-    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
-    tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
-    xxxx = vec_splat( vec.get128(), 0 );
-    mcol0 = vec_mergeh( tmp0, mat.getCol1().get128() );
-    //mcol1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	mcol1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
-	mcol1 = vec_sel(mcol1, mat.getCol1().get128(), select_y);
-    //mcol2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX );
-	mcol2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
-	mcol2 = vec_sel(mcol2, vec_splat(mat.getCol1().get128(), 2), select_y);
-    yyyy = vec_splat( vec.get128(), 1 );
-    res = vec_mul( mcol0, xxxx );
-    zzzz = vec_splat( vec.get128(), 2 );
-    res = vec_madd( mcol1, yyyy, res );
-    res = vec_madd( mcol2, zzzz, res );
-    return Vector3( res );
-VECTORMATH_FORCE_INLINE const Matrix3 crossMatrix( const Vector3 &vec )
-    __m128 neg, res0, res1, res2;
-    neg = negatef4( vec.get128() );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    //res0 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_XZBX );
-	res0 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,2,2,0) );
-	res0 = vec_sel(res0, vec_splat(neg, 1), select_z);
-    //res1 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_CXXX );
-	res1 = vec_sel(vec_splat(vec.get128(), 0), vec_splat(neg, 2), select_x);
-    //res2 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_YAXX );
-	res2 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,1,1) );
-	res2 = vec_sel(res2, vec_splat(neg, 0), select_y);
-	VM_ATTRIBUTE_ALIGN16 unsigned int filter_x[4] = {0, 0xffffffff, 0xffffffff, 0xffffffff};
-	VM_ATTRIBUTE_ALIGN16 unsigned int filter_y[4] = {0xffffffff, 0, 0xffffffff, 0xffffffff};
-	VM_ATTRIBUTE_ALIGN16 unsigned int filter_z[4] = {0xffffffff, 0xffffffff, 0, 0xffffffff};
-    res0 = vec_and( res0, _mm_load_ps((float *)filter_x ) );
-    res1 = vec_and( res1, _mm_load_ps((float *)filter_y ) );
-    res2 = vec_and( res2, _mm_load_ps((float *)filter_z ) ); // TODO: Use selects?
-    return Matrix3(
-        Vector3( res0 ),
-        Vector3( res1 ),
-        Vector3( res2 )
-    );
-VECTORMATH_FORCE_INLINE const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat )
-    return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) );
-} // namespace Aos
-} // namespace Vectormath
diff --git a/src/bullet/vectormath/sse/quat_aos.h b/src/bullet/vectormath/sse/quat_aos.h
deleted file mode 100644
index 7eac59fe..00000000
--- a/src/bullet/vectormath/sse/quat_aos.h
+++ /dev/null
@@ -1,579 +0,0 @@
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-// Definitions
-namespace Vectormath {
-namespace Aos {
-VECTORMATH_FORCE_INLINE void Quat::set128(vec_float4 vec)
-    mVec128 = vec;
-VECTORMATH_FORCE_INLINE Quat::Quat( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
-	mVec128 = _mm_unpacklo_ps(
-		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
-		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
-VECTORMATH_FORCE_INLINE Quat::Quat( const Vector3 &xyz, float _w )
-    mVec128 = xyz.get128();
-    _vmathVfSetElement(mVec128, _w, 3);
-VECTORMATH_FORCE_INLINE  Quat::Quat(const Quat& quat)
-	mVec128 = quat.get128();
-VECTORMATH_FORCE_INLINE Quat::Quat( float _x, float _y, float _z, float _w )
-	mVec128 = _mm_setr_ps(_x, _y, _z, _w);
-VECTORMATH_FORCE_INLINE Quat::Quat( const Vector3 &xyz, const floatInVec &_w )
-    mVec128 = xyz.get128();
-    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
-VECTORMATH_FORCE_INLINE Quat::Quat( const Vector4 &vec )
-    mVec128 = vec.get128();
-VECTORMATH_FORCE_INLINE Quat::Quat( float scalar )
-    mVec128 = floatInVec(scalar).get128();
-VECTORMATH_FORCE_INLINE Quat::Quat( const floatInVec &scalar )
-    mVec128 = scalar.get128();
-VECTORMATH_FORCE_INLINE Quat::Quat( __m128 vf4 )
-    mVec128 = vf4;
-VECTORMATH_FORCE_INLINE const Quat Quat::identity( )
-    return Quat( _VECTORMATH_UNIT_0001 );
-VECTORMATH_FORCE_INLINE const Quat lerp( float t, const Quat &quat0, const Quat &quat1 )
-    return lerp( floatInVec(t), quat0, quat1 );
-VECTORMATH_FORCE_INLINE const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 )
-    return ( quat0 + ( ( quat1 - quat0 ) * t ) );
-VECTORMATH_FORCE_INLINE const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 )
-    return slerp( floatInVec(t), unitQuat0, unitQuat1 );
-VECTORMATH_FORCE_INLINE const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 )
-    Quat start;
-    vec_float4 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
-    __m128 selectMask;
-    cosAngle = _vmathVfDot4( unitQuat0.get128(), unitQuat1.get128() );
-    selectMask = (__m128)vec_cmpgt( _mm_setzero_ps(), cosAngle );
-    cosAngle = vec_sel( cosAngle, negatef4( cosAngle ), selectMask );
-    start = Quat( vec_sel( unitQuat0.get128(), negatef4( unitQuat0.get128() ), selectMask ) );
-    selectMask = (__m128)vec_cmpgt( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
-    angle = acosf4( cosAngle );
-    tttt = t.get128();
-    oneMinusT = vec_sub( _mm_set1_ps(1.0f), tttt );
-    angles = vec_mergeh( _mm_set1_ps(1.0f), tttt );
-    angles = vec_mergeh( angles, oneMinusT );
-    angles = vec_madd( angles, angle, _mm_setzero_ps() );
-    sines = sinf4( angles );
-    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
-    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
-    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
-    return Quat( vec_madd( start.get128(), scale0, vec_mul( unitQuat1.get128(), scale1 ) ) );
-VECTORMATH_FORCE_INLINE const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
-    return squad( floatInVec(t), unitQuat0, unitQuat1, unitQuat2, unitQuat3 );
-VECTORMATH_FORCE_INLINE const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
-    return slerp( ( ( floatInVec(2.0f) * t ) * ( floatInVec(1.0f) - t ) ), slerp( t, unitQuat0, unitQuat3 ), slerp( t, unitQuat1, unitQuat2 ) );
-VECTORMATH_FORCE_INLINE __m128 Quat::get128( ) const
-    return mVec128;
-VECTORMATH_FORCE_INLINE Quat & Quat::operator =( const Quat &quat )
-    mVec128 = quat.mVec128;
-    return *this;
-VECTORMATH_FORCE_INLINE Quat & Quat::setXYZ( const Vector3 &vec )
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
-	mVec128 = vec_sel( vec.get128(), mVec128, sw );
-    return *this;
-VECTORMATH_FORCE_INLINE const Vector3 Quat::getXYZ( ) const
-    return Vector3( mVec128 );
-VECTORMATH_FORCE_INLINE Quat & Quat::setX( float _x )
-    _vmathVfSetElement(mVec128, _x, 0);
-    return *this;
-VECTORMATH_FORCE_INLINE Quat & Quat::setX( const floatInVec &_x )
-    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Quat::getX( ) const
-    return floatInVec( mVec128, 0 );
-VECTORMATH_FORCE_INLINE Quat & Quat::setY( float _y )
-    _vmathVfSetElement(mVec128, _y, 1);
-    return *this;
-VECTORMATH_FORCE_INLINE Quat & Quat::setY( const floatInVec &_y )
-    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Quat::getY( ) const
-    return floatInVec( mVec128, 1 );
-VECTORMATH_FORCE_INLINE Quat & Quat::setZ( float _z )
-    _vmathVfSetElement(mVec128, _z, 2);
-    return *this;
-VECTORMATH_FORCE_INLINE Quat & Quat::setZ( const floatInVec &_z )
-    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Quat::getZ( ) const
-    return floatInVec( mVec128, 2 );
-VECTORMATH_FORCE_INLINE Quat & Quat::setW( float _w )
-    _vmathVfSetElement(mVec128, _w, 3);
-    return *this;
-VECTORMATH_FORCE_INLINE Quat & Quat::setW( const floatInVec &_w )
-    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Quat::getW( ) const
-    return floatInVec( mVec128, 3 );
-VECTORMATH_FORCE_INLINE Quat & Quat::setElem( int idx, float value )
-    _vmathVfSetElement(mVec128, value, idx);
-    return *this;
-VECTORMATH_FORCE_INLINE Quat & Quat::setElem( int idx, const floatInVec &value )
-    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Quat::getElem( int idx ) const
-    return floatInVec( mVec128, idx );
-VECTORMATH_FORCE_INLINE VecIdx Quat::operator []( int idx )
-    return VecIdx( mVec128, idx );
-VECTORMATH_FORCE_INLINE const floatInVec Quat::operator []( int idx ) const
-    return floatInVec( mVec128, idx );
-VECTORMATH_FORCE_INLINE const Quat Quat::operator +( const Quat &quat ) const
-    return Quat( _mm_add_ps( mVec128, quat.mVec128 ) );
-VECTORMATH_FORCE_INLINE const Quat Quat::operator -( const Quat &quat ) const
-    return Quat( _mm_sub_ps( mVec128, quat.mVec128 ) );
-VECTORMATH_FORCE_INLINE const Quat Quat::operator *( float scalar ) const
-    return *this * floatInVec(scalar);
-VECTORMATH_FORCE_INLINE const Quat Quat::operator *( const floatInVec &scalar ) const
-    return Quat( _mm_mul_ps( mVec128, scalar.get128() ) );
-VECTORMATH_FORCE_INLINE Quat & Quat::operator +=( const Quat &quat )
-    *this = *this + quat;
-    return *this;
-VECTORMATH_FORCE_INLINE Quat & Quat::operator -=( const Quat &quat )
-    *this = *this - quat;
-    return *this;
-VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( float scalar )
-    *this = *this * scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( const floatInVec &scalar )
-    *this = *this * scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE const Quat Quat::operator /( float scalar ) const
-    return *this / floatInVec(scalar);
-VECTORMATH_FORCE_INLINE const Quat Quat::operator /( const floatInVec &scalar ) const
-    return Quat( _mm_div_ps( mVec128, scalar.get128() ) );
-VECTORMATH_FORCE_INLINE Quat & Quat::operator /=( float scalar )
-    *this = *this / scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE Quat & Quat::operator /=( const floatInVec &scalar )
-    *this = *this / scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE const Quat Quat::operator -( ) const
-	return Quat(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
-VECTORMATH_FORCE_INLINE const Quat operator *( float scalar, const Quat &quat )
-    return floatInVec(scalar) * quat;
-VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar, const Quat &quat )
-    return quat * scalar;
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Quat &quat0, const Quat &quat1 )
-    return floatInVec( _vmathVfDot4( quat0.get128(), quat1.get128() ), 0 );
-VECTORMATH_FORCE_INLINE const floatInVec norm( const Quat &quat )
-    return floatInVec(  _vmathVfDot4( quat.get128(), quat.get128() ), 0 );
-VECTORMATH_FORCE_INLINE const floatInVec length( const Quat &quat )
-    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( quat.get128(), quat.get128() )), 0 );
-VECTORMATH_FORCE_INLINE const Quat normalize( const Quat &quat )
-	vec_float4 dot =_vmathVfDot4( quat.get128(), quat.get128());
-    return Quat( _mm_mul_ps( quat.get128(), newtonrapson_rsqrt4( dot ) ) );
-VECTORMATH_FORCE_INLINE const Quat Quat::rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 )
-    Vector3 crossVec;
-    __m128 cosAngle, cosAngleX2Plus2, recipCosHalfAngleX2, cosHalfAngleX2, res;
-    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
-    cosAngleX2Plus2 = vec_madd( cosAngle, _mm_set1_ps(2.0f), _mm_set1_ps(2.0f) );
-    recipCosHalfAngleX2 = _mm_rsqrt_ps( cosAngleX2Plus2 );
-    cosHalfAngleX2 = vec_mul( recipCosHalfAngleX2, cosAngleX2Plus2 );
-    crossVec = cross( unitVec0, unitVec1 );
-    res = vec_mul( crossVec.get128(), recipCosHalfAngleX2 );
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
-    res = vec_sel( res, vec_mul( cosHalfAngleX2, _mm_set1_ps(0.5f) ), sw );
-    return Quat( res );
-VECTORMATH_FORCE_INLINE const Quat Quat::rotation( float radians, const Vector3 &unitVec )
-    return rotation( floatInVec(radians), unitVec );
-VECTORMATH_FORCE_INLINE const Quat Quat::rotation( const floatInVec &radians, const Vector3 &unitVec )
-    __m128 s, c, angle, res;
-    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
-    sincosf4( angle, &s, &c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
-    res = vec_sel( vec_mul( unitVec.get128(), s ), c, sw );
-    return Quat( res );
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationX( float radians )
-    return rotationX( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationX( const floatInVec &radians )
-    __m128 s, c, angle, res;
-    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
-    sincosf4( angle, &s, &c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
-    res = vec_sel( _mm_setzero_ps(), s, xsw );
-    res = vec_sel( res, c, wsw );
-    return Quat( res );
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationY( float radians )
-    return rotationY( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationY( const floatInVec &radians )
-    __m128 s, c, angle, res;
-    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
-    sincosf4( angle, &s, &c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int ysw[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
-    res = vec_sel( _mm_setzero_ps(), s, ysw );
-    res = vec_sel( res, c, wsw );
-    return Quat( res );
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationZ( float radians )
-    return rotationZ( floatInVec(radians) );
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationZ( const floatInVec &radians )
-    __m128 s, c, angle, res;
-    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
-    sincosf4( angle, &s, &c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0, 0, 0xffffffff, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
-    res = vec_sel( _mm_setzero_ps(), s, zsw );
-    res = vec_sel( res, c, wsw );
-    return Quat( res );
-VECTORMATH_FORCE_INLINE const Quat Quat::operator *( const Quat &quat ) const
-    __m128 ldata, rdata, qv, tmp0, tmp1, tmp2, tmp3;
-    __m128 product, l_wxyz, r_wxyz, xy, qw;
-    ldata = mVec128;
-    rdata = quat.mVec128;
-    tmp0 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,0,2,1) );
-    tmp1 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,1,0,2) );
-    tmp2 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,1,0,2) );
-    tmp3 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,0,2,1) );
-    qv = vec_mul( vec_splat( ldata, 3 ), rdata );
-    qv = vec_madd( vec_splat( rdata, 3 ), ldata, qv );
-    qv = vec_madd( tmp0, tmp1, qv );
-    qv = vec_nmsub( tmp2, tmp3, qv );
-    product = vec_mul( ldata, rdata );
-    l_wxyz = vec_sld( ldata, ldata, 12 );
-    r_wxyz = vec_sld( rdata, rdata, 12 );
-    qw = vec_nmsub( l_wxyz, r_wxyz, product );
-    xy = vec_madd( l_wxyz, r_wxyz, product );
-    qw = vec_sub( qw, vec_sld( xy, xy, 8 ) );
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
-    return Quat( vec_sel( qv, qw, sw ) );
-VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( const Quat &quat )
-    *this = *this * quat;
-    return *this;
-VECTORMATH_FORCE_INLINE const Vector3 rotate( const Quat &quat, const Vector3 &vec )
-{    __m128 qdata, vdata, product, tmp0, tmp1, tmp2, tmp3, wwww, qv, qw, res;
-    qdata = quat.get128();
-    vdata = vec.get128();
-    tmp0 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,0,2,1) );
-    tmp1 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,1,0,2) );
-    tmp2 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,1,0,2) );
-    tmp3 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,0,2,1) );
-    wwww = vec_splat( qdata, 3 );
-    qv = vec_mul( wwww, vdata );
-    qv = vec_madd( tmp0, tmp1, qv );
-    qv = vec_nmsub( tmp2, tmp3, qv );
-    product = vec_mul( qdata, vdata );
-    qw = vec_madd( vec_sld( qdata, qdata, 4 ), vec_sld( vdata, vdata, 4 ), product );
-    qw = vec_add( vec_sld( product, product, 8 ), qw );
-    tmp1 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,1,0,2) );
-    tmp3 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,0,2,1) );
-    res = vec_mul( vec_splat( qw, 0 ), qdata );
-    res = vec_madd( wwww, qv, res );
-    res = vec_madd( tmp0, tmp1, res );
-    res = vec_nmsub( tmp2, tmp3, res );
-    return Vector3( res );
-VECTORMATH_FORCE_INLINE const Quat conj( const Quat &quat )
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0x80000000,0x80000000,0x80000000,0};
-    return Quat( vec_xor( quat.get128(), _mm_load_ps((float *)sw) ) );
-VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, bool select1 )
-    return select( quat0, quat1, boolInVec(select1) );
-//VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 )
-//    return Quat( vec_sel( quat0.get128(), quat1.get128(), select1.get128() ) );
-VECTORMATH_FORCE_INLINE void loadXYZW(Quat& quat, const float* fptr)
-#ifdef USE_SSE3_LDDQU
-	quat = Quat(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128		);
-	SSEFloat fl;
-	fl.f[0] = fptr[0];
-	fl.f[1] = fptr[1];
-	fl.f[2] = fptr[2];
-	fl.f[3] = fptr[3];
-    quat = Quat(	fl.m128);
-VECTORMATH_FORCE_INLINE void storeXYZW(const Quat& quat, float* fptr)
-	fptr[0] = quat.getX();
-	fptr[1] = quat.getY();
-	fptr[2] = quat.getZ();
-	fptr[3] = quat.getW();
-//    _mm_storeu_ps((float*)quat.get128(),fptr);
-VECTORMATH_FORCE_INLINE void print( const Quat &quat )
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = quat.get128();
-    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
-VECTORMATH_FORCE_INLINE void print( const Quat &quat, const char * name )
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = quat.get128();
-    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
-} // namespace Aos
-} // namespace Vectormath
diff --git a/src/bullet/vectormath/sse/vec_aos.h b/src/bullet/vectormath/sse/vec_aos.h
deleted file mode 100644
index 35aeeaf1..00000000
--- a/src/bullet/vectormath/sse/vec_aos.h
+++ /dev/null
@@ -1,1455 +0,0 @@
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-// Constants
-// for permutes words are labeled [x,y,z,w] [a,b,c,d]
-#define _VECTORMATH_PERM_X 0x00010203
-#define _VECTORMATH_PERM_Y 0x04050607
-#define _VECTORMATH_PERM_Z 0x08090a0b
-#define _VECTORMATH_PERM_W 0x0c0d0e0f
-#define _VECTORMATH_PERM_A 0x10111213
-#define _VECTORMATH_PERM_B 0x14151617
-#define _VECTORMATH_PERM_C 0x18191a1b
-#define _VECTORMATH_PERM_D 0x1c1d1e1f
-#define _VECTORMATH_MASK_0xF000 (vec_uint4){ 0xffffffff, 0, 0, 0 }
-#define _VECTORMATH_MASK_0x0F00 (vec_uint4){ 0, 0xffffffff, 0, 0 }
-#define _VECTORMATH_MASK_0x00F0 (vec_uint4){ 0, 0, 0xffffffff, 0 }
-#define _VECTORMATH_MASK_0x000F (vec_uint4){ 0, 0, 0, 0xffffffff }
-#define _VECTORMATH_UNIT_1000 _mm_setr_ps(1.0f,0.0f,0.0f,0.0f) // (__m128){ 1.0f, 0.0f, 0.0f, 0.0f }
-#define _VECTORMATH_UNIT_0100 _mm_setr_ps(0.0f,1.0f,0.0f,0.0f) // (__m128){ 0.0f, 1.0f, 0.0f, 0.0f }
-#define _VECTORMATH_UNIT_0010 _mm_setr_ps(0.0f,0.0f,1.0f,0.0f) // (__m128){ 0.0f, 0.0f, 1.0f, 0.0f }
-#define _VECTORMATH_UNIT_0001 _mm_setr_ps(0.0f,0.0f,0.0f,1.0f) // (__m128){ 0.0f, 0.0f, 0.0f, 1.0f }
-#define _VECTORMATH_SLERP_TOL 0.999f
-// Definitions
-#define     _vmath_shufps(a, b, immx, immy, immz, immw) _mm_shuffle_ps(a, b, _MM_SHUFFLE(immw, immz, immy, immx))
-static VECTORMATH_FORCE_INLINE __m128 _vmathVfDot3( __m128 vec0, __m128 vec1 )
-	__m128 result = _mm_mul_ps( vec0, vec1);
-    return _mm_add_ps( vec_splat( result, 0 ), _mm_add_ps( vec_splat( result, 1 ), vec_splat( result, 2 ) ) );
-static VECTORMATH_FORCE_INLINE __m128 _vmathVfDot4( __m128 vec0, __m128 vec1 )
-    __m128 result = _mm_mul_ps(vec0, vec1);
-	return _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(0,0,0,0)),
-			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(1,1,1,1)),
-			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(2,2,2,2)), _mm_shuffle_ps(result, result, _MM_SHUFFLE(3,3,3,3)))));
-static VECTORMATH_FORCE_INLINE __m128 _vmathVfCross( __m128 vec0, __m128 vec1 )
-    __m128 tmp0, tmp1, tmp2, tmp3, result;
-    tmp0 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,0,2,1) );
-    tmp1 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,1,0,2) );
-    tmp2 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,1,0,2) );
-    tmp3 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,0,2,1) );
-    result = vec_mul( tmp0, tmp1 );
-    result = vec_nmsub( tmp2, tmp3, result );
-    return result;
-static VECTORMATH_FORCE_INLINE vec_uint4 _vmathVfToHalfFloatsUnpacked(__m128 v)
-#if 0
-    vec_int4 bexp;
-    vec_uint4 mant, sign, hfloat;
-    vec_uint4 notZero, isInf;
-    const vec_uint4 hfloatInf = (vec_uint4)(0x00007c00u);
-    const vec_uint4 mergeMant = (vec_uint4)(0x000003ffu);
-    const vec_uint4 mergeSign = (vec_uint4)(0x00008000u);
-    sign = vec_sr((vec_uint4)v, (vec_uint4)16);
-    mant = vec_sr((vec_uint4)v, (vec_uint4)13);
-    bexp = vec_and(vec_sr((vec_int4)v, (vec_uint4)23), (vec_int4)0xff);
-    notZero = (vec_uint4)vec_cmpgt(bexp, (vec_int4)112);
-    isInf = (vec_uint4)vec_cmpgt(bexp, (vec_int4)142);
-    bexp = _mm_add_ps(bexp, (vec_int4)-112);
-    bexp = vec_sl(bexp, (vec_uint4)10);
-    hfloat = vec_sel((vec_uint4)bexp, mant, mergeMant);
-    hfloat = vec_sel((vec_uint4)(0), hfloat, notZero);
-    hfloat = vec_sel(hfloat, hfloatInf, isInf);
-    hfloat = vec_sel(hfloat, sign, mergeSign);
-    return hfloat;
-	assert(0);
-	return _mm_setzero_ps();
-static VECTORMATH_FORCE_INLINE vec_ushort8 _vmath2VfToHalfFloats(__m128 u, __m128 v)
-#if 0
-    vec_uint4 hfloat_u, hfloat_v;
-    const vec_uchar16 pack = (vec_uchar16){2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
-    hfloat_u = _vmathVfToHalfFloatsUnpacked(u);
-    hfloat_v = _vmathVfToHalfFloatsUnpacked(v);
-    return (vec_ushort8)vec_perm(hfloat_u, hfloat_v, pack);
-	assert(0);
-	return _mm_setzero_si128();
-static VECTORMATH_FORCE_INLINE __m128 _vmathVfInsert(__m128 dst, __m128 src, int slot)
-	SSEFloat s;
-	s.m128 = src;
-	SSEFloat d;
-	d.m128 = dst;
-	d.f[slot] = s.f[slot];
-	return d.m128;
-#define _vmathVfSetElement(vec, scalar, slot) ((float *)&(vec))[slot] = scalar
-static VECTORMATH_FORCE_INLINE __m128 _vmathVfSplatScalar(float scalar)
-	return _mm_set1_ps(scalar);
-namespace Vectormath {
-namespace Aos {
-VECTORMATH_FORCE_INLINE VecIdx::operator floatInVec() const
-    return floatInVec(ref, i);
-VECTORMATH_FORCE_INLINE float VecIdx::getAsFloat() const
-VECTORMATH_FORCE_INLINE VecIdx::operator float() const
-    return ((float *)&ref)[i];
-VECTORMATH_FORCE_INLINE float VecIdx::operator =( float scalar )
-    _vmathVfSetElement(ref, scalar, i);
-    return scalar;
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator =( const floatInVec &scalar )
-    ref = _vmathVfInsert(ref, scalar.get128(), i);
-    return scalar;
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator =( const VecIdx& scalar )
-    return *this = floatInVec(scalar.ref, scalar.i);
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator *=( float scalar )
-    return *this *= floatInVec(scalar);
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator *=( const floatInVec &scalar )
-    return *this = floatInVec(ref, i) * scalar;
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator /=( float scalar )
-    return *this /= floatInVec(scalar);
-inline floatInVec VecIdx::operator /=( const floatInVec &scalar )
-    return *this = floatInVec(ref, i) / scalar;
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator +=( float scalar )
-    return *this += floatInVec(scalar);
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator +=( const floatInVec &scalar )
-    return *this = floatInVec(ref, i) + scalar;
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator -=( float scalar )
-    return *this -= floatInVec(scalar);
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator -=( const floatInVec &scalar )
-    return *this = floatInVec(ref, i) - scalar;
-VECTORMATH_FORCE_INLINE Vector3::Vector3(const Vector3& vec)
-    set128(vec.get128());
-VECTORMATH_FORCE_INLINE void Vector3::set128(vec_float4 vec)
-    mVec128 = vec;
-VECTORMATH_FORCE_INLINE Vector3::Vector3( float _x, float _y, float _z )
-    mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
-VECTORMATH_FORCE_INLINE Vector3::Vector3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
-	__m128 xz = _mm_unpacklo_ps( _x.get128(), _z.get128() );
-	mVec128 = _mm_unpacklo_ps( xz, _y.get128() );
-VECTORMATH_FORCE_INLINE Vector3::Vector3( const Point3 &pnt )
-    mVec128 = pnt.get128();
-VECTORMATH_FORCE_INLINE Vector3::Vector3( float scalar )
-    mVec128 = floatInVec(scalar).get128();
-VECTORMATH_FORCE_INLINE Vector3::Vector3( const floatInVec &scalar )
-    mVec128 = scalar.get128();
-VECTORMATH_FORCE_INLINE Vector3::Vector3( __m128 vf4 )
-    mVec128 = vf4;
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::xAxis( )
-    return Vector3( _VECTORMATH_UNIT_1000 );
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::yAxis( )
-    return Vector3( _VECTORMATH_UNIT_0100 );
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::zAxis( )
-    return Vector3( _VECTORMATH_UNIT_0010 );
-VECTORMATH_FORCE_INLINE const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 )
-    return lerp( floatInVec(t), vec0, vec1 );
-VECTORMATH_FORCE_INLINE const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 )
-    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
-VECTORMATH_FORCE_INLINE const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
-    return slerp( floatInVec(t), unitVec0, unitVec1 );
-VECTORMATH_FORCE_INLINE const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
-    __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
-    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
-    __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
-    angle = acosf4( cosAngle );
-    tttt = t.get128();
-    oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
-    angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t
-    angles = _mm_unpacklo_ps( angles, oneMinusT );		// angles = 1, 1-t, t, 1-t
-    angles = _mm_mul_ps( angles, angle );
-    sines = sinf4( angles );
-    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
-    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
-    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
-    return Vector3( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
-VECTORMATH_FORCE_INLINE __m128 Vector3::get128( ) const
-    return mVec128;
-VECTORMATH_FORCE_INLINE void loadXYZ(Point3& vec, const float* fptr)
-#ifdef USE_SSE3_LDDQU
-	vec = Point3(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 );
-	SSEFloat fl;
-	fl.f[0] = fptr[0];
-	fl.f[1] = fptr[1];
-	fl.f[2] = fptr[2];
-	fl.f[3] = fptr[3];
-    vec = Point3(	fl.m128);
-#endif //USE_SSE3_LDDQU
-VECTORMATH_FORCE_INLINE void loadXYZ(Vector3& vec, const float* fptr)
-#ifdef USE_SSE3_LDDQU
-	vec = Vector3(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 );
-	SSEFloat fl;
-	fl.f[0] = fptr[0];
-	fl.f[1] = fptr[1];
-	fl.f[2] = fptr[2];
-	fl.f[3] = fptr[3];
-    vec = Vector3(	fl.m128);
-#endif //USE_SSE3_LDDQU
-VECTORMATH_FORCE_INLINE void storeXYZ( const Vector3 &vec, __m128 * quad )
-	__m128 dstVec = *quad;
-	VM_ATTRIBUTE_ALIGN16  unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
-	dstVec = vec_sel(vec.get128(), dstVec, sw);
-	*quad = dstVec;
-VECTORMATH_FORCE_INLINE void storeXYZ(const Point3& vec, float* fptr)
-	fptr[0] = vec.getX();
-	fptr[1] = vec.getY();
-	fptr[2] = vec.getZ();
-VECTORMATH_FORCE_INLINE void storeXYZ(const Vector3& vec, float* fptr)
-	fptr[0] = vec.getX();
-	fptr[1] = vec.getY();
-	fptr[2] = vec.getZ();
-VECTORMATH_FORCE_INLINE void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads )
-	const float *quads = (float *)threeQuads;
-    vec0 = Vector3(  _mm_load_ps(quads) );
-    vec1 = Vector3( _mm_loadu_ps(quads + 3) );
-    vec2 = Vector3( _mm_loadu_ps(quads + 6) );
-    vec3 = Vector3( _mm_loadu_ps(quads + 9) );
-VECTORMATH_FORCE_INLINE void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads )
-	__m128 xxxx = _mm_shuffle_ps( vec1.get128(), vec1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
-	__m128 zzzz = _mm_shuffle_ps( vec2.get128(), vec2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
-	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0, 0, 0, 0xffffffff};
-	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0xffffffff, 0, 0, 0};
-	threeQuads[0] = vec_sel( vec0.get128(), xxxx, xsw );
-    threeQuads[1] = _mm_shuffle_ps( vec1.get128(), vec2.get128(), _MM_SHUFFLE(1, 0, 2, 1) );
-    threeQuads[2] = vec_sel( _mm_shuffle_ps( vec3.get128(), vec3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads )
-	assert(0);
-#if 0
-    __m128 xyz0[3];
-    __m128 xyz1[3];
-    storeXYZArray( vec0, vec1, vec2, vec3, xyz0 );
-    storeXYZArray( vec4, vec5, vec6, vec7, xyz1 );
-    threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]);
-    threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]);
-    threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]);
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator =( const Vector3 &vec )
-    mVec128 = vec.mVec128;
-    return *this;
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setX( float _x )
-    _vmathVfSetElement(mVec128, _x, 0);
-    return *this;
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setX( const floatInVec &_x )
-    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Vector3::getX( ) const
-    return floatInVec( mVec128, 0 );
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setY( float _y )
-    _vmathVfSetElement(mVec128, _y, 1);
-    return *this;
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setY( const floatInVec &_y )
-    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Vector3::getY( ) const
-    return floatInVec( mVec128, 1 );
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setZ( float _z )
-    _vmathVfSetElement(mVec128, _z, 2);
-    return *this;
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setZ( const floatInVec &_z )
-    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Vector3::getZ( ) const
-    return floatInVec( mVec128, 2 );
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setElem( int idx, float value )
-    _vmathVfSetElement(mVec128, value, idx);
-    return *this;
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setElem( int idx, const floatInVec &value )
-    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Vector3::getElem( int idx ) const
-    return floatInVec( mVec128, idx );
-VECTORMATH_FORCE_INLINE VecIdx Vector3::operator []( int idx )
-    return VecIdx( mVec128, idx );
-VECTORMATH_FORCE_INLINE const floatInVec Vector3::operator []( int idx ) const
-    return floatInVec( mVec128, idx );
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator +( const Vector3 &vec ) const
-    return Vector3( _mm_add_ps( mVec128, vec.mVec128 ) );
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator -( const Vector3 &vec ) const
-    return Vector3( _mm_sub_ps( mVec128, vec.mVec128 ) );
-VECTORMATH_FORCE_INLINE const Point3 Vector3::operator +( const Point3 &pnt ) const
-    return Point3( _mm_add_ps( mVec128, pnt.get128() ) );
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator *( float scalar ) const
-    return *this * floatInVec(scalar);
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator *( const floatInVec &scalar ) const
-    return Vector3( _mm_mul_ps( mVec128, scalar.get128() ) );
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator +=( const Vector3 &vec )
-    *this = *this + vec;
-    return *this;
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator -=( const Vector3 &vec )
-    *this = *this - vec;
-    return *this;
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator *=( float scalar )
-    *this = *this * scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator *=( const floatInVec &scalar )
-    *this = *this * scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator /( float scalar ) const
-    return *this / floatInVec(scalar);
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator /( const floatInVec &scalar ) const
-    return Vector3( _mm_div_ps( mVec128, scalar.get128() ) );
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator /=( float scalar )
-    *this = *this / scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator /=( const floatInVec &scalar )
-    *this = *this / scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator -( ) const
-	//return Vector3(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
-	VM_ATTRIBUTE_ALIGN16 static const int array[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-	__m128 NEG_MASK = SSEFloat(*(const vec_float4*)array).vf;
-	return Vector3(_mm_xor_ps(get128(),NEG_MASK));
-VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar, const Vector3 &vec )
-    return floatInVec(scalar) * vec;
-VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec )
-    return vec * scalar;
-VECTORMATH_FORCE_INLINE const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 )
-    return Vector3( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
-VECTORMATH_FORCE_INLINE const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 )
-    return Vector3( _mm_div_ps( vec0.get128(), vec1.get128() ) );
-VECTORMATH_FORCE_INLINE const Vector3 recipPerElem( const Vector3 &vec )
-    return Vector3( _mm_rcp_ps( vec.get128() ) );
-VECTORMATH_FORCE_INLINE const Vector3 absPerElem( const Vector3 &vec )
-    return Vector3( fabsf4( vec.get128() ) );
-VECTORMATH_FORCE_INLINE const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 )
-	__m128 vmask = toM128(0x7fffffff);
-	return Vector3( _mm_or_ps(
-		_mm_and_ps   ( vmask, vec0.get128() ),			// Value
-		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
-VECTORMATH_FORCE_INLINE const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 )
-    return Vector3( _mm_max_ps( vec0.get128(), vec1.get128() ) );
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector3 &vec )
-    return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
-VECTORMATH_FORCE_INLINE const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 )
-    return Vector3( _mm_min_ps( vec0.get128(), vec1.get128() ) );
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector3 &vec )
-    return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector3 &vec )
-    return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 )
-    return floatInVec( _vmathVfDot3( vec0.get128(), vec1.get128() ), 0 );
-VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector3 &vec )
-    return floatInVec(  _vmathVfDot3( vec.get128(), vec.get128() ), 0 );
-VECTORMATH_FORCE_INLINE const floatInVec length( const Vector3 &vec )
-    return floatInVec(  _mm_sqrt_ps(_vmathVfDot3( vec.get128(), vec.get128() )), 0 );
-VECTORMATH_FORCE_INLINE const Vector3 normalizeApprox( const Vector3 &vec )
-    return Vector3( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
-VECTORMATH_FORCE_INLINE const Vector3 normalize( const Vector3 &vec )
-	return Vector3( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
-VECTORMATH_FORCE_INLINE const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 )
-    return Vector3( _vmathVfCross( vec0.get128(), vec1.get128() ) );
-VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 )
-    return select( vec0, vec1, boolInVec(select1) );
-VECTORMATH_FORCE_INLINE  const Vector4 select(const Vector4& vec0, const Vector4& vec1, const boolInVec& select1)
-    return Vector4(vec_sel(vec0.get128(), vec1.get128(), select1.get128()));
-VECTORMATH_FORCE_INLINE void print( const Vector3 &vec )
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = vec.get128();
-    printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
-VECTORMATH_FORCE_INLINE void print( const Vector3 &vec, const char * name )
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = vec.get128();
-    printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
-VECTORMATH_FORCE_INLINE Vector4::Vector4( float _x, float _y, float _z, float _w )
-    mVec128 = _mm_setr_ps(_x, _y, _z, _w); 
- }
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
-	mVec128 = _mm_unpacklo_ps(
-		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
-		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &xyz, float _w )
-    mVec128 = xyz.get128();
-    _vmathVfSetElement(mVec128, _w, 3);
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &xyz, const floatInVec &_w )
-    mVec128 = xyz.get128();
-    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &vec )
-    mVec128 = vec.get128();
-    mVec128 = _vmathVfInsert(mVec128, _mm_setzero_ps(), 3);
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const Point3 &pnt )
-    mVec128 = pnt.get128();
-    mVec128 = _vmathVfInsert(mVec128, _mm_set1_ps(1.0f), 3);
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const Quat &quat )
-    mVec128 = quat.get128();
-VECTORMATH_FORCE_INLINE Vector4::Vector4( float scalar )
-    mVec128 = floatInVec(scalar).get128();
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const floatInVec &scalar )
-    mVec128 = scalar.get128();
-VECTORMATH_FORCE_INLINE Vector4::Vector4( __m128 vf4 )
-    mVec128 = vf4;
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::xAxis( )
-    return Vector4( _VECTORMATH_UNIT_1000 );
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::yAxis( )
-    return Vector4( _VECTORMATH_UNIT_0100 );
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::zAxis( )
-    return Vector4( _VECTORMATH_UNIT_0010 );
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::wAxis( )
-    return Vector4( _VECTORMATH_UNIT_0001 );
-VECTORMATH_FORCE_INLINE const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 )
-    return lerp( floatInVec(t), vec0, vec1 );
-VECTORMATH_FORCE_INLINE const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 )
-    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
-VECTORMATH_FORCE_INLINE const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
-    return slerp( floatInVec(t), unitVec0, unitVec1 );
-VECTORMATH_FORCE_INLINE const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
-    __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
-    cosAngle = _vmathVfDot4( unitVec0.get128(), unitVec1.get128() );
-    __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
-    angle = acosf4( cosAngle );
-    tttt = t.get128();
-    oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
-    angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t
-    angles = _mm_unpacklo_ps( angles, oneMinusT );		// angles = 1, 1-t, t, 1-t
-    angles = _mm_mul_ps( angles, angle );
-    sines = sinf4( angles );
-    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
-    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
-    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
-    return Vector4( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
-VECTORMATH_FORCE_INLINE __m128 Vector4::get128( ) const
-    return mVec128;
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads )
-    twoQuads[0] = _vmath2VfToHalfFloats(vec0.get128(), vec1.get128());
-    twoQuads[1] = _vmath2VfToHalfFloats(vec2.get128(), vec3.get128());
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator =( const Vector4 &vec )
-    mVec128 = vec.mVec128;
-    return *this;
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setXYZ( const Vector3 &vec )
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
-	mVec128 = vec_sel( vec.get128(), mVec128, sw );
-    return *this;
-VECTORMATH_FORCE_INLINE const Vector3 Vector4::getXYZ( ) const
-    return Vector3( mVec128 );
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setX( float _x )
-    _vmathVfSetElement(mVec128, _x, 0);
-    return *this;
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setX( const floatInVec &_x )
-    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::getX( ) const
-    return floatInVec( mVec128, 0 );
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setY( float _y )
-    _vmathVfSetElement(mVec128, _y, 1);
-    return *this;
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setY( const floatInVec &_y )
-    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::getY( ) const
-    return floatInVec( mVec128, 1 );
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setZ( float _z )
-    _vmathVfSetElement(mVec128, _z, 2);
-    return *this;
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setZ( const floatInVec &_z )
-    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::getZ( ) const
-    return floatInVec( mVec128, 2 );
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setW( float _w )
-    _vmathVfSetElement(mVec128, _w, 3);
-    return *this;
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setW( const floatInVec &_w )
-    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::getW( ) const
-    return floatInVec( mVec128, 3 );
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setElem( int idx, float value )
-    _vmathVfSetElement(mVec128, value, idx);
-    return *this;
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setElem( int idx, const floatInVec &value )
-    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::getElem( int idx ) const
-    return floatInVec( mVec128, idx );
-VECTORMATH_FORCE_INLINE VecIdx Vector4::operator []( int idx )
-    return VecIdx( mVec128, idx );
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::operator []( int idx ) const
-    return floatInVec( mVec128, idx );
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator +( const Vector4 &vec ) const
-    return Vector4( _mm_add_ps( mVec128, vec.mVec128 ) );
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator -( const Vector4 &vec ) const
-    return Vector4( _mm_sub_ps( mVec128, vec.mVec128 ) );
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator *( float scalar ) const
-    return *this * floatInVec(scalar);
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator *( const floatInVec &scalar ) const
-    return Vector4( _mm_mul_ps( mVec128, scalar.get128() ) );
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator +=( const Vector4 &vec )
-    *this = *this + vec;
-    return *this;
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator -=( const Vector4 &vec )
-    *this = *this - vec;
-    return *this;
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator *=( float scalar )
-    *this = *this * scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator *=( const floatInVec &scalar )
-    *this = *this * scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator /( float scalar ) const
-    return *this / floatInVec(scalar);
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator /( const floatInVec &scalar ) const
-    return Vector4( _mm_div_ps( mVec128, scalar.get128() ) );
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator /=( float scalar )
-    *this = *this / scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator /=( const floatInVec &scalar )
-    *this = *this / scalar;
-    return *this;
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator -( ) const
-	return Vector4(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
-VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar, const Vector4 &vec )
-    return floatInVec(scalar) * vec;
-VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec )
-    return vec * scalar;
-VECTORMATH_FORCE_INLINE const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 )
-    return Vector4( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
-VECTORMATH_FORCE_INLINE const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 )
-    return Vector4( _mm_div_ps( vec0.get128(), vec1.get128() ) );
-VECTORMATH_FORCE_INLINE const Vector4 recipPerElem( const Vector4 &vec )
-    return Vector4( _mm_rcp_ps( vec.get128() ) );
-VECTORMATH_FORCE_INLINE const Vector4 absPerElem( const Vector4 &vec )
-    return Vector4( fabsf4( vec.get128() ) );
-VECTORMATH_FORCE_INLINE const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 )
-	__m128 vmask = toM128(0x7fffffff);
-	return Vector4( _mm_or_ps(
-		_mm_and_ps   ( vmask, vec0.get128() ),			// Value
-		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
-VECTORMATH_FORCE_INLINE const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 )
-    return Vector4( _mm_max_ps( vec0.get128(), vec1.get128() ) );
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector4 &vec )
-    return floatInVec( _mm_max_ps(
-		_mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
-		_mm_max_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
-VECTORMATH_FORCE_INLINE const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 )
-    return Vector4( _mm_min_ps( vec0.get128(), vec1.get128() ) );
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector4 &vec )
-    return floatInVec( _mm_min_ps(
-		_mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
-		_mm_min_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector4 &vec )
-    return floatInVec( _mm_add_ps(
-		_mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
-		_mm_add_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 )
-    return floatInVec( _vmathVfDot4( vec0.get128(), vec1.get128() ), 0 );
-VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector4 &vec )
-    return floatInVec(  _vmathVfDot4( vec.get128(), vec.get128() ), 0 );
-VECTORMATH_FORCE_INLINE const floatInVec length( const Vector4 &vec )
-    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( vec.get128(), vec.get128() )), 0 );
-VECTORMATH_FORCE_INLINE const Vector4 normalizeApprox( const Vector4 &vec )
-    return Vector4( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
-VECTORMATH_FORCE_INLINE const Vector4 normalize( const Vector4 &vec )
-    return Vector4( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
-VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 )
-    return select( vec0, vec1, boolInVec(select1) );
-VECTORMATH_FORCE_INLINE void print( const Vector4 &vec )
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = vec.get128();
-    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
-VECTORMATH_FORCE_INLINE void print( const Vector4 &vec, const char * name )
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = vec.get128();
-    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
-VECTORMATH_FORCE_INLINE Point3::Point3( float _x, float _y, float _z )
-    mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
-VECTORMATH_FORCE_INLINE Point3::Point3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
-	mVec128 = _mm_unpacklo_ps( _mm_unpacklo_ps( _x.get128(), _z.get128() ), _y.get128() );
-VECTORMATH_FORCE_INLINE Point3::Point3( const Vector3 &vec )
-    mVec128 = vec.get128();
-VECTORMATH_FORCE_INLINE Point3::Point3( float scalar )
-    mVec128 = floatInVec(scalar).get128();
-VECTORMATH_FORCE_INLINE Point3::Point3( const floatInVec &scalar )
-    mVec128 = scalar.get128();
-VECTORMATH_FORCE_INLINE Point3::Point3( __m128 vf4 )
-    mVec128 = vf4;
-VECTORMATH_FORCE_INLINE const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 )
-    return lerp( floatInVec(t), pnt0, pnt1 );
-VECTORMATH_FORCE_INLINE const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 )
-    return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
-VECTORMATH_FORCE_INLINE __m128 Point3::get128( ) const
-    return mVec128;
-VECTORMATH_FORCE_INLINE void storeXYZ( const Point3 &pnt, __m128 * quad )
-    __m128 dstVec = *quad;
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
-    dstVec = vec_sel(pnt.get128(), dstVec, sw);
-    *quad = dstVec;
-VECTORMATH_FORCE_INLINE void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads )
-	const float *quads = (float *)threeQuads;
-    pnt0 = Point3(  _mm_load_ps(quads) );
-    pnt1 = Point3( _mm_loadu_ps(quads + 3) );
-    pnt2 = Point3( _mm_loadu_ps(quads + 6) );
-    pnt3 = Point3( _mm_loadu_ps(quads + 9) );
-VECTORMATH_FORCE_INLINE void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads )
-	__m128 xxxx = _mm_shuffle_ps( pnt1.get128(), pnt1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
-	__m128 zzzz = _mm_shuffle_ps( pnt2.get128(), pnt2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
-	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0, 0, 0, 0xffffffff};
-	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0xffffffff, 0, 0, 0};
-	threeQuads[0] = vec_sel( pnt0.get128(), xxxx, xsw );
-    threeQuads[1] = _mm_shuffle_ps( pnt1.get128(), pnt2.get128(), _MM_SHUFFLE(1, 0, 2, 1) );
-    threeQuads[2] = vec_sel( _mm_shuffle_ps( pnt3.get128(), pnt3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads )
-#if 0
-    __m128 xyz0[3];
-    __m128 xyz1[3];
-    storeXYZArray( pnt0, pnt1, pnt2, pnt3, xyz0 );
-    storeXYZArray( pnt4, pnt5, pnt6, pnt7, xyz1 );
-    threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]);
-    threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]);
-    threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]);
-	assert(0);
-VECTORMATH_FORCE_INLINE Point3 & Point3::operator =( const Point3 &pnt )
-    mVec128 = pnt.mVec128;
-    return *this;
-VECTORMATH_FORCE_INLINE Point3 & Point3::setX( float _x )
-    _vmathVfSetElement(mVec128, _x, 0);
-    return *this;
-VECTORMATH_FORCE_INLINE Point3 & Point3::setX( const floatInVec &_x )
-    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Point3::getX( ) const
-    return floatInVec( mVec128, 0 );
-VECTORMATH_FORCE_INLINE Point3 & Point3::setY( float _y )
-    _vmathVfSetElement(mVec128, _y, 1);
-    return *this;
-VECTORMATH_FORCE_INLINE Point3 & Point3::setY( const floatInVec &_y )
-    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Point3::getY( ) const
-    return floatInVec( mVec128, 1 );
-VECTORMATH_FORCE_INLINE Point3 & Point3::setZ( float _z )
-    _vmathVfSetElement(mVec128, _z, 2);
-    return *this;
-VECTORMATH_FORCE_INLINE Point3 & Point3::setZ( const floatInVec &_z )
-    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Point3::getZ( ) const
-    return floatInVec( mVec128, 2 );
-VECTORMATH_FORCE_INLINE Point3 & Point3::setElem( int idx, float value )
-    _vmathVfSetElement(mVec128, value, idx);
-    return *this;
-VECTORMATH_FORCE_INLINE Point3 & Point3::setElem( int idx, const floatInVec &value )
-    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
-    return *this;
-VECTORMATH_FORCE_INLINE const floatInVec Point3::getElem( int idx ) const
-    return floatInVec( mVec128, idx );
-VECTORMATH_FORCE_INLINE VecIdx Point3::operator []( int idx )
-    return VecIdx( mVec128, idx );
-VECTORMATH_FORCE_INLINE const floatInVec Point3::operator []( int idx ) const
-    return floatInVec( mVec128, idx );
-VECTORMATH_FORCE_INLINE const Vector3 Point3::operator -( const Point3 &pnt ) const
-    return Vector3( _mm_sub_ps( mVec128, pnt.mVec128 ) );
-VECTORMATH_FORCE_INLINE const Point3 Point3::operator +( const Vector3 &vec ) const
-    return Point3( _mm_add_ps( mVec128, vec.get128() ) );
-VECTORMATH_FORCE_INLINE const Point3 Point3::operator -( const Vector3 &vec ) const
-    return Point3( _mm_sub_ps( mVec128, vec.get128() ) );
-VECTORMATH_FORCE_INLINE Point3 & Point3::operator +=( const Vector3 &vec )
-    *this = *this + vec;
-    return *this;
-VECTORMATH_FORCE_INLINE Point3 & Point3::operator -=( const Vector3 &vec )
-    *this = *this - vec;
-    return *this;
-VECTORMATH_FORCE_INLINE const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 )
-    return Point3( _mm_mul_ps( pnt0.get128(), pnt1.get128() ) );
-VECTORMATH_FORCE_INLINE const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 )
-    return Point3( _mm_div_ps( pnt0.get128(), pnt1.get128() ) );
-VECTORMATH_FORCE_INLINE const Point3 recipPerElem( const Point3 &pnt )
-    return Point3( _mm_rcp_ps( pnt.get128() ) );
-VECTORMATH_FORCE_INLINE const Point3 absPerElem( const Point3 &pnt )
-    return Point3( fabsf4( pnt.get128() ) );
-VECTORMATH_FORCE_INLINE const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 )
-	__m128 vmask = toM128(0x7fffffff);
-	return Point3( _mm_or_ps(
-		_mm_and_ps   ( vmask, pnt0.get128() ),			// Value
-		_mm_andnot_ps( vmask, pnt1.get128() ) ) );		// Signs
-VECTORMATH_FORCE_INLINE const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 )
-    return Point3( _mm_max_ps( pnt0.get128(), pnt1.get128() ) );
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Point3 &pnt )
-    return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
-VECTORMATH_FORCE_INLINE const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 )
-    return Point3( _mm_min_ps( pnt0.get128(), pnt1.get128() ) );
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Point3 &pnt )
-    return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Point3 &pnt )
-    return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, float scaleVal )
-    return scale( pnt, floatInVec( scaleVal ) );
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal )
-    return mulPerElem( pnt, Point3( scaleVal ) );
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec )
-    return mulPerElem( pnt, Point3( scaleVec ) );
-VECTORMATH_FORCE_INLINE const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec )
-    return floatInVec( _vmathVfDot3( pnt.get128(), unitVec.get128() ), 0 );
-VECTORMATH_FORCE_INLINE const floatInVec distSqrFromOrigin( const Point3 &pnt )
-    return lengthSqr( Vector3( pnt ) );
-VECTORMATH_FORCE_INLINE const floatInVec distFromOrigin( const Point3 &pnt )
-    return length( Vector3( pnt ) );
-VECTORMATH_FORCE_INLINE const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 )
-    return lengthSqr( ( pnt1 - pnt0 ) );
-VECTORMATH_FORCE_INLINE const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 )
-    return length( ( pnt1 - pnt0 ) );
-VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 )
-    return select( pnt0, pnt1, boolInVec(select1) );
-VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 )
-    return Point3( vec_sel( pnt0.get128(), pnt1.get128(), select1.get128() ) );
-VECTORMATH_FORCE_INLINE void print( const Point3 &pnt )
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = pnt.get128();
-    printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
-VECTORMATH_FORCE_INLINE void print( const Point3 &pnt, const char * name )
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = pnt.get128();
-    printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
-} // namespace Aos
-} // namespace Vectormath
diff --git a/src/bullet/vectormath/sse/vecidx_aos.h b/src/bullet/vectormath/sse/vecidx_aos.h
deleted file mode 100644
index 8ba4b1d7..00000000
--- a/src/bullet/vectormath/sse/vecidx_aos.h
+++ /dev/null
@@ -1,80 +0,0 @@
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-#include "floatInVec.h"
-namespace Vectormath {
-namespace Aos {
-// VecIdx 
-// Used in setting elements of Vector3, Vector4, Point3, or Quat with the 
-// subscripting operator.
-   __m128 &ref;
-   int i;
-    inline VecIdx( __m128& vec, int idx ): ref(vec) { i = idx; }
-    // implicitly casts to float unless _VECTORMATH_NO_SCALAR_CAST defined
-    // in which case, implicitly casts to floatInVec, and one must call
-    // getAsFloat to convert to float.
-    //
-    inline operator floatInVec() const;
-    inline float getAsFloat() const;
-    inline operator float() const;
-    inline float operator =( float scalar );
-    inline floatInVec operator =( const floatInVec &scalar );
-    inline floatInVec operator =( const VecIdx& scalar );
-    inline floatInVec operator *=( float scalar );
-    inline floatInVec operator *=( const floatInVec &scalar );
-    inline floatInVec operator /=( float scalar );
-    inline floatInVec operator /=( const floatInVec &scalar );
-    inline floatInVec operator +=( float scalar );
-    inline floatInVec operator +=( const floatInVec &scalar );
-    inline floatInVec operator -=( float scalar );
-    inline floatInVec operator -=( const floatInVec &scalar );
-} // namespace Aos
-} // namespace Vectormath
diff --git a/src/bullet/vectormath/sse/vectormath_aos.h b/src/bullet/vectormath/sse/vectormath_aos.h
deleted file mode 100644
index c3a02be0..00000000
--- a/src/bullet/vectormath/sse/vectormath_aos.h
+++ /dev/null
@@ -1,2547 +0,0 @@
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-#include <math.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <assert.h>
-#define Vector3Ref Vector3&
-#define QuatRef	Quat&
-#define Matrix3Ref Matrix3&
-#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400)
-	#define USE_SSE3_LDDQU
-	#define VM_ATTRIBUTE_ALIGNED_CLASS16(a) __declspec(align(16)) a
-	#define VM_ATTRIBUTE_ALIGN16 __declspec(align(16))
-	#define VECTORMATH_FORCE_INLINE __forceinline 
-	#define VM_ATTRIBUTE_ALIGNED_CLASS16(a) a __attribute__ ((aligned (16)))	
-	#define VM_ATTRIBUTE_ALIGN16 __attribute__ ((aligned (16)))	
-	#ifdef __SSE3__
-		#define USE_SSE3_LDDQU
-	#endif //__SSE3__
-#ifdef USE_SSE3_LDDQU
-#include <pmmintrin.h>//_mm_lddqu_si128
-#endif //USE_SSE3_LDDQU
-// TODO: Tidy
-typedef __m128 vec_float4;
-typedef __m128 vec_uint4;
-typedef __m128 vec_int4;
-typedef __m128i vec_uchar16;
-typedef __m128i vec_ushort8;
-#define vec_splat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))
-#define _mm_ror_ps(vec,i)	\
-	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(i+3)%4,(unsigned char)(i+2)%4,(unsigned char)(i+1)%4,(unsigned char)(i+0)%4))) : (vec))
-#define _mm_rol_ps(vec,i)	\
-	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(7-i)%4,(unsigned char)(6-i)%4,(unsigned char)(5-i)%4,(unsigned char)(4-i)%4))) : (vec))
-#define vec_sld(vec,vec2,x) _mm_ror_ps(vec, ((x)/4))
-#define _mm_abs_ps(vec)		_mm_andnot_ps(_MASKSIGN_,vec)
-#define _mm_neg_ps(vec)		_mm_xor_ps(_MASKSIGN_,vec)
-#define vec_madd(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b) )
-union SSEFloat
-	__m128i vi;
-	__m128 m128;
-	__m128 vf;
-	unsigned int	ui[4];
-	unsigned short s[8];
-	float f[4];
-	SSEFloat(__m128 v) : m128(v) {}
-    SSEFloat(__m128i v) : vi(v) {}
-	SSEFloat() {}//uninitialized
-static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
-	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
-static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, const unsigned int *_mask)
-	return vec_sel(a, b, _mm_load_ps((float *)_mask));
-static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, unsigned int _mask)
-	return vec_sel(a, b, _mm_set1_ps(*(float *)&_mask));
-static VECTORMATH_FORCE_INLINE __m128 toM128(unsigned int x)
-    return _mm_set1_ps( *(float *)&x );
-static VECTORMATH_FORCE_INLINE __m128 fabsf4(__m128 x)
-    return _mm_and_ps( x, toM128( 0x7fffffff ) );
-union SSE64
-	__m128 m128;
-	struct
-	{
-		__m64 m01;
-		__m64 m23;
-	} m64;
-static VECTORMATH_FORCE_INLINE __m128 vec_cts(__m128 x, int a)
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	SSE64 sse64;
-	sse64.m64.m01 = _mm_cvttps_pi32(x);
-	sse64.m64.m23 = _mm_cvttps_pi32(_mm_ror_ps(x,2));
-	_mm_empty();
-    return sse64.m128;
-static VECTORMATH_FORCE_INLINE __m128 vec_ctf(__m128 x, int a)
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	SSE64 sse64;
-	sse64.m128 = x;
-	__m128 result =_mm_movelh_ps(
-		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m01),
-		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m23));
-	_mm_empty();
-	return result;
-static VECTORMATH_FORCE_INLINE __m128 vec_cts(__m128 x, int a)
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	__m128i result = _mm_cvtps_epi32(x);
-    return (__m128 &)result;
-static VECTORMATH_FORCE_INLINE __m128 vec_ctf(__m128 x, int a)
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	return _mm_cvtepi32_ps((__m128i &)x);
-#define vec_nmsub(a,b,c) _mm_sub_ps( c, _mm_mul_ps( a, b ) )
-#define vec_sub(a,b) _mm_sub_ps( a, b )
-#define vec_add(a,b) _mm_add_ps( a, b )
-#define vec_mul(a,b) _mm_mul_ps( a, b )
-#define vec_xor(a,b) _mm_xor_ps( a, b )
-#define vec_and(a,b) _mm_and_ps( a, b )
-#define vec_cmpeq(a,b) _mm_cmpeq_ps( a, b )
-#define vec_cmpgt(a,b) _mm_cmpgt_ps( a, b )
-#define vec_mergeh(a,b) _mm_unpacklo_ps( a, b )
-#define vec_mergel(a,b) _mm_unpackhi_ps( a, b )
-#define vec_andc(a,b) _mm_andnot_ps( b, a )
-#define sqrtf4(x) _mm_sqrt_ps( x )
-#define rsqrtf4(x) _mm_rsqrt_ps( x )
-#define recipf4(x) _mm_rcp_ps( x )
-#define negatef4(x) _mm_sub_ps( _mm_setzero_ps(), x )
-static VECTORMATH_FORCE_INLINE __m128 newtonrapson_rsqrt4( const __m128 v )
-#define _half4 _mm_setr_ps(.5f,.5f,.5f,.5f) 
-#define _three _mm_setr_ps(3.f,3.f,3.f,3.f)
-const __m128 approx = _mm_rsqrt_ps( v );   
-const __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx);   
-return _mm_mul_ps(_mm_mul_ps(_half4, approx), _mm_sub_ps(_three, muls) );
-static VECTORMATH_FORCE_INLINE __m128 acosf4(__m128 x)
-    __m128 xabs = fabsf4(x);
-	__m128 select = _mm_cmplt_ps( x, _mm_setzero_ps() );
-    __m128 t1 = sqrtf4(vec_sub(_mm_set1_ps(1.0f), xabs));
-    /* Instruction counts can be reduced if the polynomial was
-     * computed entirely from nested (dependent) fma's. However, 
-     * to reduce the number of pipeline stalls, the polygon is evaluated 
-     * in two halves (hi amd lo). 
-     */
-    __m128 xabs2 = _mm_mul_ps(xabs,  xabs);
-    __m128 xabs4 = _mm_mul_ps(xabs2, xabs2);
-    __m128 hi = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0012624911f),
-		xabs, _mm_set1_ps(0.0066700901f)),
-			xabs, _mm_set1_ps(-0.0170881256f)),
-				xabs, _mm_set1_ps( 0.0308918810f));
-    __m128 lo = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0501743046f),
-		xabs, _mm_set1_ps(0.0889789874f)),
-			xabs, _mm_set1_ps(-0.2145988016f)),
-				xabs, _mm_set1_ps( 1.5707963050f));
-    __m128 result = vec_madd(hi, xabs4, lo);
-    // Adjust the result if x is negactive.
-    return vec_sel(
-		vec_mul(t1, result),									// Positive
-		vec_nmsub(t1, result, _mm_set1_ps(3.1415926535898f)),	// Negative
-		select);
-static VECTORMATH_FORCE_INLINE __m128 sinf4(vec_float4 x)
-// Common constants used to evaluate sinf4/cosf4/tanf4
-#define _SINCOS_CC0  -0.0013602249f
-#define _SINCOS_CC1   0.0416566950f
-#define _SINCOS_CC2  -0.4999990225f
-#define _SINCOS_SC0  -0.0001950727f
-#define _SINCOS_SC1   0.0083320758f
-#define _SINCOS_SC2  -0.1666665247f
-#define _SINCOS_KC1  1.57079625129f
-#define _SINCOS_KC2  7.54978995489e-8f
-    vec_float4 xl,xl2,xl3,res;
-    // Range reduction using : xl = angle * TwoOverPi;
-    //  
-    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
-    // Find the quadrant the angle falls in
-    // using:  q = (int) (ceil(abs(xl))*sign(xl))
-    //
-    vec_int4 q = vec_cts(xl,0);
-    // Compute an offset based on the quadrant that the angle falls in
-    // 
-    vec_int4 offset = _mm_and_ps(q,toM128(0x3));
-    // Remainder in range [-pi/4..pi/4]
-    //
-    vec_float4 qf = vec_ctf(q,0);
-    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
-    // Compute x^2 and x^3
-    //
-    xl2 = vec_mul(xl,xl);
-    xl3 = vec_mul(xl2,xl);
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
-    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    //
-    vec_float4 cx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
-    vec_float4 sx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    //
-    res = vec_sel(cx,sx,vec_cmpeq(vec_and(offset,
-                                          toM128(0x1)),
-										  _mm_setzero_ps()));
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    //
-    return vec_sel(
-		vec_xor(toM128(0x80000000U), res),	// Negative
-		res,								// Positive
-		vec_cmpeq(vec_and(offset,toM128(0x2)),_mm_setzero_ps()));
-static VECTORMATH_FORCE_INLINE void sincosf4(vec_float4 x, vec_float4* s, vec_float4* c)
-    vec_float4 xl,xl2,xl3;
-    vec_int4   offsetSin, offsetCos;
-    // Range reduction using : xl = angle * TwoOverPi;
-    //  
-    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
-    // Find the quadrant the angle falls in
-    // using:  q = (int) (ceil(abs(xl))*sign(xl))
-    //
-    //vec_int4 q = vec_cts(vec_add(xl,vec_sel(_mm_set1_ps(0.5f),xl,(0x80000000))),0);
-    vec_int4 q = vec_cts(xl,0);
-    // Compute the offset based on the quadrant that the angle falls in.
-    // Add 1 to the offset for the cosine. 
-    //
-    offsetSin = vec_and(q,toM128((int)0x3));
-	__m128i temp = _mm_add_epi32(_mm_set1_epi32(1),(__m128i &)offsetSin);
-	offsetCos = (__m128 &)temp;
-    // Remainder in range [-pi/4..pi/4]
-    //
-    vec_float4 qf = vec_ctf(q,0);
-    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
-    // Compute x^2 and x^3
-    //
-    xl2 = vec_mul(xl,xl);
-    xl3 = vec_mul(xl2,xl);
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
-    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    //
-    vec_float4 cx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
-    vec_float4 sx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    //
-    vec_uint4 sinMask = (vec_uint4)vec_cmpeq(vec_and(offsetSin,toM128(0x1)),_mm_setzero_ps());
-    vec_uint4 cosMask = (vec_uint4)vec_cmpeq(vec_and(offsetCos,toM128(0x1)),_mm_setzero_ps());    
-    *s = vec_sel(cx,sx,sinMask);
-    *c = vec_sel(cx,sx,cosMask);
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    //
-    sinMask = vec_cmpeq(vec_and(offsetSin,toM128(0x2)),_mm_setzero_ps());
-    cosMask = vec_cmpeq(vec_and(offsetCos,toM128(0x2)),_mm_setzero_ps());
-    *s = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*s),*s,sinMask);
-    *c = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*c),*c,cosMask);    
-#include "vecidx_aos.h"
-#include "floatInVec.h"
-#include "boolInVec.h"
-#include <stdio.h>
-namespace Vectormath {
-namespace Aos {
-// Forward Declarations
-class Vector3;
-class Vector4;
-class Point3;
-class Quat;
-class Matrix3;
-class Matrix4;
-class Transform3;
-// A 3-D vector in array-of-structures format
-class Vector3
-    __m128 mVec128;
-	VECTORMATH_FORCE_INLINE void set128(vec_float4 vec);
-	 VECTORMATH_FORCE_INLINE  vec_float4& get128Ref();
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Vector3( ) { };
-	// Default copy constructor
-    // 
-	VECTORMATH_FORCE_INLINE Vector3(const Vector3& vec);
-    // Construct a 3-D vector from x, y, and z elements
-    // 
-    VECTORMATH_FORCE_INLINE Vector3( float x, float y, float z );
-    // Construct a 3-D vector from x, y, and z elements (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
-    // Copy elements from a 3-D point into a 3-D vector
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector3( const Point3 &pnt );
-    // Set all elements of a 3-D vector to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector3( float scalar );
-    // Set all elements of a 3-D vector to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector3( const floatInVec &scalar );
-    // Set vector float data in a 3-D vector
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector3( __m128 vf4 );
-    // Get vector float data from a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
-    // Assign one 3-D vector to another
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator =( const Vector3 &vec );
-    // Set the x element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setX( float x );
-    // Set the y element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setY( float y );
-    // Set the z element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setZ( float z );
-    // Set the x element of a 3-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setX( const floatInVec &x );
-    // Set the y element of a 3-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setY( const floatInVec &y );
-    // Set the z element of a 3-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setZ( const floatInVec &z );
-    // Get the x element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
-    // Get the y element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
-    // Get the z element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
-    // Set an x, y, or z element of a 3-D vector by index
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setElem( int idx, float value );
-    // Set an x, y, or z element of a 3-D vector by index (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setElem( int idx, const floatInVec &value );
-    // Get an x, y, or z element of a 3-D vector by index
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
-    // Subscripting operator to set or get an element
-    // 
-    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
-    // Subscripting operator to get an element
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
-    // Add two 3-D vectors
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator +( const Vector3 &vec ) const;
-    // Subtract a 3-D vector from another 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator -( const Vector3 &vec ) const;
-    // Add a 3-D vector to a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const Point3 operator +( const Point3 &pnt ) const;
-    // Multiply a 3-D vector by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar ) const;
-    // Divide a 3-D vector by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator /( float scalar ) const;
-    // Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar ) const;
-    // Divide a 3-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator /( const floatInVec &scalar ) const;
-    // Perform compound assignment and addition with a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator +=( const Vector3 &vec );
-    // Perform compound assignment and subtraction by a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator -=( const Vector3 &vec );
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator *=( float scalar );
-    // Perform compound assignment and division by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator /=( float scalar );
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator *=( const floatInVec &scalar );
-    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator /=( const floatInVec &scalar );
-    // Negate all elements of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator -( ) const;
-    // Construct x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector3 xAxis( );
-    // Construct y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector3 yAxis( );
-    // Construct z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector3 zAxis( );
-// Multiply a 3-D vector by a scalar
-VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar, const Vector3 &vec );
-// Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
-VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec );
-// Multiply two 3-D vectors per element
-VECTORMATH_FORCE_INLINE const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-// Divide two 3-D vectors per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-VECTORMATH_FORCE_INLINE const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-// Compute the reciprocal of a 3-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-VECTORMATH_FORCE_INLINE const Vector3 recipPerElem( const Vector3 &vec );
-// Compute the absolute value of a 3-D vector per element
-VECTORMATH_FORCE_INLINE const Vector3 absPerElem( const Vector3 &vec );
-// Copy sign from one 3-D vector to another, per element
-VECTORMATH_FORCE_INLINE const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-// Maximum of two 3-D vectors per element
-VECTORMATH_FORCE_INLINE const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-// Minimum of two 3-D vectors per element
-VECTORMATH_FORCE_INLINE const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-// Maximum element of a 3-D vector
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector3 &vec );
-// Minimum element of a 3-D vector
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector3 &vec );
-// Compute the sum of all elements of a 3-D vector
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector3 &vec );
-// Compute the dot product of two 3-D vectors
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 );
-// Compute the square of the length of a 3-D vector
-VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector3 &vec );
-// Compute the length of a 3-D vector
-VECTORMATH_FORCE_INLINE const floatInVec length( const Vector3 &vec );
-// Normalize a 3-D vector
-// NOTE: 
-// The result is unpredictable when all elements of vec are at or near zero.
-VECTORMATH_FORCE_INLINE const Vector3 normalize( const Vector3 &vec );
-// Compute cross product of two 3-D vectors
-VECTORMATH_FORCE_INLINE const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 );
-// Outer product of two 3-D vectors
-VECTORMATH_FORCE_INLINE const Matrix3 outer( const Vector3 &vec0, const Vector3 &vec1 );
-// Pre-multiply a row vector by a 3x3 matrix
-// NOTE: 
-// Slower than column post-multiply.
-VECTORMATH_FORCE_INLINE const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat );
-// Cross-product matrix of a 3-D vector
-VECTORMATH_FORCE_INLINE const Matrix3 crossMatrix( const Vector3 &vec );
-// Create cross-product matrix and multiply
-// NOTE: 
-// Faster than separately creating a cross-product matrix and multiplying.
-VECTORMATH_FORCE_INLINE const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat );
-// Linear interpolation between two 3-D vectors
-// NOTE: 
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 );
-// Linear interpolation between two 3-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 );
-// Spherical linear interpolation between two 3-D vectors
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
-// Spherical linear interpolation between two 3-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
-// Conditionally select between two 3-D vectors
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 );
-// Conditionally select between two 3-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, const boolInVec &select1 );
-// Store x, y, and z elements of 3-D vector in first three words of a quadword, preserving fourth word
-VECTORMATH_FORCE_INLINE void storeXYZ( const Vector3 &vec, __m128 * quad );
-// Load four three-float 3-D vectors, stored in three quadwords
-VECTORMATH_FORCE_INLINE void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads );
-// Store four 3-D vectors in three quadwords
-VECTORMATH_FORCE_INLINE void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads );
-// Store eight 3-D vectors as half-floats
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads );
-// Print a 3-D vector
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Vector3 &vec );
-// Print a 3-D vector and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Vector3 &vec, const char * name );
-// A 4-D vector in array-of-structures format
-class Vector4
-    __m128 mVec128;
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Vector4( ) { };
-    // Construct a 4-D vector from x, y, z, and w elements
-    // 
-    VECTORMATH_FORCE_INLINE Vector4( float x, float y, float z, float w );
-    // Construct a 4-D vector from x, y, z, and w elements (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
-    // Construct a 4-D vector from a 3-D vector and a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Vector4( const Vector3 &xyz, float w );
-    // Construct a 4-D vector from a 3-D vector and a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4( const Vector3 &xyz, const floatInVec &w );
-    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( const Vector3 &vec );
-    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( const Point3 &pnt );
-    // Copy elements from a quaternion into a 4-D vector
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( const Quat &quat );
-    // Set all elements of a 4-D vector to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( float scalar );
-    // Set all elements of a 4-D vector to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( const floatInVec &scalar );
-    // Set vector float data in a 4-D vector
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( __m128 vf4 );
-    // Get vector float data from a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
-    // Assign one 4-D vector to another
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator =( const Vector4 &vec );
-    // Set the x, y, and z elements of a 4-D vector
-    // NOTE: 
-    // This function does not change the w element.
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setXYZ( const Vector3 &vec );
-    // Get the x, y, and z elements of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getXYZ( ) const;
-    // Set the x element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setX( float x );
-    // Set the y element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setY( float y );
-    // Set the z element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setZ( float z );
-    // Set the w element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setW( float w );
-    // Set the x element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setX( const floatInVec &x );
-    // Set the y element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setY( const floatInVec &y );
-    // Set the z element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setZ( const floatInVec &z );
-    // Set the w element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setW( const floatInVec &w );
-    // Get the x element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
-    // Get the y element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
-    // Get the z element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
-    // Get the w element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getW( ) const;
-    // Set an x, y, z, or w element of a 4-D vector by index
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setElem( int idx, float value );
-    // Set an x, y, z, or w element of a 4-D vector by index (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setElem( int idx, const floatInVec &value );
-    // Get an x, y, z, or w element of a 4-D vector by index
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
-    // Subscripting operator to set or get an element
-    // 
-    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
-    // Subscripting operator to get an element
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
-    // Add two 4-D vectors
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator +( const Vector4 &vec ) const;
-    // Subtract a 4-D vector from another 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator -( const Vector4 &vec ) const;
-    // Multiply a 4-D vector by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar ) const;
-    // Divide a 4-D vector by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator /( float scalar ) const;
-    // Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar ) const;
-    // Divide a 4-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator /( const floatInVec &scalar ) const;
-    // Perform compound assignment and addition with a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator +=( const Vector4 &vec );
-    // Perform compound assignment and subtraction by a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator -=( const Vector4 &vec );
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator *=( float scalar );
-    // Perform compound assignment and division by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator /=( float scalar );
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator *=( const floatInVec &scalar );
-    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator /=( const floatInVec &scalar );
-    // Negate all elements of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator -( ) const;
-    // Construct x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector4 xAxis( );
-    // Construct y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector4 yAxis( );
-    // Construct z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector4 zAxis( );
-    // Construct w axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector4 wAxis( );
-// Multiply a 4-D vector by a scalar
-VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar, const Vector4 &vec );
-// Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
-VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec );
-// Multiply two 4-D vectors per element
-VECTORMATH_FORCE_INLINE const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-// Divide two 4-D vectors per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-VECTORMATH_FORCE_INLINE const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-// Compute the reciprocal of a 4-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-VECTORMATH_FORCE_INLINE const Vector4 recipPerElem( const Vector4 &vec );
-// Compute the absolute value of a 4-D vector per element
-VECTORMATH_FORCE_INLINE const Vector4 absPerElem( const Vector4 &vec );
-// Copy sign from one 4-D vector to another, per element
-VECTORMATH_FORCE_INLINE const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-// Maximum of two 4-D vectors per element
-VECTORMATH_FORCE_INLINE const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-// Minimum of two 4-D vectors per element
-VECTORMATH_FORCE_INLINE const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-// Maximum element of a 4-D vector
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector4 &vec );
-// Minimum element of a 4-D vector
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector4 &vec );
-// Compute the sum of all elements of a 4-D vector
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector4 &vec );
-// Compute the dot product of two 4-D vectors
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 );
-// Compute the square of the length of a 4-D vector
-VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector4 &vec );
-// Compute the length of a 4-D vector
-VECTORMATH_FORCE_INLINE const floatInVec length( const Vector4 &vec );
-// Normalize a 4-D vector
-// NOTE: 
-// The result is unpredictable when all elements of vec are at or near zero.
-VECTORMATH_FORCE_INLINE const Vector4 normalize( const Vector4 &vec );
-// Outer product of two 4-D vectors
-VECTORMATH_FORCE_INLINE const Matrix4 outer( const Vector4 &vec0, const Vector4 &vec1 );
-// Linear interpolation between two 4-D vectors
-// NOTE: 
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 );
-// Linear interpolation between two 4-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 );
-// Spherical linear interpolation between two 4-D vectors
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
-// Spherical linear interpolation between two 4-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
-// Conditionally select between two 4-D vectors
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 );
-// Conditionally select between two 4-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, const boolInVec &select1 );
-// Store four 4-D vectors as half-floats
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads );
-// Print a 4-D vector
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Vector4 &vec );
-// Print a 4-D vector and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Vector4 &vec, const char * name );
-// A 3-D point in array-of-structures format
-class Point3
-    __m128 mVec128;
-    // Default constructor; does no initialization
-    // 
-    // Construct a 3-D point from x, y, and z elements
-    // 
-    VECTORMATH_FORCE_INLINE Point3( float x, float y, float z );
-    // Construct a 3-D point from x, y, and z elements (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Point3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
-    // Copy elements from a 3-D vector into a 3-D point
-    // 
-    explicit VECTORMATH_FORCE_INLINE Point3( const Vector3 &vec );
-    // Set all elements of a 3-D point to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Point3( float scalar );
-    // Set all elements of a 3-D point to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Point3( const floatInVec &scalar );
-    // Set vector float data in a 3-D point
-    // 
-    explicit VECTORMATH_FORCE_INLINE Point3( __m128 vf4 );
-    // Get vector float data from a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
-    // Assign one 3-D point to another
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & operator =( const Point3 &pnt );
-    // Set the x element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setX( float x );
-    // Set the y element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setY( float y );
-    // Set the z element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setZ( float z );
-    // Set the x element of a 3-D point (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setX( const floatInVec &x );
-    // Set the y element of a 3-D point (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setY( const floatInVec &y );
-    // Set the z element of a 3-D point (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setZ( const floatInVec &z );
-    // Get the x element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
-    // Get the y element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
-    // Get the z element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
-    // Set an x, y, or z element of a 3-D point by index
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setElem( int idx, float value );
-    // Set an x, y, or z element of a 3-D point by index (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setElem( int idx, const floatInVec &value );
-    // Get an x, y, or z element of a 3-D point by index
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
-    // Subscripting operator to set or get an element
-    // 
-    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
-    // Subscripting operator to get an element
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
-    // Subtract a 3-D point from another 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator -( const Point3 &pnt ) const;
-    // Add a 3-D point to a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Point3 operator +( const Vector3 &vec ) const;
-    // Subtract a 3-D vector from a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const Point3 operator -( const Vector3 &vec ) const;
-    // Perform compound assignment and addition with a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & operator +=( const Vector3 &vec );
-    // Perform compound assignment and subtraction by a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & operator -=( const Vector3 &vec );
-// Multiply two 3-D points per element
-VECTORMATH_FORCE_INLINE const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-// Divide two 3-D points per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-VECTORMATH_FORCE_INLINE const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-// Compute the reciprocal of a 3-D point per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-VECTORMATH_FORCE_INLINE const Point3 recipPerElem( const Point3 &pnt );
-// Compute the absolute value of a 3-D point per element
-VECTORMATH_FORCE_INLINE const Point3 absPerElem( const Point3 &pnt );
-// Copy sign from one 3-D point to another, per element
-VECTORMATH_FORCE_INLINE const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-// Maximum of two 3-D points per element
-VECTORMATH_FORCE_INLINE const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-// Minimum of two 3-D points per element
-VECTORMATH_FORCE_INLINE const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-// Maximum element of a 3-D point
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Point3 &pnt );
-// Minimum element of a 3-D point
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Point3 &pnt );
-// Compute the sum of all elements of a 3-D point
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Point3 &pnt );
-// Apply uniform scale to a 3-D point
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, float scaleVal );
-// Apply uniform scale to a 3-D point (scalar data contained in vector data type)
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal );
-// Apply non-uniform scale to a 3-D point
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec );
-// Scalar projection of a 3-D point on a unit-length 3-D vector
-VECTORMATH_FORCE_INLINE const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec );
-// Compute the square of the distance of a 3-D point from the coordinate-system origin
-VECTORMATH_FORCE_INLINE const floatInVec distSqrFromOrigin( const Point3 &pnt );
-// Compute the distance of a 3-D point from the coordinate-system origin
-VECTORMATH_FORCE_INLINE const floatInVec distFromOrigin( const Point3 &pnt );
-// Compute the square of the distance between two 3-D points
-VECTORMATH_FORCE_INLINE const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 );
-// Compute the distance between two 3-D points
-VECTORMATH_FORCE_INLINE const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 );
-// Linear interpolation between two 3-D points
-// NOTE: 
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 );
-// Linear interpolation between two 3-D points (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 );
-// Conditionally select between two 3-D points
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 );
-// Conditionally select between two 3-D points (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 );
-// Store x, y, and z elements of 3-D point in first three words of a quadword, preserving fourth word
-VECTORMATH_FORCE_INLINE void storeXYZ( const Point3 &pnt, __m128 * quad );
-// Load four three-float 3-D points, stored in three quadwords
-VECTORMATH_FORCE_INLINE void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads );
-// Store four 3-D points in three quadwords
-VECTORMATH_FORCE_INLINE void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads );
-// Store eight 3-D points as half-floats
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads );
-// Print a 3-D point
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Point3 &pnt );
-// Print a 3-D point and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Point3 &pnt, const char * name );
-// A quaternion in array-of-structures format
-class Quat
-    __m128 mVec128;
-    // Default constructor; does no initialization
-    // 
-	VECTORMATH_FORCE_INLINE  Quat(const Quat& quat);
-    // Construct a quaternion from x, y, z, and w elements
-    // 
-    VECTORMATH_FORCE_INLINE Quat( float x, float y, float z, float w );
-    // Construct a quaternion from x, y, z, and w elements (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
-    // Construct a quaternion from a 3-D vector and a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Quat( const Vector3 &xyz, float w );
-    // Construct a quaternion from a 3-D vector and a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat( const Vector3 &xyz, const floatInVec &w );
-    // Copy elements from a 4-D vector into a quaternion
-    // 
-    explicit VECTORMATH_FORCE_INLINE Quat( const Vector4 &vec );
-    // Convert a rotation matrix to a unit-length quaternion
-    // 
-    explicit VECTORMATH_FORCE_INLINE Quat( const Matrix3 & rotMat );
-    // Set all elements of a quaternion to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Quat( float scalar );
-    // Set all elements of a quaternion to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Quat( const floatInVec &scalar );
-    // Set vector float data in a quaternion
-    // 
-    explicit VECTORMATH_FORCE_INLINE Quat( __m128 vf4 );
-    // Get vector float data from a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
-	// Set a quaterion from vector float data
-    //
-	VECTORMATH_FORCE_INLINE void set128(vec_float4 vec);
-    // Assign one quaternion to another
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator =( const Quat &quat );
-    // Set the x, y, and z elements of a quaternion
-    // NOTE: 
-    // This function does not change the w element.
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setXYZ( const Vector3 &vec );
-    // Get the x, y, and z elements of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getXYZ( ) const;
-    // Set the x element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setX( float x );
-    // Set the y element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setY( float y );
-    // Set the z element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setZ( float z );
-    // Set the w element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setW( float w );
-    // Set the x element of a quaternion (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setX( const floatInVec &x );
-    // Set the y element of a quaternion (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setY( const floatInVec &y );
-    // Set the z element of a quaternion (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setZ( const floatInVec &z );
-    // Set the w element of a quaternion (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setW( const floatInVec &w );
-    // Get the x element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
-    // Get the y element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
-    // Get the z element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
-    // Get the w element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getW( ) const;
-    // Set an x, y, z, or w element of a quaternion by index
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setElem( int idx, float value );
-    // Set an x, y, z, or w element of a quaternion by index (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setElem( int idx, const floatInVec &value );
-    // Get an x, y, z, or w element of a quaternion by index
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
-    // Subscripting operator to set or get an element
-    // 
-    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
-    // Subscripting operator to get an element
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
-    // Add two quaternions
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator +( const Quat &quat ) const;
-    // Subtract a quaternion from another quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator -( const Quat &quat ) const;
-    // Multiply two quaternions
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator *( const Quat &quat ) const;
-    // Multiply a quaternion by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator *( float scalar ) const;
-    // Divide a quaternion by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator /( float scalar ) const;
-    // Multiply a quaternion by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar ) const;
-    // Divide a quaternion by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator /( const floatInVec &scalar ) const;
-    // Perform compound assignment and addition with a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator +=( const Quat &quat );
-    // Perform compound assignment and subtraction by a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator -=( const Quat &quat );
-    // Perform compound assignment and multiplication by a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator *=( const Quat &quat );
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator *=( float scalar );
-    // Perform compound assignment and division by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator /=( float scalar );
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator *=( const floatInVec &scalar );
-    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator /=( const floatInVec &scalar );
-    // Negate all elements of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator -( ) const;
-    // Construct an identity quaternion
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat identity( );
-    // Construct a quaternion to rotate between two unit-length 3-D vectors
-    // NOTE: 
-    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 );
-    // Construct a quaternion to rotate around a unit-length 3-D vector
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotation( float radians, const Vector3 &unitVec );
-    // Construct a quaternion to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotation( const floatInVec &radians, const Vector3 &unitVec );
-    // Construct a quaternion to rotate around the x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationX( float radians );
-    // Construct a quaternion to rotate around the y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationY( float radians );
-    // Construct a quaternion to rotate around the z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationZ( float radians );
-    // Construct a quaternion to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationX( const floatInVec &radians );
-    // Construct a quaternion to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationY( const floatInVec &radians );
-    // Construct a quaternion to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationZ( const floatInVec &radians );
-// Multiply a quaternion by a scalar
-VECTORMATH_FORCE_INLINE const Quat operator *( float scalar, const Quat &quat );
-// Multiply a quaternion by a scalar (scalar data contained in vector data type)
-VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar, const Quat &quat );
-// Compute the conjugate of a quaternion
-VECTORMATH_FORCE_INLINE const Quat conj( const Quat &quat );
-// Use a unit-length quaternion to rotate a 3-D vector
-VECTORMATH_FORCE_INLINE const Vector3 rotate( const Quat &unitQuat, const Vector3 &vec );
-// Compute the dot product of two quaternions
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Quat &quat0, const Quat &quat1 );
-// Compute the norm of a quaternion
-VECTORMATH_FORCE_INLINE const floatInVec norm( const Quat &quat );
-// Compute the length of a quaternion
-VECTORMATH_FORCE_INLINE const floatInVec length( const Quat &quat );
-// Normalize a quaternion
-// NOTE: 
-// The result is unpredictable when all elements of quat are at or near zero.
-VECTORMATH_FORCE_INLINE const Quat normalize( const Quat &quat );
-// Linear interpolation between two quaternions
-// NOTE: 
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Quat lerp( float t, const Quat &quat0, const Quat &quat1 );
-// Linear interpolation between two quaternions (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 );
-// Spherical linear interpolation between two quaternions
-// NOTE: 
-// Interpolates along the shortest path between orientations.
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 );
-// Spherical linear interpolation between two quaternions (scalar data contained in vector data type)
-// NOTE: 
-// Interpolates along the shortest path between orientations.
-// Does not clamp t between 0 and 1.
-VECTORMATH_FORCE_INLINE const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 );
-// Spherical quadrangle interpolation
-VECTORMATH_FORCE_INLINE const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
-// Spherical quadrangle interpolation (scalar data contained in vector data type)
-VECTORMATH_FORCE_INLINE const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
-// Conditionally select between two quaternions
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, bool select1 );
-// Conditionally select between two quaternions (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 );
-// Print a quaternion
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Quat &quat );
-// Print a quaternion and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Quat &quat, const char * name );
-// A 3x3 matrix in array-of-structures format
-class Matrix3
-    Vector3 mCol0;
-    Vector3 mCol1;
-    Vector3 mCol2;
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3( ) { };
-    // Copy a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3( const Matrix3 & mat );
-    // Construct a 3x3 matrix containing the specified columns
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2 );
-    // Construct a 3x3 rotation matrix from a unit-length quaternion
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix3( const Quat &unitQuat );
-    // Set all elements of a 3x3 matrix to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix3( float scalar );
-    // Set all elements of a 3x3 matrix to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix3( const floatInVec &scalar );
-    // Assign one 3x3 matrix to another
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator =( const Matrix3 & mat );
-    // Set column 0 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setCol0( const Vector3 &col0 );
-    // Set column 1 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setCol1( const Vector3 &col1 );
-    // Set column 2 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setCol2( const Vector3 &col2 );
-    // Get column 0 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol0( ) const;
-    // Get column 1 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol1( ) const;
-    // Get column 2 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol2( ) const;
-    // Set the column of a 3x3 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setCol( int col, const Vector3 &vec );
-    // Set the row of a 3x3 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setRow( int row, const Vector3 &vec );
-    // Get the column of a 3x3 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol( int col ) const;
-    // Get the row of a 3x3 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getRow( int row ) const;
-    // Subscripting operator to set or get a column
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator []( int col );
-    // Subscripting operator to get a column
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator []( int col ) const;
-    // Set the element of a 3x3 matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setElem( int col, int row, float val );
-    // Set the element of a 3x3 matrix referred to by column and row indices (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setElem( int col, int row, const floatInVec &val );
-    // Get the element of a 3x3 matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
-    // Add two 3x3 matrices
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator +( const Matrix3 & mat ) const;
-    // Subtract a 3x3 matrix from another 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator -( const Matrix3 & mat ) const;
-    // Negate all elements of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator -( ) const;
-    // Multiply a 3x3 matrix by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar ) const;
-    // Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar ) const;
-    // Multiply a 3x3 matrix by a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator *( const Vector3 &vec ) const;
-    // Multiply two 3x3 matrices
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator *( const Matrix3 & mat ) const;
-    // Perform compound assignment and addition with a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator +=( const Matrix3 & mat );
-    // Perform compound assignment and subtraction by a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator -=( const Matrix3 & mat );
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( float scalar );
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( const floatInVec &scalar );
-    // Perform compound assignment and multiplication by a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( const Matrix3 & mat );
-    // Construct an identity 3x3 matrix
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 identity( );
-    // Construct a 3x3 matrix to rotate around the x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationX( float radians );
-    // Construct a 3x3 matrix to rotate around the y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationY( float radians );
-    // Construct a 3x3 matrix to rotate around the z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZ( float radians );
-    // Construct a 3x3 matrix to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationX( const floatInVec &radians );
-    // Construct a 3x3 matrix to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationY( const floatInVec &radians );
-    // Construct a 3x3 matrix to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZ( const floatInVec &radians );
-    // Construct a 3x3 matrix to rotate around the x, y, and z axes
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZYX( const Vector3 &radiansXYZ );
-    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( float radians, const Vector3 &unitVec );
-    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( const floatInVec &radians, const Vector3 &unitVec );
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( const Quat &unitQuat );
-    // Construct a 3x3 matrix to perform scaling
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 scale( const Vector3 &scaleVec );
-// Multiply a 3x3 matrix by a scalar
-VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar, const Matrix3 & mat );
-// Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
-VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat );
-// Append (post-multiply) a scale transformation to a 3x3 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-VECTORMATH_FORCE_INLINE const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec );
-// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-VECTORMATH_FORCE_INLINE const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat );
-// Multiply two 3x3 matrices per element
-VECTORMATH_FORCE_INLINE const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
-// Compute the absolute value of a 3x3 matrix per element
-VECTORMATH_FORCE_INLINE const Matrix3 absPerElem( const Matrix3 & mat );
-// Transpose of a 3x3 matrix
-VECTORMATH_FORCE_INLINE const Matrix3 transpose( const Matrix3 & mat );
-// Compute the inverse of a 3x3 matrix
-// NOTE: 
-// Result is unpredictable when the determinant of mat is equal to or near 0.
-VECTORMATH_FORCE_INLINE const Matrix3 inverse( const Matrix3 & mat );
-// Determinant of a 3x3 matrix
-VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix3 & mat );
-// Conditionally select between two 3x3 matrices
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
-// Conditionally select between two 3x3 matrices (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 );
-// Print a 3x3 matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat );
-// Print a 3x3 matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat, const char * name );
-// A 4x4 matrix in array-of-structures format
-class Matrix4
-    Vector4 mCol0;
-    Vector4 mCol1;
-    Vector4 mCol2;
-    Vector4 mCol3;
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4( ) { };
-    // Copy a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4( const Matrix4 & mat );
-    // Construct a 4x4 matrix containing the specified columns
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4( const Vector4 &col0, const Vector4 &col1, const Vector4 &col2, const Vector4 &col3 );
-    // Construct a 4x4 matrix from a 3x4 transformation matrix
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix4( const Transform3 & mat );
-    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4( const Matrix3 & mat, const Vector3 &translateVec );
-    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4( const Quat &unitQuat, const Vector3 &translateVec );
-    // Set all elements of a 4x4 matrix to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix4( float scalar );
-    // Set all elements of a 4x4 matrix to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix4( const floatInVec &scalar );
-    // Assign one 4x4 matrix to another
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator =( const Matrix4 & mat );
-    // Set the upper-left 3x3 submatrix
-    // NOTE: 
-    // This function does not change the bottom row elements.
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setUpper3x3( const Matrix3 & mat3 );
-    // Get the upper-left 3x3 submatrix of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 getUpper3x3( ) const;
-    // Set translation component
-    // NOTE: 
-    // This function does not change the bottom row elements.
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setTranslation( const Vector3 &translateVec );
-    // Get the translation component of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getTranslation( ) const;
-    // Set column 0 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setCol0( const Vector4 &col0 );
-    // Set column 1 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setCol1( const Vector4 &col1 );
-    // Set column 2 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setCol2( const Vector4 &col2 );
-    // Set column 3 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setCol3( const Vector4 &col3 );
-    // Get column 0 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getCol0( ) const;
-    // Get column 1 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getCol1( ) const;
-    // Get column 2 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getCol2( ) const;
-    // Get column 3 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getCol3( ) const;
-    // Set the column of a 4x4 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setCol( int col, const Vector4 &vec );
-    // Set the row of a 4x4 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setRow( int row, const Vector4 &vec );
-    // Get the column of a 4x4 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getCol( int col ) const;
-    // Get the row of a 4x4 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getRow( int row ) const;
-    // Subscripting operator to set or get a column
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator []( int col );
-    // Subscripting operator to get a column
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator []( int col ) const;
-    // Set the element of a 4x4 matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setElem( int col, int row, float val );
-    // Set the element of a 4x4 matrix referred to by column and row indices (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setElem( int col, int row, const floatInVec &val );
-    // Get the element of a 4x4 matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
-    // Add two 4x4 matrices
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator +( const Matrix4 & mat ) const;
-    // Subtract a 4x4 matrix from another 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator -( const Matrix4 & mat ) const;
-    // Negate all elements of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator -( ) const;
-    // Multiply a 4x4 matrix by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar ) const;
-    // Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar ) const;
-    // Multiply a 4x4 matrix by a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Vector4 &vec ) const;
-    // Multiply a 4x4 matrix by a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Vector3 &vec ) const;
-    // Multiply a 4x4 matrix by a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Point3 &pnt ) const;
-    // Multiply two 4x4 matrices
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const Matrix4 & mat ) const;
-    // Multiply a 4x4 matrix by a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const Transform3 & tfrm ) const;
-    // Perform compound assignment and addition with a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator +=( const Matrix4 & mat );
-    // Perform compound assignment and subtraction by a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator -=( const Matrix4 & mat );
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( float scalar );
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const floatInVec &scalar );
-    // Perform compound assignment and multiplication by a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const Matrix4 & mat );
-    // Perform compound assignment and multiplication by a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const Transform3 & tfrm );
-    // Construct an identity 4x4 matrix
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 identity( );
-    // Construct a 4x4 matrix to rotate around the x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationX( float radians );
-    // Construct a 4x4 matrix to rotate around the y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationY( float radians );
-    // Construct a 4x4 matrix to rotate around the z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZ( float radians );
-    // Construct a 4x4 matrix to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationX( const floatInVec &radians );
-    // Construct a 4x4 matrix to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationY( const floatInVec &radians );
-    // Construct a 4x4 matrix to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZ( const floatInVec &radians );
-    // Construct a 4x4 matrix to rotate around the x, y, and z axes
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZYX( const Vector3 &radiansXYZ );
-    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( float radians, const Vector3 &unitVec );
-    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( const floatInVec &radians, const Vector3 &unitVec );
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( const Quat &unitQuat );
-    // Construct a 4x4 matrix to perform scaling
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 scale( const Vector3 &scaleVec );
-    // Construct a 4x4 matrix to perform translation
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 translation( const Vector3 &translateVec );
-    // Construct viewing matrix based on eye, position looked at, and up direction
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec );
-    // Construct a perspective projection matrix
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
-    // Construct a perspective projection matrix based on frustum
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
-    // Construct an orthographic projection matrix
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
-// Multiply a 4x4 matrix by a scalar
-VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar, const Matrix4 & mat );
-// Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
-VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat );
-// Append (post-multiply) a scale transformation to a 4x4 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-VECTORMATH_FORCE_INLINE const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec );
-// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-VECTORMATH_FORCE_INLINE const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat );
-// Multiply two 4x4 matrices per element
-VECTORMATH_FORCE_INLINE const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
-// Compute the absolute value of a 4x4 matrix per element
-VECTORMATH_FORCE_INLINE const Matrix4 absPerElem( const Matrix4 & mat );
-// Transpose of a 4x4 matrix
-VECTORMATH_FORCE_INLINE const Matrix4 transpose( const Matrix4 & mat );
-// Compute the inverse of a 4x4 matrix
-// NOTE: 
-// Result is unpredictable when the determinant of mat is equal to or near 0.
-VECTORMATH_FORCE_INLINE const Matrix4 inverse( const Matrix4 & mat );
-// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
-VECTORMATH_FORCE_INLINE const Matrix4 affineInverse( const Matrix4 & mat );
-// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
-VECTORMATH_FORCE_INLINE const Matrix4 orthoInverse( const Matrix4 & mat );
-// Determinant of a 4x4 matrix
-VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix4 & mat );
-// Conditionally select between two 4x4 matrices
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
-// Conditionally select between two 4x4 matrices (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 );
-// Print a 4x4 matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat );
-// Print a 4x4 matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat, const char * name );
-// A 3x4 transformation matrix in array-of-structures format
-class Transform3
-    Vector3 mCol0;
-    Vector3 mCol1;
-    Vector3 mCol2;
-    Vector3 mCol3;
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Transform3( ) { };
-    // Copy a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3( const Transform3 & tfrm );
-    // Construct a 3x4 transformation matrix containing the specified columns
-    // 
-    VECTORMATH_FORCE_INLINE Transform3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2, const Vector3 &col3 );
-    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Transform3( const Matrix3 & tfrm, const Vector3 &translateVec );
-    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Transform3( const Quat &unitQuat, const Vector3 &translateVec );
-    // Set all elements of a 3x4 transformation matrix to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Transform3( float scalar );
-    // Set all elements of a 3x4 transformation matrix to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Transform3( const floatInVec &scalar );
-    // Assign one 3x4 transformation matrix to another
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & operator =( const Transform3 & tfrm );
-    // Set the upper-left 3x3 submatrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setUpper3x3( const Matrix3 & mat3 );
-    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 getUpper3x3( ) const;
-    // Set translation component
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setTranslation( const Vector3 &translateVec );
-    // Get the translation component of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getTranslation( ) const;
-    // Set column 0 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setCol0( const Vector3 &col0 );
-    // Set column 1 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setCol1( const Vector3 &col1 );
-    // Set column 2 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setCol2( const Vector3 &col2 );
-    // Set column 3 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setCol3( const Vector3 &col3 );
-    // Get column 0 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol0( ) const;
-    // Get column 1 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol1( ) const;
-    // Get column 2 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol2( ) const;
-    // Get column 3 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol3( ) const;
-    // Set the column of a 3x4 transformation matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setCol( int col, const Vector3 &vec );
-    // Set the row of a 3x4 transformation matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setRow( int row, const Vector4 &vec );
-    // Get the column of a 3x4 transformation matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol( int col ) const;
-    // Get the row of a 3x4 transformation matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getRow( int row ) const;
-    // Subscripting operator to set or get a column
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator []( int col );
-    // Subscripting operator to get a column
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator []( int col ) const;
-    // Set the element of a 3x4 transformation matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setElem( int col, int row, float val );
-    // Set the element of a 3x4 transformation matrix referred to by column and row indices (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setElem( int col, int row, const floatInVec &val );
-    // Get the element of a 3x4 transformation matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
-    // Multiply a 3x4 transformation matrix by a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator *( const Vector3 &vec ) const;
-    // Multiply a 3x4 transformation matrix by a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const Point3 operator *( const Point3 &pnt ) const;
-    // Multiply two 3x4 transformation matrices
-    // 
-    VECTORMATH_FORCE_INLINE const Transform3 operator *( const Transform3 & tfrm ) const;
-    // Perform compound assignment and multiplication by a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & operator *=( const Transform3 & tfrm );
-    // Construct an identity 3x4 transformation matrix
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 identity( );
-    // Construct a 3x4 transformation matrix to rotate around the x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationX( float radians );
-    // Construct a 3x4 transformation matrix to rotate around the y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationY( float radians );
-    // Construct a 3x4 transformation matrix to rotate around the z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationZ( float radians );
-    // Construct a 3x4 transformation matrix to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationX( const floatInVec &radians );
-    // Construct a 3x4 transformation matrix to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationY( const floatInVec &radians );
-    // Construct a 3x4 transformation matrix to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationZ( const floatInVec &radians );
-    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationZYX( const Vector3 &radiansXYZ );
-    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotation( float radians, const Vector3 &unitVec );
-    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotation( const floatInVec &radians, const Vector3 &unitVec );
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotation( const Quat &unitQuat );
-    // Construct a 3x4 transformation matrix to perform scaling
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 scale( const Vector3 &scaleVec );
-    // Construct a 3x4 transformation matrix to perform translation
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 translation( const Vector3 &translateVec );
-// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-VECTORMATH_FORCE_INLINE const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec );
-// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-VECTORMATH_FORCE_INLINE const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm );
-// Multiply two 3x4 transformation matrices per element
-VECTORMATH_FORCE_INLINE const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
-// Compute the absolute value of a 3x4 transformation matrix per element
-VECTORMATH_FORCE_INLINE const Transform3 absPerElem( const Transform3 & tfrm );
-// Inverse of a 3x4 transformation matrix
-// NOTE: 
-// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0.
-VECTORMATH_FORCE_INLINE const Transform3 inverse( const Transform3 & tfrm );
-// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
-VECTORMATH_FORCE_INLINE const Transform3 orthoInverse( const Transform3 & tfrm );
-// Conditionally select between two 3x4 transformation matrices
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
-// Conditionally select between two 3x4 transformation matrices (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 );
-// Print a 3x4 transformation matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm );
-// Print a 3x4 transformation matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm, const char * name );
-} // namespace Aos
-} // namespace Vectormath
-#include "vec_aos.h"
-#include "quat_aos.h"
-#include "mat_aos.h"
diff --git a/src/bullet/vectormath/vmInclude.h b/src/bullet/vectormath/vmInclude.h
deleted file mode 100644
index a43152ce..00000000
--- a/src/bullet/vectormath/vmInclude.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef __VM_INCLUDE_H
-#define __VM_INCLUDE_H
-#include "LinearMath/btScalar.h"
-#if defined (USE_SYSTEM_VECTORMATH) || defined (__CELLOS_LV2__)
-	#include <vectormath_aos.h>
-	#if defined (BT_USE_SSE) && defined (_WIN32)
-		#include "sse/vectormath_aos.h"
-	#else //all other platforms
-		#include "scalar/vectormath_aos.h"
-	#endif //(BT_USE_SSE) && defined (_WIN32)
-typedef Vectormath::Aos::Vector3    vmVector3;
-typedef Vectormath::Aos::Quat       vmQuat;
-typedef Vectormath::Aos::Matrix3    vmMatrix3;
-typedef Vectormath::Aos::Transform3 vmTransform3;
-typedef Vectormath::Aos::Point3     vmPoint3;
-#endif //__VM_INCLUDE_H