https://www.youtube.com/playlist?list=PLwr8DnSlIMg0KABru36pg4CvbfkhBofAi
Irgendwie stieĂ ich auf HabrĂ© auf einen ziemlich merkwĂŒrdigen Artikel "Wissenschaftliche und technische Mythen, Teil 1. Warum fliegen Flugzeuge?" ... Der Artikel beschreibt ausfĂŒhrlich, welche Probleme auftreten, wenn versucht wird, den FlĂŒgelhub anhand des Bernoulli-Gesetzes oder des Newtonschen Auftriebsmodells zu erklĂ€ren . Und obwohl der Artikel andere ErklĂ€rungen bietet, möchte ich noch nĂ€her auf Newtons Modell eingehen. Ja, Newtons Modell ist nicht vollstĂ€ndig und hat Annahmen, aber es gibt eine genauere und intuitivere Beschreibung von PhĂ€nomenen als das Bernoulli-Gesetz.
â . - , , .
, . ? GitHub.
Computational Fluid Dynamics
, (Computational fluid dynamics, CFD), -.
, , 7- . 38 (ââ), 40 (â ââ â) 41 (â ââ â). , - â . (, ) . , , .
â . , SpaceX . , , , . . , . , (Smoothed-particle hydrodynamics, SPH). : , , , . . , . , , (screen-space meshes, dual contouring, marching tetrahedra, metaballs). , .
Discrete Element Method
, .
(Discrete Element Method, DEM) , . , , , .
, ( X Y), . , . . â NASA . , STS-1 . Mission Anomalies.
DEM â (Discrete Collision Detection). .
Continuous Collision Detection (CCD), , . , , . . CCD , « ». , Unity Unreal .
Continuous Collision Detection
CCD . â . . â , , , . , â â
, DEM.
, , , , . , . , . .
National Geographic. , . , , .
( National Geographic)
. â . â . . .
https://youtu.be/cx8XbaQNnxw?t=2206
, . , . (heatmap) ( ) ( ) . .
.
, . :
- .
- CPU CUDA.
. . CUDA- , , 16 .
. , , , , , .., . . .
, . (Ordinary Differential Equation, ODE). , â , â , . , .
, f, â .
'Differential Equation Basics' and 'Particle Dynamics' https://www.cs.cmu.edu/~baraff/sigcourse/
3Blue1Brown :
https://www.youtube.com/playlist?list=PLZHQObOWTQDNPOjrT6KVlfJuKtYTftqH6
, â (Forward Euler). - 4- (RK4), , . RK4 , , . , , - . , , 'Differential Equation Basics' lecture notes, section 3, 'Adaptive Stepsizes'.
, , . , . .
float CSimulationCpu::ComputeMinDeltaTime(float requestedDt) const
{
auto rad = m_state.particleRad;
auto velBegin = m_curOdeState.cbegin() + m_state.particles;
auto velEnd = m_curOdeState.cend();
return std::transform_reduce(std::execution::par_unseq, velBegin, velEnd, requestedDt, [](const auto t1, const auto t2)
{
return std::min(t1, t2);
}, [&](const auto& v)
{
auto vel = glm::length(v);
auto radDt = rad / vel;
return radDt;
});
}
float CSimulationCpu::Update(float dt)
{
dt = ComputeMinDeltaTime(dt);
m_odeSolver->NextState(m_curOdeState, dt, m_nextOdeState);
ColorParticles(dt);
m_nextOdeState.swap(m_curOdeState);
return dt;
}â f, â â. CPU CUDA . , CPU , . CUDA , . . â â.
//CPU-
void CDerivativeSolver::Derive(const OdeState_t& curState, OdeState_t& outDerivative)
{
ResetForces();
BuildParticlesTree(curState);
ResolveParticleParticleCollisions(curState);
ResolveParticleWingCollisions(curState);
ParticleToWall(curState);
ApplyGravity();
BuildDerivative(curState, outDerivative);
}
//CUDA-
void CDerivativeSolver::Derive(const OdeState_t& curState, OdeState_t& outDerivative)
{
BuildParticlesTree(curState);
ReorderParticles(curState);
ResetParticlesState();
ResolveParticleParticleCollisions();
ResolveParticleWingCollisions();
ParticleToWall();
ApplyGravity();
BuildDerivative(curState, outDerivative);
}, , , â . 2â097â152 . - . , . â .
â Uniform Grid, . GPU âChapter 32. Broad-Phase Collision Detection with CUDAâ.
( Tero Karras, NVIDIA Corporation)
, . 9 (3x3) 27 (3x3x3) 2D 3D . â . , , RCU lock-free . Nvidia , malloc()/free() device , .
CppCon 2017: Fedor Pikus âRead, Copy, Update, then what? RCU for non-kernel programmersâ
, :
- .
- RAM/VRAM, -, .
- , .
- .
- GPU, lock-free (https://youtu.be/86seb-iZCnI?t=2311, ).
, â BVH- Z-. , .
â Z-, .
Z- ( Wikipedia)
â
( Wikipedia)
, space-filling curve, , . 2D/3D , , , , , . . , , , , , .
, . , . , Tero Karras Nvidia, .
âMaximizing Parallelism in the Construction of BVHs, Octrees, and k-d Treesâ.
:
- : https://developer.nvidia.com/blog/thinking-parallel-part-ii-tree-traversal-gpu/
- : https://developer.nvidia.com/blog/thinking-parallel-part-iii-tree-construction-gpu/
, . N , bounding box , , ( Z-). .
( Tero Karras, NVIDIA Corporation)
. , , , N N-1 . , . . , . , . .
( Tero Karras, NVIDIA Corporation)
( Tero Karras, NVIDIA Corporation)
, , BVH-.
BVH- ( Tero Karras, NVIDIA Corporation)
N , , . , , , . atomicExch() 1. , . 0, . , , . . 1, , , .
.
â-â â- â.
â-â , , ââ , . Tero Karras. , A-B B-A , . . (rightmost leaf), . , rightmost(N2) = 4, rightmost(N3) = 8. , , , O6, , N2. rightmost, , O6 N2. O6 N2. , , N2, N3. , O6 , , N2.
:
template<typename TDeviceCollisionResponseSolver, size_t kTreeStackSize>
void CMortonTree::TraverseReflexive(const TDeviceCollisionResponseSolver& solver);
â- â :
template<typename TDeviceCollisionResponseSolver, size_t kTreeStackSize>
void CMortonTree::Traverse(const thrust::device_vector<SBoundingBox>& objects, const TDeviceCollisionResponseSolver& solver);
TDeviceCollisionResponseSolver â , :
struct Solver
{
struct SDeviceSideSolver
{
...
__device__ SDeviceSideSolver(...);
__device__ void OnPreTraversal(TIndex curId);
__device__ void OnCollisionDetected(TIndex leafId);
__device__ void OnPostTraversal();
};
Solver(...);
__device__ SDeviceSideSolver Create();
};
, , . Create(). OnPreTraversal, . - , OnCollisionDetected . . OnPostTraversal.
. -. , Tero Karras. . NxO, N â , O â . , . . , (coalesced memory access). , . , .
struct SParticleParticleCollisionSolver
{
struct SDeviceSideSolver
{
CDerivativeSolver::SIntermediateSimState& simState;
TIndex curIdx;
float2 pos1;
float2 vel1;
float2 totalForce;
float totalPressure;
__device__ SDeviceSideSolver(CDerivativeSolver::SIntermediateSimState& state) : simState(state)
{
}
__device__ void OnPreTraversal(TIndex curLeafIdx)
{
curIdx = curLeafIdx;
pos1 = simState.pos[curLeafIdx];
vel1 = simState.vel[curLeafIdx];
totalForce = make_float2(0.0f);
totalPressure = 0.0f;
}
__device__ void OnCollisionDetected(TIndex anotherLeafIdx)
{
const auto pos2 = simState.pos[anotherLeafIdx];
const auto deltaPos = pos2 - pos1;
const auto distanceSq = dot(deltaPos, deltaPos);
if (distanceSq > simState.diameterSq || distanceSq < 1e-8f)
return;
const auto vel2 = simState.vel[anotherLeafIdx];
auto dist = sqrtf(distanceSq);
auto dir = deltaPos / dist;
auto springLen = simState.diameter - dist;
auto force = SpringDamper(dir, vel1, vel2, springLen);
auto pressure = length(force);
totalForce += force;
totalPressure += pressure;
atomicAdd(&simState.force[anotherLeafIdx].x, -force.x);
atomicAdd(&simState.force[anotherLeafIdx].y, -force.y);
atomicAdd(&simState.pressure[anotherLeafIdx], pressure);
}
__device__ void OnPostTraversal()
{
atomicAdd(&simState.force[curIdx].x, totalForce.x);
atomicAdd(&simState.force[curIdx].y, totalForce.y);
atomicAdd(&simState.pressure[curIdx], totalPressure);
}
};
CDerivativeSolver::SIntermediateSimState simState;
SParticleParticleCollisionSolver(const CDerivativeSolver::SIntermediateSimState& state) : simState(state)
{
}
__device__ SDeviceSideSolver Create()
{
return SDeviceSideSolver(simState);
}
};
void CDerivativeSolver::ResolveParticleParticleCollisions()
{
m_particlesTree.TraverseReflexive<SParticleParticleCollisionSolver, 24>(SParticleParticleCollisionSolver(m_particles.GetSimState()));
CudaCheckError();
}, , OnCollistionDetected . : - A, B, C D, Z , :
| lock-step # | Thread #1 | Thread #2 | Thread #3 |
|---|---|---|---|
| 1 | OnCollisionDetected
A <-> C |
OnCollisionDetected
B <-> C |
OnCollisionDetected
C <-> D |
| 2 | OnCollisionDetected
A <-> D |
OnCollisionDetected
B <-> D |
INACTIVE |
| 3 | OnPostTraversal(1) | OnPostTraversal(2) | OnPostTraversal(3) |
, 1 2 #1 #2 atomicAdd C D OnCollistionDetected. atomic .
Volta, Nvidia warp-vote . match_any warp, , .
match_any match_all
, warp shuffle .
Warp-wide
, . SM . , Pascal (1080 Ti) , . , , . , atomic , Arithmetic Workload . Volta Turing . , RTX 2060 , atomic . â â.
, , .
SoA
Structure of Arrays â , .
. , , SoA:
typedef uint32_t TIndex;
struct STreeNodeSoA
{
const TIndex leafs;
int* __restrict__ atomicVisits;
TIndex* __restrict__ parents;
TIndex* __restrict__ lefts;
TIndex* __restrict__ rights;
TIndex* __restrict__ rightmosts;
SBoundingBox* __restrict__ boxes;
const uint32_t* __restrict__ sortedMortonCodes;
};
:
struct SIntermediateSimState
{
const TIndex particles;
const float particleRad;
const float diameter;
const float diameterSq;
float2* __restrict__ pos;
float2* __restrict__ vel;
float2* __restrict__ force;
float* __restrict__ pressure;
};
, bounding boxâ SoA , . Array of Structures (AoS):
struct SBoundingBox
{
float2 min;
float2 max;
};
struct SBoundingBoxesAoS
{
const size_t count;
SBoundingBox* __restrict__ boxes;
};
, , . , . . , uncoalesced memory access.
GPU. coalesced memory access, . , . SpaceX: https://youtu.be/vYA0f6R5KAI?t=1939 ( ).
( SpaceX)
50% : 8 FPS 12 FPS .
Shared Memory
, . , â . SM , uncoalesced. , Shared Memory Streaming Multiprocessorâ.
__device__ void traverseIterative( CollisionList& list,
BVH& bvh,
AABB& queryAABB,
int queryObjectIdx)
{
// Allocate traversal stack from thread-local memory,
// and push NULL to indicate that there are no postponed nodes.
NodePtr stack[64]; //<----------------------------
NodePtr* stackPtr = stack;
*stackPtr++ = NULL; // push
// Traverse nodes starting from the root.
NodePtr node = bvh.getRoot();
do
{
// Check each child node for overlap.
NodePtr childL = bvh.getLeftChild(node);
NodePtr childR = bvh.getRightChild(node);
bool overlapL = ( checkOverlap(queryAABB,
bvh.getAABB(childL)) );
bool overlapR = ( checkOverlap(queryAABB,
bvh.getAABB(childR)) );
// Query overlaps a leaf node => report collision.
if (overlapL && bvh.isLeaf(childL))
list.add(queryObjectIdx, bvh.getObjectIdx(childL));
if (overlapR && bvh.isLeaf(childR))
list.add(queryObjectIdx, bvh.getObjectIdx(childR));
// Query overlaps an internal node => traverse.
bool traverseL = (overlapL && !bvh.isLeaf(childL));
bool traverseR = (overlapR && !bvh.isLeaf(childR));
if (!traverseL && !traverseR)
node = *--stackPtr; // pop
else
{
node = (traverseL) ? childL : childR;
if (traverseL && traverseR)
*stackPtr++ = childR; // push
}
}
while (node != NULL);
}template<typename TDeviceCollisionResponseSolver, size_t kTreeStackSize, size_t kWarpSize = 32>
__global__ void TraverseMortonTreeKernel(const CMortonTree::STreeNodeSoA treeInfo, const SBoundingBoxesAoS externalObjects, TDeviceCollisionResponseSolver solver)
{
const auto threadId = blockIdx.x * blockDim.x + threadIdx.x;
if (threadId >= externalObjects.count)
return;
const auto objectBox = externalObjects.boxes[threadId];
const auto internalCount = treeInfo.leafs - 1;
__shared__ TIndex stack[kTreeStackSize][kWarpSize]; //
unsigned top = 0;
stack[top][threadIdx.x] = 0;
auto deviceSideSolver = solver.Create();
deviceSideSolver.OnPreTraversal(threadId);
while (top < kTreeStackSize) //top == -1 also covered
{
auto cur = stack[top--][threadIdx.x];
if (!treeInfo.boxes[cur].Overlaps(objectBox))
continue;
if (cur < internalCount)
{
stack[++top][threadIdx.x] = treeInfo.lefts[cur];
if (top + 1 < kTreeStackSize)
stack[++top][threadIdx.x] = treeInfo.rights[cur];
else
printf("stack size exceeded\n");
continue;
}
deviceSideSolver.OnCollisionDetected(cur - internalCount);
}
deviceSideSolver.OnPostTraversal();
}Shared Memory 43%: 14 FPS 20 FPS. :
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses
, â 1080 Ti Pascal. , , . 20- . .
20- RTX . .
, , . . :
.
atomic- lock-free . , (Out-of-order execution), , . lock-free . , , , . -, , , C++ . std::memory_order.
__device__ void CMortonTree::STreeNodeSoA::BottomToTopInitialization(size_t leafId)
{
auto cur = leafs - 1 + leafId;
auto curBox = boxes[cur];
while (cur != 0)
{
auto parent = parents[cur];
//1. atomic
auto visited = atomicExch(&atomicVisits[parent], 1);
if (!visited)
return;
TIndex siblingIndex;
SBoundingBox siblingBox;
TIndex rightmostIndex;
TIndex rightmostChild;
auto leftParentChild = lefts[parent];
if (leftParentChild == cur)
{
auto rightParentChild = rights[parent];
siblingIndex = rightParentChild;
rightmostIndex = rightParentChild;
}
else
{
siblingIndex = leftParentChild;
rightmostIndex = cur;
}
siblingBox = boxes[siblingIndex];
rightmostChild = rightmosts[rightmostIndex];
SBoundingBox parentBox = SBoundingBox::ExpandBox(curBox, siblingBox);
boxes[parent] = parentBox;
rightmosts[parent] = rightmostChild;
cur = parent;
curBox = parentBox;
//2. .
//
__threadfence();
}
}
, , BVH , . , . , SIMT ( Volta SIMT Model Starvation-Free Algorithms) Nvidia Volta Out-of-order execution. , , atomicExch(), .. , Turing . , , , , . data race.
thrust::device_vector
, thurst::device_vector . resize() GPU cudaDeviceSynchronize(). , . , . , , . â . â (reduce, sum, min, max) , . Cuda UnBound (CUB) . , thrust , .
, , .
, GPU -, . , . â â, APOD .
, , , , GPU.
Wenn Sie CUDA lernen möchten, aber nicht wissen, wo Sie anfangen sollen, gibt es auf Youtube einen groĂartigen Kurs von Udacity âEinfĂŒhrung in die parallele Programmierungâ. Ich empfehle zu ĂŒberprĂŒfen.
https://www.youtube.com/playlist?list=PLAwxTw4SYaPnFKojVQrmyOGFCqHTxfdv2
Zum Schluss noch ein paar Videosimulationen:
CPU-Version, 8 Threads, 131'072 Partikel
CUDA-Version, 4,2 Millionen Partikel, 30 Minuten Simulation



