国科大-高性能计算编程-作业二
作业题目
提交内容一:平台信息
- 本次作业在Windows平台上进行了内容实现,具体平台信息如下
Windows平台
系统:Win11-23H2
CPU:i9-13900HX(x86)
- 内存:64G
- IDE:CLion
- 编译器:MinGW
提交内容二:原代码、SoA及各优化方式的运行结果
原代码的运行结果
SoA方式的运行结果
SoA方式下使用内存对齐进行优化的运行结果
SoA方式下使用循环展开进行优化的运行结果
数据统计
muladd,timing | sum | timing | |
---|---|---|---|
AoS | 731870us | 4582696570 | 59257us |
SoA | 261313us | 4582696570 | 17589us |
SoA_Align | 307924us | 4582696570 | 17527us |
SoA_Unrolling | 263744us | 4582696570 | 17353us |
提交内容三:分析各个版本的代码
- 在windows中,无法使用perf进行代码的性能分析,因此我采用very sleepy对各个版本的代码进行分析
- 为了使效果更加明显,我将各个版本的代码以函数的形式写在同一个cpp文件中,具体运行效果如下
可以看到使用SoA分配方式以及在SoA的基础上再次进行内存对齐和循环展开优化后的代码在运行效率上高出源代码50%左右。这应该是由于SoA分配方式减少了大量的寻址工作所造成的。
但是在SoA的基础上再次进行内存对齐和循环展开优化后的代码相比于SoA并没有太大的效率提升。这应该是由以下两个原因组成:
SoA分配方式所申请的内存是连续的内存块,已经保证了内存空间的连续分布
-O2编译时已经由编译器帮用户进行了适当的循环展开
使用-O0命令再次编译后测试结果如下
可以看到SoA已经保证了内存空间的连续分布,再次手动对齐反而可能造成时间的浪费
如果不让编译器进行优化,而是手动对循环进行展开的话,是可以提升运行效率的。
本次实验中手动的循环展开只展开了四次,所以效果不是很明显,具体代码见附录
自评分及理由
自评分
- 10分(6+2+2)
理由
- 提交作业要求的三个内容
- 编写SoA及SoA基础上的内存对齐和循环展开代码,并收集这四种情况下的运行时间(6+2=8分)
- 由于windows无法使用perf,使用very sleepy分析各个版本的代码运行时间,以及分析了产生运行时间差异的原因(2分)
附录
#include <cstdio>
#include <cstdlib>
#include <sys/time.h>
#include <cstdint>
inline int64_t GetUsec()
{
struct timeval tv;
gettimeofday(&tv, NULL);
return (tv.tv_sec * 1000000l) + tv.tv_usec;
}
typedef struct
{
double x;
double y;
double z;
} XYZ;
typedef struct
{
double mass;
XYZ acct;
XYZ noused; // but here
XYZ Velocity;
} Element;
const int ELEMENT_NUM = (4096 * 4096);
int homework2()
{
Element *elements = new Element[ELEMENT_NUM];
srand(202403);
for (size_t ii = 0; ii < ELEMENT_NUM; ++ii)
{
elements[ii].mass = (double)(rand() % 10);
elements[ii].acct.x = (double)(rand() % 10) - 5.0;
elements[ii].acct.y = (double)(rand() % 10) - 5.0;
elements[ii].acct.z = (double)(rand() % 10) - 5.0;
elements[ii].Velocity.x = (double)(rand() % 10);
elements[ii].Velocity.y = (double)(rand() % 10);
elements[ii].Velocity.z = (double)(rand() % 10);
}
size_t start = GetUsec();
double dt = 1.0;
for (size_t step = 0; step < 10; ++step)
{
for (size_t ii = 0; ii < ELEMENT_NUM; ++ii)
{
elements[ii].Velocity.x += dt * elements[ii].acct.x;
elements[ii].Velocity.y += dt * elements[ii].acct.y;
elements[ii].Velocity.z += dt * elements[ii].acct.z;
}
dt = dt * (((rand() % 10) / 10.0) * 2.0);
}
size_t finish = GetUsec();
printf("muladd,timing=%ldus\n", finish - start);
start = GetUsec();
double sum = 0.0;
for (size_t ii = 0; ii < ELEMENT_NUM; ++ii)
{
sum += 0.5 * elements[ii].mass * (elements[ii].Velocity.x * elements[ii].Velocity.x + elements[ii].Velocity.y * elements[ii].Velocity.y + elements[ii].Velocity.z * elements[ii].Velocity.z);
}
finish = GetUsec();
printf("sum = %.8e,timing=%ldus\n", sum, finish - start);
delete[] elements;
return 0;
}
typedef struct
{
double *x;
double *y;
double *z;
} XYZ_SoA;
typedef struct
{
double *mass;
XYZ_SoA acct;
XYZ_SoA noused; // but here
XYZ_SoA Velocity;
} Element_SoA;
int homework2_SoA()
{
Element_SoA elements;
elements.mass = new double[ELEMENT_NUM];
elements.acct.x = new double[ELEMENT_NUM];
elements.acct.y = new double[ELEMENT_NUM];
elements.acct.z = new double[ELEMENT_NUM];
elements.noused.x = new double[ELEMENT_NUM];
elements.noused.y = new double[ELEMENT_NUM];
elements.noused.z = new double[ELEMENT_NUM];
elements.Velocity.x = new double[ELEMENT_NUM];
elements.Velocity.y = new double[ELEMENT_NUM];
elements.Velocity.z = new double[ELEMENT_NUM];
srand(202403);
for (size_t ii = 0; ii < ELEMENT_NUM; ++ii)
{
elements.mass[ii] = (double)(rand() % 10);
elements.acct.x[ii] = (double)(rand() % 10) - 5.0;
elements.acct.y[ii] = (double)(rand() % 10) - 5.0;
elements.acct.z[ii] = (double)(rand() % 10) - 5.0;
elements.Velocity.x[ii] = (double)(rand() % 10);
elements.Velocity.y[ii] = (double)(rand() % 10);
elements.Velocity.z[ii] = (double)(rand() % 10);
}
size_t start = GetUsec();
double dt = 1.0;
for (size_t step = 0; step < 10; ++step)
{
for (size_t ii = 0; ii < ELEMENT_NUM; ++ii)
{
elements.Velocity.x[ii] += dt * elements.acct.x[ii];
elements.Velocity.y[ii] += dt * elements.acct.y[ii];
elements.Velocity.z[ii] += dt * elements.acct.z[ii];
}
dt = dt * (((rand() % 10) / 10.0) * 2.0);
// SoA code below is slow on AMD EPYC 7713 64-Core Processor.
// Splitting into three loops is better way.
/*for(size_t ii = 0; ii < ELEMENT_NUM; ++ii){
ele_soa.vel_x[ii] += dt * ele_soa.acct_x[ii];
ele_soa.vel_y[ii] += dt * ele_soa.acct_y[ii];
ele_soa.vel_z[ii] += dt * ele_soa.acct_z[ii];
}
*/
}
size_t finish = GetUsec();
printf("muladd,timing=%ldus\n", finish - start);
start = GetUsec();
double sum = 0.0;
for (size_t ii = 0; ii < ELEMENT_NUM; ++ii)
{
sum += 0.5 * elements.mass[ii] * (elements.Velocity.x[ii] * elements.Velocity.x[ii] + elements.Velocity.y[ii] * elements.Velocity.y[ii] + elements.Velocity.z[ii] * elements.Velocity.z[ii]);
}
finish = GetUsec();
printf("sum = %.8e,timing=%ldus\n", sum, finish - start);
return 0;
}
typedef struct
{
double *x;
double *y;
double *z;
} XYZ_SoA_Align;
typedef struct
{
double *mass;
XYZ_SoA_Align acct;
XYZ_SoA_Align noused; // but here
XYZ_SoA_Align Velocity;
} Element_SoA_Align;
int homework2_SoA_Align()
{
size_t alignment = 64;
Element_SoA_Align elements;
elements.mass = (double*)_aligned_malloc(ELEMENT_NUM * sizeof(double), alignment);
elements.acct.x = (double*)_aligned_malloc(ELEMENT_NUM * sizeof(double), alignment);
elements.acct.y = (double*)_aligned_malloc(ELEMENT_NUM * sizeof(double), alignment);
elements.acct.z = (double*)_aligned_malloc(ELEMENT_NUM * sizeof(double), alignment);
elements.noused.x = (double*)_aligned_malloc(ELEMENT_NUM * sizeof(double), alignment);
elements.noused.y = (double*)_aligned_malloc(ELEMENT_NUM * sizeof(double), alignment);
elements.noused.z = (double*)_aligned_malloc(ELEMENT_NUM * sizeof(double), alignment);
elements.Velocity.x = (double*)_aligned_malloc(ELEMENT_NUM * sizeof(double), alignment);
elements.Velocity.y = (double*)_aligned_malloc(ELEMENT_NUM * sizeof(double), alignment);
elements.Velocity.z = (double*)_aligned_malloc(ELEMENT_NUM * sizeof(double), alignment);
srand(202403);
for (size_t ii = 0; ii < ELEMENT_NUM; ++ii)
{
elements.mass[ii] = (double)(rand() % 10);
elements.acct.x[ii] = (double)(rand() % 10) - 5.0;
elements.acct.y[ii] = (double)(rand() % 10) - 5.0;
elements.acct.z[ii] = (double)(rand() % 10) - 5.0;
elements.Velocity.x[ii] = (double)(rand() % 10);
elements.Velocity.y[ii] = (double)(rand() % 10);
elements.Velocity.z[ii] = (double)(rand() % 10);
}
size_t start = GetUsec();
double dt = 1.0;
for (size_t step = 0; step < 10; ++step)
{
for (size_t ii = 0; ii < ELEMENT_NUM; ++ii)
{
elements.Velocity.x[ii] += dt * elements.acct.x[ii];
elements.Velocity.y[ii] += dt * elements.acct.y[ii];
elements.Velocity.z[ii] += dt * elements.acct.z[ii];
}
dt = dt * (((rand() % 10) / 10.0) * 2.0);
}
size_t finish = GetUsec();
printf("muladd,timing=%ldus\n", finish - start);
start = GetUsec();
double sum = 0.0;
for (size_t ii = 0; ii < ELEMENT_NUM; ++ii)
{
sum += 0.5 * elements.mass[ii] * (elements.Velocity.x[ii] * elements.Velocity.x[ii] + elements.Velocity.y[ii] * elements.Velocity.y[ii] + elements.Velocity.z[ii] * elements.Velocity.z[ii]);
}
finish = GetUsec();
printf("sum = %.8e,timing=%ldus\n", sum, finish - start);
_aligned_free(elements.mass);
_aligned_free(elements.acct.x);
_aligned_free(elements.acct.y);
_aligned_free(elements.acct.z);
_aligned_free(elements.noused.x);
_aligned_free(elements.noused.y);
_aligned_free(elements.noused.z);
_aligned_free(elements.Velocity.x);
_aligned_free(elements.Velocity.y);
_aligned_free(elements.Velocity.z);
return 0;
}
typedef struct
{
double *x;
double *y;
double *z;
} XYZ_SoA_Unrolling;
typedef struct
{
double *mass;
XYZ_SoA_Unrolling acct;
XYZ_SoA_Unrolling noused; // but here
XYZ_SoA_Unrolling Velocity;
} Element_SoA_Unrolling;
int homework2_SoA_Unrolling() {
Element_SoA_Unrolling elements;
elements.mass = new double[ELEMENT_NUM];
elements.acct.x = new double[ELEMENT_NUM];
elements.acct.y = new double[ELEMENT_NUM];
elements.acct.z = new double[ELEMENT_NUM];
elements.noused.x = new double[ELEMENT_NUM];
elements.noused.y = new double[ELEMENT_NUM];
elements.noused.z = new double[ELEMENT_NUM];
elements.Velocity.x = new double[ELEMENT_NUM];
elements.Velocity.y = new double[ELEMENT_NUM];
elements.Velocity.z = new double[ELEMENT_NUM];
srand(202403);
for (size_t ii = 0; ii < ELEMENT_NUM; ++ii) {
elements.mass[ii] = (double) (rand() % 10);
elements.acct.x[ii] = (double) (rand() % 10) - 5.0;
elements.acct.y[ii] = (double) (rand() % 10) - 5.0;
elements.acct.z[ii] = (double) (rand() % 10) - 5.0;
elements.Velocity.x[ii] = (double) (rand() % 10);
elements.Velocity.y[ii] = (double) (rand() % 10);
elements.Velocity.z[ii] = (double) (rand() % 10);
}
size_t start = GetUsec();
double dt = 1.0;
// Loop unrolling example
for (size_t step = 0; step < 10; ++step) {
for (size_t ii = 0; ii < ELEMENT_NUM; ii += 4) // Increase by 4 due to unrolling
{
elements.Velocity.x[ii] += dt * elements.acct.x[ii];
elements.Velocity.y[ii] += dt * elements.acct.y[ii];
elements.Velocity.z[ii] += dt * elements.acct.z[ii];
// Unrolled operations
if (ii + 1 < ELEMENT_NUM) {
elements.Velocity.x[ii + 1] += dt * elements.acct.x[ii + 1];
elements.Velocity.y[ii + 1] += dt * elements.acct.y[ii + 1];
elements.Velocity.z[ii + 1] += dt * elements.acct.z[ii + 1];
}
if (ii + 2 < ELEMENT_NUM) {
elements.Velocity.x[ii + 2] += dt * elements.acct.x[ii + 2];
elements.Velocity.y[ii + 2] += dt * elements.acct.y[ii + 2];
elements.Velocity.z[ii + 2] += dt * elements.acct.z[ii + 2];
}
if (ii + 3 < ELEMENT_NUM) {
elements.Velocity.x[ii + 3] += dt * elements.acct.x[ii + 3];
elements.Velocity.y[ii + 3] += dt * elements.acct.y[ii + 3];
elements.Velocity.z[ii + 3] += dt * elements.acct.z[ii + 3];
}
}
dt = dt * (((rand() % 10) / 10.0) * 2.0);
}
size_t finish = GetUsec();
printf("muladd,timing=%ldus\n", finish - start);
start = GetUsec();
double sum = 0.0;
for (size_t ii = 0; ii < ELEMENT_NUM; ii += 10) // Loop unrolling for sum calculation
{
sum += 0.5 * elements.mass[ii] * (elements.Velocity.x[ii] * elements.Velocity.x[ii]
+ elements.Velocity.y[ii] * elements.Velocity.y[ii]
+ elements.Velocity.z[ii] * elements.Velocity.z[ii]);
sum += 0.5 * elements.mass[ii + 1] * (elements.Velocity.x[ii + 1] * elements.Velocity.x[ii + 1]
+ elements.Velocity.y[ii + 1] * elements.Velocity.y[ii + 1]
+ elements.Velocity.z[ii + 1] * elements.Velocity.z[ii + 1]);
sum += 0.5 * elements.mass[ii + 2] * (elements.Velocity.x[ii + 2] * elements.Velocity.x[ii + 2]
+ elements.Velocity.y[ii + 2] * elements.Velocity.y[ii + 2]
+ elements.Velocity.z[ii + 2] * elements.Velocity.z[ii + 2]);
sum += 0.5 * elements.mass[ii + 3] * (elements.Velocity.x[ii + 3] * elements.Velocity.x[ii + 3]
+ elements.Velocity.y[ii + 3] * elements.Velocity.y[ii + 3]
+ elements.Velocity.z[ii + 3] * elements.Velocity.z[ii + 3]);
}
finish = GetUsec();
printf("sum = %.8e,timing=%ldus\n", sum, finish - start);
return 0;
}
int main(){
homework2();
homework2_SoA();
homework2_SoA_Align();
homework2_SoA_Unrolling();
return 0;
}
本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 Yuichi's Blog!