Skip to content

Commit

Permalink
AoS + SoA
Browse files Browse the repository at this point in the history
  • Loading branch information
DFPMTS committed May 20, 2023
1 parent 877f5ad commit b5d424f
Show file tree
Hide file tree
Showing 9 changed files with 302 additions and 152 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
CC = gcc
CFLAGS = -std=c99 -Wall -Ofast -mavx
LIBS = -lm -fopenmp
SRC = main.c d2q9_bgk.c calc.c utils.c aa.c
SRC = main.c d2q9_bgk.c calc.c utils.c aa.c aa_AoS.c
EXE=lbm

PARAMS_DIR=./data/params
Expand Down
4 changes: 2 additions & 2 deletions aa.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ int aa_odd_timestep(const t_param params, t_speed *cells, t_speed *tmp_cells,
return 0;
}

float buffer[9];
float prev[9];
static float buffer[9];
static float prev[9];

int aa_odd(const t_param params, t_speed *cells, t_speed *tmp_cells,
int *obstacles) {
Expand Down
77 changes: 41 additions & 36 deletions aa_AoS.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,20 +95,21 @@ int AoS_aa_odd(const t_param params, t_speed_aos *cells, t_speed *tmp_cells,
params.omega * (d_equ - cells[ii + jj * params.nx].speeds[0]);

res = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(res, c_s), omega), c_s);
_mm256_storeu_ps(buffer + 1, res);

cells[ii + jj * params.nx].speeds[1] = buffer[3];
cells[ii + jj * params.nx].speeds[3] = buffer[1];
cells[ii + jj * params.nx].speeds[2] = buffer[4];
cells[ii + jj * params.nx].speeds[4] = buffer[2];
cells[ii + jj * params.nx].speeds[5] = buffer[7];
cells[ii + jj * params.nx].speeds[7] = buffer[5];
cells[ii + jj * params.nx].speeds[6] = buffer[8];
cells[ii + jj * params.nx].speeds[8] = buffer[6];
_mm256_storeu_ps(cells[ii + jj * params.nx].speeds + 1,
_mm256_permute_ps(res, 0b01001110));

// cells[ii + jj * params.nx].speeds[1] = buffer[3];
// cells[ii + jj * params.nx].speeds[3] = buffer[1];
// cells[ii + jj * params.nx].speeds[2] = buffer[4];
// cells[ii + jj * params.nx].speeds[4] = buffer[2];
// cells[ii + jj * params.nx].speeds[5] = buffer[7];
// cells[ii + jj * params.nx].speeds[7] = buffer[5];
// cells[ii + jj * params.nx].speeds[6] = buffer[8];
// cells[ii + jj * params.nx].speeds[8] = buffer[6];
}
down[ii][0] = cells[ii + jj * params.nx].speeds[4];
down[ii][1] = cells[ii + jj * params.nx].speeds[7];
down[ii][2] = cells[ii + jj * params.nx].speeds[8];
down[ii][0] = cells[ii + jj * params.nx].speeds[2];
down[ii][1] = cells[ii + jj * params.nx].speeds[5];
down[ii][2] = cells[ii + jj * params.nx].speeds[6];
}

// ! jj = param.ny - 1 -----------------------------------
Expand Down Expand Up @@ -164,20 +165,22 @@ int AoS_aa_odd(const t_param params, t_speed_aos *cells, t_speed *tmp_cells,
params.omega * (d_equ - cells[ii + jj * params.nx].speeds[0]);

res = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(res, c_s), omega), c_s);
_mm256_storeu_ps(buffer + 1, res);

cells[ii + jj * params.nx].speeds[1] = buffer[3];
cells[ii + jj * params.nx].speeds[3] = buffer[1];
cells[ii + jj * params.nx].speeds[2] = buffer[4];
cells[ii + jj * params.nx].speeds[4] = buffer[2];
cells[ii + jj * params.nx].speeds[5] = buffer[7];
cells[ii + jj * params.nx].speeds[7] = buffer[5];
cells[ii + jj * params.nx].speeds[6] = buffer[8];
cells[ii + jj * params.nx].speeds[8] = buffer[6];
// _mm256_storeu_ps(buffer + 1, res);
_mm256_storeu_ps(cells[ii + jj * params.nx].speeds + 1,
_mm256_permute_ps(res, 0b01001110));

// cells[ii + jj * params.nx].speeds[1] = buffer[3];
// cells[ii + jj * params.nx].speeds[3] = buffer[1];
// cells[ii + jj * params.nx].speeds[2] = buffer[4];
// cells[ii + jj * params.nx].speeds[4] = buffer[2];
// cells[ii + jj * params.nx].speeds[5] = buffer[7];
// cells[ii + jj * params.nx].speeds[7] = buffer[5];
// cells[ii + jj * params.nx].speeds[6] = buffer[8];
// cells[ii + jj * params.nx].speeds[8] = buffer[6];
}
top[ii][0] = cells[ii + jj * params.nx].speeds[2];
top[ii][1] = cells[ii + jj * params.nx].speeds[5];
top[ii][2] = cells[ii + jj * params.nx].speeds[6];
top[ii][0] = cells[ii + jj * params.nx].speeds[4];
top[ii][1] = cells[ii + jj * params.nx].speeds[7];
top[ii][2] = cells[ii + jj * params.nx].speeds[8];
}

#pragma omp parallel for private(buffer)
Expand Down Expand Up @@ -232,16 +235,18 @@ int AoS_aa_odd(const t_param params, t_speed_aos *cells, t_speed *tmp_cells,
params.omega * (d_equ - cells[ii + jj * params.nx].speeds[0]);

res = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(res, c_s), omega), c_s);
_mm256_storeu_ps(buffer + 1, res);

cells[ii + jj * params.nx].speeds[1] = buffer[3];
cells[ii + jj * params.nx].speeds[3] = buffer[1];
cells[ii + jj * params.nx].speeds[2] = buffer[4];
cells[ii + jj * params.nx].speeds[4] = buffer[2];
cells[ii + jj * params.nx].speeds[5] = buffer[7];
cells[ii + jj * params.nx].speeds[7] = buffer[5];
cells[ii + jj * params.nx].speeds[6] = buffer[8];
cells[ii + jj * params.nx].speeds[8] = buffer[6];
// _mm256_storeu_ps(buffer + 1, res);
_mm256_storeu_ps(cells[ii + jj * params.nx].speeds + 1,
_mm256_permute_ps(res, 0b01001110));

// cells[ii + jj * params.nx].speeds[1] = buffer[3];
// cells[ii + jj * params.nx].speeds[3] = buffer[1];
// cells[ii + jj * params.nx].speeds[2] = buffer[4];
// cells[ii + jj * params.nx].speeds[4] = buffer[2];
// cells[ii + jj * params.nx].speeds[5] = buffer[7];
// cells[ii + jj * params.nx].speeds[7] = buffer[5];
// cells[ii + jj * params.nx].speeds[6] = buffer[8];
// cells[ii + jj * params.nx].speeds[8] = buffer[6];
}
}
}
Expand Down
104 changes: 72 additions & 32 deletions calc.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,45 +15,84 @@ int set_inlets(const t_param params, float *inlets) {
}

/* compute average velocity of whole grid, ignore grids with obstacles. */
float av_velocity(const t_param params, t_speed *cells, int *obstacles) {
float av_velocity(const t_param params, t_speed *cells, int *obstacles,
t_speed_aos *cells_aos, int type) {
int tot_cells = 0; /* no. of cells used in calculation */
float tot_u; /* accumulated magnitudes of velocity for each cell */

/* initialise */
tot_u = 0.f;

/* loop over all non-blocked cells */
/* loop over all non-blocked cells */
if (type == 0) {
#pragma omp parallel for
for (int jj = 0; jj < params.ny; jj++) {
for (int ii = 0; ii < params.nx; ii++) {
/* ignore occupied cells */
if (!obstacles[ii + jj * params.nx]) {
float local_density = 0.f;
for (int jj = 0; jj < params.ny; jj++) {
for (int ii = 0; ii < params.nx; ii++) {
/* ignore occupied cells */
if (!obstacles[ii + jj * params.nx]) {
float local_density = 0.f;

for (int kk = 0; kk < NSPEEDS; kk++) {
local_density += cells->speeds[kk][ii + jj * params.nx];
}

for (int kk = 0; kk < NSPEEDS; kk++) {
local_density += cells->speeds[kk][ii + jj * params.nx];
/* compute x velocity component */
float u_x = (cells->speeds[1][ii + jj * params.nx] +
cells->speeds[5][ii + jj * params.nx] +
cells->speeds[8][ii + jj * params.nx] -
(cells->speeds[3][ii + jj * params.nx] +
cells->speeds[6][ii + jj * params.nx] +
cells->speeds[7][ii + jj * params.nx])) /
local_density;
/* compute y velocity component */
float u_y = (cells->speeds[2][ii + jj * params.nx] +
cells->speeds[5][ii + jj * params.nx] +
cells->speeds[6][ii + jj * params.nx] -
(cells->speeds[4][ii + jj * params.nx] +
cells->speeds[7][ii + jj * params.nx] +
cells->speeds[8][ii + jj * params.nx])) /
local_density;
/* accumulate the norm of x- and y- velocity components */
tot_u += sqrtf((u_x * u_x) + (u_y * u_y));
/* increase counter of inspected cells */
++tot_cells;
}
}
}
} else if (type == 1) {
#pragma omp parallel for
for (int jj = 0; jj < params.ny; jj++) {
for (int ii = 0; ii < params.nx; ii++) {
/* ignore occupied cells */
if (!obstacles[ii + jj * params.nx]) {
/* local density total */
float local_density = 0.f;

for (int kk = 0; kk < NSPEEDS; kk++) {
local_density += cells_aos[ii + jj * params.nx].speeds[kk];
}

/* compute x velocity component */
float u_x = (cells->speeds[1][ii + jj * params.nx] +
cells->speeds[5][ii + jj * params.nx] +
cells->speeds[8][ii + jj * params.nx] -
(cells->speeds[3][ii + jj * params.nx] +
cells->speeds[6][ii + jj * params.nx] +
cells->speeds[7][ii + jj * params.nx])) /
local_density;
/* compute y velocity component */
float u_y = (cells->speeds[2][ii + jj * params.nx] +
cells->speeds[5][ii + jj * params.nx] +
cells->speeds[6][ii + jj * params.nx] -
(cells->speeds[4][ii + jj * params.nx] +
cells->speeds[7][ii + jj * params.nx] +
cells->speeds[8][ii + jj * params.nx])) /
local_density;
/* accumulate the norm of x- and y- velocity components */
tot_u += sqrtf((u_x * u_x) + (u_y * u_y));
/* increase counter of inspected cells */
++tot_cells;
/* x-component of velocity */
float u_x = (cells_aos[ii + jj * params.nx].speeds[1] +
cells_aos[ii + jj * params.nx].speeds[5] +
cells_aos[ii + jj * params.nx].speeds[8] -
(cells_aos[ii + jj * params.nx].speeds[3] +
cells_aos[ii + jj * params.nx].speeds[6] +
cells_aos[ii + jj * params.nx].speeds[7])) /
local_density;
/* compute y velocity component */
float u_y = (cells_aos[ii + jj * params.nx].speeds[2] +
cells_aos[ii + jj * params.nx].speeds[5] +
cells_aos[ii + jj * params.nx].speeds[6] -
(cells_aos[ii + jj * params.nx].speeds[4] +
cells_aos[ii + jj * params.nx].speeds[7] +
cells_aos[ii + jj * params.nx].speeds[8])) /
local_density;
/* accumulate the norm of x- and y- velocity components */
tot_u += sqrtf((u_x * u_x) + (u_y * u_y));
/* increase counter of inspected cells */
++tot_cells;
}
}
}
}
Expand All @@ -62,7 +101,8 @@ float av_velocity(const t_param params, t_speed *cells, int *obstacles) {
}

/* calculate reynold number */
float calc_reynolds(const t_param params, t_speed *cells, int *obstacles) {
return av_velocity(params, cells, obstacles) * (float)(params.ny) /
params.viscosity;
float calc_reynolds(const t_param params, t_speed *cells, int *obstacles,
t_speed_aos *cells_aos, int type) {
return av_velocity(params, cells, obstacles, cells_aos, type) *
(float)(params.ny) / params.viscosity;
}
11 changes: 7 additions & 4 deletions calc.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,18 @@

#include "types.h"

/* This file contains some calculation functions that are not the main workload, and you can completely unoptimize them */
/* This file contains some calculation functions that are not the main workload,
* and you can completely unoptimize them */

/* set inlets velocity */
int set_inlets(const t_param params, float* inlets);
int set_inlets(const t_param params, float *inlets);

/* compute average velocity */
float av_velocity(const t_param params, t_speed* cells, int* obstacles);
float av_velocity(const t_param params, t_speed *cells, int *obstacles,
t_speed_aos *cells_aos, int type);

/* calculate reynold number */
float calc_reynolds(const t_param params, t_speed* cells, int* obstacles);
float calc_reynolds(const t_param params, t_speed *cells, int *obstacles,
t_speed_aos *cells_aos, int type);

#endif
32 changes: 21 additions & 11 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
*/

#include "aa.h"
#include "aa_AoS.h"
#include "calc.h"
#include "d2q9_bgk.h"
#include "types.h"
Expand Down Expand Up @@ -59,8 +60,14 @@ int main(int argc, char *argv[]) {
total_time = timstr.tv_sec + (timstr.tv_usec / 1000000.0);
init_time = total_time;
/* initialise our data structures and load values from file */

// ! cells_aos_ptr
t_speed_aos *cells_aos;
// ! type 0 for SoA 1 for AoS
int type = 0;
initialise(paramfile, obstaclefile, &params, &cells, &tmp_cells, &obstacles,
&inlets);
&inlets, &cells_aos, &type);

/* Set the inlet speed */
set_inlets(params, inlets);
/* Init time stops here */
Expand All @@ -85,15 +92,19 @@ int main(int argc, char *argv[]) {
/* timestep loop */
for (int tt = 0; tt < params.maxIters; tt += 2) {
// timestep(params, cells, tmp_cells, inlets, obstacles);
if (type == 0) {
aa_odd_timestep(params, cells, tmp_cells, inlets, obstacles);
aa_even_timestep(params, cells, tmp_cells, inlets, obstacles);
} else if (type == 1) {
AoS_aa_odd_timestep(params, cells_aos, tmp_cells, inlets, obstacles);
AoS_aa_even_timestep(params, cells_aos, tmp_cells, inlets, obstacles);
}

aa_odd_timestep(params, cells, tmp_cells, inlets, obstacles);
aa_even_timestep(params, cells, tmp_cells, inlets, obstacles);

/* Visualization */
/* Visualization */
#ifdef VISUAL
if (tt % 1000 == 0) {
sprintf(buf, "%s/visual/state_%d.dat", out_dir, tt / 1000);
write_state(buf, params, cells, obstacles);
write_state(buf, params, cells, obstacles, cells_aos, type);
}
#endif
}
Expand All @@ -104,20 +115,19 @@ int main(int argc, char *argv[]) {

/* write final state and free memory */
sprintf(buf, "%s/final_state.dat", out_dir);
write_state(buf, params, cells, obstacles);
write_state(buf, params, cells, obstacles, cells_aos, type);

/* Display Reynolds number and time */
printf("==done==\n");
printf("Reynolds number:\t\t\t%.12E\n",
calc_reynolds(params, cells, obstacles));
calc_reynolds(params, cells, obstacles, cells_aos, type));
printf("Average velocity:\t\t\t%.12E\n",
av_velocity(params, cells, obstacles));
av_velocity(params, cells, obstacles, cells_aos, type));
printf("Elapsed Init time:\t\t\t%.6lf (s)\n", init_time);
printf("Elapsed Compute time:\t\t\t%.6lf (s)\n", comp_time);

/* finalise */
finalise(&params, &cells, &tmp_cells, &obstacles, &inlets);

finalise(&params, &cells, &tmp_cells, &obstacles, &inlets, &cells_aos, type);
/* total time stop */
gettimeofday(&timstr, NULL);
total_time = timstr.tv_sec + (timstr.tv_usec / 1000000.0) - total_time;
Expand Down
4 changes: 4 additions & 0 deletions types.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,8 @@ typedef struct {
float *speeds[NSPEEDS];
} t_speed;

typedef struct {
float speeds[NSPEEDS];
} t_speed_aos;

#endif
Loading

0 comments on commit b5d424f

Please sign in to comment.