Try to optimize runner's tight inner loop #2

seriyps · 2019-09-11T13:22:25Z

rebar3_bench/src/rebar3_bench_runner.erl

Lines 129 to 134 in 03bb6ec

    
           %% Inner tight loop 
        
           do_run_n(_, _, 0) -> 
        
               ok; 
        
           do_run_n(F, St, N) -> 
        
               F(St), 
        
               do_run_n(F, St, N - 1).

Should find if it's possible to somehow minimize the amount of work done by the loop.

seriyps · 2019-09-11T13:26:42Z

On OTP22 seems changing the order of arguments doesn't reduce amount of opcodes in the function, but changes opcodes itself and amount of arguments to those opcodes:

do_run_n(_, _, 0) ->
    ok;
do_run_n(St, F, N) ->
    F(St),
    do_run_n(St, F, N - 1).

{function, do_run_n, 3, 4}.
  {label,3}.
    {line,[{location,"main.erl",14}]}.
    {func_info,{atom,main},{atom,do_run_n},3}.
  {label,4}.
    {'%',{type_info,{x,2},number}}.
    {test,is_eq_exact,{f,5},[{x,2},{integer,0}]}.
    {move,{atom,ok},{x,0}}.
    return.
  {label,5}.
    {allocate,3,3}.
    {move,{x,2},{y,0}}.
    {move,{x,1},{y,1}}.
    {move,{x,0},{y,2}}.
    {move,{y,1},{x,1}}.
    {line,[{location,"main.erl",17}]}.
    {call_fun,1}.
    {line,[{location,"main.erl",18}]}.
    {gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
    {move,{y,1},{x,1}}.
    {move,{y,2},{x,0}}.
    {call_last,3,{f,4},3}.

00007F6EA37CB398: i_func_info_IaaI 0 main do_run_n 3 
00007F6EA37CB3C0: i_is_eq_exact_immed_fxc f(00007F6EA37CB3E8) x(2) 0 
00007F6EA37CB3D8: move_return_c ok 
00007F6EA37CB3E8: allocate_tt 3 3 
00007F6EA37CB3F8: move_window3_xxxy x(2) x(1) x(0) y(0) 
00007F6EA37CB408: move_yx y(1) x(1) 
00007F6EA37CB418: i_call_fun_t 1 
00007F6EA37CB428: i_increment_yWd y(0) -1 x(2) 
00007F6EA37CB448: move_src_window2_yxx y(1) x(1) x(0) 
00007F6EA37CB458: i_call_last_fQ main:do_run_n/3 3 

( 10 opcodes, `00007F6EA37CB408: move_yx y(1) x(1) ` touches 2 registers)
========================================


do_run_n(_, _, 0) ->
    ok;
do_run_n(F, St, N) ->
    F(St),
    do_run_n(F, St, N - 1).

{function, do_run_n, 3, 4}.
  {label,3}.
    {line,[{location,"main.erl",14}]}.
    {func_info,{atom,main},{atom,do_run_n},3}.
  {label,4}.
    {'%',{type_info,{x,2},number}}.
    {test,is_eq_exact,{f,5},[{x,2},{integer,0}]}.
    {move,{atom,ok},{x,0}}.
    return.
  {label,5}.
    {allocate,3,3}.
    {move,{x,2},{y,0}}.
    {move,{x,1},{y,1}}.
    {move,{x,0},{y,2}}.
    {move,{x,1},{x,0}}.
    {move,{y,2},{x,1}}.
    {line,[{location,"main.erl",17}]}.
    {call_fun,1}.
    {line,[{location,"main.erl",18}]}.
    {gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
    {move,{y,1},{x,1}}.
    {move,{y,2},{x,0}}.
    {call_last,3,{f,4},3}.

00007F64D740B3A0: i_func_info_IaaI 0 main do_run_n 3 
00007F64D740B3C8: i_is_eq_exact_immed_fxc f(00007F64D740B3F0) x(2) 0 
00007F64D740B3E0: move_return_c ok 
00007F64D740B3F0: allocate_tt 3 3 
00007F64D740B400: move_window3_xxxy x(2) x(1) x(0) y(0) 
00007F64D740B410: move_shift_yxx y(2) x(1) x(0) 
00007F64D740B420: i_call_fun_t 1 
00007F64D740B430: i_increment_yWd y(0) -1 x(2) 
00007F64D740B450: move_src_window2_yxx y(1) x(1) x(0) 
00007F64D740B460: i_call_last_fQ main:do_run_n/3 3 

( 10 opcodes, `00007F64D740B410: move_shift_yxx y(2) x(1) x(0) ` touches 3 registers)
==========================================

do_run_n(0, _, _) ->
    ok;
do_run_n(N, F, St) ->
    F(St),
    do_run_n(N - 1, F, St).

00007F62A078B3A0: i_func_info_IaaI 0 main do_run_n 3 
00007F62A078B3C8: i_is_eq_exact_immed_frc f(00007F62A078B3F0) r(0) 0 
00007F62A078B3E0: move_return_c ok 
00007F62A078B3F0: allocate_tt 3 3 
00007F62A078B400: move_window3_xxxy x(2) x(1) x(0) y(0) 
00007F62A078B410: move2_par_yxxx y(1) x(1) x(2) x(0) 
00007F62A078B420: i_call_fun_t 1 
00007F62A078B430: i_increment_yWd y(2) -1 x(0) 
00007F62A078B450: move_src_window2_yxx y(0) x(2) x(1) 
00007F62A078B460: i_call_last_fQ main:do_run_n/3 3 

( 10 opcodes, `00007F62A078B410: move2_par_yxxx y(1) x(1) x(2) x(0) ` touches 4 registers)

So, the current order of arguments is the optimal one

seriyps · 2019-09-11T13:33:40Z

It might be problematic that less possible clause N == 0 -> ok is executed first. Would be nice to make main body execute first and jump to -> ok if guard fails.

do_run_n(St, F, N) when N =/= 0 ->
    F(St),
    do_run_n(St, F, N - 1);
do_run_n(_, _, 0) ->
    ok.

{function, do_run_n, 3, 4}.
  {label,3}.
    {line,[{location,"main.erl",14}]}.
    {func_info,{atom,main},{atom,do_run_n},3}.
  {label,4}.
    {'%',{type_info,{x,2},number}}.
    {test,is_eq_exact,{f,5},[{x,2},{integer,0}]}.
    {move,{atom,ok},{x,0}}.
    return.
  {label,5}.
    {allocate,3,3}.
    {move,{x,2},{y,0}}.
    {move,{x,1},{y,1}}.
    {move,{x,0},{y,2}}.
    {move,{y,1},{x,1}}.
    {line,[{location,"main.erl",15}]}.
    {call_fun,1}.
    {line,[{location,"main.erl",16}]}.
    {gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
    {move,{y,1},{x,1}}.
    {move,{y,2},{x,0}}.
    {call_last,3,{f,4},3}.

00007FBF13D0B398: i_func_info_IaaI 0 main do_run_n 3 
00007FBF13D0B3C0: i_is_eq_exact_immed_fxc f(00007FBF13D0B3E8) x(2) 0 
00007FBF13D0B3D8: move_return_c ok 
00007FBF13D0B3E8: allocate_tt 3 3 
00007FBF13D0B3F8: move_window3_xxxy x(2) x(1) x(0) y(0) 
00007FBF13D0B408: move_yx y(1) x(1) 
00007FBF13D0B418: i_call_fun_t 1 
00007FBF13D0B428: i_increment_yWd y(0) -1 x(2) 
00007FBF13D0B448: move_src_window2_yxx y(1) x(1) x(0) 
00007FBF13D0B458: i_call_last_fQ main:do_run_n/3 3

So, putting "main" body as 1st clause with N =/= 0 doesn't make any difference on OTP-22. Compiler converts it to the same binary code as original.

do_run_n(St, F, N) when N > 0 ->
    F(St),
    do_run_n(St, F, N - 1);
do_run_n(_, _, N) ->
    ok.

{function, do_run_n, 3, 4}.
  {label,3}.
    {line,[{location,"main.erl",14}]}.
    {func_info,{atom,main},{atom,do_run_n},3}.
  {label,4}.
    {test,is_lt,{f,5},[{integer,0},{x,2}]}.
    {allocate,3,3}.
    {move,{x,2},{y,0}}.
    {move,{x,1},{y,1}}.
    {move,{x,0},{y,2}}.
    {move,{y,1},{x,1}}.
    {line,[{location,"main.erl",15}]}.
    {call_fun,1}.
    {line,[{location,"main.erl",16}]}.
    {gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
    {move,{y,1},{x,1}}.
    {move,{y,2},{x,0}}.
    {call_last,3,{f,4},3}.
  {label,5}.
    {move,{atom,ok},{x,0}}.
    return.

00007F6573B0B3D0: i_func_info_IaaI 0 main do_run_n 3 
00007F6573B0B3F8: is_lt_fcx f(00007F6573B0B490) 0 x(2) 
00007F6573B0B410: allocate_tt 3 3 
00007F6573B0B420: move_window3_xxxy x(2) x(1) x(0) y(0) 
00007F6573B0B430: move_yx y(1) x(1) 
00007F6573B0B440: i_call_fun_t 1 
00007F6573B0B450: i_increment_yWd y(0) -1 x(2) 
00007F6573B0B470: move_src_window2_yxx y(1) x(1) x(0) 
00007F6573B0B480: i_call_last_fQ main:do_run_n/3 3 
00007F6573B0B490: move_return_c ok

http://tryerl.seriyps.ru/#id=78b4

So, if we put N > 0 as 1st clause, it will be executed first. But I don't know if is_lt_fcx (<) is noticeably slower than i_is_eq_exact_immed_frc (=:=)

seriyps · 2019-09-11T14:20:24Z

Last option to consider is to run F multiple times on each iteration loop (N should be div 5 from original N):

do_run_n(St, F, N) when N =/= 0 ->
    F(St),
    F(St),
    F(St),
    F(St),
    F(St),
    do_run_n(St, F, N - 1);
do_run_n(_, _, 0) ->
    ok.

00007FEA7A30D968: i_func_info_IaaI 0 main do_run_n 3 
00007FEA7A30D990: i_is_ne_exact_immed_fxc f(00007FEA7A30DB00) x(2) 0 
00007FEA7A30D9B0: allocate_tt 3 3 
00007FEA7A30D9C0: move2_xyxy x(2) y(0) x(1) y(1) 
00007FEA7A30D9D0: move_ry x(0) y(2) 
00007FEA7A30D9E0: i_call_fun_I 1 
00007FEA7A30D9F0: move_yx y(1) x(1) 
00007FEA7A30DA00: move_yr y(2) x(0) 
00007FEA7A30DA10: i_call_fun_I 1 
00007FEA7A30DA20: move_yx y(1) x(1) 
00007FEA7A30DA30: move_yr y(2) x(0) 
00007FEA7A30DA40: i_call_fun_I 1 
00007FEA7A30DA50: move_yx y(1) x(1) 
00007FEA7A30DA60: move_yr y(2) x(0) 
00007FEA7A30DA70: i_call_fun_I 1 
00007FEA7A30DA80: move_yx y(1) x(1) 
00007FEA7A30DA90: move_yr y(2) x(0) 
00007FEA7A30DAA0: i_call_fun_I 1 
00007FEA7A30DAB0: i_increment_yIId y(0) -1 0 x(2) 
00007FEA7A30DAD8: move_yx y(1) x(1) 
00007FEA7A30DAE8: move_call_last_yrfQ y(2) x(0) main:do_run_n/3 3 
00007FEA7A30DB00: move_return_cr ok x(0)

It should be quite good for small fast functions, but might be problematic for slow ones

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Try to optimize runner's tight inner loop #2

Try to optimize runner's tight inner loop #2

seriyps commented Sep 11, 2019

seriyps commented Sep 11, 2019 •

edited

Loading

seriyps commented Sep 11, 2019 •

edited

Loading

seriyps commented Sep 11, 2019

Try to optimize runner's tight inner loop #2

Try to optimize runner's tight inner loop #2

Comments

seriyps commented Sep 11, 2019

seriyps commented Sep 11, 2019 • edited Loading

seriyps commented Sep 11, 2019 • edited Loading

seriyps commented Sep 11, 2019

seriyps commented Sep 11, 2019 •

edited

Loading

seriyps commented Sep 11, 2019 •

edited

Loading