It looks like you're comparing the original code with -ftree-vectorize to "hand coded" with -ftree-vectorize. That doesn't prove anything about -ftree-vectorize.
You need to test the same code with and without vectorization. Did your hand-coded version actually use parallel SIMD Salsa on the data "arranged in lanes"?
ok, the original driving code looks like this:
#include <stdio.h>
#include <time.h>

typedef unsigned int uint;   /* shorthand used throughout */

void salsa(uint *X, uint rounds);

int main(int argc, char **argv) {
    uint X[16];
    const int rounds = 1024*1024;
    clock_t start, end;
    double cpu_time_used;

    for(int i = 0; i < 16; i++)
        X[i] = i;

    start = clock();
    salsa(X, rounds);
    end = clock();

    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("cputime %g\n", cpu_time_used);
    return 0;
}
/* Salsa20, rounds must be a multiple of 2 */

/* standard 32-bit rotate left, needed by the quarter-round macro below */
#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

void __attribute__ ((noinline)) salsa(uint *X, uint rounds) {
    uint x0, x1, x2, x3, x4, x5, x6, x7,
         x8, x9, x10, x11, x12, x13, x14, x15, t;

    x0  = X[0];  x1  = X[1];  x2  = X[2];  x3  = X[3];
    x4  = X[4];  x5  = X[5];  x6  = X[6];  x7  = X[7];
    x8  = X[8];  x9  = X[9];  x10 = X[10]; x11 = X[11];
    x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15];

#define quarter(a, b, c, d, v) \
    t = a + d;         if (v) printf("t: %u\n", t); \
    t = ROTL32(t, 7);  if (v) printf("t: %u\n", t); \
    b ^= t;            if (v) printf("b: %u\n", b); \
    t = b + a;         if (v) printf("t: %u\n", t); \
    t = ROTL32(t, 9);  if (v) printf("t: %u\n", t); \
    c ^= t;            if (v) printf("c: %u\n", c); \
    t = c + b;         if (v) printf("t: %u\n", t); \
    t = ROTL32(t, 13); if (v) printf("t: %u\n", t); \
    d ^= t;            if (v) printf("d: %u\n", d); \
    t = d + c;         if (v) printf("t: %u\n", t); \
    t = ROTL32(t, 18); if (v) printf("t: %u\n", t); \
    a ^= t;            if (v) printf("a: %u\n", a);

    int v = 0;
    for(; rounds; rounds -= 2) {
        /* column round */
        quarter( x0,  x4,  x8, x12, v);
        quarter( x5,  x9, x13,  x1, v);
        quarter(x10, x14,  x2,  x6, v);
        quarter(x15,  x3,  x7, x11, v);
        /* row round */
        quarter( x0,  x1,  x2,  x3, v);
        quarter( x5,  x6,  x7,  x4, v);
        quarter(x10, x11,  x8,  x9, v);
        quarter(x15, x12, x13, x14, v);
    }

    X[0]  += x0;  X[1]  += x1;  X[2]  += x2;  X[3]  += x3;
    X[4]  += x4;  X[5]  += x5;  X[6]  += x6;  X[7]  += x7;
    X[8]  += x8;  X[9]  += x9;  X[10] += x10; X[11] += x11;
    X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15;
#undef quarter
}
that runs 1024 x 1024 = 1048576 rounds
without optimization, i.e. no -O flags:
cputime 0.187971
cputime 0.231245
cputime 0.187873
~ 202.363 ms on average for those 1048576 rounds
with optimization, -O2 but without -ftree-vectorize:
cputime 0.011749
cputime 0.011733
cputime 0.025701
~ 16.394 ms on average for those 1048576 rounds
with -O2 -ftree-vectorize -ftree-slp-vectorize -ftree-loop-vectorize -ffast-math -ftree-vectorizer-verbose=7 -funsafe-loop-optimizations -funsafe-math-optimizations:
cputime 0.012641
cputime 0.012635
cputime 0.012638
~ 12.638 ms on average for those 1048576 rounds
there is only a small amount of NEON SIMD assembly generated by -ftree-vectorize, i.e. all those v* instructions:
...
str r2, [sp, #164]
ldr r2, [sp, #12]
vldr d16, [sp, #152]
vldr d17, [sp, #160]
str r2, [sp, #172]
ldr r2, [sp, #64]
vldr d20, [sp, #168]
vldr d21, [sp, #176]
str r2, [sp, #184]
vadd.i32 q8, q8, q9
ldr r2, [sp, #36]
str r2, [sp, #188]
ldr r2, [sp, #68]
str r2, [sp, #192]
ldr r2, [sp, #72]
str r2, [sp, #196]
vldr d18, [sp, #184]
vldr d19, [sp, #192]
str r7, [sp, #212]
ldr r2, [sp]
str r2, [sp, #204]
ldr r2, [sp, #24]
vadd.i32 q9, q9, q10
for practical purposes the running speed with -ftree-vectorize vs without is nearly the same as plain -O2; my earlier observations were likely the result of differing cache conditions and possibly CPU context switches, i.e. the CPU running other threads
my 'hand optimised assembly' (NEON intrinsics) looks like this:
/* this is one step of the salsa quarter round, done for 4 different
   quarter rounds of the 4x4 matrix in parallel, one per SIMD lane */
#define vquarter(VA, VB, VC, SHIFT) \
    vt  = vaddq_u32(VB, VA);              /* t = a + b */ \
    vt1 = vshlq_n_u32(vt, SHIFT);         /* t << SHIFT */ \
    vt1 = vsraq_n_u32(vt1, vt, 32-SHIFT); /* += t >> (32-SHIFT), completing ROTL32(t, SHIFT) */ \
    VC  = veorq_u32(VC, vt1);             /* c ^= rotated t */
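to make that a bit more concrete, here is roughly the context the macro runs in for one column round; the helper function, the pointer plumbing and the diagonal lane arrangement shown are just my sketch of the idea, not the exact code:

#include <arm_neon.h>

/* sketch: one column round using vquarter, assuming the 4x4 state has
   been pre-arranged along diagonals so each lane holds one of the four
   quarter rounds, e.g.
     va = {x0, x5, x10, x15}, vb = {x4, x9, x14, x3},
     vc = {x8, x13, x2, x7},  vd = {x12, x1, x6, x11}   */
static void column_round(uint32x4_t *a, uint32x4_t *b,
                         uint32x4_t *c, uint32x4_t *d) {
    uint32x4_t va = *a, vb = *b, vc = *c, vd = *d, vt, vt1;
    vquarter(va, vd, vb, 7);   /* b ^= ROTL32(a + d, 7), in all 4 lanes */
    vquarter(vb, va, vc, 9);   /* c ^= ROTL32(b + a, 9)  */
    vquarter(vc, vb, vd, 13);  /* d ^= ROTL32(c + b, 13) */
    vquarter(vd, vc, va, 18);  /* a ^= ROTL32(d + c, 18) */
    *a = va; *b = vb; *c = vc; *d = vd;
}

with this kind of arrangement the lanes of the b/c/d vectors then have to be re-shuffled between the column round and the row round, which is where all the permutation code comes from.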
but there is a lot of permutation code; this is an excerpt from the assembly generated with the gcc -S option:
vld1.32 {d22-d23}, [r5]
vld1.32 {d24-d25}, [r8]
vld1.32 {d18-d19}, [r7]
vld1.32 {d20-d21}, [r6]
cmp r4, #0
beq .L2
.L3:
... matrix permutations
vmov.32 r3, d22[0]
vmov.32 d8[0], r3
vmov.32 r3, d24[1]
vmov.32 d8[1], r3
vmov.32 r3, d19[0]
vmov.32 d9[0], r3
vmov.32 r3, d21[1]
vmov.32 d9[1], r3
vmov.32 r3, d24[0]
vmov.32 d10[0], r3
vmov.32 r3, d18[1]
vmov.32 d10[1], r3
vmov.32 r3, d21[0]
vmov.32 d11[0], r3
vmov.32 r3, d23[1]
...
... // the 4x parallel quarter rounds generated from the NEON intrinsics
vmov q7, q10 @ v4si
vadd.i32 q8, q10, q4
vshl.i32 q9, q8, #7
vsra.u32 q9, q8, #25
veor q9, q9, q5
vadd.i32 q10, q9, q4
vshl.i32 q8, q10, #9
vsra.u32 q8, q10, #23
veor q8, q8, q6
vadd.i32 q11, q8, q9
vshl.i32 q10, q11, #13
vsra.u32 q10, q11, #19
veor q10, q10, q7
vadd.i32 q12, q10, q8
vshl.i32 q11, q12, #18
vsra.u32 q11, q12, #14
veor q11, q11, q4
... then more matrix permutations
vmov.32 r3, d20[1]
vmov q5, q9 @ v4si
vmov.32 d10[0], r3
vmov.32 r3, d21[0]
vmov.32 d10[1], r3
vmov.32 r3, d21[1]
vmov.32 d11[0], r3
vmov.32 r3, d20[0]
vmov q14, q5 @ v4si
vmov.32 d29[1], r3
vmov.32 r3, d17[0]
vmov q6, q8 @ v4si
vmov.32 d12[0], r3
vmov.32 r3, d17[1]
vmov.32 d12[1], r3
vmov.32 r3, d16[0]
the above code is nearly 'everything NEON', but all those permutations are a bummer and cannot be parallelised (e.g. a lot of stalls, as different values need to be moved into a single SIMD register one lane at a time)
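in intrinsics terms each of those vmov.32 pairs is doing roughly the following; the function and names here are made up purely to illustrate the cost of moving one 32-bit word at a time through a core register:

#include <arm_neon.h>

/* rough intrinsics equivalent of one "vmov.32 r3, dX[n]" /
   "vmov.32 dY[m], r3" pair: pull a single 32-bit word out of one vector
   via a core register and drop it into a lane of another vector */
static uint32x4_t copy_word0_to_lane1(uint32x4_t dst, uint32x4_t src) {
    uint32_t w = vgetq_lane_u32(src, 0);   /* vmov.32 r3, dX[0] */
    return vsetq_lane_u32(w, dst, 1);      /* vmov.32 dY[1], r3 */
}

and the timings for this hand-coded NEON version: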
cputime 0.059751
cputime 0.05988
cputime 0.059885
~ 59.8387 ms on average for those 1048576 rounds
it is likely that there are 'better ways to do neon', but gcc's -O2 and -ftree-vectorize take less than a minute to type into a script / config file and probably just work, be it on ARM, Intel or AMD, etc. CPUs.
And the auto-vectorised build is close to 5x faster than this naive NEON implementation.
blurb:
my guess is that GPUs, and maybe Intel and AMD, have a different hardware architecture that can drastically speed up loads and stores to / from the vector registers.
the raspberry pi's Broadcom SoC with the Cortex-A72 cores uses an external DRAM with a 16-bit bus; there is little that can be done about that, as it has to be optimised for cheapness, board space and ease of manufacturing.
I think some GPUs and 'high end' CPUs have more like 64-bit, 128-bit or even wider data buses, which can possibly load/store two or more 64-bit words in a single cycle.
with just a 16-bit DRAM data bus, even a single 64-bit word takes 4 transfers just to load or store, and on top of that come the memory wait states, which means more cycles.
I avoided that in my 'hand coded assembly' by simply keeping everything in CPU registers, but that did not overcome the stalls from writing to / reading from the NEON registers.
for comparison: without optimization, i.e. no -O2 and/or -ftree-vectorize flags, and simply using the 'arrays rearranged to look like lanes' version of salsa, so that it writes to and reads from memory on the raspberry pi, it takes > 1 second for those same 1048576 rounds.
All those optimizations are then tens to hundreds of times faster than running it unoptimised, and even the naive NEON SIMD is well over an order of magnitude faster than that.
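for reference, by 'arrays rearranged to look like lanes' I mean roughly a layout like the sketch below, where each step of a quarter round becomes a little loop over four array slots; the names are just for illustration and it relies on the uint typedef and ROTL32 macro from the code above:

/* sketch of the memory-based, lane-arranged variant: a[], b[], d[] each
   hold the word belonging to one of the four quarter rounds, so without
   -O2 every access in this loop really goes through memory */
static void lane_step(uint a[4], uint b[4], uint d[4]) {
    uint t[4];
    for (int lane = 0; lane < 4; lane++) {
        t[lane]  = a[lane] + d[lane];
        b[lane] ^= ROTL32(t[lane], 7);   /* b ^= ROTL32(a + d, 7), per lane */
    }
}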