dga
|
|
July 20, 2015, 05:30:52 PM |
|
Thanks for keeping my toys profitable this week. This isn't a particularly effective patch - sub-0.1% improvement on 750ti - but it's a little cleaner. (I'm not sure if this part is the speedup or some other changes I made that I'll submit separately, but I figure the cleanup is worthwhile anyway). The only real substance in here is ensuring that the temporary variable for the swaps is scoped more tightly; the rest just shifts them to using typesafe inline functions instead of the existing macros. diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu index fa81e83..67786c0 100644 --- a/bitslice_transformations_quad.cu +++ b/bitslice_transformations_quad.cu @@ -10,46 +10,53 @@ #define merge8(z, x, y, b)\ z=__byte_perm(x, y, b); \ -#define SWAP8(x,y)\ - x=__byte_perm(x, y, 0x5410); \ - y=__byte_perm(x, y, 0x7632); - -#define SWAP4(x,y)\ - t = (y<<4); \ - t = (x ^ t); \ - t = 0xf0f0f0f0UL & t; \ - x = (x ^ t); \ - t= t>>4;\ - y= y ^ t; - -#define SWAP4_final(x,y)\ - t = (y<<4); \ - t = (x ^ t); \ - t = 0xf0f0f0f0UL & t; \ - x = (x ^ t); \ - - -#define SWAP2(x,y)\ - t = (y<<2); \ - t = (x ^ t); \ - t = 0xccccccccUL & t; \ - x = (x ^ t); \ - t= t>>2;\ - y= y ^ t; - -#define SWAP1(x,y)\ - t = (y+y); \ - t = (x ^ t); \ - t = 0xaaaaaaaaUL & t; \ - x = (x ^ t); \ - t= t>>1;\ - y= y ^ t; +__device__ __forceinline__ +void SWAP8(uint32_t &x, uint32_t &y) { + x = __byte_perm(x, y, 0x5410); + y = __byte_perm(x, y, 0x7632); +} + +__device__ __forceinline__ +void SWAP4(uint32_t &x, uint32_t &y) { + uint32_t t = (y<<4) ^ x; + t = 0xf0f0f0f0UL & t; + x = (x ^ t); + t = t>>4; + y = y ^ t; +} + +__device__ __forceinline__ +void SWAP4_final(uint32_t &x, const uint32_t y) { + uint32_t t = (y<<4); + t = (x ^ t); + t = 0xf0f0f0f0UL & t; + x = (x ^ t); +} + +__device__ __forceinline__ +void SWAP2(uint32_t &x, uint32_t &y) { + uint32_t t = (y<<2); + t = (x ^ t); + t = 0xccccccccUL & t; + x = (x ^ t); + t = t>>2; + y = y ^ t; +} + +__device__ __forceinline__ +void SWAP1(uint32_t &x, uint32_t &y) { + uint32_t t = (y+y); + t = (x ^ t); + t = 0xaaaaaaaaUL & t; + x = (x ^ t); + t = t>>1; + y = y ^ t; +} __device__ __forceinline__ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { uint32_t other[8]; - uint32_t t; uint32_t perm = (threadIdx.x & 1) ? 0x7362 : 0x5140; const unsigned int n = threadIdx.x & 3; @@ -90,7 +97,6 @@ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __rest __device__ __forceinline__ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { - uint32_t t; const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 0x3175 : 0x7531; output[0] = __byte_perm(input[0], input[4], perm); @@ -158,7 +164,6 @@ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *cons __device__ __forceinline__ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { - uint32_t t; const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 0x3175 : 0x7531; if (threadIdx.x & 3)
And to groestl functions: diff --git a/groestl_functions_quad.cu b/groestl_functions_quad.cu index c39e81d..5b1cdb1 100644 --- a/groestl_functions_quad.cu +++ b/groestl_functions_quad.cu @@ -54,11 +56,9 @@ __device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32 __device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, const uint32_t &y3, const uint32_t &y2, const uint32_t &y1, const uint32_t &y0) { - uint32_t t0,t1,t2; - - t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1)); - t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0; - t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1; + uint32_t t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1)); + uint32_t t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0; + uint32_t t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1; t0 = (x2^x3) & (y2^y3); x3 = (x3 & y3) ^ t0 ^ t1; @@ -71,26 +71,24 @@ __device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t __device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_ { - uint32_t t0,t1,t2,t3,t4,t5,t6,a,b; - - t3 = x7; - t2 = x6; - t1 = x5; - t0 = x4; + uint32_t t3 = x7; + uint32_t t2 = x6; + uint32_t t1 = x5; + uint32_t t0 = x4; G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0); - a = (x4 ^ x0); + uint32_t a = (x4 ^ x0); t0 ^= a; t2 ^= (x7 ^ x3) ^ (x5 ^ x1); t1 ^= (x5 ^ x1) ^ a; t3 ^= (x6 ^ x2) ^ a; - b = t0 ^ t1; - t4 = (t2 ^ t3) & b; + uint32_t b = t0 ^ t1; + uint32_t t4 = (t2 ^ t3) & b; a = t4 ^ t3 ^ t1; - t5 = (t3 & t1) ^ a; - t6 = (t2 & t0) ^ a ^ (t2 ^ t0); + uint32_t t5 = (t3 & t1) ^ a; + uint32_t t6 = (t2 & t0) ^ a ^ (t2 ^ t0); t4 = (t5 ^ t6) & b; t1 = (t6 & t1) ^ t4; @@ -107,9 +105,8 @@ __device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32 __device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32 { - uint32_t t0, t1; - t0 = x0 ^ x1 ^ x2; - t1 = x5 ^ x6; + uint32_t t0 = x0 ^ x1 ^ x2; + uint32_t t1 = x5 ^ x6; x2 = t0 ^ t1 ^ x7; x6 = t0 ^ x3 ^ x6; x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7; @@ -122,19 +119,17 @@ __device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint3 __device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32 { - uint32_t t0,t2,t3,t5; - x1 ^= x4; - t0 = x1 ^ x6; + uint32_t t0 = x1 ^ x6; x1 ^= x5; - t2 = x0 ^ x2; + uint32_t t2 = x0 ^ x2; x2 = x3 ^ x5; t2 ^= x2 ^ x6; x2 ^= x7; - t3 = x4 ^ x2 ^ x6; + uint32_t t3 = x4 ^ x2 ^ x6; - t5 = x0 ^ x6; + uint32_t t5 = x0 ^ x6; x4 = x3 ^ x7; x0 = x3 ^ x5; @@ -160,14 +155,12 @@ __device__ __forceinline__ void sbox_quad(uint32_t *const r) __device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, { - uint32_t t0,t1; - const uint32_t tpos = threadIdx.x & 0x03; const uint32_t shift1 = tpos << 1; const uint32_t shift2 = shift1 + 1 + ((tpos == 3) << 2); - t0 = __byte_perm(x0, 0, 0x1010)>>shift1; - t1 = __byte_perm(x0, 0, 0x3232)>>shift2; + uint32_t t0 = __byte_perm(x0, 0, 0x1010)>>shift1; + uint32_t t1 = __byte_perm(x0, 0, 0x3232)>>shift2; x0 = __byte_perm(t0, t1, 0x5410); t0 = __byte_perm(x1, 0, 0x1010)>>shift1; @@ -201,14 +194,12 @@ __device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6 __device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, { - uint32_t t0,t1; - const uint32_t tpos = threadIdx.x & 0x03; const uint32_t shift1 = (1 - (tpos >> 1)) + ((tpos & 0x01) << 2); const uint32_t shift2 = shift1 + 2 + ((tpos == 1) << 2); - t0 = __byte_perm(x0, 0, 0x1010)>>shift1; - t1 = __byte_perm(x0, 0, 0x3232)>>shift2; + uint32_t t0 = __byte_perm(x0, 0, 0x1010)>>shift1; + uint32_t t1 = __byte_perm(x0, 0, 0x3232)>>shift2; x0 = __byte_perm(t0, t1, 0x5410); t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
Cheers.
|