Hm, is it possible to hide the code somehow, e.g. by obfuscating it or something like that?
No.
Since the native instruction set of each GPU model is different, the driver compiles a binary from a pseudo-assembly language (PTX) stored in the file. Unlike FPGA bitstreams (which are run encrypted on-chip), GPU binary kernels are open. Protecting the exe file doesn't really help, because you can extract the assembly code by snooping the driver. To protect their kernels, coders can alter the hash a bit. They can rename table names to make the code less readable. But they can't hide how they managed to get the speedup. As you can see from the SIMD-512 t-rex disassembly, they use shared memory instead of slow GDDR5 memory to store table data.
Based upon the work of the two Christians, klaus_t, Tanguy Pruvot, tsiv and SP (2013-2016)
// PTX module header.
// PTX ISA version this module is written against.
.version 6.2
// Target architecture: sm_52 (compute capability 5.2).
.target sm_52
// Addresses are 32 bits wide; the kernel below accordingly takes its
// pointer parameters as .u32 and does address arithmetic in .b32 registers.
.address_size 32
// c_perm: 64 bytes in constant memory, byte-aligned.
// Read as eight consecutive groups of eight bytes, every group is a
// permutation of the values 0..7.
// NOTE(review): presumably index patterns used to reorder data between the
// 8 threads that share one work item (the kernel splits tid.x into
// tid.x >> 3 and tid.x & 7) -- confirm at the load site.
.const .align 1 .b8 c_perm[64] = {2, 3, 6, 7, 0, 1, 4, 5, 6, 7, 2, 3, 4, 5, 0, 1, 7, 6, 5, 4, 3, 2, 1, 0, 1, 0, 3, 2, 5, 4, 7, 6, 0, 1, 4, 5, 6, 7, 2, 3, 6, 7, 2, 3, 0, 1, 4, 5, 6, 7, 0, 1, 4, 5, 2, 3, 4, 5, 2, 3, 6, 7, 0, 1};
// c_IV_512: 128 bytes in constant memory, 4-byte aligned.
// NOTE(review): the name and alignment suggest the initial chaining value of
// the 512-bit hash variant, stored as 32 little-endian 32-bit words
// (the first four bytes would then read 0x0BA16B95) -- confirm at the load site.
.const .align 4 .b8 c_IV_512[128] = {149, 107, 161, 11, 173, 153, 249, 114, 174, 194, 236, 159, 252, 100, 50, 186, 41, 73, 137, 94, 229, 48, 159, 142, 55, 170, 29, 47, 88, 197, 242, 240, 67, 102, 80, 172, 165, 53, 6, 169, 139, 135, 91, 226, 143, 135, 183, 170, 122, 127, 129, 136, 43, 137, 2, 10, 80, 117, 154, 85, 126, 101, 143, 89, 161, 96, 239, 126, 232, 227, 112, 107, 209, 20, 23, 156, 168, 226, 88, 185, 94, 103, 2, 171, 79, 1, 28, 237, 187, 101, 141, 205, 87, 162, 183, 253, 153, 72, 37, 9, 188, 199, 153, 214, 220, 182, 25, 144, 228, 34, 144, 43, 86, 73, 161, 143, 211, 155, 191, 33, 67, 9, 77, 185, 34, 220, 253, 111};
// c_FFT128_8_16_Twiddle: 256 bytes in constant memory, 2-byte aligned.
// The first 32 bytes are sixteen repetitions of the byte pair (1, 0).
// NOTE(review): presumably 128 little-endian signed 16-bit twiddle factors
// (a byte pair ending in 255 would then be a negative entry), in the same
// layout as c_FFT256_2_128_Twiddle -- confirm at the load site.
.const .align 2 .b8 c_FFT128_8_16_Twiddle[256] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 60, 0, 2, 0, 120, 0, 4, 0, 239, 255, 8, 0, 222, 255, 16, 0, 188, 255, 32, 0, 121, 0, 64, 0, 241, 255, 128, 0, 226, 255, 1, 0, 46, 0, 60, 0, 189, 255, 2, 0, 92, 0, 120, 0, 123, 0, 4, 0, 183, 255, 239, 255, 245, 255, 8, 0, 111, 0, 222, 255, 234, 255, 1, 0, 189, 255, 120, 0, 183, 255, 8, 0, 234, 255, 188, 255, 186, 255, 64, 0, 81, 0, 226, 255, 210, 255, 254, 255, 133, 255, 17, 0, 145, 255, 1, 0, 138, 255, 46, 0, 225, 255, 60, 0, 116, 0, 189, 255, 195, 255, 2, 0, 21, 0, 92, 0, 194, 255, 120, 0, 231, 255, 123, 0, 134, 255, 1, 0, 116, 0, 92, 0, 134, 255, 239, 255, 84, 0, 234, 255, 18, 0, 32, 0, 114, 0, 117, 0, 207, 255, 226, 255, 118, 0, 67, 0, 62, 0, 1, 0, 225, 255, 189, 255, 21, 0, 120, 0, 134, 255, 183, 255, 206, 255, 8, 0, 9, 0, 234, 255, 167, 255, 188, 255, 52, 0, 186, 255, 114, 0, 1, 0, 195, 255, 123, 0, 206, 255, 222, 255, 18, 0, 186, 255, 157, 255, 128, 0, 158, 255, 67, 0, 25, 0, 17, 0, 247, 255, 35, 0, 177, 255};
// c_FFT256_2_128_Twiddle: 256 bytes in constant memory, 2-byte aligned.
// The kernel reads this table with ld.const.s16, i.e. as signed 16-bit
// entries (128 of them); a byte pair such as (138, 255) is a negative value.
// Each thread starts at byte offset (tid.x & 7) * 2 and loads the entries at
// further byte offsets 0, 16, 32, ..., 112 from there, multiplying each with
// one extracted input byte.
.const .align 2 .b8 c_FFT256_2_128_Twiddle[256] = {1, 0, 41, 0, 138, 255, 45, 0, 46, 0, 87, 0, 225, 255, 14, 0, 60, 0, 146, 255, 116, 0, 129, 255, 189, 255, 80, 0, 195, 255, 69, 0, 2, 0, 82, 0, 21, 0, 90, 0, 92, 0, 173, 255, 194, 255, 28, 0, 120, 0, 37, 0, 231, 255, 3, 0, 123, 0, 159, 255, 134, 255, 137, 255, 4, 0, 163, 255, 42, 0, 179, 255, 183, 255, 91, 0, 132, 255, 56, 0, 239, 255, 74, 0, 206, 255, 6, 0, 245, 255, 63, 0, 13, 0, 19, 0, 8, 0, 71, 0, 84, 0, 103, 0, 111, 0, 181, 255, 9, 0, 112, 0, 222, 255, 147, 255, 156, 255, 12, 0, 234, 255, 126, 0, 26, 0, 38, 0, 16, 0, 141, 255, 167, 255, 205, 255, 221, 255, 107, 0, 18, 0, 223, 255, 188, 255, 39, 0, 57, 0, 24, 0, 212, 255, 251, 255, 52, 0, 76, 0, 32, 0, 27, 0, 79, 0, 154, 255, 186, 255, 213, 255, 36, 0, 190, 255, 121, 0, 78, 0, 114, 0, 48, 0, 168, 255, 246, 255, 104, 0, 151, 255, 64, 0, 54, 0, 157, 255, 53, 0, 117, 0, 170, 255, 72, 0, 125, 0, 241, 255, 155, 255, 227, 255, 96, 0, 81, 0, 236, 255, 207, 255, 47, 0, 128, 0, 108, 0, 59, 0, 106, 0, 233, 255, 85, 0, 143, 255, 249, 255, 226, 255, 55, 0, 198, 255, 191, 255, 161, 255, 216, 255, 158, 255, 94, 0};
.const .align 4 .b8 d_cw[1024] = {32, 23, 27, 83, 9, 222, 44, 172, 135, 45, 144, 11, 244, 177, 105, 35, 1, 170, 49, 41, 130, 176, 228, 2, 20, 201, 20, 201, 166, 225, 218, 193, 92, 43, 140, 241, 107, 48, 172, 8, 20, 201, 191, 39, 141, 84, 220, 206, 190, 196, 48, 198, 53, 67, 140, 241, 124, 66, 211, 240, 128, 163, 61, 190, 228, 2, 60, 20, 48, 198, 72, 169, 9, 222, 242, 164, 133, 32, 29, 167, 132, 189, 57, 164, 106, 205, 159, 16, 97, 239, 168, 238, 232, 28, 171, 165, 164, 212, 144, 11, 157, 3, 109, 61, 83, 77, 148, 37, 52, 224, 160, 186, 90, 30, 199, 91, 254, 242, 244, 177, 9, 222, 202, 18, 195, 65, 141, 84, 13, 248, 180, 60, 196, 235, 236, 54, 238, 67, 100, 166, 189, 26, 53, 67, 73, 12, 162, 199, 102, 179, 11, 235, 152, 63, 41, 245, 9, 222, 182, 73, 234, 41, 27, 83, 228, 2, 228, 2, 5, 196, 37, 219, 67, 229, 212, 83, 32, 23, 215, 10, 4, 26, 166, 225, 193, 52, 117, 184, 238, 67, 223, 62, 240, 80, 62, 33, 223, 62, 23, 57, 14, 91, 72, 169, 249, 46, 168, 238, 113, 87, 245, 20, 70, 85, 241, 250, 179, 217, 109, 61, 46, 185, 115, 171, 253, 72, 42, 88, 146, 24, 168, 238, 1, 170, 126, 79, 143, 168, 16, 175, 32, 23, 88, 17, 219, 36, 193, 52, 115, 171, 192, 209, 211, 240, 90, 30, 243, 7, 76, 195, 60, 20, 20, 201, 18, 188, 156, 89, 67, 229, 203, 188, 183, 243, 94, 56, 154, 76, 245, 20, 104, 192, 215, 10, 247, 33, 74, 182, 16, 175, 194, 222, 33, 193, 233, 198, 242, 164, 184, 86, 7, 209, 88, 17, 143, 168, 11, 235, 186, 170, 15, 5, 77, 38, 147, 194, 210, 70, 141, 84, 224, 232, 229, 172, 247, 33, 212, 83, 121, 210, 112, 244, 12, 78, 151, 220, 255, 85, 207, 214, 126, 79, 28, 253, 236, 54, 236, 54, 90, 30, 38, 62, 28, 253, 196, 235, 208, 57, 184, 86, 247, 33, 14, 91, 123, 223, 227, 88, 124, 66, 199, 91, 150, 50, 97, 239, 159, 16, 88, 17, 24, 227, 85, 90, 3, 183, 214, 167, 110, 231, 88, 17, 255, 85, 130, 176, 113, 87, 240, 80, 224, 232, 168, 238, 37, 219, 63, 203, 141, 84, 64, 46, 45, 15, 166, 225, 22, 214, 229, 172, 28, 253, 28, 253, 251, 59, 219, 36, 189, 26, 44, 172, 224, 232, 41, 
245, 252, 229, 90, 30, 63, 203, 139, 71, 18, 188, 33, 193, 92, 43, 112, 244, 99, 252, 147, 194, 173, 178, 108, 218, 204, 31, 96, 69, 166, 225, 57, 164, 2, 13, 12, 78, 247, 33, 54, 237, 61, 190, 115, 171, 164, 212, 116, 14, 149, 207, 84, 247, 236, 54, 65, 216, 115, 171, 36, 49, 66, 59, 208, 57, 203, 188, 116, 14, 132, 189, 45, 15, 128, 92, 195, 65, 237, 91, 19, 164, 242, 30, 14, 225, 177, 147, 79, 108, 223, 145, 33, 110, 32, 29, 224, 226, 107, 46, 149, 209, 131, 149, 125, 106, 227, 236, 29, 19, 100, 201, 156, 54, 141, 4, 115, 251, 99, 97, 157, 158, 244, 215, 12, 40, 58, 38, 198, 217, 158, 239, 98, 16, 57, 213, 199, 42, 211, 82, 45, 173, 253, 245, 3, 10, 132, 230, 124, 25, 142, 85, 114, 170, 173, 33, 83, 222, 121, 15, 135, 240, 134, 159, 122, 96, 24, 80, 232, 175, 57, 213, 199, 42, 32, 29, 224, 226, 57, 213, 199, 42, 87, 57, 169, 198, 180, 157, 76, 98, 177, 147, 79, 108, 226, 155, 30, 100, 212, 186, 44, 69, 198, 217, 58, 38, 156, 54, 100, 201, 251, 60, 5, 195, 212, 186, 44, 69, 125, 106, 131, 149, 94, 181, 162, 74, 165, 84, 91, 171, 188, 83, 68, 172, 128, 139, 128, 116, 202, 52, 54, 203, 164, 3, 92, 252, 117, 180, 139, 75, 83, 222, 173, 33, 32, 29, 224, 226, 196, 32, 60, 223, 113, 66, 143, 189, 142, 85, 114, 170, 164, 3, 92, 252, 48, 183, 208, 72, 57, 213, 199, 42, 245, 40, 11, 215, 68, 172, 188, 83, 74, 192, 182, 63, 17, 235, 239, 20, 104, 36, 152, 219, 240, 101, 16, 154, 47, 79, 209, 176, 174, 114, 82, 141, 41, 59, 215, 196, 33, 110, 223, 145, 102, 107, 154, 148, 195, 207, 61, 48, 206, 166, 50, 89, 204, 237, 52, 18, 236, 10, 20, 245, 15, 50, 241, 205, 28, 194, 228, 61, 48, 183, 208, 72, 204, 237, 52, 18, 227, 236, 29, 19, 45, 173, 211, 82, 124, 25, 132, 230, 200, 146, 56, 109, 82, 141, 174, 114, 13, 144, 243, 111, 105, 140, 151, 115, 239, 20, 17, 235, 40, 234, 216, 21, 59, 142, 197, 113, 10, 111, 246, 144, 40, 234, 216, 21, 30, 100, 226, 155, 16, 154, 240, 101, 216, 21, 40, 234, 113, 66, 143, 189, 192, 197, 64, 58, 58, 38, 198, 217, 116, 76, 140, 179, 44, 69, 212, 
186, 36, 143, 220, 112, 165, 84, 91, 171, 2, 185, 254, 70, 155, 229, 101, 26, 89, 242, 167, 13, 214, 92, 42, 163, 222, 41, 34, 214, 231, 71, 25, 184, 200, 146, 56, 109, 40, 234, 216, 21, 101, 26, 155, 229, 161, 249, 95, 6, 93, 77, 163, 178, 131, 149, 125, 106, 171, 104, 85, 151, 164, 3, 92, 252, 149, 209, 107, 46, 148, 105, 108, 150, 167, 13, 89, 242, 198, 217, 58, 38, 229, 165, 27, 90, 47, 79, 209, 176, 171, 104, 85, 151, 108, 150, 148, 105, 144, 14, 112, 241, 153, 44, 103, 211, 225, 51, 31, 204, 164, 3, 92, 252, 212, 186, 44, 69, 186, 177, 70, 78, 144, 14, 112, 241, 93, 77, 163, 178, 84, 47, 172, 208, 160, 168, 96, 87, 151, 115, 105, 140, 180, 157, 76, 98, 170, 23, 86, 232, 125, 106, 131, 149};
.const .align 4 .b8 SIMD_Q_128[1024] = {6, 0, 0, 0, 110, 0, 0, 0, 197, 255, 255, 255, 226, 255, 255, 255, 45, 0, 0, 0, 48, 0, 0, 0, 239, 255, 255, 255, 161, 255, 255, 255, 28, 0, 0, 0, 166, 255, 255, 255, 161, 255, 255, 255, 26, 0, 0, 0, 100, 0, 0, 0, 135, 255, 255, 255, 174, 255, 255, 255, 13, 0, 0, 0, 105, 0, 0, 0, 29, 0, 0, 0, 76, 0, 0, 0, 155, 255, 255, 255, 65, 0, 0, 0, 200, 255, 255, 255, 12, 0, 0, 0, 200, 255, 255, 255, 15, 0, 0, 0, 98, 0, 0, 0, 1, 0, 0, 0, 79, 0, 0, 0, 128, 255, 255, 255, 255, 255, 255, 255, 248, 255, 255, 255, 61, 0, 0, 0, 204, 255, 255, 255, 87, 0, 0, 0, 89, 0, 0, 0, 187, 255, 255, 255, 217, 255, 255, 255, 233, 255, 255, 255, 221, 255, 255, 255, 16, 0, 0, 0, 8, 0, 0, 0, 18, 0, 0, 0, 138, 255, 255, 255, 160, 255, 255, 255, 187, 255, 255, 255, 151, 255, 255, 255, 117, 0, 0, 0, 154, 255, 255, 255, 128, 0, 0, 0, 187, 255, 255, 255, 254, 255, 255, 255, 28, 0, 0, 0, 91, 0, 0, 0, 243, 255, 255, 255, 83, 0, 0, 0, 199, 255, 255, 255, 53, 0, 0, 0, 68, 0, 0, 0, 174, 255, 255, 255, 17, 0, 0, 0, 159, 255, 255, 255, 80, 0, 0, 0, 210, 255, 255, 255, 215, 255, 255, 255, 64, 0, 0, 0, 141, 255, 255, 255, 32, 0, 0, 0, 39, 0, 0, 0, 249, 255, 255, 255, 229, 255, 255, 255, 184, 255, 255, 255, 239, 255, 255, 255, 2, 0, 0, 0, 134, 255, 255, 255, 4, 0, 0, 0, 52, 0, 0, 0, 93, 0, 0, 0, 170, 255, 255, 255, 62, 0, 0, 0, 66, 0, 0, 0, 122, 0, 0, 0, 168, 255, 255, 255, 161, 255, 255, 255, 57, 0, 0, 0, 34, 0, 0, 0, 120, 0, 0, 0, 35, 0, 0, 0, 240, 255, 255, 255, 17, 0, 0, 0, 170, 255, 255, 255, 7, 0, 0, 0, 54, 0, 0, 0, 153, 255, 255, 255, 137, 255, 255, 255, 45, 0, 0, 0, 133, 255, 255, 255, 187, 255, 255, 255, 88, 0, 0, 0, 126, 0, 0, 0, 118, 0, 0, 0, 157, 255, 255, 255, 139, 255, 255, 255, 4, 0, 0, 0, 181, 255, 255, 255, 144, 255, 255, 255, 231, 255, 255, 255, 35, 0, 0, 0, 171, 255, 255, 255, 253, 255, 255, 255, 195, 255, 255, 255, 31, 0, 0, 0, 3, 0, 0, 0, 244, 255, 255, 255, 43, 0, 0, 0, 90, 0, 0, 0, 31, 0, 0, 0, 48, 0, 0, 0, 46, 0, 0, 0, 68, 0, 0, 0, 79, 0, 0, 0, 213, 
255, 255, 255, 32, 0, 0, 0, 35, 0, 0, 0, 98, 0, 0, 0, 154, 255, 255, 255, 161, 255, 255, 255, 14, 0, 0, 0, 33, 0, 0, 0, 250, 255, 255, 255, 146, 255, 255, 255, 59, 0, 0, 0, 30, 0, 0, 0, 211, 255, 255, 255, 208, 255, 255, 255, 17, 0, 0, 0, 95, 0, 0, 0, 228, 255, 255, 255, 90, 0, 0, 0, 95, 0, 0, 0, 230, 255, 255, 255, 156, 255, 255, 255, 121, 0, 0, 0, 82, 0, 0, 0, 243, 255, 255, 255, 151, 255, 255, 255, 227, 255, 255, 255, 180, 255, 255, 255, 101, 0, 0, 0, 191, 255, 255, 255, 56, 0, 0, 0, 244, 255, 255, 255, 56, 0, 0, 0, 241, 255, 255, 255, 158, 255, 255, 255, 255, 255, 255, 255, 177, 255, 255, 255, 128, 0, 0, 0, 1, 0, 0, 0, 8, 0, 0, 0, 195, 255, 255, 255, 52, 0, 0, 0, 169, 255, 255, 255, 167, 255, 255, 255, 69, 0, 0, 0, 39, 0, 0, 0, 23, 0, 0, 0, 35, 0, 0, 0, 240, 255, 255, 255, 248, 255, 255, 255, 238, 255, 255, 255, 118, 0, 0, 0, 96, 0, 0, 0, 69, 0, 0, 0, 105, 0, 0, 0, 139, 255, 255, 255, 102, 0, 0, 0, 128, 255, 255, 255, 69, 0, 0, 0, 2, 0, 0, 0, 228, 255, 255, 255, 165, 255, 255, 255, 13, 0, 0, 0, 173, 255, 255, 255, 57, 0, 0, 0, 203, 255, 255, 255, 188, 255, 255, 255, 82, 0, 0, 0, 239, 255, 255, 255, 97, 0, 0, 0, 176, 255, 255, 255, 46, 0, 0, 0, 41, 0, 0, 0, 192, 255, 255, 255, 115, 0, 0, 0, 224, 255, 255, 255, 217, 255, 255, 255, 7, 0, 0, 0, 27, 0, 0, 0, 72, 0, 0, 0, 17, 0, 0, 0, 254, 255, 255, 255, 122, 0, 0, 0, 252, 255, 255, 255, 204, 255, 255, 255, 163, 255, 255, 255, 86, 0, 0, 0, 194, 255, 255, 255, 190, 255, 255, 255, 134, 255, 255, 255, 88, 0, 0, 0, 95, 0, 0, 0, 199, 255, 255, 255, 222, 255, 255, 255, 136, 255, 255, 255, 221, 255, 255, 255, 16, 0, 0, 0, 239, 255, 255, 255, 86, 0, 0, 0, 249, 255, 255, 255, 202, 255, 255, 255, 103, 0, 0, 0, 119, 0, 0, 0, 211, 255, 255, 255, 123, 0, 0, 0, 69, 0, 0, 0, 168, 255, 255, 255, 130, 255, 255, 255, 138, 255, 255, 255, 99, 0, 0, 0, 117, 0, 0, 0, 252, 255, 255, 255, 75, 0, 0, 0, 112, 0, 0, 0, 25, 0, 0, 0, 221, 255, 255, 255, 85, 0, 0, 0, 3, 0, 0, 0, 61, 0, 0, 0, 225, 255, 255, 255, 253, 255, 255, 255, 12, 0, 0, 0, 
213, 255, 255, 255, 166, 255, 255, 255, 225, 255, 255, 255, 208, 255, 255, 255, 210, 255, 255, 255, 188, 255, 255, 255, 177, 255, 255, 255, 43, 0, 0, 0, 224, 255, 255, 255, 221, 255, 255, 255, 158, 255, 255, 255, 102, 0, 0, 0, 95, 0, 0, 0, 242, 255, 255, 255, 223, 255, 255, 255};
.entry _Z4k218jPjPKj(
.param .u32 _Z4k218jPjPKj_param_0,
.param .u32 _Z4k218jPjPKj_param_1,
.param .u32 _Z4k218jPjPKj_param_2
)
.maxntid 128, 1, 1
.minnctapersm 8
{
.reg .pred %p<555>;
.reg .b32 %r<3440>;
.shared .align 4 .b8 _ZZ4k218jPjPKjE12sharedMemory[16384];
ld.param.u32 %r254, [_Z4k218jPjPKj_param_0];
ld.param.u32 %r252, [_Z4k218jPjPKj_param_1];
ld.param.u32 %r253, [_Z4k218jPjPKj_param_2];
mov.u32 %r255, %ctaid.x;
mov.u32 %r256, %ntid.x;
mov.u32 %r1, %tid.x;
mad.lo.s32 %r257, %r255, %r256, %r1;
shr.u32 %r3391, %r257, 3;
setp.ge.u32 %p26, %r3391, %r254;
@%p26 bra BB0_68;
shr.u32 %r3, %r1, 3;
and.b32 %r4, %r1, 7;
setp.eq.s32 %p27, %r253, 0;
@%p27 bra BB0_3;
shl.b32 %r260, %r3391, 6;
add.s32 %r261, %r260, %r253;
add.s32 %r259, %r261, 36;
ld.global.nc.u32 %r3391, [%r259];
BB0_3:
cvta.to.global.u32 %r7, %r252;
shl.b32 %r298, %r3391, 4;
add.s32 %r8, %r298, %r4;
shl.b32 %r299, %r8, 2;
add.s32 %r263, %r252, %r299;
ld.global.nc.u32 %r262, [%r263];
add.s32 %r10, %r4, 8;
add.s32 %r265, %r263, 32;
ld.global.nc.u32 %r264, [%r265];
add.s32 %r300, %r1, 2;
and.b32 %r11, %r300, 4;
mov.u32 %r301, 6175;
mov.u32 %r296, 0;
mov.u32 %r302, -1;
shfl.sync.idx.b32 %r303|%p28, %r262, %r296, %r301, %r302;
mov.u32 %r304, 1;
shfl.sync.idx.b32 %r305|%p29, %r262, %r304, %r301, %r302;
prmt.b32 %r267, %r303, %r305, %r4;
mov.u32 %r297, 8;
bfe.u32 %r266, %r267, %r296, %r297;
mov.u32 %r306, 2;
shfl.sync.idx.b32 %r307|%p30, %r262, %r306, %r301, %r302;
mov.u32 %r308, 3;
shfl.sync.idx.b32 %r309|%p31, %r262, %r308, %r301, %r302;
prmt.b32 %r271, %r307, %r309, %r4;
bfe.u32 %r270, %r271, %r296, %r297;
mov.u32 %r310, 4;
shfl.sync.idx.b32 %r311|%p32, %r262, %r310, %r301, %r302;
mov.u32 %r312, 5;
shfl.sync.idx.b32 %r313|%p33, %r262, %r312, %r301, %r302;
prmt.b32 %r275, %r311, %r313, %r4;
bfe.u32 %r274, %r275, %r296, %r297;
mov.u32 %r314, 6;
shfl.sync.idx.b32 %r315|%p34, %r262, %r314, %r301, %r302;
mov.u32 %r316, 7;
shfl.sync.idx.b32 %r317|%p35, %r262, %r316, %r301, %r302;
prmt.b32 %r279, %r315, %r317, %r4;
bfe.u32 %r278, %r279, %r296, %r297;
shfl.sync.idx.b32 %r318|%p36, %r264, %r296, %r301, %r302;
shfl.sync.idx.b32 %r319|%p37, %r264, %r304, %r301, %r302;
prmt.b32 %r283, %r318, %r319, %r4;
bfe.u32 %r282, %r283, %r296, %r297;
shfl.sync.idx.b32 %r320|%p38, %r264, %r306, %r301, %r302;
shfl.sync.idx.b32 %r321|%p39, %r264, %r308, %r301, %r302;
prmt.b32 %r287, %r320, %r321, %r4;
bfe.u32 %r286, %r287, %r296, %r297;
shfl.sync.idx.b32 %r322|%p40, %r264, %r310, %r301, %r302;
shfl.sync.idx.b32 %r323|%p41, %r264, %r312, %r301, %r302;
prmt.b32 %r291, %r322, %r323, %r4;
bfe.u32 %r290, %r291, %r296, %r297;
shfl.sync.idx.b32 %r324|%p42, %r264, %r314, %r301, %r302;
shfl.sync.idx.b32 %r325|%p43, %r264, %r316, %r301, %r302;
prmt.b32 %r295, %r324, %r325, %r4;
bfe.u32 %r294, %r295, %r296, %r297;
shl.b32 %r326, %r4, 1;
mov.u32 %r327, c_FFT256_2_128_Twiddle;
add.s32 %r328, %r327, %r326;
ld.const.s16 %r329, [%r328];
mul.lo.s32 %r330, %r329, %r266;
and.b32 %r331, %r330, 255;
shr.s32 %r332, %r330, 8;
sub.s32 %r333, %r331, %r332;
ld.const.s16 %r334, [%r328+16];
mul.lo.s32 %r335, %r334, %r270;
and.b32 %r336, %r335, 255;
shr.s32 %r337, %r335, 8;
sub.s32 %r338, %r336, %r337;
ld.const.s16 %r339, [%r328+32];
mul.lo.s32 %r340, %r339, %r274;
and.b32 %r341, %r340, 255;
shr.s32 %r342, %r340, 8;
sub.s32 %r343, %r341, %r342;
ld.const.s16 %r344, [%r328+48];
mul.lo.s32 %r345, %r344, %r278;
and.b32 %r346, %r345, 255;
shr.s32 %r347, %r345, 8;
sub.s32 %r348, %r346, %r347;
ld.const.s16 %r349, [%r328+64];
mul.lo.s32 %r350, %r349, %r282;
and.b32 %r351, %r350, 255;
shr.s32 %r352, %r350, 8;
sub.s32 %r353, %r351, %r352;
ld.const.s16 %r354, [%r328+80];
mul.lo.s32 %r355, %r354, %r286;
and.b32 %r356, %r355, 255;
shr.s32 %r357, %r355, 8;
sub.s32 %r358, %r356, %r357;
ld.const.s16 %r359, [%r328+96];
mul.lo.s32 %r360, %r359, %r290;
and.b32 %r361, %r360, 255;
shr.s32 %r362, %r360, 8;
sub.s32 %r363, %r361, %r362;
ld.const.s16 %r364, [%r328+112];
mul.lo.s32 %r365, %r364, %r294;
and.b32 %r366, %r365, 255;
shr.s32 %r367, %r365, 8;
sub.s32 %r368, %r366, %r367;
setp.eq.s32 %p44, %r4, 7;
selp.b32 %r369, 163, 0, %p44;
selp.u32 %r370, 1, 0, %p44;
shl.b32 %r371, %r274, 2;
shl.b32 %r372, %r282, 4;
shl.b32 %r373, %r290, 6;
and.b32 %r374, %r372, 240;
bfe.s32 %r375, %r282, 4, 24;
sub.s32 %r376, %r374, %r375;
and.b32 %r377, %r373, 192;
bfe.s32 %r378, %r290, 2, 24;
sub.s32 %r379, %r377, %r378;
add.s32 %r380, %r282, %r266;
sub.s32 %r381, %r266, %r282;
add.s32 %r382, %r376, %r266;
sub.s32 %r383, %r266, %r376;
add.s32 %r384, %r290, %r274;
sub.s32 %r385, %r274, %r290;
shl.b32 %r386, %r385, 4;
add.s32 %r387, %r379, %r371;
sub.s32 %r388, %r371, %r379;
shl.b32 %r389, %r388, 4;
and.b32 %r390, %r389, 240;
bfe.s32 %r391, %r388, 4, 24;
sub.s32 %r392, %r390, %r391;
add.s32 %r393, %r384, %r380;
sub.s32 %r394, %r380, %r384;
add.s32 %r395, %r386, %r381;
sub.s32 %r396, %r381, %r386;
add.s32 %r397, %r387, %r382;
sub.s32 %r398, %r382, %r387;
add.s32 %r399, %r392, %r383;
sub.s32 %r400, %r383, %r392;
and.b32 %r401, %r393, 255;
shr.s32 %r402, %r393, 8;
sub.s32 %r403, %r401, %r402;
and.b32 %r404, %r394, 255;
shr.s32 %r405, %r394, 8;
sub.s32 %r406, %r404, %r405;
and.b32 %r407, %r395, 255;
shr.s32 %r408, %r395, 8;
sub.s32 %r409, %r407, %r408;
and.b32 %r410, %r396, 255;
shr.s32 %r411, %r396, 8;
sub.s32 %r412, %r410, %r411;
and.b32 %r413, %r397, 255;
shr.s32 %r414, %r397, 8;
sub.s32 %r415, %r413, %r414;
and.b32 %r416, %r398, 255;
shr.s32 %r417, %r398, 8;
sub.s32 %r418, %r416, %r417;
and.b32 %r419, %r399, 255;
shr.s32 %r420, %r399, 8;
sub.s32 %r421, %r419, %r420;
and.b32 %r422, %r400, 255;
shr.s32 %r423, %r400, 8;
sub.s32 %r424, %r422, %r423;
setp.lt.s32 %p45, %r403, 129;
add.s32 %r425, %r403, -257;
selp.b32 %r426, %r403, %r425, %p45;
setp.lt.s32 %p46, %r406, 129;
add.s32 %r427, %r406, -257;
selp.b32 %r428, %r406, %r427, %p46;
setp.lt.s32 %p47, %r409, 129;
add.s32 %r429, %r409, -257;
selp.b32 %r430, %r409, %r429, %p47;
setp.lt.s32 %p48, %r412, 129;
add.s32 %r431, %r412, -257;
selp.b32 %r432, %r412, %r431, %p48;
setp.lt.s32 %p49, %r415, 129;
add.s32 %r433, %r415, -257;
selp.b32 %r434, %r415, %r433, %p49;
setp.lt.s32 %p50, %r418, 129;
add.s32 %r435, %r418, -257;
selp.b32 %r436, %r418, %r435, %p50;
setp.lt.s32 %p51, %r421, 129;
add.s32 %r437, %r421, -257;
selp.b32 %r438, %r421, %r437, %p51;
setp.lt.s32 %p52, %r424, 129;
add.s32 %r439, %r424, -257;
selp.b32 %r440, %r424, %r439, %p52;
shl.b32 %r441, %r343, 2;
shl.b32 %r442, %r353, 4;
shl.b32 %r443, %r363, 6;
and.b32 %r444, %r442, 240;
shr.s32 %r445, %r353, 4;
sub.s32 %r446, %r444, %r445;
and.b32 %r447, %r443, 192;
shr.s32 %r448, %r363, 2;
sub.s32 %r449, %r447, %r448;
add.s32 %r450, %r353, %r333;
sub.s32 %r451, %r333, %r353;
add.s32 %r452, %r446, %r333;
sub.s32 %r453, %r333, %r446;
add.s32 %r454, %r363, %r343;
sub.s32 %r455, %r343, %r363;
shl.b32 %r456, %r455, 4;
add.s32 %r457, %r449, %r441;
sub.s32 %r458, %r441, %r449;
shl.b32 %r459, %r458, 4;
and.b32 %r460, %r459, 240;
shr.s32 %r461, %r458, 4;
sub.s32 %r462, %r460, %r461;
add.s32 %r463, %r454, %r450;
sub.s32 %r464, %r450, %r454;
add.s32 %r465, %r456, %r451;
sub.s32 %r466, %r451, %r456;
add.s32 %r467, %r457, %r452;
sub.s32 %r468, %r452, %r457;
add.s32 %r469, %r462, %r453;
sub.s32 %r470, %r453, %r462;
and.b32 %r471, %r463, 255;
shr.s32 %r472, %r463, 8;
sub.s32 %r473, %r471, %r472;
and.b32 %r474, %r464, 255;
shr.s32 %r475, %r464, 8;
sub.s32 %r476, %r474, %r475;
and.b32 %r477, %r465, 255;
shr.s32 %r478, %r465, 8;
sub.s32 %r479, %r477, %r478;
and.b32 %r480, %r466, 255;
shr.s32 %r481, %r466, 8;
sub.s32 %r482, %r480, %r481;
and.b32 %r483, %r467, 255;
shr.s32 %r484, %r467, 8;
sub.s32 %r485, %r483, %r484;
and.b32 %r486, %r468, 255;
shr.s32 %r487, %r468, 8;
sub.s32 %r488, %r486, %r487;
and.b32 %r489, %r469, 255;
shr.s32 %r490, %r469, 8;
(...)
mov.u32 %r1668, _ZZ4k218jPjPKjE12sharedMemory;
add.s32 %r1669, %r1668, %r1667;
shl.b32 %r1670, %r4, 4;
add.s32 %r1671, %r1669, %r1670;
st.shared.u32 [%r1671], %r1666;
shfl.sync.up.b32 %r1672|%p370, %r162, %r304, %r1653, %r302;
selp.b32 %r1673, %r144, %r1672, %p92;
shfl.sync.up.b32 %r1674|%p371, %r154, %r304, %r1653, %r302;
selp.b32 %r1675, %r136, %r1674, %p92;
mul.lo.s32 %r1676, %r1673, 185;
mul.lo.s32 %r1677, %r1675, 185;
prmt.b32 %r1678, %r1676, %r1677, %r1661;
shfl.sync.idx.b32 %r1679|%p372, %r1678, %r1665, %r301, %r302;
st.shared.u32 [%r1671+4], %r1679;
shfl.sync.up.b32 %r1680|%p373, %r126, %r304, %r1653, %r302;
selp.b32 %r1681, %r108, %r1680, %p92;
shfl.sync.up.b32 %r1682|%p374, %r118, %r304, %r1653, %r302;
selp.b32 %r1683, %r100, %r1682, %p92;
mul.lo.s32 %r1684, %r1681, 185;
mul.lo.s32 %r1685, %r1683, 185;
prmt.b32 %r1686, %r1684, %r1685, %r1661;
shfl.sync.idx.b32 %r1687|%p375, %r1686, %r1665, %r301, %r302;
st.shared.u32 [%r1671+8], %r1687;
shfl.sync.up.b32 %r1688|%p376, %r1613, %r304, %r1653, %r302;
selp.b32 %r1689, %r180, %r1688, %p92;
shfl.sync.up.b32 %r1690|%p377, %r190, %r304, %r1653, %r302;
selp.b32 %r1691, %r172, %r1690, %p92;
mul.lo.s32 %r1692, %r1689, 185;
mul.lo.s32 %r1693, %r1691, 185;
prmt.b32 %r1694, %r1692, %r1693, %r1661;
shfl.sync.idx.b32 %r1695|%p378, %r1694, %r1665, %r301, %r302;
st.shared.u32 [%r1671+12], %r1695;
shfl.sync.up.b32 %r1696|%p379, %r91, %r304, %r1653, %r302;
selp.b32 %r1697, %r73, %r1696, %p92;
shfl.sync.up.b32 %r1698|%p380, %r3433, %r304, %r1653, %r302;
selp.b32 %r1699, %r3432, %r1698, %p92;
mul.lo.s32 %r1700, %r1697, 185;
mul.lo.s32 %r1701, %r1699, 185;
prmt.b32 %r1702, %r1700, %r1701, %r1661;
ld.const.u8 %r1703, [%r1664+8];
shfl.sync.idx.b32 %r1704|%p381, %r1702, %r1703, %r301, %r302;
st.shared.u32 [%r1671+128], %r1704;
shfl.sync.up.b32 %r1705|%p382, %r163, %r304, %r1653, %r302;
selp.b32 %r1706, %r145, %r1705, %p92;
shfl.sync.up.b32 %r1707|%p383, %r3435, %r304, %r1653, %r302;
selp.b32 %r1708, %r3434, %r1707, %p92;
mul.lo.s32 %r1709, %r1706, 185;
mul.lo.s32 %r1710, %r1708, 185;
prmt.b32 %r1711, %r1709, %r1710, %r1661;
shfl.sync.idx.b32 %r1712|%p384, %r1711, %r1703, %r301, %r302;
st.shared.u32 [%r1671+132], %r1712;
shfl.sync.up.b32 %r1713|%p385, %r127, %r304, %r1653, %r302;
selp.b32 %r1714, %r109, %r1713, %p92;
shfl.sync.up.b32 %r1715|%p386, %r3437, %r304, %r1653, %r302;
selp.b32 %r1716, %r3436, %r1715, %p92;
mul.lo.s32 %r1717, %r1714, 185;
mul.lo.s32 %r1718, %r1716, 185;
prmt.b32 %r1719, %r1717, %r1718, %r1661;
shfl.sync.idx.b32 %r1720|%p387, %r1719, %r1703, %r301, %r302;
st.shared.u32 [%r1671+136], %r1720;
shfl.sync.up.b32 %r1721|%p388, %r1618, %r304, %r1653, %r302;
(...)
ld.shared.u32 %r3171, [%r3170];
shf.l.wrap.b32 %r2054, %r2059, %r2059, %r308;
shfl.sync.bfly.b32 %r3172|%p482, %r2054, %r304, %r301, %r302;
add.s32 %r3173, %r3169, %r3171;
lop3.b32 %r2058, %r2059, %r2077, %r2061, 0xCA;
add.s32 %r2064, %r3173, %r2058;
mov.u32 %r2649, 23;
shf.l.wrap.b32 %r2062, %r2064, %r2064, %r2649;
add.s32 %r2075, %r2062, %r3172;
add.s32 %r3174, %r1669, %r3162;
ld.shared.u32 %r3175, [%r3174];
shf.l.wrap.b32 %r2066, %r2075, %r2075, %r2649;
shfl.sync.bfly.b32 %r3177|%p483, %r2066, %r314, %r301, %r302;
add.s32 %r3178, %r2061, %r3175;
lop3.b32 %r2070, %r2075, %r2054, %r2077, 0xCA;
add.s32 %r2079, %r3178, %r2070;
lop3.b32 %r2074, %r2075, %r2054, %r2077, 0xCA;
add.s32 %r2080, %r3178, %r2074;
mov.u32 %r2665, 17;
shf.l.wrap.b32 %r2078, %r2079, %r2080, %r2665;
add.s32 %r2091, %r2078, %r3177;
add.s32 %r3179, %r1669, %r3165;
ld.shared.u32 %r3180, [%r3179];
shf.l.wrap.b32 %r2082, %r2091, %r2091, %r2665;
shfl.sync.bfly.b32 %r3182|%p484, %r2082, %r306, %r301, %r302;
add.s32 %r3183, %r2077, %r3180;
lop3.b32 %r2086, %r2091, %r2066, %r2054, 0xCA;
add.s32 %r2095, %r3183, %r2086;
lop3.b32 %r2090, %r2091, %r2066, %r2054, 0xCA;
add.s32 %r2096, %r3183, %r2090;
mov.u32 %r2681, 27;
shf.l.wrap.b32 %r2094, %r2095, %r2096, %r2681;
add.s32 %r2103, %r2094, %r3182;
add.s32 %r3184, %r1669, %r3167;
ld.shared.u32 %r3185, [%r3184];
shf.l.wrap.b32 %r2098, %r2103, %r2103, %r2681;
shfl.sync.bfly.b32 %r3186|%p485, %r2098, %r308, %r301, %r302;
add.s32 %r3187, %r2054, %r3185;
lop3.b32 %r2102, %r2103, %r2082, %r2066, 0xCA;
add.s32 %r2108, %r3187, %r2102;
shf.l.wrap.b32 %r2106, %r2108, %r2108, %r308;
add.s32 %r2115, %r2106, %r3186;
ld.shared.u32 %r3188, [%r3174+96];
shf.l.wrap.b32 %r2110, %r2115, %r2115, %r308;
shfl.sync.bfly.b32 %r3189|%p486, %r2110, %r312, %r301, %r302;
add.s32 %r3190, %r2066, %r3188;
lop3.b32 %r2114, %r2115, %r2098, %r2082, 0xE8;
add.s32 %r2120, %r3190, %r2114;
shf.l.wrap.b32 %r2118, %r2120, %r2120, %r2649;
add.s32 %r2131, %r2118, %r3189;
ld.shared.u32 %r3191, [%r3174+128];
shf.l.wrap.b32 %r2122, %r2131, %r2131, %r2649;
shfl.sync.bfly.b32 %r3192|%p487, %r2122, %r316, %r301, %r302;
add.s32 %r3193, %r2082, %r3191;
lop3.b32 %r2126, %r2131, %r2110, %r2098, 0xE8;
add.s32 %r2135, %r3193, %r2126;
lop3.b32 %r2130, %r2131, %r2110, %r2098, 0xE8;
add.s32 %r2136, %r3193, %r2130;
shf.l.wrap.b32 %r2134, %r2135, %r2136, %r2665;
add.s32 %r2147, %r2134, %r3192;
ld.shared.u32 %r3194, [%r3174+160];
shf.l.wrap.b32 %r2138, %r2147, %r2147, %r2665;
shfl.sync.bfly.b32 %r3195|%p488, %r2138, %r310, %r301, %r302;
add.s32 %r3196, %r2098, %r3194;
lop3.b32 %r2142, %r2147, %r2122, %r2110, 0xE8;
add.s32 %r2151, %r3196, %r2142;
lop3.b32 %r2146, %r2147, %r2122, %r2110, 0xE8;
add.s32 %r2152, %r3196, %r2146;
shf.l.wrap.b32 %r2150, %r2151, %r2152, %r2681;
add.s32 %r2159, %r2150, %r3195;
ld.shared.u32 %r3197, [%r3174+192];
shf.l.wrap.b32 %r2154, %r2159, %r2159, %r2681;
(...)