Oh ... there was someone faster than I was ^^
Sorry, but we could not resist analyzing what you did - or rather what I did, because comparing the code shows that GMiner's AMD kernel is just a lolMiner copycat - at least currently, and except for the combination round, which does indeed look a bit different.
So, dear community: if you want to support AMD development, pick the original at the lower fee. Thanks.
Edit: I realized that the screenshots above are not bad, but they only show a small piece of metadata from the top of the kernel.
This one is better, IMO (end of Equihash round 2, extracted on a Radeon VII):
lolMiner 0.7 a5
// Excerpt of an AMD GCN kernel disassembly; the trailing comment on each
// instruction is "address: encoding" as printed by the disassembler.
// What the excerpt shows: each lane atomically increments a dword counter
// selected by a 13-bit field of v7, XORs two pairs of LDS dwords, packs the
// results into v[6:9], and stores those 16 bytes only if the counter value it
// got back is below s3. The inputs (v0, v4, v6, v7, s2, s3, s4, s[18:19],
// s[20:21]) are set up outside this excerpt.
BB3_147:
// v3 = bits 12..24 of v7 (13-bit field); v5 = v3 * 4 (byte offset of a dword).
v_bfe_u32 v3, v7, 12, 13 // 000000011CD8: D1C80003 02351907
v_lshlrev_b32_e32 v5, 2, v3 // 000000011CE0: 240A0682
// v[8:9] = s[18:19] + v5, 64-bit add with the carry passed through vcc.
v_mov_b32_e32 v9, s19 // 000000011CE4: 7E120213
v_add_co_u32_e32 v8, vcc, s18, v5 // 000000011CE8: 32100A12
v_addc_co_u32_e32 v9, vcc, 0, v9, vcc // 000000011CEC: 38121280
// Atomically add 1 to the dword at v[8:9]; glc returns the pre-increment
// value into v5. NOTE(review): presumably a per-bucket fill counter - confirm.
v_mov_b32_e32 v12, 1 // 000000011CF0: 7E180281
global_atomic_add v5, v[8:9], v12, off glc // 000000011CF4: DD098000 057F0C08
// LDS byte addresses: v8 = s4 + v0 * 4, v4 = s4 + v4 * 4.
v_lshlrev_b32_e32 v8, 2, v0 // 000000011CFC: 24100082
v_lshlrev_b32_e32 v4, 2, v4 // 000000011D00: 24080882
v_add_u32_e32 v8, s4, v8 // 000000011D04: 68101004
v_add_u32_e32 v4, s4, v4 // 000000011D08: 68080804
// Each read fetches two dwords: one at the address and one 19 * 64 dwords
// beyond it. First element -> v[8:9], second element -> v[10:11].
ds_read2st64_b32 v[8:9], v8 offset1:19 // 000000011D0C: D8701300 08000008
ds_read2st64_b32 v[10:11], v4 offset1:19 // 000000011D14: D8701300 0A000004
// While the LDS reads are in flight: shift the 64-bit value v7:v6 left by 7.
// v6 = (v6 << 7) | (v7 >> 25); v7 keeps the 7 bits shifted out of the old v6.
v_lshrrev_b32_e32 v13, 25, v7 // 000000011D1C: 201A0E99
v_lshrrev_b32_e32 v7, 25, v6 // 000000011D20: 200E0C99
v_lshl_or_b32 v6, v6, 7, v13 // 000000011D24: D2000006 04350F06
// Wait for both LDS reads to land before using v[8:11].
s_waitcnt lgkmcnt(0) // 000000011D2C: BF8CC07F
// XOR the two elements dword by dword: v4 = first dwords, v8 = second dwords.
v_xor_b32_e32 v4, v10, v8 // 000000011D30: 2A08110A
v_xor_b32_e32 v8, v11, v9 // 000000011D34: 2A10130B
// v10 = top 7 bits of the first XOR; v7 = (first XOR << 7) | carried bits.
v_lshrrev_b32_e32 v10, 25, v4 // 000000011D38: 20140899
v_lshl_or_b32 v7, v4, 7, v7 // 000000011D3C: D2000007 041D0F04
// Drop the low 4 bits of each element's (un-XORed) second dword, then order
// the two results: v11 = larger, v4 = smaller.
// NOTE(review): the upper 28 bits look like a per-element index - confirm.
v_lshrrev_b32_e32 v4, 4, v9 // 000000011D44: 20081284
v_lshrrev_b32_e32 v9, 4, v11 // 000000011D48: 20121684
v_max_u32_e32 v11, v4, v9 // 000000011D4C: 1E161304
v_lshrrev_b32_e32 v14, 16, v11 // 000000011D50: 201C1690
v_min_u32_e32 v4, v4, v9 // 000000011D54: 1C081304
// v9 = (larger << 16) | (smaller & s2).
// NOTE(review): s2 is presumably a 0xFFFF mask - confirm.
v_lshlrev_b32_e32 v9, 16, v11 // 000000011D58: 24121690
// v12 still holds the constant 1 from the atomic, so v12 = hi + 1, where
// hi = larger >> 16 (v14).
v_add_u32_e32 v12, v14, v12 // 000000011D5C: 6818190E
v_lshrrev_b32_e32 v13, 16, v4 // 000000011D60: 201A0890
v_and_or_b32 v9, v4, s2, v9 // 000000011D64: D2010009 04240504
// v4 = hi * (hi + 1) / 2 (24-bit multiply, then halve), i.e. a triangular
// number; adding lo = smaller >> 16 (v13) below gives one combined value for
// the ordered pair (hi, lo).
v_mul_u32_u24_e32 v4, v12, v14 // 000000011D6C: 10081D0C
v_and_b32_e32 v8, 15, v8 // 000000011D70: 2610108F
v_lshrrev_b32_e32 v4, 1, v4 // 000000011D74: 20080881
v_lshlrev_b32_e32 v8, 7, v8 // 000000011D78: 24101087
v_add_lshl_u32 v4, v4, v13, 12 // 000000011D7C: D1FE0004 02321B04
// v8 = (first XOR >> 25) | ((second XOR & 15) << 7) | ((tri + lo) << 12).
v_or3_b32 v8, v10, v8, v4 // 000000011D84: D2020008 0412110A
// Wait for the atomic's return value in v5, then keep only the lanes where
// s3 > v5 (unsigned); the other lanes are masked out of exec.
// NOTE(review): s3 is presumably the per-counter capacity - confirm.
s_waitcnt vmcnt(0) // 000000011D8C: BF8C0F70
v_cmp_gt_u32_e32 vcc, s3, v5 // 000000011D90: 7D980A03
s_and_b64 exec, exec, vcc // 000000011D94: 86FE6A7E
BB3_148:
// Slot index v3 = v3 * s3 + v5 (24-bit multiply-add), zero-extended to 64
// bits and scaled by 16 bytes per slot.
v_mad_u32_u24 v3, v3, s3, v5 // 000000011D98: D1C30003 04140703
v_mov_b32_e32 v4, 0 // 000000011DA0: 7E080280
v_lshlrev_b64 v[3:4], 4, v[3:4] // 000000011DA4: D28F0003 00020684
// v[3:4] = s[20:21] + byte offset, 64-bit add with the carry through vcc.
v_mov_b32_e32 v5, s21 // 000000011DAC: 7E0A0215
v_add_co_u32_e32 v3, vcc, s20, v3 // 000000011DB0: 32060614
v_addc_co_u32_e32 v4, vcc, v5, v4, vcc // 000000011DB4: 38080905
// Store the packed 16-byte result v[6:9] to global memory.
global_store_dwordx4 v[3:4], v[6:9], off // 000000011DB8: DC7C8000 007F0603
GMiner 1.34
// Excerpt of an AMD GCN kernel disassembly; the trailing comment on each
// instruction is "address: encoding" as printed by the disassembler.
// What the excerpt shows: each lane atomically increments a dword counter
// selected by a 13-bit field of v7, XORs two pairs of LDS dwords, packs the
// results into v[6:9], and stores those 16 bytes only if the counter value it
// got back is below s3. The inputs (v0, v4, v6, v7, s2, s3, s4, s[18:19],
// s[20:21]) are set up outside this excerpt.
BB3_147:
// v3 = bits 12..24 of v7 (13-bit field); v5 = v3 * 4 (byte offset of a dword).
v_bfe_u32 v3, v7, 12, 13 // 000000011CD8: D1C80003 02351907
v_lshlrev_b32_e32 v5, 2, v3 // 000000011CE0: 240A0682
// v[8:9] = s[18:19] + v5, 64-bit add with the carry passed through vcc.
v_mov_b32_e32 v9, s19 // 000000011CE4: 7E120213
v_add_co_u32_e32 v8, vcc, s18, v5 // 000000011CE8: 32100A12
v_addc_co_u32_e32 v9, vcc, 0, v9, vcc // 000000011CEC: 38121280
// Atomically add 1 to the dword at v[8:9]; glc returns the pre-increment
// value into v5. NOTE(review): presumably a per-bucket fill counter - confirm.
v_mov_b32_e32 v12, 1 // 000000011CF0: 7E180281
global_atomic_add v5, v[8:9], v12, off glc // 000000011CF4: DD098000 057F0C08
// LDS byte addresses: v8 = s4 + v0 * 4, v4 = s4 + v4 * 4.
v_lshlrev_b32_e32 v8, 2, v0 // 000000011CFC: 24100082
v_lshlrev_b32_e32 v4, 2, v4 // 000000011D00: 24080882
v_add_u32_e32 v8, s4, v8 // 000000011D04: 68101004
v_add_u32_e32 v4, s4, v4 // 000000011D08: 68080804
// Each read fetches two dwords: one at the address and one 19 * 64 dwords
// beyond it. First element -> v[8:9], second element -> v[10:11].
ds_read2st64_b32 v[8:9], v8 offset1:19 // 000000011D0C: D8701300 08000008
ds_read2st64_b32 v[10:11], v4 offset1:19 // 000000011D14: D8701300 0A000004
// While the LDS reads are in flight: shift the 64-bit value v7:v6 left by 7.
// v6 = (v6 << 7) | (v7 >> 25); v7 keeps the 7 bits shifted out of the old v6.
v_lshrrev_b32_e32 v13, 25, v7 // 000000011D1C: 201A0E99
v_lshrrev_b32_e32 v7, 25, v6 // 000000011D20: 200E0C99
v_lshl_or_b32 v6, v6, 7, v13 // 000000011D24: D2000006 04350F06
// Wait for both LDS reads to land before using v[8:11].
s_waitcnt lgkmcnt(0) // 000000011D2C: BF8CC07F
// XOR the two elements dword by dword: v4 = first dwords, v8 = second dwords.
v_xor_b32_e32 v4, v10, v8 // 000000011D30: 2A08110A
v_xor_b32_e32 v8, v11, v9 // 000000011D34: 2A10130B
// v10 = top 7 bits of the first XOR; v7 = (first XOR << 7) | carried bits.
v_lshrrev_b32_e32 v10, 25, v4 // 000000011D38: 20140899
v_lshl_or_b32 v7, v4, 7, v7 // 000000011D3C: D2000007 041D0F04
// Drop the low 4 bits of each element's (un-XORed) second dword, then order
// the two results: v11 = larger, v4 = smaller.
// NOTE(review): the upper 28 bits look like a per-element index - confirm.
v_lshrrev_b32_e32 v4, 4, v9 // 000000011D44: 20081284
v_lshrrev_b32_e32 v9, 4, v11 // 000000011D48: 20121684
v_max_u32_e32 v11, v4, v9 // 000000011D4C: 1E161304
v_lshrrev_b32_e32 v14, 16, v11 // 000000011D50: 201C1690
v_min_u32_e32 v4, v4, v9 // 000000011D54: 1C081304
// v9 = (larger << 16) | (smaller & s2).
// NOTE(review): s2 is presumably a 0xFFFF mask - confirm.
v_lshlrev_b32_e32 v9, 16, v11 // 000000011D58: 24121690
// v12 still holds the constant 1 from the atomic, so v12 = hi + 1, where
// hi = larger >> 16 (v14).
v_add_u32_e32 v12, v14, v12 // 000000011D5C: 6818190E
v_lshrrev_b32_e32 v13, 16, v4 // 000000011D60: 201A0890
v_and_or_b32 v9, v4, s2, v9 // 000000011D64: D2010009 04240504
// v4 = hi * (hi + 1) / 2 (24-bit multiply, then halve), i.e. a triangular
// number; adding lo = smaller >> 16 (v13) below gives one combined value for
// the ordered pair (hi, lo).
v_mul_u32_u24_e32 v4, v12, v14 // 000000011D6C: 10081D0C
v_and_b32_e32 v8, 15, v8 // 000000011D70: 2610108F
v_lshrrev_b32_e32 v4, 1, v4 // 000000011D74: 20080881
v_lshlrev_b32_e32 v8, 7, v8 // 000000011D78: 24101087
v_add_lshl_u32 v4, v4, v13, 12 // 000000011D7C: D1FE0004 02321B04
// v8 = (first XOR >> 25) | ((second XOR & 15) << 7) | ((tri + lo) << 12).
v_or3_b32 v8, v10, v8, v4 // 000000011D84: D2020008 0412110A
// Wait for the atomic's return value in v5, then keep only the lanes where
// s3 > v5 (unsigned); the other lanes are masked out of exec.
// NOTE(review): s3 is presumably the per-counter capacity - confirm.
s_waitcnt vmcnt(0) // 000000011D8C: BF8C0F70
v_cmp_gt_u32_e32 vcc, s3, v5 // 000000011D90: 7D980A03
s_and_b64 exec, exec, vcc // 000000011D94: 86FE6A7E
BB3_148:
// Slot index v3 = v3 * s3 + v5 (24-bit multiply-add), zero-extended to 64
// bits and scaled by 16 bytes per slot.
v_mad_u32_u24 v3, v3, s3, v5 // 000000011D98: D1C30003 04140703
v_mov_b32_e32 v4, 0 // 000000011DA0: 7E080280
v_lshlrev_b64 v[3:4], 4, v[3:4] // 000000011DA4: D28F0003 00020684
// v[3:4] = s[20:21] + byte offset, 64-bit add with the carry through vcc.
v_mov_b32_e32 v5, s21 // 000000011DAC: 7E0A0215
v_add_co_u32_e32 v3, vcc, s20, v3 // 000000011DB0: 32060614
v_addc_co_u32_e32 v4, vcc, v5, v4, vcc // 000000011DB4: 38080905
// Store the packed 16-byte result v[6:9] to global memory.
global_store_dwordx4 v[3:4], v[6:9], off // 000000011DB8: DC7C8000 007F0603
That code is from the part where the round results get combined and written back to memory. It's really surprising that this performs the same.