// Multiply-accumulate one row of a multi-word product, 32 bits at a time.
// For each limb i in 0..9 the four instructions below compute the 64-bit value
//     r[i] + tasm + op1[i*op1_interleaved] * op2[0*op2_interleaved]
// and store its low 32 bits back into r[i] and its high 32 bits into tasm,
// which then acts as the carry word for the next limb. The sum cannot
// overflow 64 bits: (2^32-1) + (2^32-1) + (2^32-1)^2 == 2^64-1.
// After the last limb, tasm is added into c0 and the carry out of that
// addition is written to c1 (c1's previous value is discarded).
//
// NOTE(review): tasm's incoming value is set before this fragment and is not
// visible here -- confirm it holds the intended carry-in (or zero).
//
// Every statement is `asm volatile`. The carry flag set by add.cc / mad.lo.cc
// and consumed by madc.hi / addc is not an asm operand, so the compiler cannot
// see that dependency; without `volatile` it is allowed to move or delete
// these statements (e.g. hoist madc.hi, which only *writes* tasm, above the
// add.cc that must set its carry-in), silently corrupting the result.

// limb 0
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(r[0]) : "r"(tasm));             // r[0] += tasm, carry out -> CF
asm volatile( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[0*op1_interleaved]), "r"(op2[0*op2_interleaved]));   // tasm = hi(a*b) + CF (CF left unchanged)
asm volatile( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[0]) : "r"(op1[0*op1_interleaved]), "r"(op2[0*op2_interleaved])); // r[0] += lo(a*b), carry out -> CF
asm volatile( "addc.u32 %0, %0, 0;" : "+r"(tasm));                            // tasm += CF
// limb 1
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(r[1]) : "r"(tasm));
asm volatile( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[1*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[1]) : "r"(op1[1*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "addc.u32 %0, %0, 0;" : "+r"(tasm));
// limb 2
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(r[2]) : "r"(tasm));
asm volatile( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[2*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[2]) : "r"(op1[2*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "addc.u32 %0, %0, 0;" : "+r"(tasm));
// limb 3
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(r[3]) : "r"(tasm));
asm volatile( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[3*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[3]) : "r"(op1[3*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "addc.u32 %0, %0, 0;" : "+r"(tasm));
// limb 4
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(r[4]) : "r"(tasm));
asm volatile( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[4*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[4]) : "r"(op1[4*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "addc.u32 %0, %0, 0;" : "+r"(tasm));
// limb 5
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(r[5]) : "r"(tasm));
asm volatile( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[5*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[5]) : "r"(op1[5*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "addc.u32 %0, %0, 0;" : "+r"(tasm));
// limb 6
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(r[6]) : "r"(tasm));
asm volatile( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[6*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[6]) : "r"(op1[6*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "addc.u32 %0, %0, 0;" : "+r"(tasm));
// limb 7
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(r[7]) : "r"(tasm));
asm volatile( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[7*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[7]) : "r"(op1[7*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "addc.u32 %0, %0, 0;" : "+r"(tasm));
// limb 8
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(r[8]) : "r"(tasm));
asm volatile( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[8*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[8]) : "r"(op1[8*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "addc.u32 %0, %0, 0;" : "+r"(tasm));
// limb 9
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(r[9]) : "r"(tasm));
asm volatile( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[9*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[9]) : "r"(op1[9*op1_interleaved]), "r"(op2[0*op2_interleaved]));
asm volatile( "addc.u32 %0, %0, 0;" : "+r"(tasm));
// Fold the final carry word into c0 and capture the carry out of that add in c1.
asm volatile( "add.cc.u32 %0, %0, %1;" : "+r"(c0) : "r"(tasm) );              // c0 += tasm, carry out -> CF
asm volatile( "addc.u32 %0, 0, 0;" : "=r"(c1));                               // c1 = CF (0 or 1)
I don't understand why there are so many separate add/addc instructions in there, especially when the fancy mad.lo.cc.u32/mad.hi.cc.u32 instructions, which can chain the carry themselves, are available.