KappaCUDA

 view release on metacpan or  search on metacpan

cuda/matrixMul_kernel.ptx  view on Meta::CPAN

	.reg .f32 %f<37>;
	.reg .pred %p<4>;
	.shared .align 4 .b8 __cuda_Bs20[1024];
	.shared .align 4 .b8 __cuda_As1044[1024];
	.loc	28	41	0
$LBB1_matrixMul:
	.loc	28	72	0
	cvt.s32.u16 	%r1, %ctaid.x;
	mul24.lo.s32 	%r2, %r1, 16;
	cvt.s32.u16 	%r3, %ctaid.y;
	ld.param.s32 	%r4, [__cudaparm_matrixMul_wA];
	mul.lo.s32 	%r5, %r3, %r4;
	mul.lo.s32 	%r6, %r5, 16;
	add.s32 	%r7, %r6, %r4;
	sub.s32 	%r8, %r7, 1;
	cvt.s32.u16 	%r9, %tid.x;
	cvt.s32.u16 	%r10, %tid.y;
	ld.param.s32 	%r11, [__cudaparm_matrixMul_wB];
	setp.lt.s32 	%p1, %r8, %r6;
	@%p1 bra 	$Lt_0_3330;
	mov.u32 	%r12, __cuda_Bs20;
	mov.u32 	%r13, __cuda_As1044;
	ld.param.s32 	%r4, [__cudaparm_matrixMul_wA];
	add.s32 	%r14, %r4, 15;
	shr.s32 	%r15, %r14, 31;
	mov.s32 	%r16, 15;
	and.b32 	%r17, %r15, %r16;
	add.s32 	%r18, %r17, %r14;
	shr.s32 	%r19, %r18, 4;
	ld.param.s32 	%r11, [__cudaparm_matrixMul_wB];
	mul.lo.s32 	%r20, %r10, %r11;
	mul.lo.s32 	%r21, %r10, %r4;
	mul24.lo.u32 	%r22, %r10, 16;
	mul24.lo.u32 	%r23, %r10, 64;
	mul24.lo.u32 	%r24, %r9, 4;
	mul.lo.s32 	%r25, %r11, 16;
	add.s32 	%r26, %r20, %r2;
	add.s32 	%r27, %r21, %r6;
	add.u32 	%r28, %r9, %r22;
	add.u32 	%r29, %r23, %r13;
	add.u32 	%r30, %r24, %r12;
	add.s32 	%r31, %r26, %r9;
	add.s32 	%r32, %r27, %r9;
	mul.lo.u32 	%r33, %r28, 4;
	mul.lo.u32 	%r34, %r31, 4;
	mul.lo.u32 	%r35, %r25, 4;
	mul.lo.u32 	%r36, %r32, 4;
	add.u32 	%r37, %r33, %r13;
	add.u32 	%r38, %r33, %r12;
	add.s32 	%r39, %r21, %r8;
	ld.param.u32 	%r40, [__cudaparm_matrixMul_B];
	add.u32 	%r41, %r40, %r34;
	ld.param.u32 	%r42, [__cudaparm_matrixMul_A];
	add.u32 	%r43, %r36, %r42;
	add.s32 	%r44, %r39, %r9;
	mul.lo.u32 	%r45, %r44, 4;
	add.u32 	%r46, %r45, %r42;
	mov.f32 	%f1, 0f00000000;     	// 0
	mov.s32 	%r47, %r19;
$Lt_0_2818:
 //<loop> Loop body line 72, nesting depth: 1, estimated iterations: unknown
	.loc	28	87	0
	ld.global.f32 	%f2, [%r43+0];
	st.shared.f32 	[%r37+0], %f2;
	.loc	28	88	0
	ld.global.f32 	%f3, [%r41+0];
	st.shared.f32 	[%r38+0], %f3;
	.loc	28	91	0
	bar.sync 	0;
	.loc	28	97	0
	ld.shared.f32 	%f4, [%r29+0];
	ld.shared.f32 	%f5, [%r30+0];
	mad.f32 	%f1, %f4, %f5, %f1;
	ld.shared.f32 	%f6, [%r29+4];
	ld.shared.f32 	%f7, [%r30+64];
	mad.f32 	%f1, %f6, %f7, %f1;
	ld.shared.f32 	%f8, [%r29+8];
	ld.shared.f32 	%f9, [%r30+128];
	mad.f32 	%f1, %f8, %f9, %f1;
	ld.shared.f32 	%f10, [%r29+12];
	ld.shared.f32 	%f11, [%r30+192];
	mad.f32 	%f1, %f10, %f11, %f1;
	ld.shared.f32 	%f12, [%r29+16];
	ld.shared.f32 	%f13, [%r30+256];
	mad.f32 	%f1, %f12, %f13, %f1;
	ld.shared.f32 	%f14, [%r29+20];
	ld.shared.f32 	%f15, [%r30+320];
	mad.f32 	%f1, %f14, %f15, %f1;
	ld.shared.f32 	%f16, [%r29+24];
	ld.shared.f32 	%f17, [%r30+384];
	mad.f32 	%f1, %f16, %f17, %f1;
	ld.shared.f32 	%f18, [%r29+28];
	ld.shared.f32 	%f19, [%r30+448];
	mad.f32 	%f1, %f18, %f19, %f1;
	ld.shared.f32 	%f20, [%r29+32];
	ld.shared.f32 	%f21, [%r30+512];
	mad.f32 	%f1, %f20, %f21, %f1;
	ld.shared.f32 	%f22, [%r29+36];
	ld.shared.f32 	%f23, [%r30+576];
	mad.f32 	%f1, %f22, %f23, %f1;
	ld.shared.f32 	%f24, [%r29+40];
	ld.shared.f32 	%f25, [%r30+640];
	mad.f32 	%f1, %f24, %f25, %f1;
	ld.shared.f32 	%f26, [%r29+44];
	ld.shared.f32 	%f27, [%r30+704];
	mad.f32 	%f1, %f26, %f27, %f1;
	ld.shared.f32 	%f28, [%r29+48];
	ld.shared.f32 	%f29, [%r30+768];
	mad.f32 	%f1, %f28, %f29, %f1;
	ld.shared.f32 	%f30, [%r29+52];
	ld.shared.f32 	%f31, [%r30+832];
	mad.f32 	%f1, %f30, %f31, %f1;
	ld.shared.f32 	%f32, [%r29+56];
	ld.shared.f32 	%f33, [%r30+896];
	mad.f32 	%f1, %f32, %f33, %f1;
	ld.shared.f32 	%f34, [%r29+60];
	ld.shared.f32 	%f35, [%r30+960];
	mad.f32 	%f1, %f34, %f35, %f1;
	.loc	28	102	0
	bar.sync 	0;
	.loc	28	72	0



( run in 1.811 second using v1.01-cache-2.11-cpan-71847e10f99 )