//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-20633761
// Cuda compilation tools, release 7.5, V7.5.26
// Based on LLVM 3.4svn
//

.version 4.3
.target sm_20
.address_size 64

//----------------------------------------------------------------------
// interleave
//
//   interleave_param_0 : u64 address; receives one {f32, f32} pair per
//                        element (8 bytes per element)
//   interleave_param_1 : u64 address; f32 read per element, stored as
//                        the first lane of the pair
//   interleave_param_2 : u64 address; f32 read per element, stored as
//                        the second lane of the pair
//   interleave_param_3 : u32 element count; compared as signed 32-bit
//
// Every thread starts at ctaid.x * ntid.x + tid.x and advances by
// nctaid.x * ntid.x until its index reaches the count (grid-stride
// loop over the x dimension only).
//----------------------------------------------------------------------
	// .globl	interleave
.visible .entry interleave(
	.param .u64 interleave_param_0,
	.param .u64 interleave_param_1,
	.param .u64 interleave_param_2,
	.param .u32 interleave_param_3
)
{
	.reg .pred 	%skip, %again;
	.reg .f32 	%lane0, %lane1;
	.reg .b32 	%count, %nblk, %bdim, %blk, %thr, %step, %idx;
	.reg .b64 	%a_pairs, %a_first, %a_second;
	.reg .b64 	%g_pairs, %g_first, %g_second;
	.reg .b64 	%off4, %off8, %at_first, %at_second, %at_pair;

	// Fetch the kernel arguments.
	ld.param.u64 	%a_pairs, [interleave_param_0];
	ld.param.u64 	%a_first, [interleave_param_1];
	ld.param.u64 	%a_second, [interleave_param_2];
	ld.param.u32 	%count, [interleave_param_3];

	// The incoming pointers are generic addresses; convert them so the
	// loop body can use ld.global / st.global.
	cvta.to.global.u64 	%g_pairs, %a_pairs;
	cvta.to.global.u64 	%g_second, %a_second;
	cvta.to.global.u64 	%g_first, %a_first;

	// idx  = ctaid.x * ntid.x + tid.x   (first element for this thread)
	// step = nctaid.x * ntid.x          (threads in the whole grid, x only)
	mov.u32 	%bdim, %ntid.x;
	mov.u32 	%blk, %ctaid.x;
	mov.u32 	%thr, %tid.x;
	mad.lo.s32 	%idx, %blk, %bdim, %thr;
	mov.u32 	%nblk, %nctaid.x;
	mul.lo.s32 	%step, %nblk, %bdim;

	// Threads whose first index is already past the end do nothing.
	setp.ge.s32	%skip, %idx, %count;
	@%skip bra 	INTERLEAVE_EXIT;

INTERLEAVE_LOOP:
	// Byte offsets: sign-extended idx scaled by sizeof(f32) for the
	// two inputs and by 2 * sizeof(f32) for the paired output.
	mul.wide.s32 	%off4, %idx, 4;
	mul.wide.s32 	%off8, %idx, 8;
	add.s64 	%at_first, %g_first, %off4;
	add.s64 	%at_second, %g_second, %off4;
	add.s64 	%at_pair, %g_pairs, %off8;

	ld.global.f32 	%lane1, [%at_second];
	ld.global.f32 	%lane0, [%at_first];
	// Both lanes go out in a single 8-byte vector store.
	st.global.v2.f32 	[%at_pair], {%lane0, %lane1};

	// Advance by one grid width and repeat while still in range.
	add.s32 	%idx, %idx, %step;
	setp.lt.s32	%again, %idx, %count;
	@%again bra 	INTERLEAVE_LOOP;

INTERLEAVE_EXIT:
	ret;
}

//----------------------------------------------------------------------
// deinterleave
//
//   deinterleave_param_0 : u64 address; receives the first lane of each
//                          pair as an f32
//   deinterleave_param_1 : u64 address; receives the second lane of each
//                          pair as an f32
//   deinterleave_param_2 : u64 address; one {f32, f32} pair is read per
//                          element (8 bytes per element)
//   deinterleave_param_3 : u32 element count; compared as signed 32-bit
//
// Uses the same indexing scheme as interleave: start at
// ctaid.x * ntid.x + tid.x, advance by nctaid.x * ntid.x.
//----------------------------------------------------------------------
	// .globl	deinterleave
.visible .entry deinterleave(
	.param .u64 deinterleave_param_0,
	.param .u64 deinterleave_param_1,
	.param .u64 deinterleave_param_2,
	.param .u32 deinterleave_param_3
)
{
	.reg .pred 	%skip, %again;
	.reg .f32 	%lane0, %lane1;
	.reg .b32 	%count, %nblk, %bdim, %blk, %thr, %step, %idx;
	.reg .b64 	%a_first, %a_second, %a_pairs;
	.reg .b64 	%g_first, %g_second, %g_pairs;
	.reg .b64 	%off4, %off8, %at_first, %at_second, %at_pair;

	// Fetch the kernel arguments.
	ld.param.u64 	%a_first, [deinterleave_param_0];
	ld.param.u64 	%a_second, [deinterleave_param_1];
	ld.param.u64 	%a_pairs, [deinterleave_param_2];
	ld.param.u32 	%count, [deinterleave_param_3];

	// Generic -> global address conversion for all three pointers.
	cvta.to.global.u64 	%g_second, %a_second;
	cvta.to.global.u64 	%g_first, %a_first;
	cvta.to.global.u64 	%g_pairs, %a_pairs;

	// idx  = ctaid.x * ntid.x + tid.x   (first element for this thread)
	// step = nctaid.x * ntid.x          (threads in the whole grid, x only)
	mov.u32 	%bdim, %ntid.x;
	mov.u32 	%blk, %ctaid.x;
	mov.u32 	%thr, %tid.x;
	mad.lo.s32 	%idx, %blk, %bdim, %thr;
	mov.u32 	%nblk, %nctaid.x;
	mul.lo.s32 	%step, %nblk, %bdim;

	// Threads whose first index is already past the end do nothing.
	setp.ge.s32	%skip, %idx, %count;
	@%skip bra 	DEINTERLEAVE_EXIT;

DEINTERLEAVE_LOOP:
	// Byte offsets: 8 per element on the paired input, 4 per element
	// on each of the two outputs.
	mul.wide.s32 	%off8, %idx, 8;
	mul.wide.s32 	%off4, %idx, 4;
	add.s64 	%at_pair, %g_pairs, %off8;
	add.s64 	%at_first, %g_first, %off4;
	add.s64 	%at_second, %g_second, %off4;

	// One 8-byte vector load brings in both lanes of the pair.
	ld.global.v2.f32 	{%lane0, %lane1}, [%at_pair];
	st.global.f32 	[%at_first], %lane0;
	st.global.f32 	[%at_second], %lane1;

	// Advance by one grid width and repeat while still in range.
	add.s32 	%idx, %idx, %step;
	setp.lt.s32	%again, %idx, %count;
	@%again bra 	DEINTERLEAVE_LOOP;

DEINTERLEAVE_EXIT:
	ret;
}