@@ -173,16 +173,16 @@ __global__ void __launch_bounds__(WARP_SIZE *kMmaTileSeqLenQ *kMmaTileSeqLenK)
173173 uint32_t smem_K_base_ptr = __cvta_generic_to_shared (K_tile_smem);
174174 uint32_t smem_V_base_ptr = __cvta_generic_to_shared (V_tile_smem);
175175
176- // --------------------- Registers/SMEM for thread block
177- // ------------------------- block m_old, l_old, store in lane, use float to
176+ // Registers/SMEM for thread block
177+ // block m_old, l_old, store in lane, use float to
178178 // keep precision.
179179 float lane_block_row_max_old[kWarpTileSeqLenQ ][2 ]; // [1][2]
180180 float lane_block_row_sum_old[kWarpTileSeqLenQ ][2 ]; // [1][2]
181181 fill_2D_regs<float , kWarpTileSeqLenQ , 2 >(lane_block_row_max_old, -INFINITY);
182182 fill_2D_regs<float , kWarpTileSeqLenQ , 2 >(lane_block_row_sum_old, 0 .0f );
183183
184- // ---------------------- Registers for S=Q@K^T/O=P@V
185- // ---------------------------- registers for QKV, S=Q[Br,d]@K[Bc,d]=[Br,Bc]
184+ // Registers for S=Q@K^T/O=P@V
185+ // registers for QKV, S=Q[Br,d]@K[Bc,d]=[Br,Bc]
186186 // and O=P[Br,Bc]@V[Bc,d]=[Br,d]. Allocate R_Q[(kHeadDim/kMmaAtomK)<=8][1][4],
187187 // e.g R_Q[4][1][4] 16 regs. By the way, we have to reduce R_Z to 0 regs and
188188 // reuse R_Q for collective store. Then we can load Q from smem only once and
0 commit comments