Skip to content

Commit 0600e4f

Browse files
agatti authored and dpgeorge committed
py/asmrv32: Make some code sequences smaller.
This commit changes a few code sequences to use more compressed opcodes where possible. The sequences in question are the ones that show up the most in the test suite and require the fewest code changes, namely short-offset loads from memory to RET/ARG registers, indirect calls through the function table, register-based jumps, locals' offset calculation, reg-is-null jumps, and register comparisons. These changes neither slow down nor speed up the generated code, but they reduce the generated code size by 15-20% on average.

Signed-off-by: Alessandro Gatti <[email protected]>
1 parent 0e26144 commit 0600e4f

File tree

2 files changed

+105
-23
lines changed

2 files changed

+105
-23
lines changed

py/asmrv32.c

Lines changed: 56 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,14 @@ static uint32_t fallback_popcount(uint32_t value) {
6464
#endif
6565
#endif
6666

67-
#define INTERNAL_TEMPORARY ASM_RV32_REG_T4
67+
#define INTERNAL_TEMPORARY ASM_RV32_REG_S0
6868
#define AVAILABLE_REGISTERS_COUNT 32
6969

70+
#define IS_IN_C_REGISTER_WINDOW(register_number) \
71+
(((register_number) >= ASM_RV32_REG_X8) && ((register_number) <= ASM_RV32_REG_X15))
72+
#define MAP_IN_C_REGISTER_WINDOW(register_number) \
73+
((register_number) - ASM_RV32_REG_X8)
74+
7075
#define FIT_UNSIGNED(value, bits) (((value) & ~((1U << (bits)) - 1)) == 0)
7176
#define FIT_SIGNED(value, bits) \
7277
((((value) & ~((1U << ((bits) - 1)) - 1)) == 0) || \
@@ -269,7 +274,7 @@ static void emit_function_epilogue(asm_rv32_t *state, mp_uint_t registers) {
269274

270275
void asm_rv32_entry(asm_rv32_t *state, mp_uint_t locals) {
271276
state->saved_registers_mask |= (1U << REG_FUN_TABLE) | (1U << REG_LOCAL_1) | \
272-
(1U << REG_LOCAL_2) | (1U << REG_LOCAL_3);
277+
(1U << REG_LOCAL_2) | (1U << REG_LOCAL_3) | (1U << INTERNAL_TEMPORARY);
273278
state->locals_count = locals;
274279
emit_function_prologue(state, state->saved_registers_mask);
275280
}
@@ -288,6 +293,14 @@ void asm_rv32_emit_call_ind(asm_rv32_t *state, mp_uint_t index) {
288293
mp_uint_t offset = index * ASM_WORD_SIZE;
289294
state->saved_registers_mask |= (1U << ASM_RV32_REG_RA);
290295

296+
if (IS_IN_C_REGISTER_WINDOW(REG_FUN_TABLE) && IS_IN_C_REGISTER_WINDOW(INTERNAL_TEMPORARY) && FIT_SIGNED(offset, 7)) {
297+
// c.lw temporary, offset(fun_table)
298+
// c.jalr temporary
299+
asm_rv32_opcode_clw(state, MAP_IN_C_REGISTER_WINDOW(INTERNAL_TEMPORARY), MAP_IN_C_REGISTER_WINDOW(REG_FUN_TABLE), offset);
300+
asm_rv32_opcode_cjalr(state, INTERNAL_TEMPORARY);
301+
return;
302+
}
303+
291304
if (FIT_UNSIGNED(offset, 11)) {
292305
// lw temporary, offset(fun_table)
293306
// c.jalr temporary
@@ -343,27 +356,43 @@ void asm_rv32_emit_jump_if_reg_eq(asm_rv32_t *state, mp_uint_t rs1, mp_uint_t rs
343356
void asm_rv32_emit_jump_if_reg_nonzero(asm_rv32_t *state, mp_uint_t rs, mp_uint_t label) {
344357
ptrdiff_t displacement = (ptrdiff_t)(state->base.label_offsets[label] - state->base.code_offset);
345358

359+
if (FIT_SIGNED(displacement, 9) && IS_IN_C_REGISTER_WINDOW(rs)) {
360+
// c.bnez rs', displacement
361+
asm_rv32_opcode_cbnez(state, MAP_IN_C_REGISTER_WINDOW(rs), displacement);
362+
return;
363+
}
364+
346365
// The least significant bit is ignored anyway.
347366
if (FIT_SIGNED(displacement, 13)) {
348367
// bne rs, zero, displacement
349368
asm_rv32_opcode_bne(state, rs, ASM_RV32_REG_ZERO, displacement);
350369
return;
351370
}
352371

353-
// Compensate for the initial BEQ opcode.
354-
displacement -= ASM_WORD_SIZE;
372+
// Compensate for the initial C.BEQZ/BEQ opcode.
373+
displacement -= IS_IN_C_REGISTER_WINDOW(rs) ? ASM_HALFWORD_SIZE : ASM_WORD_SIZE;
355374

356375
mp_uint_t upper = 0;
357376
mp_uint_t lower = 0;
358377
split_immediate(displacement, &upper, &lower);
359378

360379
// TODO: Can this clobber REG_TEMP[0:2]?
361380

362-
// beq rs1, zero, 12 ; PC + 0
363-
// auipc temporary, HI(displacement) ; PC + 4
364-
// jalr zero, temporary, LO(displacement) ; PC + 8
365-
// ... ; PC + 12
366-
asm_rv32_opcode_beq(state, rs, ASM_RV32_REG_ZERO, 12);
381+
// if rs1 in C window (the offset always fits):
382+
// c.beqz rs', 10 ; PC + 0
383+
// auipc temporary, HI(displacement) ; PC + 2
384+
// jalr zero, temporary, LO(displacement) ; PC + 6
385+
// ... ; PC + 10
386+
// else:
387+
// beq rs, zero, 12 ; PC + 0
388+
// auipc temporary, HI(displacement) ; PC + 4
389+
// jalr zero, temporary, LO(displacement) ; PC + 8
390+
// ... ; PC + 12
391+
if (IS_IN_C_REGISTER_WINDOW(rs)) {
392+
asm_rv32_opcode_cbeqz(state, MAP_IN_C_REGISTER_WINDOW(rs), 10);
393+
} else {
394+
asm_rv32_opcode_beq(state, rs, ASM_RV32_REG_ZERO, 12);
395+
}
367396
asm_rv32_opcode_auipc(state, INTERNAL_TEMPORARY, upper);
368397
asm_rv32_opcode_jalr(state, ASM_RV32_REG_ZERO, INTERNAL_TEMPORARY, lower);
369398
}
@@ -427,7 +456,13 @@ void asm_rv32_emit_mov_reg_local(asm_rv32_t *state, mp_uint_t rd, mp_uint_t loca
427456
void asm_rv32_emit_mov_reg_local_addr(asm_rv32_t *state, mp_uint_t rd, mp_uint_t local) {
428457
mp_uint_t offset = state->locals_stack_offset + (local * ASM_WORD_SIZE);
429458

430-
if (FIT_SIGNED(offset, 11)) {
459+
if (FIT_UNSIGNED(offset, 10) && offset != 0 && IS_IN_C_REGISTER_WINDOW(rd)) {
460+
// c.addi4spn rd', offset
461+
asm_rv32_opcode_caddi4spn(state, MAP_IN_C_REGISTER_WINDOW(rd), offset);
462+
return;
463+
}
464+
465+
if (FIT_UNSIGNED(offset, 11)) {
431466
// addi rd, sp, offset
432467
asm_rv32_opcode_addi(state, rd, ASM_RV32_REG_SP, offset);
433468
return;
@@ -442,6 +477,12 @@ void asm_rv32_emit_mov_reg_local_addr(asm_rv32_t *state, mp_uint_t rd, mp_uint_t
442477
void asm_rv32_emit_load_reg_reg_offset(asm_rv32_t *state, mp_uint_t rd, mp_uint_t rs, mp_int_t offset) {
443478
mp_int_t scaled_offset = offset * sizeof(ASM_WORD_SIZE);
444479

480+
if (IS_IN_C_REGISTER_WINDOW(rd) && IS_IN_C_REGISTER_WINDOW(rs) && FIT_SIGNED(offset, 7)) {
481+
// c.lw rd', offset(rs')
482+
asm_rv32_opcode_clw(state, MAP_IN_C_REGISTER_WINDOW(rd), MAP_IN_C_REGISTER_WINDOW(rs), scaled_offset);
483+
return;
484+
}
485+
445486
if (FIT_SIGNED(scaled_offset, 12)) {
446487
// lw rd, offset(rs)
447488
asm_rv32_opcode_lw(state, rd, rs, scaled_offset);
@@ -554,12 +595,12 @@ void asm_rv32_emit_optimised_xor(asm_rv32_t *state, mp_uint_t rd, mp_uint_t rs)
554595

555596
void asm_rv32_meta_comparison_eq(asm_rv32_t *state, mp_uint_t rs1, mp_uint_t rs2, mp_uint_t rd) {
556597
// c.li rd, 1 ;
557-
// beq rs1, rs2, 8 ; PC + 0
558-
// addi rd, zero, 0 ; PC + 4
559-
// ... ; PC + 8
598+
// beq rs1, rs2, 6 ; PC + 0
599+
// c.li rd, 0 ; PC + 4
600+
// ... ; PC + 6
560601
asm_rv32_opcode_cli(state, rd, 1);
561-
asm_rv32_opcode_beq(state, rs1, rs2, 8);
562-
asm_rv32_opcode_addi(state, rd, ASM_RV32_REG_ZERO, 0);
602+
asm_rv32_opcode_beq(state, rs1, rs2, 6);
603+
asm_rv32_opcode_cli(state, rd, 0);
563604
}
564605

565606
void asm_rv32_meta_comparison_ne(asm_rv32_t *state, mp_uint_t rs1, mp_uint_t rs2, mp_uint_t rd) {

py/asmrv32.h

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -151,23 +151,40 @@ void asm_rv32_end_pass(asm_rv32_t *state);
151151
((op & 0b1111111) | ((rd & 0b11111) << 7) | \
152152
(imm & 0b11111111111111111111000000000000))
153153

154+
#define RV32_ENCODE_TYPE_CB(op, ft3, rs, imm) \
155+
((op & 0b11) | ((ft3 & 0b111) << 13) | ((rs & 0b111) << 7) | \
156+
(((imm) & 0b100000000) << 4) | (((imm) & 0b11000000) >> 1) | \
157+
(((imm) & 0b100000) >> 3) | (((imm) & 0b11000) << 7) | \
158+
(((imm) & 0b110) << 2))
159+
154160
#define RV32_ENCODE_TYPE_CI(op, ft3, rd, imm) \
155161
((op & 0b11) | ((ft3 & 0b111) << 13) | ((rd & 0b11111) << 7) | \
156162
(((imm) & 0b100000) << 7) | (((imm) & 0b11111) << 2))
157163

164+
#define RV32_ENCODE_TYPE_CIW(op, ft3, rd, imm) \
165+
((op & 0b11) | ((ft3 & 0b111) << 13) | ((rd & 0b111) << 2) | \
166+
((imm & 0b1111000000) << 1) | ((imm & 0b110000) << 7) | \
167+
((imm & 0b1000) << 2) | ((imm & 0b100) << 4))
168+
158169
#define RV32_ENCODE_TYPE_CJ(op, ft3, imm) \
159170
((op & 0b11) | ((ft3 & 0b111) << 13) | \
160171
((imm & 0b1110) << 2) | ((imm & 0b1100000000) << 1) | \
161172
((imm & 0b100000000000) << 1) | ((imm & 0b10000000000) >> 2) | \
162173
((imm & 0b10000000) >> 1) | ((imm & 0b1000000) << 1) | \
163174
((imm & 0b100000) >> 3) | ((imm & 0b10000) << 7))
164175

176+
#define RV32_ENCODE_TYPE_CL(op, ft3, rd, rs, imm) \
177+
((op & 0b11) | ((ft3 & 0b111) << 13) | ((rd & 0b111) << 2) | \
178+
((rs & 0b111) << 7) | ((imm & 0b1000000) >> 1) | \
179+
((imm & 0b111000) << 7) | ((imm & 0b100) << 4))
180+
165181
#define RV32_ENCODE_TYPE_CR(op, ft4, rs1, rs2) \
166182
((op & 0b11) | ((rs2 & 0b11111) << 2) | ((rs1 & 0b11111) << 7) | \
167183
((ft4 & 0b1111) << 12))
168184

169185
#define RV32_ENCODE_TYPE_CSS(op, ft3, rs, imm) \
170-
((op & 0b11) | ((ft3 & 0b111) << 13) | ((rs & 0b11111) << 2) | ((imm) & 0b111111) << 7)
186+
((op & 0b11) | ((ft3 & 0b111) << 13) | ((rs & 0b11111) << 2) | \
187+
((imm) & 0b111111) << 7)
171188

172189
void asm_rv32_emit_word_opcode(asm_rv32_t *state, mp_uint_t opcode);
173190
void asm_rv32_emit_halfword_opcode(asm_rv32_t *state, mp_uint_t opcode);
@@ -220,10 +237,28 @@ static inline void asm_rv32_opcode_caddi(asm_rv32_t *state, mp_uint_t rd, mp_int
220237
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CI(0b01, 0b000, rd, immediate));
221238
}
222239

240+
// C.ADDI4SPN RD', IMMEDIATE
241+
static inline void asm_rv32_opcode_caddi4spn(asm_rv32_t *state, mp_uint_t rd, mp_uint_t immediate) {
242+
// CIW: 000 ........ ... 00
243+
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CIW(0b00, 0b000, rd, immediate));
244+
}
245+
246+
// C.BEQZ RS', IMMEDIATE
247+
static inline void asm_rv32_opcode_cbeqz(asm_rv32_t *state, mp_uint_t rs, mp_int_t offset) {
248+
// CB: 110 ... ... ..... 01
249+
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CB(0b01, 0b110, rs, offset));
250+
}
251+
252+
// C.BNEZ RS', IMMEDIATE
253+
static inline void asm_rv32_opcode_cbnez(asm_rv32_t *state, mp_uint_t rs, mp_int_t offset) {
254+
// CB: 111 ... ... ..... 01
255+
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CB(0b01, 0b111, rs, offset));
256+
}
257+
223258
// C.J OFFSET
224-
static inline void asm_rv32_opcode_cj(asm_rv32_t *state, mp_uint_t offset) {
259+
static inline void asm_rv32_opcode_cj(asm_rv32_t *state, mp_int_t offset) {
225260
// CJ: 101 ........... 01
226-
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CJ(0b01, 0b001, offset));
261+
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CJ(0b01, 0b101, offset));
227262
}
228263

229264
// C.JALR RS
@@ -250,6 +285,12 @@ static inline void asm_rv32_opcode_clui(asm_rv32_t *state, mp_uint_t rd, mp_int_
250285
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CI(0b01, 0b011, rd, immediate >> 12));
251286
}
252287

288+
// C.LW RD', OFFSET(RS')
289+
static inline void asm_rv32_opcode_clw(asm_rv32_t *state, mp_uint_t rd, mp_uint_t rs, mp_int_t offset) {
290+
// CL: 010 ... ... .. ... 00
291+
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CL(0b00, 0b010, rd, rs, offset));
292+
}
293+
253294
// C.LWSP RD, OFFSET
254295
static inline void asm_rv32_opcode_clwsp(asm_rv32_t *state, mp_uint_t rd, mp_uint_t offset) {
255296
// CI: 010 . ..... ..... 10
@@ -383,6 +424,7 @@ static inline void asm_rv32_opcode_xori(asm_rv32_t *state, mp_uint_t rd, mp_uint
383424
}
384425

385426
#define ASM_WORD_SIZE (4)
427+
#define ASM_HALFWORD_SIZE (2)
386428

387429
#define REG_RET ASM_RV32_REG_A0
388430
#define REG_ARG_1 ASM_RV32_REG_A0
@@ -392,8 +434,7 @@ static inline void asm_rv32_opcode_xori(asm_rv32_t *state, mp_uint_t rd, mp_uint
392434
#define REG_TEMP0 ASM_RV32_REG_T1
393435
#define REG_TEMP1 ASM_RV32_REG_T2
394436
#define REG_TEMP2 ASM_RV32_REG_T3
395-
// S0 may be used as the frame pointer by the compiler.
396-
#define REG_FUN_TABLE ASM_RV32_REG_S2
437+
#define REG_FUN_TABLE ASM_RV32_REG_S1
397438
#define REG_LOCAL_1 ASM_RV32_REG_S3
398439
#define REG_LOCAL_2 ASM_RV32_REG_S4
399440
#define REG_LOCAL_3 ASM_RV32_REG_S5
@@ -432,10 +473,10 @@ void asm_rv32_emit_store_reg_reg_offset(asm_rv32_t *state, mp_uint_t source, mp_
432473
#define ASM_JUMP_IF_REG_EQ(state, rs1, rs2, label) asm_rv32_emit_jump_if_reg_eq(state, rs1, rs2, label)
433474
#define ASM_JUMP_IF_REG_NONZERO(state, rs, label, bool_test) asm_rv32_emit_jump_if_reg_nonzero(state, rs, label)
434475
#define ASM_JUMP_IF_REG_ZERO(state, rs, label, bool_test) asm_rv32_emit_jump_if_reg_eq(state, rs, ASM_RV32_REG_ZERO, label)
435-
#define ASM_JUMP_REG(state, rs) asm_rv32_opcode_jalr(state, ASM_RV32_REG_ZERO, rs, 0)
476+
#define ASM_JUMP_REG(state, rs) asm_rv32_opcode_cjr(state, rs)
436477
#define ASM_LOAD16_REG_REG_OFFSET(state, rd, rs, offset) asm_rv32_emit_load16_reg_reg_offset(state, rd, rs, offset)
437478
#define ASM_LOAD16_REG_REG(state, rd, rs) asm_rv32_opcode_lhu(state, rd, rs, 0)
438-
#define ASM_LOAD32_REG_REG(state, rd, rs) asm_rv32_opcode_lw(state, rd, rs, 0)
479+
#define ASM_LOAD32_REG_REG(state, rd, rs) ASM_LOAD_REG_REG_OFFSET(state, rd, rs, 0)
439480
#define ASM_LOAD8_REG_REG(state, rd, rs) asm_rv32_opcode_lbu(state, rd, rs, 0)
440481
#define ASM_LOAD_REG_REG_OFFSET(state, rd, rs, offset) asm_rv32_emit_load_reg_reg_offset(state, rd, rs, offset)
441482
#define ASM_LOAD_REG_REG(state, rd, rs) ASM_LOAD32_REG_REG(state, rd, rs)
@@ -452,7 +493,7 @@ void asm_rv32_emit_store_reg_reg_offset(asm_rv32_t *state, mp_uint_t source, mp_
452493
#define ASM_NOT_REG(state, rd) asm_rv32_opcode_xori(state, rd, rd, -1)
453494
#define ASM_OR_REG_REG(state, rd, rs) asm_rv32_opcode_or(state, rd, rd, rs)
454495
#define ASM_STORE16_REG_REG(state, rs1, rs2) asm_rv32_opcode_sh(state, rs1, rs2, 0)
455-
#define ASM_STORE32_REG_REG(state, rs1, rs2) asm_rv32_opcode_sw(state, rs1, rs2, 0)
496+
#define ASM_STORE32_REG_REG(state, rs1, rs2) ASM_STORE_REG_REG_OFFSET(state, rs1, rs2, 0)
456497
#define ASM_STORE8_REG_REG(state, rs1, rs2) asm_rv32_opcode_sb(state, rs1, rs2, 0)
457498
#define ASM_STORE_REG_REG_OFFSET(state, rd, rs, offset) asm_rv32_emit_store_reg_reg_offset(state, rd, rs, offset)
458499
#define ASM_STORE_REG_REG(state, rs1, rs2) ASM_STORE32_REG_REG(state, rs1, rs2)

0 commit comments

Comments
 (0)