Skip to content

Add count-trailing-zero libcalls #597

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions src/crt/cttz.src
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
assume adl=1

; __bcttz: count the trailing zero bits of the 8-bit value in A.
; In:  A = value
; Out: A = index of the lowest set bit (0-7), or 8 when the input is 0
; Only A and the flags are written.
section .text
public __bcttz
__bcttz:
tst a, 0Fh ; flags from A & 0Fh; A itself is left unchanged
jr z, .high4 ; low nibble clear: the answer is 4..8
tst a, 3
jr z, .high6 ; bits 0-1 clear, so bit 2 or bit 3 is the lowest set bit
; Bit 0 or bit 1 is set: the answer is simply the inverse of bit 0.
cpl
and a, 1
ret
.high6:
; A = xxxxyy00 with yy != 0. After dec, bit 1 is always set and bit 2 is set
; only if the original bit 2 was clear, so A & 6 is either 2 (odd parity)
; or 6 (even parity).
dec a
and a, 6
ret po ; A = 2: bit 2 was the lowest set bit
rrca ; 6 -> 3: bit 3 was the lowest set bit
ret
.high4:
tst a, 030h
jr z, .high2 ; bits 0-5 clear: the answer is 6, 7 or 8
; Bit 4 or bit 5 is the lowest set bit. After dec, bit 2 is always set and
; bit 4 is set only if the original bit 4 was clear, so A & 14h is either
; 04h (odd parity) or 14h (even parity).
dec a
and a, 014h
ret po ; A = 4: bit 4 was the lowest set bit
ld a, 5
ret
.high2:
; A is 00h, 40h, 80h or C0h here. add a,a moves bit 7 into carry and bit 6
; into the sign bit; sbc a,-8 then computes A + 8 - carry:
;   00h -> 8 and 80h -> 7 (positive, returned as-is)
;   40h -> 88h and C0h -> 87h (negative, so the answer is 6)
add a, a
sbc a, -8
ret p
ld a, 6
ret
Comment on lines +20 to +32
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This unfortunately breaks the optimization of the first trailing one functions relying on this to produce an output of 8 for an input of 0, but it might be worth it?

Suggested change
tst a, 030h
jr z, .high2
dec a
and a, 014h
ret po
ld a, 5
ret
.high2:
add a, a
sbc a, -8
ret p
ld a, 6
ret
add a, a
add a, a
jr z, .high2
add a, a
add a, a
sbc a, -5
ret
.high2:
sbc a, -7
ret

Copy link
Member

@runer112 runer112 Apr 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this change would also hurt for maybe eventually implementing std::countr_zero, wouldn't it...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, that assumption is also made by the compiler (this libcall implements CTTZ which is output by __builtin_ctz intrinsics on the Z80 target and not CTTZ_ZERO_UNDEF). Good optimization idea though, I'll have to think whether I can do something similar.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this change would also hurt for maybe eventually implementing std::countr_zero, wouldn't it...

It's actually already been implemented in the toolchain recently, and makes use of the Z80 intrinsic behavior I mentioned.

Copy link
Member

@runer112 runer112 Apr 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It occurs to me that, except for that pesky 0 case, you could potentially rework this whole routine to be based on branches after pairs of left shifts. I think whether that would be faster or not depends on the input distribution, but it would certainly be smaller.


; __scttz: count the trailing zero bits of the 16-bit value held in H:L.
; Out: A = __bcttz(L) when L != 0, otherwise __bcttz(H) + 8
;      (16 when both bytes are 0, since __bcttz yields 8 for a zero byte).
section .text
public __scttz
__scttz:
ld a, l
or a, a ; set Z from the low byte
Comment on lines +37 to +38
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Matching __icttz's first 3 bytes might compress better.

Suggested change
ld a, l
or a, a
xor a, a
or a, l

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, I always forget to think about compression alongside other size optimizations.

jr nz, __bcttz ; lowest set bit is in L: tail-jump, __bcttz returns to our caller
ld a, h ; low byte is zero: continue with the high byte
; NOTE(review): there is no ret/jump after this point; execution presumably
; continues into __scttz.hijack, which the require line pulls in - confirm the
; linker keeps the two sections adjacent.
require __scttz.hijack

; __scttz.hijack: private tail used once the low byte is known to be zero.
; In:  A = the byte located 8 bits above the start of the value
; Out: A = __bcttz(A) + 8
section .text
private __scttz.hijack
__scttz.hijack:
call __bcttz
add a, 8 ; account for the all-zero low byte
ret

; __icttz: public entry point for the 24-bit count-trailing-zeros routine.
; It only clears A; the actual work is done by __icttz.hijack.
; NOTE(review): there is no ret/jump here; execution presumably continues into
; __icttz.hijack, which the require line pulls in - confirm the linker keeps
; the two sections adjacent.
section .text
public __icttz
__icttz:
xor a, a ; A = 0, as __icttz.hijack expects
require __icttz.hijack

; __icttz.hijack: count the trailing zero bits of the 24-bit value in HL.
; In:  HL = value, A = 0 (the bytes are picked up with "or a, r", so a
;      non-zero A on entry would corrupt the result)
; Out: A = index of the lowest set bit, or 24 when HL is zero
; HL is left unchanged; F is overwritten.
section .text
private __icttz.hijack
__icttz.hijack:
or a, l ; A = L
jr nz, __bcttz ; lowest set bit is in L: tail-jump
or a, h ; A was still 0, so A = H
jr nz, __scttz.hijack ; lowest set bit is in H: result is __bcttz(H) + 8
; Both low bytes are zero: fetch the upper byte of HL through the stack.
; With ADL=1 push/pop transfer 3 bytes, so moving SP by one around the push
; makes "pop af" load F from H and A from the upper byte of HL.
; SP ends up where it started.
dec sp
push hl
inc sp
pop af
call __bcttz
add a, 16 ; account for the two all-zero low bytes
ret

; __lcttz: count the trailing zero bits of a 32-bit value.
; In:  HL = low 24 bits, E = top byte (bits 24-31)
; Out: A = index of the lowest set bit, or 32 when the whole value is zero
section .text
public __lcttz
__lcttz:
; Test HL for zero without changing it, and clear A at the same time.
add hl, de
xor a, a ; A = 0 and carry clear, so the sbc below subtracts exactly DE
sbc hl, de ; HL is back to its original value; Z is set only if HL == 0
jr nz, __icttz.hijack ; a set bit exists in the low 24 bits (entered with A = 0)
ld a, e
call __bcttz
add a, 24 ; account for the 24 all-zero low bits
ret

; __llcttz: count the trailing zero bits of a 64-bit value.
; In:  HL = bits 0-23, DE = bits 24-47, C = bits 48-55, B = bits 56-63
; Out: A = index of the lowest set bit, or 64 when the whole value is zero
section .text
public __llcttz
__llcttz:
; Test HL for zero without changing it, and clear A at the same time.
add hl, de
xor a, a ; A = 0 and carry clear, so the sbc below subtracts exactly DE
sbc hl, de ; HL restored; Z is set only if HL == 0
jr nz, __icttz.hijack ; lowest set bit is in bits 0-23
; HL was zero, so the sbc above left carry clear and DE now receives 0.
ex de, hl ; HL = bits 24-47, DE = 0
sbc hl, de ; subtracts 0: only sets Z according to HL
jr nz, __i48cttz.hijack ; lowest set bit is in bits 24-47; that tail swaps DE/HL back
or a, c ; A was still 0, so A = C
jr z, .high
call __bcttz
add a, 48
ret
.high:
ld a, b
call __bcttz
add a, 56 ; gives 64 when B is zero as well
ret

; __i48cttz: count the trailing zero bits of a 48-bit value.
; In:  HL = bits 0-23, DE = bits 24-47
; Out: A = index of the lowest set bit, or 48 when the whole value is zero
section .text
public __i48cttz
__i48cttz:
; Test HL for zero without changing it, and clear A at the same time.
add hl, de
xor a, a ; A = 0 and carry clear, so the sbc below subtracts exactly DE
sbc hl, de ; HL restored; Z is set only if HL == 0
jr nz, __icttz.hijack ; lowest set bit is in bits 0-23
ex de, hl ; low half is zero: put the high half in HL for the shared tail
; NOTE(review): there is no ret/jump here; execution presumably continues into
; __i48cttz.hijack, which the require line pulls in - confirm the linker keeps
; the two sections adjacent.
require __i48cttz.hijack

; __i48cttz.hijack: private tail for values whose low 24 bits are zero.
; In:  HL = bits 24-47 of the value (DE and HL already exchanged by the
;      code that reaches this label), A = 0
; Out: A = 24 + trailing-zero count of HL (48 when HL is zero);
;      DE and HL are exchanged back before returning
section .text
private __i48cttz.hijack
__i48cttz.hijack:
call __icttz.hijack ; needs A = 0 on entry; returns 0..24 in A
ex de, hl ; undo the exchange performed before entry
add a, 24 ; account for the 24 all-zero low bits
ret
98 changes: 31 additions & 67 deletions src/libc/ez80_builtin.src
Original file line number Diff line number Diff line change
Expand Up @@ -6,61 +6,38 @@

public ___ez80_ctzc
___ez80_ctzc:
; unoptimized
ld hl, 3
add hl, sp
ld l, (hl)
xor a, a
sub a, l
and a, l
call __bctlz
bit 3, a
ret nz
xor a, 7
ret
ld a, (hl)
jp __bcttz

;-------------------------------------------------------------------------------

section .text

public ___ez80_ctzi48
___ez80_ctzi48:
; unoptimized
ld hl, 3
ld hl, 6
add hl, sp
ld bc, (hl)
inc hl
inc hl
inc hl
ld iy, (hl)
sbc hl, hl
add hl, bc
lea de, iy
call __i48neg
call __i48and
call __i48ctlz
cpl
add a, 48
ret p
ld a, 48
ret
ld de, (hl)
dec hl
dec hl
dec hl
ld hl, (hl)
jp __i48cttz

;-------------------------------------------------------------------------------

section .text

public ___ez80_ffsc
___ez80_ffsc:
; unoptimized
ld hl, 3
add hl, sp
ld l, (hl)
xor a, a
sub a, l
and a, l
call __bctlz
cpl
add a, 9
ld a, (hl)
or a, a
rla
jp nz, __bcttz
ret

;-------------------------------------------------------------------------------
Expand All @@ -69,24 +46,13 @@ ___ez80_ffsc:

public ___ez80_ffss
___ez80_ffss:
; unoptimized
ld hl, 3
add hl, sp
ld hl, (hl)
; HL & -HL
ld b, h
ld c, l
sbc hl, hl
sbc hl, bc
ld a, h
and a, b
ld h, a
ld a, l
and a, c
ld l, a
call __sctlz
cpl
add a, 17
or a, l
add hl, hl
jp nz, __scttz
ret

;-------------------------------------------------------------------------------
Expand All @@ -95,22 +61,18 @@ ___ez80_ffss:

public ___ez80_ffsi48
___ez80_ffsi48:
; unoptimized
ld hl, 3
ld hl, 6
add hl, sp
ld bc, (hl)
inc hl
inc hl
inc hl
ld iy, (hl)
sbc hl, hl
add hl, bc
lea de, iy
call __i48neg
call __i48and
call __i48ctlz
cpl
add a, 49
ld de, (hl)
dec hl
dec hl
dec hl
ld hl, (hl)
call __i48cttz
inc a
cp a, 49
ret nz
xor a, a
ret

;-------------------------------------------------------------------------------
Expand Down Expand Up @@ -406,13 +368,15 @@ ___ez80_rotateright48:

extern __snot
extern __i48not
extern __i48and
extern __i48neg

extern __bctlz
extern __sctlz
extern __i48ctlz

extern __bcttz
extern __scttz
extern __i48cttz

extern __bpopcnt
extern __spopcnt
extern __i48popcnt
Expand Down
4 changes: 2 additions & 2 deletions test/standalone/ez80_builtin/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ static int test_ctzc(void) {
unsigned char input;
for (int i = 0; i < 256; i++) {
input = (unsigned char)i;
truth = (input == 0) ? 8 : __builtin_ctz((unsigned int)input);
truth = (input == 0) ? 8 : __builtin_ctz((unsigned int)input | INT_MIN);
guess = __ez80_ctzc(input);
CMP("%d", input, truth, guess);
}
Expand Down Expand Up @@ -271,7 +271,7 @@ static int test_ctzi48(void) {
CMP("%012llX", (uint64_t)UINT48_MAX, 0, __ez80_ctzi48(UINT48_MAX));
for (int i = 0; i < RANDOM_TEST_COUNT; i++) {
input = rand48();
truth = (input == 0) ? 48 : __builtin_ctzll((unsigned long long)input);
truth = (input == 0) ? 48 : __builtin_ctzll((unsigned long long)input | INT64_MIN);
guess = __ez80_ctzi48(input);
CMP("%012llX", (uint64_t)input, truth, guess);
}
Expand Down
Loading