Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimizations #173

Merged
merged 2 commits into from
Mar 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions GNUmakefile.os4
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ STATIC := $(if $(STATIC),$(STATIC),yes)
LARGEDATA :=
OPTIONS += $(LARGEDATA) -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -Wa,-mregnames -fno-builtin -nostdlib -D_GNU_SOURCE -D_XOPEN_SOURCE -D_USE_GNU -pipe \
-nostdinc -nostartfiles -nostdlib
OPTIMIZE := -O3 -mregnames -mmultiple -mupdate -ffp-contract=fast -mstrict-align
OPTIMIZE := -O3 -mregnames -mstrict-align

STABS :=
DLIBS :=
Expand All @@ -117,9 +117,9 @@ else
DLIBS += $(BUILD_DIR)/lib/libdebug.a
endif

CFLAGS := $(WARNINGS) $(OPTIMIZE) $(OPTIONS) $(INCLUDES) -D__USE_INLINE__
CFLAGS_N := $(WARNINGS) $(OPTIMIZE) $(OPTIONS) $(INCLUDES)
AFLAGS := -Wa,-mregnames -mstrict-align
CFLAGS := $(WARNINGS) $(OPTIMIZE) $(OPTIONS) $(INCLUDES) -D__USE_INLINE__ -falign-functions=8
CFLAGS_N := $(WARNINGS) $(OPTIMIZE) $(OPTIONS) $(INCLUDES) -falign-functions=8
AFLAGS := -Wa,-mregnames -falign-functions=8 -mstrict-align

ifdef SPE
CC := ppc-amigaos-gcc-6.4.0
Expand Down
3 changes: 3 additions & 0 deletions libc.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -576,6 +576,9 @@ C_STRING := \
cpu/altivec/vec_memset.o \
cpu/altivec/vec_strcpy.o \
cpu/generic/bcopy.o \
cpu/generic/strcat.o \
cpu/generic/strcpy.o \
cpu/generic/strlen.o \
string/rindex.o \
string/stccpy.o \
string/stpcpy.o \
Expand Down
2 changes: 1 addition & 1 deletion library/byteswap/bswap16.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
asm("\
.text\n\
.align 2\n\
.align 8\n\
.globl bswap16\n\
.type bswap16, @function\n\
bswap16:\n\
Expand Down
2 changes: 1 addition & 1 deletion library/byteswap/bswap24.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
asm(" .text\n\
.align 2\n\
.align 8\n\
.globl bswap24\n\
.type bswap24, @function\n\
bswap24:\n\
Expand Down
2 changes: 1 addition & 1 deletion library/byteswap/bswap32.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
asm(" .text\n\
.align 2\n\
.align 8\n\
.globl bswap32\n\
.type bswap32, @function\n\
bswap32:\n\
Expand Down
2 changes: 1 addition & 1 deletion library/byteswap/bswap64.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

asm(" .text\n\
.align 2\n\
.align 8\n\
.globl bswap64\n\
.type bswap64, @function\n\
bswap64:\n\
Expand Down
2 changes: 1 addition & 1 deletion library/byteswap/swab.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
asm("\
.text\n\
.align 2\n\
.align 8\n\
.globl swab\n\
.type swab,@function\n\
swab:\n\
Expand Down
2 changes: 1 addition & 1 deletion library/byteswap/swab24.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
asm("\
.text\n\
.align 2\n\
.align 8\n\
.globl swab24\n\
.type swab24,@function\n\
swab24:\n\
Expand Down
2 changes: 1 addition & 1 deletion library/byteswap/swab32.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
asm("\
.text\n\
.align 2\n\
.align 8\n\
.globl swab32\n\
.type swab32,@function\n\
swab32:\n\
Expand Down
2 changes: 1 addition & 1 deletion library/byteswap/swab64.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

asm("\
.text\n\
.align 2\n\
.align 8\n\
.globl swab64\n\
.type swab64,@function\n\
swab64:\n\
Expand Down
6 changes: 3 additions & 3 deletions library/c.lib_rev.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define REVISION 0
#define SUBREVISION 0

#define DATE "21.03.2024"
#define DATE "23.03.2024"
#define VERS "clib4.library 1.0.0"
#define VSTRING "clib4.library 1.0.0 (21.03.2024)\r\n"
#define VERSTAG "\0$VER: clib4.library 1.0.0 (21.03.2024)"
#define VSTRING "clib4.library 1.0.0 (23.03.2024)\r\n"
#define VERSTAG "\0$VER: clib4.library 1.0.0 (23.03.2024)"
20 changes: 10 additions & 10 deletions library/cpu/generic/bcopy.S
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
#define w6 r11
#define w7 r0
#define w8 r2
#define kLong 33 // too long for string ops
#define kLong 33 // too long for string ops

.text

Expand All @@ -69,7 +69,7 @@
.align 5
.global bcopy_g3

bcopy_g3: // void bcopy(const void *src, void *dst, size_t len)
bcopy_g3: // void bcopy(const void *src, void *dst, size_t len)
cmplwi rc,kLong // length > 32 bytes?
sub w1,r4,r3 // must move in reverse if (rd-rs)<rc
mr rd,r4 // start to move source & dest to canonic spot
Expand All @@ -82,8 +82,8 @@ bcopy_g3: // void bcopy(const void *src, void *dst, size_t len)
// NB: memcpy() and memmove() must follow bcopy() by 32 bytes, for comm page.

.align 5
Lmemcpy_g3: // void* memcpy(void *dst, void *src, size_t len)
Lmemmove_g3: // void* memmove(void *dst, const void *src, size_t len)
Lmemcpy_g3: // void* memcpy(void *dst, void *src, size_t len)
Lmemmove_g3: // void* memmove(void *dst, const void *src, size_t len)
cmplwi rc,kLong // length > 32 bytes?
sub w1,r3,rs // must move in reverse if (rd-rs)<rc
mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
Expand All @@ -96,9 +96,9 @@ Lmemmove_g3: // void* memmove(void *dst, const void *src, size_t len)
// Long operands (more than 32 bytes.)
// w1 = (rd-rs), used to check for alignment

LLong0: // enter from bcopy()
LLong0: // enter from bcopy()
mr rs,r3 // must leave r3 alone (it is return value for memcpy)
LLong1: // enter from memcpy() and memmove()
LLong1: // enter from memcpy() and memmove()
cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
rlwinm r0,w1,0,0x3 // are operands relatively word-aligned?
neg w2,rd // prepare to align destination
Expand Down Expand Up @@ -144,7 +144,7 @@ LLong1: // enter from memcpy() and memmove()
stw w8,28(rd)
addi rd,rd,32
bdnz 1b
2: // rc = remaining bytes (0-31)
2: // rc = remaining bytes (0-31)
mtxer rc // set up count for string ops
mr r0,rd // move dest ptr out of the way
lswx r5,0,rs // load xer bytes into r5-r12 (rs==r4)
Expand All @@ -170,7 +170,7 @@ LLongFloat:
stswx w1,0,rd
add rd,rd,w4
beq- 2f // pathologic case, no chunks to xfer
1: // loop over 32-byte chunks
1: // loop over 32-byte chunks
lfd f0,0(rs)
lfd f1,8(rs)
lfd f2,16(rs)
Expand All @@ -182,7 +182,7 @@ LLongFloat:
stfd f3,24(rd)
addi rd,rd,32
bdnz 1b
2: // rc = remaining bytes (0-31)
2: // rc = remaining bytes (0-31)
mtxer rc // set up count for string ops
mr r0,rd // move dest ptr out of the way
lswx r5,0,rs // load xer bytes into r5-r12 (rs==r4)
Expand Down Expand Up @@ -256,7 +256,7 @@ LReverseFloat:
stfd f2,-24(rd)
stfdu f3,-32(rd)
bdnz 1b
2: // rc = remaining bytes (0-31)
2: // rc = remaining bytes (0-31)
mtxer rc // set up count for string ops
sub r4,rs,rc // point to 1st (leftmost) leftover byte (0..31)
sub r0,rd,rc // move dest ptr out of way
Expand Down
16 changes: 16 additions & 0 deletions library/cpu/generic/strcat.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#include "../4xx/ppc4xx.inc"

/*
 * __strcat_ppc: append the NUL-terminated byte string addressed by r4 to
 * the end of the NUL-terminated byte string addressed by r3.
 *
 * Presumably implements char *strcat(char *dest [r3], const char *src [r4]);
 * the C-level prototype is not visible in this file -- confirm at the caller.
 *
 * r3 is never written by the instructions below, so at blr it still holds
 * its incoming value.  The visible body modifies r0, r4, r5 and cr0.
 *
 * function_prolog/function_epilog come from the included ppc4xx.inc, which
 * is not shown here; presumably they emit the symbol/section boilerplate.
 */
function_prolog(__strcat_ppc)
/* Bias both pointers by -1: the update-form loads/stores below add 1 to the
   base register before each access. */
addi r5,r3,-1
addi r4,r4,-1
/* Loop 1: scan the r3 string for its terminating zero byte. */
1: lbzu r0,1(r5)
cmpwi 0,r0,0
bne 1b
/* r5 now addresses the terminator; step back one byte so the first stbu
   below overwrites it. */
addi r5,r5,-1
/* Loop 2: copy bytes from the r4 string.  The store sits between the compare
   and the branch and does not alter cr0, so the zero byte itself is stored
   before the loop exits. */
1: lbzu r0,1(r4)
cmpwi 0,r0,0
stbu r0,1(r5)
bne 1b
blr

function_epilog(__strcat_ppc)
95 changes: 95 additions & 0 deletions library/cpu/generic/strcpy.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/* Optimized strcpy implementation for PowerPC.
Copyright (C) 1997-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

/* See strlen.s for comments on how the end-of-string testing works. */
/* char * [r3] strcpy (char *dest [r3], const char *src [r4]) */
#include "../4xx/ppc4xx.inc"
/*
 * __strcpy_ppc: copy the NUL-terminated string at r4 to the buffer at r3,
 * including the terminator.  r3 (rRTN) is never written, so it still holds
 * the incoming destination pointer at every return.
 *
 * Two paths:
 *   - both pointers 4-byte aligned: copy a word at a time, testing each
 *     word for a zero byte before it is stored;
 *   - otherwise: copy a byte at a time.
 * Both loops are unrolled by two and alternate between rWORD and rALT so the
 * next load is issued before the previous value is stored.
 *
 * The visible body modifies r0 and r4-r10 plus cr0.
 * function_prolog/function_epilog come from the included ppc4xx.inc, which
 * is not shown here; presumably they emit the symbol/section boilerplate.
 */
function_prolog(__strcpy_ppc)

/* L(x) builds an assembler-local label name of the form .Lx. */
#define L(x) .L##x
#define rTMP r0
#define rRTN r3 /* incoming DEST arg preserved as result */
#define rSRC r4 /* pointer to previous word in src */
#define rDEST r5 /* pointer to previous word in dest */
#define rWORD r6 /* current word from src */
#define rFEFE r7 /* constant 0xfefefeff (-0x01010101) */
#define r7F7F r8 /* constant 0x7f7f7f7f */
#define rNEG r9 /* ~(word from src | 0x7f7f7f7f) */
#define rALT r10 /* alternate word from src */

/* Keep only the low two bits of (src | dest): non-zero means at least one
   pointer is not 4-byte aligned, so take the byte-wise path. */
or rTMP, rSRC, rRTN
clrlwi. rTMP, rTMP, 30
/* Bias dest by -4 because stwu adds 4 to the base before storing. */
addi rDEST, rRTN, -4
bne L(unaligned)
/* Build the two constants (lis sets the upper half, addi adds the lower)
   and fetch the first source word in between. */
lis rFEFE, -0x101
lis r7F7F, 0x7f7f
lwz rWORD, 0(rSRC)
addi rFEFE, rFEFE, -0x101
addi r7F7F, r7F7F, 0x7f7f
/* Enter the loop at the zero-byte test for the word just loaded. */
b L(g2)
/* Word loop, first half: rWORD has already been checked and holds no zero
   byte.  Load the next word into rALT, store rWORD, then test rALT. */
L(g0):
lwzu rALT, 4(rSRC)
stwu rWORD, 4(rDEST)
/* (w + 0xfefefeff) & ~(w | 0x7f7f7f7f) is non-zero when w contains a
   zero byte. */
add rTMP, rFEFE, rALT
nor rNEG, r7F7F, rALT
and. rTMP, rTMP, rNEG
bne- L(g1)
/* Second half: same step with the roles of rWORD and rALT swapped. */
lwzu rWORD, 4(rSRC)
stwu rALT, 4(rDEST)
L(g2):
add rTMP, rFEFE, rWORD
nor rNEG, r7F7F, rWORD
and. rTMP, rTMP, rNEG
beq+ L(g0)
/* rWORD holds the terminator; move it to rALT so L(g1) serves both exits. */
mr rALT, rWORD
/* We've hit the end of the string. Do the rest byte-by-byte. */
/* rALT has not been stored yet.  Each rlwinm. rotates one byte into the low
   8 bits, most-significant byte first (so the word is treated as big-endian
   in memory), and sets cr0.  The byte is stored and beqlr- returns if it
   was the zero byte. */
L(g1):
rlwinm. rTMP, rALT, 8, 24, 31
stb rTMP, 4(rDEST)
beqlr-
rlwinm. rTMP, rALT, 16, 24, 31
stb rTMP, 5(rDEST)
beqlr-
rlwinm. rTMP, rALT, 24, 24, 31
stb rTMP, 6(rDEST)
beqlr-
/* The first three bytes were non-zero, so the low byte is the terminator. */
stb rALT, 7(rDEST)
blr
/* Oh well. In this case, we just do a byte-by-byte copy. */
/* NOTE(review): the .align plus nop below presumably place L(u0) at a chosen
   offset for instruction fetch; the exact intent is not stated here. */
.align 4
nop
L(unaligned):
lbz rWORD, 0(rSRC)
/* Re-bias dest by -1 for the byte-wise stbu stores. */
addi rDEST, rRTN, -1
cmpwi rWORD, 0
/* Empty source string: only the terminator is stored. */
beq- L(u2)
/* Byte loop: load the next byte before storing the previous non-zero one. */
L(u0):
lbzu rALT, 1(rSRC)
stbu rWORD, 1(rDEST)
cmpwi rALT, 0
beq- L(u1)
nop /* Let 601 load start of loop. */
lbzu rWORD, 1(rSRC)
stbu rALT, 1(rDEST)
cmpwi rWORD, 0
bne+ L(u0)
/* Exit with the zero byte in rWORD: store it as the terminator. */
L(u2):
stb rWORD, 1(rDEST)
blr
/* Exit with the zero byte in rALT: store it as the terminator. */
L(u1):
stb rALT, 1(rDEST)
blr

function_epilog(__strcpy_ppc)
Loading