Diffstat (limited to 'games-arcade/koules/files/1.4-gcc3.patch')
-rw-r--r--  games-arcade/koules/files/1.4-gcc3.patch | 924
1 file changed, 924 insertions, 0 deletions
diff --git a/games-arcade/koules/files/1.4-gcc3.patch b/games-arcade/koules/files/1.4-gcc3.patch
new file mode 100644
index 000000000000..e2830d94adba
--- /dev/null
+++ b/games-arcade/koules/files/1.4-gcc3.patch
@@ -0,0 +1,924 @@
+diff -ur koules1.4/koules.sndsrv.linux.c koules1.4-gcc3/koules.sndsrv.linux.c
+--- koules1.4/koules.sndsrv.linux.c 1998-03-04 19:59:19.000000000 +0100
++++ koules1.4-gcc3/koules.sndsrv.linux.c 2003-04-23 01:15:16.000000000 +0200
+@@ -136,7 +136,7 @@
+ Eventually I'll look at the koules signal handlers and
+ just trap this.
+ */
+-int
++void
+ do_nothing (void)
+ {
+ fprintf (stderr, "koules.sndsrv: doing nothing, something is broken\n");
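+
This first hunk fixes the simplest GCC 3 complaint: do_nothing() is declared int but never returns a value, and the surrounding comment suggests it stands in for a signal handler. Newer GCC diagnoses an int-declared function that falls off the end; declaring it void resolves that. A self-contained sketch of the corrected shape (the SIGALRM registration and handler signature are illustrative assumptions, not koules' actual wiring):

#include <signal.h>
#include <stdio.h>

/* ISO C signal handlers return void (and receive the signal number). */
static void
do_nothing_handler (int sig)
{
  (void) sig;  /* unused */
  fprintf (stderr, "koules.sndsrv: doing nothing, something is broken\n");
}

int
main (void)
{
  signal (SIGALRM, do_nothing_handler);  /* hypothetical registration */
  return 0;
}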
+diff -ur koules1.4/xlib/inlstring.h koules1.4-gcc3/xlib/inlstring.h
+--- koules1.4/xlib/inlstring.h 1998-03-04 19:59:19.000000000 +0100
++++ koules1.4-gcc3/xlib/inlstring.h 2003-04-23 00:53:56.000000000 +0200
+@@ -1,292 +1,348 @@
+-
+ /* Based on functions in linux/string.h */
+
+-
++#ifndef INLSTRING_H
++#define INLSTRING_H
++#include <sys/types.h> /* for size_t */
+
+ #if !defined(__386__)||!defined(ASSEMBLY)
+
+ #define __memcpy(dst,src,n) memcpy((dst),(src),(n))
+-#define __memcpy_conventioanl(dst,src,n) memcpy((dst),(src),(n))
++#define __memcpy_conventional(dst,src,n) memcpy((dst),(src),(n))
+ #define __memcpyb(dst,src,n) memcpy((dst),(src),(n))
+ #define __memsetb(dst,c,n) memset((dst),(c),(n))
+-#define __memsetlong(dst,c,n) memset((dst),(c),(n))
+ #define __memset(dst,c,n) memset((dst),(c),(n))
+-#define __memset2(dst,c,n) memset((dst),(c),2*(n))
+-#define __memset3(dst,c,n) memset((dst),(c),3*(n))
++
++static inline void *__memsetlong(void *s, long c, size_t count) {
++ long *p=s;
++ int i;
++ for(i=0;i<count;i++)*p++=c;
++ return s;
++}
++
++static inline void *__memset2(void *s, short c, size_t count) {
++ short *p=s;
++ int i;
++ for(i=0;i<count;i++)*p++=c;
++ return s;
++}
++
++static inline void *__memset3(void *s, short c, size_t count) {
++ unsigned char *p=s;
++ int i;
++ for(i=0;i<count;i++) {
++ *p++=c&0xff;
++ *p++=(c>>8)&0xff;
++ *p++=(c>>16)&0xff;
++ }
++ return s;
++}
+
+ #else
+
+-#include <linux/types.h> /* for size_t */
+-static INLINE void *
+-__memcpy_conventional (void *to, const void *from, size_t n)
++static inline void *
++ __memcpy_conventional(void *to, const void *from, size_t n)
+ {
+- __asm__ ("cld\n\t"
+- "movl %%edi,%%ecx\n\t"
+- "andl $1,%%ecx\n\t"
+- "subl %%ecx,%%edx\n\t"
+- "rep ; movsb\n\t" /* 16-bit align destination */
+- "movl %%edx,%%ecx\n\t"
+- "shrl $2,%%ecx\n\t"
+- "rep ; movsl\n\t"
+- "testb $1,%%dl\n\t"
+- "je 1f\n\t"
+- "movsb\n"
+- "1:\ttestb $2,%%dl\n\t"
+- "je 2f\n\t"
+- "movsw\n"
+- "2:\n"
+-: : "d" (n), "D" ((long) to), "S" ((long) from)
+-: "cx", "dx", "di", "si");
+- return (to);
++ int dummy1;
++ long dummy2, dummy3;
++ __asm__ __volatile__("cld\n\t"
++ "cmpl $0,%%edx\n\t"
++ "jle 2f\n\t"
++ "movl %%edi,%%ecx\n\t"
++ "andl $1,%%ecx\n\t"
++ "subl %%ecx,%%edx\n\t"
++ "rep ; movsb\n\t" /* 16-bit align destination */
++ "movl %%edx,%%ecx\n\t"
++ "shrl $2,%%ecx\n\t"
++ "jz 3f\n\t"
++ "rep ; movsl\n\t"
++ "3:\n\t"
++ "testb $1,%%dl\n\t"
++ "je 1f\n\t"
++ "movsb\n"
++ "1:\ttestb $2,%%dl\n\t"
++ "je 2f\n\t"
++ "movsw\n"
++ "2:\n"
++ : "=d"(dummy1), "=D"(dummy2), "=S"(dummy3) /* fake output */
++ : "0"(n), "1"((long) to), "2"((long) from)
++ : "cx"/***rjr***, "dx", "di", "si"***/
++ );
++ return (to);
+ }
+
+
+-static INLINE void *
+-__memcpyb (void *to, const void *from, size_t n)
++static inline void *
++ __memcpyb(void *to, const void *from, size_t n)
+ {
+- __asm__ ("cld\n\t"
+- "rep ; movsb\n\t"
+-: : "c" (n), "D" ((long) to), "S" ((long) from)
+-: "cx", "di", "si");
+- return (to);
++ int dummy1;
++ long dummy2, dummy3;
++ __asm__ __volatile__("cld\n\t"
++ "rep ; movsb\n\t"
++ : "=c"(dummy1), "=D"(dummy2), "=S"(dummy3) /* fake output */
++ : "0"(n), "1"((long) to), "2"((long) from)
++ /***rjr***: "cx", "di", "si"***/
++ );
++ return (to);
+ }
+
+-static INLINE void *
+-__memsetb (void *s, char c, size_t count)
++static inline void *
++ __memsetb(void *s, char c, size_t count)
+ {
+- __asm__ ("cld\n\t"
+- "rep\n\t"
+- "stosb"
+-: : "a" (c), "D" (s), "c" (count)
+-: "cx", "di");
+- return s;
++ __asm__("cld\n\t"
++ "rep\n\t"
++ "stosb"
++ : : "a"(c), "D"(s), "c"(count)
++ : "cx", "di");
++ return s;
+ }
+
+-static INLINE void *
+-__memsetlong (void *s, unsigned c, size_t count)
++static inline void *
++ __memsetlong(void *s, unsigned c, size_t count)
+ {
+- __asm__ ("cld\n\t"
+- "rep\n\t"
+- "stosl"
+-: : "a" (c), "D" (s), "c" (count)
+-: "cx", "di");
+- return s;
++ long dummy1;
++ int dummy2;
++ __asm__ __volatile__("cld\n\t"
++ "rep\n\t"
++ "stosl"
++ : "=D"(dummy1), "=c"(dummy2) /* fake outputs */
++ : "a"(c), "0"(s), "1"(count)
++ /***rjr***: "cx", "di"***/
++ );
++ return s;
+ }
+
+-static INLINE void *
+-__memset (void *s, char c, size_t count)
++static inline void *
++ __memset(void *s, char c, size_t count)
+ {
+- __asm__ (
+- "cld\n\t"
+- "cmpl $12,%%edx\n\t"
+- "jl 1f\n\t" /* if (count >= 12) */
+-
+- "movzbl %%al,%%ax\n\t"
+- "movl %%eax,%%ecx\n\t"
+- "shll $8,%%ecx\n\t" /* c |= c << 8 */
+- "orl %%ecx,%%eax\n\t"
+- "movl %%eax,%%ecx\n\t"
+- "shll $16,%%ecx\n\t" /* c |= c << 16 */
+- "orl %%ecx,%%eax\n\t"
+-
+- "movl %%edx,%%ecx\n\t"
+- "negl %%ecx\n\t"
+- "andl $3,%%ecx\n\t" /* (-s % 4) */
+- "subl %%ecx,%%edx\n\t" /* count -= (-s % 4) */
+- "rep ; stosb\n\t" /* align to longword boundary */
+-
+- "movl %%edx,%%ecx\n\t"
+- "shrl $2,%%ecx\n\t"
+- "rep ; stosl\n\t" /* fill longwords */
+-
+- "andl $3,%%edx\n" /* fill last few bytes */
+- "1:\tmovl %%edx,%%ecx\n\t" /* <= 12 entry point */
+- "rep ; stosb\n\t"
+-: : "a" (c), "D" (s), "d" (count)
+-: "ax", "cx", "dx", "di");
+- return s;
++ int dummy1;
++ long dummy2;
++ int dummy3;
++ __asm__ __volatile__(
++ "cld\n\t"
++ "cmpl $12,%%edx\n\t"
++ "jl 1f\n\t" /* if (count >= 12) */
++
++ "movzbl %%al,%%ax\n\t"
++ "movl %%eax,%%ecx\n\t"
++ "shll $8,%%ecx\n\t" /* c |= c << 8 */
++ "orl %%ecx,%%eax\n\t"
++ "movl %%eax,%%ecx\n\t"
++ "shll $16,%%ecx\n\t" /* c |= c << 16 */
++ "orl %%ecx,%%eax\n\t"
++
++ "movl %%edx,%%ecx\n\t"
++ "negl %%ecx\n\t"
++ "andl $3,%%ecx\n\t" /* (-s % 4) */
++ "subl %%ecx,%%edx\n\t" /* count -= (-s % 4) */
++ "rep ; stosb\n\t" /* align to longword boundary */
++
++ "movl %%edx,%%ecx\n\t"
++ "shrl $2,%%ecx\n\t"
++ "rep ; stosl\n\t" /* fill longwords */
++
++ "andl $3,%%edx\n" /* fill last few bytes */
++ "1:\tmovl %%edx,%%ecx\n\t" /* <= 12 entry point */
++ "rep ; stosb\n\t"
++ : "=a"(dummy1), "=D"(dummy2), "=d"(dummy3) /* fake outputs */
++ : "0"(c), "1"(s), "2"(count)
++ : /***rjr***"ax",*/ "cx"/*, "dx", "di"*/
++ );
++ return s;
+ }
+
+-static INLINE void *
+-__memset2 (void *s, short c, size_t count)
++static inline void *
++ __memset2(void *s, short c, size_t count)
+ /* count is in 16-bit pixels */
+ /* s is assumed to be 16-bit aligned */
+ {
+- __asm__ (
+- "cld\n\t"
+- "cmpl $12,%%edx\n\t"
+- "jl 1f\n\t" /* if (count >= 12) */
+-
+- "movzwl %%ax,%%eax\n\t"
+- "movl %%eax,%%ecx\n\t"
+- "shll $16,%%ecx\n\t" /* c |= c << 16 */
+- "orl %%ecx,%%eax\n\t"
+-
+- "movl %%edi,%%ecx\n\t"
+- "andl $2,%%ecx\n\t" /* s & 2 */
+- "jz 2f\n\t"
+- "decl %%edx\n\t" /* count -= 1 */
+- "stosw\n\t" /* align to longword boundary */
+-
+- "2:\n\t"
+- "movl %%edx,%%ecx\n\t"
+- "shrl $1,%%ecx\n\t"
+- "rep ; stosl\n\t" /* fill longwords */
+-
+- "andl $1,%%edx\n" /* one 16-bit word left? */
+- "jz 3f\n\t" /* no, finished */
+- "1:\tmovl %%edx,%%ecx\n\t" /* <= 12 entry point */
+- "rep ; stosw\n\t"
+- "3:\n\t"
+-: : "a" (c), "D" (s), "d" (count)
+-: "ax", "cx", "dx", "di");
+- return s;
++ int dummy1;
++ long dummy2;
++ int dummy3;
++ __asm__ __volatile__(
++ "cld\n\t"
++ "cmpl $12,%%edx\n\t"
++ "jl 1f\n\t" /* if (count >= 12) */
++
++ "movzwl %%ax,%%eax\n\t"
++ "movl %%eax,%%ecx\n\t"
++ "shll $16,%%ecx\n\t" /* c |= c << 16 */
++ "orl %%ecx,%%eax\n\t"
++
++ "movl %%edi,%%ecx\n\t"
++ "andl $2,%%ecx\n\t" /* s & 2 */
++ "jz 2f\n\t"
++ "decl %%edx\n\t" /* count -= 1 */
++ "movw %%ax,(%%edi)\n\t" /* align to longword boundary */
++ "addl $2,%%edi\n\t"
++
++ "2:\n\t"
++ "movl %%edx,%%ecx\n\t"
++ "shrl $1,%%ecx\n\t"
++ "rep ; stosl\n\t" /* fill longwords */
++
++ "andl $1,%%edx\n" /* one 16-bit word left? */
++ "jz 3f\n\t" /* no, finished */
++ "1:\tmovl %%edx,%%ecx\n\t" /* <= 12 entry point */
++ "rep ; stosw\n\t"
++ "3:\n\t"
++ : "=a"(dummy1), "=D"(dummy2), "=d"(dummy3) /* fake outputs */
++ : "0"(c), "1"(s), "2"(count)
++ : /***rjr***"ax",*/ "cx"/*, "dx", "di"*/
++ );
++ return s;
+ }
+
+-static INLINE void *
+-__memset3 (void *s, int c, size_t count)
++static inline void *
++ __memset3(void *s, int c, size_t count)
+ /* count is in 24-bit pixels (3 bytes per pixel) */
+ {
+- __asm__ (
+- "cmpl $8,%%edx\n\t"
+- /* "jmp 2f\n\t" *//* debug */
+- "jl 2f\n\t"
+-
+- "movl %%eax,%%ebx\n\t" /* eax = (low) BGR0 (high) */
+- "shll $24,%%ebx\n\t" /* ebx = 000B */
+- "orl %%ebx,%%eax\n\t" /* eax = BGRB */
+-
+- "movl %%eax,%%ebx\n\t"
+- "shrl $8,%%ebx\n\t" /* ebx = GRB0 */
+- "movl %%ebx,%%ecx\n\t"
+- "shll $24,%%ecx\n\t" /* ecx = 000G */
+- "orl %%ecx,%%ebx\n\t" /* ebx = GRBG */
+-
+- "movl %%eax,%%ecx\n\t"
+- "shll $8,%%ecx\n\t" /* ecx = 0BGR */
+- "movb %%bh,%%cl\n\t" /* ecx = RBGR */
+-
+- "cmpl $16,%%edx\n\t"
+- "jl 1f\n\t"
+- "jmp 5f\n\t"
+- ".align 4,0x90\n\t"
+-
+- "5:\n\t" /* loop unrolling */
+- "movl %%eax,(%%edi)\n\t" /* write BGRB */
+- "movl %%ebx,4(%%edi)\n\t" /* write GRBG */
+- "movl %%ecx,8(%%edi)\n\t" /* write RBGR */
+- "movl %%eax,12(%%edi)\n\t"
+- "movl %%ebx,16(%%edi)\n\t"
+- "movl %%ecx,20(%%edi)\n\t"
+- "movl %%eax,24(%%edi)\n\t"
+- "movl %%ebx,28(%%edi)\n\t"
+- "movl %%ecx,32(%%edi)\n\t"
+- "movl %%eax,36(%%edi)\n\t"
+- "subl $16,%%edx\n\t" /* blend end-of-loop instr. */
+- "movl %%ebx,40(%%edi)\n\t"
+- "movl %%ecx,44(%%edi)\n\t"
+- "addl $48,%%edi\n\t"
+- "cmpl $16,%%edx\n\t"
+- "jge 5b\n\t"
+- "andl %%edx,%%edx\n\t"
+- "jz 4f\n\t" /* finished */
+- "cmpl $4,%%edx\n\t"
+- "jl 2f\n\t" /* less than 4 pixels left */
+- "jmp 1f\n\t"
+- ".align 4,0x90\n\t"
+-
+- "1:\n\t"
+- "movl %%eax,(%%edi)\n\t" /* write BGRB */
+- "movl %%ebx,4(%%edi)\n\t" /* write GRBG */
+- "movl %%ecx,8(%%edi)\n\t" /* write RBGR */
+- "addl $12,%%edi\n\t"
+- "subl $4,%%edx\n\t"
+- "cmpl $4,%%edx\n\t"
+- "jge 1b\n\t"
+-
+- "2:\n\t"
+- "cmpl $0,%%edx\n\t" /* none left? */
+- "jle 4f\n\t" /* finished */
+-
+- "mov %%eax,%%ecx\n\t"
+- "shrl $16,%%ecx\n\t" /* B in cl */
+-
+- "3:\n\t" /* write last few pixels */
+- "movw %%ax,(%%edi)\n\t" /* write RG */
+- "movb %%cl,2(%%edi)\n\t" /* write B */
+- "addl $3,%%edi\n\t"
+- "decl %%edx\n\t"
+- "jnz 3b\n\t"
+-
+- "4:\n\t"
+-: : "a" (c), "D" (s), "d" (count)
+-: "ax", "bx", "cx", "dx", "di");
+- return s;
++ int dummy1;
++ long dummy2;
++ int dummy3;
++ __asm__ __volatile__(
++ "cmpl $8,%%edx\n\t"
++ /* "jmp 2f\n\t" *//* debug */
++ "jl 2f\n\t"
++
++ "movl %%eax,%%esi\n\t" /* esi = (low) BGR0 (high) */
++ "shll $24,%%eax\n\t" /* eax = 000B */
++ "orl %%eax,%%esi\n\t" /* esi = BGRB */
++
++ "movl %%esi,%%eax\n\t"
++ "shrl $8,%%eax\n\t" /* eax = GRB0 */
++ "movl %%eax,%%ecx\n\t"
++ "shll $24,%%ecx\n\t" /* ecx = 000G */
++ "orl %%ecx,%%eax\n\t" /* eax = GRBG */
++
++ "movl %%esi,%%ecx\n\t"
++ "shll $8,%%ecx\n\t" /* ecx = 0BGR */
++ "movb %%ah,%%cl\n\t" /* ecx = RBGR */
++
++ "cmpl $16,%%edx\n\t"
++ "jl 1f\n\t"
++ "jmp 5f\n\t"
++ ".align 4,0x90\n\t"
++
++ "5:\n\t" /* loop unrolling */
++ "movl %%esi,(%%edi)\n\t" /* write BGRB */
++ "movl %%eax,4(%%edi)\n\t" /* write GRBG */
++ "movl %%ecx,8(%%edi)\n\t" /* write RBGR */
++ "movl %%esi,12(%%edi)\n\t"
++ "movl %%eax,16(%%edi)\n\t"
++ "movl %%ecx,20(%%edi)\n\t"
++ "movl %%esi,24(%%edi)\n\t"
++ "movl %%eax,28(%%edi)\n\t"
++ "movl %%ecx,32(%%edi)\n\t"
++ "movl %%esi,36(%%edi)\n\t"
++ "subl $16,%%edx\n\t" /* blend end-of-loop instr. */
++ "movl %%eax,40(%%edi)\n\t"
++ "movl %%ecx,44(%%edi)\n\t"
++ "addl $48,%%edi\n\t"
++ "cmpl $16,%%edx\n\t"
++ "jge 5b\n\t"
++ "andl %%edx,%%edx\n\t"
++ "jz 4f\n\t" /* finished */
++ "cmpl $4,%%edx\n\t"
++ "jl 2f\n\t" /* less than 4 pixels left */
++ "jmp 1f\n\t"
++ ".align 4,0x90\n\t"
++
++ "1:\n\t"
++ "movl %%esi,(%%edi)\n\t" /* write BGRB */
++ "movl %%eax,4(%%edi)\n\t" /* write GRBG */
++ "movl %%ecx,8(%%edi)\n\t" /* write RBGR */
++ "addl $12,%%edi\n\t"
++ "subl $4,%%edx\n\t"
++ "cmpl $4,%%edx\n\t"
++ "jge 1b\n\t"
++
++ "2:\n\t"
++ "cmpl $0,%%edx\n\t" /* none left? */
++ "jle 4f\n\t" /* finished */
++
++ "mov %%ecx,%%eax\n\t"
++ "shrl $8,%%ecx\n\t" /* R in cl */
++
++ "3:\n\t" /* write last few pixels */
++ "movw %%cx,(%%edi)\n\t" /* write BG */
++ "movb %%al,2(%%edi)\n\t" /* write R */
++ "addl $3,%%edi\n\t"
++ "decl %%edx\n\t"
++ "jnz 3b\n\t"
++
++ "4:\n\t"
++ : "=a"(dummy1), "=D"(dummy2), "=d"(dummy3) /* fake outputs */
++ : "0"(c), "1"(s), "2"(count)
++ : /***rjr***"ax",*/ "cx", /*"dx",*/ "si"/*, "di"*/
++ );
++ return s;
+ }
+
+-/* Functions defined in mem.S */
+-
+-extern memcpy4to3 (void *dest, void *src, int n);
+-extern memcpy32shift8 (void *dest, void *src, int n);
+-
+ /* Functions for which arguments must be passed in %ebx, %edx, and %ecx. */
+-extern __memcpyasm_regargs (); /* nu_bytes >= 3 */
+-extern __memcpyasm_regargs_aligned (); /* nu_bytes >= 32 */
++#if 0 /* Why declare 'em? Just confuses the compiler and can't be called from C
++ anyway */
++extern __memcpyasm_regargs(); /* nu_bytes >= 3 */
++extern __memcpyasm_regargs_aligned(); /* nu_bytes >= 32 */
++#endif
+
+
+ /* Always 32-bit align destination, even for a small number of bytes. */
+-static INLINE void *
+-__memcpy_aligndest (void *dest, const void *src, int n)
++static inline void *
++ __memcpy_aligndest(void *dest, const void *src, int n)
+ {
+- __asm__ __volatile__ ("
+- cmpl $3, %%ecx
+- ja 1f
+- call * __memcpy_jumptable (, %%ecx, 4)
+- jmp 2f
+- 1:call __memcpyasm_regargs
+- "
+- :
+- :"b" (dest), "d" (src), "c" (n)
+- :"ax", "0", "1", "2");
++ __asm__ __volatile__("cmpl $3, %%ecx\n\t"
++ "ja 1f\n\t"
++ "call * __memcpy_jumptable (, %%ecx, 4)\n\t"
++ "jmp 2f\n\t"
++ "1:call __memcpyasm_regargs\n\t"
++ "2:":
++ :"S"(dest), "d"(src), "c"(n)
++ :"ax", "0", "1", "2");
++ return dest;
+ }
+
+
+ /* Optimized version for 32-bit aligned destination. */
+-static INLINE void *
+-__memcpy_destaligned (void *dest, const void *src, int n)
++static inline void *
++ __memcpy_destaligned(void *dest, const void *src, int n)
+ {
+- __asm__ __volatile__ ("
+- cmpl $32, %%ecx
+- ja 1f
+- call * __memcpy_jumptable (, %%ecx, 4)
+- jmp 2f
+- 1:call __memcpyasm_regargs_aligned
+- 2:
+- "
+- :
+- :"b" (dest), "d" (src), "c" (n)
+- :"ax", "0", "1", "2");
++ __asm__ __volatile__("cmpl $32, %%ecx\n\t"
++ "ja 1f\n\t"
++ "call * __memcpy_jumptable (, %%ecx, 4)\n\t"
++ "jmp 2f\n\t"
++ "1:call __memcpyasm_regargs_aligned\n\t"
++ "2:\n\t":
++ :"S"(dest), "d"(src), "c"(n)
++ :"ax", "0", "1", "2");
++ return dest;
+ }
+
+
+-/* Balanced INLINE memcpy; 32-bit align destination if nu_bytes >= 20. */
+-static INLINE void *
+-__memcpy_balanced (void *dest, const void *src, int n)
++/* Balanced inline memcpy; 32-bit align destination if nu_bytes >= 20. */
++static inline void *
++ __memcpy_balanced(void *dest, const void *src, int n)
+ {
+- __asm__ __volatile__ ("
+- cmpl $19, %%ecx
+- ja 1f
+- call * __memcpy_jumptable (, %%ecx, 4)
+- jmp 2f
+- 1:call __memcpyasm_regargs
+- 2:
+- "
+- :
+- :"b" ((long) dest), "d" ((long) src), "c" ((long) n)
+- :"ax", "bx", "cx", "dx");
++ __asm__ __volatile__("cmpl $19, %%ecx\n\t"
++ "ja 1f\n\t"
++ "call * __memcpy_jumptable (, %%ecx, 4)\n\t"
++ "jmp 2f\n\t"
++ "1:call __memcpyasm_regargs\n\t"
++ "2:\n\t"
++ :
++ :"S"((long) dest), "d"((long) src), "c"((long) n)
++ :"ax", "0", "1", "2");
++ return dest;
+ }
+
+
+ #define __memcpy __memcpy_conventional
+
+ #endif
++
++/* Functions defined in mem.S or mem.c */
++
++extern void __memcpy4to3(void *dest, void *src, int n);
++extern void __memcpy32shift8(void *dest, void *src, int n);
++
++#endif
++
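Two GCC 3 incompatibilities drive the inlstring.h rewrite above. GCC 3 dropped GCC 2's multi-line string literals, so every asm body is re-quoted as concatenated "...\n\t" strings; it also refuses an asm whose clobber list names a register already bound to an input operand, so the patch introduces throwaway "dummy" outputs and ties each input to one with a matching-digit constraint ("0" means "same location as output operand 0"). The same diff adds plain C fallbacks for __memsetlong/__memset2/__memset3 on non-x86 builds and moves __memset3 off %ebx, which is reserved as the PIC register. A minimal sketch of the constraint idiom, modeled on __memcpyb (x86-32 assumed; the function name and the "memory" clobber are my additions, not the patch's):

#include <stddef.h>

static inline void *
copy_bytes (void *to, const void *from, size_t n)
{
  long d0, d1, d2;  /* dummies: tell GCC the registers are modified */
  __asm__ __volatile__ ("cld\n\t"
                        "rep ; movsb"
                        : "=c" (d0), "=D" (d1), "=S" (d2)
                        : "0" ((long) n), "1" ((long) to), "2" ((long) from)
                        : "memory");  /* extra safety, not in the patch */
  return to;
}

Listing the registers as outputs instead of clobbers is the form GCC still accepts today, which is why the patch comments the old clobber lists out rather than deleting the constraint machinery.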
+diff -ur koules1.4/xlib/shmbitmap.c koules1.4-gcc3/xlib/shmbitmap.c
+--- koules1.4/xlib/shmbitmap.c 1998-03-04 19:59:19.000000000 +0100
++++ koules1.4-gcc3/xlib/shmbitmap.c 2003-04-23 01:11:02.000000000 +0200
+@@ -139,23 +139,37 @@
+ count = *dp++;
+ /* __memcpy gives severe bug here */
+ if (y >= ny)
++ {
+ if (x >= nx)
++ {
+ if (x + count > __clipx2 + 1)
+ {
+ if (x <= __clipx2)
+- __memcpyb (vp, dp, __clipx2 - x + 1);
++ {
++ __memcpyb (vp, dp, __clipx2 - x + 1);
++ }
+ }
+ else
+- __memcpyb (vp, dp, count);
++ {
++ __memcpyb (vp, dp, count);
++ }
++ }
+ else if (x + count > __clipx1)
++ {
+ if (x + count > __clipx2 + 1)
++ {
+ __memcpyb (vp + __clipx1 - x,
+ dp + __clipx1 - x,
+ __clipx2 - __clipx1 + 1);
++ }
+ else
++ {
+ __memcpy (vp + __clipx1 - x,
+ dp + __clipx1 - x,
+ count - __clipx1 + x);
++ }
++ }
++ }
+ x += count;
+ vp += count;
+ dp += count;
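+
The braces added in the hunk above are not cosmetic churn: with nested unbraced ifs, each else binds to the nearest unmatched if, and GCC 3's -Wparentheses flags the original as ambiguous. The patch keeps the binding the compiler was already using and merely makes it explicit. A self-contained illustration of the hazard (hypothetical values, not the clipping logic itself):

#include <stdio.h>

static void
classify (int x, int y)
{
  if (y >= 0)
    if (x >= 0)
      puts ("fast path");
    else  /* binds to `if (x >= 0)', whatever the indentation suggests */
      puts ("clipped path: y >= 0 && x < 0");
}

int
main (void)
{
  classify (-1, 5);  /* prints the clipped-path message */
  return 0;
}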
+@@ -224,11 +238,7 @@
+
+
+ /*following routines are ripped from vgagl library */
+-/* We use the 32-bit to 64-bit multiply and 64-bit to 32-bit divide of the */
+-/* 386 (which gcc doesn't know well enough) to efficiently perform integer */
+-/* scaling without having to worry about overflows. */
+ #define swap(x, y) { int temp = x; x = y; y = temp; }
+-#define setpixel (*(backscreen->ff.driver_setpixel_func))
+ #undef __clipx2
+ #define __clipx2 (MAPWIDTH-1)
+ #undef __clipx1
+@@ -237,23 +247,15 @@
+ #define __clipy1 0
+ #undef __clipy2
+ #define __clipy2 (MAPHEIGHT+19)
+-#ifdef __i386__
++
+ static INLINE int
+-muldiv64 (int CONST m1, int CONST m2, int CONST d)
++muldiv64(int m1, int m2, int d)
+ {
+-/* int32 * int32 -> int64 / int32 -> int32 */
+- int result;
+- __asm__ (
+- "imull %%edx\n\t"
+- "idivl %3\n\t"
+-: "=a" (result) /* out */
+-: "a" (m1), "d" (m2), "g" (d) /* in */
+-: "ax", "dx" /* mod */
+- );
+- return result;
++ return (float) m1 * (float) m2 / ((float) d);
+ }
+
+-#define INC_IF_NEG(y) \
++#ifdef __i386__
++#define INC_IF_NEG(y, result) \
+ { \
+ __asm__("btl $31,%1\n\t" \
+ "adcl $0,%0" \
+@@ -264,20 +266,20 @@
+ static INLINE int
+ gl_regioncode (CONST int x, CONST int y)
+ {
+- int dx1, dx2, dy1, dy2;
+- int result;
++ int dx1, dx2, dy1, dy2;
++ int result;
+ result = 0;
+ dy2 = __clipy2 - y;
+- INC_IF_NEG (dy2);
++ INC_IF_NEG (dy2, result);
+ result <<= 1;
+ dy1 = y - __clipy1;
+- INC_IF_NEG (dy1);
++ INC_IF_NEG (dy1, result);
+ result <<= 1;
+ dx2 = __clipx2 - x;
+- INC_IF_NEG (dx2);
++ INC_IF_NEG (dx2, result);
+ result <<= 1;
+ dx1 = x - __clipx1;
+- INC_IF_NEG (dx1);
++ INC_IF_NEG (dx1, result);
+ return result;
+ }
+
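INC_IF_NEG formerly reached a variable named result through the caller's scope; the hunk above makes result an explicit macro parameter while leaving the asm alone (btl copies bit 31 of y into the carry flag, adcl folds the carry into result). A portable restatement of what the macro computes, assuming 32-bit int as the asm does (the macro name and test harness are mine):

#include <assert.h>

/* Add 1 to result when y is negative, branch-free: the shifted-down
   sign bit of a 32-bit value is exactly 0 or 1. */
#define INC_IF_NEG_C(y, result) \
  ((result) += ((unsigned int) (y)) >> 31)

int
main (void)
{
  int r = 0;
  INC_IF_NEG_C (-5, r);  /* r becomes 1 */
  INC_IF_NEG_C (7, r);   /* unchanged */
  assert (r == 1);
  return 0;
}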
+@@ -287,7 +289,7 @@
+ static INLINE int
+ gl_regioncode (CONST int x, CONST int y)
+ {
+- int result = 0;
++ int result = 0;
+ if (x < 0)
+ result |= 1;
+ else if (x > __clipx2)
+@@ -300,15 +302,44 @@
+ }
+ #endif
+
+-/* Partly based on vgalib by Tommy Frandsen */
+-/* This would be a lot faster if setpixel was inlined */
++#define line_loop_linear_a(m,i,u,v) \
++ { \
++ int d = ay - (ax >> 1); \
++ if ((x = abs (dx))) \
++ do { \
++ i; \
++ if (d m 0) { \
++ vp v; \
++ d -= ax; \
++ } \
++ vp u; \
++ d += ay; \
++ } while (--x); \
++ }
++
++#define line_loop_linear_b(m,i,u,v) \
++ { \
++ int d = ax - (ay >> 1); \
++ if ((y = abs (dy))) \
++ do { \
++ i; \
++ if (d m 0) { \
++ vp u; \
++ d -= ay; \
++ } \
++ vp v; \
++ d += ax; \
++ } while (--y); \
++ }
++
++/* Partly based on the work which was partly based on vgalib by Tommy Frandsen */
++/* This is a lot faster now that setpixel is inlined */
+
+ void
+ Line (int x1, int y1, int x2, int y2, int c)
+ {
+- int dx, dy, ax, ay, sx, sy, x, y;
+- int syp;
+- char *point;
++ int dx, dy, ax, ay, sx, sy, x, y;
++ unsigned char *vp = NULL;
+ if (!shm)
+ {
+ qLine (x1, y1, x2, y2, c);
+@@ -319,8 +350,8 @@
+ if (Clipping)
+ for (;;)
+ {
+- int r1 = gl_regioncode (x1, y1);
+- int r2 = gl_regioncode (x2, y2);
++ int r1 = gl_regioncode (x1, y1);
++ int r2 = gl_regioncode (x2, y2);
+ if (!(r1 | r2))
+ break; /* completely inside */
+ if (r1 & r2)
+@@ -333,38 +364,22 @@
+ }
+ if (r1 & 1)
+ { /* left */
+-#ifdef __i386__
+ y1 += muldiv64 (__clipx1 - x1, y2 - y1, x2 - x1);
+-#else
+- y1 += (long) (__clipx1 - x1) * (long) (y2 - y1) / (long) (x2 - x1);
+-#endif
+ x1 = __clipx1;
+ }
+ else if (r1 & 2)
+ { /* right */
+-#ifdef __i386__
+ y1 += muldiv64 (__clipx2 - x1, y2 - y1, x2 - x1);
+-#else
+- y1 += (long) (__clipx2 - x1) * (long) (y2 - y1) / (long) (x2 - x1);
+-#endif
+ x1 = __clipx2;
+ }
+ else if (r1 & 4)
+ { /* top */
+-#ifdef __i386__
+ x1 += muldiv64 (__clipy1 - y1, x2 - x1, y2 - y1);
+-#else
+- x1 += (long) (__clipy1 - y1) * (long) (x2 - x1) / (long) (y2 - y1);
+-#endif
+ y1 = __clipy1;
+ }
+ else if (r1 & 8)
+ { /* bottom */
+-#ifdef __i386__
+ x1 += muldiv64 (__clipy2 - y1, x2 - x1, y2 - y1);
+-#else
+- x1 += (long) (__clipy2 - y1) * (long) (x2 - x1) / (long) (y2 - y1);
+-#endif
+ y1 = __clipy2;
+ }
+ }
+@@ -377,45 +392,66 @@
+ x = x1;
+ y = y1;
+
+- point = VScreenToBuffer (backscreen) + x + y * MAPWIDTH;
++#define insert_pixel_1 *((unsigned char *) vp) = c;
++
++ vp = VScreenToBuffer (backscreen) + y * MAPWIDTH + x;
+ if (ax > ay)
+ {
+- int d = ay - (ax >> 1);
+- syp = sy * MAPWIDTH;
+- while (x != x2)
++ if(sx > 0)
+ {
+- *point = c;
+- if (d > 0 || (d == 0 && sx == 1))
+- {
+- y += sy;
+- point += syp;
+- d -= ax;
+- }
+- x += sx;
+- point += sx;
+- d += ay;
++ line_loop_linear_a(>=,insert_pixel_1,++,+=MAPWIDTH*sy);
++ }
++ else
++ {
++ line_loop_linear_a(>,insert_pixel_1,--,+=MAPWIDTH*sy);
+ }
+ }
+ else
+ {
+- int sy = (dy >= 0) ? 1 : -1;
+- int d = ax - (ay >> 1);
+- syp = sy * MAPWIDTH;
+- while (y != y2)
++ if(sy > 0)
++ {
++ line_loop_linear_b(>=,insert_pixel_1,+=sx,+=MAPWIDTH);
++ }
++ else
+ {
+- *(point) = c;
+- if (d > 0 || (d == 0 && sy == 1))
++ line_loop_linear_b(>,insert_pixel_1,+=sx,-=MAPWIDTH);
++ }
++ }
++ insert_pixel_1;
++
++ if (!vp)
++ {
++ if (ax > ay)
++ {
++ int d = ay - (ax >> 1);
++ while (x != x2)
+ {
++ insert_pixel_1;
++ if (d > 0 || (d == 0 && sx == 1))
++ {
++ y += sy;
++ d -= ax;
++ }
+ x += sx;
+- point += sx;
+- d -= ay;
++ d += ay;
++ }
++ }
++ else
++ {
++ int d = ax - (ay >> 1);
++ while (y != y2)
++ {
++ insert_pixel_1;
++ if (d > 0 || (d == 0 && sy == 1))
++ {
++ x += sx;
++ d -= ay;
++ }
++ y += sy;
++ d += ax;
+ }
+- y += sy;
+- point += syp;
+- d += ax;
+ }
++ insert_pixel_1;
+ }
+- *(point) = c;
+- point++;
+ }
+ #endif
+--- koules1.4/Iconfig 2003-07-12 00:20:13.000000000 -0400
++++ koules1.4-gcc3/Iconfig 2003-07-12 00:20:45.000000000 -0400
+@@ -36,7 +36,7 @@
+ /* directories*/
+ KOULESDIR =/usr/bin/X11
+ SOUNDDIR =/usr/local/lib/koules
+-MANDIR =/usr/local/man/man6
++MANDIR =/usr/share/man/man6
+
+ /*You need some extra libraryes for BSD sockets compatibility?*/
+ /* TOP_INCLUDES = /* Sun users with GCC need this */