! SPARC assembly primitives for 1-bit-per-pixel frame buffers.
!
! Descriptor fields as they are accessed by the routines below:
!   [+0]   base address of the bitmap memory
!   [+4]   stride added to a line pointer to reach the next line
!   [+8]   xsize  (gl_point_fb1 bounds-checks x against it)
!   [+12]  ysize  (gl_point_fb1 bounds-checks y against it)
!   [+16]  byte count used by gl_clear_fb1 (presumably the total size of
!          the bitmap memory -- TODO confirm against the C declaration)
!
! Reminder: the instruction following a branch/call/retl is its delay slot
! and executes before the transfer takes effect.  Numeric labels (1:, 2:,
! ...) are local; "1f"/"1b" mean the next/previous definition of "1:".

	.text

	.global gl_clear_fb1
! gl_clear_fb1 -- leaf routine.
! In:  %o0 = descriptor pointer.
! Stores zero into [%o0+16] bytes starting at the address held in [%o0].
! Uses %o0 (current address), %o4 (bytes left), %o2/%o3 (zero pair for std).
gl_clear_fb1:
	ld [%o0+16],%o4		! %o4 = number of bytes to clear
	ld [%o0],%o0		! %o0 = start address
	subcc %o4,8,%g0
	bl 1f			! fewer than 8 bytes: plain byte loop
	nop
	! Align %o0 to 8 bytes with at most one byte, one halfword and one
	! word store.  The next test is issued in each branch delay slot and
	! re-issued after a store, because the store path changes %o0.
	andcc %o0,1,%g0
	be 2f
	andcc %o0,2,%g0		! (delay slot) test for halfword step
	stb %g0,[%o0]
	inc %o0
	dec %o4
	andcc %o0,2,%g0
2:	be 2f
	andcc %o0,4,%g0		! (delay slot) test for word step
	sth %g0,[%o0]
	add %o0,2,%o0
	sub %o4,2,%o4
	andcc %o0,4,%g0
2:	be 2f
	nop
	st %g0,[%o0]
	add %o0,4,%o0
	sub %o4,4,%o4
2:	mov %g0,%o2		! %o2/%o3 = 0, the register pair stored by std
	mov %g0,%o3
	! Main loop: clear 8 bytes per iteration while at least 8 remain.
2:	subcc %o4,8,%o4
	bl 2f
	nop
	std %o2,[%o0]
	add %o0,8,%o0
	b 2b
	nop
	! Here %o4 = (bytes left) - 8, which is negative; its low three bits
	! still equal the number of bytes left (0..7), so bits 4/2/1 select
	! the trailing word, halfword and byte stores.
2:	andcc %o4,4,%g0
	be 2f
	andcc %o4,2,%g0		! (delay slot) test for trailing halfword
	st %g0,[%o0]
	add %o0,4,%o0
2:	be 2f
	andcc %o4,1,%g0		! (delay slot) test for trailing byte
	sth %g0,[%o0]
	add %o0,2,%o0
2:	be 2f
	nop
	stb %g0,[%o0]
2:	retl
	nop
	! Short case (< 8 bytes): one byte at a time; nothing if count is 0.
1:	tst %o4
	be 1f
	nop
2:	stb %g0,[%o0]
	deccc %o4
	bg 2b
	inc %o0			! (delay slot) advance address
1:	retl
	nop

	.global gl_copy_fb1_fb1_inverted
! gl_copy_fb1_fb1_inverted
! In:  %o0 (%i0 after save) = source descriptor,
!      %o1 (%i1 after save) = destination descriptor.
! Copies the bitwise complement of the source bitmap to the destination,
! line by line.  The number of lines is the smaller of the two ysize
! fields; the number of bytes per line is the smaller of the two xsize
! fields rounded up to whole bytes ((xsize+7)>>3).
! Note: the per-line code is entered before the line count is tested, and
! it assumes a line is at least about 16 bytes long (see below).
gl_copy_fb1_fb1_inverted:
	save %sp,-96,%sp
	! %i2 = xsize to copy (in bytes)
	! %i3 = ysize left to copy
	! %i4 = src stride
	! %i5 = dst stride
	! %l0 = src line ptr
	! %l1 = dst line ptr
	ld [%i0+12],%i3		! %i3 = src ysize
	ld [%i1+12],%l0		! %l0 = dst ysize (temporary)
	subcc %l0,%i3,%g0
	bg 1f			! dst ysize > src ysize: keep src ysize
	ld [%i0+8],%i2		! (delay slot) %i2 = src xsize
	mov %l0,%i3		! else %i3 = dst ysize
1:	ld [%i1+8],%l0		! %l0 = dst xsize (temporary)
	subcc %l0,%i2,%g0
	bg 1f			! dst xsize > src xsize: keep src xsize
	ld [%i0+4],%i4		! (delay slot) %i4 = src stride
	mov %l0,%i2		! else %i2 = dst xsize
1:	ld [%i1+4],%i5		! %i5 = dst stride
	ld [%i0],%l0		! %l0 = src line ptr
	ld [%i1],%l1		! %l1 = dst line ptr
	add %i2,7,%i2		! convert xsize from bits to bytes, rounding up
	srl %i2,3,%i2
c11i_loop:
	! bcopy_inverted(%l0,%l1,%i2)
	! %l2-%l7 and %o0-%o5 available for use
	! we assume %i2 is at least 16 or so
	!
	! %o0 = src ptr
	! %o1 = dst ptr
	! %o2 = bytes left to copy
	mov %l0,%o0
	mov %l1,%o1
	mov %i2,%o2
	! copy single bytes so as to quad-align src ptr
	! (i.e. until the low three bits of %o0 are zero, as ldd needs)
1:	andcc %o0,7,%g0
	be 1f
	nop
	ldub [%o0],%l7
	inc %o0
	not %l7,%l7
	stb %l7,[%o1]
	inc %o1
	b 1b
	dec %o2			! (delay slot)
1:	and %o1,7,%o5		! %o5 = low three bits of dst ptr
	! switch based on dst ptr alignment
	! The three bits are tested as a decision tree; each test's condition
	! codes are produced in the delay slot of the preceding branch.
	! For the unaligned cases %o3 = left shift count and %o4 = right
	! shift count (they always add up to 32).
	andcc %o1,4,%g0
	be 1f			! bit 2 clear -> cases 0xx
	andcc %o1,2,%g0		! (delay slot)
	be 2f			! bit 1 clear -> cases 10x
	andcc %o1,1,%g0		! (delay slot)
	be 3f			! bit 0 clear -> case 110
	nop
	! dst ptr = ....111
	add %g0,8,%o3
	b c11i_bci_567
	add %g0,24,%o4		! (delay slot)
1:	be 1f			! bit 1 clear -> cases 00x
	andcc %o1,1,%g0		! (delay slot)
	be 4f			! bit 0 clear -> case 010
	nop
	! dst ptr = ....011
	add %g0,8,%o3
	b c11i_bci_123
	add %g0,24,%o4		! (delay slot)
1:	be 1f			! bit 0 clear -> case 000
	nop
	! dst ptr = ....001
	add %g0,24,%o3
	b c11i_bci_123
	add %g0,8,%o4		! (delay slot)
1:	! dst ptr = ....000
	! Both pointers doubleword aligned: straight ldd / not / std loop.
	! %o2 is biased by -8 so that "bge" means another full 8 bytes exist.
	sub %o2,8,%o2
6:	ldd [%o0],%l6
	add %o0,8,%o0
	not %l6,%l6
	not %l7,%l7
	std %l6,[%o1]
	subcc %o2,8,%o2
	bge 6b
	add %o1,8,%o1		! (delay slot)
	b c11i_bci_wrapup
	nop
2:	be 1f			! bit 0 clear -> case 100
	nop
	! dst ptr = ....101
	add %g0,24,%o3
	b c11i_bci_567
	add %g0,8,%o4		! (delay slot)
1:	! dst ptr = ....100
	! Destination is word aligned only: store the first inverted word by
	! itself, then store doublewords made of the previous low word (%l6)
	! and the new high word (%l7); finish with the last low word.
	sub %o2,8,%o2
	ldd [%o0],%l4
	add %o0,8,%o0
	not %l4,%l4
	st %l4,[%o1]
	b 6f
	add %o1,4,%o1		! (delay slot)
7:	ldd [%o0],%l4
	add %o0,8,%o0
	not %l4,%l7		! %l7 = ~(new high word)
	std %l6,[%o1]
	add %o1,8,%o1
6:	subcc %o2,8,%o2
	bge 7b
	not %l5,%l6		! (delay slot) %l6 = ~(pending low word)
	st %l6,[%o1]
	b c11i_bci_wrapup
	add %o1,4,%o1		! (delay slot)
3:	! dst ptr = ....110
	add %g0,16,%o3
	b c11i_bci_567
	add %g0,16,%o4		! (delay slot)
4:	! dst ptr = ....010
	add %g0,16,%o3
	!b c11i_bci_123
	add %g0,16,%o4		! falls through into c11i_bci_123
c11i_bci_123:
	! dst ptr ends in 001, 010, 011
	! %o3, %o4 hold shift counts; %o5 holds dst ptr low bits
	! First store 3, 2 or 1 leading bytes to reach word alignment, then
	! one word to reach doubleword alignment, then loop with std.  The
	! bytes of each source doubleword that do not fit are carried over
	! in %l6 (left-justified) to the next store.
	sub %o2,8,%o2
	ldd [%o0],%l4
	add %o0,8,%o0
	not %l4,%l4
	not %l5,%l5
	subcc %o5,2,%g0
	be 2f			! low bits 010: two leading bytes
	srl %l4,%o4,%l3		! (delay slot) %l3 = leading bytes, right-justified
	bl 1f			! low bits 001: three leading bytes
	nop
3:	stb %l3,[%o1]		! low bits 011: one leading byte
	b 9f
	inc %o1			! (delay slot)
2:	sth %l3,[%o1]
	b 9f
	add %o1,2,%o1		! (delay slot)
1:	srl %l3,16,%l2
	sth %l3,[%o1+1]		! %o1+1 is halfword aligned here
	stb %l2,[%o1]
	add %o1,3,%o1
9:	sll %l4,%o3,%l3		! word = rest of high word | top of low word
	srl %l5,%o4,%l2
	or %l2,%l3,%l2
	st %l2,[%o1]
	b 9f
	add %o1,4,%o1		! (delay slot) dst is now doubleword aligned
2:	ldd [%o0],%l4
	add %o0,8,%o0
	not %l4,%l4
	not %l5,%l5
	srl %l4,%o4,%l3
	or %l6,%l3,%l6		! %l6 = carried bytes | top of new high word
	sll %l4,%o3,%l7
	srl %l5,%o4,%l3
	or %l7,%l3,%l7		! %l7 = rest of high word | top of low word
	std %l6,[%o1]
	add %o1,8,%o1
9:	subcc %o2,8,%o2
	bge 2b
	sll %l5,%o3,%l6		! (delay slot) carry = rest of low word, left-justified
	! Flush the carried bytes: a halfword if bit 1 of %o5 is set, then a
	! byte if bit 0 of %o5 is set.
	andcc %o5,2,%g0
	be 1f
	andcc %o5,1,%g0		! (delay slot)
	srl %l6,16,%l3
	sth %l3,[%o1]
	add %o1,2,%o1
	sll %l6,16,%l6
1:	be 1f
	nop
	srl %l6,24,%l3
	stb %l3,[%o1]
	inc %o1
1:	b c11i_bci_wrapup
	nop
c11i_bci_567:
	! dst ptr ends in 101, 110, 111
	! %o3, %o4 hold shift counts; %o5 holds dst ptr low bits
	! Here the 3, 2 or 1 leading bytes reach doubleword alignment
	! directly.  Carried data is kept in %l6 (a full word) and %l7
	! (partial, left-justified).
	sub %o2,8,%o2
	ldd [%o0],%l4
	add %o0,8,%o0
	not %l4,%l4
	not %l5,%l5
	subcc %o5,6,%g0
	be 6f			! low bits 110: two leading bytes
	srl %l4,%o4,%l3		! (delay slot) %l3 = leading bytes, right-justified
	bl 5f			! low bits 101: three leading bytes
	nop
7:	stb %l3,[%o1]		! low bits 111: one leading byte
	b 1f
	inc %o1			! (delay slot)
6:	sth %l3,[%o1]
	b 1f
	add %o1,2,%o1		! (delay slot)
5:	srl %l3,16,%l2
	stb %l2,[%o1]
	sth %l3,[%o1+1]		! %o1+1 is halfword aligned here
	b 1f
	add %o1,3,%o1		! (delay slot)
2:	ldd [%o0],%l4
	add %o0,8,%o0
	not %l4,%l4
	not %l5,%l5
	srl %l4,%o4,%l3
	or %l3,%l7,%l7		! complete the partial word with new data
	std %l6,[%o1]
	add %o1,8,%o1
1:	subcc %o2,8,%o2
	sll %l4,%o3,%l6
	srl %l5,%o4,%l3
	or %l6,%l3,%l6		! %l6 = next full word
	bge 2b
	sll %l5,%o3,%l7		! (delay slot) %l7 = leftover bytes, left-justified
	st %l6,[%o1]		! flush the pending full word
	andcc %o5,2,%g0
	add %o1,4,%o1
	be 1f
	andcc %o5,1,%g0		! (delay slot)
	srl %l7,16,%l3
	sth %l3,[%o1]
	add %o1,2,%o1
	sll %l7,16,%l7
1:	be 1f
	nop
	srl %l7,24,%l3
	stb %l3,[%o1]
	inc %o1
1:
	!b c11i_bci_wrapup
	!nop
c11i_bci_wrapup:
	! now %o0 and %o1 are correct and we have 8+%o2 more bytes to move
	addcc %o2,7,%o2 ! add %o2,8,%o2; deccc %o2
	bl 2f			! nothing left on this line
	nop
1:	ldub [%o0],%l7
	inc %o0
	not %l7,%l7
	stb %l7,[%o1]
	deccc %o2
	bge 1b
	inc %o1			! (delay slot)
2:	! end bcopy_inverted
	add %l0,%i4,%l0		! next source line
	deccc %i3
	bg c11i_loop
	add %l1,%i5,%l1		! (delay slot) next destination line
	ret
	restore

	.global gl_point_fb1
	! Shared early exit for the bounds checks of gl_point_fb1 ("1b").
1:	retl
	nop
! gl_point_fb1 -- leaf routine.
! In:  %o0 = descriptor pointer, %o1 = x, %o2 = y.
! Sets the bit for pixel (x,y): byte offset y*stride + (x>>3), bit mask
! 0x80 >> (x&7).  Returns without touching memory when x or y is negative,
! x >= [%o0+8] or y >= [%o0+12].  The byte is written back only if setting
! the bit changes it.  Uses %o0-%o5 only.
gl_point_fb1:
	orcc %o1,%o2,%g0
	bl 1b			! x|y negative: at least one coordinate < 0
	ld [%o0+8],%o3		! (delay slot) %o3 = xsize
	subcc %o1,%o3,%g0
	bge 1b			! x >= xsize
	ld [%o0+12],%o3		! (delay slot) %o3 = ysize
	subcc %o2,%o3,%g0
	bge 1b			! y >= ysize
	ld [%o0+4],%o3		! (delay slot) %o3 = stride
	! multiply %o4 = %o3 * %o2; %o2 and %o3 are destroyed
	! (shift-and-add: for each set bit of %o3 add the shifted %o2)
	mov %g0,%o4
2:	andcc %o3,1,%g0
	be 1f
	srl %o3,1,%o3		! (delay slot)
	add %o4,%o2,%o4
1:	tst %o3
	bne 2b
	sll %o2,1,%o2		! (delay slot)
	! end multiply
	! %o2, %o3 now dead
	ld [%o0],%o0		! %o0 = bitmap base address
	srl %o1,3,%o2
	add %o2,%o4,%o2		! %o2 = byte offset of the pixel
	and %o1,7,%o1
	set 0x80,%o3
	srl %o3,%o1,%o3		! %o3 = bit mask within the byte
	! %o1 now dead
	ldub [%o0+%o2],%o5
	or %o3,%o5,%o3
	cmp %o3,%o5
	bne,a 1f		! annulled branch: the store below executes
	stb %o3,[%o0+%o2]	! only when the byte actually changed
1:	retl
	nop

	.global gl_line_fb1
	!
! void gl_line_fb1(struct gl_fb1 *fb, int x0, int y0, int x1, int y1)
!
! Draws a line from (x0,y0) to (x1,y1) by calling gl_point_fb1 once per
! step along the major axis, using an integer error accumulator to decide
! when to step along the minor axis.
! In (after save): %i0 = fb, %i1 = x0, %i2 = y0, %i3 = x1, %i4 = y1.
! Working registers:
!   %i1/%i2 = current x/y          %i3/%i4 = dx/dy (made non-negative)
!   %l5 = points left to plot      %l6 = error accumulator
!   %l7 = minor-axis step (+1/-1)  %l0/%l1 = temporaries
! gl_point_fb1 is a leaf routine that only uses %o0-%o5, so the %i and %l
! registers survive the calls.
gl_line_fb1:
	save %sp,-96,%sp
	subcc %i3,%i1,%i3	! %i3 = dx = x1 - x0
	bge 1f
	mov %i3,%l0		! (delay slot) %l0 = dx
	neg %i3,%l0		! dx < 0: %l0 = |dx|
1:	subcc %i4,%i2,%i4	! %i4 = dy = y1 - y0
	bge 1f
	mov %i4,%l1		! (delay slot) %l1 = dy
	neg %i4,%l1		! dy < 0: %l1 = |dy|
1:	subcc %l0,%l1,%g0
	bl 1f			! |dx| < |dy|: y is the major axis
	nop

	! ---- x-major: step x by +1, occasionally step y by %l7 ----
	tst %i3
	bge 2f
	tst %i4			! (delay slot) condition codes for dy
	! dx < 0: start from the other end point so that x increases
	add %i1,%i3,%i1		! x = x1
	neg %i3			! dx = -dx
	add %i2,%i4,%i2		! y = y1
	subcc %g0,%i4,%i4	! dy = -dy, condition codes for new dy
2:	bge 2f
	add %g0,1,%l7		! (delay slot) y step = +1
	neg %l7			! dy < 0: y step = -1
	neg %i4			!         dy = |dy|
2:	srl %i3,1,%l6		! error starts at dx/2
	mov %i1,%o1
	mov %i2,%o2
	call gl_point_fb1	! plot the starting point
	mov %i0,%o0		! (delay slot)
	orcc %g0,%i3,%l5	! %l5 = dx = number of further points
	ble 9f
	.empty			! delay slot is the "inc" below; harmless on exit
3:	inc %i1			! x++
	add %l6,%i4,%l6		! error += dy
	subcc %l6,%i3,%l0	! %l0 = error - dx
	bl 2f			! error < dx: no y step
	mov %i1,%o1		! (delay slot) x argument (already stepped)
	add %i2,%l7,%i2		! y += step
	mov %l0,%l6		! error -= dx
2:	mov %i2,%o2		! y argument, taken after the optional step
	call gl_point_fb1
	mov %i0,%o0		! (delay slot)
	deccc %l5
	bg 3b
	nop
9:	ret
	restore

	! ---- y-major: step y by +1, occasionally step x by %l7 ----
1:	tst %i4
	bge 2f
	tst %i3			! (delay slot) condition codes for dx
	! dy < 0: start from the other end point so that y increases
	add %i1,%i3,%i1		! x = x1
	subcc %g0,%i3,%i3	! dx = -dx, condition codes for new dx
	add %i2,%i4,%i2		! y = y1
	neg %i4			! dy = -dy
2:	bge 2f
	add %g0,1,%l7		! (delay slot) x step = +1
	neg %l7			! dx < 0: x step = -1
	neg %i3			!         dx = |dx|
2:	srl %i4,1,%l6		! error starts at dy/2
	mov %i1,%o1
	mov %i2,%o2
	call gl_point_fb1	! plot the starting point
	mov %i0,%o0		! (delay slot)
	orcc %g0,%i4,%l5	! %l5 = dy = number of further points
	ble 9f
	.empty			! delay slot is the "inc" below; harmless on exit
3:	inc %i2			! y++
	add %l6,%i3,%l6		! error += dx
	subcc %l6,%i4,%l0	! %l0 = error - dy
	bl 2f			! error < dy: no x step
	mov %i2,%o2		! (delay slot) y argument (already stepped)
	add %i1,%l7,%i1		! x += step
	mov %l0,%l6		! error -= dy
	! The x argument must be taken AFTER the optional x step (mirroring
	! the x-major loop); taking it in the delay slot above would plot the
	! stale x and make steep lines miss their end point.
2:	mov %i1,%o1
	call gl_point_fb1
	mov %i0,%o0		! (delay slot)
	deccc %l5
	bg 3b
	nop
9:	ret
	restore

	.global gl_copy_fb1_fb8_bit
! gl_copy_fb1_fb8_bit -- for every pixel of the 1-bit source, set (source
! bit is 1) or clear (source bit is 0) the bits given by `bit` in the
! corresponding byte of the 8-bit destination.  The other bits of each
! destination byte are preserved.  The copied area is the smaller of the
! two xsize values by the smaller of the two ysize values.
! Both buffers are accessed with aligned 64-bit loads/stores (ldd/std);
! the destination is read-modified-written one doubleword at a time.
! The C below is the reference the assembly was derived from.
gl_copy_fb1_fb8_bit:
! void gl_copy_fb1_fb8_bit(struct gl_fb1 *f, struct gl_fb8 *t, unsigned char bit)
! {
	save %sp,-104,%sp
! f in %i0
! t in %i1
! bit in %i2
!  unsigned char *fp0;
! fp0 in %i3
!  unsigned char *fp;
! fp in %i4
!  unsigned char *tp0;
! tp0 in %i5
!  unsigned char *tp;
! tp in %l0
!  int xs;
! xs in %l1
!  int ys;
! ys in %l2
!  int x;
! x in %l3
!  int y;
! y in %l4
!  unsigned long long int fbuf;
! fbuf in %l6/l7
!  int nfb;
! nfb in %l5
!  unsigned long long int tbuf;
! tbuf in %o0/o1
!  int tc;
! tc in %o2
! (f->stride in %o3)
	ld [%i0+4],%o3
! (t->stride in %o4)
	ld [%i1+4],%o4
!  xs = (f->xsize < t->xsize) ? f->xsize : t->xsize;
	ld [%i0+8],%l1
	ld [%i1+8],%o5
	subcc %l1,%o5,%g0
	bg,a 1f			! annulled: the mov runs only if f->xsize > t->xsize
	mov %o5,%l1
1:
!  ys = (f->ysize < t->ysize) ? f->ysize : t->ysize;
	ld [%i0+12],%l2
	ld [%i1+12],%o5
	subcc %l2,%o5,%g0
	bg,a 1f			! annulled: the mov runs only if f->ysize > t->ysize
	mov %o5,%l2
1:
!  fp = f->vram;
	ld [%i0],%i4
! f is now dead (f->stride in %o3); %i0 is released for scratch use
!  tp = t->vram;
	ld [%i1],%l0
! t is now dead (t->stride in %o4); %i1 is released for scratch use
!  for (y=ys;y>0;y--)
	add %l2,1,%l4		! y = ys+1; the loop head pre-decrements
! ys is now dead; %l2 is reclaimed to hold 0x80000000
	sethi %hi(0x80000000),%l2
c18b_y_loop:
	deccc %l4
	ble c18b_y_done
	nop
!   { fp0 = fp;
	mov %i4,%i3
!     tp0 = tp;
	mov %l0,%i5
!     fbuf = *(unsigned long long int *)(fp-(7&(int)fp));
	and %i4,7,%i0
	sub %i4,%i0,%o5
	ldd [%o5],%l6
! [note %i0 holds 7&(int)fp]
!     nfb = 8 * (8 - (7 & (int)fp));
! [note %i0 holds 7&(int)fp]
	set 8,%l5
	sub %l5,%i0,%i0
	sll %i0,3,%l5
! [note %i0 holds 8-(7&(int)fp)]
!     fp += 8 - (7 & (int)fp);
! [note %i0 holds 8-(7&(int)fp)]
	add %i4,%i0,%i4
!     fbuf <<= 64 - nfb;
! 64-bit shift of the %l6/%l7 pair by a multiple of 8 in 0..56.
	set 64,%i0
	sub %i0,%l5,%i0
	andcc %i0,32,%g0
	be 1f			! shift < 32
	andcc %i0,31,%i0	! (delay slot) %i0 = shift mod 32, sets Z
	sll %l7,%i0,%l6		! shift >= 32: high = low << (shift-32)
	b 2f
	mov %g0,%l7		! (delay slot) low = 0
1:	be 2f			! shift == 0: nothing to do
	.empty			! delay slot is the sll below (a shift by 0 then)
	sll %l6,%i0,%l6
	set 32,%i1
	sub %i1,%i0,%i1
	srl %l7,%i1,%o5
	or %l6,%o5,%l6
	sll %l7,%i0,%l7
2:
!     tc = 7 & (int)tp;
	and %l0,7,%o2
!     tp -= tc;
	sub %l0,%o2,%l0
!     tbuf = *(unsigned long long int *)tp;
	ldd [%l0],%o0
!     for (x=xs;x>0;x--)
	add %l1,1,%l3		! x = xs+1; the loop head pre-decrements
c18b_x_loop:
	deccc %l3
	ble c18b_x_done
	nop
!      { if (nfb < 1)
	tst %l5
	bg 1f
	nop
!         { fbuf = *(unsigned long long int *)fp;
	ldd [%i4],%l6
!           fp += 8;
	add %i4,8,%i4
!           nfb = 64;
	set 64,%l5
!         }
1:
!        if (fbuf & 0x8000000000000000ULL)
!           tbuf |= ((unsigned long long int)bit) << ((7-tc) * 8);
!        else
!           tbuf &= ~(((unsigned long long int)bit) << ((7-tc) * 8));
	set 7,%o5
	sub %o5,%o2,%o5
	sll %o5,3,%o5
! %o5 holds shift count, (7-tc)*8
! note we know that (a) bit is a one-byte value and (b) the shift
!  count is a multiple of 8.  (This simplifies the <32 case.)
! %i0/%i1 receive the 64-bit value bit << shift (high/low word).
	andcc %o5,32,%g0
	be 1f
	andcc %o5,31,%o5	! (delay slot)
	sll %i2,%o5,%i0		! shift >= 32: mask lives in the high word
	b 2f
	mov %g0,%i1		! (delay slot)
1:	sll %i2,%o5,%i1		! shift < 32: mask lives in the low word
	mov %g0,%i0
2:	andcc %l6,%l2,%g0	! test the top bit of fbuf
	be 1f
	inc %o2 ! tc++, moved from below
	or %o0,%i0,%o0
	b 2f
	or %o1,%i1,%o1		! (delay slot)
1:	andn %o0,%i0,%o0
	andn %o1,%i1,%o1
2:
!        tc ++;
! moved to delay slot above
!        if (tc >= 8)
	subcc %o2,8,%g0
	bl 1f
	nop
!         { *(unsigned long long int *)tp = tbuf;
	std %o0,[%l0]
!           tp += 8;
	add %l0,8,%l0
!           tbuf = *(unsigned long long int *)tp;
	ldd [%l0],%o0
!           tc = 0;
	mov %g0,%o2
!         }
1:
!        fbuf <<= 1;
	sll %l6,1,%l6
	srl %l7,31,%o5
	or %l6,%o5,%l6
	sll %l7,1,%l7
!        nfb --;
! moved to delay slot below
!      }
	b c18b_x_loop
	dec %l5 ! nfb--, moved from above
c18b_x_done:
!     if (tc > 0) *(unsigned long long int *)tp = tbuf;
	tst %o2
	ble 1f
	nop
	std %o0,[%l0]
1:
!     fp = fp0 + f->stride;
	add %i3,%o3,%i4
!     tp = tp0 + t->stride;
! moved to delay slot below
!   }
	b c18b_y_loop
	add %i5,%o4,%l0 ! tp=tp0+t->stride, moved from above
c18b_y_done:
! }
	ret
	restore

	.global gl_copy_fb1_fb8
! gl_copy_fb1_fb8 -- expand the 1-bit source into the 8-bit destination:
! each destination byte becomes 255 where the source bit is 1 and 0 where
! it is 0.  The copied area is the smaller of the two xsize values by the
! smaller of the two ysize values.
! tbuf is built by shifting new bytes in from the right.  When a row does
! not start on a doubleword boundary, the tc destination bytes that
! precede it are preloaded into tbuf so that the first std writes them
! back unchanged; at the end of a row the untouched trailing bytes are
! merged in from memory before the final std.
! The C below is the reference the assembly was derived from.
gl_copy_fb1_fb8:
! void gl_copy_fb1_fb8(struct gl_fb1 *f, struct gl_fb8 *t)
! {
	save %sp,-104,%sp
! f in %i0
! t in %i1
! %i2 available as scratch
!  unsigned char *fp0;
! fp0 in %i3
!  unsigned char *fp;
! fp in %i4
!  unsigned char *tp0;
! tp0 in %i5
!  unsigned char *tp;
! tp in %l0
!  int xs;
! xs in %l1
!  int ys;
! ys in %l2
!  int x;
! x in %l3
!  int y;
! y in %l4
!  unsigned long long int fbuf;
! fbuf in %l6/l7
!  int nfb;
! nfb in %l5
!  unsigned long long int tbuf;
! tbuf in %o0/o1
!  int tc;
! tc in %o2
! (f->stride in %o3)
	ld [%i0+4],%o3
! (t->stride in %o4)
	ld [%i1+4],%o4
!  xs = (f->xsize < t->xsize) ? f->xsize : t->xsize;
	ld [%i0+8],%l1
	ld [%i1+8],%o5
	subcc %l1,%o5,%g0
	bg,a 1f			! annulled: the mov runs only if f->xsize > t->xsize
	mov %o5,%l1
1:
!  ys = (f->ysize < t->ysize) ? f->ysize : t->ysize;
	ld [%i0+12],%l2
	ld [%i1+12],%o5
	subcc %l2,%o5,%g0
	bg,a 1f			! annulled: the mov runs only if f->ysize > t->ysize
	mov %o5,%l2
1:
!  fp = f->vram;
	ld [%i0],%i4
! f is now dead (f->stride in %o3); %i0 is released for scratch use
!  tp = t->vram;
	ld [%i1],%l0
! t is now dead (t->stride in %o4); %i1 is released for scratch use
!  for (y=ys;y>0;y--)
	add %l2,1,%l4		! y = ys+1; the loop head pre-decrements
! ys is now dead; %l2 is reclaimed to hold 0x80000000
	sethi %hi(0x80000000),%l2
c18_y_loop:
	deccc %l4
	ble c18_y_done
	nop
!   { fp0 = fp;
	mov %i4,%i3
!     tp0 = tp;
	mov %l0,%i5
!     fbuf = *(unsigned long long int *)(fp-(7&(int)fp));
	and %i4,7,%i0
	sub %i4,%i0,%o5
	ldd [%o5],%l6
! [note %i0 holds 7&(int)fp]
!     nfb = 8 * (8 - (7 & (int)fp));
! [note %i0 holds 7&(int)fp]
	set 8,%l5
	sub %l5,%i0,%i0
	sll %i0,3,%l5
! [note %i0 holds 8-(7&(int)fp)]
!     fp += 8 - (7 & (int)fp);
! [note %i0 holds 8-(7&(int)fp)]
	add %i4,%i0,%i4
!     fbuf <<= 64 - nfb;
! 64-bit shift of the %l6/%l7 pair by a multiple of 8 in 0..56.
	set 64,%i0
	sub %i0,%l5,%i0
	andcc %i0,32,%g0
	be 1f			! shift < 32
	andcc %i0,31,%i0	! (delay slot) %i0 = shift mod 32, sets Z
	sll %l7,%i0,%l6		! shift >= 32: high = low << (shift-32)
	b 2f
	mov %g0,%l7		! (delay slot) low = 0
1:	be 2f			! shift == 0: nothing to do
	.empty			! delay slot is the sll below (a shift by 0 then)
	sll %l6,%i0,%l6
	set 32,%i1
	sub %i1,%i0,%i1
	srl %l7,%i1,%o5
	or %l6,%o5,%l6
	sll %l7,%i0,%l7
2:
!     tc = 7 & (int)tp;
	andcc %l0,7,%o2
!     if (tc > 0)
! Skip the preload when tc == 0.  The shift sequence below is only valid
! for tc != 0, and for tc != 0 the preload is required so that the first
! std of the row rewrites the tc preceding bytes with their old values.
	be 1f
	.empty			! delay slot is the sub below (a no-op when tc == 0)
!      { tp -= tc;
	sub %l0,%o2,%l0
!        tbuf = (*(unsigned long long int *)tp) >> (64 - (8*tc));
! Note that this code sequence knows the shift count is (a) a multiple of 8
!  and (b) neither 0 nor 64.
	ldd [%l0],%o0
	sll %o2,3,%i0
	set 64,%i1
	sub %i1,%i0,%i1		! %i1 = 64 - 8*tc
	andcc %i1,32,%g0
	beq 2f			! shift < 32
	and %i1,31,%i1		! (delay slot)
	srl %o0,%i1,%o1		! shift >= 32: low = high >> (shift-32)
	b 3f
	mov %g0,%o0		! (delay slot) high = 0
2:	set 32,%o5
	sub %o5,%i1,%o5
	sll %o0,%o5,%i0		! bits moving from the high into the low word
	srl %o0,%i1,%o0
	srl %o1,%i1,%o1
	or %o1,%i0,%o1
3:
!      }
1:
!     for (x=xs;x>0;x--)
	add %l1,1,%l3		! x = xs+1; the loop head pre-decrements
c18_x_loop:
	deccc %l3
	ble c18_x_done
	nop
!      { if (nfb < 1)
	tst %l5
	bg 1f
	nop
!         { fbuf = *(unsigned long long int *)fp;
	ldd [%i4],%l6
!           fp += 8;
	add %i4,8,%i4
!           nfb = 64;
	set 64,%l5
!         }
1:
!        if (fbuf & 0x8000000000000000ULL)
!         { tbuf = (tbuf << 8) | 255;
!         }
!        else
!         { tbuf <<= 8;
!         }
	andcc %l6,%l2,%g0	! test the top bit of fbuf (none of the
	sll %o0,8,%o0		!  shifts/ors below alter the condition codes)
	srl %o1,24,%o5
	sll %o1,8,%o1
	beq 1f
	or %o0,%o5,%o0		! (delay slot) finish the 64-bit shift
	or %o1,255,%o1		! source bit set: new byte = 255
1:
!        tc ++;
	inc %o2
!        if (tc >= 8)
	subcc %o2,8,%g0
	bl 1f
	nop
!         { *(unsigned long long int *)tp = tbuf;
	std %o0,[%l0]
!           tp += 8;
	add %l0,8,%l0
!           tc = 0;
	mov %g0,%o2
!         }
1:
!        fbuf <<= 1;
	sll %l6,1,%l6
	srl %l7,31,%o5
	or %l6,%o5,%l6
	sll %l7,1,%l7
!        nfb --;
! moved to delay slot below
!      }
	b c18_x_loop
	dec %l5 ! nfb--, moved from above
c18_x_done:
!     if (tc > 0)
	tst %o2
	ble 1f
!      { tbuf = (tbuf << (8 * (8-tc))) | (((~0ULL) >> (8 * tc)) & *(unsigned long long int *)tp);
! Note that this code sequence knows the shift count is (a) a multiple of 8
!  and (b) neither 0 nor 64.  This sequence can probably be improved somewhat.
	set 8,%o5		! (delay slot of the ble; %o5 is scratch)
	sub %o5,%o2,%o5
	sll %o5,3,%o5		! %o5 = 8*(8-tc)
	andcc %o5,32,%g0
	beq 2f			! shift < 32
	and %o5,31,%o5		! (delay slot)
	sll %o1,%o5,%o0		! shift >= 32: high = low << (shift-32)
	b 3f
	mov %g0,%o1		! (delay slot) low = 0
2:	set 32,%i0
	sub %i0,%o5,%i0
	sll %o0,%o5,%o0
	srl %o1,%i0,%i1
	sll %o1,%o5,%o1
	or %o0,%i1,%o0
3:	ldd [%l0],%i0		! %i0/%i1 = current destination doubleword
	sll %o2,3,%o5		! %o5 = 8*tc
	andcc %o5,32,%g0
	not %g0,%i2		! %i2 = all ones
	and %o5,31,%o5
	beq 2f			! 8*tc < 32
	srl %i2,%o5,%i2		! (delay slot) %i2 = ~0 >> (8*tc mod 32)
	b 3f			! 8*tc >= 32: keep only part of the low word
	and %i1,%i2,%i1		! (delay slot)
2:	and %i0,%i2,%i0		! 8*tc < 32: part of the high word, all of low
	or %o0,%i0,%o0
3:	or %o1,%i1,%o1
!        *(unsigned long long int *)tp = tbuf;
	std %o0,[%l0]
!      }
1:
!     fp = fp0 + f->stride;
	add %i3,%o3,%i4
!     tp = tp0 + t->stride;
! moved to delay slot below
!   }
	b c18_y_loop
	add %i5,%o4,%l0 ! tp=tp0+t->stride, moved from above
c18_y_done:
! }
	ret
	restore