;   "fx_mmx.asm"   - MMX transparency routines
;
;   DDS - Dureks DemoSystem
;   Copyright (C)2001 dureks
;
;   This source code is licensed under the GNU GPL.
;   See the GNU General Public License for more details.

;----- Includes -----
%include "fx_trans.inc"


;----- Global functions -----
GLOBAL _mmx_add_pixels_32
GLOBAL _mmx_add_pixels_grade_32
GLOBAL _mmx_add_pixels_16
GLOBAL _mmx_add_pixels_grade_16

GLOBAL _mmx_sub_pixels_32
GLOBAL _mmx_sub_pixels_16


;----- Data -----
[SECTION .data]
; 8-byte scratch slot. The *_grade_* routines fill it with the grade byte
; (rep stosb) and then load the whole quadword into an MMX register.
filler:     times 2 dd 0

; RGB565 field masks, repeated for each of the four 16-bit pixels that
; fit in one quadword.
RMASK_565:  times 2 dd 0xF800F800       ; red,   bits 11-15 of every word
GMASK_565:  times 2 dd 0x07E007E0       ; green, bits  5-10 of every word
BMASK_565:  times 2 dd 0x001F001F       ; blue,  bits  0-4  of every word

;----- Code -----
[SECTION .text]


; MMX add pixels, 32bpp
; ---------------------
; Adds the source block onto the destination block with per-byte unsigned
; saturation (paddusb):  dest = min(dest + src, 255) for every byte.
;
; In:   pointer to the parameter structure (tr_* offsets, presumably from
;       fx_trans.inc): in EAX when REGISTER_CALLING is defined, otherwise
;       the first stack argument.
; Out:  nothing. All general-purpose registers are preserved
;       (push ebp + pushad / popad + pop ebp); MMX state is released (emms).
_mmx_add_pixels_32:
        push    ebp
%ifdef REGISTER_CALLING
        mov     ebp, eax
%else
        mov     ebp, [esp+8]                    ; skip saved ebp + return address
%endif
        pushad

        ; esi/edi are biased by -8 so that the 1-based, down-counting pair
        ; counter can be used directly as a scaled index.
        mov     esi, DWORD [ebp+tr_src]
        mov     edi, DWORD [ebp+tr_dest]
        mov     edx, DWORD [ebp+tr_height]      ; edx = rows left
        sub     esi, 8
        sub     edi, 8

.row:
        mov     ecx, DWORD [ebp+tr_width]
        shr     ecx, 1                          ; ecx = number of pixel pairs
        jz      .odd_pixel

        ; one quadword (two pixels) per iteration, last pair first
.pair:
        movq    mm0, QWORD [esi+ecx*8]
        movq    mm1, QWORD [edi+ecx*8]
        paddusb mm0, mm1
        movq    QWORD [edi+ecx*8], mm0
        dec     ecx
        jnz     .pair

.odd_pixel:
        mov     ecx, DWORD [ebp+tr_width]
        test    ecx, 0x1
        jz      .advance
        ; Odd width: blend the leftover pixel at index width-1. The +4
        ; displacement together with the -8 bias yields (width-1)*4.
        movd    mm0, DWORD [esi+ecx*4+4]
        movd    mm1, DWORD [edi+ecx*4+4]
        paddusb mm0, mm1
        movd    DWORD [edi+ecx*4+4], mm0

.advance:
        ; step both pointers to the next row
        add     esi, DWORD [ebp+tr_srcnl]
        add     edi, DWORD [ebp+tr_destnl]
        dec     edx
        jnz     .row

        emms
        popad
        pop     ebp
        ret
	
	
; MMX add pixels with grade, 32bpp
; --------------------------------
; Like the plain 32bpp add, but the source is first darkened by the grade
; byte, all with per-byte unsigned saturation:
;     dest = min(dest + max(src - grade, 0), 255)   for every byte.
;
; In:   pointer to the parameter structure (tr_* offsets, presumably from
;       fx_trans.inc): in EAX when REGISTER_CALLING is defined, otherwise
;       the first stack argument.
; Out:  nothing. All general-purpose registers are preserved; MMX state is
;       released (emms).
; Note: the grade is broadcast through the static 'filler' quadword, so the
;       routine writes to shared data on every call. rep stosb stores
;       upward only while the direction flag is clear.
_mmx_add_pixels_grade_32:
        push    ebp
%ifdef REGISTER_CALLING
        mov     ebp, eax
%else
        mov     ebp, [esp+8]                    ; skip saved ebp + return address
%endif
        pushad

        ; mm2 = grade byte replicated into all eight byte lanes
        mov     edi, filler
        mov     al, BYTE [ebp+tr_grade]
        mov     ecx, 8
        rep stosb
        movq    mm2, QWORD [filler]

        ; esi/edi are biased by -8 so that the 1-based, down-counting pair
        ; counter can be used directly as a scaled index.
        mov     edx, DWORD [ebp+tr_height]      ; edx = rows left
        mov     esi, DWORD [ebp+tr_src]
        mov     edi, DWORD [ebp+tr_dest]
        sub     esi, 8
        sub     edi, 8

.line_loop:
        mov     ecx, DWORD [ebp+tr_width]
        shr     ecx, 1                          ; ecx = number of pixel pairs
        jz      .single_pixel

        ; two pixels (one quadword) per iteration, last pair first
.pixel_loop:
        movq    mm0, QWORD [esi+ecx*8]
        movq    mm1, QWORD [edi+ecx*8]
        psubusb mm0, mm2                        ; src - grade, floored at 0
        paddusb mm0, mm1                        ; + dest, capped at 255
        movq    QWORD [edi+ecx*8], mm0

        dec     ecx
        jnz     .pixel_loop

        ; odd width: blend the single leftover pixel
.single_pixel:
        mov     ecx, DWORD [ebp+tr_width]
        test    ecx, 0x1
        jz      .next_line
        ; ecx = width+1 so that, with the -8 bias, the address is
        ; base + (width-1)*4, i.e. the last pixel of the row. ecx must NOT be
        ; reloaded from tr_width here: that would address pixel width-2
        ; (already blended by the pair loop) and, for width == 1, 4 bytes
        ; before the start of the row.
        inc     ecx
        movd    mm0, DWORD [esi+ecx*4]
        movd    mm1, DWORD [edi+ecx*4]
        psubusb mm0, mm2
        paddusb mm0, mm1
        movd    DWORD [edi+ecx*4], mm0

        ; go to next line
.next_line:
        add     esi, DWORD [ebp+tr_srcnl]
        add     edi, DWORD [ebp+tr_destnl]
        dec     edx
        jnz     .line_loop

        emms
        popad
        pop     ebp
        ret


; MMX add pixels, 16bpp
; ---------------------
; Saturating add of RGB565 pixels: each colour field of the source is added
; to the matching field of the destination and clamped to the field's
; maximum (31 for red/blue, 63 for green).
;
; In:   pointer to the parameter structure (tr_* offsets, presumably from
;       fx_trans.inc): in EAX when REGISTER_CALLING is defined, otherwise
;       the first stack argument.
; Out:  nothing. All general-purpose registers are preserved; MMX state is
;       released (emms).
; Known limitation: pixels are processed four at a time and the remaining
;       (tr_width mod 4) pixels of each row are left untouched, so rows
;       narrower than 4 pixels are not blended at all.
_mmx_add_pixels_16:
        push    ebp
%ifdef REGISTER_CALLING
        mov     ebp, eax
%else
        mov     ebp, [esp+8]                    ; skip saved ebp + return address
%endif
        pushad

        ; esi/edi are biased by -8 so that the 1-based, down-counting quad
        ; counter can be used directly as a scaled index.
        mov     edx, DWORD [ebp+tr_height]      ; edx = rows left
        mov     esi, DWORD [ebp+tr_src]
        mov     edi, DWORD [ebp+tr_dest]
        sub     esi, 8
        sub     edi, 8

.line_loop:
        mov     ecx, DWORD [ebp+tr_width]
        shr     ecx, 2                          ; ecx = number of 4-pixel groups
        jz      .next_line                      ; fewer than 4 pixels: nothing drawn

        ; four pixels (one quadword) per iteration, last group first
.pixel_loop:
        movq    mm0, QWORD [esi+ecx*8]
        movq    mm1, QWORD [edi+ecx*8]

        ; split source into colour fields
        movq    mm3, mm0
        movq    mm4, mm0
        pand    mm0, QWORD [RMASK_565]          ; source R
        pand    mm3, QWORD [GMASK_565]          ; source G
        pand    mm4, QWORD [BMASK_565]          ; source B

        ; split destination into colour fields
        movq    mm5, mm1
        movq    mm6, mm1
        pand    mm1, QWORD [RMASK_565]          ; dest R
        pand    mm5, QWORD [GMASK_565]          ; dest G
        pand    mm6, QWORD [BMASK_565]          ; dest B

        ; Left-justify G and B inside the high byte of every word so a
        ; byte-wise saturating add can clamp them (R already sits there).
        ; The masked fields have no bits above them, so the 64-bit shift
        ; cannot carry into the next word.
        psllq   mm3, 5
        psllq   mm5, 5
        psllq   mm4, 11
        psllq   mm6, 11

        ; add each colour field with saturation
        paddusb mm0, mm1
        paddusb mm3, mm5
        paddusb mm4, mm6

        ; move G and B back to their 565 positions
        psrlq   mm3, 5
        psrlq   mm4, 11

        ; A saturated byte is 0xFF, but a field only owns the top 5 (R, B)
        ; or 6 (G) bits of it. Re-mask so the surplus low bits cannot leak
        ; into the neighbouring field, or (for B, via the 64-bit right
        ; shift) into the adjacent pixel.
        pand    mm0, QWORD [RMASK_565]
        pand    mm3, QWORD [GMASK_565]
        pand    mm4, QWORD [BMASK_565]

        ; recombine the colour fields
        por     mm0, mm3
        por     mm0, mm4

        ; draw it
        movq    QWORD [edi+ecx*8], mm0

        dec     ecx
        jnz     .pixel_loop

        ; go to next line
.next_line:
        add     esi, DWORD [ebp+tr_srcnl]
        add     edi, DWORD [ebp+tr_destnl]
        dec     edx
        jnz     .line_loop

        emms
        popad
        pop     ebp
        ret


; MMX add pixels with grade, 16bpp
; --------------------------------
; Saturating add of RGB565 pixels where every source colour field is first
; darkened by the grade (floored at 0) and then added to the destination
; field, clamped to the field's maximum.
;
; In:   pointer to the parameter structure (tr_* offsets, presumably from
;       fx_trans.inc): in EAX when REGISTER_CALLING is defined, otherwise
;       the first stack argument.
; Out:  nothing. All general-purpose registers are preserved; MMX state is
;       released (emms).
; Notes:
;   * The grade is truncated to its top 5 bits, so the 6-bit green channel
;     is graded with only 5-bit precision.
;   * The grade is broadcast through the static 'filler' quadword, so the
;     routine writes to shared data on every call. rep stosb stores upward
;     only while the direction flag is clear.
;   * Known limitation: pixels are processed four at a time and the
;     remaining (tr_width mod 4) pixels of each row are left untouched.
_mmx_add_pixels_grade_16:
        push    ebp
%ifdef REGISTER_CALLING
        mov     ebp, eax
%else
        mov     ebp, [esp+8]                    ; skip saved ebp + return address
%endif
        pushad

        ; mm2 = (grade & 0xF8) replicated into all eight byte lanes
        mov     edi, filler
        mov     al, BYTE [ebp+tr_grade]
        shr     al, 3
        shl     al, 3                           ; drop the low 3 bits
        mov     ecx, 8
        rep stosb
        movq    mm2, QWORD [filler]

        ; esi/edi are biased by -8 so that the 1-based, down-counting quad
        ; counter can be used directly as a scaled index.
        mov     edx, DWORD [ebp+tr_height]      ; edx = rows left
        mov     esi, DWORD [ebp+tr_src]
        mov     edi, DWORD [ebp+tr_dest]
        sub     esi, 8
        sub     edi, 8

.line_loop:
        mov     ecx, DWORD [ebp+tr_width]
        shr     ecx, 2                          ; ecx = number of 4-pixel groups
        jz      .next_line                      ; fewer than 4 pixels: nothing drawn

        ; four pixels (one quadword) per iteration, last group first
.pixel_loop:
        movq    mm0, QWORD [esi+ecx*8]
        movq    mm1, QWORD [edi+ecx*8]

        ; split source into colour fields
        movq    mm3, mm0
        movq    mm4, mm0
        pand    mm0, QWORD [RMASK_565]          ; source R
        pand    mm3, QWORD [GMASK_565]          ; source G
        pand    mm4, QWORD [BMASK_565]          ; source B

        ; split destination into colour fields
        movq    mm5, mm1
        movq    mm6, mm1
        pand    mm1, QWORD [RMASK_565]          ; dest R
        pand    mm5, QWORD [GMASK_565]          ; dest G
        pand    mm6, QWORD [BMASK_565]          ; dest B

        ; Left-justify G and B inside the high byte of every word so the
        ; byte-wise saturating ops can clamp them (R already sits there).
        ; The masked fields have no bits above them, so the 64-bit shift
        ; cannot carry into the next word.
        psllq   mm3, 5
        psllq   mm5, 5
        psllq   mm4, 11
        psllq   mm6, 11

        ; subtract grade from source, then add destination (both saturating)
        psubusb mm0, mm2
        paddusb mm0, mm1
        psubusb mm3, mm2
        paddusb mm3, mm5
        psubusb mm4, mm2
        paddusb mm4, mm6

        ; move G and B back to their 565 positions
        psrlq   mm3, 5
        psrlq   mm4, 11

        ; A saturated byte is 0xFF, but a field only owns the top 5 (R, B)
        ; or 6 (G) bits of it. Re-mask so the surplus low bits cannot leak
        ; into the neighbouring field, or (for B, via the 64-bit right
        ; shift) into the adjacent pixel.
        pand    mm0, QWORD [RMASK_565]
        pand    mm3, QWORD [GMASK_565]
        pand    mm4, QWORD [BMASK_565]

        ; recombine the colour fields
        por     mm0, mm3
        por     mm0, mm4

        ; draw it
        movq    QWORD [edi+ecx*8], mm0

        dec     ecx
        jnz     .pixel_loop

        ; NOTE: the (tr_width mod 4) trailing pixels are not handled; an
        ; earlier attempt at a tail loop was disabled and has been dropped.

        ; go to next line
.next_line:
        add     esi, DWORD [ebp+tr_srcnl]
        add     edi, DWORD [ebp+tr_destnl]
        dec     edx
        jnz     .line_loop

        emms
        popad
        pop     ebp
        ret
	
	
; MMX sub pixels, 32bpp
; ---------------------
; Subtracts the source block from the destination block with per-byte
; unsigned saturation (psubusb):  dest = max(dest - src, 0) for every byte.
;
; In:   pointer to the parameter structure (tr_* offsets, presumably from
;       fx_trans.inc): in EAX when REGISTER_CALLING is defined, otherwise
;       the first stack argument.
; Out:  nothing. All general-purpose registers are preserved
;       (push ebp + pushad / popad + pop ebp); MMX state is released (emms).
_mmx_sub_pixels_32:
        push    ebp
%ifdef REGISTER_CALLING
        mov     ebp, eax
%else
        mov     ebp, [esp+8]                    ; skip saved ebp + return address
%endif
        pushad

        ; esi/edi are biased by -8 so that the 1-based, down-counting pair
        ; counter can be used directly as a scaled index.
        mov     esi, DWORD [ebp+tr_src]
        mov     edi, DWORD [ebp+tr_dest]
        mov     edx, DWORD [ebp+tr_height]      ; edx = rows left
        sub     esi, 8
        sub     edi, 8

.row:
        mov     ecx, DWORD [ebp+tr_width]
        shr     ecx, 1                          ; ecx = number of pixel pairs
        jz      .odd_pixel

        ; one quadword (two pixels) per iteration, last pair first
.pair:
        movq    mm0, QWORD [esi+ecx*8]
        movq    mm1, QWORD [edi+ecx*8]
        psubusb mm1, mm0
        movq    QWORD [edi+ecx*8], mm1
        dec     ecx
        jnz     .pair

.odd_pixel:
        mov     ecx, DWORD [ebp+tr_width]
        test    ecx, 0x1
        jz      .advance
        ; Odd width: handle the leftover pixel at index width-1. The +4
        ; displacement together with the -8 bias yields (width-1)*4.
        movd    mm0, DWORD [esi+ecx*4+4]
        movd    mm1, DWORD [edi+ecx*4+4]
        psubusb mm1, mm0
        movd    DWORD [edi+ecx*4+4], mm1

.advance:
        ; step both pointers to the next row
        add     esi, DWORD [ebp+tr_srcnl]
        add     edi, DWORD [ebp+tr_destnl]
        dec     edx
        jnz     .row

        emms
        popad
        pop     ebp
        ret
	
; MMX sub pixels, 16bpp
; ---------------------
; Saturating subtract of RGB565 pixels: each colour field of the source is
; subtracted from the matching destination field and floored at 0.
;
; In:   pointer to the parameter structure (tr_* offsets, presumably from
;       fx_trans.inc): in EAX when REGISTER_CALLING is defined, otherwise
;       the first stack argument.
; Out:  nothing. All general-purpose registers are preserved; MMX state is
;       released (emms).
; Note: pixels are processed four at a time; the remaining
;       (tr_width mod 4) pixels of each row are left untouched.
_mmx_sub_pixels_16:
        push    ebp
%ifdef REGISTER_CALLING
        mov     ebp, eax
%else
        mov     ebp, [esp+8]                    ; skip saved ebp + return address
%endif
        pushad

        ; esi/edi are biased by -8 so that the 1-based, down-counting quad
        ; counter can be used directly as a scaled index.
        mov     esi, DWORD [ebp+tr_src]
        mov     edi, DWORD [ebp+tr_dest]
        mov     edx, DWORD [ebp+tr_height]      ; edx = rows left
        sub     esi, 8
        sub     edi, 8

.row:
        mov     ecx, DWORD [ebp+tr_width]
        shr     ecx, 2                          ; ecx = number of 4-pixel groups
        jz      .advance

        ; four pixels (one quadword) per iteration, last group first
.quad:
        movq    mm0, QWORD [esi+ecx*8]          ; mm0 = source pixels
        movq    mm1, QWORD [edi+ecx*8]          ; mm1 = destination pixels

        ; split destination into colour fields
        movq    mm5, mm1
        movq    mm6, mm1
        pand    mm1, QWORD [RMASK_565]          ; dest R
        pand    mm5, QWORD [GMASK_565]          ; dest G
        pand    mm6, QWORD [BMASK_565]          ; dest B

        ; split source into colour fields
        movq    mm3, mm0
        movq    mm4, mm0
        pand    mm0, QWORD [RMASK_565]          ; source R
        pand    mm3, QWORD [GMASK_565]          ; source G
        pand    mm4, QWORD [BMASK_565]          ; source B

        ; left-justify G and B in the high byte of every word
        psllq   mm5, 5
        psllq   mm3, 5
        psllq   mm6, 11
        psllq   mm4, 11

        ; dest - source per colour field, floored at 0
        psubusb mm1, mm0
        psubusb mm5, mm3
        psubusb mm6, mm4

        ; move G and B back to their 565 positions
        psrlq   mm5, 5
        psrlq   mm6, 11

        ; recombine the colour fields and store
        por     mm1, mm5
        por     mm1, mm6
        movq    QWORD [edi+ecx*8], mm1

        dec     ecx
        jnz     .quad

.advance:
        ; step both pointers to the next row
        add     esi, DWORD [ebp+tr_srcnl]
        add     edi, DWORD [ebp+tr_destnl]
        dec     edx
        jnz     .row

        emms
        popad
        pop     ebp
        ret