; File: CALCMAND.S

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; The X numbers are shifted right by 13 before use
; These numbers are in units of 1/8192

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; The upper commented out numbers do this SciAm cover (almost)
; XSTART equ $fffffa86
; XINC equ 1

; XSTART equ ((-1)<<13)
; XINC equ (1<<9)

; XSTART equ 0
; XINC equ (1<<7)

; XSTART equ ((-2)<<13)
; XINC equ ((10<<11)/WIDTH)

; The Y numbers are shifted right by 13 before use
; These numbers are in units of 1/8192

; The upper commented out numbers do this SciAm cover
; YSTART equ $ffffde9a
; YINC equ 1

; YSTART equ ((-1)<<13)
; YINC equ (1<<9)

; YSTART equ ((-2)<<13)
; YINC equ (1<<7)

; YSTART equ ((-19)<<9)
; YINC equ ((6<<12)/WIDTH)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; The most important thing in a Mandelbrot program is the inner loop.
; The most important thing in a GPU program is to keep as much as possible
; in registers, and as much of the rest as possible in internal RAM.
; First the inner loop:
; In order to handle both the Mandelbrot and Julia sets we make no assumptions
; about initial conditions.
; The basic loop is: (given xi, yi, cx, cy)
; temp=xi*yi
; sx=xi*xi
; sy=yi*yi
; yi=temp+temp+cy
; xi=sx-sy+cx
; count+=1
; iterate until count>maxcount or sx+sy>4

; Note that the numbers used here are 3.13 fixed point
; For a Mandelbrot set, xi=yi=0 at the start always

; The loop expects the following registers to hold constants; the code below
; loads them itself with literal values (254 and 4<<13):
; movei #MAXCNT,maxcnt
; movei #FOUR,four

; Address of the parameter block the per-pixel code reads its inputs from.
; As used by the code in this file it holds four longwords:
;   INBUF+0 = cx, INBUF+4 = cy, INBUF+8 = initial xi, INBUF+12 = initial yi
INBUF equ $00f03810
; SEMAPHORE equ $0000bff0

; Number of pixels per row / number of rows rendered
WIDTH equ 640
HEIGHT equ 480

; X origin and per-pixel X step, in units of 1/8192 (13 fractional bits).
; XSTART = -2.0; XINC = 20480/640 = 32, i.e. 1/256 per pixel.
XSTART equ ((-2)<<13)
XINC equ ((10<<11)/WIDTH)

; Y origin and per-row Y step, in units of 1/8192.
; YSTART = -9728, i.e. -1.1875; YINC = 24576/640 (38 if the assembler
; truncates the division - TODO confirm).
; NOTE(review): YINC is divided by WIDTH, not HEIGHT - confirm this is the
; intended vertical scale.
YSTART equ ((-19)<<9)
YINC equ ((6<<12)/WIDTH)

; Switch the assembler to GPU instruction mode; everything below is GPU code.
.gpu

; Register aliases used by the iteration loop
xi .equr R1 ; current x value (also receives xi*xi inside the loop)
yi .equr R2 ; current y value (also receives yi*yi inside the loop)
cx .equr R3 ; constant added to x each iteration (loaded from INBUF+0)
cy .equr R4 ; constant added to y each iteration (loaded from INBUF+4)
sx .equr R5 ; copy of xi*xi
sy .equr R6 ; copy of yi*yi, then sx+sy for the escape test
temp .equr R7 ; xi*yi, then 2*xi*yi+cy
count .equr R8 ; iteration counter for the current pixel
maxcnt .equr R9 ; iteration limit
four .equr R10 ; escape threshold (4.0 in 1/8192 units)
inloop .equr R11 ; address of the "loop" label, for the indirect jump
semaphore .equr R12 ; not used by the active code in this file
inbuf .equr R13 ; walking pointer into the INBUF parameter block

; Registers used without an alias. The left-hand names are the registers of
; the earlier 68000 version of the driver code:
; A1 = R14 (output byte pointer)
; d2 = R18 (row countdown)
; A0 = R19 (scratch address for the INBUF stores)
; d0 = R20 (scratch data for the INBUF stores)
; d1 = R21 (column countdown)

; Aliases used by the row/column driver code
jx .equr R15 ; initial xi written to INBUF+8 (set to 0)
jy .equr R16 ; initial yi written to INBUF+12 (set to 0)
ypos .equr R17 ; y coordinate of the current row
xpos .equr R23 ; x coordinate of the current pixel
rinner .equr R24 ; address of the "inner" label, for the indirect jump
router .equr R25 ; address of the "outer" label, for the indirect jump

;-----------------------------------------------------------------------
; mandGPU / start_mandGPU ... end_mandGPU
;
; Renders a WIDTH x HEIGHT escape-time image entirely on the GPU.
;
; For every pixel the routine
;   1. writes the pixel coordinate (xpos, ypos) and the start value
;      (jx, jy = 0, 0) into the parameter block at INBUF,
;   2. reloads them as cx, cy, xi, yi and iterates
;          temp = xi*yi ; sx = xi*xi ; sy = yi*yi
;          yi   = 2*temp + cy
;          xi   = sx - sy + cx
;      in 3.13 fixed point until count > maxcnt or sx+sy >= 4.0,
;   3. stores the low byte of the iteration count at (R14) and advances
;      R14 by one.
;
; Output : WIDTH*HEIGHT bytes, row by row, starting at $20000.
; Exit   : stores 0 to $00f02114, which stops the GPU.
; Uses   : R1-R11, R13-R21, R23-R26, R30, R31.
;
; The code is assembled to run at $f03000. mandGPU is defined before the
; .org, start_mandGPU after it.
;-----------------------------------------------------------------------
mandGPU::
        .org    $f03000
; The row/column driver below was originally 68000 code (entry "Mandle:").
start_mandGPU::
        ; ---- one-time setup ------------------------------------------
        moveq   #0,jx                   ; start value xi = 0
        moveq   #0,jy                   ; start value yi = 0

        movei   #$20000,R14             ; output byte pointer

        movei   #YSTART,ypos            ; Initialize y position
        movei   #HEIGHT-1,R18           ; row countdown (HEIGHT passes)

        movei   #inner,rinner           ; indirect jump targets
        movei   #outer,router
        movei   #loop,inloop

        ; Loop-invariant constants: loaded once here instead of once per
        ; pixel. Nothing below writes to these registers.
        movei   #(4<<13),four           ; 4.0 in 1/8192 units
        movei   #254,maxcnt             ; loop leaves once count > 254

        ; The start value never changes, so INBUF+8 / INBUF+12 are
        ; written only once.
        movei   #INBUF+8,R19
        move    jx,R20
        store   R20,(R19)               ; INBUF+8  = initial xi
        addq    #4,R19
        move    jy,R20
        store   R20,(R19)               ; INBUF+12 = initial yi

        ; ---- per row -------------------------------------------------
outer:
        movei   #INBUF+4,R19
        move    ypos,R20
        store   R20,(R19)               ; INBUF+4 = cy for this row

        movei   #WIDTH-1,R21            ; column countdown (WIDTH passes)
        movei   #XSTART,xpos            ; Initialize x position

        ; ---- per pixel -----------------------------------------------
inner:
        movei   #INBUF,R19
        move    xpos,R20
        store   R20,(R19)               ; INBUF+0 = cx for this pixel
; End of the former 68000 driver part; the original GPU entry point was here.

        ; movei #$0000bff0,semaphore
        movei   #INBUF,inbuf            ; inbuf walks the parameter block

        xor     count,count             ; count = 0

        load    (inbuf),cx
        addq    #4,inbuf

        load    (inbuf),cy
        addq    #4,inbuf

        load    (inbuf),xi
        addq    #4,inbuf

        load    (inbuf),yi
        addq    #4,inbuf

        ; ---- iteration loop ------------------------------------------
loop:
        move    xi,temp
        imult   yi,temp                 ; temp=xi*yi

        imult   xi,xi                   ; xi=xi*xi

        imult   yi,yi                   ; yi=yi*yi

        sharq   #13,xi                  ; normalize all mult results
        sharq   #13,temp
        sharq   #13,yi

        ; The following code has been interleaved

        add     temp,temp               ; temp=temp+temp

        move    yi,sy                   ; sy=yi*yi

        add     cy,temp                 ; temp=temp+temp+cy

        move    xi,sx                   ; sx=xi*xi

        move    temp,yi                 ; yi=temp+temp+cy

        sub     sy,xi                   ; xi=sx-sy

        add     cx,xi                   ; xi=sx-sy+cx

        addq    #1,count
        cmp     count,maxcnt            ; flags from maxcnt-count

        jr      MI,noloop               ; MI: count > maxcnt, give up
        ; No nop needed: the add below sits in the jr delay slot, and sy
        ; is not read again once the loop has been left.

        add     sx,sy                   ; sy = sx+sy
        cmp     sy,four                 ; flags from four-(sx+sy)

        jr      EQ,noloop               ; sx+sy == 4 : escaped
        nop
        jump    CC,(inloop)             ; no borrow: sx+sy < 4, iterate again
        nop

        ; ---- pixel finished ------------------------------------------
noloop:
        ; store count,(semaphore)
        ; Back in the former 68000 driver part.
        storeb  count,(R14)             ; one byte per pixel
        addq    #1,R14

        movei   #XINC,R26
        add     R26,xpos                ; next column

        subq    #1,R21
        jump    PL,(rinner)             ; still >= 0: more pixels in this row
        nop

        movei   #YINC,R26
        add     R26,ypos                ; next row

        subq    #1,R18
        jump    PL,(router)             ; still >= 0: more rows
        nop
        nop
; End of the former 68000 driver part.

        ; NOTE: This halts the GPU
        movei   #0,R30
        movei   #$00f02114,R31
        store   R30,(R31)

        ; Padding after the store that halts the GPU.
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
end_mandGPU::