I'm rewriting my library in assembler, although optimization is not a priority right now, I just wanted to give asm a try, and I'm liking it.
Here I post my very first asm routine ever, for anyone who wants to comment, correct, point out some errors, give some advice or anything.
The main idea is to copy a memory chunk from RAM to VRAM, a byte is used for the length so up to 256 bytes can be copied.
There will be anothe function to copy more than 256, but that will have to implement a 2 byte counter, spending more cpu cycles per loop.
A special case is used to be able to copy exactly 256 bytes, which is when the length is set to 0. Do you consider this is useful? it means more bytes, more cpu cycles and more jumps just for one case, which I'm not sure will occur very often.
Note: _param_p0, _param_w0 and _param_b0 are set to the given arguments in a C macro function called by the user which in turn calls this routine. This speeds things a little by using zero page vars instead of an argument stack, and also because I was too lazy to investigate how does cc65 operate that stack.
(recursion and being called from an interrupt won't ocurr for this type of functions, so there's no problem of the zero page vars being corrupted)
here's the code:
Here I post my very first asm routine ever, for anyone who wants to comment, correct, point out some errors, give some advice or anything.
The main idea is to copy a memory chunk from RAM to VRAM, a byte is used for the length so up to 256 bytes can be copied.
There will be anothe function to copy more than 256, but that will have to implement a 2 byte counter, spending more cpu cycles per loop.
A special case is used to be able to copy exactly 256 bytes, which is when the length is set to 0. Do you consider this is useful? it means more bytes, more cpu cycles and more jumps just for one case, which I'm not sure will occur very often.
Note: _param_p0, _param_w0 and _param_b0 are set to the given arguments in a C macro function called by the user which in turn calls this routine. This speeds things a little by using zero page vars instead of an argument stack, and also because I was too lazy to investigate how does cc65 operate that stack.
(recursion and being called from an interrupt won't ocurr for this type of functions, so there's no problem of the zero page vars being corrupted)
here's the code:
Code:
; void _copy_VRAM( byte* source, word dest , byte length )
; Copies memory from source in RAM to dest in VRAM up to length bytes(*)
; [ byte* source ]: pointer in RAM to data to be copied
; [ word dest ]: address in VRAM where data should be copied
; [ byte length ]: how many bytes should be copied
; (*) if length is in range [1..255] copies that many bytes
; else, if length is 0, copies 256 bytes
.importzp _param_p0, _param_w0, _param_b0
.export __copy_VRAM
source = _param_p0 ; 2 bytes
dest = _param_w0 ; 2 bytes
length = _param_b0
.proc __copy_VRAM
ldy #0 ; y used to index source data
ldx #0 ; x used as counter
cpx length
beq @copy_256_bytes ; if length is 0, copy 256 bytes
@loop:
; if counter X reached length, stop
cpx length
beq @end
; skip first compare, to copy 256 bytes
@copy_256_bytes_entry:
; dest's high byte
lda dest+1
sta $2006
; dest's low byte
lda dest
sta $2006
; copy data from source
lda (source),y
sta $2007
; 2 byte increment of dest
inc dest
bne @no_overflow
inc dest+1
@no_overflow:
inx
iny
jmp @loop
; If length is 0, copies 256 bytes
; start counter X in -1 ($FF) to make one more cycle than usual
; and skip first comparison to avoid ( $FF = $FF ) which would exit
; and not copy any byte at all
@copy_256_bytes:
ldx #$FF
stx length
jmp @copy_256_bytes_entry
@end:
rts
.endproc
; Copies memory from source in RAM to dest in VRAM up to length bytes(*)
; [ byte* source ]: pointer in RAM to data to be copied
; [ word dest ]: address in VRAM where data should be copied
; [ byte length ]: how many bytes should be copied
; (*) if length is in range [1..255] copies that many bytes
; else, if length is 0, copies 256 bytes
.importzp _param_p0, _param_w0, _param_b0
.export __copy_VRAM
source = _param_p0 ; 2 bytes
dest = _param_w0 ; 2 bytes
length = _param_b0
.proc __copy_VRAM
ldy #0 ; y used to index source data
ldx #0 ; x used as counter
cpx length
beq @copy_256_bytes ; if length is 0, copy 256 bytes
@loop:
; if counter X reached length, stop
cpx length
beq @end
; skip first compare, to copy 256 bytes
@copy_256_bytes_entry:
; dest's high byte
lda dest+1
sta $2006
; dest's low byte
lda dest
sta $2006
; copy data from source
lda (source),y
sta $2007
; 2 byte increment of dest
inc dest
bne @no_overflow
inc dest+1
@no_overflow:
inx
iny
jmp @loop
; If length is 0, copies 256 bytes
; start counter X in -1 ($FF) to make one more cycle than usual
; and skip first comparison to avoid ( $FF = $FF ) which would exit
; and not copy any byte at all
@copy_256_bytes:
ldx #$FF
stx length
jmp @copy_256_bytes_entry
@end:
rts
.endproc