# ## Meta

base:seriously_fast_multiplication

# Seriously fast multiplication (8-bit and 16-bit)

By Jackasser

Without further explanation, here is the code for really fast multiplication. The routines require 2 KB of lookup tables, which can be generated using table_generator_routine_for_fast_8_bit_mul_table. The article from C=Hacking 16 that is mentioned in the source is available here.

Here are four routines, signed/unsigned 8/16-bit multiplication with 16/32-bit result:

```
; Description: Unsigned 8-bit multiplication with unsigned 16-bit result.
;
; Input: 8-bit unsigned value in T1
;        8-bit unsigned value in T2
;        Carry=0: Re-use T1 from previous multiplication (faster)
;        Carry=1: Set T1 (slower)
;
; Output: 16-bit unsigned value in PRODUCT
;
; Clobbered: PRODUCT, X, A, C
;
; Allocation setup: T1,T2 and PRODUCT preferably on Zero-page.
;                   square1_lo, square1_hi, square2_lo, square2_hi must be
;                   page aligned. Each table is 512 bytes. Total 2 KB.
;
; Table generation: I:0..511
;                   square1_lo = <((I*I)/4)
;                   square1_hi = >((I*I)/4)
;                   square2_lo = <(((I-255)*(I-255))/4)
;                   square2_hi = >(((I-255)*(I-255))/4)
.proc multiply_8bit_unsigned
                ; PRODUCT = T1 * T2, computed as a difference of two
                ; quarter-square table entries:
                ;   square1[T1 + T2] - square2[(T1 ^ $ff) + T2]
                ; With square2[I] = ((I-255)^2)/4 the second term is
                ; ((T2-T1)^2)/4, so the difference is T1*T2.
                ;
                ; T1 is not added at run time: it is written into the low
                ; byte of the four table operands below (self-modifying
                ; code). That only works because the tables are page
                ; aligned, i.e. the low byte of each base address is 0.
                ; T2 is supplied through the X index.

                ; Carry=1 on entry: (re)patch the operands with T1.
                ; Carry=0 on entry: keep the operands from the previous call.
                bcc :+
                    lda T1
                    sta sm1+1           ; square1_lo + T1
                    sta sm3+1           ; square1_hi + T1
                    eor #$ff            ; T1 ^ $ff = 255 - T1
                    sta sm2+1           ; square2_lo + (255 - T1)
                    sta sm4+1           ; square2_hi + (255 - T1)
:

                ldx T2
                sec                     ; no borrow into the low-byte subtract
sm1:            lda square1_lo,x
sm2:            sbc square2_lo,x
                sta PRODUCT+0
sm3:            lda square1_hi,x
sm4:            sbc square2_hi,x        ; borrow from the low byte carries in
                sta PRODUCT+1

                rts
.endproc

; Description: Signed 8-bit multiplication with signed 16-bit result.
;
; Input: 8-bit signed value in T1
;        8-bit signed value in T2
;        Carry=0: Re-use T1 from previous multiplication (faster)
;        Carry=1: Set T1 (slower)
;
; Output: 16-bit signed value in PRODUCT
;
; Clobbered: PRODUCT, X, A, C
.proc multiply_8bit_signed
                ; Start from the unsigned product. JSR leaves the flags
                ; untouched, so the caller's carry still selects whether
                ; T1 is re-patched inside multiply_8bit_unsigned.
                jsr multiply_8bit_unsigned

                ; Sign correction (see C=Hacking16 for details).
                ; A negative operand was treated as (value + 256) by the
                ; unsigned multiply, which over-counts the product by
                ; 256 * (other operand). Undo that by subtracting the
                ; other operand from the high byte of PRODUCT.

                ; T1 negative -> PRODUCT.hi -= T2
                lda T1
                bpl t1_not_negative
                    lda PRODUCT+1
                    sec
                    sbc T2
                    sta PRODUCT+1
t1_not_negative:

                ; T2 negative -> PRODUCT.hi -= T1
                lda T2
                bpl t2_not_negative
                    lda PRODUCT+1
                    sec
                    sbc T1
                    sta PRODUCT+1
t2_not_negative:

                rts
.endproc

; Description: Unsigned 16-bit multiplication with unsigned 32-bit result.
;
; Input: 16-bit unsigned value in T1
;        16-bit unsigned value in T2
;        Carry=0: Re-use T1 from previous multiplication (faster)
;        Carry=1: Set T1 (slower)
;
; Output: 32-bit unsigned value in PRODUCT
;
; Clobbered: PRODUCT, X, A, C
;
; Allocation setup: T1,T2 and PRODUCT preferably on Zero-page.
;                   square1_lo, square1_hi, square2_lo, square2_hi must be
;                   page aligned. Each table is 512 bytes. Total 2 KB.
;
; Table generation: I:0..511
;                   square1_lo = <((I*I)/4)
;                   square1_hi = >((I*I)/4)
;                   square2_lo = <(((I-255)*(I-255))/4)
;                   square2_hi = >(((I-255)*(I-255))/4)
.proc multiply_16bit_unsigned
                ; The 16x16 multiply is split into four 8x8 partial products:
                ;
                ; <T1 * <T2 = AAaa
                ; <T1 * >T2 = BBbb
                ; >T1 * <T2 = CCcc
                ; >T1 * >T2 = DDdd
                ;
                ;       AAaa
                ;     BBbb
                ;     CCcc
                ; + DDdd
                ; ----------
                ;   PRODUCT!
                ;
                ; Each partial product is a quarter-square table difference,
                ;   square1[t + x] - square2[(t ^ $ff) + x]
                ; where t (a byte of T1) is patched into the low byte of the
                ; table operands (self-modifying code, hence the page-aligned
                ; tables) and x (a byte of T2) is the X index.
                ; The "a" operands use <T1, the "b" operands use >T1.
                ; Intermediate bytes are stored directly into the immediate
                ; operands of the final addition (_AA, _bb, _BB, _CC, _cc, _dd).

                ; Setup T1 if changed (Carry=1); otherwise keep the operands
                ; patched by the previous call.
                bcc :+
                    lda T1+0
                    sta sm1a+1
                    sta sm3a+1
                    sta sm5a+1
                    sta sm7a+1
                    eor #$ff            ; <T1 ^ $ff = 255 - <T1
                    sta sm2a+1
                    sta sm4a+1
                    sta sm6a+1
                    sta sm8a+1
                    lda T1+1
                    sta sm1b+1
                    sta sm3b+1
                    sta sm5b+1
                    sta sm7b+1
                    eor #$ff            ; >T1 ^ $ff = 255 - >T1
                    sta sm2b+1
                    sta sm4b+1
                    sta sm6b+1
                    sta sm8b+1
:

                ; Perform <T1 * <T2 = AAaa
                ldx T2+0
                sec
sm1a:           lda square1_lo,x
sm2a:           sbc square2_lo,x
                sta PRODUCT+0           ; aa is final: nothing else lands in byte 0
sm3a:           lda square1_hi,x
sm4a:           sbc square2_hi,x
                sta _AA+1

                ; Perform >T1 * <T2 = CCcc (X still holds <T2)
                sec
sm1b:           lda square1_lo,x
sm2b:           sbc square2_lo,x
                sta _cc+1
sm3b:           lda square1_hi,x
sm4b:           sbc square2_hi,x
                sta _CC+1

                ; Perform <T1 * >T2 = BBbb
                ldx T2+1
                sec
sm5a:           lda square1_lo,x
sm6a:           sbc square2_lo,x
                sta _bb+1
sm7a:           lda square1_hi,x
sm8a:           sbc square2_hi,x
                sta _BB+1

                ; Perform >T1 * >T2 = DDdd (X still holds >T2)
                sec
sm5b:           lda square1_lo,x
sm6b:           sbc square2_lo,x
                sta _dd+1
sm7b:           lda square1_hi,x
sm8b:           sbc square2_hi,x
                sta PRODUCT+3           ; DD; carries are added in below

                ; Add the separate multiplications together.
                ; First pass:  byte1 = AA + bb, byte2 = BB + CC + carry.
                clc
_AA:            lda #0                  ; operand patched with AA above
_bb:            adc #0                  ; operand patched with bb above
                sta PRODUCT+1
_BB:            lda #0                  ; operand patched with BB above
_CC:            adc #0                  ; operand patched with CC above
                sta PRODUCT+2
                bcc :+
                    inc PRODUCT+3       ; carry out of byte 2 goes to byte 3
                    clc                 ; second pass must start without carry
:
                ; Second pass: byte1 += cc, byte2 += dd + carry.
                ; Carry is clear here on both paths.
_cc:            lda #0                  ; operand patched with cc above
                adc PRODUCT+1
                sta PRODUCT+1
_dd:            lda #0                  ; operand patched with dd above
                adc PRODUCT+2
                sta PRODUCT+2
                bcc :+
                    inc PRODUCT+3       ; carry out of byte 2 goes to byte 3
:

                rts
.endproc

; Description: Signed 16-bit multiplication with signed 32-bit result.
;
; Input: 16-bit signed value in T1
;        16-bit signed value in T2
;        Carry=0: Re-use T1 from previous multiplication (faster)
;        Carry=1: Set T1 (slower)
;
; Output: 32-bit signed value in PRODUCT
;
; Clobbered: PRODUCT, X, A, C
.proc multiply_16bit_signed
                ; Start from the unsigned product. JSR leaves the flags
                ; untouched, so the caller's carry still selects whether
                ; T1 is re-patched inside multiply_16bit_unsigned.
                jsr multiply_16bit_unsigned

                ; Sign correction (see C=Hacking16 for details).
                ; A negative operand was treated as (value + 65536) by the
                ; unsigned multiply, which over-counts the product by
                ; 65536 * (other operand). Undo that by subtracting the
                ; other operand from the upper 16 bits of PRODUCT.

                ; T1 negative -> PRODUCT[3:2] -= T2
                lda T1+1
                bpl t1_not_negative
                    lda PRODUCT+2
                    sec
                    sbc T2+0
                    sta PRODUCT+2
                    lda PRODUCT+3
                    sbc T2+1            ; borrow from the low byte carries in
                    sta PRODUCT+3
t1_not_negative:

                ; T2 negative -> PRODUCT[3:2] -= T1
                lda T2+1
                bpl t2_not_negative
                    lda PRODUCT+2
                    sec
                    sbc T1+0
                    sta PRODUCT+2
                    lda PRODUCT+3
                    sbc T1+1            ; borrow from the low byte carries in
                    sta PRODUCT+3
t2_not_negative:

                rts
.endproc
```
base/seriously_fast_multiplication.txt · Last modified: 2015-04-17 04:33 (external edit)

### Page Tools 