Differences

This shows you the differences between two versions of the page.

--- base:fastest_multiplication_2023 [2023-12-11 12:22] – repose
+++ base:fastest_multiplication_2023 [2024-02-13 08:11] (current) – missing comments in first multiplication repose
@@ Line 1: / Line 1: @@
--If you see this message, the Wiki is being updated. Please check back later! Thanks-
 The research continues in 2023 and the fastest multiply got faster!
-Thanks to a 6502 simulator written in C and the analyzing of statistics in the program branches and boundary crossings, the exact speed of routines are now known. This analysis has inspired new optimizations! The new routine executes in a blazing 188.1 cycles on average (a 6% speedup or 10.5 cycles faster from my original code which was 198.6), with inputs in zero page and outputs in zero page plus one register, not including caller setup or RTS. \\
+Thanks to a 6502 simulator written in C and the analyzing of statistics in the program branches and boundary crossings, the exact speed of routines are now known. This analysis has inspired new optimizations! The new routine executes in a blazing 187.1 cycles on average (a 6% speedup or 11.5 cycles faster from my original code which was 198.6), with inputs in zero page and outputs in zero page plus two registers, not including caller setup or RTS. 14 bytes of zp, 2044 in tables, 112 bytes of code. The claim to "fastest" is based on tests of over 100 published routines here: https://github.com/tobyLobster/multiply_test/?tab=readme-ov-file#the-results\\
-Code to come, but for now see the end of this post:
+Code in ACME assembly format. Please adjust addresses to be usable for your situation.
-https://csdb.dk/forums/?roomid=11&topicid=91766
+<code>
+; World's fastest unsigned 16x16=32 bit multiply, 2023 version by Repose
+;
+; 16 bit x 16 bit unsigned multiply, 32 bit result
+; Average cycles: 193.07 (including RTS)
+; 2170 bytes
+; How to use:
+; call jsr init, before first use
+; put numbers in (x0,x1) and (y0,y1) and result is (z3, A, Y, z0)
+; pointers to square tables
+p_sqr_lo    = $8b   ; 2 bytes
+p_sqr_hi    = $8d   ; 2 bytes
+p_neg_sqr_lo = $8f  ; 2 bytes
+p_neg_sqr_hi = $91  ; 2 bytes
+; the inputs and outputs
+x0  = $02    ; multiplier, 2 bytes
+x1  = $03
+y0  = $04    ; multiplicand, 2 bytes
+y1  = $05
+z0  = $06    ; product, 2 bytes + 2 registers
+; z1  = $07 returned in Y reg
+; z2  = $08 returned in A reg
+z3  = $09
+* = $C000
+; Align tables to start of page
+; Note - the last byte of each table is never referenced, as a+b<=510
+sqrlo
+    !for i, 0, 511 {
+        !byte <((i*i)/4)
+    }
+sqrhi
+    !for i, 0, 511 {
+        !byte >((i*i)/4)
+    }
+negsqrlo
+    !for i, 0, 511 {
+        !byte <(((255-i)*(255-i))/4)
+    }
+negsqrhi
+    !for i, 0, 511 {
+        !byte >(((255-i)*(255-i))/4)
+    }
+; Diagram of the additions
+;                 y1    y0
+;              x  x1    x0
+;                 --------
+;              x0y0h x0y0l
+; +      x0y1h x0y1l
+; +      x1y0h x1y0l
+; +x1y1h x1y1l
+; ------------------------
+;     z3    z2    z1    z0
+umult16
+    ; set multiplier as x1
+    lda x1
+    sta p_sqr_lo
+    sta p_sqr_hi
+    eor #$ff
+    sta p_neg_sqr_lo
+    sta p_neg_sqr_hi
+    ; set multiplicand as y0
+    ldy y0
+    ;x1y0l =  low(x1*y0)
+    ;x1y0h = high(x1*y0)
+    sec
+    lda (p_sqr_lo),y
+    sbc (p_neg_sqr_lo),y
+    sta x1y0l+1
+    lda (p_sqr_hi), y
+    sbc (p_neg_sqr_hi),y
+    sta x1y0h+1
+    ; set multiplicand as y1
+    ldy y1
+    ; x1y1l =  low(x1*y1)
+    ; z3    = high(x1*y1)
+    lda (p_sqr_lo),y
+    sbc (p_neg_sqr_lo),y
+    sta x1y1l+1
+    lda (p_sqr_hi),y
+    sbc (p_neg_sqr_hi),y
+    sta z3
+    ; set multiplier as x0
+    lda x0
+    sta p_sqr_lo
+    sta p_sqr_hi
+    eor #$ff
+    sta p_neg_sqr_lo
+    sta p_neg_sqr_hi
+    ; x0y1l =  low(x0*y1)
+    ; X     = high(x0*y1)
+    lda (p_sqr_lo),y
+    sbc (p_neg_sqr_lo),y
+    sta x0y1l+1
+    lda (p_sqr_hi),y
+    sbc (p_neg_sqr_hi ),y
+    tax
+    ; set multiplicand as y0
+    ldy y0
+    ; z0    =  low(x0*y0)
+    ; A     = high(x0*y0)
+    lda (p_sqr_lo),y
+    sbc (p_neg_sqr_lo),y
+    sta z0
+    lda (p_sqr_hi),y
+    sbc (p_neg_sqr_hi),y
+    clc
+do_adds
+    ; add the first two numbers of column 1
+x0y1l
+    adc #0      ; x0y0h + x0y1l
+    tay
+    ; continue to first two numbers of column 2
+    txa
+x1y0h
+    adc #0      ; x0y1h + x1y0h
+    tax         ; X=z2 so far
+    bcc +
+    inc z3      ; column 3
+    clc
+    ; add last number of column 1
++
+    tya
+x1y0l
+    adc #0      ; + x1y0l
+    tay         ; Y=z1
+    ; add last number of column 2
+    txa
+x1y1l
+    adc #0      ; + x1y1l
+    bcc fin     ; A=z2
+    inc z3      ; column 3
+fin
+    rts
+; Once only initialization
+; this could set up the pointer values in a loop to save memory
+; it could also generate the square tables in code rather than load them
+init
+    lda #>sqrlo
+    sta p_sqr_lo+1
+    lda #>sqrhi
+    sta p_sqr_hi+1
+    lda #>negsqrlo
+    sta p_neg_sqr_lo+1
+    lda #>negsqrhi
+    sta p_neg_sqr_hi+1
+    rts
+</code>