Codebase64 wiki

This is an old revision of the document!

The research continues in 2023 and the fastest multiply got faster! Thanks to a 6502 simulator written in C and the analyzing of statistics in the program branches and boundary crossings, the exact speed of routines are now known. This analysis has inspired new optimizations! The new routine executes in a blazing 187.1 cycles on average (a 6% speedup or 11.5 cycles faster from my original code which was 198.6), with inputs in zero page and outputs in zero page plus two registers, not including caller setup or RTS.
Code in ACME assembly format. Please adjust addresses to be usable for your situation.

; from 6502.org, by Repose: http://forum.6502.org/viewtopic.php?p=106519#p106519 ; ; 16 bit x 16 bit unsigned multiply, 32 bit result ; Average cycles: 193.07 ; 2170 bytes

; How to use: ; call jsr init, before first use ; put numbers in (x0,x1) and (y0,y1) and result is (z3, A, Y, z0)

; pointers to square tables p_sqr_lo = $8b ; 2 bytes p_sqr_hi = $8d ; 2 bytes p_neg_sqr_lo = $8f ; 2 bytes p_neg_sqr_hi = $91 ; 2 bytes

; the inputs and outputs x0 = $02 ; multiplier, 2 bytes x1 = $03 y0 = $04 ; multiplicand, 2 bytes y1 = $05 z0 = $06 ; product, 2 bytes + 2 registers ; z1 = $07 returned in Y reg ; z2 = $08 returned in A reg z3 = $09

* = $C000

; Align tables to start of page sqrlo

  !for i, 0, 511 {
      !byte <((i*i)/4)
  }

sqrhi

  !for i, 0, 511 {
      !byte >((i*i)/4)
  }

negsqrlo

  !for i, 0, 511 {
      !byte <(((255-i)*(255-i))/4)
  }

negsqrhi

  !for i, 0, 511 {
      !byte >(((255-i)*(255-i))/4)
  }

; Diagram of the additions ; y1 y0 ; x x1 x0 ; ——– ; x0y0h x0y0l ; + x0y1h x0y1l ; + x1y0h x1y0l ; +x1y1h x1y1l ; ———————— ; z3 z2 z1 z0

umult16

  ; set multiplier as x1
  lda x1
  sta p_sqr_lo
  sta p_sqr_hi
  eor #$ff
  sta p_neg_sqr_lo
  sta p_neg_sqr_hi

  ; set multiplicand as y0
  ldy y0
  sec
  lda (p_sqr_lo),y
  sbc (p_neg_sqr_lo),y
  sta x1y0l+1
  lda (p_sqr_hi), y
  sbc (p_neg_sqr_hi),y
  sta x1y0h+1

  ; set multiplicand as y1
  ldy y1

  ; x1y1l =  low(x1*y1)
  ; z3    = high(x1*y1)
  lda (p_sqr_lo),y
  sbc (p_neg_sqr_lo),y
  sta x1y1l+1
  lda (p_sqr_hi),y
  sbc (p_neg_sqr_hi),y
  sta z3

  ; set multiplier as x0
  lda x0
  sta p_sqr_lo
  sta p_sqr_hi
  eor #$ff
  sta p_neg_sqr_lo
  sta p_neg_sqr_hi

  ; x0y1l =  low(x0*y1)
  ; X     = high(x0*y1)
  lda (p_sqr_lo),y
  sbc (p_neg_sqr_lo),y
  sta x0y1l+1
  lda (p_sqr_hi),y
  sbc (p_neg_sqr_hi ),y
  tax

  ; set multiplicand as y0
  ldy y0

  ; z0    =  low(x0*y0)
  ; A     = high(x0*y0)
  lda (p_sqr_lo),y
  sbc (p_neg_sqr_lo),y
  sta z0
  lda (p_sqr_hi),y
  sbc (p_neg_sqr_hi),y

clc

do_adds

  ; add the first two numbers of column 1

x0y1l

  adc #0      ; x0y0h + x0y1l
  tay

  ; continue to first two numbers of column 2
  txa

x1y0h

  adc #0      ; x0y1h + x1y0h
  tax         ; X=z2 so far
  bcc +
  inc z3      ; column 3
  clc

  ; add last number of column 1

+

tya

x1y0l

  adc #0      ; + x1y0l
  tay         ; Y=z1

  ; add last number of column 2
  txa

x1y1l

  adc #0      ; + x1y1l
  bcc fin     ; A=z2
  inc z3      ; column 3

fin

rts

; Once only initialization ; this could set up the pointer values in a loop to save memory ; it could also generate the square tables in code rather than load them init

  lda #>sqrlo
  sta p_sqr_lo+1

  lda #>sqrhi
  sta p_sqr_hi+1

  lda #>negsqrlo
  sta p_neg_sqr_lo+1

  lda #>negsqrhi
  sta p_neg_sqr_hi+1
  rts