The research continues in 2023 and the fastest multiply got faster!
Thanks to a 6502 simulator written in C and the analyzing of statistics in the program branches and boundary crossings, the exact speed of routines are now known. This analysis has inspired new optimizations! The new routine executes in a blazing 187.1 cycles on average (a 6% speedup or 11.5 cycles faster from my original code which was 198.6), with inputs in zero page and outputs in zero page plus two registers, not including caller setup or RTS. 14 bytes of zp, 2044 in tables, 112 bytes of code. The claim to "fastest" is based on tests of over 100 published routines here: https://github.com/tobyLobster/multiply_test/?tab=readme-ov-file#the-results\\
Code in ACME assembly format. Please adjust addresses to be usable for your situation.
<code>
; World's fastest unsigned 16x16=32 bit multiply, 2023 version by Repose
;
; 16 bit x 16 bit unsigned multiply, 32 bit result
; Average cycles: 193.07 (including RTS)
; 2170 bytes


; How to use:
; call jsr init, before first use
; put numbers in (x0,x1) and (y0,y1) and result is (z3, A, Y, z0)

; pointers to square tables
p_sqr_lo    = $8b   ; 2 bytes
p_sqr_hi    = $8d   ; 2 bytes
p_neg_sqr_lo = $8f  ; 2 bytes
p_neg_sqr_hi = $91  ; 2 bytes

; the inputs and outputs
x0  = $02    ; multiplier, 2 bytes
x1  = $03
y0  = $04    ; multiplicand, 2 bytes
y1  = $05
z0  = $06    ; product, 2 bytes + 2 registers
; z1  = $07 returned in Y reg
; z2  = $08 returned in A reg
z3  = $09

* = $C000

; Align tables to start of page
; Note - the last byte of each table is never referenced, as a+b<=510
sqrlo
    !for i, 0, 511 {
        !byte <((i*i)/4)
    }
sqrhi
    !for i, 0, 511 {
        !byte >((i*i)/4)
    }

negsqrlo
    !for i, 0, 511 {
        !byte <(((255-i)*(255-i))/4)
    }

negsqrhi
    !for i, 0, 511 {
        !byte >(((255-i)*(255-i))/4)
    }

; Diagram of the additions
;                 y1    y0
;              x  x1    x0
;                 --------
;              x0y0h x0y0l
; +      x0y1h x0y1l
; +      x1y0h x1y0l
; +x1y1h x1y1l
; ------------------------
;     z3    z2    z1    z0

umult16
    ; set multiplier as x1
    lda x1
    sta p_sqr_lo
    sta p_sqr_hi
    eor #$ff
    sta p_neg_sqr_lo
    sta p_neg_sqr_hi

    ; set multiplicand as y0
    ldy y0
    
    ;x1y0l =  low(x1*y0)
    ;x1y0h = high(x1*y0)
    sec
    lda (p_sqr_lo),y
    sbc (p_neg_sqr_lo),y
    sta x1y0l+1
    lda (p_sqr_hi), y
    sbc (p_neg_sqr_hi),y
    sta x1y0h+1

    ; set multiplicand as y1
    ldy y1

    ; x1y1l =  low(x1*y1)
    ; z3    = high(x1*y1)
    lda (p_sqr_lo),y
    sbc (p_neg_sqr_lo),y
    sta x1y1l+1
    lda (p_sqr_hi),y
    sbc (p_neg_sqr_hi),y
    sta z3

    ; set multiplier as x0
    lda x0
    sta p_sqr_lo
    sta p_sqr_hi
    eor #$ff
    sta p_neg_sqr_lo
    sta p_neg_sqr_hi

    ; x0y1l =  low(x0*y1)
    ; X     = high(x0*y1)
    lda (p_sqr_lo),y
    sbc (p_neg_sqr_lo),y
    sta x0y1l+1
    lda (p_sqr_hi),y
    sbc (p_neg_sqr_hi ),y
    tax

    ; set multiplicand as y0
    ldy y0

    ; z0    =  low(x0*y0)
    ; A     = high(x0*y0)
    lda (p_sqr_lo),y
    sbc (p_neg_sqr_lo),y
    sta z0
    lda (p_sqr_hi),y
    sbc (p_neg_sqr_hi),y

    clc
do_adds
    ; add the first two numbers of column 1
x0y1l
    adc #0      ; x0y0h + x0y1l
    tay

    ; continue to first two numbers of column 2
    txa
x1y0h
    adc #0      ; x0y1h + x1y0h
    tax         ; X=z2 so far
    bcc +
    inc z3      ; column 3
    clc

    ; add last number of column 1
+
    tya
x1y0l
    adc #0      ; + x1y0l
    tay         ; Y=z1

    ; add last number of column 2
    txa
x1y1l
    adc #0      ; + x1y1l
    bcc fin     ; A=z2
    inc z3      ; column 3
fin
    rts


; Once only initialization
; this could set up the pointer values in a loop to save memory
; it could also generate the square tables in code rather than load them
init
    lda #>sqrlo
    sta p_sqr_lo+1

    lda #>sqrhi
    sta p_sqr_hi+1

    lda #>negsqrlo
    sta p_neg_sqr_lo+1

    lda #>negsqrhi
    sta p_neg_sqr_hi+1
    rts
</code>