After searching for a fast 16x16 --> 32-bit unsigned multiply routine, I adapted the 8x8 unsigned multiply referenced in the code comments below to create the following. It is easily modified for other operand widths, does not depend on register locations or ordering, and runs fast (a 16x16 multiply takes about 168 instruction cycles):
;**********************************************************************************************
; PIC16 fast unsigned multiply routines
; Jim Bixby
; bix@san.rr.com
;**********************************************************************************************
; Assembler setup: target device and its standard register definitions
        list    r=dec,x=on,t=off,p=16F877
        include p16F877.INC

; Shorthand for the STATUS flag bits used by the code below
#define _C      STATUS,0                ; bit 0 of STATUS
#define _Z      STATUS,2                ; bit 2 of STATUS

; Working registers, allocated consecutively starting at 0x20.
; Naming convention: suffix '0' is the most significant byte.
        cblock  0x20
        TEMP0                           ; TEMP0:3 - scratch used by MUL16x16U
        TEMP1
        TEMP2
        TEMP3
        AARG0                           ; AARG0:1 - first operand in,
        AARG1                           ; AARG0:3 - 32-bit product out
        AARG2
        AARG3
        BARG0                           ; BARG0:1 - second operand
        BARG1
        endc
; Reset vector: execution starts here and jumps straight to the test loop
        org     0x0000
        nop
        goto    MAIN

; Short test routine: loads AARG0:1 = 0x4530 and BARG0:1 = 0xC251, calls the
; multiply, then repeats forever.  Expected product in AARG0:3 is 0x34844430.
MAIN
        movlw   0xC2                    ; second operand, high byte
        movwf   BARG0
        movlw   0x51                    ; second operand, low byte
        movwf   BARG1
        movlw   0x45                    ; first operand, high byte
        movwf   AARG0
        movlw   0x30                    ; first operand, low byte
        movwf   AARG1
        call    MUL16x16U               ; AARG0:3 <-- AARG0:1 * BARG0:1
        goto    MAIN                    ; run the test again
;*****************************************************************************************
; Unsigned multiply routine
; Multiplies of any size can easily be made from the macros - for this code, only MUL16X16
; is made
;
; The approach here is linear code for speed at the expense of RAM and registers
; Adapted from <unknown>: http://www.piclist.com/techref/microchip/math/mul/8x8u.htm
;*****************************************************************************************
; Add16AB - 16-bit add:  BH:BL <-- BH:BL + AH:AL
;   AH:AL is left unchanged; W is overwritten.
;   On exit the carry flag is the carry out of the full 16-bit sum, so the
;   caller can propagate it into a higher byte.
Add16AB MACRO   AH,AL,BH,BL
        movf    AL,w                    ; low bytes first
        addwf   BL,f                    ; C = carry out of the low byte
        movf    AH,w                    ; movf leaves C untouched
        btfsc   _C                      ; carry from the low byte?
        incfsz  AH,w                    ;   yes: W = AH+1; if that wraps to 0 the
                                        ;   add below is skipped and C (still set
                                        ;   from the low add) is the carry out
        addwf   BH,f                    ; high byte add, C = final carry
        ENDM
;*****************************************************************************************
; mult - one add-and-shift-right step, expanded once per multiplier bit by mulmac
;   BITNO       = number of the multiplier bit to test
;   MPLIER      = register holding the multiplier
;   RESHI:RESLO = 16-bit running result
;   W           = the other operand (not modified here)
mult    MACRO   BITNO,MPLIER,RESHI,RESLO
        btfsc   MPLIER,BITNO            ; multiplier bit set?
        addwf   RESHI,f                 ;   yes: add W to the high byte, C = carry out
        rrf     RESHI,f                 ; shift C:RESHI:RESLO right one place;
        rrf     RESLO,f                 ; C ends up holding the bit shifted out of RESLO
        ENDM                            ; End of macro
;*****************************************************************************************
; mulmac - unrolled 8x8 --> 16 unsigned multiply:  RESHI:RESLO <-- MPLIER * MCAND
;   Expands the mult macro once for each of the eight multiplier bits.
;   MPLIER and MCAND are only read; W is left holding MCAND.
;   RESHI and RESLO are cleared before the operands are read, so neither
;   operand may share a register with RESHI or RESLO.
mulmac  MACRO   MPLIER,MCAND,RESHI,RESLO
        bcf     _C                      ; start with carry clear for the first step
        clrf    RESLO                   ; clrf/movf do not alter C
        clrf    RESHI
        movf    MCAND,W                 ; W = multiplicand for all eight steps
        mult    0,MPLIER,RESHI,RESLO
        mult    1,MPLIER,RESHI,RESLO
        mult    2,MPLIER,RESHI,RESLO
        mult    3,MPLIER,RESHI,RESLO
        mult    4,MPLIER,RESHI,RESLO
        mult    5,MPLIER,RESHI,RESLO
        mult    6,MPLIER,RESHI,RESLO
        mult    7,MPLIER,RESHI,RESLO
        ENDM
;
NOEXPAND
;*****************************************************************************************
; MUL16x16U
; AARG0:1 * BARG0:1 --> AARG0:3
; Invokes the three macros above
; 164 prog words, 168 inst cycles, 10 registers (6 for call/return arguments + 4)
; Uses TEMP0:3 also. BARG0:1 unchanged on return
;*****************************************************************************************
MUL16x16U
; Unsigned 16x16 --> 32 multiply:  AARG0:3 <-- AARG0:1 * BARG0:1
;   In:   AARG0:1 and BARG0:1 ('0' is the most significant byte)
;   Out:  AARG0:3 = 32-bit product, W = 0 (retlw 0)
;   Uses: TEMP0:3 as scratch; BARG0:1 are unchanged on return
;
; With A = A0:A1 and B = B0:B1 the product is
;   A0*B0 * 65536  +  (A1*B0 + A0*B1) * 256  +  A1*B1
; The 32 bit result is first calculated into T0:T1:A2:A3.
; When done, T0:1 is moved to A0:1 so the final result is in A0:3.
; Register names can be changed at will, and any register can be located
; anywhere - order is not important.
        mulmac  AARG0, BARG0, TEMP0, TEMP1      ;T0:1 <-- A0*B0 (high partial product)
        mulmac  AARG1, BARG1, AARG2, AARG3      ;A2:3 <-- A1*B1 (low partial product)
        mulmac  AARG1, BARG0, TEMP2, TEMP3      ;T2:3 <-- A1*B0 (first cross term)
        Add16AB TEMP2, TEMP3, TEMP1, AARG2      ; Add T2:3 to T1:A2, carry into T0
        btfsc   _C                              ; nb: this relies on the _C bit being
        incf    TEMP0,f                         ; correct after the Add16AB macro
        mulmac  AARG0, BARG1, TEMP2, TEMP3      ;T2:3 <-- A0*B1 (second cross term)
        Add16AB TEMP2, TEMP3, TEMP1, AARG2      ; Add T2:3 to T1:A2, carry into T0
        btfsc   _C
        incf    TEMP0,f
        movfw   TEMP1                           ;Move T0:1 to A0:1
        movwf   AARG1                           ;to finish; A0:1 are free now that
        movfw   TEMP0                           ;all four partial products are done
        movwf   AARG0
        retlw   0
EXPAND
END
Comments: