PIC Microcontoller Math Method

24 Bit Floating Point Square Root

Here is a debugged version. It is slightly smaller and faster. Thanks to James Hillman [james at ind-interface.co.uk] for testing!
Final release! :)
;**********************************************************************
;By Nikolai Golovchenko
;24 BIT FLOATING POINT SQARE ROOT
;
;Input:
; AARGB0 - high byte with sign bit
; AARGB1 - low byte
; AEXP - exponent 2^(AEXP - 127)
;Output:
; AARGB0-1, AEXP - 24 bit floating point
;Temporary:
; BARGB0-1, BEXP - temporary for result
; TEMPB0-1
; LOOPCOUNT - counter
;
;ROM: 85 instructions
;RAM: 9 bytes
;
;Timing (includes call and return)
;6 cycles best case
;2+3+18+8*26-1+8*21-1+7+2= 406 cycles worst case
;**********************************************************************
;NOTES: 1)Square root is taken on absolute value of the number
;       (sign bit is ignored)
;       2)Rounding not implemented yet  
;**********************************************************************
FPSQRT24        
;Normalize input
;1)Check for zero input - if zero then exit
;2)Change sign to 1 and use sign bit as explicit MSB
;3)Divide BEXP = BEXP / 2
;4)If AEXP can be divided by 2 (AEXP<0>=1) then 
;BARGB1 = 1, AARGB<0-1> << 1 and find 15 more bits 
;5)Else find 16 more bits

        movf AEXP, w    ;if zero input then return
        btfsc 0x03, 2
         retlw 0x00

        clrf BARGB0     ;set up all used
        clrf BARGB1     ;temporary registers
        clrf TEMPB0     ;
        clrf TEMPB1     ;
        
        bsf AARGB0, 7   ;make MSB explicit and ignore mantissa sign 

;sqrt(2^E*M)=2^(E/2)*sqrt(M)
;we will align mantissa point above MSb. This is
;equivalent to division by 2. Or,
;sqrt(2^E*M)=sqrt(M/2)*2^((E+1)/2)
;new exponent is (E+1)/2
;new mantissa is M/2 (not changed, just new point position)

;divide (bexp+1) by 2. bexp is (eb + 127), where eb=-126..128!
;eb/2 is rounded to minus infinity!
        movwf BEXP      ;copy aexp to bexp
        decf BEXP, f    ;ensure required round mode
        bcf 0x03, 0     ;divide by 2
        rrf BEXP, f     ;
        movlw 0x40      ;correct bias after division
        addwf BEXP, f   ;

;if (E+1)/2 result has a remainder, then multiply mantissa by 2
;(left shift)
        btfss AEXP, 0
         goto FPSQRT24a
        rlf AARGB1, f   ;(carry was zero)
        rlf AARGB0, f
        bsf BARGB1, 0   ;set first bit of current result
                        ;and discard MSb of mantissa (we
                        ;used it already by setting first bit)
FPSQRT24a

;First find 8 bits of result. This will shift AARGB0 - AARGB1 to TEMPB1
;Then only zeros will be fed instead of AARGB0
        movlw 0x08                      ;loop counter
        movwf LOOPCOUNT ;
FPSQRT24b
        movlw 0x40      ;substract test bit from 
        subwf AARGB0, f ;current lowest byte.
                        ;it works also exactly
                        ;like 0xC0 addition for the
                        ;addition branch.
        movf BARGB1, w  ;load accumulator with
                        ;current result LSB

        btfsc TEMPB0, 7
         goto FPSQRT24b_add

        btfss 0x03, 0           
         incfsz BARGB1, w
          subwf TEMPB1, f
        movlw 0x01
        btfss 0x03, 0
         subwf TEMPB0, f
        goto FPSQRT24b_next

FPSQRT24b_add
        btfsc 0x03, 0   
         incfsz BARGB1, w
          addwf TEMPB1, f
        movlw 0x01
        btfsc 0x03, 0
         addwf TEMPB0, f

FPSQRT24b_next
        rlf BARGB1, f   ;shift result into result bytes
        rlf BARGB0, f
        
        rlf AARGB1, f   ;Shift out next two bits of input
        rlf AARGB0, f   ;
        rlf TEMPB1, f   ;
        rlf TEMPB0, f   ;
        rlf AARGB1, f   ;
        rlf AARGB0, f   ;
        rlf TEMPB1, f   ;
        rlf TEMPB0, f   ;

        decfsz LOOPCOUNT, f     ;repeat untill 8 bits will be found
         goto FPSQRT24b

;Find other 7 or 8 bits. Only zeros are fed instead of AARGB0
;Repeat untill MSb of result gets set
FPSQRT24d
        btfsc BARGB1, 0         ;if Temp sign is positive, than jump to subtraction
         goto FPSQRT24d_sub     ;(previous result bit is inverted sign bit,
                                ;Tempb0.7 can not be used instead, because
                                ;it may overflow)

        ;after LSBs addition (0x00 + 0xC0 = 0xC0) C=0,
        ;so we just continue adding higher bytes,
        ;keeping in mind that LSB=0xC0 and C=0
        movf BARGB1, w
        addwf TEMPB1, f
        movf BARGB0, w
        btfsc 0x03, 0
         incfsz BARGB0, w
          addwf TEMPB0, f
        goto FPSQRT24d_next

FPSQRT24d_sub

        bcf 0x03, 0     ;simulate borrow (0x00 - 0x40 = 0xC0, C=0)
        incfsz BARGB1, w
         subwf TEMPB1, f
        movf BARGB0, w
        btfss 0x03, 0
         incfsz BARGB0, w
          subwf TEMPB0, f

FPSQRT24d_next
        rlf BARGB1, f   ;shift result into result bytes
        rlf BARGB0, f
        
        bsf 0x03, 0     ;Shift out next two bits of input
        rlf TEMPB1, f   ;(set carry before each shift to
        rlf TEMPB0, f   ;simulate 0xC0 value)
        bsf 0x03, 0     ;
        rlf TEMPB1, f   ;
        rlf TEMPB0, f   ;
        
        btfss BARGB0, 7 ;repeat untill all 16 bits will be found
         goto FPSQRT24d

;flag C, TEMPB1 - TEMPB0 contain current input that may be used to find 17th bit for rounding

;Copy BARG to AARG
        movf BEXP, w
        movwf AEXP
        movf BARGB0, w
        movwf AARGB0
        movf BARGB1, w
        movwf AARGB1
        bcf AARGB0, 7   ;clear sign bit (overwrites explicit MSB, which is always one)

        retlw 0x00

;**********************************************************************
;Last updated 21Nov00
Comments:
Please note that the above code will not work for PIC18 family chips. The following changes have to be made:

1) Change all "rlf" into "rlcf" and all "rrf" into "rrcf" ;
2) Change all "0x03" into "STATUS" (provided you are using the header file)