orr yh, yh, #0x00100000
beq LSYM(Lml_1)
-#if __ARM_ARCH__ < 4
-
- @ Put sign bit in r6, which will be restored in yl later.
- and r6, r6, #0x80000000
-
- @ Well, no way to make it shorter without the umull instruction.
- stmfd sp!, {r6, r7, r8, r9, sl, fp} @ sp -= 24
- .cfi_remember_state @ Save the current CFI state.
- .cfi_adjust_cfa_offset 24 @ CFA is now sp + previousOffset + 24.
- .cfi_rel_offset r6, 0 @ Registers are saved from sp to sp + 20.
- .cfi_rel_offset r7, 4
- .cfi_rel_offset r8, 8
- .cfi_rel_offset r9, 12
- .cfi_rel_offset sl, 16
- .cfi_rel_offset fp, 20
-
- mov r7, xl, lsr #16
- mov r8, yl, lsr #16
- mov r9, xh, lsr #16
- mov sl, yh, lsr #16
- bic xl, xl, r7, lsl #16
- bic yl, yl, r8, lsl #16
- bic xh, xh, r9, lsl #16
- bic yh, yh, sl, lsl #16
- mul ip, xl, yl
- mul fp, xl, r8
- mov lr, #0
- adds ip, ip, fp, lsl #16
- adc lr, lr, fp, lsr #16
- mul fp, r7, yl
- adds ip, ip, fp, lsl #16
- adc lr, lr, fp, lsr #16
- mul fp, xl, sl
- mov r5, #0
- adds lr, lr, fp, lsl #16
- adc r5, r5, fp, lsr #16
- mul fp, r7, yh
- adds lr, lr, fp, lsl #16
- adc r5, r5, fp, lsr #16
- mul fp, xh, r8
- adds lr, lr, fp, lsl #16
- adc r5, r5, fp, lsr #16
- mul fp, r9, yl
- adds lr, lr, fp, lsl #16
- adc r5, r5, fp, lsr #16
- mul fp, xh, sl
- mul r6, r9, sl
- adds r5, r5, fp, lsl #16
- adc r6, r6, fp, lsr #16
- mul fp, r9, yh
- adds r5, r5, fp, lsl #16
- adc r6, r6, fp, lsr #16
- mul fp, xl, yh
- adds lr, lr, fp
- mul fp, r7, sl
- adcs r5, r5, fp
- mul fp, xh, yl
- adc r6, r6, #0
- adds lr, lr, fp
- mul fp, r9, r8
- adcs r5, r5, fp
- mul fp, r7, r8
- adc r6, r6, #0
- adds lr, lr, fp
- mul fp, xh, yh
- adcs r5, r5, fp
- adc r6, r6, #0
- ldmfd sp!, {yl, r7, r8, r9, sl, fp} @ sp += 24
- .cfi_restore_state @ Restore the previous CFI state.
-#else
-
@ Here is the actual multiplication.
+ @ This code works on architecture versions 4 and above, which
+ @ provide the umull and umlal instructions.
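+ @ With the operands split as x = xh:xl and y = yh:yl, the
+ @ mantissa product is
+ @ x*y = xl*yl + 2^32*(xh*yl + xl*yh) + 2^64*(xh*yh),
+ @ and each of the four partial products is folded in by one
+ @ umull/umlal below.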
umull ip, lr, xl, yl
mov r5, #0
umlal lr, r5, xh, yl
umlal lr, r5, xl, yh
mov r6, #0
umlal r5, r6, xh, yh
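+ @ The 128-bit product now sits in r6:r5:lr:ip, from most to
+ @ least significant word.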
-#endif
-
@ The LSBs in ip are only significant for the final rounding.
@ Fold them into lr.
teq ip, #0
orr r0, r3, r0, lsr #5
orr r1, r3, r1, lsr #5
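+ @ Each mantissa now has its implicit bit at bit 27, so the
+ @ leading bit of the product will land at bit 22 or 23 of the
+ @ high word; the MSB adjustment below relies on this.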
-#if __ARM_ARCH__ < 4
-
- @ Put sign bit in r3, which will be restored in r0 later.
- and r3, ip, #0x80000000
-
- @ Well, no way to make it shorter without the umull instruction.
- do_push {r3, r4, r5} @ sp -= 12
- .cfi_remember_state @ Save the current CFI state
- .cfi_adjust_cfa_offset 12 @ CFA is now sp + previousOffset + 12
- .cfi_rel_offset r3, 0 @ Registers are saved from sp to sp + 8
- .cfi_rel_offset r4, 4
- .cfi_rel_offset r5, 8
-
- mov r4, r0, lsr #16
- mov r5, r1, lsr #16
- bic r0, r0, r4, lsl #16
- bic r1, r1, r5, lsl #16
- mul ip, r4, r5
- mul r3, r0, r1
- mul r0, r5, r0
- mla r0, r4, r1, r0
- adds r3, r3, r0, lsl #16
- adc r1, ip, r0, lsr #16
- do_pop {r0, r4, r5} @ sp += 12
- .cfi_restore_state @ Restore the previous CFI state
-
-#else
-
@ The actual multiplication.
+ @ This code works on architecture versions 4 and above, which
+ @ provide the umull instruction.
umull r3, r1, r0, r1
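+ @ The full 64-bit product of the two mantissas is now in r1:r3,
+ @ with the high word in r1.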
@ Put final sign in r0.
and r0, ip, #0x80000000
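+ @ Bit 31 of ip was set from the operand signs before the
+ @ mantissas were unpacked, so it is the sign of the result.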
-#endif
-
@ Adjust result upon the MSB position.
cmp r1, #(1 << 23)
do_it cc, tt