/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "hardware/regs/sio.h"
#include "hardware/regs/addressmap.h"

.syntax unified
.cpu cortex-m0plus
.thumb

#include "pico/asm_helper.S"

// when non-zero, call __aeabi_idiv0 (32-bit) / __aeabi_ldiv0 (64-bit) on divide by zero
#ifndef PICO_DIVIDER_CALL_IDIV0
#define PICO_DIVIDER_CALL_IDIV0 1
#endif

#ifndef PICO_DIVIDER_CALL_LDIV0
#define PICO_DIVIDER_CALL_LDIV0 1
#endif

@ place a divider function in RAM or flash according to PICO_DIVIDER_IN_RAM
.macro div_section name
#if PICO_DIVIDER_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

@ shift counts that move the CSR READY/DIRTY bit into the carry flag via lsrs
#if SIO_DIV_CSR_READY_LSB == 0
.equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
#else
need to change SHIFT above
#endif
#if SIO_DIV_CSR_DIRTY_LSB == 1
.equ SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY, 2
#else
need to change SHIFT above
#endif

@ wait 8-n cycles for the hardware divider
.macro wait_div n
.rept (8-\n) / 2
    b 9f
9:
.endr
.if (8-\n) % 2
    nop
.endif
.endm


#if (SIO_DIV_SDIVISOR_OFFSET != SIO_DIV_SDIVIDEND_OFFSET + 4) || (SIO_DIV_QUOTIENT_OFFSET != SIO_DIV_SDIVISOR_OFFSET + 4) || (SIO_DIV_REMAINDER_OFFSET != SIO_DIV_QUOTIENT_OFFSET + 4)
#error register layout has changed - we rely on this order to make sure we save/restore in the right order
#endif

# SIO_BASE ptr in r2
.macro save_div_state_and_lr
    # wait for results as we can't save signed-ness of operation
    # NOTE(fix): CSR must be re-read on every iteration (as save_div_state_and_lr_64
    # does); re-shifting a single stale read would never see READY become set and
    # would spin forever if the divider were still mid-calculation
1:
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
    bcc 1b
    push {r4, r5, r6, r7, lr}
    // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
    ldr r4, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    ldr r5, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    ldr r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
.endm

.macro restore_div_state_and_return
    // writing sdividend (r4), sdivisor (r5), quotient (r6), remainder (r7) in that order
    //
    // it is worth considering what happens if we are interrupted
    //
    // after writing r4: we are DIRTY and !READY
    // ... interruptor using div will complete based on incorrect inputs, but dividend at least will be
    // saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5: we are DIRTY and !READY
    // ... interruptor using div will complete based on possibly wrongly signed inputs, but dividend, divisor
    // at least will be saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5, r6: we are DIRTY and READY
    // ... interruptor using div will use the dividend, divisor, quotient registers as is (what we just restored ourselves),
    // and we'll restore the remainder after the fact

    // note we do not use STM, not because it could be restarted due to interrupt (which is harmless), but because this is
    // 1 cycle IO space and so 4 separate writes are cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    pop {r4, r5, r6, r7, pc}
.endm

.macro save_div_state_and_lr_64
    push {r4, r5, r6, r7, lr}
    ldr r6, =SIO_BASE
1:
    ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
    # wait for results as we can't save signed-ness of operation
    lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
    bcc 1b
    // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
    // (reading via the unsigned aliases; the raw register values are what we need to restore)
    ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
    ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
    ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
.endm

.macro restore_div_state_and_return_64
    // writing udividend (r4), udivisor (r5), quotient (r6), remainder (r7) in that order;
    // the same interruption reasoning as in restore_div_state_and_return applies:
    // any interruptor that uses the divider saves/restores whatever we have written so far,
    // and we finish restoring the remaining registers ourselves afterwards

    mov ip, r2
    ldr r2, =SIO_BASE
    // note we do not use STM, not because it could be restarted due to interrupt (which is harmless), but because this is
    // 1 cycle IO space and so 4 separate writes are cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    mov r2, ip
    pop {r4, r5, r6, r7, pc}
.endm


// since idiv and idivmod only differ by a cycle, we'll make them the same!
div_section WRAPPER_FUNC_NAME(__aeabi_idiv)
.align 2
@ int32 / int32 division and divmod.
@ In:  r0 = dividend (y), r1 = divisor (x)
@ Out: r0 = quotient, r1 = remainder (AEABI idivmod return layout; plain idiv
@      callers simply ignore r1)
wrapper_func __aeabi_idiv
wrapper_func __aeabi_idivmod
regular_func div_s32s32
regular_func divmod_s32s32
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_s32s32_savestate
regular_func divmod_s32s32_unsafe
    str r0, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f
    wait_div 2
    // return 64 bit value so we can efficiently return both (note read order is important since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    bx lr
1:
    @ divide by zero: build the saturated quotient to pass to __aeabi_idiv0:
    @ 0 if y==0, INT_MAX (0x7fffffff) if y>0, INT_MIN (0x80000000) if y<0
    push {r2, lr}
    movs r1, #0x80
    lsls r1, #24            @ r1 = 0x80000000
    asrs r2, r0, #31        @ r2 = 0 (y>=0) or 0xffffffff (y<0)
    eors r1, r2             @ r1 = 0x80000000 (y>=0) or 0x7fffffff (y<0)
    cmp r0, #0
    beq 1f                  @ y==0: pass 0 through unchanged
    mvns r0, r1             @ y!=0: r0 = ~r1 = INT_MAX (y>0) or INT_MIN (y<0)
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore saved r2 as it holds the SIO ptr
    pop {r2, pc}
.align 2
@ divider was DIRTY (in use by interrupted code): save its full state, do the
@ division, then restore state before returning
regular_func divmod_s32s32_savestate
    save_div_state_and_lr
    bl divmod_s32s32_unsafe
    restore_div_state_and_return
div_section WRAPPER_FUNC_NAME(__aeabi_uidiv)
@ uint32 / uint32 division and divmod.
@ In:  r0 = dividend (y), r1 = divisor (x)
@ Out: r0 = quotient, r1 = remainder
regular_func div_u32u32
regular_func divmod_u32u32
wrapper_func __aeabi_uidiv
wrapper_func __aeabi_uidivmod
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_u32u32_savestate
regular_func divmod_u32u32_unsafe
    str r0, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f
    wait_div 2
    // return 64 bit value so we can efficiently return both (note read order is important since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    bx lr
1:
    @ divide by zero: pass 0 (y==0) or UINT_MAX (y!=0) to __aeabi_idiv0
    push {r2, lr}
    cmp r0, #0
    beq 1f
    movs r0, #0
    mvns r0, r0             @ r0 = 0xffffffff
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore saved r2 as it holds the SIO ptr
    pop {r2, pc}
.align 2
@ divider was DIRTY: save state, divide, restore state
regular_func divmod_u32u32_savestate
    save_div_state_and_lr
    bl divmod_u32u32_unsafe
    restore_div_state_and_return

div_section WRAPPER_FUNC_NAME(__aeabi_ldiv)

.align 2
@ int64 / int64 divmod.
@ In:  r0:r1 = dividend (y), r2:r3 = divisor (x)
@ Out: r0:r1 = quotient, r2:r3 = remainder
wrapper_func __aeabi_ldivmod
regular_func div_s64s64
regular_func divmod_s64s64
    @ r2 is an argument here, so stash it in ip while we peek at CSR.DIRTY
    mov ip, r2
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
    bcs divmod_s64s64_savestate
    b divmod_s64s64_unsafe
.align 2
divmod_s64s64_savestate:
    save_div_state_and_lr_64
    bl divmod_s64s64_unsafe
    restore_div_state_and_return_64

.align 2
@ uint64 / uint64 divmod.
@ In:  r0:r1 = dividend (y), r2:r3 = divisor (x)
@ Out: r0:r1 = quotient, r2:r3 = remainder
wrapper_func __aeabi_uldivmod
regular_func div_u64u64
regular_func divmod_u64u64
    mov ip, r2
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
    bcs divmod_u64u64_savestate
    b divmod_u64u64_unsafe
.align 2
regular_func divmod_u64u64_savestate
    save_div_state_and_lr_64
    bl divmod_u64u64_unsafe
    restore_div_state_and_return_64

@ negate the 64-bit value lo:hi in place (two's complement)
.macro dneg lo,hi
    mvns \hi,\hi
    rsbs \lo,#0
    bne l\@_1
    adds \hi,#1
l\@_1:
.endm

.align 2
@ signed 64-bit divmod: reduce to the unsigned case by sign-adjusting the
@ operands, then fix up the signs of quotient and remainder afterwards
regular_func divmod_s64s64_unsafe
    cmp r3,#0
    blt 1f
@ here x +ve
    beq 2f @ could x be zero?
3:
    cmp r1,#0
    bge divmod_u64u64_unsafe @ both positive
@ y -ve, x +ve
    push {r14}
    dneg r0,r1
    bl divmod_u64u64_unsafe
    dneg r0,r1
    dneg r2,r3
    pop {r15}

2:
    cmp r2,#0
    bne 3b @ back if x not zero

    @ divide by zero: choose the saturated value to pass to __aeabi_ldiv0:
    @ 0 (y==0), -2^63 (y<0) or 2^63-1 (y>0)
    cmp r0,#0 @ y==0?
    bne 4f
    cmp r1,#0
    beq 5f @ then pass 0 to __aeabi_ldiv0
4:
    movs r0,#0
    lsrs r1,#31
    lsls r1,#31 @ get sign bit
    bne 5f @ y -ve? pass -2^63 to __aeabi_ldiv0
    mvns r0,r0
    lsrs r1,r0,#1 @ y +ve: pass 2^63-1 to __aeabi_ldiv0
5:
    push {r14}
#if PICO_DIVIDER_CALL_LDIV0
    bl __aeabi_ldiv0
#endif
    movs r2,#0 @ and return 0 for the remainder
    movs r3,#0
    pop {r15}

1:
@ here x -ve
    push {r14}
    cmp r1,#0
    blt 1f
@ y +ve, x -ve
    dneg r2,r3
    bl divmod_u64u64_unsafe
    dneg r0,r1
    pop {r15}

1:
@ y -ve, x -ve
    dneg r0,r1
    dneg r2,r3
    bl divmod_u64u64_unsafe
    dneg r2,r3
    pop {r15}

@ unsigned 64-bit divmod, split into cases by operand size; narrow cases use
@ the 32-bit hardware divider directly, wide cases use a reciprocal estimate
@ refined with the hardware divider
regular_func divmod_u64u64_unsafe
    cmp r1,#0
    bne y64 @ y fits in 32 bits?
    cmp r3,#0 @ yes; and x?
    bne 1f
    cmp r2,#0
    beq 2f @ x==0?
    @ 32-bit / 32-bit: single hardware division
    mov r12,r7
    ldr r7,=#SIO_BASE
    str r0,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    movs r1,#0
    movs r3,#0
    wait_div 2
    ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET]
    ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]
    mov r7,r12
    bx r14

2: @ divide by 0 with y<2^32
    cmp r0,#0 @ y==0?
    beq 3f @ then pass 0 to __aeabi_ldiv0
udiv0:
    ldr r0,=#0xffffffff
    movs r1,r0 @ pass 2^64-1 to __aeabi_ldiv0
3:
    push {r14}
#if PICO_DIVIDER_CALL_LDIV0
    bl __aeabi_ldiv0
#endif
    movs r2,#0 @ and return 0 for the remainder
    movs r3,#0
    pop {r15}

1:
    movs r2,r0 @ x>y, so result is 0 remainder y
    movs r3,r1
    movs r0,#0
    movs r1,#0
    bx r14

.ltorg

@ here y occupies more than 32 bits
@ split into cases according to the size of x
y64:
    cmp r3,#0
    beq 1f
    b y64_x48 @ if x does not fit in 32 bits, go to 48- and 64-bit cases
1:
    lsrs r3,r2,#16
    bne y64_x32 @ jump if x is 17..32 bits

@ here x is at most 16 bits: do three chained 32/16 hardware divisions,
@ feeding each remainder into the next 16-bit chunk of the dividend

    cmp r2,#0
    beq udiv0 @ x==0? exit as with y!=0 case above
    push {r7}
    ldr r7,=#SIO_BASE
    str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    wait_div 4
    push {r4, r5}
    lsrs r4,r0,#16
    ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r0=y0-q0*x; 0<=r0<x
    ldr r1,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q0=y0/x;
    lsls r3,#16
    orrs r3,r4
    str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y1=(r0<<16)+(((ui32)y)>>16);
    wait_div 1
    uxth r4,r0
    ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r1=y1-q1*x; 0<=r1<x
    ldr r5,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q1=y1/x;
    lsls r3,#16
    orrs r3,r4
    str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y2=(r1<<16)+(y&0xffff);
    wait_div 3
    movs r3,#0
    lsls r4,r5,#16 @ quotient=(q0<<32)+(q1<<16)+q2
    lsrs r5,#16
    ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r2=y2-q2*x; 0<=r2<x
    ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q2=y2/x;
    adds r0,r4
    adcs r1,r5
    pop {r4,r5,r7}
    bx r14

.ltorg

y64_x32:
@ here x is 17..32 bits: normalise x to 2^31..2^32 and use a 16-bit
@ reciprocal estimate from the hardware divider, refining the quotient
@ in chunks and correcting by repeated subtraction at the end
    push {r4-r7,r14}
    mov r12,r2 @ save x
    movs r5,#0 @ xsh=0
    lsrs r4,r2,#24
    bne 1f
    lsls r2,#8 @ if(x0<1U<<24) x0<<=8,xsh =8;
    adds r5,#8
1:
    lsrs r4,r2,#28
    bne 1f
    lsls r2,#4 @ if(x0<1U<<28) x0<<=4,xsh+=4;
    adds r5,#4
1:
    lsrs r4,r2,#30
    bne 1f
    lsls r2,#2 @ if(x0<1U<<30) x0<<=2,xsh+=2;
    adds r5,#2
1:
    lsrs r4,r2,#31
    bne 1f
    lsls r2,#1 @ if(x0<1U<<31) x0<<=1,xsh+=1;
    adds r5,#1
1:
@ now 2^31<=x0<2^32, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+33 33<=qb<49
    lsrs r4,r2,#15
    adds r4,#1 @ x1=(x0>>15)+1; 2^16<x1<=2^17

    ldr r7,=#SIO_BASE
    str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    ldr r4,=#0xffffffff
    str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    lsrs r6,r1,#16
    uxth r3,r2 @ x0l
    wait_div 2
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ r=0xffffffffU/x1; 2^15<=r<2^16 r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2 x0
@ r4 r
@ r5 xsh
@ r12 x

    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>48)*r)>>16;
    lsls r7,r6,#13
    mov r14,r7 @ quh=q0<<13

    muls r3,r6 @ x0l*q
    lsrs r7,r3,#15
    lsls r3,#17 @ r3:r7 is (x0l*q)<<17
    subs r0,r3
    sbcs r1,r7 @ y-=(x0l*q)<<17

    lsrs r3,r2,#16 @ x0h
    muls r3,r6 @ q*x0h
    adds r3,r3
    subs r1,r3 @ y-=(x0h*q)<<17

    lsrs r6,r1,#3
    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>35)*r)>>16;
    add r14,r6 @ quh+=q1

    uxth r3,r2 @ x0l
    muls r3,r6 @ x0l*q
    lsrs r7,r3,#28
    lsls r3,#4 @ r3:r7 is (x0l*q)<<4
    subs r0,r3
    sbcs r1,r7 @ y-=(x0l*q)<<4

    lsrs r3,r2,#16 @ x0h
    muls r3,r6 @ x0h*q
    lsrs r7,r3,#12
    lsls r3,#20 @ r3:r7 is (x0h*q)<<4
    subs r0,r3
    sbcs r1,r7 @ y-=(x0h*q)<<4

    lsrs r6,r0,#22
    lsls r7,r1,#10
    orrs r6,r7 @ y>>22
    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>22)*r)>>16;

    cmp r5,#9
    blt last0 @ if(xsh<9) goto last0;

@ on this path xsh>=9, which means x<2^23
    lsrs r2,#9 @ x0>>9: this shift loses no bits
@ the remainder y-x0*q is guaranteed less than a very small multiple of the remaining quotient
@ bits (at most 6 bits) times x, and so fits in one word
    muls r2,r6 @ x0*q
    subs r0,r2 @ y-x0*q
    lsls r7,r6,#13 @ qul=q<<13
1:
    lsrs r6,r0,#9
    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>9)*r)>>16;

@ here
@ r0 y
@ r2 x0>>9
@ r5 xsh
@ r6 q
@ r7 qul
@ r12 x
@ r14 quh

    movs r3,#22
    subs r3,r5 @ 22-xsh
    lsrs r6,r3 @ q>>=22-xsh
    lsrs r7,r3 @ qul>>=22-xsh
    adds r7,r6 @ qul+=q
    mov r4,r12
    muls r6,r4 @ x*q
    subs r2,r0,r6 @ y-=x*q
    mov r0,r14 @ quh
    adds r5,#4 @ xsh+4
    adds r3,#6 @ 28-xsh
    movs r1,r0
    lsrs r1,r3
    lsls r0,r5 @ r0:r1 is quh<<(4+xsh)
    adds r0,r7
    bcc 1f
2:
    adds r1,#1
1: @ qu=((ui64)quh<<(4+xsh))+qul
    cmp r2,r4
    bhs 3f
    movs r3,#0
    pop {r4-r7,r15}

.ltorg

3:
    subs r2,r4
    adds r0,#1
    bcc 1b
    b 2b @ while(y>=x) y-=x,qu++;

@ here:
@ r0:r1 y
@ r2 x0
@ r4 r
@ r5 xsh; xsh<9
@ r6 q

last0:
    movs r7,#9
    subs r7,r5 @ 9-xsh
    lsrs r6,r7
    mov r4,r12 @ x
    uxth r2,r4
    muls r2,r6 @ q*xlo
    subs r0,r2
    bcs 1f
    subs r1,#1 @ y-=q*xlo
1:
    lsrs r2,r4,#16 @ xhi
    muls r2,r6 @ q*xhi
    lsrs r3,r2,#16
    lsls r2,#16
    subs r2,r0,r2
    sbcs r1,r3 @ y-q*xhi
    movs r3,r1 @ y now in r2:r3
    mov r0,r14 @ quh
    adds r5,#4 @ xsh+4
    adds r7,#19 @ 28-xsh
    movs r1,r0
    lsrs r1,r7
    lsls r0,r5 @ r0:r1 is quh<<(4+xsh)
    adds r0,r6
    bcc 1f
    adds r1,#1 @ quh<<(xsh+4))+q
1:
    cmp r3,#0 @ y>=2^32?
    bne 3f
    cmp r2,r4 @ y>=x?
    bhs 4f
    pop {r4-r7,r15}

3:
    adds r0,#1 @ qu++
    bcc 2f
    adds r1,#1
2:
    subs r2,r4 @ y-=x
    bcs 3b
    subs r3,#1
    bne 3b

1:
    cmp r2,r4
    bhs 4f
    pop {r4-r7,r15}

4:
    adds r0,#1 @ qu++
    bcc 2f
    adds r1,#1
2:
    subs r2,r4 @ y-=x
    b 1b

y64_x48:
@ here x is 33..64 bits
    push {r4-r7,r14} @ save work registers
    lsrs r4,r3,#16
    beq 1f
    b y64_x64 @ jump if x is 49..64 bits
1:
    push {r2-r3} @ save a copy of x
@ here x is 33..48 bits: normalise x to 2^47..2^48 and proceed as in the
@ 17..32-bit case, but with a 64-bit normalised divisor
    movs r5,#0 @ xsh=0
    lsrs r4,r3,#8
    bne 1f
    lsls r3,#8
    lsrs r6,r2,#24
    orrs r3,r6
    lsls r2,#8 @ if(x0<1U<<40) x0<<=8,xsh =8;
    adds r5,#8
1:
    lsrs r4,r3,#12
    bne 1f
    lsls r3,#4
    lsrs r6,r2,#28
    orrs r3,r6
    lsls r2,#4 @ if(x0<1U<<44) x0<<=4,xsh+=4;
    adds r5,#4
1:
    lsrs r4,r3,#14
    bne 1f
    lsls r3,#2
    lsrs r6,r2,#30
    orrs r3,r6
    lsls r2,#2 @ if(x0<1U<<46) x0<<=2,xsh+=2;
    adds r5,#2
1:
    lsrs r4,r3,#15
    bne 1f
    adds r2,r2
    adcs r3,r3 @ if(x0<1U<<47) x0<<=1,xsh+=1;
    adds r5,#1
1:
@ now 2^47<=x0<2^48, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+17 17<=qb<33
    movs r4,r3
    adds r7,r2,r2
    adcs r4,r4
    adds r4,#1 @ x1=(ui32)(x0>>31)+1; // 2^16<x1<=2^17

    ldr r7,=#SIO_BASE
    str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    ldr r4,=#0xffffffff
    str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    lsrs r6,r1,#16
    wait_div 1
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ r=0xffffffffU/x1; 2^15<=r<2^16 r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2:r3 x0
@ r4 r
@ r5 xsh 0<=xsh<16

    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>48)*r)>>16;
    lsls r7,r6,#13
    mov r14,r7 @ save q<<13
    uxth r7,r2 @ x0l
    muls r7,r6
    @ subtract (x0l*q) twice, i.e. (x0l*q)<<1, with borrow propagation
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    uxth r7,r3 @ x0h
    muls r7,r6
    @ subtract (x0h*q)<<32 twice, i.e. (x0h*q)<<33
    subs r1,r7
    subs r1,r7
    lsrs r7,r2,#16 @ x0m
    muls r7,r6
    lsls r6,r7,#17
    lsrs r7,#15
    subs r0,r6
    sbcs r1,r7 @ y-=((ui64)q*x0)<<1;

    lsrs r6,r1,#3 @ y>>35
    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>35)*r)>>16;

    cmp r5,#12
    blt last1 @ if(xsh<12) goto last1;

    add r14,r6 @ qu<<13+q
    lsrs r2,#12
    lsls r7,r3,#20
    orrs r2,r7
    lsrs r3,#12 @ x0>>12

    uxth r7,r2 @ x0l
    muls r7,r6
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    uxth r7,r3 @ x0h
    muls r7,r6
    subs r1,r7
    lsrs r7,r2,#16 @ x0m
    muls r7,r6
    lsls r6,r7,#16
    lsrs r7,#16
    subs r0,r6
    sbcs r1,r7 @ y-=((ui64)q*x0)>>12

    lsrs r6,r0,#22
    lsls r7,r1,#10
    orrs r6,r7 @ y>>22
    muls r6,r4
    movs r7,#41
    subs r7,r5
    lsrs r6,r7 @ q=((ui32)(y>>22)*r)>>(16+25-xsh)

    subs r5,#12
    mov r7,r14
    lsls r7,r5
2:
    adds r7,r6 @ qu=(qu<<(xsh-12))+q
    pop {r4,r5} @ recall x

@ here
@ r0:r1 y
@ r4:r5 x
@ r6 q
@ r7 qu

    uxth r2,r4
    uxth r3,r5
    muls r2,r6 @ xlo*q
    muls r3,r6 @ xhi*q
    subs r0,r2
    sbcs r1,r3
    lsrs r2,r4,#16
    muls r2,r6
    lsrs r3,r2,#16
    lsls r2,#16 @ xm*q
    subs r0,r2
    sbcs r1,r3 @ y-=(ui64)q*x

1:
    movs r2,r0
    movs r3,r1
    adds r7,#1
    subs r0,r4
    sbcs r1,r5 @ while(y>=x) y-=x,qu++;
    bhs 1b
    subs r0,r7,#1 @ correction to qu
    movs r1,#0
    pop {r4-r7,r15}

last1:
@ r0:r1 y
@ r2:r3 x0
@ r5 xsh
@ r6 q

    movs r7,#12
    subs r7,r5
    lsrs r6,r7 @ q>>=12-xsh
    mov r7,r14
    lsrs r7,#13
    lsls r7,r5
    adds r7,r7 @ qu<<(xsh+1)
    b 2b

y64_x64:
@ here x is 49..64 bits: quotient fits in 32 bits; estimate it with one
@ hardware division by (x>>32)+1 (never an overestimate), then correct
    movs r4,#0 @ q=0 if x>>32==0xffffffff
    adds r5,r3,#1
    beq 1f

    ldr r7,=#SIO_BASE
    str r5,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    wait_div 0
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q=(ui32)(y>>32)/((x>>32)+1)
1:
    uxth r5,r2
    uxth r6,r3
    muls r5,r4
    muls r6,r4
    subs r0,r5
    sbcs r1,r6
    lsrs r5,r2,#16
    lsrs r6,r3,#16
    muls r5,r4
    muls r6,r4
    lsls r6,#16
    lsrs r7,r5,#16
    orrs r6,r7
    lsls r5,#16
    subs r0,r5
    sbcs r1,r6 @ y-=(ui64)q*x

    cmp r1,r3 @ while(y>=x) y-=x,q++
    bhs 1f
3:
    movs r2,r0
    movs r3,r1
    movs r0,r4
    movs r1,#0
    pop {r4-r7,r15}

1:
    bne 2f
    cmp r0,r2
    blo 3b
2:
    subs r0,r2
    sbcs r1,r3
    adds r4,#1
    cmp r1,r3
    blo 3b
    b 1b

div_section divmod_s64s64_rem
@ int64 divmod returning quotient in r0:r1 and storing the 64-bit remainder
@ through a pointer; [sp, #8] after the push is the first stacked argument —
@ presumably the remainder pointer passed per AAPCS (TODO confirm against callers)
regular_func divmod_s64s64_rem
    push {r4, lr}
    bl divmod_s64s64
    ldr r4, [sp, #8]
    stmia r4!, {r2,r3}
    pop {r4, pc}

div_section divmod_u64u64_rem
@ uint64 variant of the above: quotient in r0:r1, remainder stored via the
@ stacked pointer argument
regular_func divmod_u64u64_rem
    push {r4, lr}
    bl divmod_u64u64
    ldr r4, [sp, #8]
    stmia r4!, {r2,r3}
    pop {r4, pc}