1.file "sqrt.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39//******************************************************************** 40// History 41//******************************************************************** 42// 02/02/00 Initial version 43// 04/04/00 Unwind support added 44// 08/15/00 Bundle added after call to __libm_error_support to properly 45// set [the previously overwritten] GR_Parameter_RESULT. 46// 02/10/03 Reordered header: .section, .global, .proc, .align 47// 48//******************************************************************** 49// 50// Function: Combined sqrt(x), where 51// _ 52// sqrt(x) = |x, for double precision x values 53// 54//******************************************************************** 55// 56// Accuracy: Correctly Rounded 57// 58//******************************************************************** 59// 60// Resources Used: 61// 62// Floating-Point Registers: f8 (Input and Return Value) 63// f7 -f14 64// 65// General Purpose Registers: 66// r32-r36 (Locals) 67// r37-r40 (Used to pass arguments to error handling routine) 68// 69// Predicate Registers: p6, p7, p8 70// 71//********************************************************************* 72// 73// IEEE Special Conditions: 74// 75// All faults and exceptions should be raised correctly. 76// sqrt(QNaN) = QNaN 77// sqrt(SNaN) = QNaN 78// sqrt(+/-0) = +/-0 79// sqrt(negative) = QNaN and error handling is called 80// 81//********************************************************************* 82// 83// Implementation: 84// 85// Modified Newton-Raphson Algorithm 86// 87//********************************************************************* 88 89GR_SAVE_PFS = r33 90GR_SAVE_B0 = r34 91GR_SAVE_GP = r35 92 93GR_Parameter_X = r37 94GR_Parameter_Y = r38 95GR_Parameter_RESULT = r39 96 97 98.section .text 99GLOBAL_IEEE754_ENTRY(sqrt) 100{ .mfi 101 alloc r32= ar.pfs,0,5,4,0 102 frsqrta.s0 f7,p6=f8 103 nop.i 0 104} { .mlx 105 // BEGIN DOUBLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM 106 nop.m 0 107 // exponent of +1/2 in r2 108 movl r2 = 0x0fffe;; 109} { .mmi 110 // +1/2 in f9 111 setf.exp f9 = r2 112 nop.m 0 113 nop.i 0 114} { .mlx 115 nop.m 0 116 // 3/2 in r3 117 movl r3=0x3fc00000;; 118} { .mfi 119 setf.s f10=r3 120 // Step (1) 121 // y0 = 1/sqrt(a) in f7 122 fclass.m.unc p7,p8 = f8,0x3A 123 nop.i 0;; 124} { .mlx 125 nop.m 0 126 // 5/2 in r2 127 movl r2 = 0x40200000 128} { .mlx 129 nop.m 0 130 // 63/8 in r3 131 movl r3 = 0x40fc0000;; 132} { .mfi 133 setf.s f11=r2 134 // Step (2) 135 // h = +1/2 * y0 in f6 136 (p6) fma.s1 f6=f9,f7,f0 137 nop.i 0 138} { .mfi 139 setf.s f12=r3 140 // Step (3) 141 // g = a * y0 in f7 142 (p6) fma.s1 f7=f8,f7,f0 143 nop.i 0 144} { .mfi 145 nop.m 0 146 mov f15 = f8 147 nop.i 0;; 148} { .mlx 149 nop.m 0 150 // 231/16 in r2 151 movl r2 = 0x41670000;; 152} { .mfi 153 setf.s f13=r2 154 // Step (4) 155 // e = 1/2 - g * h in f9 156 (p6) fnma.s1 f9=f7,f6,f9 157 nop.i 0 158} { .mlx 159 nop.m 0 160 // 35/8 in r3 161 movl r3 = 0x408c0000;; 162} { .mfi 163 setf.s f14=r3 164 // Step (5) 165 // S = 3/2 + 5/2 * e in f10 166 (p6) fma.s1 f10=f11,f9,f10 167 nop.i 0 168} { .mfi 169 nop.m 0 170 // Step (6) 171 // e2 = e * e in f11 172 (p6) fma.s1 f11=f9,f9,f0 173 nop.i 0;; 174} { .mfi 175 nop.m 0 176 // Step (7) 177 // t = 63/8 + 231/16 * e in f12 178 (p6) fma.s1 f12=f13,f9,f12 179 nop.i 0;; 180} { .mfi 181 nop.m 0 182 // Step (8) 183 // S1 = e + e2 * S in f10 184 (p6) fma.s1 f10=f11,f10,f9 185 nop.i 0 186} { .mfi 187 nop.m 0 188 // Step (9) 189 // e4 = e2 * e2 in f11 190 (p6) fma.s1 f11=f11,f11,f0 191 nop.i 0;; 192} { .mfi 193 nop.m 0 194 // Step (10) 195 // t1 = 35/8 + e * t in f9 196 (p6) fma.s1 f9=f9,f12,f14 197 nop.i 0;; 198} { .mfi 199 nop.m 0 200 // Step (11) 201 // G = g + S1 * g in f12 202 (p6) fma.s1 f12=f10,f7,f7 203 nop.i 0 204} { .mfi 205 nop.m 0 206 // Step (12) 207 // E = g * e4 in f7 208 (p6) fma.s1 f7=f7,f11,f0 209 nop.i 0;; 210} { .mfi 211 nop.m 0 212 // Step (13) 213 // u = S1 + e4 * t1 in f10 214 (p6) fma.s1 f10=f11,f9,f10 215 nop.i 0;; 216} { .mfi 217 nop.m 0 218 // Step (14) 219 // g1 = G + t1 * E in f7 220 (p6) fma.d.s1 f7=f9,f7,f12 221 nop.i 0;; 222} { .mfi 223 nop.m 0 224 // Step (15) 225 // h1 = h + u * h in f6 226 (p6) fma.s1 f6=f10,f6,f6 227 nop.i 0;; 228} { .mfi 229 nop.m 0 230 // Step (16) 231 // d = a - g1 * g1 in f9 232 (p6) fnma.s1 f9=f7,f7,f8 233 nop.i 0;; 234} { .mfb 235 nop.m 0 236 // Step (17) 237 // g2 = g1 + d * h1 in f7 238 (p6) fma.d.s0 f8=f9,f6,f7 239 (p6) br.ret.sptk b0 ;; 240} 241 242{ .mfb 243 nop.m 0 244 mov f8 = f7 245 (p8) br.ret.sptk b0 ;; 246} 247{ .mfb 248 (p7) mov r40 = 49 249 nop.f 0 250 (p7) br.cond.sptk __libm_error_region ;; 251} 252// END DOUBLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM 253GLOBAL_IEEE754_END(sqrt) 254libm_alias_double_other (__sqrt, sqrt) 255libm_alias_double_narrow (__sqrt, sqrt) 256 257// Stack operations when calling error support. 258// (1) (2) (3) (call) (4) 259// sp -> + psp -> + psp -> + sp -> + 260// | | | | 261// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 262// | | | | 263// | <-GR_Y Y2->| Y2 ->| <- GR_Y | 264// | | | | 265// | | <- GR_X X1 ->| | 266// | | | | 267// sp-64 -> + sp -> + sp -> + + 268// save ar.pfs save b0 restore gp 269// save gp restore ar.pfs 270 271 272LOCAL_LIBM_ENTRY(__libm_error_region) 273 274// 275// This branch includes all those special values that are not negative, 276// with the result equal to frcpa(x) 277// 278 279.prologue 280// We are distinguishing between over(under)flow and letting 281// __libm_error_support set ERANGE or do anything else needed. 282 283// (1) 284{ .mfi 285 add GR_Parameter_Y=-32,sp // Parameter 2 value 286 nop.f 0 287.save ar.pfs,GR_SAVE_PFS 288 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 289} 290{ .mfi 291.fframe 64 292 add sp=-64,sp // Create new stack 293 nop.f 0 294 mov GR_SAVE_GP=gp // Save gp 295};; 296 297 298// (2) 299{ .mmi 300 stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack 301 add GR_Parameter_X = 16,sp // Parameter 1 address 302.save b0, GR_SAVE_B0 303 mov GR_SAVE_B0=b0 // Save b0 304};; 305 306.body 307// (3) 308{ .mib 309 stfd [GR_Parameter_X] = f15 // STORE Parameter 1 on stack 310 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address 311 nop.b 0 312} 313{ .mib 314 stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack 315 add GR_Parameter_Y = -16,GR_Parameter_Y 316 br.call.sptk b0=__libm_error_support# // Call error handling function 317};; 318{ .mmi 319 nop.m 0 320 nop.m 0 321 add GR_Parameter_RESULT = 48,sp 322};; 323 324// (4) 325{ .mmi 326 ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack 327.restore sp 328 add sp = 64,sp // Restore stack pointer 329 mov b0 = GR_SAVE_B0 // Restore return address 330};; 331{ .mib 332 mov gp = GR_SAVE_GP // Restore gp 333 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 334 br.ret.sptk b0 // Return 335};; 336 337LOCAL_LIBM_END(__libm_error_region) 338 339 340 341 342.type __libm_error_support#,@function 343.global __libm_error_support# 344