ps2maths.cpp

00001 #include "ps2maths.h"
00002 
00003 // Morten "Sparky" Mikkelsen's fast maths routines (From a post on the forums
00004 // at www.playstation2-linux.com)
00005 
00006 float Abs(const float x) { float r; asm(" abs.s %0, %1 " : "=&f" (r) : "f" (x) ); return r; } 
00007 float Sqrt(const float x) { float r; asm(" sqrt.s %0, %1 " : "=&f" (r) : "f" (x) ); return r; } 
00008 float Max(const float a, const float b) { float r; asm(" max.s %0, %1, %2 " : "=&f" (r) : "f" (a), "f" (b) ); return r; } 
00009 float Min(const float a, const float b) { float r; asm(" min.s %0, %1, %2 " : "=&f" (r) : "f" (a), "f" (b) ); return r; } 
00010 float ACos(float x) { return PIHALF - ASin(x); }
00011 float Sin(float v) { return Cos(v-PIHALF); }
00012 
00013 float DegToRad(float Deg)
00014 {
00015         return (Deg / 180.0f) * PI;
00016 }
00017 
00018 float RadToDeg(float Rad)
00019 {
00020         return (Rad / PI) * 180.0f;
00021 }
00022 
00023 float Mod(const float a, const float b) 
00024 { 
00025 float r; 
00026 
00027 asm __volatile__ (" 
00028 .set push 
00029 .set noreorder 
00030 div.s %0, %1, %2 
00031 mtc1 $0, $f8 
00032 mfc1 $10, %0 
00033 srl $8, $10, 23 
00034 addiu $9, $0, 127+23 
00035 andi $8, $8, 0xff 
00036 addiu $12, $0, 1 
00037 addiu $11, $8, -127 
00038 subu $8, $9, $8 
00039 pmaxw $8, $8, $0 
00040 sllv $8, $12, $8 
00041 lui $9, 0x8000 
00042 negu $8, $8 
00043 srl $11, $11, 31 
00044 movz $9, $8, $11 
00045 and $10, $9, $10 
00046 mtc1 $10, %0 # trunc(fdiv) 
00047 adda.s %1, $f8 
00048 msub.s %0, %2, %0 
00049 
00050 .set pop " 
00051 : "=&f" (r) : "f" (a), "f" (b) : "$8", "$9", "$10", "$11", "$12", "$f8"); 
00052 
00053 /* 
00054 float fDiv = a / b 
00055 int32 uDiv = MFC1(fDiv) 
00056 int32 exp = (uDiv>>23)&0xff 
00057 int32 exp_cen = exp-127 
00058 int32 bits = max(127+23 - exp, 0) 
00059 uint32 and_val = 0x80000000 
00060 MOVZ(and_val, -(1<<bits), exp_cen>>31) // ~((1<<bits)-1) 
00061 fTruncDiv = MTC1(uDiv&and_val) 
00062 
00063 return a - fTruncDiv*b; // a = i*b + r 
00064 */ 
00065 
00066 return r; 
00067 }
00068 
00069 
00070 float ASin(float x) 
00071 { 
00072 float r; 
00073 
00074 asm __volatile__ ( 
00075 " 
00076 # s0-s4, $f3-$f7 
00077 # offs, $f6 (tmp) 
00078 # x^2, $f7 (tmp) 
00079 
00080 
00081 .set noreorder 
00082 .align 3 
00083 lui $8, 0x3f80 
00084 mtc1 $0, $f8 
00085 mtc1 $8, $f1 
00086 lui $8, 0x3f35 
00087 mfc1 $9, %1 
00088 ori $8, $8, 0x04f3 
00089 adda.s $f1, $f8 
00090 lui $10, 0x8000 
00091 msub.s $f2, %1, %1 
00092 not $11, $10 
00093 and $11, $9, $11 # $11, uAbsX 
00094 subu $8, $8, $11 
00095 nop # don't remove this nop 
00096 and $9, $9, $10 # $9, uSign 
00097 sqrt.s $f2, $f2 
00098 srl $8, $8, 31 # $8, bFlip = ((BOUND_HEX - uAbsX) >> 31) 
00099 abs.s %0, %1 
00100 lui $11, 0x3fc9 
00101 ori $11, 0x0fdb 
00102 or $11, $9, $11 
00103 movz $11, $0, $8 # offs_hex 
00104 xor $10, $9, $10 
00105 mtc1 $11, $f6 
00106 movz $10, $9, $8 # uSign 
00107 min.s %0, $f2, %0 
00108 lui $9, 0x3e2a 
00109 lui $8, 0x3f80 
00110 ori $9, $9, 0xaaab 
00111 or $8, $10, $8 
00112 or $9, $10, $9 
00113 mtc1 $8, $f3 # upload S0 
00114 lui $8, 0x3d99 
00115 mul.s $f7, %0, %0 # x^2 
00116 ori $8, $8, 0x999a 
00117 mtc1 $9, $f4 # upload S1 
00118 lui $9, 0x3d36 
00119 or $8, $10, $8 
00120 ori $9, $9, 0xdb6e 
00121 mtc1 $8, $f5 # upload S2 
00122 or $9, $10, $9 
00123 mul.s $f1, %0, $f7 # x^3 
00124 lui $8, 0x3cf8 
00125 adda.s $f6, $f8 # acc = offs 
00126 ori $8, $8, 0xe38e 
00127 mul.s $f8, $f7, $f7 # x^4 
00128 or $8, $10, $8 
00129 madda.s $f3, %0 
00130 mul.s $f2, $f1, $f7 # x^5 
00131 madda.s $f4, $f1 
00132 mtc1 $9, $f6 # upload S3 to $f6 
00133 mul.s $f1, $f1, $f8 # x^7 
00134 mul.s %0, $f2, $f8 # x^9 
00135 mtc1 $8, $f7 # upload S4 to $f7 
00136 madda.s $f5, $f2 
00137 madda.s $f6, $f1 
00138 madd.s %0, $f7, %0 
00139 
00140 .set reorder " 
00141 
00142 : "=&f" (r) : "f" (x) : "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "$f8" ); 
00143 
00144 /* 
00145 // BOUND_HEX, 0x3f3504f3 (IEEE) 
00146 // PIHALF_HEX, 0x3fc90fdb (IEEE) 
00147 float r; 
00148 
00149 const float Xc = Sqrt(1-x*x); 
00150 const float Xs = Abs(x); 
00151 
00152 float offs; 
00153 const uint32 uX = *((uint32 *) &x); 
00154 uint32 uSign = uX&0x80000000; 
00155 const uint32 uAbsX = uX&0x7fffffff; 
00156 *((uint32 *) &offs) = 0; 
00157 
00158 // if Abs(x) > bound then manipulate 
00159 // expression to do an acos taylor instead. 
00160 // this is done for better precission 
00161 bool bFlip = (bool) ((BOUND_HEX - uAbsX) >> 31); // bound = 1/sqrt(2) 
00162 MovNZero((uint32 *) &offs, PIHALF_HEX | uSign, bFlip); 
00163 MovNZero(&uSign, uSign^0x80000000, bFlip); 
00164 
00165 x = Min(Xc, Xs); 
00166 
00167 const float x2 = x*x; 
00168 const float x3 = x*x2; 
00169 const float x5 = x3*x2; 
00170 const float x7 = x5*x2; 
00171 const float x9 = x7*x2; 
00172 
00173 const float s0 = MTC1(0x3f800000 | uSign); // 1 
00174 const float s1 = MTC1(0x3e2aaaab | uSign); // 1/6 
00175 const float s2 = MTC1(0x3d99999a | uSign); // 3/40 
00176 const float s3 = MTC1(0x3d36db6e | uSign); // 5/112 
00177 const float s4 = MTC1(0x3cf8e38e | uSign); // 35/1152 
00178 
00179 r = offs+s0*x+s1*x3+s2*x5+s3*x7+s4*x9; 
00180 
00181 
00182 return r; 
00183 
00184 */ 
00185 
00186 return r; 
00187 } 
00188 
00189 float Cos(float v) 
00190 { 
00191 float r; 
00192 
00193 asm __volatile__ ( 
00194 " 
00195 lui $9, 0x3f00 
00196 
00197 .set noreorder 
00198 .align 3 
00199 abs.s %0, %1 
00200 lui $8, 0xbe22 
00201 
00202 mtc1 $9, $f1 # 0.5f 
00203 ori $8, $8, 0xf983 
00204 
00205 mtc1 $8, $f8 # -1/TWOPI 
00206 lui $9, 0x4b00 
00207 
00208 mtc1 $9, $f3 # bias 
00209 lui $8, 0x3f80 
00210 
00211 mtc1 $8, $f2 # 1.0f 
00212 mula.s %0, $f8 
00213 msuba.s $f3, $f2 
00214 madda.s $f3, $f2 
00215 lui $8, 0x40c9 
00216 
00217 
00218 
00219 msuba.s %0, $f8 
00220 ori $8, 0x0fdb 
00221 
00222 msub.s %0, $f1, $f2 
00223 lui $9, 0xc225 
00224 
00225 abs.s %0, %0 
00226 lui $10, 0x3e80 
00227 
00228 mtc1 $10, $f7 
00229 ori $9, 0x5de1 
00230 
00231 sub.s %0, %0, $f7 
00232 lui $10, 0x42a3 
00233 
00234 mtc1 $8, $f3 
00235 ori $10, 0x3458 
00236 
00237 mtc1 $9, $f4 
00238 lui $8, 0xc299 
00239 
00240 mtc1 $10, $f5 
00241 ori $8, 0x2663 
00242 
00243 mul.s $f8, %0, %0 
00244 lui $9, 0x421e 
00245 
00246 mtc1 $8, $f6 
00247 ori $9, 0xd7bb 
00248 
00249 mtc1 $9, $f7 
00250 nop 
00251 mul.s $f1, %0, $f8 # v^3 
00252 mul.s $f9, $f8, $f8 
00253 mula.s $f3, %0 
00254 mul.s $f2, $f1, $f8 # v^5 
00255 madda.s $f4, $f1 
00256 mul.s $f1, $f1, $f9 # v^7 
00257 mul.s %0, $f2, $f9 # v^9 
00258 madda.s $f5, $f2 
00259 madda.s $f6, $f1 
00260 madd.s %0, $f7, %0 
00261 .set reorder " 
00262 
00263 
00264 : "=&f" (r) : "f" (v) : "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "$f8", "$f9", "$8", "$9", "$10" ); 
00265 
00266 /* 
00267 v = Abs(v); 
00268 
00269 // using accumulator for speed 
00270 float acc = v*(-1.0f/TWOPI); // be22f983 
00271 float b; *((uint32 *) &b) = 0x4b000000; 
00272 acc -= 1.0f*b; 
00273 acc += 1.0f*b; // after this add all decimals will be removed leaving -int(v) due to controled rounding errors 
00274 acc -= (v*(-1.0f/TWOPI)); // this will add v to -int(v) leaving just the decimals 
00275 
00276 // v (in acc) is now in the range [0; 1) 
00277 // cos/sin in this description is addapted to 
00278 // the domain [0; 1) instead of [0; TWOPI) 
00279 // cos(v)=-cos(v-0.5)=-cos(-(v-0.5)) = -cos(abs(v-0.5)) = 
00280 // -sin(abs(v-0.5)+0.25) = sin(abs(v-0.5)-0.25) 
00281 // since v was in the range [0; 1) then 
00282 // v' = abs(v-0.5)-0.25 will be in the range [-0.25; 0.25) 
00283 acc -= 1.0f*0.5f; 
00284 v = Abs(acc) - 0.25f; 
00285 
00286 float s1, s2, s3, s4, s5; 
00287 s1 = MTC1(0x40c90fdb); 
00288 s2 = MTC1(0xc2255de1); 
00289 s3 = MTC1(0x42a33458); 
00290 s4 = MTC1(0xc2992663); 
00291 s5 = MTC1(0x421ed7bb); 
00292 
00293 float v2 = v*v; 
00294 float v3 = v*v2; 
00295 float v5 = v3*v2; 
00296 float v7 = v5*v2; 
00297 float v9 = v7*v2; 
00298 r = s1*v+s2*v3+s3*v5+s4*v7+s5*v9; 
00299 
00300 */ 
00301 
00302 return r; 
00303 } 

Generated on Sun May 18 21:45:08 2008 for PS2X by  doxygen 1.5.4