/*
-----------------------------------------------------------------------------
This source file is part of OGRE
(Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2006 Torus Knot Software Ltd
Also see acknowledgements in Readme.html

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA, or go to
http://www.gnu.org/copyleft/lesser.txt.

You may alternatively use this source under the terms of a specific version of
the OGRE Unrestricted License provided you have obtained such a license from
Torus Knot Software Ltd.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

#include "OgrePrerequisites.h"
#include "OgrePlatformInformation.h"

// Stack-alignment hackery.
//
// If the macro __OGRE_SIMD_ALIGN_STACK is defined, special code is
// required to ensure the stack is aligned to a 16-byte boundary.
//
// Note:
// This macro can only guarantee that the callee's stack pointer (esp)
// is aligned to a 16-byte boundary; it cannot do the same for the
// frame pointer (ebp). Because most compilers use the frame pointer
// to access stack variables, you need to wrap each function that
// requires alignment in an extra function call.
//
#if defined(__INTEL_COMPILER)
// For Intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)

#elif OGRE_CPU == OGRE_CPU_X86 && OGRE_COMPILER == OGRE_COMPILER_GNUC
//
// Horrible hack to align the stack to a 16-byte boundary for gcc.
//
// We assume a gcc version >= 2.95 so that
// -mpreferred-stack-boundary works. Otherwise, all bets are off.
// However, -mpreferred-stack-boundary does not create a stack
// alignment; it only preserves one. Since OGRE is designed as a
// flexible library, users might compile their applications with the
// wrong stack alignment, and even when they take care, many versions
// of libc on Linux call main() with the wrong initial stack
// alignment, with the result that the code is now pessimally aligned
// instead of having a 50% chance of being correct.
//
#if OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64

#define __OGRE_SIMD_ALIGN_STACK()                                   \
    {                                                               \
        /* Use alloca to allocate some memory on the stack.  */     \
        /* This alerts gcc that something funny is going on, */     \
        /* so that it does not omit the frame pointer etc.   */     \
        (void)__builtin_alloca(16);                                 \
        /* Now align the stack pointer */                           \
        __asm__ __volatile__ ("andl $-16, %esp");                   \
    }

#else // 64

#define __OGRE_SIMD_ALIGN_STACK()                                   \
    {                                                               \
        /* Use alloca to allocate some memory on the stack.  */     \
        /* This alerts gcc that something funny is going on, */     \
        /* so that it does not omit the frame pointer etc.   */     \
        (void)__builtin_alloca(16);                                 \
        /* Now align the stack pointer */                           \
        __asm__ __volatile__ ("andq $-16, %rsp");                   \
    }

#endif // 64

#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically

#endif
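
// Usage sketch (hypothetical function names, not part of this header):
// because the macro can only align the stack pointer of the function that
// executes it, the SSE work is placed in a separate worker that is called
// *after* aligning, so the worker starts from a 16-byte aligned stack.
//
//     static void processVertices_SSEWorker(void)
//     {
//         // ... the actual SSE code; __m128 stack locals are now aligned
//     }
//
//     static void processVertices(void)
//     {
//     #if defined(__OGRE_SIMD_ALIGN_STACK)
//         __OGRE_SIMD_ALIGN_STACK();
//     #endif
//         processVertices_SSEWorker();
//     }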

// Additional platform-dependent header files and declarations.
//
// NOTE: This should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86

#if OGRE_COMPILER == OGRE_COMPILER_MSVC || defined(__INTEL_COMPILER)
#include <xmmintrin.h>

#elif OGRE_COMPILER == OGRE_COMPILER_GNUC
// Don't define our own version of the SSE intrinsics if "xmmintrin.h"
// has already been included.
//
// Note: gcc on some platforms already includes "xmmintrin.h" for some
// reason. The guard macro _XMMINTRIN_H_INCLUDED used here is taken from
// the "xmmintrin.h" that ships with cygwin gcc 3.4.4; it should avoid
// the duplicate-definition problem on gcc for x86.
//
#if !defined(_XMMINTRIN_H_INCLUDED)

// Simulate the VC/ICC intrinsics. Only the intrinsics actually used are
// declared here.
#if OGRE_COMP_VER >= 350
typedef float __m128 __attribute__ ((vector_size (16), aligned(16)));
typedef int __m64 __attribute__ ((vector_size (8)));
#else
typedef float __m128 __attribute__ ((mode(V4SF), aligned(16)));
typedef int __m64 __attribute__ ((mode(V2SI)));
#endif

// Macro to declare intrinsic routines as always inline, even in debug builds
#define __ALWAYS_INLINE     FORCEINLINE __attribute__ ((__always_inline__))

// The shuffle intrinsic must be declared as a macro, because its
// immediate operand has to be a compile-time constant.

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0)                                    \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define _mm_shuffle_ps(a, b, imm8) __extension__                        \
    ({                                                                  \
        __m128 result;                                                  \
        __asm__("shufps %3, %2, %0" : "=x" (result) : "0" (a), "xm" (b), "N" (imm8)); \
        result;                                                         \
    })


// Load/store instructions

#define __MM_DECL_LD(name, instruction, type)                           \
    static __ALWAYS_INLINE __m128 _mm_##name(const type *addr)          \
    {                                                                   \
        __m128 result;                                                  \
        __asm__( #instruction " %1, %0" : "=x" (result) : "m" (*addr)); \
        return result;                                                  \
    }

#define __MM_DECL_LD2(name, instruction, type)                          \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 val, const type *addr) \
    {                                                                   \
        __m128 result;                                                  \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0"(val), "m" (*addr)); \
        return result;                                                  \
    }

#define __MM_DECL_ST(name, instruction, type)                           \
    static __ALWAYS_INLINE void _mm_##name(type *addr, __m128 val)      \
    {                                                                   \
        __asm__( #instruction " %1, %0" : "=m" (*addr) : "x" (val));    \
    }

__MM_DECL_LD(loadu_ps, movups, float)
__MM_DECL_ST(storeu_ps, movups, float)

__MM_DECL_LD(load_ss, movss, float)
__MM_DECL_ST(store_ss, movss, float)

__MM_DECL_ST(storel_pi, movlps, __m64)
__MM_DECL_ST(storeh_pi, movhps, __m64)

__MM_DECL_LD2(loadl_pi, movlps, __m64)
__MM_DECL_LD2(loadh_pi, movhps, __m64)

#undef __MM_DECL_LD
#undef __MM_DECL_LD2
#undef __MM_DECL_ST
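
// For reference, a sketch of the expansion (not literal preprocessor
// output): __MM_DECL_LD(loadu_ps, movups, float) above produces an
// emulated intrinsic with the same signature as the xmmintrin.h one:
//
//     static __ALWAYS_INLINE __m128 _mm_loadu_ps(const float *addr)
//     {
//         __m128 result;
//         __asm__("movups %1, %0" : "=x" (result) : "m" (*addr));
//         return result;
//     }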

// Two-operand instructions

#define __MM_DECL_OP2(name, instruction, constraint)                    \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 a, __m128 b)        \
    {                                                                   \
        __m128 result;                                                  \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0" (a), #constraint (b)); \
        return result;                                                  \
    }

__MM_DECL_OP2(add_ps, addps, xm)
__MM_DECL_OP2(add_ss, addss, xm)
__MM_DECL_OP2(sub_ps, subps, xm)
__MM_DECL_OP2(sub_ss, subss, xm)
__MM_DECL_OP2(mul_ps, mulps, xm)
__MM_DECL_OP2(mul_ss, mulss, xm)

__MM_DECL_OP2(xor_ps, xorps, xm)

__MM_DECL_OP2(unpacklo_ps, unpcklps, xm)
__MM_DECL_OP2(unpackhi_ps, unpckhps, xm)

__MM_DECL_OP2(movehl_ps, movhlps, x)
__MM_DECL_OP2(movelh_ps, movlhps, x)

__MM_DECL_OP2(cmpnle_ps, cmpnleps, xm)

#undef __MM_DECL_OP2

// Other instructions used

static __ALWAYS_INLINE __m128 _mm_load_ps1(const float *addr)
{
    __m128 tmp = _mm_load_ss(addr);
    return _mm_shuffle_ps(tmp, tmp, 0);
}

static __ALWAYS_INLINE __m128 _mm_setzero_ps(void)
{
    __m128 result;
    __asm__("xorps %0, %0" : "=x" (result));
    return result;
}

static __ALWAYS_INLINE __m128 _mm_rsqrt_ps(__m128 val)
{
    __m128 result;
    __asm__("rsqrtps %1, %0" : "=x" (result) : "xm" (val));
    //__asm__("rsqrtps %0, %0" : "=x" (result) : "0" (val));
    return result;
}

static __ALWAYS_INLINE int _mm_movemask_ps(__m128 val)
{
    int result;
    __asm__("movmskps %1, %0" : "=r" (result) : "x" (val));
    return result;
}

#endif // !defined(_XMMINTRIN_H_INCLUDED)

#endif // OGRE_COMPILER == OGRE_COMPILER_GNUC

#endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86



//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {

#if __OGRE_HAVE_SSE

/** Approximate the reciprocal square root of four packed floats.
    Change the '#if 1' below to '#if 0' to use the more accurate
    Newton-Raphson refined version implemented later in this file.
*/
#if 1
#define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x) // Implemented below
#endif

/** Transpose four __m128 values, treated as the rows of a 4x4 matrix;
    the transpose is performed in place.
*/
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                            \
    {                                                                   \
        __m128 t3, t2, t1, t0;                                          \
                                                                        \
                                            /* r00 r01 r02 r03 */       \
                                            /* r10 r11 r12 r13 */       \
                                            /* r20 r21 r22 r23 */       \
                                            /* r30 r31 r32 r33 */       \
                                                                        \
        t0 = _mm_unpacklo_ps(r0, r1);       /* r00 r10 r01 r11 */       \
        t2 = _mm_unpackhi_ps(r0, r1);       /* r02 r12 r03 r13 */       \
        t1 = _mm_unpacklo_ps(r2, r3);       /* r20 r30 r21 r31 */       \
        t3 = _mm_unpackhi_ps(r2, r3);       /* r22 r32 r23 r33 */       \
                                                                        \
        r0 = _mm_movelh_ps(t0, t1);         /* r00 r10 r20 r30 */       \
        r1 = _mm_movehl_ps(t1, t0);         /* r01 r11 r21 r31 */       \
        r2 = _mm_movelh_ps(t2, t3);         /* r02 r12 r22 r32 */       \
        r3 = _mm_movehl_ps(t3, t2);         /* r03 r13 r23 r33 */       \
    }
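
// Usage sketch (hypothetical helper, not part of the original header):
// transpose a row-major 4x4 float matrix in memory using the macro above.
// The macro works in registers and in place, so it is load / transpose /
// store; unaligned load/store is used so 'm' needs no special alignment.
static FORCEINLINE void _transpose4x4_Example(float* m)
{
    __m128 r0 = _mm_loadu_ps(m +  0);       // row 0
    __m128 r1 = _mm_loadu_ps(m +  4);       // row 1
    __m128 r2 = _mm_loadu_ps(m +  8);       // row 2
    __m128 r3 = _mm_loadu_ps(m + 12);       // row 3

    __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);   // rows become columns, in place

    _mm_storeu_ps(m +  0, r0);
    _mm_storeu_ps(m +  4, r1);
    _mm_storeu_ps(m +  8, r2);
    _mm_storeu_ps(m + 12, r3);
}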

/** Transpose a 4x3 block (four 3-component vectors packed contiguously
    across three registers) into 3x4 form (one register per component);
    performed in place.
*/
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                \
    {                                                                   \
        __m128 t0, t1, t2;                                              \
                                                                        \
                                            /* r00 r01 r02 r10 */       \
                                            /* r11 r12 r20 r21 */       \
                                            /* r22 r30 r31 r32 */       \
                                                                        \
        t0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));  /* r00 r10 r22 r32 */ \
        t1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));  /* r01 r02 r11 r12 */ \
        t2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));  /* r20 r21 r30 r31 */ \
                                                                        \
        v0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */ \
        v1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */ \
        v2 = _mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */ \
    }

/** The inverse of __MM_TRANSPOSE4x3_PS: transpose a 3x4 block (one
    register per component of four vectors) back into packed 4x3 form;
    performed in place.
*/
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                \
    {                                                                   \
        __m128 t0, t1, t2;                                              \
                                                                        \
                                            /* r00 r10 r20 r30 */       \
                                            /* r01 r11 r21 r31 */       \
                                            /* r02 r12 r22 r32 */       \
                                                                        \
        t0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));  /* r10 r30 r02 r22 */ \
        t1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));  /* r11 r31 r12 r32 */ \
        t2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));  /* r00 r20 r01 r21 */ \
                                                                        \
        v0 = _mm_shuffle_ps(t2, t0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */ \
        v1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */ \
        v2 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */ \
    }

/// Broadcast element 'fp' (0-3) of a vector to all four lanes.
#define __MM_SELECT(v, fp)                                              \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

/// Accumulate four vectors of single-precision floats.
#define __MM_ACCUM4_PS(a, b, c, d)                                      \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

/** Multiply and accumulate four pairs of vectors: a0*b0 + a1*b1 +
    a2*b2 + a3*b3. With a structure-of-arrays layout this yields four
    4-D dot products at once.
*/
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)                  \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

/** Multiply and accumulate three pairs of vectors plus a bias:
    r0*v0 + r1*v1 + r2*v2 + r3, i.e. a 4-D dot product with an implicit
    w = 1 on the second operand.
*/
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                      \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

/// Accumulate three vectors of single-precision floats.
#define __MM_ACCUM3_PS(a, b, c)                                         \
    _mm_add_ps(_mm_add_ps(a, b), c)

/** Multiply and accumulate three pairs of vectors: r0*v0 + r1*v1 +
    r2*v2. With a structure-of-arrays layout this yields four 3-D dot
    products at once.
*/
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                          \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

/// Multiply-add: a * b + c.
#define __MM_MADD_PS(a, b, c)                                           \
    _mm_add_ps(_mm_mul_ps(a, b), c)

/// Linear interpolation: a + t * (b - a).
#define __MM_LERP_PS(t, a, b)                                           \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

/// Multiply-add, single float only: a * b + c.
#define __MM_MADD_SS(a, b, c)                                           \
    _mm_add_ss(_mm_mul_ss(a, b), c)

/// Linear interpolation, single float only: a + t * (b - a).
#define __MM_LERP_SS(t, a, b)                                           \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)

/// Load a vector from 16-byte aligned memory.
#define __MM_LOAD_PS(p)                                                 \
    (*(__m128*)(p))

/// Store a vector to 16-byte aligned memory.
#define __MM_STORE_PS(p, v)                                             \
    (*(__m128*)(p) = (v))


/** Helper to load/store SSE data depending on whether the pointer is
    guaranteed to be 16-byte aligned.
*/
template <bool aligned = false>
struct SSEMemoryAccessor
{
    static FORCEINLINE __m128 load(const float *p)
    {
        return _mm_loadu_ps(p);
    }
    static FORCEINLINE void store(float *p, const __m128& v)
    {
        _mm_storeu_ps(p, v);
    }
};
// Special aligned accessor
template <>
struct SSEMemoryAccessor<true>
{
    static FORCEINLINE const __m128& load(const float *p)
    {
        return __MM_LOAD_PS(p);
    }
    static FORCEINLINE void store(float *p, const __m128& v)
    {
        __MM_STORE_PS(p, v);
    }
};

/** Check whether the given pointer is aligned to a 16-byte boundary,
    as required for aligned SSE memory access.
*/
static FORCEINLINE bool _isAlignedForSSE(const void *p)
{
    return (((size_t)p) & 15) == 0;
}

/** Reciprocal square root refined with one Newton-Raphson iteration:
    t' = 0.5 * t * (3 - x * t * t), which roughly doubles the precision
    of the raw rsqrtps estimate.
*/
static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
{
    static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
    static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
    __m128 t = _mm_rsqrt_ps(x);
    return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
        _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
}
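
// Usage sketch (hypothetical helper, not part of the original header):
// normalise four 3-D vectors at once, stored structure-of-arrays style
// (x, y and z each hold one component of all four vectors). Precision is
// limited by rsqrtps unless __MM_RSQRT_PS is switched to the
// Newton-Raphson variant above.
static FORCEINLINE void _normalise4Vectors_Example(__m128& x, __m128& y, __m128& z)
{
    __m128 sqLen  = __MM_DOT3x3_PS(x, y, z, x, y, z);   // x*x + y*y + z*z per lane
    __m128 invLen = __MM_RSQRT_PS(sqLen);               // ~ 1 / sqrt(sqLen)
    x = _mm_mul_ps(x, invLen);
    y = _mm_mul_ps(y, invLen);
    z = _mm_mul_ps(z, invLen);
}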

// Macro to check that the stack is aligned for SSE
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()                            \
    {                                                                   \
        __m128 test;                                                    \
        assert(_isAlignedForSSE(&test));                                \
    }

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE


#endif // __OGRE_HAVE_SSE

}

#endif // __SIMDHelper_H__