OgreSIMDHelper.h

/*
-----------------------------------------------------------------------------
This source file is part of OGRE
    (Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2006 Torus Knot Software Ltd
Also see acknowledgements in Readme.html

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA, or go to
http://www.gnu.org/copyleft/lesser.txt.

You may alternatively use this source under the terms of a specific version of
the OGRE Unrestricted License provided you have obtained such a license from
Torus Knot Software Ltd.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

#include "OgrePrerequisites.h"
#include "OgrePlatformInformation.h"

// Stack-alignment hackery.
//
// If the macro __OGRE_SIMD_ALIGN_STACK is defined, it means special code is
// required to ensure the stack is aligned to a 16-byte boundary.
//
// Note:
//   This macro can only guarantee that the callee's stack pointer (esp) is
// aligned to a 16-byte boundary, not the frame pointer (ebp). Because most
// compilers may use the frame pointer to access stack variables, you need to
// wrap alignment-required functions in an extra function call (see the usage
// sketch after the platform branches below).
//
#if defined(__INTEL_COMPILER)
// For Intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)

#elif OGRE_CPU == OGRE_CPU_X86 && OGRE_COMPILER == OGRE_COMPILER_GNUC
//
// Horrible hack to align the stack to a 16-byte boundary for gcc.
//
// We assume a gcc version >= 2.95 so that -mpreferred-stack-boundary works.
// Otherwise, all bets are off. However, -mpreferred-stack-boundary does not
// create stack alignment, it only preserves it. Unfortunately, since OGRE is
// designed as a flexible library, users might compile their applications with
// the wrong stack alignment. Even when users do take care of stack alignment,
// many versions of libc on Linux call main() with a misaligned initial stack,
// with the result that the code ends up pessimally aligned instead of having
// a 50% chance of being correct.
//
#if OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64

#define __OGRE_SIMD_ALIGN_STACK()                                   \
    {                                                               \
        /* Use alloca to allocate some memory on the stack.  */     \
        /* This alerts gcc that something funny is going on, */     \
        /* so that it does not omit the frame pointer etc.   */     \
        (void)__builtin_alloca(16);                                 \
        /* Now align the stack pointer */                           \
        __asm__ __volatile__ ("andl $-16, %esp");                   \
    }

#else // 64
#define __OGRE_SIMD_ALIGN_STACK()                                   \
    {                                                               \
        /* Use alloca to allocate some memory on the stack.  */     \
        /* This alerts gcc that something funny is going on, */     \
        /* so that it does not omit the frame pointer etc.   */     \
        (void)__builtin_alloca(16);                                 \
        /* Now align the stack pointer */                           \
        __asm__ __volatile__ ("andq $-16, %rsp");                   \
    }
#endif //64

#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically

#endif


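// Usage sketch (illustrative only, not part of the original header): because
// __OGRE_SIMD_ALIGN_STACK() only realigns the stack pointer of the function
// that invokes it, the SSE work is delegated to a separate function so that
// its __m128 locals are laid out on the freshly aligned stack. The function
// names below are hypothetical.
#if 0
static void transformVerticesSSE(float* dst, const float* src, size_t count)
{
    // __m128 locals declared here live on the stack the wrapper just aligned.
    // ... SSE loop ...
}

void transformVertices(float* dst, const float* src, size_t count)
{
    __OGRE_SIMD_ALIGN_STACK();                  // realign esp/rsp first
    transformVerticesSSE(dst, src, count);      // do the real work one call deeper
}
#endif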
// Additional platform-dependent header files and declarations.
//
// NOTE: Should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86

#if OGRE_COMPILER == OGRE_COMPILER_MSVC || defined(__INTEL_COMPILER)
#include <xmmintrin.h>

#elif OGRE_COMPILER == OGRE_COMPILER_GNUC
// Don't define our own version of the SSE intrinsics if "xmmintrin.h" has
// already been included.
//
// Note: on some platforms gcc already includes "xmmintrin.h" for some reason.
// The _XMMINTRIN_H_INCLUDED macro used here is based on the "xmmintrin.h"
// that ships with cygwin gcc 3.4.4; it should avoid duplicate-definition
// problems on gcc for x86.
//
#if !defined(_XMMINTRIN_H_INCLUDED)

// Simulate the VC/ICC intrinsics. Only the intrinsics actually used are declared here.
#   if OGRE_COMP_VER >= 350
typedef float __m128 __attribute__ ((vector_size (16), aligned(16)));
typedef int __m64 __attribute__ ((vector_size (8)));
#   else
typedef float __m128 __attribute__ ((mode(V4SF),aligned(16)));
typedef int __m64 __attribute__ ((mode(V2SI)));
#   endif

// Macro to declare intrinsic routines as always inline, even in debug builds
#define __ALWAYS_INLINE    FORCEINLINE __attribute__ ((__always_inline__))

// The shuffle instruction must be declared as a macro, because its immediate
// operand has to be a compile-time constant.

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define _mm_shuffle_ps(a, b, imm8) __extension__                                        \
    ({                                                                                  \
        __m128 result;                                                                  \
        __asm__("shufps %3, %2, %0" : "=x" (result) : "0" (a), "xm" (b), "N" (imm8));   \
        result;                                                                         \
    })

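// Usage sketch (illustrative only, not part of the original header): the
// _MM_SHUFFLE selector packs four 2-bit lane indices into one immediate, so
// _MM_SHUFFLE(0,1,2,3) == 0x1B reverses the lanes and _MM_SHUFFLE(n,n,n,n)
// broadcasts lane n. The helper names below are hypothetical.
#if 0
static __m128 reverseLanes(__m128 v /* lanes { x, y, z, w }, lane 0 first */)
{
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,1,2,3));  // { w, z, y, x }
}

static __m128 splatLane0(__m128 v)
{
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,0,0,0));  // { x, x, x, x }
}
#endif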

// Load/store instructions

#define __MM_DECL_LD(name, instruction, type)                               \
    static __ALWAYS_INLINE __m128 _mm_##name(const type *addr)              \
    {                                                                       \
        __m128 result;                                                      \
        __asm__( #instruction " %1, %0" : "=x" (result) : "m" (*addr));     \
        return result;                                                      \
    }

#define __MM_DECL_LD2(name, instruction, type)                                      \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 val, const type *addr)          \
    {                                                                               \
        __m128 result;                                                              \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0"(val), "m" (*addr));   \
        return result;                                                              \
    }

#define __MM_DECL_ST(name, instruction, type)                               \
    static __ALWAYS_INLINE void _mm_##name(type *addr, __m128 val)          \
    {                                                                       \
        __asm__( #instruction " %1, %0" : "=m" (*addr) : "x" (val));        \
    }

__MM_DECL_LD(loadu_ps, movups, float)
__MM_DECL_ST(storeu_ps, movups, float)

__MM_DECL_LD(load_ss, movss, float)
__MM_DECL_ST(store_ss, movss, float)

__MM_DECL_ST(storel_pi, movlps, __m64)
__MM_DECL_ST(storeh_pi, movhps, __m64)
__MM_DECL_LD2(loadl_pi, movlps, __m64)
__MM_DECL_LD2(loadh_pi, movhps, __m64)

#undef __MM_DECL_LD
#undef __MM_DECL_LD2
#undef __MM_DECL_ST

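// Expansion sketch (illustrative only, not part of the original header):
// __MM_DECL_LD(loadu_ps, movups, float) above produces an inline wrapper
// equivalent to the one below, i.e. the same signature as the real
// _mm_loadu_ps from <xmmintrin.h>.
#if 0
static __ALWAYS_INLINE __m128 _mm_loadu_ps(const float *addr)
{
    __m128 result;
    __asm__("movups %1, %0" : "=x" (result) : "m" (*addr));
    return result;
}
#endif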
// Two-operand instructions

#define __MM_DECL_OP2(name, instruction, constraint)                                    \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 a, __m128 b)                        \
    {                                                                                   \
        __m128 result;                                                                  \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0" (a), #constraint (b));    \
        return result;                                                                  \
    }

__MM_DECL_OP2(add_ps, addps, xm)
__MM_DECL_OP2(add_ss, addss, xm)
__MM_DECL_OP2(sub_ps, subps, xm)
__MM_DECL_OP2(sub_ss, subss, xm)
__MM_DECL_OP2(mul_ps, mulps, xm)
__MM_DECL_OP2(mul_ss, mulss, xm)

__MM_DECL_OP2(xor_ps, xorps, xm)

__MM_DECL_OP2(unpacklo_ps, unpcklps, xm)
__MM_DECL_OP2(unpackhi_ps, unpckhps, xm)

__MM_DECL_OP2(movehl_ps, movhlps, x)
__MM_DECL_OP2(movelh_ps, movlhps, x)

__MM_DECL_OP2(cmpnle_ps, cmpnleps, xm)

#undef __MM_DECL_OP2

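// Usage sketch (illustrative only, not part of the original header): the
// wrappers above behave like their <xmmintrin.h> counterparts, e.g. a
// per-lane (a*a - b*b) via the difference-of-squares identity. Note that
// movhlps/movlhps are declared with the register-only "x" constraint because
// those instructions have no memory-operand form. "diffOfSquares" is a
// hypothetical helper.
#if 0
static __m128 diffOfSquares(__m128 a, __m128 b)
{
    return _mm_mul_ps(_mm_sub_ps(a, b), _mm_add_ps(a, b));  // (a-b)*(a+b)
}
#endif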
// Other intrinsics used

    static __ALWAYS_INLINE __m128 _mm_load_ps1(const float *addr)
    {
        __m128 tmp = _mm_load_ss(addr);
        return _mm_shuffle_ps(tmp, tmp, 0);
    }

    static __ALWAYS_INLINE __m128 _mm_setzero_ps(void)
    {
        __m128 result;
        __asm__("xorps %0, %0" : "=x" (result));
        return result;
    }

    static __ALWAYS_INLINE __m128 _mm_rsqrt_ps(__m128 val)
    {
        __m128 result;
        __asm__("rsqrtps %1, %0" : "=x" (result) : "xm" (val));
        //__asm__("rsqrtps %0, %0" : "=x" (result) : "0" (val));
        return result;
    }

    static __ALWAYS_INLINE int _mm_movemask_ps(__m128 val)
    {
        int result;
        __asm__("movmskps %1, %0" : "=r" (result) : "x" (val));
        return result;
    }

#endif // !defined(_XMMINTRIN_H_INCLUDED)

#endif // OGRE_COMPILER == OGRE_COMPILER_GNUC

#endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
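// Usage sketch (illustrative only, not part of the original header):
// _mm_cmpnle_ps + _mm_movemask_ps yield a 4-bit mask of per-lane comparison
// results, which is how SSE code typically turns vector compares into scalar
// branch decisions. "anyGreaterThan" is a hypothetical helper.
#if 0
static bool anyGreaterThan(__m128 values, __m128 limits)
{
    __m128 notLessEqual = _mm_cmpnle_ps(values, limits); // lane = all-ones where values > limits
    return _mm_movemask_ps(notLessEqual) != 0;           // any sign bit set?
}
#endif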



//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {

#if __OGRE_HAVE_SSE

#if 1
#define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x) // Implemented below
#endif

#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                                        \
    {                                                                               \
        __m128 t3, t2, t1, t0;                                                      \
                                                                                    \
                                                            /* r00 r01 r02 r03 */   \
                                                            /* r10 r11 r12 r13 */   \
                                                            /* r20 r21 r22 r23 */   \
                                                            /* r30 r31 r32 r33 */   \
                                                                                    \
        t0 = _mm_unpacklo_ps(r0, r1);                       /* r00 r10 r01 r11 */   \
        t2 = _mm_unpackhi_ps(r0, r1);                       /* r02 r12 r03 r13 */   \
        t1 = _mm_unpacklo_ps(r2, r3);                       /* r20 r30 r21 r31 */   \
        t3 = _mm_unpackhi_ps(r2, r3);                       /* r22 r32 r23 r33 */   \
                                                                                    \
        r0 = _mm_movelh_ps(t0, t1);                         /* r00 r10 r20 r30 */   \
        r1 = _mm_movehl_ps(t1, t0);                         /* r01 r11 r21 r31 */   \
        r2 = _mm_movelh_ps(t2, t3);                         /* r02 r12 r22 r32 */   \
        r3 = _mm_movehl_ps(t3, t2);                         /* r03 r13 r23 r33 */   \
    }

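// Usage sketch (illustrative only, not part of the original header):
// transposing a row-major 4x4 float matrix in place. "transpose4x4" is a
// hypothetical helper.
#if 0
static void transpose4x4(float* m /* 16 floats, row-major */)
{
    __m128 r0 = _mm_loadu_ps(m +  0);
    __m128 r1 = _mm_loadu_ps(m +  4);
    __m128 r2 = _mm_loadu_ps(m +  8);
    __m128 r3 = _mm_loadu_ps(m + 12);
    __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);   // registers now hold the columns
    _mm_storeu_ps(m +  0, r0);
    _mm_storeu_ps(m +  4, r1);
    _mm_storeu_ps(m +  8, r2);
    _mm_storeu_ps(m + 12, r3);
}
#endif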
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                            \
    {                                                                               \
        __m128 t0, t1, t2;                                                          \
                                                                                    \
                                                            /* r00 r01 r02 r10 */   \
                                                            /* r11 r12 r20 r21 */   \
                                                            /* r22 r30 r31 r32 */   \
                                                                                    \
        t0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));  /* r00 r10 r22 r32 */   \
        t1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));  /* r01 r02 r11 r12 */   \
        t2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));  /* r20 r21 r30 r31 */   \
                                                                                    \
        v0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */   \
        v1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */   \
        v2 = _mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */   \
    }

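// Usage sketch (illustrative only, not part of the original header):
// __MM_TRANSPOSE4x3_PS converts four packed 3-component vectors (12
// consecutive floats) from AoS to SoA form. "loadFourVector3sSoA" is a
// hypothetical helper.
#if 0
static void loadFourVector3sSoA(const float* p /* 12 floats: 4 packed xyz */,
                                __m128& xs, __m128& ys, __m128& zs)
{
    __m128 v0 = _mm_loadu_ps(p + 0);    // x0 y0 z0 x1
    __m128 v1 = _mm_loadu_ps(p + 4);    // y1 z1 x2 y2
    __m128 v2 = _mm_loadu_ps(p + 8);    // z2 x3 y3 z3
    __MM_TRANSPOSE4x3_PS(v0, v1, v2);
    xs = v0; ys = v1; zs = v2;          // x0..x3, y0..y3, z0..z3
}
#endif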
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                            \
    {                                                                               \
        __m128 t0, t1, t2;                                                          \
                                                                                    \
                                                            /* r00 r10 r20 r30 */   \
                                                            /* r01 r11 r21 r31 */   \
                                                            /* r02 r12 r22 r32 */   \
                                                                                    \
        t0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));  /* r10 r30 r02 r22 */   \
        t1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));  /* r11 r31 r12 r32 */   \
        t2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));  /* r00 r20 r01 r21 */   \
                                                                                    \
        v0 = _mm_shuffle_ps(t2, t0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */   \
        v1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */   \
        v2 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */   \
    }

#define __MM_SELECT(v, fp)                                                          \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

#define __MM_ACCUM4_PS(a, b, c, d)                                                  \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)                              \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                                  \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

#define __MM_ACCUM3_PS(a, b, c)                                                     \
    _mm_add_ps(_mm_add_ps(a, b), c)

#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                                      \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

#define __MM_MADD_PS(a, b, c)                                                       \
    _mm_add_ps(_mm_mul_ps(a, b), c)

#define __MM_LERP_PS(t, a, b)                                                       \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

#define __MM_MADD_SS(a, b, c)                                                       \
    _mm_add_ss(_mm_mul_ss(a, b), c)

#define __MM_LERP_SS(t, a, b)                                                       \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)

#define __MM_LOAD_PS(p)                                                             \
    (*(__m128*)(p))

#define __MM_STORE_PS(p, v)                                                         \
    (*(__m128*)(p) = (v))


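// Usage sketch (illustrative only, not part of the original header): blending
// four floats towards four targets with __MM_LERP_PS, which expands to
// (b - a) * t + a per lane. "lerp4" is a hypothetical helper.
#if 0
static __m128 lerp4(float weight, __m128 a, __m128 b)
{
    __m128 t = _mm_load_ps1(&weight);   // broadcast the scalar weight
    return __MM_LERP_PS(t, a, b);       // a + (b - a) * weight, per lane
}
#endif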
    template <bool aligned = false>
    struct SSEMemoryAccessor
    {
        static FORCEINLINE __m128 load(const float *p)
        {
            return _mm_loadu_ps(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            _mm_storeu_ps(p, v);
        }
    };
    // Special aligned accessor
    template <>
    struct SSEMemoryAccessor<true>
    {
        static FORCEINLINE const __m128& load(const float *p)
        {
            return __MM_LOAD_PS(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            __MM_STORE_PS(p, v);
        }
    };

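    // Usage sketch (illustrative only, not part of the original header): the
    // accessor lets one template body serve both aligned and unaligned data,
    // selected at compile time. "scaleFloats" is a hypothetical helper.
#if 0
    template <bool srcAligned>
    void scaleFloats(float *dst, const float *src, size_t count4, __m128 scale)
    {
        typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
        for (size_t i = 0; i < count4; ++i, src += 4, dst += 4)
            _mm_storeu_ps(dst, _mm_mul_ps(SrcAccessor::load(src), scale));
    }
#endif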
    static FORCEINLINE bool _isAlignedForSSE(const void *p)
    {
        return (((size_t)p) & 15) == 0;
    }

    static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
    {
        static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
        static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
        __m128 t = _mm_rsqrt_ps(x);
        return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
            _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
    }

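    // Note on the refinement above (added remark, not part of the original
    // header): one Newton-Raphson step for f(y) = 1/y^2 - x gives
    //     y1 = 0.5 * y0 * (3 - x * y0 * y0),
    // which is exactly the expression computed from the rsqrtps estimate t,
    // roughly doubling its ~12-bit precision.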
// Macro to check that the stack is aligned for SSE
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
    {                                               \
        __m128 test;                                \
        assert(_isAlignedForSSE(&test));            \
    }

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE


#endif  // __OGRE_HAVE_SSE

}

#endif // __SIMDHelper_H__
