Actual source code: iclsse.h

#ifndef __ICL_SSE_H_
#define __ICL_SSE_H_
#include <xmmintrin.h>


/* SSE_SCOPE_BEGIN must come after the LAST declaration in the outermost SSE scope */
#define SSE_SCOPE_BEGIN { __m128 XMM0,XMM1,XMM2,XMM3,XMM4,XMM5,XMM6,XMM7; {
#define SSE_SCOPE_END   }}

/* For use with SSE Inlined Assembly Blocks */
/* Note: SSE_ macro invocations must NOT be followed by a ; */

#define SSE_INLINE_BEGIN_1(arg1)           { float *_tmp_arg1; _tmp_arg1=arg1;
#define SSE_INLINE_END_1                   }
#define SSE_INLINE_BEGIN_2(arg1,arg2)      { float *_tmp_arg1, *_tmp_arg2; _tmp_arg1=arg1; _tmp_arg2=arg2;
#define SSE_INLINE_END_2                   }
#define SSE_INLINE_BEGIN_3(arg1,arg2,arg3) { float *_tmp_arg1, *_tmp_arg2, *_tmp_arg3; \
                                             _tmp_arg1=arg1; _tmp_arg2=arg2; _tmp_arg3=arg3;
#define SSE_INLINE_END_3                   }

#define SSE_ARG_1 _tmp_arg1
#define SSE_ARG_2 _tmp_arg2
#define SSE_ARG_3 _tmp_arg3
/* Note: If more args are to be used, be sure the debug version uses the most args allowed */
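
/* Example (illustrative sketch; x and y are hypothetical float pointers,
   both assumed 16-byte aligned, e.g. from SSE_MALLOC below): an SSE scope
   wrapping an inline block that multiplies the four floats at x by the four
   floats at y, in place.  The SSE_ macros supply their own trailing
   semicolons, so none are written after them.

     float *x,*y;
     ...
     SSE_SCOPE_BEGIN
       SSE_INLINE_BEGIN_2(x,y)
         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
         SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)
       SSE_INLINE_END_2
     SSE_SCOPE_END
*/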

/* Offset values for SSE_ load/store/arithmetic memory ops */
#define FLOAT_0    0
#define FLOAT_1    1
#define FLOAT_2    2
#define FLOAT_3    3
#define FLOAT_4    4
#define FLOAT_5    5
#define FLOAT_6    6
#define FLOAT_7    7
#define FLOAT_8    8
#define FLOAT_9    9
#define FLOAT_10  10
#define FLOAT_11  11
#define FLOAT_12  12
#define FLOAT_13  13
#define FLOAT_14  14
#define FLOAT_15  15

#define FLOAT_16  16
#define FLOAT_24  24
#define FLOAT_32  32
#define FLOAT_40  40
#define FLOAT_48  48
#define FLOAT_56  56
#define FLOAT_64  64

#define DOUBLE_0   0
#define DOUBLE_1   1
#define DOUBLE_2   2
#define DOUBLE_3   3
#define DOUBLE_4   4
#define DOUBLE_5   5
#define DOUBLE_6   6
#define DOUBLE_7   7

#define DOUBLE_8   8
#define DOUBLE_16 16
#define DOUBLE_20 20
#define DOUBLE_24 24
#define DOUBLE_28 28
#define DOUBLE_32 32
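
/* Illustrative note (a sketch, not a normative statement of the interface):
   these offsets are element counts, not byte counts, since the SSE_ macros
   index with &arg[offset].  With a float pointer, FLOAT_4 is 16 bytes past
   the base, so

     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)

   loads elements 0..3 and then 4..7; with a double pointer, DOUBLE_4 is
   32 bytes past the base. */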

/* xmmintrin.h provides for inline/debug versions automatically */
/* Inline versions */

/* Prefetch Macros */
#define SSE_PREFETCH_NTA(arg,offset)      PREFETCH_NTA(&arg[offset]);
#define SSE_PREFETCH_L1(arg,offset)       PREFETCH_L1(&arg[offset]);
#define SSE_PREFETCH_L2(arg,offset)       PREFETCH_L2(&arg[offset]);
#define SSE_PREFETCH_L3(arg,offset)       PREFETCH_L3(&arg[offset]);
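
/* Example (illustrative sketch): hint a later block of the array toward the
   cache while computing on the current one.  The 16-element lookahead is an
   arbitrary choice here; useful prefetch distance is machine-dependent.

     SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_16)
     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
*/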

/* Store Macros */
#define SSE_STORE_SS(arg,offset,srcreg)   STORE_SS(&arg[offset],srcreg);
#define SSE_STOREL_PS(arg,offset,srcreg)  STOREL_PS(&arg[offset],srcreg);
#define SSE_STOREH_PS(arg,offset,srcreg)  STOREH_PS(&arg[offset],srcreg);
#define SSE_STORE_PS(arg,offset,srcreg)   STORE_PS(&arg[offset],srcreg);
#define SSE_STOREU_PS(arg,offset,srcreg)  STOREU_PS(&arg[offset],srcreg);
#define SSE_STREAM_PS(arg,offset,srcreg)  STREAM_PS(&arg[offset],srcreg);
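
/* Illustrative note: STORE_PS and STREAM_PS require a 16-byte-aligned
   address; STREAM_PS additionally bypasses the cache, which suits results
   that will not be re-read soon.  Sketch:

     SSE_STREAM_PS(SSE_ARG_1,FLOAT_0,XMM0)

   For unaligned destinations, use SSE_STOREU_PS instead. */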

/* Register-Register Copy Macros */
#define SSE_COPY_SS(dstreg,srcreg)        COPY_SS(dstreg,srcreg);
#define SSE_COPY_PS(dstreg,srcreg)        COPY_PS(dstreg,srcreg);

/* Load Macros */
#define SSE_LOAD_SS(arg,offset,dstreg)    LOAD_SS(&arg[offset],dstreg);
#define SSE_LOADL_PS(arg,offset,dstreg)   LOADL_PS(&arg[offset],dstreg);
#define SSE_LOADH_PS(arg,offset,dstreg)   LOADH_PS(&arg[offset],dstreg);
#define SSE_LOAD_PS(arg,offset,dstreg)    LOAD_PS(&arg[offset],dstreg);
#define SSE_LOADU_PS(arg,offset,dstreg)   LOADU_PS(&arg[offset],dstreg);
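
/* Illustrative note: LOADL_PS/LOADH_PS each move 8 bytes with no alignment
   requirement, so a full register can be assembled from two halves when
   SSE_LOADU_PS is undesirable.  Sketch:

     SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
     SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
*/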

/* Shuffle */
#define SSE_SHUFFLE(dstreg,srcreg,imm)    SHUFFLE(dstreg,srcreg,imm);
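
/* Example (illustrative sketch): xmmintrin.h's _MM_SHUFFLE builds the
   immediate; e.g. broadcasting element 0 of XMM0 into all four positions:

     SSE_SHUFFLE(XMM0,XMM0,_MM_SHUFFLE(0,0,0,0))
*/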

/* Multiply: A:=A*B */
#define SSE_MULT_SS(dstreg,srcreg)        MULT_SS(dstreg,srcreg);
#define SSE_MULT_PS(dstreg,srcreg)        MULT_PS(dstreg,srcreg);
#define SSE_MULT_SS_M(dstreg,arg,offset)  MULT_SS_M(dstreg,&arg[offset]);
#define SSE_MULT_PS_M(dstreg,arg,offset)  MULT_PS_M(dstreg,&arg[offset]);

/* Divide: A:=A/B */
#define SSE_DIV_SS(dstreg,srcreg)         DIV_SS(dstreg,srcreg);
#define SSE_DIV_PS(dstreg,srcreg)         DIV_PS(dstreg,srcreg);
#define SSE_DIV_SS_M(dstreg,arg,offset)   DIV_SS_M(dstreg,&arg[offset]);
#define SSE_DIV_PS_M(dstreg,arg,offset)   DIV_PS_M(dstreg,&arg[offset]);

/* Reciprocal: A:=1/B */
#define SSE_RECIP_SS(dstreg,srcreg)       RECIP_SS(dstreg,srcreg);
#define SSE_RECIP_PS(dstreg,srcreg)       RECIP_PS(dstreg,srcreg);
#define SSE_RECIP_SS_M(dstreg,arg,offset) RECIP_SS_M(dstreg,&arg[offset]);
#define SSE_RECIP_PS_M(dstreg,arg,offset) RECIP_PS_M(dstreg,&arg[offset]);
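
/* Illustrative note: _mm_rcp_ps returns only an ~12-bit approximation x0 of
   1/B.  One Newton-Raphson step, x1 = x0*(2 - B*x0), recovers nearly full
   single precision.  Sketch, with B in XMM1 (left intact) and _two a
   hypothetical array of four 2.0F values:

     SSE_RECIP_PS(XMM2,XMM1)            x0 ~= 1/B
     SSE_COPY_PS(XMM3,XMM1)             XMM3 = B
     SSE_MULT_PS(XMM3,XMM2)             XMM3 = B*x0
     SSE_LOADU_PS(_two,FLOAT_0,XMM4)    XMM4 = 2.0
     SSE_SUB_PS(XMM4,XMM3)              XMM4 = 2 - B*x0
     SSE_MULT_PS(XMM2,XMM4)             XMM2 = x1
*/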

/* Add: A:=A+B */
#define SSE_ADD_SS(dstreg,srcreg)         ADD_SS(dstreg,srcreg);
#define SSE_ADD_PS(dstreg,srcreg)         ADD_PS(dstreg,srcreg);
#define SSE_ADD_SS_M(dstreg,arg,offset)   ADD_SS_M(dstreg,&arg[offset]);
#define SSE_ADD_PS_M(dstreg,arg,offset)   ADD_PS_M(dstreg,&arg[offset]);

/* Subtract: A:=A-B */
#define SSE_SUB_SS(dstreg,srcreg)         SUB_SS(dstreg,srcreg);
#define SSE_SUB_PS(dstreg,srcreg)         SUB_PS(dstreg,srcreg);
#define SSE_SUB_SS_M(dstreg,arg,offset)   SUB_SS_M(dstreg,&arg[offset]);
#define SSE_SUB_PS_M(dstreg,arg,offset)   SUB_PS_M(dstreg,&arg[offset]);

/* Logical: A:=A<op>B */
#define SSE_AND_SS(dstreg,srcreg)         AND_SS(dstreg,srcreg);
#define SSE_ANDNOT_SS(dstreg,srcreg)      ANDNOT_SS(dstreg,srcreg);
#define SSE_OR_SS(dstreg,srcreg)          OR_SS(dstreg,srcreg);
#define SSE_XOR_SS(dstreg,srcreg)         XOR_SS(dstreg,srcreg);

#define SSE_AND_PS(dstreg,srcreg)         AND_PS(dstreg,srcreg);
#define SSE_ANDNOT_PS(dstreg,srcreg)      ANDNOT_PS(dstreg,srcreg);
#define SSE_OR_PS(dstreg,srcreg)          OR_PS(dstreg,srcreg);
#define SSE_XOR_PS(dstreg,srcreg)         XOR_PS(dstreg,srcreg);

/* Comparisons: A:=A<compare>B */
#define SSE_CMPEQ_SS(dstreg,srcreg)       CMPEQ_SS(dstreg,srcreg);
#define SSE_CMPLT_SS(dstreg,srcreg)       CMPLT_SS(dstreg,srcreg);
#define SSE_CMPLE_SS(dstreg,srcreg)       CMPLE_SS(dstreg,srcreg);
#define SSE_CMPUNORD_SS(dstreg,srcreg)    CMPUNORD_SS(dstreg,srcreg);
#define SSE_CMPNEQ_SS(dstreg,srcreg)      CMPNEQ_SS(dstreg,srcreg);
#define SSE_CMPNLT_SS(dstreg,srcreg)      CMPNLT_SS(dstreg,srcreg);
#define SSE_CMPNLE_SS(dstreg,srcreg)      CMPNLE_SS(dstreg,srcreg);
#define SSE_CMPORD_SS(dstreg,srcreg)      CMPORD_SS(dstreg,srcreg);

#define SSE_CMPEQ_PS(dstreg,srcreg)       CMPEQ_PS(dstreg,srcreg);
#define SSE_CMPLT_PS(dstreg,srcreg)       CMPLT_PS(dstreg,srcreg);
#define SSE_CMPLE_PS(dstreg,srcreg)       CMPLE_PS(dstreg,srcreg);
#define SSE_CMPUNORD_PS(dstreg,srcreg)    CMPUNORD_PS(dstreg,srcreg);
#define SSE_CMPNEQ_PS(dstreg,srcreg)      CMPNEQ_PS(dstreg,srcreg);
#define SSE_CMPNLT_PS(dstreg,srcreg)      CMPNLT_PS(dstreg,srcreg);
#define SSE_CMPNLE_PS(dstreg,srcreg)      CMPNLE_PS(dstreg,srcreg);
#define SSE_CMPORD_PS(dstreg,srcreg)      CMPORD_PS(dstreg,srcreg);

/* ================================================================================================ */

/* Other useful macros whose destinations are not SSE registers */

/* Movemask (for use after comparisons) */
/* Reduces a 128-bit mask to a 4-bit integer built from the most significant bit of each 32-bit part. */
#define MOVEMASK(integ,srcxmmreg)         integ = _mm_movemask_ps(srcxmmreg)
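
/* Illustrative note: after a packed compare each 32-bit lane is all-ones or
   all-zeros, so the result is one bit per lane.  Unlike the SSE_ macros,
   MOVEMASK's definition has no trailing semicolon, so its invocation takes
   one (imask here is a hypothetical int):

     MOVEMASK(imask,XMM7);
*/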

/* Double_4/Float_4 Conversions */
#define CONVERT_FLOAT4_DOUBLE4(dst,src)   { double *_tmp_double_ptr; float *_tmp_float_ptr; \
                                            _tmp_double_ptr = dst; _tmp_float_ptr = src; \
                                            _tmp_double_ptr[0]=(double)_tmp_float_ptr[0]; \
                                            _tmp_double_ptr[1]=(double)_tmp_float_ptr[1]; \
                                            _tmp_double_ptr[2]=(double)_tmp_float_ptr[2]; \
                                            _tmp_double_ptr[3]=(double)_tmp_float_ptr[3]; }

#define CONVERT_DOUBLE4_FLOAT4(dst,src)   { double *_tmp_double_ptr; float *_tmp_float_ptr; \
                                            _tmp_double_ptr = src; _tmp_float_ptr = dst; \
                                            _tmp_float_ptr[0]=(float)_tmp_double_ptr[0]; \
                                            _tmp_float_ptr[1]=(float)_tmp_double_ptr[1]; \
                                            _tmp_float_ptr[2]=(float)_tmp_double_ptr[2]; \
                                            _tmp_float_ptr[3]=(float)_tmp_double_ptr[3]; }
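
/* Example (illustrative sketch; d and f are hypothetical arrays): run a
   single-precision kernel on double-precision data by converting at the
   boundaries.

     double d[4];
     float  f[4];
     ...
     CONVERT_DOUBLE4_FLOAT4(f,d)
     ... single-precision SSE work on f ...
     CONVERT_FLOAT4_DOUBLE4(d,f)
*/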

/* Aligned Malloc */
#define SSE_MALLOC(var,sze)               { void **_tmp_void_ptr = (void **)(var); size_t _tmp_size = (sze); \
                                            *_tmp_void_ptr = _mm_malloc(_tmp_size,16); }
#define SSE_FREE(var)                     { void *_tmp_void_ptr = (void *)(var); \
                                            _mm_free(_tmp_void_ptr); }
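
/* Example (illustrative sketch; v is a hypothetical float pointer): obtain a
   16-byte-aligned buffer suitable for the aligned LOAD_PS/STORE_PS/STREAM_PS
   macros, then release it.

     float *v;
     SSE_MALLOC(&v,16*sizeof(float))
     ... aligned SSE work on v ...
     SSE_FREE(v)
*/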

/* CPUID Instruction Macros */

#define CPUID_VENDOR   0
#define CPUID_FEATURES 1
#define CPUID_CACHE    2

#define CPUID(imm,_eax,_ebx,_ecx,_edx) { int _tmp_imm; \
  unsigned long _tmp_eax, _tmp_ebx, _tmp_ecx, _tmp_edx; \
  _tmp_imm=imm; \
  __asm { \
    __asm mov eax, _tmp_imm \
    __asm cpuid \
    __asm mov _tmp_eax, eax \
    __asm mov _tmp_ebx, ebx \
    __asm mov _tmp_ecx, ecx \
    __asm mov _tmp_edx, edx \
  } \
  *_eax=_tmp_eax; *_ebx=_tmp_ebx; *_ecx=_tmp_ecx; *_edx=_tmp_edx; \
}

#define CPUID_GET_VENDOR(result) { char *_gv_vendor=result; int _gv_i; \
  unsigned long _gv_eax=0;unsigned long _gv_ebx=0;unsigned long _gv_ecx=0;unsigned long _gv_edx=0; \
  CPUID(CPUID_VENDOR,&_gv_eax,&_gv_ebx,&_gv_ecx,&_gv_edx); \
  for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+0]=*(((char *)(&_gv_ebx))+_gv_i); \
  for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+4]=*(((char *)(&_gv_edx))+_gv_i); \
  for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+8]=*(((char *)(&_gv_ecx))+_gv_i); \
}
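
/* Example (illustrative sketch): the vendor string is 12 characters and is
   not NUL-terminated by the macro, so the caller terminates it.

     char vendor[13];
     CPUID_GET_VENDOR(vendor)
     vendor[12] = '\0';
*/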

/* ================================================================================================ */

/* The Stand-Alone Versions of the SSE Macros */

/* Prefetch Macros */
#define PREFETCH_NTA(var)             _mm_prefetch((char *)(var),_MM_HINT_NTA)
#define PREFETCH_L1(var)              _mm_prefetch((char *)(var),_MM_HINT_T0)
#define PREFETCH_L2(var)              _mm_prefetch((char *)(var),_MM_HINT_T1)
#define PREFETCH_L3(var)              _mm_prefetch((char *)(var),_MM_HINT_T2)

/* Store Macros */
#define STORE_SS(var,srcreg)          _mm_store_ss(var,srcreg)
#define STOREL_PS(var,srcreg)         _mm_storel_pi((__m64 *)(var),srcreg)
#define STOREH_PS(var,srcreg)         _mm_storeh_pi((__m64 *)(var),srcreg)
#define STORE_PS(var,srcreg)          _mm_store_ps(var,srcreg)
#define STOREU_PS(var,srcreg)         _mm_storeu_ps(var,srcreg)
#define STREAM_PS(var,srcreg)         _mm_stream_ps(var,srcreg)

/* Register-Register Copy Macros */
#define COPY_SS(dstreg,srcreg)        dstreg = _mm_move_ss(dstreg,srcreg)
#define COPY_PS(dstreg,srcreg)        dstreg = srcreg

/* Load Macros */
#define LOAD_SS(var,dstreg)           dstreg = _mm_load_ss(var)
#define LOADL_PS(var,dstreg)          dstreg = _mm_loadl_pi(dstreg,(__m64 *)(var))
#define LOADH_PS(var,dstreg)          dstreg = _mm_loadh_pi(dstreg,(__m64 *)(var))
#define LOAD_PS(var,dstreg)           dstreg = _mm_load_ps(var)
#define LOADU_PS(var,dstreg)          dstreg = _mm_loadu_ps(var)

/* Shuffle */
#define SHUFFLE(dstreg,srcreg,i)      dstreg = _mm_shuffle_ps(dstreg,srcreg,i)

/* Multiply: A:=A*B */
#define MULT_SS(dstreg,srcreg)        dstreg = _mm_mul_ss(dstreg,srcreg)
#define MULT_PS(dstreg,srcreg)        dstreg = _mm_mul_ps(dstreg,srcreg)
#define MULT_SS_M(dstreg,var)         dstreg = _mm_mul_ss(dstreg,_mm_load_ss(var))
#define MULT_PS_M(dstreg,var)         dstreg = _mm_mul_ps(dstreg,_mm_load_ps(var))

/* Divide: A:=A/B */
#define DIV_SS(dstreg,srcreg)         dstreg = _mm_div_ss(dstreg,srcreg)
#define DIV_PS(dstreg,srcreg)         dstreg = _mm_div_ps(dstreg,srcreg)
#define DIV_SS_M(dstreg,var)          dstreg = _mm_div_ss(dstreg,_mm_load_ss(var))
#define DIV_PS_M(dstreg,var)          dstreg = _mm_div_ps(dstreg,_mm_load_ps(var))

/* Reciprocal: A:=1/B */
#define RECIP_SS(dstreg,srcreg)       dstreg = _mm_rcp_ss(srcreg)
#define RECIP_PS(dstreg,srcreg)       dstreg = _mm_rcp_ps(srcreg)
#define RECIP_SS_M(dstreg,var)        dstreg = _mm_rcp_ss(_mm_load_ss(var))
#define RECIP_PS_M(dstreg,var)        dstreg = _mm_rcp_ps(_mm_load_ps(var))

/* Add: A:=A+B */
#define ADD_SS(dstreg,srcreg)         dstreg = _mm_add_ss(dstreg,srcreg)
#define ADD_PS(dstreg,srcreg)         dstreg = _mm_add_ps(dstreg,srcreg)
#define ADD_SS_M(dstreg,var)          dstreg = _mm_add_ss(dstreg,_mm_load_ss(var))
#define ADD_PS_M(dstreg,var)          dstreg = _mm_add_ps(dstreg,_mm_load_ps(var))

/* Subtract: A:=A-B */
#define SUB_SS(dstreg,srcreg)         dstreg = _mm_sub_ss(dstreg,srcreg)
#define SUB_PS(dstreg,srcreg)         dstreg = _mm_sub_ps(dstreg,srcreg)
#define SUB_SS_M(dstreg,var)          dstreg = _mm_sub_ss(dstreg,_mm_load_ss(var))
#define SUB_PS_M(dstreg,var)          dstreg = _mm_sub_ps(dstreg,_mm_load_ps(var))

/* Logical: A:=A<op>B */
#define AND_SS(dstreg,srcreg)         dstreg = _mm_and_ss(dstreg,srcreg)
#define ANDNOT_SS(dstreg,srcreg)      dstreg = _mm_andnot_ss(dstreg,srcreg)
#define OR_SS(dstreg,srcreg)          dstreg = _mm_or_ss(dstreg,srcreg)
#define XOR_SS(dstreg,srcreg)         dstreg = _mm_xor_ss(dstreg,srcreg)

#define AND_PS(dstreg,srcreg)         dstreg = _mm_and_ps(dstreg,srcreg)
#define ANDNOT_PS(dstreg,srcreg)      dstreg = _mm_andnot_ps(dstreg,srcreg)
#define OR_PS(dstreg,srcreg)          dstreg = _mm_or_ps(dstreg,srcreg)
#define XOR_PS(dstreg,srcreg)         dstreg = _mm_xor_ps(dstreg,srcreg)

/* Implementing an if():
   First perform the comparison, then use MOVEMASK to obtain an integer, say i, and
   branch with if (i) ...
*/
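
/* A sketch of that pattern (inside an SSE scope), testing whether any lane
   of XMM0 is less than the corresponding lane of XMM1:

     int i;
     ...
     SSE_CMPLT_PS(XMM0,XMM1)
     MOVEMASK(i,XMM0);
     if (i) {
       ... at least one lane compared true ...
     }
*/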

/*
   Note: From the IA Software Developer's Manual:
   The greater-than relations not implemented in hardware require more than one instruction to
   emulate in software and therefore should not be implemented as pseudo-ops. (For these, the
   programmer should reverse the operands of the corresponding less than relations and use move
   instructions to ensure that the mask is moved to the correct destination register and that the
   source operand is left intact.)
*/
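
/* A sketch of the reversal described above: computing A>B with A in XMM0 and
   B in XMM1, leaving both sources intact and the mask in a scratch register:

     SSE_COPY_PS(XMM2,XMM1)         XMM2 = B
     SSE_CMPLT_PS(XMM2,XMM0)        XMM2 = (B < A), i.e. (A > B)
*/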

/* Comparisons: A:=A<compare>B */
#define CMPEQ_SS(dstreg,srcreg)       dstreg = _mm_cmpeq_ss(dstreg,srcreg)
#define CMPLT_SS(dstreg,srcreg)       dstreg = _mm_cmplt_ss(dstreg,srcreg)
#define CMPLE_SS(dstreg,srcreg)       dstreg = _mm_cmple_ss(dstreg,srcreg)
#define CMPUNORD_SS(dstreg,srcreg)    dstreg = _mm_cmpunord_ss(dstreg,srcreg)
#define CMPNEQ_SS(dstreg,srcreg)      dstreg = _mm_cmpneq_ss(dstreg,srcreg)
#define CMPNLT_SS(dstreg,srcreg)      dstreg = _mm_cmpnlt_ss(dstreg,srcreg)
#define CMPNLE_SS(dstreg,srcreg)      dstreg = _mm_cmpnle_ss(dstreg,srcreg)
#define CMPORD_SS(dstreg,srcreg)      dstreg = _mm_cmpord_ss(dstreg,srcreg)

#define CMPEQ_PS(dstreg,srcreg)       dstreg = _mm_cmpeq_ps(dstreg,srcreg)
#define CMPLT_PS(dstreg,srcreg)       dstreg = _mm_cmplt_ps(dstreg,srcreg)
#define CMPLE_PS(dstreg,srcreg)       dstreg = _mm_cmple_ps(dstreg,srcreg)
#define CMPUNORD_PS(dstreg,srcreg)    dstreg = _mm_cmpunord_ps(dstreg,srcreg)
#define CMPNEQ_PS(dstreg,srcreg)      dstreg = _mm_cmpneq_ps(dstreg,srcreg)
#define CMPNLT_PS(dstreg,srcreg)      dstreg = _mm_cmpnlt_ps(dstreg,srcreg)
#define CMPNLE_PS(dstreg,srcreg)      dstreg = _mm_cmpnle_ps(dstreg,srcreg)
#define CMPORD_PS(dstreg,srcreg)      dstreg = _mm_cmpord_ps(dstreg,srcreg)

/* ================================================================================================ */

#endif /* __ICL_SSE_H_ */