/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
   Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#ifndef __MMX__
# error "MMX instruction set not enabled"
#else
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));
typedef short __v4hi __attribute__ ((__vector_size__ (8)));
typedef char __v8qi __attribute__ ((__vector_size__ (8)));
typedef long long __v1di __attribute__ ((__vector_size__ (8)));
typedef float __v2sf __attribute__ ((__vector_size__ (8)));

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  __builtin_ia32_emms ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  _mm_empty ();
}
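
/* A minimal usage sketch (the function and variable names are only
   illustrative): MMX and the x87 floating-point unit share register state,
   so _mm_empty () is normally called after a block of MMX work and before
   any x87 floating-point code.

     void mmx_then_fp (void)
     {
       __m64 a = _mm_set1_pi16 (2);
       __m64 b = _mm_add_pi16 (a, a);   // MMX work
       (void) b;
       _mm_empty ();                    // clear MMX state before using x87
     }
*/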

/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

#ifdef __x86_64__
/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
#endif

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
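
/* Usage sketch (illustrative; the variable names are arbitrary): the 32-bit
   conversions only touch the low half of the __m64 object.

     __m64 v  = _mm_cvtsi32_si64 (0x12345678);   // high 32 bits are zero
     int   lo = _mm_cvtsi64_si32 (v);            // lo == 0x12345678
*/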

#ifdef __x86_64__
/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long)__i;
}
#endif

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
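
/* Usage sketch (illustrative; names are arbitrary): each 16-bit lane is
   clamped to the 8-bit range before being packed, e.g. 300 -> 127 and
   -200 -> -128 for the signed form.

     __m64 a = _mm_set_pi16 (300, -200, 5, -5);
     __m64 r = _mm_packs_pi16 (a, _mm_setzero_si64 ());
     // low bytes of r, from least significant: -5, 5, -128, 127;
     // the high four bytes come from the zero operand
*/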

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
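
/* Usage sketch (illustrative; names are arbitrary): unpacking interleaves
   lanes from the two operands, which is a common way to widen 8-bit data
   by pairing it with zeros.

     __m64 bytes = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
     __m64 lo16  = _mm_unpacklo_pi8 (bytes, _mm_setzero_si64 ());
     // lo16 holds the 16-bit lanes 1, 2, 3, 4 (zero-extended low bytes)
*/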

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Add the 64-bit value in M1 to the 64-bit value in M2.  */
#ifdef __SSE2__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
}
#endif

/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}
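
/* Usage sketch (illustrative; names are arbitrary): the plain adds wrap
   around modulo the lane width, while the saturating forms clamp instead.

     __m64 a = _mm_set1_pi8 (100);
     __m64 w = _mm_add_pi8 (a, a);    // each signed byte wraps: 200 -> -56
     __m64 u = _mm_adds_pu8 (a, a);   // unsigned saturation: each byte is 200
     __m64 s = _mm_adds_pi8 (a, a);   // signed saturation: each byte is 127
*/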

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

/* Subtract the 64-bit value in M2 from the 64-bit value in M1.  */
#ifdef __SSE2__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
}
#endif

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
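
/* Usage sketch (illustrative; names are arbitrary): _mm_madd_pi16 computes
   a pair of dot products, e.g. with lanes (1, 2, 3, 4) and (5, 6, 7, 8)
   the two 32-bit results are 1*5 + 2*6 = 17 and 3*7 + 4*8 = 53.

     __m64 a = _mm_setr_pi16 (1, 2, 3, 4);
     __m64 b = _mm_setr_pi16 (5, 6, 7, 8);
     __m64 r = _mm_madd_pi16 (a, b);    // 32-bit lanes: 17, 53
*/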

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
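
/* Usage sketch (illustrative; names are arbitrary): the full 32-bit product
   of two signed 16-bit lanes is split across the two intrinsics, e.g.
   1000 * 1000 = 1000000 = 0x000F4240, so mulhi yields 0x000F and mullo
   yields 0x4240 in the corresponding lane.

     __m64 a  = _mm_set1_pi16 (1000);
     __m64 hi = _mm_mulhi_pi16 (a, a);   // each lane: 0x000F
     __m64 lo = _mm_mullo_pi16 (a, a);   // each lane: 0x4240
*/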

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, int __count)
{
  return _mm_slli_si64 (__m, __count);
}
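
/* Usage sketch (illustrative; names are arbitrary): the _mm_slli_* forms
   take an immediate count, while the _mm_sll_* forms read the count from
   the low 64 bits of a __m64 operand; both shift every lane by the same
   amount.

     __m64 v  = _mm_set1_pi16 (3);
     __m64 r1 = _mm_slli_pi16 (v, 2);                     // each lane: 12
     __m64 r2 = _mm_sll_pi16 (v, _mm_cvtsi32_si64 (2));   // same result
*/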

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pand (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pandn (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_por (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pxor (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}
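
/* Usage sketch (illustrative; names are arbitrary): because _mm_andnot_si64
   complements its first operand, a bitwise select that takes B where the
   mask bits are set and A elsewhere can be written as:

     __m64 blend (__m64 mask, __m64 a, __m64 b)
     {
       return _mm_or_si64 (_mm_and_si64 (mask, b),
                           _mm_andnot_si64 (mask, a));
     }
*/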

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
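
/* Usage sketch (illustrative; names are arbitrary): the all-ones/all-zeros
   lanes produced by the comparisons are typically used as masks, e.g. a
   per-lane signed maximum of 16-bit values:

     __m64 max_pi16 (__m64 a, __m64 b)
     {
       __m64 gt = _mm_cmpgt_pi16 (a, b);               // 0xFFFF where a > b
       return _mm_or_si64 (_mm_and_si64 (gt, a),
                           _mm_andnot_si64 (gt, b));   // a where a > b, else b
     }
*/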

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64)0LL;
}

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
             char __b3, char __b2, char __b1, char __b0)
{
  return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
                                               __b4, __b5, __b6, __b7);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  return _mm_set_pi32 (__i1, __i0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
              char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  return _mm_set_pi32 (__i, __i);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
  return _mm_set_pi16 (__w, __w, __w, __w);
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (char __b)
{
  return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
}
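
/* Usage sketch (illustrative; names are arbitrary): _mm_set_* take their
   arguments from the most significant element down, while _mm_setr_* take
   them in memory order (least significant first), so the two calls below
   build the same vector:

     __m64 a = _mm_set_pi16 (4, 3, 2, 1);
     __m64 b = _mm_setr_pi16 (1, 2, 3, 4);
     // a and b both hold the 16-bit lanes 1, 2, 3, 4 (low to high)
*/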

#endif /* __MMX__ */
#endif /* _MMINTRIN_H_INCLUDED */