1 : #include "nscore.h"
2 : #include "nsAlgorithm.h"
3 : #include <emmintrin.h>
4 : #include <nsUTF8Utils.h>
5 :
6 : void
7 26553 : LossyConvertEncoding16to8::write_sse2(const PRUnichar* aSource,
8 : PRUint32 aSourceLength)
9 : {
10 26553 : char* dest = mDestination;
11 :
12 : // Align source to a 16-byte boundary.
13 26553 : PRUint32 i = 0;
14 : PRUint32 alignLen =
15 26553 : NS_MIN<PRUint32>(aSourceLength, PRUint32(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(PRUnichar));
16 65761 : for (; i < alignLen; i++) {
17 39208 : dest[i] = static_cast<unsigned char>(aSource[i]);
18 : }
19 :
20 : // Walk 64 bytes (four XMM registers) at a time.
21 26553 : __m128i vectmask = _mm_set1_epi16(0x00ff);
22 35443 : for (; aSourceLength - i > 31; i += 32) {
23 17780 : __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
24 17780 : source1 = _mm_and_si128(source1, vectmask);
25 :
26 17780 : __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
27 17780 : source2 = _mm_and_si128(source2, vectmask);
28 :
29 17780 : __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
30 17780 : source3 = _mm_and_si128(source3, vectmask);
31 :
32 17780 : __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
33 17780 : source4 = _mm_and_si128(source4, vectmask);
34 :
35 :
36 : // Pack the source data. SSE2 views this as a saturating uint16 to
37 : // uint8 conversion, but since we masked off the high-order byte of every
38 : // uint16, we're really just grabbing the low-order bytes of source1 and
39 : // source2.
40 17780 : __m128i packed1 = _mm_packus_epi16(source1, source2);
41 17780 : __m128i packed2 = _mm_packus_epi16(source3, source4);
42 :
43 : // This store needs to be unaligned since there's no guarantee that the
44 : // alignment we did above for the source will align the destination.
45 8890 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), packed1);
46 8890 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
47 : }
48 :
49 : // Finish up the rest.
50 250408 : for (; i < aSourceLength; i++) {
51 223855 : dest[i] = static_cast<unsigned char>(aSource[i]);
52 : }
53 :
54 26553 : mDestination += i;
55 26553 : }
56 :
57 : void
58 1352778 : LossyConvertEncoding8to16::write_sse2(const char* aSource,
59 : PRUint32 aSourceLength)
60 : {
61 1352778 : PRUnichar *dest = mDestination;
62 :
63 : // Align source to a 16-byte boundary. We choose to align source rather than
64 : // dest because we'd rather have our loads than our stores be fast. You have
65 : // to wait for a load to complete, but you can keep on moving after issuing a
66 : // store.
67 1352778 : PRUint32 i = 0;
68 1352778 : PRUint32 alignLen = NS_MIN(aSourceLength, PRUint32(-NS_PTR_TO_INT32(aSource) & 0xf));
69 6733728 : for (; i < alignLen; i++) {
70 5380950 : dest[i] = static_cast<unsigned char>(aSource[i]);
71 : }
72 :
73 : // Walk 32 bytes (two XMM registers) at a time.
74 1762235 : for (; aSourceLength - i > 31; i += 32) {
75 818914 : __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
76 818914 : __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
77 :
78 : // Interleave 0s in with the bytes of source to create lo and hi.
79 818914 : __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
80 818914 : __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
81 818914 : __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
82 818914 : __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
83 :
84 : // store lo and hi into dest.
85 409457 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), lo1);
86 409457 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8), hi1);
87 409457 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
88 409457 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
89 : }
90 :
91 : // Finish up whatever's left.
92 6996539 : for (; i < aSourceLength; i++) {
93 5643761 : dest[i] = static_cast<unsigned char>(aSource[i]);
94 : }
95 :
96 1352778 : mDestination += i;
97 1352778 : }
|