LCOV - code coverage report
Current view: directory - xpcom/string/src - nsUTF8UtilsSSE2.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 45 45 100.0 %
Date: 2012-06-02 Functions: 2 2 100.0 %

       1                 : #include "nscore.h"
       2                 : #include "nsAlgorithm.h"
       3                 : #include <emmintrin.h>
       4                 : #include <nsUTF8Utils.h>
       5                 : 
       6                 : void
       7           26553 : LossyConvertEncoding16to8::write_sse2(const PRUnichar* aSource,
       8                 :                                       PRUint32 aSourceLength)
       9                 : {
      10           26553 :   char* dest = mDestination;
      11                 : 
      12                 :   // Align source to a 16-byte boundary.
      13           26553 :   PRUint32 i = 0;
      14                 :   PRUint32 alignLen =
      15           26553 :     NS_MIN<PRUint32>(aSourceLength, PRUint32(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(PRUnichar));
      16           65761 :   for (; i < alignLen; i++) {
      17           39208 :     dest[i] = static_cast<unsigned char>(aSource[i]);
      18                 :   }
      19                 : 
      20                 :   // Walk 64 bytes (four XMM registers) at a time.
      21           26553 :   __m128i vectmask = _mm_set1_epi16(0x00ff);
      22           35443 :   for (; aSourceLength - i > 31; i += 32) {
      23           17780 :     __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
      24           17780 :     source1 = _mm_and_si128(source1, vectmask);
      25                 : 
      26           17780 :     __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
      27           17780 :     source2 = _mm_and_si128(source2, vectmask);
      28                 : 
      29           17780 :     __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
      30           17780 :     source3 = _mm_and_si128(source3, vectmask);
      31                 : 
      32           17780 :     __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
      33           17780 :     source4 = _mm_and_si128(source4, vectmask);
      34                 : 
      35                 : 
      36                 :     // Pack the source data.  SSE2 views this as a saturating uint16 to
      37                 :     // uint8 conversion, but since we masked off the high-order byte of every
      38                 :     // uint16, we're really just grabbing the low-order bytes of source1
      39                 :     // through source4.
      40           17780 :     __m128i packed1 = _mm_packus_epi16(source1, source2);
      41           17780 :     __m128i packed2 = _mm_packus_epi16(source3, source4);
      42                 : 
      43                 :     // This store needs to be unaligned since there's no guarantee that the
      44                 :     // alignment we did above for the source will align the destination.
      45            8890 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i),      packed1);
      46            8890 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
      47                 :   }
      48                 : 
      49                 :   // Finish up the rest.
      50          250408 :   for (; i < aSourceLength; i++) {
      51          223855 :     dest[i] = static_cast<unsigned char>(aSource[i]);
      52                 :   }
      53                 : 
      54           26553 :   mDestination += i;
      55           26553 : }
      56                 : 
      57                 : void
      58         1352778 : LossyConvertEncoding8to16::write_sse2(const char* aSource,
      59                 :                                       PRUint32 aSourceLength)
      60                 : {
      61         1352778 :   PRUnichar *dest = mDestination;
      62                 : 
      63                 :   // Align source to a 16-byte boundary.  We choose to align source rather than
      64                 :   // dest because we'd rather have our loads than our stores be fast. You have
      65                 :   // to wait for a load to complete, but you can keep on moving after issuing a
      66                 :   // store.
      67         1352778 :   PRUint32 i = 0;
      68         1352778 :   PRUint32 alignLen = NS_MIN(aSourceLength, PRUint32(-NS_PTR_TO_INT32(aSource) & 0xf));
      69         6733728 :   for (; i < alignLen; i++) {
      70         5380950 :     dest[i] = static_cast<unsigned char>(aSource[i]);
      71                 :   }
      72                 : 
      73                 :   // Walk 32 bytes (two XMM registers) at a time.
      74         1762235 :   for (; aSourceLength - i > 31; i += 32) {
      75          818914 :     __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
      76          818914 :     __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
      77                 : 
      78                 :     // Interleave 0s in with the bytes of source to create lo and hi.
      79          818914 :     __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
      80          818914 :     __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
      81          818914 :     __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
      82          818914 :     __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
      83                 : 
      84                 :     // Store the lo and hi halves of both source registers into dest.
      85          409457 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i),      lo1);
      86          409457 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8),  hi1);
      87          409457 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
      88          409457 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
      89                 :   }
      90                 : 
      91                 :   // Finish up whatever's left.
      92         6996539 :   for (; i < aSourceLength; i++) {
      93         5643761 :     dest[i] = static_cast<unsigned char>(aSource[i]);
      94                 :   }
      95                 : 
      96         1352778 :   mDestination += i;
      97         1352778 : }

Generated by: LCOV version 1.7