1 : /* GRAPHITE2 LICENSING
2 :
3 : Copyright 2010, SIL International
4 : All rights reserved.
5 :
6 : This library is free software; you can redistribute it and/or modify
7 : it under the terms of the GNU Lesser General Public License as published
8 : by the Free Software Foundation; either version 2.1 of License, or
9 : (at your option) any later version.
10 :
11 : This program is distributed in the hope that it will be useful,
12 : but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 : Lesser General Public License for more details.
15 :
16 : You should also have received a copy of the GNU Lesser General Public
17 : License along with this library in the file named "LICENSE".
18 : If not, write to the Free Software Foundation, 51 Franklin Street,
19 : Suite 500, Boston, MA 02110-1335, USA or visit their web page on the
20 : internet at http://www.fsf.org/licenses/lgpl.html.
21 :
22 : Alternatively, the contents of this file may be used under the terms of the
23 : Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public
24 : License, as published by the Free Software Foundation, either version 2
25 : of the License or (at your option) any later version.
26 : */
27 : #pragma once
28 :
29 : #include <cstdlib>
30 : #include "inc/Main.h"
31 :
32 : namespace graphite2 {
33 :
34 : typedef uint32 uchar_t;
35 :
36 : template <int N>
37 : struct _utf_codec
38 : {
39 : typedef uchar_t codeunit_t;
40 :
41 : static void put(codeunit_t * cp, const uchar_t , int8 & len) throw();
42 : static uchar_t get(const codeunit_t * cp, int8 & len) throw();
43 : };
44 :
45 :
46 : template <>
47 : struct _utf_codec<32>
48 : {
49 : private:
50 : static const uchar_t limit = 0x110000;
51 : public:
52 : typedef uint32 codeunit_t;
53 :
54 : inline
55 0 : static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
56 : {
57 0 : *cp = usv; l = 1;
58 0 : }
59 :
60 : inline
61 0 : static uchar_t get(const codeunit_t * cp, int8 & l) throw()
62 : {
63 0 : if (cp[0] < limit) { l = 1; return cp[0]; }
64 0 : else { l = -1; return 0xFFFD; }
65 : }
66 : };
67 :
68 :
69 : template <>
70 : struct _utf_codec<16>
71 : {
72 : private:
73 : static const int32 lead_offset = 0xD800 - (0x10000 >> 10);
74 : static const int32 surrogate_offset = 0x10000 - (0xD800 << 10) - 0xDC00;
75 : public:
76 : typedef uint16 codeunit_t;
77 :
78 : inline
79 : static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
80 : {
81 : if (usv < 0x10000) { l = 1; cp[0] = codeunit_t(usv); }
82 : else
83 : {
84 : cp[0] = codeunit_t(lead_offset + (usv >> 10));
85 : cp[1] = codeunit_t(0xDC00 + (usv & 0x3FF));
86 : l = 2;
87 : }
88 : }
89 :
90 : inline
91 0 : static uchar_t get(const codeunit_t * cp, int8 & l) throw()
92 : {
93 0 : const uint32 uh = cp[0];
94 0 : l = 1;
95 :
96 0 : if (0xD800 > uh || uh > 0xDFFF) { return uh; }
97 0 : const uint32 ul = cp[1];
98 0 : if (uh > 0xDBFF || 0xDC00 > ul || ul > 0xDFFF) { l = -1; return 0xFFFD; }
99 0 : ++l;
100 0 : return (uh<<10) + ul + surrogate_offset;
101 : }
102 : };
103 :
104 :
105 : template <>
106 : struct _utf_codec<8>
107 : {
108 : private:
109 : static const int8 sz_lut[16];
110 : static const byte mask_lut[5];
111 :
112 :
113 : public:
114 : typedef uint8 codeunit_t;
115 :
116 : inline
117 0 : static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
118 : {
119 0 : if (usv < 0x80) {l = 1; cp[0] = usv; return; }
120 0 : if (usv < 0x0800) {l = 2; cp[0] = 0xC0 + (usv >> 6); cp[1] = 0x80 + (usv & 0x3F); return; }
121 0 : if (usv < 0x10000) {l = 3; cp[0] = 0xE0 + (usv >> 12); cp[1] = 0x80 + ((usv >> 6) & 0x3F); cp[2] = 0x80 + (usv & 0x3F); return; }
122 0 : else {l = 4; cp[0] = 0xF0 + (usv >> 18); cp[1] = 0x80 + ((usv >> 12) & 0x3F); cp[2] = 0x80 + ((usv >> 6) & 0x3F); cp[3] = 0x80 + (usv & 0x3F); return; }
123 : }
124 :
125 : inline
126 0 : static uchar_t get(const codeunit_t * cp, int8 & l) throw()
127 : {
128 0 : const int8 seq_sz = sz_lut[*cp >> 4];
129 0 : uchar_t u = *cp & mask_lut[seq_sz];
130 0 : l = 1;
131 0 : bool toolong = false;
132 :
133 0 : switch(seq_sz) {
134 0 : case 4: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong = (u < 0x10);
135 0 : case 3: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x20);
136 0 : case 2: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x80);
137 0 : case 1: break;
138 0 : case 0: l = -1; return 0xFFFD;
139 : }
140 :
141 0 : if (l != seq_sz || toolong)
142 : {
143 0 : l = -l;
144 0 : return 0xFFFD;
145 : }
146 0 : return u;
147 : }
148 : };
149 :
150 :
151 : template <typename C>
152 : class _utf_iterator
153 : {
154 : typedef _utf_codec<sizeof(C)*8> codec;
155 :
156 : C * cp;
157 : mutable int8 sl;
158 :
159 : public:
160 : typedef C codeunit_type;
161 : typedef uchar_t value_type;
162 : typedef uchar_t * pointer;
163 :
164 : class reference
165 : {
166 : const _utf_iterator & _i;
167 :
168 0 : reference(const _utf_iterator & i): _i(i) {}
169 : public:
170 0 : operator value_type () const throw () { return codec::get(_i.cp, _i.sl); }
171 0 : reference & operator = (const value_type usv) throw() { codec::put(_i.cp, usv, _i.sl); return *this; }
172 :
173 : friend class _utf_iterator;
174 : };
175 :
176 :
177 0 : _utf_iterator(const void * us=0) : cp(reinterpret_cast<C *>(const_cast<void *>(us))), sl(1) { }
178 :
179 0 : _utf_iterator & operator ++ () { cp += abs(sl); return *this; }
180 : _utf_iterator operator ++ (int) { _utf_iterator tmp(*this); operator++(); return tmp; }
181 :
182 0 : bool operator == (const _utf_iterator & rhs) const throw() { return cp >= rhs.cp; }
183 0 : bool operator != (const _utf_iterator & rhs) const throw() { return !operator==(rhs); }
184 :
185 0 : reference operator * () const throw() { return *this; }
186 : pointer operator ->() const throw() { return &operator *(); }
187 :
188 0 : operator codeunit_type * () const throw() { return cp; }
189 :
190 0 : bool error() const throw() { return sl < 1; }
191 : };
192 :
193 : template <typename C>
194 : struct utf
195 : {
196 : typedef typename _utf_codec<sizeof(C)*8>::codeunit_t codeunit_t;
197 :
198 : typedef _utf_iterator<C> iterator;
199 : typedef _utf_iterator<const C> const_iterator;
200 : };
201 :
202 :
203 : typedef utf<uint32> utf32;
204 : typedef utf<uint16> utf16;
205 : typedef utf<uint8> utf8;
206 :
207 : } // namespace graphite2
|