1:
2:
3: 4: 5: 6: 7: 8: 9: 10: 11: 12: 13: 14: 15: 16: 17: 18: 19: 20: 21: 22: 23: 24: 25: 26: 27:
28:
29: #include <sys/cdefs.h>
30: #include <sys/errno.h>
31: #include <sys/param.h>
32: #include <sys/types.h>
33: #include <sys/limits.h>
34:
35: #include <errno.h>
36: #include <stdio.h>
37: #include <stdlib.h>
38: #include <stddef.h>
39: #include <string.h>
40: #include <wchar.h>
41:
42: #include "citrus_ctype.h"
43: #include "citrus_utf8.h"
44:
/*
 * Instantiate the project macro _CITRUS_CTYPE_DEF_OPS for the "utf8" module.
 * The macro is not defined in this file.
 * NOTE(review): presumably it comes from citrus_ctype.h and emits the
 * declarations/ops table for the _citrus_utf8_ctype_* functions below --
 * confirm against the header.
 */
_CITRUS_CTYPE_DEF_OPS(utf8);
46:
/*
 * Conversion state carried between calls when a multibyte sequence is
 * split across input buffers.  A state with want == 0 is the initial state.
 */
struct _utf8_state {
	/* Bits of the partially decoded character accumulated so far. */
	wchar_t	ch;
	/* Number of bytes still needed to finish the character (0 = none). */
	int	want;
	/* Smallest value the finished character may have; lower is rejected. */
	wchar_t	lbound;
};
52:
53: size_t
54:
55: _citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc,
56: const char * __restrict s, size_t n,
57: void * __restrict pspriv)
58: {
59: struct _utf8_state *us;
60: int ch, i, mask, want;
61: wchar_t lbound, wch;
62:
63: us = (struct _utf8_state *)pspriv;
64:
65: if (us->want < 0 || us->want > _CITRUS_UTF8_MB_CUR_MAX) {
66: ;
67: return ((size_t)-1);
68: }
69:
70: if (s == NULL) {
71: s = "";
72: n = 1;
73: pwc = NULL;
74: }
75:
76: if (n == 0) {
77:
78: return ((size_t)-2);
79: }
80:
81: if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
82:
83: if (pwc != NULL)
84: *pwc = ch;
85: return (ch != '\0' ? 1 : 0);
86: }
87:
88: if (us->want == 0) {
89: 90: 91: 92: 93: 94: 95: 96: 97: 98: 99: 100:
101: ch = (unsigned char)*s;
102: if ((ch & 0x80) == 0) {
103: mask = 0x7f;
104: want = 1;
105: lbound = 0;
106: } else if ((ch & 0xe0) == 0xc0) {
107: mask = 0x1f;
108: want = 2;
109: lbound = 0x80;
110: } else if ((ch & 0xf0) == 0xe0) {
111: mask = 0x0f;
112: want = 3;
113: lbound = 0x800;
114: } else if ((ch & 0xf8) == 0xf0) {
115: mask = 0x07;
116: want = 4;
117: lbound = 0x10000;
118: } else {
119: 120: 121: 122:
123: ;
124: return ((size_t)-1);
125: }
126: } else {
127: want = us->want;
128: lbound = us->lbound;
129: }
130:
131: 132: 133: 134:
135: if (us->want == 0)
136: wch = (unsigned char)*s++ & mask;
137: else
138: wch = us->ch;
139: for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
140: if ((*s & 0xc0) != 0x80) {
141: 142: 143: 144:
145: ;
146: return ((size_t)-1);
147: }
148: wch <<= 6;
149: wch |= *s++ & 0x3f;
150: }
151: if (i < want) {
152:
153: us->want = want - i;
154: us->lbound = lbound;
155: us->ch = wch;
156: return ((size_t)-2);
157: }
158: if (wch < lbound) {
159: 160: 161:
162: ;
163: return ((size_t)-1);
164: }
165: if ((wch >= 0xd800 && wch <= 0xdfff) ||
166: wch == 0xfffe || wch == 0xffff) {
167: 168: 169:
170: ;
171: return ((size_t)-1);
172: }
173: if (pwc != NULL)
174: *pwc = wch;
175: us->want = 0;
176: return (wch == L'\0' ? 0 : want);
177: }
178:
179: int
180:
181: _citrus_utf8_ctype_mbsinit(const void * __restrict pspriv)
182: {
183: return (pspriv == NULL ||
184: ((const struct _utf8_state *)pspriv)->want == 0);
185: }
186:
187: size_t
188:
189: _citrus_utf8_ctype_mbsrtowcs(wchar_t * __restrict pwcs,
190: const char ** __restrict s, size_t n,
191: void * __restrict pspriv)
192: {
193: struct _utf8_state *us;
194: const char *src;
195: size_t nchr;
196: wchar_t wc;
197: size_t nb;
198:
199: us = (struct _utf8_state *)pspriv;
200: src = *s;
201: nchr = 0;
202:
203: if (pwcs == NULL) {
204: 205: 206: 207: 208:
209: if (us->want > 0 && (signed char)*src > 0) {
210: ;
211: return ((size_t)-1);
212: }
213: for (;;) {
214: if ((signed char)*src > 0) {
215: 216: 217: 218:
219: nb = 1;
220: } else {
221: nb = _citrus_utf8_ctype_mbrtowc(&wc, src,
222: _CITRUS_UTF8_MB_CUR_MAX, us);
223: if (nb == (size_t)-1) {
224:
225: return (nb);
226: }
227: if (nb == 0 || nb == (size_t)-2) {
228: return (nchr);
229: }
230: }
231:
232: src += nb;
233: nchr++;
234: }
235:
236: }
237:
238: 239: 240: 241: 242:
243: if (n > 0 && us->want > 0 && (signed char)*src > 0) {
244: ;
245: return ((size_t)-1);
246: }
247: while (n-- > 0) {
248: if ((signed char)*src > 0) {
249: 250: 251: 252:
253: *pwcs = (wchar_t)*src;
254: nb = 1;
255: } else {
256: nb = _citrus_utf8_ctype_mbrtowc(pwcs, src,
257: _CITRUS_UTF8_MB_CUR_MAX, us);
258: if (nb == (size_t)-1) {
259: *s = src;
260: return (nb);
261: }
262: if (nb == (size_t)-2) {
263: *s = src;
264: return (nchr);
265: }
266: if (nb == 0) {
267: *s = NULL;
268: return (nchr);
269: }
270: }
271: src += nb;
272: nchr++;
273: pwcs++;
274: }
275: *s = src;
276: return (nchr);
277: }
278:
279: size_t
280:
281: _citrus_utf8_ctype_wcrtomb(char * __restrict s,
282: wchar_t wc, void * __restrict pspriv)
283: {
284: struct _utf8_state *us;
285: unsigned char lead;
286: int i, len;
287:
288: us = (struct _utf8_state *)pspriv;
289:
290: if (us->want != 0) {
291: ;
292: return ((size_t)-1);
293: }
294:
295: if (s == NULL) {
296:
297: return (1);
298: }
299:
300: if ((wc & ~0x7f) == 0) {
301:
302: *s = (char)wc;
303: return (1);
304: }
305:
306: 307: 308: 309: 310: 311:
312: if ((wc & ~0x7f) == 0) {
313: lead = 0;
314: len = 1;
315: } else if ((wc & ~0x7ff) == 0) {
316: lead = 0xc0;
317: len = 2;
318: } else if ((wc & ~0xffff) == 0) {
319: lead = 0xe0;
320: len = 3;
321: } else if ((wc & ~0x1fffff) == 0) {
322: lead = 0xf0;
323: len = 4;
324: } else {
325: ;
326: return ((size_t)-1);
327: }
328:
329: 330: 331: 332: 333: 334:
335: for (i = len - 1; i > 0; i--) {
336: s[i] = (wc & 0x3f) | 0x80;
337: wc >>= 6;
338: }
339: *s = (wc & 0xff) | lead;
340:
341: return (len);
342: }
343:
344: size_t
345:
346: _citrus_utf8_ctype_wcsrtombs(char * __restrict s,
347: const wchar_t ** __restrict pwcs, size_t n,
348: void * __restrict pspriv)
349: {
350: struct _utf8_state *us;
351: char buf[_CITRUS_UTF8_MB_CUR_MAX];
352: const wchar_t *src;
353: size_t nbytes;
354: size_t nb;
355:
356: us = (struct _utf8_state *)pspriv;
357:
358: if (us->want != 0) {
359: ;
360: return ((size_t)-1);
361: }
362:
363: src = *pwcs;
364: nbytes = 0;
365:
366: if (s == NULL) {
367: for (;;) {
368: if (0 <= *src && *src < 0x80)
369:
370: nb = 1;
371: else {
372: nb = _citrus_utf8_ctype_wcrtomb(buf, *src, us);
373: if (nb == (size_t)-1) {
374:
375: return (nb);
376: }
377: }
378: if (*src == L'\0') {
379: return (nbytes + nb - 1);
380: }
381: src++;
382: nbytes += nb;
383: }
384:
385: }
386:
387: while (n > 0) {
388: if (0 <= *src && *src < 0x80) {
389:
390: nb = 1;
391: *s = *src;
392: } else if (n > (size_t)_CITRUS_UTF8_MB_CUR_MAX) {
393:
394: nb = _citrus_utf8_ctype_wcrtomb(s, *src, us);
395: if (nb == (size_t)-1) {
396: *pwcs = src;
397: return (nb);
398: }
399: } else {
400: 401: 402:
403: nb = _citrus_utf8_ctype_wcrtomb(buf, *src, us);
404: if (nb == (size_t)-1) {
405: *pwcs = src;
406: return (nb);
407: }
408: if (nb > n)
409:
410: break;
411: memcpy(s, buf, nb);
412: }
413: if (*src == L'\0') {
414: *pwcs = NULL;
415: return (nbytes + nb - 1);
416: }
417: src++;
418: s += nb;
419: n -= nb;
420: nbytes += nb;
421: }
422: *pwcs = src;
423: return (nbytes);
424: }