gonzui


Format: Advanced Search

t2ex/bsd_source/lib/libc/src_bsd/stdio/citrus_utf8.c — bare source — permlink (0.05 seconds)

Search this content:

    1: /*      $OpenBSD: citrus_utf8.c,v 1.4 2011/04/21 00:16:06 yasuoka Exp $ */
    2: 
    3: /*-
    4:  * Copyright (c) 2002-2004 Tim J. Robbins
    5:  * All rights reserved.
    6:  *
    7:  * Redistribution and use in source and binary forms, with or without
    8:  * modification, are permitted provided that the following conditions
    9:  * are met:
   10:  * 1. Redistributions of source code must retain the above copyright
   11:  *    notice, this list of conditions and the following disclaimer.
   12:  * 2. Redistributions in binary form must reproduce the above copyright
   13:  *    notice, this list of conditions and the following disclaimer in the
   14:  *    documentation and/or other materials provided with the distribution.
   15:  *
   16:  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   20:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26:  * SUCH DAMAGE.
   27:  */
   28: 
   29: #include <sys/cdefs.h>
   30: #include <sys/errno.h>
   31: #include <sys/param.h>
   32: #include <sys/types.h>
   33: #include <sys/limits.h>
   34: 
   35: #include <errno.h>
   36: #include <stdio.h>
   37: #include <stdlib.h>
   38: #include <stddef.h>
   39: #include <string.h>
   40: #include <wchar.h>
   41: 
   42: #include "citrus_ctype.h"
   43: #include "citrus_utf8.h"
   44: 
   45: _CITRUS_CTYPE_DEF_OPS(utf8);
   46: 
   /*
    * Conversion state kept between calls so that a multibyte character
    * split across several input buffers can be resumed.
    */
   struct _utf8_state {
           /* Bits of the character decoded so far. */
           wchar_t ch;
           /* Octets still needed to finish the character; 0 when idle. */
           int     want;
           /* Smallest value allowed for the sequence length in progress. */
           wchar_t lbound;
   };
   52: 
   53: size_t
   54: /*ARGSUSED*/
   55: _citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc,
   56:                            const char * __restrict s, size_t n,
   57:                            void * __restrict pspriv)
   58: {
   59:         struct _utf8_state *us;
   60:         int ch, i, mask, want;
   61:         wchar_t lbound, wch;
   62: 
   63:         us = (struct _utf8_state *)pspriv;
   64: 
   65:         if (us->want < 0 || us->want > _CITRUS_UTF8_MB_CUR_MAX) {
   66:                 /*errno = EINVAL*/;
   67:                 return ((size_t)-1);
   68:         }
   69: 
   70:         if (s == NULL) {
   71:                 s = "";
   72:                 n = 1;
   73:                 pwc = NULL;
   74:         }
   75: 
   76:         if (n == 0) {
   77:                 /* Incomplete multibyte sequence */
   78:                 return ((size_t)-2);
   79:         }
   80: 
   81:         if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
   82:                 /* Fast path for plain ASCII characters. */
   83:                 if (pwc != NULL)
   84:                         *pwc = ch;
   85:                 return (ch != '\0' ? 1 : 0);
   86:         }
   87: 
   88:         if (us->want == 0) {
   89:                 /*
   90:                  * Determine the number of octets that make up this character
   91:                  * from the first octet, and a mask that extracts the
   92:                  * interesting bits of the first octet. We already know
   93:                  * the character is at least two bytes long.
   94:                  *
   95:                  * We also specify a lower bound for the character code to
   96:                  * detect redundant, non-"shortest form" encodings. For
   97:                  * example, the sequence C0 80 is _not_ a legal representation
   98:                  * of the null character. This enforces a 1-to-1 mapping
   99:                  * between character codes and their multibyte representations.
  100:                  */
  101:                 ch = (unsigned char)*s;
  102:                 if ((ch & 0x80) == 0) {
  103:                         mask = 0x7f;
  104:                         want = 1;
  105:                         lbound = 0;
  106:                 } else if ((ch & 0xe0) == 0xc0) {
  107:                         mask = 0x1f;
  108:                         want = 2;
  109:                         lbound = 0x80;
  110:                 } else if ((ch & 0xf0) == 0xe0) {
  111:                         mask = 0x0f;
  112:                         want = 3;
  113:                         lbound = 0x800;
  114:                 } else if ((ch & 0xf8) == 0xf0) {
  115:                         mask = 0x07;
  116:                         want = 4;
  117:                         lbound = 0x10000;
  118:                 } else {
  119:                         /*
  120:                          * Malformed input; input is not UTF-8.
  121:                          * See RFC 3629.
  122:                          */
  123:                         /*errno = EILSEQ*/;
  124:                         return ((size_t)-1);
  125:                 }
  126:         } else {
  127:                 want = us->want;
  128:                 lbound = us->lbound;
  129:         }
  130: 
  131:         /*
  132:          * Decode the octet sequence representing the character in chunks
  133:          * of 6 bits, most significant first.
  134:          */
  135:         if (us->want == 0)
  136:                 wch = (unsigned char)*s++ & mask;
  137:         else
  138:                 wch = us->ch;
  139:         for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
  140:                 if ((*s & 0xc0) != 0x80) {
  141:                         /*
  142:                          * Malformed input; bad characters in the middle
  143:                          * of a character.
  144:                          */
  145:                         /*errno = EILSEQ*/;
  146:                         return ((size_t)-1);
  147:                 }
  148:                 wch <<= 6;
  149:                 wch |= *s++ & 0x3f;
  150:         }
  151:         if (i < want) {
  152:                 /* Incomplete multibyte sequence. */
  153:                 us->want = want - i;
  154:                 us->lbound = lbound;
  155:                 us->ch = wch;
  156:                 return ((size_t)-2);
  157:         }
  158:         if (wch < lbound) {
  159:                 /*
  160:                  * Malformed input; redundant encoding.
  161:                  */
  162:                 /*errno = EILSEQ*/;
  163:                 return ((size_t)-1);
  164:         }
  165:         if ((wch >= 0xd800 && wch <= 0xdfff) ||
  166:             wch == 0xfffe || wch == 0xffff) {
  167:                 /*
  168:                  * Malformed input; invalid code points.
  169:                  */
  170:                 /*errno = EILSEQ*/;
  171:                 return ((size_t)-1);
  172:         }
  173:         if (pwc != NULL)
  174:                 *pwc = wch;
  175:         us->want = 0;
  176:         return (wch == L'\0' ? 0 : want);
  177: }
  178: 
  179: int
  180: /*ARGSUSED*/
  181: _citrus_utf8_ctype_mbsinit(const void * __restrict pspriv)
  182: {
  183:         return (pspriv == NULL ||
  184:             ((const struct _utf8_state *)pspriv)->want == 0);
  185: }
  186: 
  187: size_t
  188: /*ARGSUSED*/
  189: _citrus_utf8_ctype_mbsrtowcs(wchar_t * __restrict pwcs,
  190:                              const char ** __restrict s, size_t n,
  191:                              void * __restrict pspriv)
  192: {
  193:         struct _utf8_state *us;
  194:         const char *src;
  195:         size_t nchr;
  196:         wchar_t wc;
  197:         size_t nb;
  198: 
  199:         us = (struct _utf8_state *)pspriv;
  200:         src = *s;
  201:         nchr = 0;
  202: 
  203:         if (pwcs == NULL) {
  204:                 /*
  205:                  * The fast path in the loop below is not safe if an ASCII
  206:                  * character appears as anything but the first byte of a
  207:                  * multibyte sequence. Check now to avoid doing it in the loop.
  208:                  */
  209:                 if (us->want > 0 && (signed char)*src > 0) {
  210:                         /*errno = EILSEQ*/;
  211:                         return ((size_t)-1);
  212:                 }
  213:                 for (;;) {
  214:                         if ((signed char)*src > 0) {
  215:                                 /*
  216:                                  * Fast path for plain ASCII characters
  217:                                  * excluding NUL.
  218:                                  */
  219:                                 nb = 1;
  220:                         } else {
  221:                                 nb = _citrus_utf8_ctype_mbrtowc(&wc, src,
  222:                                     _CITRUS_UTF8_MB_CUR_MAX, us);
  223:                                 if (nb == (size_t)-1) {
  224:                                         /* Invalid sequence. */
  225:                                         return (nb);
  226:                                 }
  227:                                 if (nb == 0 || nb == (size_t)-2) {
  228:                                         return (nchr);
  229:                                 }
  230:                         }
  231: 
  232:                         src += nb;
  233:                         nchr++;
  234:                 }
  235:                 /*NOTREACHED*/
  236:         }
  237: 
  238:         /*
  239:          * The fast path in the loop below is not safe if an ASCII
  240:          * character appears as anything but the first byte of a
  241:          * multibyte sequence. Check now to avoid doing it in the loop.
  242:          */
  243:         if (n > 0 && us->want > 0 && (signed char)*src > 0) {
  244:                 /*errno = EILSEQ*/;
  245:                 return ((size_t)-1);
  246:         }
  247:         while (n-- > 0) {
  248:                 if ((signed char)*src > 0) {
  249:                         /*
  250:                          * Fast path for plain ASCII characters
  251:                          * excluding NUL.
  252:                          */
  253:                         *pwcs = (wchar_t)*src;
  254:                         nb = 1;
  255:                 } else {
  256:                         nb = _citrus_utf8_ctype_mbrtowc(pwcs, src,
  257:                             _CITRUS_UTF8_MB_CUR_MAX, us);
  258:                         if (nb == (size_t)-1) {
  259:                                 *s = src;
  260:                                 return (nb);
  261:                         }
  262:                         if (nb == (size_t)-2) {
  263:                                 *s = src;
  264:                                 return (nchr);
  265:                         }
  266:                         if (nb == 0) {
  267:                                 *s = NULL;
  268:                                 return (nchr);
  269:                         }
  270:                 }
  271:                 src += nb;
  272:                 nchr++;
  273:                 pwcs++;
  274:         }
  275:         *s = src;
  276:         return (nchr);
  277: }
  278: 
  279: size_t
  280: /*ARGSUSED*/
  281: _citrus_utf8_ctype_wcrtomb(char * __restrict s,
  282:                            wchar_t wc, void * __restrict pspriv)
  283: {
  284:         struct _utf8_state *us;
  285:         unsigned char lead;
  286:         int i, len;
  287: 
  288:         us = (struct _utf8_state *)pspriv;
  289: 
  290:         if (us->want != 0) {
  291:                 /*errno = EINVAL*/;
  292:                 return ((size_t)-1);
  293:         }
  294: 
  295:         if (s == NULL) {
  296:                 /* Reset to initial shift state (no-op) */
  297:                 return (1);
  298:         }
  299: 
  300:         if ((wc & ~0x7f) == 0) {
  301:                 /* Fast path for plain ASCII characters. */
  302:                 *s = (char)wc;
  303:                 return (1);
  304:         }
  305: 
  306:         /*
  307:          * Determine the number of octets needed to represent this character.
  308:          * We always output the shortest sequence possible. Also specify the
  309:          * first few bits of the first octet, which contains the information
  310:          * about the sequence length.
  311:          */
  312:         if ((wc & ~0x7f) == 0) {
  313:                 lead = 0;
  314:                 len = 1;
  315:         } else if ((wc & ~0x7ff) == 0) {
  316:                 lead = 0xc0;
  317:                 len = 2;
  318:         } else if ((wc & ~0xffff) == 0) {
  319:                 lead = 0xe0;
  320:                 len = 3;
  321:         } else if ((wc & ~0x1fffff) == 0) {
  322:                 lead = 0xf0;
  323:                 len = 4;
  324:         } else {
  325:                 /*errno = EILSEQ*/;
  326:                 return ((size_t)-1);
  327:         }
  328: 
  329:         /*
  330:          * Output the octets representing the character in chunks
  331:          * of 6 bits, least significant last. The first octet is
  332:          * a special case because it contains the sequence length
  333:          * information.
  334:          */
  335:         for (i = len - 1; i > 0; i--) {
  336:                 s[i] = (wc & 0x3f) | 0x80;
  337:                 wc >>= 6;
  338:         }
  339:         *s = (wc & 0xff) | lead;
  340: 
  341:         return (len);
  342: }
  343: 
  344: size_t
  345: /*ARGSUSED*/
  346: _citrus_utf8_ctype_wcsrtombs(char * __restrict s,
  347:                              const wchar_t ** __restrict pwcs, size_t n,
  348:                              void * __restrict pspriv)
  349: {
  350:         struct _utf8_state *us;
  351:         char buf[_CITRUS_UTF8_MB_CUR_MAX];
  352:         const wchar_t *src;
  353:         size_t nbytes;
  354:         size_t nb;
  355: 
  356:         us = (struct _utf8_state *)pspriv;
  357: 
  358:         if (us->want != 0) {
  359:                 /*errno = EINVAL*/;
  360:                 return ((size_t)-1);
  361:         }
  362: 
  363:         src = *pwcs;
  364:         nbytes = 0;
  365: 
  366:         if (s == NULL) {
  367:                 for (;;) {
  368:                         if (0 <= *src && *src < 0x80)
  369:                                 /* Fast path for plain ASCII characters. */
  370:                                 nb = 1;
  371:                         else {
  372:                                 nb = _citrus_utf8_ctype_wcrtomb(buf, *src, us);
  373:                                 if (nb == (size_t)-1) {
  374:                                         /* Invalid character */
  375:                                         return (nb);
  376:                                 }
  377:                         }
  378:                         if (*src == L'\0') {
  379:                                 return (nbytes + nb - 1);
  380:                         }
  381:                         src++;
  382:                         nbytes += nb;
  383:                 }
  384:                 /*NOTREACHED*/
  385:         }
  386: 
  387:         while (n > 0) {
  388:                 if (0 <= *src && *src < 0x80) {
  389:                         /* Fast path for plain ASCII characters. */
  390:                         nb = 1;
  391:                         *s = *src;
  392:                 } else if (n > (size_t)_CITRUS_UTF8_MB_CUR_MAX) {
  393:                         /* Enough space to translate in-place. */
  394:                         nb = _citrus_utf8_ctype_wcrtomb(s, *src, us);
  395:                         if (nb == (size_t)-1) {
  396:                                 *pwcs = src;
  397:                                 return (nb);
  398:                         }
  399:                 } else {
  400:                         /*
  401:                          * May not be enough space; use temp. buffer.
  402:                          */
  403:                         nb = _citrus_utf8_ctype_wcrtomb(buf, *src, us);
  404:                         if (nb == (size_t)-1) {
  405:                                 *pwcs = src;
  406:                                 return (nb);
  407:                         }
  408:                         if (nb > n)
  409:                                 /* MB sequence for character won't fit. */
  410:                                 break;
  411:                         memcpy(s, buf, nb);
  412:                 }
  413:                 if (*src == L'\0') {
  414:                         *pwcs = NULL;
  415:                         return (nbytes + nb - 1);
  416:                 }
  417:                 src++;
  418:                 s += nb;
  419:                 n -= nb;
  420:                 nbytes += nb;
  421:         }
  422:         *pwcs = src;
  423:         return (nbytes);
  424: }