LCOV - code coverage report
Current view: top level - third_party/heimdal/lib/wind - utf8.c (source / functions) Hit Total Coverage
Test: coverage report for v4-17-test 1498b464 Lines: 86 202 42.6 %
Date: 2024-06-13 04:01:37 Functions: 7 11 63.6 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
       3             :  * (Royal Institute of Technology, Stockholm, Sweden).
       4             :  * All rights reserved.
       5             :  *
       6             :  * Redistribution and use in source and binary forms, with or without
       7             :  * modification, are permitted provided that the following conditions
       8             :  * are met:
       9             :  *
      10             :  * 1. Redistributions of source code must retain the above copyright
      11             :  *    notice, this list of conditions and the following disclaimer.
      12             :  *
      13             :  * 2. Redistributions in binary form must reproduce the above copyright
      14             :  *    notice, this list of conditions and the following disclaimer in the
      15             :  *    documentation and/or other materials provided with the distribution.
      16             :  *
      17             :  * 3. Neither the name of the Institute nor the names of its contributors
      18             :  *    may be used to endorse or promote products derived from this software
      19             :  *    without specific prior written permission.
      20             :  *
      21             :  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
      22             :  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
      23             :  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
      24             :  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
      25             :  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
      26             :  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
      27             :  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
      28             :  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
      29             :  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
      30             :  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
      31             :  * SUCH DAMAGE.
      32             :  */
      33             : 
      34             : #include <config.h>
      35             : #include "windlocl.h"
      36             : /* Decode one UTF-8 sequence starting at *pp into *out; returns 0, or WIND_ERR_INVALID_UTF8 with *pp left untouched. */
      37             : static int
      38     1376358 : utf8toutf32(const unsigned char **pp, uint32_t *out)
      39             : {
      40     1376358 :     const unsigned char *p = *pp;
      41     1376358 :     unsigned c = *p;
      42             :     /* NOTE: overlong forms, surrogate values and results above 0x10FFFF are not rejected here. */
      43     1376358 :     if (c & 0x80) { /* high bit set: not plain 7-bit ASCII */
      44        1920 :         if ((c & 0xE0) == 0xC0) { /* 110xxxxx: two-byte sequence */
      45          68 :             const unsigned c2 = *++p;
      46          68 :             if ((c2 & 0xC0) == 0x80) { /* every continuation byte must look like 10xxxxxx */
      47         136 :                 *out =  ((c  & 0x1F) << 6)
      48          68 :                     | (c2 & 0x3F);
      49             :             } else {
      50           0 :                 return WIND_ERR_INVALID_UTF8;
      51             :             }
      52        1852 :         } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx: three-byte sequence */
      53        1852 :             const unsigned c2 = *++p;
      54        1852 :             if ((c2 & 0xC0) == 0x80) {
      55        1852 :                 const unsigned c3 = *++p;
      56        1852 :                 if ((c3 & 0xC0) == 0x80) {
      57        3704 :                     *out =   ((c  & 0x0F) << 12)
      58        1852 :                         | ((c2 & 0x3F) << 6)
      59        1852 :                         |  (c3 & 0x3F);
      60             :                 } else {
      61           0 :                     return WIND_ERR_INVALID_UTF8;
      62             :                 }
      63             :             } else {
      64           0 :                 return WIND_ERR_INVALID_UTF8;
      65             :             }
      66           0 :         } else if ((c & 0xF8) == 0xF0) { /* 11110xxx: four-byte sequence */
      67           0 :             const unsigned c2 = *++p;
      68           0 :             if ((c2 & 0xC0) == 0x80) {
      69           0 :                 const unsigned c3 = *++p;
      70           0 :                 if ((c3 & 0xC0) == 0x80) {
      71           0 :                     const unsigned c4 = *++p;
      72           0 :                     if ((c4 & 0xC0) == 0x80) {
      73           0 :                         *out =   ((c  & 0x07) << 18)
      74           0 :                             | ((c2 & 0x3F) << 12)
      75           0 :                             | ((c3 & 0x3F) <<  6)
      76           0 :                             |  (c4 & 0x3F);
      77             :                     } else {
      78           0 :                         return WIND_ERR_INVALID_UTF8;
      79             :                     }
      80             :                 } else {
      81           0 :                     return WIND_ERR_INVALID_UTF8;
      82             :                 }
      83             :             } else {
      84           0 :                 return WIND_ERR_INVALID_UTF8;
      85             :             }
      86             :         } else {
      87           0 :             return WIND_ERR_INVALID_UTF8; /* stray continuation byte, or a lead byte in 0xF8..0xFF */
      88             :         }
      89             :     } else {
      90     1374438 :         *out = c; /* single-byte (ASCII) case, p is not moved */
      91             :     }
      92             : 
      93     1376358 :     *pp = p; /* p is the LAST byte consumed, not one past it; the callers' ++p steps over it */
      94             : 
      95     1376358 :     return 0;
      96             : }
      97             : 
      98             : /**
      99             :  * Convert a NUL-terminated UTF-8 string to a UCS4 string.
     100             :  *
     101             :  * @param in the NUL-terminated UTF-8 string to convert.
     102             :  * @param out the resulting UCS4 string, must be at least
     103             :  * wind_utf8ucs4_length() elements long.  If out is NULL, the function
     104             :  * only calculates the needed space for the out variable (just like
     105             :  * wind_utf8ucs4_length()).  No terminator is appended to out.
     106             :  * @param out_len before processing out_len should be the length of
     107             :  * the out variable (not read when out is NULL), after successful
     108             :  * processing it will be the length of the out string.
     109             :  *
     110             :  * @return returns 0 on success, a wind error code otherwise
     111             :  * @ingroup wind
     112             :  */
     113             : 
     114             : int
     115           0 : wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
     116             : {
     117             :     const unsigned char *p;
     118           0 :     size_t o = 0;
     119             :     int ret;
     120             : 
     121           0 :     for (p = (const unsigned char *)in; *p != '\0'; ++p) { /* ++p steps past the last byte of the decoded sequence */
     122             :         uint32_t u;
     123             : 
     124           0 :         ret = utf8toutf32(&p, &u);
     125           0 :         if (ret)
     126           0 :             return ret; /* *out_len is left unchanged on error */
     127             : 
     128           0 :         if (out) {
     129           0 :             if (o >= *out_len)
     130           0 :                 return WIND_ERR_OVERRUN;
     131           0 :             out[o] = u;
     132             :         }
     133           0 :         o++; /* counted even when out is NULL so that length-only calls work */
     134             :     }
     135           0 :     *out_len = o;
     136           0 :     return 0;
     137             : }
     138             : 
     139             : /**
     140             :  * Calculate the length that results from converting a UTF-8 string
     141             :  * to a UCS4 string.
     142             :  *
     143             :  * @param in the NUL-terminated UTF-8 string to measure.
     144             :  * @param out_len the length of the resulting UCS4 string.
     145             :  *
     146             :  * @return returns 0 on success, a wind error code otherwise
     147             :  * @ingroup wind
     148             :  */
     149             : 
     150             : int
     151           0 : wind_utf8ucs4_length(const char *in, size_t *out_len)
     152             : {
     153           0 :     return wind_utf8ucs4(in, NULL, out_len); /* a NULL out selects length-only mode */
     154             : }
     155             : /* UTF-8 lead-byte marker bits, indexed by (sequence length - 1) and OR'ed into the first output byte. */
     156             : static const char first_char[4] =
     157             :     { 0x00, 0xC0, 0xE0, 0xF0 };
     158             : 
     159             : /**
     160             :  * Convert a UCS4 string to a NUL-terminated UTF-8 string.
     161             :  *
     162             :  * @param in the UCS4 string to convert.
     163             :  * @param in_len the length of the input array.
     164             :  *
     165             :  * @param out the resulting UTF-8 string, must be at least
     166             :  * wind_ucs4utf8_length() + 1 long (the extra char for the NUL).  If
     167             :  * out is NULL, the function will calculate the needed space for the
     168             :  * out variable (just like wind_ucs4utf8_length()) and write nothing.
     169             :  *
     170             :  * @param out_len before processing out_len should be the length of
     171             :  * the out variable (not read when out is NULL), after processing it
     172             :  * will be the length of the out string, not counting the NUL.
     173             :  *
     174             :  * @return returns 0 on success, a wind error code otherwise
     175             :  * @ingroup wind
     176             :  */
     177             : 
     178             : int
     179           0 : wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
     180             : {
     181             :     uint32_t ch;
     182             :     size_t i, len, o;
     183             : 
     184           0 :     for (o = 0, i = 0; i < in_len; i++) {
     185           0 :         ch = in[i];
     186             : 
     187           0 :         if (ch < 0x80) {
     188           0 :             len = 1;
     189           0 :         } else if (ch < 0x800) {
     190           0 :             len = 2;
     191           0 :         } else if (ch < 0x10000) {
     192           0 :             len = 3;
     193           0 :         } else if (ch <= 0x10FFFF) {
     194           0 :             len = 4;
     195             :         } else
     196           0 :             return WIND_ERR_INVALID_UTF32;
     197             : 
     198           0 :         o += len;
     199             :         /* out is NULL in length-only mode: it must be neither indexed nor advanced then */
     200           0 :         if (out) {
     201           0 :             if (o >= *out_len) /* ">=" keeps one byte free for the final NUL */
     202           0 :                 return WIND_ERR_OVERRUN;
     203             :             /* fill the sequence back to front, six payload bits per continuation byte */
     204           0 :             switch(len) {
     205           0 :             case 4:
     206           0 :                 out[3] = (ch | 0x80) & 0xbf;
     207           0 :                 ch = ch >> 6;
     208             :                 fallthrough;
     209           0 :             case 3:
     210           0 :                 out[2] = (ch | 0x80) & 0xbf;
     211           0 :                 ch = ch >> 6;
     212             :                 fallthrough;
     213           0 :             case 2:
     214           0 :                 out[1] = (ch | 0x80) & 0xbf;
     215           0 :                 ch = ch >> 6;
     216             :                 fallthrough;
     217           0 :             case 1:
     218           0 :                 out[0] = ch | first_char[len - 1];
     219             :                 fallthrough;
     220           0 :             default:
     221           0 :                 break;
     222             :             }
     223           0 :             out += len; /* advance only a real buffer; a NULL out must stay NULL */
     224           0 :         }
     225             :     }
     226           0 :     if (out) {
     227           0 :         if (o >= *out_len) /* the loop already guarantees o < *out_len unless in_len == 0 */
     228           0 :             return WIND_ERR_OVERRUN;
     229           0 :         *out = '\0';
     230             :     }
     231           0 :     *out_len = o;
     232           0 :     return 0;
     233             : }
     234             : 
     235             : /**
     236             :  * Calculate the length that results from converting a UCS4 string to a UTF-8 string.
     237             :  *
     238             :  * @param in the UCS4 string to measure.
     239             :  * @param in_len the length of the UCS4 string to measure.
     240             :  * @param out_len the length of the resulting UTF-8 string.
     241             :  *
     242             :  * @return returns 0 on success, a wind error code otherwise
     243             :  * @ingroup wind
     244             :  */
     245             : 
     246             : int
     247           0 : wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
     248             : {
     249           0 :     return wind_ucs4utf8(in, in_len, NULL, out_len); /* passes a NULL out buffer */
     250             : }
     251             : 
     252             : /**
     253             :  * Read a UCS2 string from a byte buffer.
     254             :  *
     255             :  * @param ptr The input buffer to read from.
     256             :  * @param len the length of the input buffer in bytes.
     257             :  * @param flags Flags to control the behavior of the function; updated when WIND_RW_BOM is set.
     258             :  * @param out the output UCS2, the array must be at least len/2 long.
     259             :  * @param out_len on entry the capacity of out, on success the number of elements written.
     260             :  *
     261             :  * @return returns 0 on success, a wind error code otherwise.
     262             :  * @ingroup wind
     263             :  */
     264             : 
     265             : int
     266      326630 : wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
     267             :               uint16_t *out, size_t *out_len)
     268             : {
     269      326630 :     const unsigned char *p = ptr;
     270      326630 :     int little = ((*flags) & WIND_RW_LE); /* non-zero selects little-endian; otherwise big-endian is used */
     271      326630 :     size_t olen = *out_len; /* remaining free slots in out */
     272             : 
     273             :     /** if len is zero, flags are unchanged */
     274      326630 :     if (len == 0) {
     275           0 :         *out_len = 0;
     276           0 :         return 0;
     277             :     }
     278             : 
     279             :     /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
     280      326630 :     if (len & 1)
     281           0 :         return WIND_ERR_LENGTH_NOT_MOD2;
     282             : 
     283             :     /**
     284             :      * If the flag WIND_RW_BOM is set, check for a BOM. If no BOM is
     285             :      * found, check whether an LE/BE flag is already set and use that,
     286             :      * otherwise fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM
     287             :      * and the LE/BE flags and set the resulting LE/BE flag.
     288             :      */
     289      326630 :     if ((*flags) & WIND_RW_BOM) {
     290           0 :         uint16_t bom = (p[0] << 8) + p[1]; /* first two bytes, combined as big-endian */
     291           0 :         if (bom == 0xfffe || bom == 0xfeff) {
     292           0 :             little = (bom == 0xfffe); /* bytes FF FE on the wire mean little-endian */
     293           0 :             p += 2; /* the BOM itself is not copied to out */
     294           0 :             len -= 2;
     295           0 :         } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) {
     296             :             /* little already set */
     297             :         } else
     298           0 :             return WIND_ERR_NO_BOM;
     299           0 :         *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE));
     300           0 :         *flags |= little ? WIND_RW_LE : WIND_RW_BE;
     301             :     }
     302             : 
     303     7384176 :     while (len) {
     304     6730916 :         if (olen < 1)
     305           0 :             return WIND_ERR_OVERRUN; /* *out_len is left unchanged on this path */
     306     6730916 :         if (little)
     307     6730916 :             *out = (p[1] << 8) + p[0];
     308             :         else
     309           0 :             *out = (p[0] << 8) + p[1];
     310     6730916 :         out++; p += 2; len -= 2; olen--;
     311             :     }
     312      326630 :     *out_len -= olen; /* capacity minus unused slots = elements written */
     313      326630 :     return 0;
     314             : }
     315             : 
     316             : /**
     317             :  * Write a UCS2 string to a byte buffer.
     318             :  *
     319             :  * @param in The input UCS2 string.
     320             :  * @param in_len the length of the input string.
     321             :  * @param flags Flags to control the behavior of the function (WIND_RW_BOM, WIND_RW_LE are examined).
     322             :  * @param ptr The output buffer to write to, the array must be at least
     323             :  * in_len * 2 bytes long, plus 2 bytes when WIND_RW_BOM is set.
     324             :  * @param out_len on entry the size of ptr in bytes, on success the number of bytes written.
     325             :  *
     326             :  * @return returns 0 on success, a wind error code otherwise.
     327             :  * @ingroup wind
     328             :  */
     329             : 
     330             : int
     331       54055 : wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
     332             :                void *ptr, size_t *out_len)
     333             : {
     334       54055 :     unsigned char *p = ptr;
     335       54055 :     size_t len = *out_len; /* bytes still free in the output buffer */
     336             : 
     337             :     /** If the output buffer size is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
     338       54055 :     if (len & 1)
     339           0 :         return WIND_ERR_LENGTH_NOT_MOD2;
     340             : 
     341             :     /** On zero input length, flags are preserved */
     342       54055 :     if (in_len == 0) {
     343           0 :         *out_len = 0;
     344           0 :         return 0;
     345             :     }
     346             :     /** If flags have WIND_RW_BOM set, the byte order mark is written
     347             :      * first to the output data */
     348       54055 :     if ((*flags) & WIND_RW_BOM) {
     349           0 :         uint16_t bom = 0xfeff; /* U+FEFF, serialised like any other unit: FF FE for LE, FE FF for BE */
     350             : 
     351           0 :         if (len < 2)
     352           0 :             return WIND_ERR_OVERRUN;
     353             : 
     354           0 :         if ((*flags) & WIND_RW_LE) {
     355           0 :             p[0] = (bom     ) & 0xff;
     356           0 :             p[1] = (bom >> 8) & 0xff;
     357             :         } else {
     358           0 :             p[1] = (bom     ) & 0xff;
     359           0 :             p[0] = (bom >> 8) & 0xff;
     360             :         }
     361           0 :         len -= 2; p += 2; /* step past the BOM so the first unit does not overwrite it */
     362             :     }
     363             : 
     364      769399 :     while (in_len) {
     365             :         /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
     366      661289 :         if (len < 2)
     367           0 :             return WIND_ERR_OVERRUN;
     368      661289 :         if ((*flags) & WIND_RW_LE) {
     369      661289 :             p[0] = (in[0]     ) & 0xff;
     370      661289 :             p[1] = (in[0] >> 8) & 0xff;
     371             :         } else { /* big-endian whenever WIND_RW_LE is not set */
     372           0 :             p[1] = (in[0]     ) & 0xff;
     373           0 :             p[0] = (in[0] >> 8) & 0xff;
     374             :         }
     375      661289 :         len -= 2;
     376      661289 :         in_len--;
     377      661289 :         p += 2;
     378      661289 :         in++;
     379             :     }
     380       54055 :     *out_len -= len; /* buffer size minus unused bytes = bytes written, BOM included */
     381       54055 :     return 0;
     382             : }
     383             : 
     384             : 
     385             : /**
     386             :  * Convert a NUL-terminated UTF-8 string to a UCS2 string.
     387             :  *
     388             :  * @param in the NUL-terminated UTF-8 string to convert.
     389             :  * @param out the resulting UCS2 string, must be at least
     390             :  * wind_utf8ucs2_length() elements long.  If out is NULL, the function
     391             :  * only calculates the needed space for the out variable (just like
     392             :  * wind_utf8ucs2_length()).  No terminator is appended to out.
     393             :  * @param out_len before processing out_len should be the length of
     394             :  * the out variable (not read when out is NULL), after successful
     395             :  * processing it will be the length of the out string.
     396             :  *
     397             :  * @return returns 0 on success, a wind error code otherwise
     398             :  * @ingroup wind
     399             :  */
     400             : 
     401             : int
     402      110472 : wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
     403             : {
     404             :     const unsigned char *p;
     405      110472 :     size_t o = 0;
     406             :     int ret;
     407             : 
     408     2973660 :     for (p = (const unsigned char *)in; *p != '\0'; ++p) { /* ++p steps past the last byte of the decoded sequence */
     409             :         uint32_t u;
     410             : 
     411     1376358 :         ret = utf8toutf32(&p, &u);
     412     1376358 :         if (ret)
     413           0 :             return ret;
     414             : 
     415     1376358 :         if (u & 0xffff0000) /* does not fit in 16 bits; no surrogate pair is generated */
     416           0 :             return WIND_ERR_NOT_UTF16;
     417             : 
     418     1376358 :         if (out) {
     419      688179 :             if (o >= *out_len)
     420           0 :                 return WIND_ERR_OVERRUN;
     421      688179 :             out[o] = u;
     422             :         }
     423     1376358 :         o++; /* counted even when out is NULL so that length-only calls work */
     424             :     }
     425      110472 :     *out_len = o;
     426      110472 :     return 0;
     427             : }
     428             : 
     429             : /**
     430             :  * Calculate the length that results from converting a UTF-8 string
     431             :  * to a UCS2 string.
     432             :  *
     433             :  * @param in the NUL-terminated UTF-8 string to measure.
     434             :  * @param out_len the length of the resulting UCS2 string.
     435             :  *
     436             :  * @return returns 0 on success, a wind error code otherwise
     437             :  * @ingroup wind
     438             :  */
     439             : 
     440             : int
     441       55236 : wind_utf8ucs2_length(const char *in, size_t *out_len)
     442             : {
     443       55236 :     return wind_utf8ucs2(in, NULL, out_len); /* a NULL out selects length-only mode */
     444             : }
     445             : 
     446             : /**
     447             :  * Convert a UCS2 string to a NUL-terminated UTF-8 string.
     448             :  *
     449             :  * @param in the UCS2 string to convert.
     450             :  * @param in_len the length of the in UCS2 string.
     451             :  * @param out the resulting UTF-8 string, must be at least
     452             :  * wind_ucs2utf8_length() + 1 long (the extra char for the NUL).  If
     453             :  * out is NULL, the function will calculate the needed space for the
     454             :  * out variable (just like wind_ucs2utf8_length()) and write nothing.
     455             :  * @param out_len before processing out_len should be the length of
     456             :  * the out variable (not read when out is NULL), after processing it
     457             :  * will be the length of the out string, not counting the NUL.
     458             :  *
     459             :  * @return returns 0 on success, a wind error code otherwise
     460             :  * @ingroup wind
     461             :  */
     462             : 
     463             : int
     464      653260 : wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
     465             : {
     466             :     uint16_t ch;
     467             :     size_t i, len, o;
     468             : 
     469    14115092 :     for (o = 0, i = 0; i < in_len; i++) {
     470    13461832 :         ch = in[i];
     471             : 
     472    13461832 :         if (ch < 0x80) {
     473    13461832 :             len = 1;
     474           0 :         } else if (ch < 0x800) {
     475           0 :             len = 2;
     476             :         } else
     477           0 :             len = 3; /* every other 16-bit value, surrogate halves included, becomes three bytes */
     478             : 
     479    13461832 :         o += len;
     480             : 
     481    13461832 :         if (out) {
     482     6730916 :             if (o >= *out_len) /* ">=" keeps one byte free for the final NUL */
     483           0 :                 return WIND_ERR_OVERRUN;
     484             :             /* fill the sequence back to front, six payload bits per continuation byte */
     485     6730916 :             switch(len) {
     486           0 :             case 3:
     487           0 :                 out[2] = (ch | 0x80) & 0xbf;
     488           0 :                 ch = ch >> 6;
     489             :                 fallthrough;
     490           0 :             case 2:
     491           0 :                 out[1] = (ch | 0x80) & 0xbf;
     492           0 :                 ch = ch >> 6;
     493             :                 fallthrough;
     494     6730916 :             case 1:
     495     6730916 :                 out[0] = ch | first_char[len - 1];
     496             :                 fallthrough;
     497     6730916 :             default:
     498     6730916 :                 break;
     499             :             }
     500     6730916 :             out += len; /* advanced only when a real buffer was supplied */
     501             :         }
     502             :     }
     503      653260 :     if (out) {
     504      326630 :         if (o >= *out_len)
     505           0 :             return WIND_ERR_OVERRUN;
     506      326630 :         *out = '\0';
     507             :     }
     508      653260 :     *out_len = o;
     509      653260 :     return 0;
     510             : }
     511             : 
     512             : /**
     513             :  * Calculate the length that results from converting a UCS2 string to a UTF-8 string.
     514             :  *
     515             :  * @param in the UCS2 string to measure.
     516             :  * @param in_len the length of the UCS2 string to measure.
     517             :  * @param out_len the length of the resulting UTF-8 string, not counting the NUL.
     518             :  *
     519             :  * @return returns 0 on success, a wind error code otherwise
     520             :  * @ingroup wind
     521             :  */
     522             : 
     523             : int
     524      326630 : wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
     525             : {
     526      326630 :     return wind_ucs2utf8(in, in_len, NULL, out_len); /* a NULL out selects length-only mode */
     527             : }

Generated by: LCOV version 1.13