LCOV - code coverage report
Current view: top level - lib/util/charset - util_str.c (source / functions) Hit Total Coverage
Test: coverage report for v4-17-test 1498b464 Lines: 151 231 65.4 %
Date: 2024-06-13 04:01:37 Functions: 16 20 80.0 %

          Line data    Source code
       1             : /*
       2             :    Unix SMB/CIFS implementation.
       3             :    Samba utility functions
       4             :    Copyright (C) Andrew Tridgell 1992-2001
       5             :    Copyright (C) Simo Sorce 2001
       6             :    Copyright (C) Andrew Bartlett 2011
       7             :    Copyright (C) Jeremy Allison  1992-2007
       8             :    Copyright (C) Martin Pool     2003
       9             :    Copyright (C) James Peach     2006
      10             : 
      11             :    This program is free software; you can redistribute it and/or modify
      12             :    it under the terms of the GNU General Public License as published by
      13             :    the Free Software Foundation; either version 3 of the License, or
      14             :    (at your option) any later version.
      15             : 
      16             :    This program is distributed in the hope that it will be useful,
      17             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      18             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      19             :    GNU General Public License for more details.
      20             : 
      21             :    You should have received a copy of the GNU General Public License
      22             :    along with this program.  If not, see <http://www.gnu.org/licenses/>.
      23             : */
      24             : 
      25             : #include "replace.h"
      26             : #include "system/locale.h"
      27             : #include "charset.h"
      28             : #include "lib/util/fault.h"
      29             : 
      30             : #ifdef strcasecmp
      31             : #undef strcasecmp
      32             : #endif
      33             : #ifdef strncasecmp
      34             : #undef strncasecmp
      35             : #endif
      36             : 
      37             : 
      38             : /**
      39             :  Case insensitive string comparison, handle specified for testing
      40             : **/
      41    68520579 : _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
      42             :                                  const char *s1, const char *s2)
      43             : {
      44    68520579 :         codepoint_t c1=0, c2=0;
      45    68520579 :         codepoint_t u1=0, u2=0;
      46    68520579 :         codepoint_t l1=0, l2=0;
      47             :         size_t size1, size2;
      48             : 
      49             :         /* handle null ptr comparisons to simplify the use in qsort */
      50    68520579 :         if (s1 == s2) return 0;
      51    68520123 :         if (s1 == NULL) return -1;
      52    68520123 :         if (s2 == NULL) return 1;
      53             : 
      54   220193404 :         while (*s1 && *s2) {
      55   158637234 :                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
      56   158637234 :                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
      57             : 
      58   158637234 :                 if (c1 == INVALID_CODEPOINT ||
      59             :                     c2 == INVALID_CODEPOINT) {
      60           8 :                         return strcasecmp(s1, s2);
      61             :                 }
      62             : 
      63   158637226 :                 s1 += size1;
      64   158637226 :                 s2 += size2;
      65             : 
      66   158637226 :                 if (c1 == c2) {
      67    91325288 :                         continue;
      68             :                 }
      69             : 
      70    67311938 :                 u1 = toupper_m(c1);
      71    67311938 :                 u2 = toupper_m(c2);
      72    67311938 :                 if (u1 == u2) {
      73      373986 :                         continue;
      74             :                 }
      75             : 
      76    66937952 :                 l1 = tolower_m(c1);
      77    66937952 :                 l2 = tolower_m(c2);
      78    66937952 :                 if (l1 == l2) {
      79           0 :                         continue;
      80             :                 }
      81             : 
      82    66937952 :                 return l1 - l2;
      83             :         }
      84             : 
      85     1582163 :         return *s1 - *s2;
      86             : }
      87             : 
      88             : /**
      89             :  Case insensitive string comparison
      90             : **/
      91    68520579 : _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
      92             : {
      93    68520579 :         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
      94    68520579 :         return strcasecmp_m_handle(iconv_handle, s1, s2);
      95             : }
      96             : 
      97             : /**
      98             :  Case insensitive string comparison, length limited, handle specified for testing
      99             : **/
     100      356497 : _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
     101             :                                   const char *s1, const char *s2, size_t n)
     102             : {
     103      356497 :         codepoint_t c1=0, c2=0;
     104      356497 :         codepoint_t u1=0, u2=0;
     105      356497 :         codepoint_t l1=0, l2=0;
     106             :         size_t size1, size2;
     107             : 
     108             :         /* handle null ptr comparisons to simplify the use in qsort */
     109      356497 :         if (s1 == s2) return 0;
     110      356309 :         if (s1 == NULL) return -1;
     111      356309 :         if (s2 == NULL) return 1;
     112             : 
     113     1338644 :         while (*s1 && *s2 && n) {
     114     1080653 :                 n--;
     115             : 
     116     1080653 :                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
     117     1080653 :                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
     118             : 
     119     1080653 :                 if (c1 == INVALID_CODEPOINT ||
     120             :                     c2 == INVALID_CODEPOINT) {
     121             :                         /*
     122             :                          * n was specified in characters,
     123             :                          * now we must convert it to bytes.
     124             :                          * As bytes are the smallest
     125             :                          * character unit, the following
     126             :                          * increment and strncasecmp is always
     127             :                          * safe.
     128             :                          *
     129             :                          * The source string was already known
     130             :                          * to be n characters long, so we are
     131             :                          * guaranteed to be able to look at the
     132             :                          * (n remaining + size1) bytes from the
     133             :                          * s1 position.
     134             :                          */
     135           0 :                         n += size1;
     136           0 :                         return strncasecmp(s1, s2, n);
     137             :                 }
     138             : 
     139     1080653 :                 s1 += size1;
     140     1080653 :                 s2 += size2;
     141             : 
     142     1080653 :                 if (c1 == c2) {
     143      770937 :                         continue;
     144             :                 }
     145             : 
     146      309716 :                 u1 = toupper_m(c1);
     147      309716 :                 u2 = toupper_m(c2);
     148      309716 :                 if (u1 == u2) {
     149         383 :                         continue;
     150             :                 }
     151             : 
     152      309333 :                 l1 = tolower_m(c1);
     153      309333 :                 l2 = tolower_m(c2);
     154      309333 :                 if (l1 == l2) {
     155           0 :                         continue;
     156             :                 }
     157             : 
     158      309333 :                 return l1 - l2;
     159             :         }
     160             : 
     161       46976 :         if (n == 0) {
     162       45752 :                 return 0;
     163             :         }
     164             : 
     165        1224 :         return *s1 - *s2;
     166             : }
     167             : 
     168             : /**
     169             :  Case insensitive string comparison, length limited
     170             : **/
     171      356497 : _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
     172             : {
     173      356497 :         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
     174      356497 :         return strncasecmp_m_handle(iconv_handle, s1, s2, n);
     175             : }
     176             : 
     177             : /**
     178             :  * Compare 2 strings.
     179             :  *
     180             :  * @note The comparison is case-insensitive.
     181             :  **/
     182        5687 : _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
     183             : {
     184        5687 :         return strcasecmp_m(s1,s2) == 0;
     185             : }
     186             : 
     187             : /**
     188             :  Compare 2 strings (case sensitive).
     189             : **/
     190      176870 : _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
     191             : {
     192      176870 :         if (s1 == s2)
     193           0 :                 return true;
     194      176870 :         if (!s1 || !s2)
     195           0 :                 return false;
     196             : 
     197      176870 :         return strcmp(s1,s2) == 0;
     198             : }
     199             : 
     200             : /**
     201             :  * Calculate the number of units (8 or 16-bit, depending on the
     202             :  * destination charset), that would be needed to convert the input
     203             :  * string which is expected to be in src_charset encoding to the
     204             :  * destination charset (which should be a unicode charset).
     205             :  */
     206    14049892 : _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
     207             :                                     const char *s, charset_t src_charset, charset_t dst_charset)
     208             : {
     209    14049892 :         size_t count = 0;
     210             : 
     211             : #ifdef DEVELOPER
     212    14049892 :         switch (dst_charset) {
     213           0 :         case CH_DOS:
     214             :         case CH_UNIX:
     215           0 :                 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
     216    14049892 :         default:
     217    14049892 :                 break;
     218             :         }
     219             : 
     220    14049892 :         switch (src_charset) {
     221           0 :         case CH_UTF16LE:
     222             :         case CH_UTF16BE:
     223           0 :                 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
     224    14049892 :         default:
     225    14049892 :                 break;
     226             :         }
     227             : #endif
     228    14049892 :         if (!s) {
     229       48968 :                 return 0;
     230             :         }
     231             : 
     232   699797894 :         while (*s && !(((uint8_t)*s) & 0x80)) {
     233   672884785 :                 s++;
     234   672884785 :                 count++;
     235             :         }
     236             : 
     237    14000924 :         if (!*s) {
     238    14000529 :                 return count;
     239             :         }
     240             : 
     241       39586 :         while (*s) {
     242             :                 size_t c_size;
     243       38796 :                 codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
     244             :                                                           src_charset, &c_size);
     245       38796 :                 s += c_size;
     246             : 
     247       38796 :                 switch (dst_charset) {
     248       38796 :                 case CH_UTF16LE:
     249             :                 case CH_UTF16BE:
     250             :                 case CH_UTF16MUNGED:
     251       38796 :                         if (c < 0x10000) {
     252             :                                 /* Unicode char fits into 16 bits. */
     253       38796 :                                 count += 1;
     254             :                         } else {
     255             :                                 /* Double-width unicode char - 32 bits. */
     256           0 :                                 count += 2;
     257             :                         }
     258       38796 :                         break;
     259           0 :                 case CH_UTF8:
     260             :                         /*
     261             :                          * this only checks ranges, and does not
     262             :                          * check for invalid codepoints
     263             :                          */
     264           0 :                         if (c < 0x80) {
     265           0 :                                 count += 1;
     266           0 :                         } else if (c < 0x800) {
     267           0 :                                 count += 2;
     268           0 :                         } else if (c < 0x10000) {
     269           0 :                                 count += 3;
     270             :                         } else {
     271           0 :                                 count += 4;
     272             :                         }
     273           0 :                         break;
     274           0 :                 default:
     275             :                         /*
     276             :                          * non-unicode encoding:
     277             :                          * assume that each codepoint fits into
     278             :                          * one unit in the destination encoding.
     279             :                          */
     280           0 :                         count += 1;
     281             :                 }
     282             :         }
     283             : 
     284         395 :         return count;
     285             : }
     286             : 
     287             : /**
     288             :  * Calculate the number of units (8 or 16-bit, depending on the
     289             :  * destination charset), that would be needed to convert the input
     290             :  * string which is expected to be in src_charset encoding to the
     291             :  * destination charset (which should be a unicode charset).
     292             :  */
     293    14049892 : _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
     294             : {
     295    14049892 :         struct smb_iconv_handle *ic = get_iconv_handle();
     296    14049892 :         return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
     297             : }
     298             : 
     299     3227254 : _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
     300             :                                   const charset_t dst_charset)
     301             : {
     302     3227254 :         if (!s) {
     303       12034 :                 return 0;
     304             :         }
     305     3215220 :         return strlen_m_ext(s, src_charset, dst_charset) + 1;
     306             : }
     307             : 
     308       48960 : _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
     309             :                                        const charset_t src_charset,
     310             :                                        const charset_t dst_charset)
     311             : {
     312             :         size_t len;
     313       48960 :         if (!s) {
     314         528 :                 return 0;
     315             :         }
     316       48432 :         len = strlen_m_ext(s, src_charset, dst_charset);
     317       48432 :         if (len == 0) {
     318       14281 :                 return 0;
     319             :         }
     320             : 
     321       34151 :         return len+1;
     322             : }
     323             : 
     324             : /**
     325             :  * Calculate the number of 16-bit units that would be needed to convert
     326             :  * the input string which is expected to be in CH_UNIX encoding to UTF16.
     327             :  *
     328             :  * This will be the same as the number of bytes in a string for single
     329             :  * byte strings, but will be different for multibyte.
     330             :  */
     331    10786240 : _PUBLIC_ size_t strlen_m(const char *s)
     332             : {
     333    10786240 :         return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
     334             : }
     335             : 
     336             : /**
     337             :    Work out the number of multibyte chars in a string, including the NUL
     338             :    terminator.
     339             : **/
     340      361120 : _PUBLIC_ size_t strlen_m_term(const char *s)
     341             : {
     342      361120 :         return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
     343             : }
     344             : 
     345             : /*
     346             :  * Weird helper routine for the winreg pipe: If nothing is around, return 0,
     347             :  * if a string is there, include the terminator.
     348             :  */
     349             : 
     350       48960 : _PUBLIC_ size_t strlen_m_term_null(const char *s)
     351             : {
     352       48960 :         return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
     353             : }
     354             : 
     355             : /**
     356             :  strchr_m and strrchr_m are a bit complex on general multi-byte strings.
     357             : **/
     358    57021937 : _PUBLIC_ char *strchr_m(const char *src, char c)
     359             : {
     360             :         const char *s;
     361    57021937 :         struct smb_iconv_handle *ic = get_iconv_handle();
     362    57021937 :         if (src == NULL) {
     363           0 :                 return NULL;
     364             :         }
     365             :         /* characters up to 0x3F are guaranteed to not appear in
     366             :            non-initial position in multi-byte charsets */
     367    57021937 :         if ((c & 0xC0) == 0) {
     368    18002515 :                 return strchr(src, c);
     369             :         }
     370             : 
     371             :         /* this is quite a common operation, so we want it to be
     372             :            fast. We optimise for the ascii case, knowing that all our
     373             :            supported multi-byte character sets are ascii-compatible
     374             :            (ie. they match for the first 128 chars) */
     375             : 
     376   271155218 :         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
     377   232152914 :                 if (*s == c)
     378       17118 :                         return discard_const_p(char, s);
     379             :         }
     380             : 
     381    39002304 :         if (!*s)
     382    39002304 :                 return NULL;
     383             : 
     384             : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
     385             :         /* With compose characters we must restart from the beginning. JRA. */
     386             :         s = src;
     387             : #endif
     388             : 
     389           0 :         while (*s) {
     390             :                 size_t size;
     391           0 :                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
     392           0 :                 if (c2 == c) {
     393           0 :                         return discard_const_p(char, s);
     394             :                 }
     395           0 :                 s += size;
     396             :         }
     397             : 
     398           0 :         return NULL;
     399             : }
     400             : 
     401             : /**
     402             :  * Multibyte-character version of strrchr
     403             :  */
     404      662680 : _PUBLIC_ char *strrchr_m(const char *s, char c)
     405             : {
     406             :         struct smb_iconv_handle *ic;
     407      662680 :         char *ret = NULL;
     408             : 
     409      662680 :         if (s == NULL) {
     410           0 :                 return NULL;
     411             :         }
     412             : 
     413             :         /* characters up to 0x3F are guaranteed to not appear in
     414             :            non-initial position in multi-byte charsets */
     415      662680 :         if ((c & 0xC0) == 0) {
     416      649311 :                 return strrchr(s, c);
     417             :         }
     418             : 
     419             :         /* this is quite a common operation, so we want it to be
     420             :            fast. We optimise for the ascii case, knowing that all our
     421             :            supported multi-byte character sets are ascii-compatible
     422             :            (ie. they match for the first 128 chars). Also, in Samba
     423             :            we only search for ascii characters in 'c' and that
     424             :            in all mb character sets with a compound character
     425             :            containing c, if 'c' is not a match at position
     426             :            p, then p[-1] > 0x7f. JRA. */
     427             : 
     428             :         {
     429       13369 :                 size_t len = strlen(s);
     430       13369 :                 const char *cp = s;
     431       13369 :                 bool got_mb = false;
     432             : 
     433       13369 :                 if (len == 0)
     434          20 :                         return NULL;
     435       13349 :                 cp += (len - 1);
     436             :                 do {
     437       74489 :                         if (c == *cp) {
     438             :                                 /* Could be a match. Part of a multibyte ? */
     439       22471 :                                 if ((cp > s) &&
     440       11252 :                                         (((unsigned char)cp[-1]) & 0x80)) {
     441             :                                         /* Yep - go slow :-( */
     442           0 :                                         got_mb = true;
     443           0 :                                         break;
     444             :                                 }
     445             :                                 /* No - we have a match ! */
     446       11806 :                                 return discard_const_p(char , cp);
     447             :                         }
     448       62683 :                 } while (cp-- != s);
     449        1543 :                 if (!got_mb)
     450        1543 :                         return NULL;
     451             :         }
     452             : 
     453           0 :         ic = get_iconv_handle();
     454             : 
     455           0 :         while (*s) {
     456             :                 size_t size;
     457           0 :                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
     458           0 :                 if (c2 == c) {
     459           0 :                         ret = discard_const_p(char, s);
     460             :                 }
     461           0 :                 s += size;
     462             :         }
     463             : 
     464           0 :         return ret;
     465             : }
     466             : 
     467             : /**
     468             :   return True if any (multi-byte) character is lower case
     469             : */
     470           0 : _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
     471             :                                  const char *string)
     472             : {
     473           0 :         while (*string) {
     474             :                 size_t c_size;
     475             :                 codepoint_t s;
     476             :                 codepoint_t t;
     477             : 
     478           0 :                 s = next_codepoint_handle(ic, string, &c_size);
     479           0 :                 string += c_size;
     480             : 
     481           0 :                 t = toupper_m(s);
     482             : 
     483           0 :                 if (s != t) {
     484           0 :                         return true; /* that means it has lower case chars */
     485             :                 }
     486             :         }
     487             : 
     488           0 :         return false;
     489             : }
     490             : 
     491           0 : _PUBLIC_ bool strhaslower(const char *string)
     492             : {
     493           0 :         struct smb_iconv_handle *ic = get_iconv_handle();
     494           0 :         return strhaslower_handle(ic, string);
     495             : }
     496             : 
     497             : /**
     498             :   return True if any (multi-byte) character is upper case
     499             : */
     500           0 : _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
     501             :                                  const char *string)
     502             : {
     503           0 :         while (*string) {
     504             :                 size_t c_size;
     505             :                 codepoint_t s;
     506             :                 codepoint_t t;
     507             : 
     508           0 :                 s = next_codepoint_handle(ic, string, &c_size);
     509           0 :                 string += c_size;
     510             : 
     511           0 :                 t = tolower_m(s);
     512             : 
     513           0 :                 if (s != t) {
     514           0 :                         return true; /* that means it has upper case chars */
     515             :                 }
     516             :         }
     517             : 
     518           0 :         return false;
     519             : }
     520             : 
     521           0 : _PUBLIC_ bool strhasupper(const char *string)
     522             : {
     523           0 :         struct smb_iconv_handle *ic = get_iconv_handle();
     524           0 :         return strhasupper_handle(ic, string);
     525             : }
     526             : 
     527             : /***********************************************************************
     528             :  strstr_m - We convert via ucs2 for now.
     529             : ***********************************************************************/
     530             : 
     531      172494 : char *strstr_m(const char *src, const char *findstr)
     532             : {
     533      172494 :         TALLOC_CTX *mem_ctx = NULL;
     534             :         smb_ucs2_t *p;
     535             :         smb_ucs2_t *src_w, *find_w;
     536             :         const char *s;
     537             :         char *s2;
     538      172494 :         char *retp = NULL;
     539      172494 :         size_t converted_size, findstr_len = 0;
     540             : 
     541             :         /* for correctness */
     542      172494 :         if (!findstr[0]) {
     543           0 :                 return discard_const_p(char, src);
     544             :         }
     545             : 
     546             :         /* Samba does single character findstr calls a *lot*. */
     547      172494 :         if (findstr[1] == '\0')
     548       15834 :                 return strchr_m(src, *findstr);
     549             : 
     550             :         /* We optimise for the ascii case, knowing that all our
     551             :            supported multi-byte character sets are ascii-compatible
     552             :            (ie. they match for the first 128 chars) */
     553             : 
     554     3433838 :         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
     555     3337061 :                 if (*s == *findstr) {
     556      101243 :                         if (!findstr_len)
     557       85957 :                                 findstr_len = strlen(findstr);
     558             : 
     559      101243 :                         if (strncmp(s, findstr, findstr_len) == 0) {
     560       59883 :                                 return discard_const_p(char, s);
     561             :                         }
     562             :                 }
     563             :         }
     564             : 
     565       96777 :         if (!*s)
     566       96777 :                 return NULL;
     567             : 
     568             : #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
     569             :         /* 'make check' fails unless we do this */
     570             : 
     571             :         /* With compose characters we must restart from the beginning. JRA. */
     572           0 :         s = src;
     573             : #endif
     574             : 
     575             :         /*
     576             :          * Use get_iconv_handle() just as a non-NULL talloc ctx. In
     577             :          * case we leak memory, this should then be more obvious in
     578             :          * the talloc report.
     579             :          */
     580           0 :         mem_ctx = talloc_new(get_iconv_handle());
     581           0 :         if (mem_ctx == NULL) {
     582           0 :                 return NULL;
     583             :         }
     584             : 
     585           0 :         if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) {
     586           0 :                 goto done;
     587             :         }
     588             : 
     589           0 :         if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) {
     590           0 :                 goto done;
     591             :         }
     592             : 
     593           0 :         p = strstr_w(src_w, find_w);
     594             : 
     595           0 :         if (!p) {
     596           0 :                 goto done;
     597             :         }
     598             : 
     599           0 :         *p = 0;
     600           0 :         if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) {
     601           0 :                 goto done;
     602             :         }
     603           0 :         retp = discard_const_p(char, (s+strlen(s2)));
     604           0 : done:
     605           0 :         TALLOC_FREE(mem_ctx);
     606           0 :         return retp;
     607             : }

Generated by: LCOV version 1.13