LCOV - code coverage report
Current view: top level - lib/util/charset - util_str.c (source / functions) Hit Total Coverage
Test: coverage report for recycleplus df22b230 Lines: 151 231 65.4 %
Date: 2024-02-14 10:14:15 Functions: 16 20 80.0 %

          Line data    Source code
       1             : /*
       2             :    Unix SMB/CIFS implementation.
       3             :    Samba utility functions
       4             :    Copyright (C) Andrew Tridgell 1992-2001
       5             :    Copyright (C) Simo Sorce 2001
       6             :    Copyright (C) Andrew Bartlett 2011
       7             :    Copyright (C) Jeremy Allison  1992-2007
       8             :    Copyright (C) Martin Pool     2003
       9             :    Copyright (C) James Peach     2006
      10             : 
      11             :    This program is free software; you can redistribute it and/or modify
      12             :    it under the terms of the GNU General Public License as published by
      13             :    the Free Software Foundation; either version 3 of the License, or
      14             :    (at your option) any later version.
      15             : 
      16             :    This program is distributed in the hope that it will be useful,
      17             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      18             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      19             :    GNU General Public License for more details.
      20             : 
      21             :    You should have received a copy of the GNU General Public License
      22             :    along with this program.  If not, see <http://www.gnu.org/licenses/>.
      23             : */
      24             : 
      25             : #include "replace.h"
      26             : #include "system/locale.h"
      27             : #include "charset.h"
      28             : #include "lib/util/fault.h"
      29             : 
      30             : #ifdef strcasecmp
      31             : #undef strcasecmp
      32             : #endif
      33             : #ifdef strncasecmp
      34             : #undef strncasecmp
      35             : #endif
      36             : 
      37             : 
      38             : /**
      39             :  Case insensitive string comparison, handle specified for testing
      40             : **/
      41    68851429 : _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
      42             :                                  const char *s1, const char *s2)
      43             : {
      44    68851429 :         codepoint_t c1=0, c2=0;
      45    68851429 :         codepoint_t u1=0, u2=0;
      46    68851429 :         codepoint_t l1=0, l2=0;
      47             :         size_t size1, size2;
      48             : 
      49             :         /* handle null ptr comparisons to simplify the use in qsort */
      50    68851429 :         if (s1 == s2) return 0;
      51    68850973 :         if (s1 == NULL) return -1;
      52    68850973 :         if (s2 == NULL) return 1;
      53             : 
      54   160104608 :         while (*s1 && *s2) {
      55   158518937 :                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
      56   158518937 :                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
      57             : 
      58   158518937 :                 if (c1 == INVALID_CODEPOINT ||
      59             :                     c2 == INVALID_CODEPOINT) {
      60           8 :                         return strcasecmp(s1, s2);
      61             :                 }
      62             : 
      63   158518929 :                 s1 += size1;
      64   158518929 :                 s2 += size2;
      65             : 
      66   158518929 :                 if (c1 == c2) {
      67    91056275 :                         continue;
      68             :                 }
      69             : 
      70    67462654 :                 u1 = toupper_m(c1);
      71    67462654 :                 u2 = toupper_m(c2);
      72    67462654 :                 if (u1 == u2) {
      73      197360 :                         continue;
      74             :                 }
      75             : 
      76    67265294 :                 l1 = tolower_m(c1);
      77    67265294 :                 l2 = tolower_m(c2);
      78    67265294 :                 if (l1 == l2) {
      79           0 :                         continue;
      80             :                 }
      81             : 
      82    67265294 :                 return l1 - l2;
      83             :         }
      84             : 
      85     1585671 :         return *s1 - *s2;
      86             : }
      87             : 
      88             : /**
      89             :  Case insensitive string comparison
      90             : **/
      91    68851429 : _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
      92             : {
      93    68851429 :         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
      94    68851429 :         return strcasecmp_m_handle(iconv_handle, s1, s2);
      95             : }
      96             : 
      97             : /**
      98             :  Case insensitive string comparison, length limited, handle specified for
      99             :  testing
     100             : **/
     101      376985 : _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
     102             :                                   const char *s1, const char *s2, size_t n)
     103             : {
     104      376985 :         codepoint_t c1=0, c2=0;
     105      376985 :         codepoint_t u1=0, u2=0;
     106      376985 :         codepoint_t l1=0, l2=0;
     107             :         size_t size1, size2;
     108             : 
     109             :         /* handle null ptr comparisons to simplify the use in qsort */
     110      376985 :         if (s1 == s2) return 0;
     111      376812 :         if (s1 == NULL) return -1;
     112      376812 :         if (s2 == NULL) return 1;
     113             : 
     114     1220203 :         while (*s1 && *s2 && n) {
     115     1168452 :                 n--;
     116             : 
     117     1168452 :                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
     118     1168452 :                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
     119             : 
     120     1168452 :                 if (c1 == INVALID_CODEPOINT ||
     121             :                     c2 == INVALID_CODEPOINT) {
     122             :                         /*
     123             :                          * n was specified in characters,
     124             :                          * now we must convert it to bytes.
     125             :                          * As bytes are the smallest
     126             :                          * character unit, the following
     127             :                          * increment and strncasecmp is always
     128             :                          * safe.
     129             :                          *
     130             :                          * The source string was already known
     131             :                          * to be n characters long, so we are
     132             :                          * guaranteed to be able to look at the
     133             :                          * (n remaining + size1) bytes from the
     134             :                          * s1 position).
     135             :                          */
     136           0 :                         n += size1;
     137           0 :                         return strncasecmp(s1, s2, n);
     138             :                 }
     139             : 
     140     1168452 :                 s1 += size1;
     141     1168452 :                 s2 += size2;
     142             : 
     143     1168452 :                 if (c1 == c2) {
     144      842841 :                         continue;
     145             :                 }
     146             : 
     147      325611 :                 u1 = toupper_m(c1);
     148      325611 :                 u2 = toupper_m(c2);
     149      325611 :                 if (u1 == u2) {
     150         550 :                         continue;
     151             :                 }
     152             : 
     153      325061 :                 l1 = tolower_m(c1);
     154      325061 :                 l2 = tolower_m(c2);
     155      325061 :                 if (l1 == l2) {
     156           0 :                         continue;
     157             :                 }
     158             : 
     159      325061 :                 return l1 - l2;
     160             :         }
     161             : 
     162       51751 :         if (n == 0) {
     163       50456 :                 return 0;
     164             :         }
     165             : 
     166        1295 :         return *s1 - *s2;
     167             : }
     168             : 
     169             : /**
     170             :  Case insensitive string comparison, length limited
     171             : **/
     172      376985 : _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
     173             : {
     174      376985 :         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
     175      376985 :         return strncasecmp_m_handle(iconv_handle, s1, s2, n);
     176             : }
     177             : 
     178             : /**
     179             :  * Compare 2 strings.
     180             :  *
     181             :  * @note The comparison is case-insensitive.
     182             :  **/
     183        5663 : _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
     184             : {
     185        5663 :         return strcasecmp_m(s1,s2) == 0;
     186             : }
     187             : 
     188             : /**
     189             :  Compare 2 strings (case sensitive).
     190             : **/
     191      176138 : _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
     192             : {
     193      176138 :         if (s1 == s2)
     194           0 :                 return true;
     195      176138 :         if (!s1 || !s2)
     196           0 :                 return false;
     197             : 
     198      176138 :         return strcmp(s1,s2) == 0;
     199             : }
     200             : 
     201             : /**
     202             :  * Calculate the number of units (8 or 16-bit, depending on the
     203             :  * destination charset), that would be needed to convert the input
     204             :  * string which is expected to be in in src_charset encoding to the
     205             :  * destination charset (which should be a unicode charset).
     206             :  */
     207    14250389 : _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
     208             :                                     const char *s, charset_t src_charset, charset_t dst_charset)
     209             : {
     210    14250389 :         size_t count = 0;
     211             : 
     212             : #ifdef DEVELOPER
     213    14250389 :         switch (dst_charset) {
     214           0 :         case CH_DOS:
     215             :         case CH_UNIX:
     216           0 :                 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
     217    14250389 :         default:
     218    14250389 :                 break;
     219             :         }
     220             : 
     221    14250389 :         switch (src_charset) {
     222           0 :         case CH_UTF16LE:
     223             :         case CH_UTF16BE:
     224           0 :                 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
     225    14250389 :         default:
     226    14250389 :                 break;
     227             :         }
     228             : #endif
     229    14250389 :         if (!s) {
     230       48888 :                 return 0;
     231             :         }
     232             : 
     233   703688771 :         while (*s && !(((uint8_t)*s) & 0x80)) {
     234   689487270 :                 s++;
     235   689487270 :                 count++;
     236             :         }
     237             : 
     238    14201501 :         if (!*s) {
     239    14201106 :                 return count;
     240             :         }
     241             : 
     242       39191 :         while (*s) {
     243             :                 size_t c_size;
     244       38796 :                 codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
     245             :                                                           src_charset, &c_size);
     246       38796 :                 s += c_size;
     247             : 
     248       38796 :                 switch (dst_charset) {
     249       38796 :                 case CH_UTF16LE:
     250             :                 case CH_UTF16BE:
     251             :                 case CH_UTF16MUNGED:
     252       38796 :                         if (c < 0x10000) {
     253             :                                 /* Unicode char fits into 16 bits. */
     254       38796 :                                 count += 1;
     255             :                         } else {
     256             :                                 /* Double-width unicode char - 32 bits. */
     257           0 :                                 count += 2;
     258             :                         }
     259       38796 :                         break;
     260           0 :                 case CH_UTF8:
     261             :                         /*
     262             :                          * this only checks ranges, and does not
     263             :                          * check for invalid codepoints
     264             :                          */
     265           0 :                         if (c < 0x80) {
     266           0 :                                 count += 1;
     267           0 :                         } else if (c < 0x800) {
     268           0 :                                 count += 2;
     269           0 :                         } else if (c < 0x10000) {
     270           0 :                                 count += 3;
     271             :                         } else {
     272           0 :                                 count += 4;
     273             :                         }
     274           0 :                         break;
     275           0 :                 default:
     276             :                         /*
     277             :                          * non-unicode encoding:
     278             :                          * assume that each codepoint fits into
     279             :                          * one unit in the destination encoding.
     280             :                          */
     281           0 :                         count += 1;
     282             :                 }
     283             :         }
     284             : 
     285         395 :         return count;
     286             : }
     287             : 
     288             : /**
     289             :  * Calculate the number of units (8 or 16-bit, depending on the
     290             :  * destination charset), that would be needed to convert the input
     291             :  * string which is expected to be in in src_charset encoding to the
     292             :  * destination charset (which should be a unicode charset).
     293             :  */
     294    14250389 : _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
     295             : {
     296    14250389 :         struct smb_iconv_handle *ic = get_iconv_handle();
     297    14250389 :         return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
     298             : }
     299             : 
     300     3170645 : _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
     301             :                                   const charset_t dst_charset)
     302             : {
     303     3170645 :         if (!s) {
     304       12038 :                 return 0;
     305             :         }
     306     3158607 :         return strlen_m_ext(s, src_charset, dst_charset) + 1;
     307             : }
     308             : 
     309       49140 : _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
     310             :                                        const charset_t src_charset,
     311             :                                        const charset_t dst_charset)
     312             : {
     313             :         size_t len;
     314       49140 :         if (!s) {
     315         528 :                 return 0;
     316             :         }
     317       48612 :         len = strlen_m_ext(s, src_charset, dst_charset);
     318       48612 :         if (len == 0) {
     319       14338 :                 return 0;
     320             :         }
     321             : 
     322       34274 :         return len+1;
     323             : }
     324             : 
     325             : /**
     326             :  * Calculate the number of 16-bit units that would be needed to convert
     327             :  * the input string which is expected to be in CH_UNIX encoding to UTF16.
     328             :  *
     329             :  * This will be the same as the number of bytes in a string for single
     330             :  * byte strings, but will be different for multibyte.
     331             :  */
     332    11043170 : _PUBLIC_ size_t strlen_m(const char *s)
     333             : {
     334    11043170 :         return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
     335             : }
     336             : 
     337             : /**
     338             :    Work out the number of multibyte chars in a string, including the NULL
     339             :    terminator.
     340             : **/
     341      360032 : _PUBLIC_ size_t strlen_m_term(const char *s)
     342             : {
     343      360032 :         return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
     344             : }
     345             : 
     346             : /*
     347             :  * Weird helper routine for the winreg pipe: If nothing is around, return 0,
     348             :  * if a string is there, include the terminator.
     349             :  */
     350             : 
     351       49140 : _PUBLIC_ size_t strlen_m_term_null(const char *s)
     352             : {
     353       49140 :         return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
     354             : }
     355             : 
     356             : /**
     357             :  Strchr and strrchr_m are a bit complex on general multi-byte strings.
     358             : **/
     359    58460570 : _PUBLIC_ char *strchr_m(const char *src, char c)
     360             : {
     361             :         const char *s;
     362    58460570 :         struct smb_iconv_handle *ic = get_iconv_handle();
     363    58460570 :         if (src == NULL) {
     364           0 :                 return NULL;
     365             :         }
     366             :         /* characters below 0x3F are guaranteed to not appear in
     367             :            non-initial position in multi-byte charsets */
     368    58460570 :         if ((c & 0xC0) == 0) {
     369    18520514 :                 return strchr(src, c);
     370             :         }
     371             : 
     372             :         /* this is quite a common operation, so we want it to be
     373             :            fast. We optimise for the ascii case, knowing that all our
     374             :            supported multi-byte character sets are ascii-compatible
     375             :            (ie. they match for the first 128 chars) */
     376             : 
     377   277637434 :         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
     378   237714451 :                 if (*s == c)
     379       17073 :                         return discard_const_p(char, s);
     380             :         }
     381             : 
     382    39922983 :         if (!*s)
     383    39922983 :                 return NULL;
     384             : 
     385             : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
     386             :         /* With compose characters we must restart from the beginning. JRA. */
     387             :         s = src;
     388             : #endif
     389             : 
     390           0 :         while (*s) {
     391             :                 size_t size;
     392           0 :                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
     393           0 :                 if (c2 == c) {
     394           0 :                         return discard_const_p(char, s);
     395             :                 }
     396           0 :                 s += size;
     397             :         }
     398             : 
     399           0 :         return NULL;
     400             : }
     401             : 
     402             : /**
     403             :  * Multibyte-character version of strrchr
     404             :  */
     405      698588 : _PUBLIC_ char *strrchr_m(const char *s, char c)
     406             : {
     407             :         struct smb_iconv_handle *ic;
     408      698588 :         char *ret = NULL;
     409             : 
     410      698588 :         if (s == NULL) {
     411           0 :                 return NULL;
     412             :         }
     413             : 
     414             :         /* characters below 0x3F are guaranteed to not appear in
     415             :            non-initial position in multi-byte charsets */
     416      698588 :         if ((c & 0xC0) == 0) {
     417      689316 :                 return strrchr(s, c);
     418             :         }
     419             : 
     420             :         /* this is quite a common operation, so we want it to be
     421             :            fast. We optimise for the ascii case, knowing that all our
     422             :            supported multi-byte character sets are ascii-compatible
     423             :            (ie. they match for the first 128 chars). Also, in Samba
     424             :            we only search for ascii characters in 'c' and that
     425             :            in all mb character sets with a compound character
     426             :            containing c, if 'c' is not a match at position
     427             :            p, then p[-1] > 0x7f. JRA. */
     428             : 
     429             :         {
     430        9272 :                 size_t len = strlen(s);
     431        9272 :                 const char *cp = s;
     432        9272 :                 bool got_mb = false;
     433             : 
     434        9272 :                 if (len == 0)
     435          20 :                         return NULL;
     436        9252 :                 cp += (len - 1);
     437             :                 do {
     438       42827 :                         if (c == *cp) {
     439             :                                 /* Could be a match. Part of a multibyte ? */
     440        7188 :                                 if ((cp > s) &&
     441        6634 :                                         (((unsigned char)cp[-1]) & 0x80)) {
     442             :                                         /* Yep - go slow :-( */
     443           0 :                                         got_mb = true;
     444           0 :                                         break;
     445             :                                 }
     446             :                                 /* No - we have a match ! */
     447        7188 :                                 return discard_const_p(char , cp);
     448             :                         }
     449       35639 :                 } while (cp-- != s);
     450        2064 :                 if (!got_mb)
     451        2064 :                         return NULL;
     452             :         }
     453             : 
     454           0 :         ic = get_iconv_handle();
     455             : 
     456           0 :         while (*s) {
     457             :                 size_t size;
     458           0 :                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
     459           0 :                 if (c2 == c) {
     460           0 :                         ret = discard_const_p(char, s);
     461             :                 }
     462           0 :                 s += size;
     463             :         }
     464             : 
     465           0 :         return ret;
     466             : }
     467             : 
     468             : /**
     469             :   return True if any (multi-byte) character is lower case
     470             : */
     471           0 : _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
     472             :                                  const char *string)
     473             : {
     474           0 :         while (*string) {
     475             :                 size_t c_size;
     476             :                 codepoint_t s;
     477             :                 codepoint_t t;
     478             : 
     479           0 :                 s = next_codepoint_handle(ic, string, &c_size);
     480           0 :                 string += c_size;
     481             : 
     482           0 :                 t = toupper_m(s);
     483             : 
     484           0 :                 if (s != t) {
     485           0 :                         return true; /* that means it has lower case chars */
     486             :                 }
     487             :         }
     488             : 
     489           0 :         return false;
     490             : }
     491             : 
     492           0 : _PUBLIC_ bool strhaslower(const char *string)
     493             : {
     494           0 :         struct smb_iconv_handle *ic = get_iconv_handle();
     495           0 :         return strhaslower_handle(ic, string);
     496             : }
     497             : 
     498             : /**
     499             :   return True if any (multi-byte) character is upper case
     500             : */
     501           0 : _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
     502             :                                  const char *string)
     503             : {
     504           0 :         while (*string) {
     505             :                 size_t c_size;
     506             :                 codepoint_t s;
     507             :                 codepoint_t t;
     508             : 
     509           0 :                 s = next_codepoint_handle(ic, string, &c_size);
     510           0 :                 string += c_size;
     511             : 
     512           0 :                 t = tolower_m(s);
     513             : 
     514           0 :                 if (s != t) {
     515           0 :                         return true; /* that means it has upper case chars */
     516             :                 }
     517             :         }
     518             : 
     519           0 :         return false;
     520             : }
     521             : 
     522           0 : _PUBLIC_ bool strhasupper(const char *string)
     523             : {
     524           0 :         struct smb_iconv_handle *ic = get_iconv_handle();
     525           0 :         return strhasupper_handle(ic, string);
     526             : }
     527             : 
     528             : /***********************************************************************
     529             :  strstr_m - We convert via ucs2 for now.
     530             : ***********************************************************************/
     531             : 
     532      151574 : char *strstr_m(const char *src, const char *findstr)
     533             : {
     534      151574 :         TALLOC_CTX *mem_ctx = NULL;
     535             :         smb_ucs2_t *p;
     536             :         smb_ucs2_t *src_w, *find_w;
     537             :         const char *s;
     538             :         char *s2;
     539      151574 :         char *retp = NULL;
     540      151574 :         size_t converted_size, findstr_len = 0;
     541             : 
     542             :         /* for correctness */
     543      151574 :         if (!findstr[0]) {
     544           0 :                 return discard_const_p(char, src);
     545             :         }
     546             : 
     547             :         /* Samba does single character findstr calls a *lot*. */
     548      151574 :         if (findstr[1] == '\0')
     549       15821 :                 return strchr_m(src, *findstr);
     550             : 
     551             :         /* We optimise for the ascii case, knowing that all our
     552             :            supported multi-byte character sets are ascii-compatible
     553             :            (ie. they match for the first 128 chars) */
     554             : 
     555     3203786 :         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
     556     3122290 :                 if (*s == *findstr) {
     557       83431 :                         if (!findstr_len)
     558       73031 :                                 findstr_len = strlen(findstr);
     559             : 
     560       83431 :                         if (strncmp(s, findstr, findstr_len) == 0) {
     561       54257 :                                 return discard_const_p(char, s);
     562             :                         }
     563             :                 }
     564             :         }
     565             : 
     566       81496 :         if (!*s)
     567       81496 :                 return NULL;
     568             : 
     569             : #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
     570             :         /* 'make check' fails unless we do this */
     571             : 
     572             :         /* With compose characters we must restart from the beginning. JRA. */
     573           0 :         s = src;
     574             : #endif
     575             : 
     576             :         /*
     577             :          * Use get_iconv_handle() just as a non-NULL talloc ctx. In
     578             :          * case we leak memory, this should then be more obvious in
     579             :          * the talloc report.
     580             :          */
     581           0 :         mem_ctx = talloc_new(get_iconv_handle());
     582           0 :         if (mem_ctx == NULL) {
     583           0 :                 return NULL;
     584             :         }
     585             : 
     586           0 :         if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) {
     587           0 :                 goto done;
     588             :         }
     589             : 
     590           0 :         if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) {
     591           0 :                 goto done;
     592             :         }
     593             : 
     594           0 :         p = strstr_w(src_w, find_w);
     595             : 
     596           0 :         if (!p) {
     597           0 :                 goto done;
     598             :         }
     599             : 
     600           0 :         *p = 0;
     601           0 :         if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) {
     602           0 :                 goto done;
     603             :         }
     604           0 :         retp = discard_const_p(char, (s+strlen(s2)));
     605           0 : done:
     606           0 :         TALLOC_FREE(mem_ctx);
     607           0 :         return retp;
     608             : }

Generated by: LCOV version 1.14