• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

string.c

Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   string.c -
00004 
00005   $Author: yugui $
00006   created at: Mon Aug  9 17:12:58 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
00010   Copyright (C) 2000  Information-technology Promotion Agency, Japan
00011 
00012 **********************************************************************/
00013 
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include <assert.h>
00018 
/*
 * Element `no` of the beg / end arrays reachable through a pointer named
 * `regs`, which must be in scope wherever these macros are expanded.
 * The argument and the whole expansion are parenthesized so the macros
 * behave like ordinary expressions (and remain usable as lvalues).
 */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
00021 
00022 #include <math.h>
00023 #include <ctype.h>
00024 
00025 #ifdef HAVE_UNISTD_H
00026 #include <unistd.h>
00027 #endif
00028 
/* Number of elements in `array`, cast to int.  The sizeof division only
 * gives an element count for a real array, not for a pointer. */
00029 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00030 
00031 #undef rb_str_new_cstr
00032 #undef rb_tainted_str_new_cstr
00033 #undef rb_usascii_str_new_cstr
00034 #undef rb_external_str_new_cstr
00035 #undef rb_locale_str_new_cstr
00036 #undef rb_str_new2
00037 #undef rb_str_new3
00038 #undef rb_str_new4
00039 #undef rb_str_new5
00040 #undef rb_tainted_str_new2
00041 #undef rb_usascii_str_new2
00042 #undef rb_str_dup_frozen
00043 #undef rb_str_buf_new_cstr
00044 #undef rb_str_buf_new2
00045 #undef rb_str_buf_cat2
00046 #undef rb_str_cat2
00047 
00048 VALUE rb_cString;
00049 VALUE rb_cSymbol;
00050 
00051 #define RUBY_MAX_CHAR_LEN 16
/* Flag bits this file uses on string objects. */
00052 #define STR_TMPLOCK FL_USER7
00053 #define STR_NOEMBED FL_USER1
00054 #define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
00055 #define STR_ASSOC   FL_USER3
/* Predicates: each one requires STR_NOEMBED in addition to the named flag. */
00056 #define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
00057 #define STR_ASSOC_P(s)  FL_ALL(s, STR_NOEMBED|STR_ASSOC)
/* "No capacity" state: a non-embedded string that is shared or has an
 * association.  rb_str_capacity() reports as.heap.len for such strings
 * instead of reading as.heap.aux.capa. */
00058 #define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
00059 #define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
/* Clear the shared/assoc flags, but only when STR_NOEMBED is set. */
00060 #define STR_UNSET_NOCAPA(s) do {\
00061     if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
00062 } while (0)
00063 
00064 
/* Set STR_NOEMBED on str and reset the embedded-length bits of its flags
 * to zero. */
00065 #define STR_SET_NOEMBED(str) do {\
00066     FL_SET(str, STR_NOEMBED);\
00067     STR_SET_EMBED_LEN(str, 0);\
00068 } while (0)
/* Clear STR_NOEMBED on str. */
00069 #define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
/* True when STR_NOEMBED is not set on str. */
00070 #define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
/* Store n in the embedded-length bit field of str's flags word.  n is
 * evaluated exactly once; str is evaluated twice. */
00071 #define STR_SET_EMBED_LEN(str, n) do { \
00072     long tmp_n = (n);\
00073     RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
00074     RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
00075 } while (0)
00076 
/* Record n as the length of str: in the flags word when STR_EMBED_P(str)
 * holds, otherwise in as.heap.len.  Neither branch moves or resizes the
 * buffer. */
00077 #define STR_SET_LEN(str, n) do { \
00078     if (STR_EMBED_P(str)) {\
00079         STR_SET_EMBED_LEN(str, n);\
00080     }\
00081     else {\
00082         RSTRING(str)->as.heap.len = (n);\
00083     }\
00084 } while (0)
00085 
/* Decrease the recorded length of str by one, using the flags field when
 * STR_EMBED_P(str) holds and as.heap.len otherwise.  No check is made that
 * the length is non-zero beforehand. */
00086 #define STR_DEC_LEN(str) do {\
00087     if (STR_EMBED_P(str)) {\
00088         long n = RSTRING_LEN(str);\
00089         n--;\
00090         STR_SET_EMBED_LEN(str, n);\
00091     }\
00092     else {\
00093         RSTRING(str)->as.heap.len--;\
00094     }\
00095 } while (0)
00096 
/*
 * Give str room for `capacity` bytes; one extra byte is always allocated
 * on top of that.
 *
 *  - STR_EMBED_P(str): the contents are copied to a freshly allocated
 *    buffer only when capacity exceeds RSTRING_EMBED_LEN_MAX; otherwise
 *    nothing is done.
 *  - otherwise: the existing buffer is reallocated, and the recorded
 *    capacity is updated unless STR_NOCAPA_P(str) holds.
 *
 * Both arguments are expanded several times, so they must not have side
 * effects.  `capacity` is parenthesized at every use so that an expression
 * argument is sized consistently in both branches.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00114 
/* Code-range tests.  Both go through rb_enc_str_coderange(), which scans
 * the string and caches the result when the range is still unknown. */
00115 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00116 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00117 
/* rb_encoding pointer looked up from the encoding index stored on str. */
00118 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00119 
00120 static inline int
00121 single_byte_optimizable(VALUE str)
00122 {
00123     rb_encoding *enc;
00124 
00125     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
00126     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00127         return 1;
00128 
00129     enc = STR_ENC_GET(str);
00130     if (rb_enc_mbmaxlen(enc) == 1)
00131         return 1;
00132 
00133     /* Conservative.  Possibly single byte.
00134      * "\xa1" in Shift_JIS for example. */
00135     return 0;
00136 }
00137 
00138 VALUE rb_fs;
00139 
/*
 * Returns a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when there is none.
 *
 * When VALUE is 4 or 8 bytes wide and the range is longer than two VALUEs,
 * the aligned middle part is tested one VALUE at a time against
 * NONASCII_MASK (the high bit of every byte).  The word loop only locates
 * the word containing a hit; the final byte loop pins down the exact byte.
 * NONASCII_MASK stays defined for code later in the file.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE lowbits = sizeof(VALUE) - 1;
        /* first aligned word at or after p, and last aligned boundary
         * at or before e */
        const VALUE *word = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
        const VALUE *word_end = (const VALUE*)(~lowbits & (VALUE)e);

        /* unaligned head, byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* aligned body, word by word; stop at the first word with a
         * high bit set */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        p = (const char *)word;
    }
#endif
    /* tail (or the whole range when the fast path is unavailable) */
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00176 
00177 static int
00178 coderange_scan(const char *p, long len, rb_encoding *enc)
00179 {
00180     const char *e = p + len;
00181 
00182     if (rb_enc_to_index(enc) == 0) {
00183         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00184         p = search_nonascii(p, e);
00185         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00186     }
00187 
00188     if (rb_enc_asciicompat(enc)) {
00189         p = search_nonascii(p, e);
00190         if (!p) {
00191             return ENC_CODERANGE_7BIT;
00192         }
00193         while (p < e) {
00194             int ret = rb_enc_precise_mbclen(p, e, enc);
00195             if (!MBCLEN_CHARFOUND_P(ret)) {
00196                 return ENC_CODERANGE_BROKEN;
00197             }
00198             p += MBCLEN_CHARFOUND_LEN(ret);
00199             if (p < e) {
00200                 p = search_nonascii(p, e);
00201                 if (!p) {
00202                     return ENC_CODERANGE_VALID;
00203                 }
00204             }
00205         }
00206         if (e < p) {
00207             return ENC_CODERANGE_BROKEN;
00208         }
00209         return ENC_CODERANGE_VALID;
00210     }
00211 
00212     while (p < e) {
00213         int ret = rb_enc_precise_mbclen(p, e, enc);
00214 
00215         if (!MBCLEN_CHARFOUND_P(ret)) {
00216             return ENC_CODERANGE_BROKEN;
00217         }
00218         p += MBCLEN_CHARFOUND_LEN(ret);
00219     }
00220     if (e < p) {
00221         return ENC_CODERANGE_BROKEN;
00222     }
00223     return ENC_CODERANGE_VALID;
00224 }
00225 
00226 long
00227 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00228 {
00229     const char *p = s;
00230 
00231     if (*cr == ENC_CODERANGE_BROKEN)
00232         return e - s;
00233 
00234     if (rb_enc_to_index(enc) == 0) {
00235         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00236         p = search_nonascii(p, e);
00237         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00238         return e - s;
00239     }
00240     else if (rb_enc_asciicompat(enc)) {
00241         p = search_nonascii(p, e);
00242         if (!p) {
00243             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00244             return e - s;
00245         }
00246         while (p < e) {
00247             int ret = rb_enc_precise_mbclen(p, e, enc);
00248             if (!MBCLEN_CHARFOUND_P(ret)) {
00249                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00250                 return p - s;
00251             }
00252             p += MBCLEN_CHARFOUND_LEN(ret);
00253             if (p < e) {
00254                 p = search_nonascii(p, e);
00255                 if (!p) {
00256                     *cr = ENC_CODERANGE_VALID;
00257                     return e - s;
00258                 }
00259             }
00260         }
00261         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00262         return p - s;
00263     }
00264     else {
00265         while (p < e) {
00266             int ret = rb_enc_precise_mbclen(p, e, enc);
00267             if (!MBCLEN_CHARFOUND_P(ret)) {
00268                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00269                 return p - s;
00270             }
00271             p += MBCLEN_CHARFOUND_LEN(ret);
00272         }
00273         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00274         return p - s;
00275     }
00276 }
00277 
00278 static inline void
00279 str_enc_copy(VALUE str1, VALUE str2)
00280 {
00281     rb_enc_set_index(str1, ENCODING_GET(str2));
00282 }
00283 
00284 static void
00285 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00286 {
00287     /* this function is designed for copying encoding and coderange
00288      * from src to new string "dest" which is made from the part of src.
00289      */
00290     str_enc_copy(dest, src);
00291     switch (ENC_CODERANGE(src)) {
00292       case ENC_CODERANGE_7BIT:
00293         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00294         break;
00295       case ENC_CODERANGE_VALID:
00296         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00297             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00298             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00299         else
00300             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00301         break;
00302       default:
00303         if (RSTRING_LEN(dest) == 0) {
00304             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00305                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00306             else
00307                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00308         }
00309         break;
00310     }
00311 }
00312 
00313 static void
00314 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00315 {
00316     str_enc_copy(dest, src);
00317     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00318 }
00319 
00320 int
00321 rb_enc_str_coderange(VALUE str)
00322 {
00323     int cr = ENC_CODERANGE(str);
00324 
00325     if (cr == ENC_CODERANGE_UNKNOWN) {
00326         rb_encoding *enc = STR_ENC_GET(str);
00327         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00328         ENC_CODERANGE_SET(str, cr);
00329     }
00330     return cr;
00331 }
00332 
00333 int
00334 rb_enc_str_asciionly_p(VALUE str)
00335 {
00336     rb_encoding *enc = STR_ENC_GET(str);
00337 
00338     if (!rb_enc_asciicompat(enc))
00339         return FALSE;
00340     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00341         return TRUE;
00342     return FALSE;
00343 }
00344 
00345 static inline void
00346 str_mod_check(VALUE s, const char *p, long len)
00347 {
00348     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00349         rb_raise(rb_eRuntimeError, "string modified");
00350     }
00351 }
00352 
00353 static inline void
00354 str_frozen_check(VALUE s)
00355 {
00356     if (OBJ_FROZEN(s)) {
00357         rb_raise(rb_eRuntimeError, "string frozen");
00358     }
00359 }
00360 
00361 size_t
00362 rb_str_capacity(VALUE str)
00363 {
00364     if (STR_EMBED_P(str)) {
00365         return RSTRING_EMBED_LEN_MAX;
00366     }
00367     else if (STR_NOCAPA_P(str)) {
00368         return RSTRING(str)->as.heap.len;
00369     }
00370     else {
00371         return RSTRING(str)->as.heap.aux.capa;
00372     }
00373 }
00374 
00375 static inline VALUE
00376 str_alloc(VALUE klass)
00377 {
00378     NEWOBJ(str, struct RString);
00379     OBJSETUP(str, klass, T_STRING);
00380 
00381     str->as.heap.ptr = 0;
00382     str->as.heap.len = 0;
00383     str->as.heap.aux.capa = 0;
00384 
00385     return (VALUE)str;
00386 }
00387 
00388 static VALUE
00389 str_new(VALUE klass, const char *ptr, long len)
00390 {
00391     VALUE str;
00392 
00393     if (len < 0) {
00394         rb_raise(rb_eArgError, "negative string size (or size too big)");
00395     }
00396 
00397     str = str_alloc(klass);
00398     if (len > RSTRING_EMBED_LEN_MAX) {
00399         RSTRING(str)->as.heap.aux.capa = len;
00400         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00401         STR_SET_NOEMBED(str);
00402     }
00403     else if (len == 0) {
00404         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00405     }
00406     if (ptr) {
00407         memcpy(RSTRING_PTR(str), ptr, len);
00408     }
00409     STR_SET_LEN(str, len);
00410     RSTRING_PTR(str)[len] = '\0';
00411     return str;
00412 }
00413 
00414 VALUE
00415 rb_str_new(const char *ptr, long len)
00416 {
00417     return str_new(rb_cString, ptr, len);
00418 }
00419 
00420 VALUE
00421 rb_usascii_str_new(const char *ptr, long len)
00422 {
00423     VALUE str = rb_str_new(ptr, len);
00424     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00425     return str;
00426 }
00427 
00428 VALUE
00429 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00430 {
00431     VALUE str = rb_str_new(ptr, len);
00432     rb_enc_associate(str, enc);
00433     return str;
00434 }
00435 
00436 VALUE
00437 rb_str_new_cstr(const char *ptr)
00438 {
00439     if (!ptr) {
00440         rb_raise(rb_eArgError, "NULL pointer given");
00441     }
00442     return rb_str_new(ptr, strlen(ptr));
00443 }
00444 
00445 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00446 #define rb_str_new2 rb_str_new_cstr
00447 
00448 VALUE
00449 rb_usascii_str_new_cstr(const char *ptr)
00450 {
00451     VALUE str = rb_str_new2(ptr);
00452     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00453     return str;
00454 }
00455 
00456 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00457 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00458 
00459 VALUE
00460 rb_tainted_str_new(const char *ptr, long len)
00461 {
00462     VALUE str = rb_str_new(ptr, len);
00463 
00464     OBJ_TAINT(str);
00465     return str;
00466 }
00467 
00468 VALUE
00469 rb_tainted_str_new_cstr(const char *ptr)
00470 {
00471     VALUE str = rb_str_new2(ptr);
00472 
00473     OBJ_TAINT(str);
00474     return str;
00475 }
00476 
00477 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00478 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00479 
00480 VALUE
00481 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00482 {
00483     rb_econv_t *ec;
00484     rb_econv_result_t ret;
00485     long len;
00486     VALUE newstr;
00487     const unsigned char *sp;
00488     unsigned char *dp;
00489 
00490     if (!to) return str;
00491     if (from == to) return str;
00492     if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00493         to == rb_ascii8bit_encoding()) {
00494         if (STR_ENC_GET(str) != to) {
00495             str = rb_str_dup(str);
00496             rb_enc_associate(str, to);
00497         }
00498         return str;
00499     }
00500 
00501     len = RSTRING_LEN(str);
00502     newstr = rb_str_new(0, len);
00503 
00504   retry:
00505     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00506     if (!ec) return str;
00507 
00508     sp = (unsigned char*)RSTRING_PTR(str);
00509     dp = (unsigned char*)RSTRING_PTR(newstr);
00510     ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00511                            &dp, (unsigned char*)RSTRING_END(newstr), 0);
00512     rb_econv_close(ec);
00513     switch (ret) {
00514       case econv_destination_buffer_full:
00515         /* destination buffer short */
00516         len = len < 2 ? 2 : len * 2;
00517         rb_str_resize(newstr, len);
00518         goto retry;
00519 
00520       case econv_finished:
00521         len = dp - (unsigned char*)RSTRING_PTR(newstr);
00522         rb_str_set_len(newstr, len);
00523         rb_enc_associate(newstr, to);
00524         return newstr;
00525 
00526       default:
00527         /* some error, return original */
00528         return str;
00529     }
00530 }
00531 
00532 VALUE
00533 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00534 {
00535     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00536 }
00537 
00538 VALUE
00539 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00540 {
00541     VALUE str;
00542 
00543     str = rb_tainted_str_new(ptr, len);
00544     if (eenc == rb_usascii_encoding() &&
00545         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00546         rb_enc_associate(str, rb_ascii8bit_encoding());
00547         return str;
00548     }
00549     rb_enc_associate(str, eenc);
00550     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00551 }
00552 
00553 VALUE
00554 rb_external_str_new(const char *ptr, long len)
00555 {
00556     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00557 }
00558 
00559 VALUE
00560 rb_external_str_new_cstr(const char *ptr)
00561 {
00562     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00563 }
00564 
00565 VALUE
00566 rb_locale_str_new(const char *ptr, long len)
00567 {
00568     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00569 }
00570 
00571 VALUE
00572 rb_locale_str_new_cstr(const char *ptr)
00573 {
00574     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00575 }
00576 
00577 VALUE
00578 rb_filesystem_str_new(const char *ptr, long len)
00579 {
00580     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00581 }
00582 
00583 VALUE
00584 rb_filesystem_str_new_cstr(const char *ptr)
00585 {
00586     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00587 }
00588 
00589 VALUE
00590 rb_str_export(VALUE str)
00591 {
00592     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00593 }
00594 
00595 VALUE
00596 rb_str_export_locale(VALUE str)
00597 {
00598     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00599 }
00600 
00601 VALUE
00602 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00603 {
00604     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00605 }
00606 
00607 static VALUE
00608 str_replace_shared(VALUE str2, VALUE str)
00609 {
00610     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00611         STR_SET_EMBED(str2);
00612         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00613         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00614     }
00615     else {
00616         str = rb_str_new_frozen(str);
00617         FL_SET(str2, STR_NOEMBED);
00618         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00619         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00620         RSTRING(str2)->as.heap.aux.shared = str;
00621         FL_SET(str2, ELTS_SHARED);
00622     }
00623     rb_enc_cr_str_exact_copy(str2, str);
00624 
00625     return str2;
00626 }
00627 
00628 static VALUE
00629 str_new_shared(VALUE klass, VALUE str)
00630 {
00631     return str_replace_shared(str_alloc(klass), str);
00632 }
00633 
00634 static VALUE
00635 str_new3(VALUE klass, VALUE str)
00636 {
00637     return str_new_shared(klass, str);
00638 }
00639 
00640 VALUE
00641 rb_str_new_shared(VALUE str)
00642 {
00643     VALUE str2 = str_new3(rb_obj_class(str), str);
00644 
00645     OBJ_INFECT(str2, str);
00646     return str2;
00647 }
00648 
00649 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00650 #define rb_str_new3 rb_str_new_shared
00651 
00652 static VALUE
00653 str_new4(VALUE klass, VALUE str)
00654 {
00655     VALUE str2;
00656 
00657     str2 = str_alloc(klass);
00658     STR_SET_NOEMBED(str2);
00659     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00660     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00661     if (STR_SHARED_P(str)) {
00662         VALUE shared = RSTRING(str)->as.heap.aux.shared;
00663         assert(OBJ_FROZEN(shared));
00664         FL_SET(str2, ELTS_SHARED);
00665         RSTRING(str2)->as.heap.aux.shared = shared;
00666     }
00667     else {
00668         FL_SET(str, ELTS_SHARED);
00669         RSTRING(str)->as.heap.aux.shared = str2;
00670     }
00671     rb_enc_cr_str_exact_copy(str2, str);
00672     OBJ_INFECT(str2, str);
00673     return str2;
00674 }
00675 
00676 VALUE
00677 rb_str_new_frozen(VALUE orig)
00678 {
00679     VALUE klass, str;
00680 
00681     if (OBJ_FROZEN(orig)) return orig;
00682     klass = rb_obj_class(orig);
00683     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00684         long ofs;
00685         assert(OBJ_FROZEN(str));
00686         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00687         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00688             (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00689             ENCODING_GET(str) != ENCODING_GET(orig)) {
00690             str = str_new3(klass, str);
00691             RSTRING(str)->as.heap.ptr += ofs;
00692             RSTRING(str)->as.heap.len -= ofs;
00693             rb_enc_cr_str_exact_copy(str, orig);
00694             OBJ_INFECT(str, orig);
00695         }
00696     }
00697     else if (STR_EMBED_P(orig)) {
00698         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00699         rb_enc_cr_str_exact_copy(str, orig);
00700         OBJ_INFECT(str, orig);
00701     }
00702     else if (STR_ASSOC_P(orig)) {
00703         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00704         FL_UNSET(orig, STR_ASSOC);
00705         str = str_new4(klass, orig);
00706         FL_SET(str, STR_ASSOC);
00707         RSTRING(str)->as.heap.aux.shared = assoc;
00708     }
00709     else {
00710         str = str_new4(klass, orig);
00711     }
00712     OBJ_FREEZE(str);
00713     return str;
00714 }
00715 
00716 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00717 #define rb_str_new4 rb_str_new_frozen
00718 
00719 VALUE
00720 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00721 {
00722     return str_new(rb_obj_class(obj), ptr, len);
00723 }
00724 
00725 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00726            rb_str_new_with_class, (obj, ptr, len))
00727 #define rb_str_new5 rb_str_new_with_class
00728 
00729 static VALUE
00730 str_new_empty(VALUE str)
00731 {
00732     VALUE v = rb_str_new5(str, 0, 0);
00733     OBJ_INFECT(v, str);
00734     return v;
00735 }
00736 
00737 #define STR_BUF_MIN_SIZE 128
00738 
00739 VALUE
00740 rb_str_buf_new(long capa)
00741 {
00742     VALUE str = str_alloc(rb_cString);
00743 
00744     if (capa < STR_BUF_MIN_SIZE) {
00745         capa = STR_BUF_MIN_SIZE;
00746     }
00747     FL_SET(str, STR_NOEMBED);
00748     RSTRING(str)->as.heap.aux.capa = capa;
00749     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00750     RSTRING(str)->as.heap.ptr[0] = '\0';
00751 
00752     return str;
00753 }
00754 
00755 VALUE
00756 rb_str_buf_new_cstr(const char *ptr)
00757 {
00758     VALUE str;
00759     long len = strlen(ptr);
00760 
00761     str = rb_str_buf_new(len);
00762     rb_str_buf_cat(str, ptr, len);
00763 
00764     return str;
00765 }
00766 
00767 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00768 #define rb_str_buf_new2 rb_str_buf_new_cstr
00769 
00770 VALUE
00771 rb_str_tmp_new(long len)
00772 {
00773     return str_new(0, 0, len);
00774 }
00775 
00776 void
00777 rb_str_free(VALUE str)
00778 {
00779     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00780         xfree(RSTRING(str)->as.heap.ptr);
00781     }
00782 }
00783 
00784 size_t
00785 rb_str_memsize(VALUE str)
00786 {
00787     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00788         return RSTRING(str)->as.heap.aux.capa;
00789     }
00790     else {
00791         return 0;
00792     }
00793 }
00794 
00795 VALUE
00796 rb_str_to_str(VALUE str)
00797 {
00798     return rb_convert_type(str, T_STRING, "String", "to_str");
00799 }
00800 
00801 static inline void str_discard(VALUE str);
00802 
00803 void
00804 rb_str_shared_replace(VALUE str, VALUE str2)
00805 {
00806     rb_encoding *enc;
00807     int cr;
00808     if (str == str2) return;
00809     enc = STR_ENC_GET(str2);
00810     cr = ENC_CODERANGE(str2);
00811     str_discard(str);
00812     OBJ_INFECT(str, str2);
00813     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00814         STR_SET_EMBED(str);
00815         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00816         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00817         rb_enc_associate(str, enc);
00818         ENC_CODERANGE_SET(str, cr);
00819         return;
00820     }
00821     STR_SET_NOEMBED(str);
00822     STR_UNSET_NOCAPA(str);
00823     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00824     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00825     if (STR_NOCAPA_P(str2)) {
00826         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00827         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00828     }
00829     else {
00830         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00831     }
00832     STR_SET_EMBED(str2);        /* abandon str2 */
00833     RSTRING_PTR(str2)[0] = 0;
00834     STR_SET_EMBED_LEN(str2, 0);
00835     rb_enc_associate(str, enc);
00836     ENC_CODERANGE_SET(str, cr);
00837 }
00838 
00839 static ID id_to_s;
00840 
00841 VALUE
00842 rb_obj_as_string(VALUE obj)
00843 {
00844     VALUE str;
00845 
00846     if (TYPE(obj) == T_STRING) {
00847         return obj;
00848     }
00849     str = rb_funcall(obj, id_to_s, 0);
00850     if (TYPE(str) != T_STRING)
00851         return rb_any_to_s(obj);
00852     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00853     return str;
00854 }
00855 
00856 static VALUE
00857 str_replace(VALUE str, VALUE str2)
00858 {
00859     long len;
00860 
00861     len = RSTRING_LEN(str2);
00862     if (STR_ASSOC_P(str2)) {
00863         str2 = rb_str_new4(str2);
00864     }
00865     if (STR_SHARED_P(str2)) {
00866         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00867         assert(OBJ_FROZEN(shared));
00868         STR_SET_NOEMBED(str);
00869         RSTRING(str)->as.heap.len = len;
00870         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00871         FL_SET(str, ELTS_SHARED);
00872         FL_UNSET(str, STR_ASSOC);
00873         RSTRING(str)->as.heap.aux.shared = shared;
00874     }
00875     else {
00876         str_replace_shared(str, str2);
00877     }
00878 
00879     OBJ_INFECT(str, str2);
00880     rb_enc_cr_str_exact_copy(str, str2);
00881     return str;
00882 }
00883 
00884 static VALUE
00885 str_duplicate(VALUE klass, VALUE str)
00886 {
00887     VALUE dup = str_alloc(klass);
00888     str_replace(dup, str);
00889     return dup;
00890 }
00891 
00892 VALUE
00893 rb_str_dup(VALUE str)
00894 {
00895     return str_duplicate(rb_obj_class(str), str);
00896 }
00897 
00898 VALUE
00899 rb_str_resurrect(VALUE str)
00900 {
00901     return str_replace(str_alloc(rb_cString), str);
00902 }
00903 
00904 /*
00905  *  call-seq:
00906  *     String.new(str="")   -> new_str
00907  *
00908  *  Returns a new string object containing a copy of <i>str</i>.
00909  */
00910 
00911 static VALUE
00912 rb_str_init(int argc, VALUE *argv, VALUE str)
00913 {
00914     VALUE orig;
00915 
00916     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00917         rb_str_replace(str, orig);
00918     return str;
00919 }
00920 
00921 static inline long
00922 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00923 {
00924     long c;
00925     const char *q;
00926 
00927     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00928         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00929     }
00930     else if (rb_enc_asciicompat(enc)) {
00931         c = 0;
00932         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00933             while (p < e) {
00934                 if (ISASCII(*p)) {
00935                     q = search_nonascii(p, e);
00936                     if (!q)
00937                         return c + (e - p);
00938                     c += q - p;
00939                     p = q;
00940                 }
00941                 p += rb_enc_fast_mbclen(p, e, enc);
00942                 c++;
00943             }
00944         }
00945         else {
00946             while (p < e) {
00947                 if (ISASCII(*p)) {
00948                     q = search_nonascii(p, e);
00949                     if (!q)
00950                         return c + (e - p);
00951                     c += q - p;
00952                     p = q;
00953                 }
00954                 p += rb_enc_mbclen(p, e, enc);
00955                 c++;
00956             }
00957         }
00958         return c;
00959     }
00960 
00961     for (c=0; p<e; c++) {
00962         p += rb_enc_mbclen(p, e, enc);
00963     }
00964     return c;
00965 }
00966 
00967 long
00968 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00969 {
00970     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00971 }
00972 
00973 long
00974 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00975 {
00976     long c;
00977     const char *q;
00978     int ret;
00979 
00980     *cr = 0;
00981     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00982         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00983     }
00984     else if (rb_enc_asciicompat(enc)) {
00985         c = 0;
00986         while (p < e) {
00987             if (ISASCII(*p)) {
00988                 q = search_nonascii(p, e);
00989                 if (!q) {
00990                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
00991                     return c + (e - p);
00992                 }
00993                 c += q - p;
00994                 p = q;
00995             }
00996             ret = rb_enc_precise_mbclen(p, e, enc);
00997             if (MBCLEN_CHARFOUND_P(ret)) {
00998                 *cr |= ENC_CODERANGE_VALID;
00999                 p += MBCLEN_CHARFOUND_LEN(ret);
01000             }
01001             else {
01002                 *cr = ENC_CODERANGE_BROKEN;
01003                 p++;
01004             }
01005             c++;
01006         }
01007         if (!*cr) *cr = ENC_CODERANGE_7BIT;
01008         return c;
01009     }
01010 
01011     for (c=0; p<e; c++) {
01012         ret = rb_enc_precise_mbclen(p, e, enc);
01013         if (MBCLEN_CHARFOUND_P(ret)) {
01014             *cr |= ENC_CODERANGE_VALID;
01015             p += MBCLEN_CHARFOUND_LEN(ret);
01016         }
01017         else {
01018             *cr = ENC_CODERANGE_BROKEN;
01019             if (p + rb_enc_mbminlen(enc) <= e)
01020                 p += rb_enc_mbminlen(enc);
01021             else
01022                 p = e;
01023         }
01024     }
01025     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01026     return c;
01027 }
01028 
#ifdef NONASCII_MASK
/* True for every byte whose top two bits are not 10, i.e. every byte that
 * is not a UTF-8 continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts, in the VALUE-sized word at s, the bytes that satisfy
 * is_utf8_lead_byte(), all bytes at once.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE w = *s;

    /* per byte: bit 6 becomes (bit6 | !bit7), which is 0 only for 10xxxxxx */
    w |= ~(w>>1);
    /* bring that bit down to bit 0 of each byte and drop everything else */
    w >>= 6;
    w &= NONASCII_MASK >> 7;
    /* add the per-byte 0/1 values together into the lowest byte */
    w += (w>>8);
    w += (w>>16);
#if SIZEOF_VALUE == 8
    w += (w>>32);
#endif
    /* the total is at most SIZEOF_VALUE, so four bits are enough */
    return (w&0xF);
}
#endif
01046 
/*
 * Returns the length of str in characters.
 *
 * enc may be NULL, in which case the string's own encoding is used.
 * When single_byte_optimizable(str) holds the byte length is returned
 * directly.  When NONASCII_MASK is available and str is UTF-8 with a
 * VALID code range, characters are counted by counting non-continuation
 * bytes.  Otherwise rb_enc_strlen_cr() scans the string and a non-zero
 * code range it reports is stored back on str.
 */
01047 static long
01048 str_strlen(VALUE str, rb_encoding *enc)
01049 {
01050     const char *p, *e;
01051     long n;
01052     int cr;
01053
01054     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01055     if (!enc) enc = STR_ENC_GET(str);
01056     p = RSTRING_PTR(str);
01057     e = RSTRING_END(str);
01058     cr = ENC_CODERANGE(str);
01059 #ifdef NONASCII_MASK
01060     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01061         enc == rb_utf8_encoding()) {
01062
01063         VALUE len = 0;
    /* word-wise counting is only attempted for more than two words of data */
01064         if ((int)sizeof(VALUE) * 2 < e - p) {
01065             const VALUE *s, *t;
01066             const VALUE lowbits = sizeof(VALUE) - 1;
    /* s: p rounded up to a word boundary; t: e rounded down to one */
01067             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01068             t = (const VALUE*)(~lowbits & (VALUE)e);
    /* unaligned head, byte by byte */
01069             while (p < (const char *)s) {
01070                 if (is_utf8_lead_byte(*p)) len++;
01071                 p++;
01072             }
    /* aligned middle, one word per iteration */
01073             while (s < t) {
01074                 len += count_utf8_lead_bytes_with_word(s);
01075                 s++;
01076             }
01077             p = (const char *)s;
01078         }
    /* remaining tail (or the whole string when it was short) */
01079         while (p < e) {
01080             if (is_utf8_lead_byte(*p)) len++;
01081             p++;
01082         }
01083         return (long)len;
01084     }
01085 #endif
01086     n = rb_enc_strlen_cr(p, e, enc, &cr);
    /* remember the code range found by the scan, if any */
01087     if (cr) {
01088         ENC_CODERANGE_SET(str, cr);
01089     }
01090     return n;
01091 }
01092 
01093 long
01094 rb_str_strlen(VALUE str)
01095 {
01096     return str_strlen(str, STR_ENC_GET(str));
01097 }
01098 
01099 /*
01100  *  call-seq:
01101  *     str.length   -> integer
01102  *     str.size     -> integer
01103  *
01104  *  Returns the character length of <i>str</i>.
01105  */
01106 
01107 VALUE
01108 rb_str_length(VALUE str)
01109 {
01110     long len;
01111 
01112     len = str_strlen(str, STR_ENC_GET(str));
01113     return LONG2NUM(len);
01114 }
01115 
01116 /*
01117  *  call-seq:
01118  *     str.bytesize  -> integer
01119  *
01120  *  Returns the length of <i>str</i> in bytes.
01121  */
01122 
01123 static VALUE
01124 rb_str_bytesize(VALUE str)
01125 {
01126     return INT2NUM(RSTRING_LEN(str));
01127 }
01128 
01129 /*
01130  *  call-seq:
01131  *     str.empty?   -> true or false
01132  *
01133  *  Returns <code>true</code> if <i>str</i> has a length of zero.
01134  *
01135  *     "hello".empty?   #=> false
01136  *     "".empty?        #=> true
01137  */
01138 
01139 static VALUE
01140 rb_str_empty(VALUE str)
01141 {
01142     if (RSTRING_LEN(str) == 0)
01143         return Qtrue;
01144     return Qfalse;
01145 }
01146 
01147 /*
01148  *  call-seq:
01149  *     str + other_str   -> new_str
01150  *
01151  *  Concatenation---Returns a new <code>String</code> containing
01152  *  <i>other_str</i> concatenated to <i>str</i>.
01153  *
01154  *     "Hello from " + self.to_s   #=> "Hello from main"
01155  */
01156 
01157 VALUE
01158 rb_str_plus(VALUE str1, VALUE str2)
01159 {
01160     VALUE str3;
01161     rb_encoding *enc;
01162 
01163     StringValue(str2);
01164     enc = rb_enc_check(str1, str2);
01165     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01166     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01167     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01168            RSTRING_PTR(str2), RSTRING_LEN(str2));
01169     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01170 
01171     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01172         OBJ_TAINT(str3);
01173     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01174                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01175     return str3;
01176 }
01177 
01178 /*
01179  *  call-seq:
01180  *     str * integer   -> new_str
01181  *
01182  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
01183  *  the receiver.
01184  *
01185  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
01186  */
01187 
01188 VALUE
01189 rb_str_times(VALUE str, VALUE times)
01190 {
01191     VALUE str2;
01192     long n, len;
01193     char *ptr2;
01194 
01195     len = NUM2LONG(times);
01196     if (len < 0) {
01197         rb_raise(rb_eArgError, "negative argument");
01198     }
01199     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
01200         rb_raise(rb_eArgError, "argument too big");
01201     }
01202 
01203     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01204     ptr2 = RSTRING_PTR(str2);
01205     if (len) {
01206         n = RSTRING_LEN(str);
01207         memcpy(ptr2, RSTRING_PTR(str), n);
01208         while (n <= len/2) {
01209             memcpy(ptr2 + n, ptr2, n);
01210             n *= 2;
01211         }
01212         memcpy(ptr2 + n, ptr2, len-n);
01213     }
01214     ptr2[RSTRING_LEN(str2)] = '\0';
01215     OBJ_INFECT(str2, str);
01216     rb_enc_cr_str_copy_for_substr(str2, str);
01217 
01218     return str2;
01219 }
01220 
01221 /*
01222  *  call-seq:
01223  *     str % arg   -> new_str
01224  *
01225  *  Format---Uses <i>str</i> as a format specification, and returns the result
01226  *  of applying it to <i>arg</i>. If the format specification contains more than
01227  *  one substitution, then <i>arg</i> must be an <code>Array</code> containing
01228  *  the values to be substituted. See <code>Kernel::sprintf</code> for details
01229  *  of the format string.
01230  *
01231  *     "%05d" % 123                              #=> "00123"
01232  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
01233  */
01234 
01235 static VALUE
01236 rb_str_format_m(VALUE str, VALUE arg)
01237 {
01238     volatile VALUE tmp = rb_check_array_type(arg);
01239 
01240     if (!NIL_P(tmp)) {
01241         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01242     }
01243     return rb_str_format(1, &arg, str);
01244 }
01245 
01246 static inline void
01247 str_modifiable(VALUE str)
01248 {
01249     if (FL_TEST(str, STR_TMPLOCK)) {
01250         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01251     }
01252     if (OBJ_FROZEN(str)) rb_error_frozen("string");
01253     if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01254         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01255 }
01256 
01257 static inline int
01258 str_independent(VALUE str)
01259 {
01260     str_modifiable(str);
01261     if (!STR_SHARED_P(str)) return 1;
01262     if (STR_EMBED_P(str)) return 1;
01263     return 0;
01264 }
01265 
01266 static void
01267 str_make_independent(VALUE str)
01268 {
01269     char *ptr;
01270     long len = RSTRING_LEN(str);
01271 
01272     ptr = ALLOC_N(char, len+1);
01273     if (RSTRING_PTR(str)) {
01274         memcpy(ptr, RSTRING_PTR(str), len);
01275     }
01276     STR_SET_NOEMBED(str);
01277     ptr[len] = 0;
01278     RSTRING(str)->as.heap.ptr = ptr;
01279     RSTRING(str)->as.heap.len = len;
01280     RSTRING(str)->as.heap.aux.capa = len;
01281     STR_UNSET_NOCAPA(str);
01282 }
01283 
01284 void
01285 rb_str_modify(VALUE str)
01286 {
01287     if (!str_independent(str))
01288         str_make_independent(str);
01289     ENC_CODERANGE_CLEAR(str);
01290 }
01291 
01292 /* As rb_str_modify(), but don't clear coderange */
01293 static void
01294 str_modify_keep_cr(VALUE str)
01295 {
01296     if (!str_independent(str))
01297         str_make_independent(str);
01298     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01299         /* Force re-scan later */
01300         ENC_CODERANGE_CLEAR(str);
01301 }
01302 
01303 static inline void
01304 str_discard(VALUE str)
01305 {
01306     str_modifiable(str);
01307     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01308         xfree(RSTRING_PTR(str));
01309         RSTRING(str)->as.heap.ptr = 0;
01310         RSTRING(str)->as.heap.len = 0;
01311     }
01312 }
01313 
/*
 * Attaches the array add to str as its associated object.
 *
 * Raises if str is frozen.  If str already carries an association, add
 * is concatenated onto the existing array.  Otherwise str is turned into
 * a non-embedded, unshared string, flagged STR_ASSOC, and add is stored
 * in as.heap.aux.shared.
 */
01314 void
01315 rb_str_associate(VALUE str, VALUE add)
01316 {
01317     /* sanity check */
01318     if (OBJ_FROZEN(str)) rb_error_frozen("string");
01319     if (STR_ASSOC_P(str)) {
01320         /* already associated */
01321         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01322     }
01323     else {
01324         if (STR_SHARED_P(str)) {
    /* remember the shared source before str gets its own buffer */
01325             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01326             str_make_independent(str);
    /* if the source itself had an association, extend and reuse its array */
01327             if (STR_ASSOC_P(assoc)) {
01328                 assoc = RSTRING(assoc)->as.heap.aux.shared;
01329                 rb_ary_concat(assoc, add);
01330                 add = assoc;
01331             }
01332         }
01333         else if (STR_EMBED_P(str)) {
01334             str_make_independent(str);
01335         }
01336         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
    /* NOTE(review): aux.capa is about to be replaced by aux.shared below,
     * so the capacity is first made equal to the length -- presumably
     * because aux is a union; confirm against the RString definition */
01337             RESIZE_CAPA(str, RSTRING_LEN(str));
01338         }
01339         FL_SET(str, STR_ASSOC);
    /* NOTE(review): clearing klass presumably hides the array from Ruby
     * code -- confirm */
01340         RBASIC(add)->klass = 0;
01341         RSTRING(str)->as.heap.aux.shared = add;
01342     }
01343 }
01344 
01345 VALUE
01346 rb_str_associated(VALUE str)
01347 {
01348     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01349     if (STR_ASSOC_P(str)) {
01350         return RSTRING(str)->as.heap.aux.shared;
01351     }
01352     return Qfalse;
01353 }
01354 
01355 VALUE
01356 rb_string_value(volatile VALUE *ptr)
01357 {
01358     VALUE s = *ptr;
01359     if (TYPE(s) != T_STRING) {
01360         s = rb_str_to_str(s);
01361         *ptr = s;
01362     }
01363     return s;
01364 }
01365 
01366 char *
01367 rb_string_value_ptr(volatile VALUE *ptr)
01368 {
01369     VALUE str = rb_string_value(ptr);
01370     return RSTRING_PTR(str);
01371 }
01372 
01373 char *
01374 rb_string_value_cstr(volatile VALUE *ptr)
01375 {
01376     VALUE str = rb_string_value(ptr);
01377     char *s = RSTRING_PTR(str);
01378     long len = RSTRING_LEN(str);
01379 
01380     if (!s || memchr(s, 0, len)) {
01381         rb_raise(rb_eArgError, "string contains null byte");
01382     }
01383     if (s[len]) rb_str_modify(str);
01384     return s;
01385 }
01386 
01387 VALUE
01388 rb_check_string_type(VALUE str)
01389 {
01390     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01391     return str;
01392 }
01393 
01394 /*
01395  *  call-seq:
01396  *     String.try_convert(obj) -> string or nil
01397  *
01398  *  Try to convert <i>obj</i> into a String, using to_str method.
01399  *  Returns converted string or nil if <i>obj</i> cannot be converted
01400  *  for any reason.
01401  *
01402  *     String.try_convert("str")     #=> "str"
01403  *     String.try_convert(/re/)      #=> nil
01404  */
01405 static VALUE
01406 rb_str_s_try_convert(VALUE dummy, VALUE str)
01407 {
01408     return rb_check_string_type(str);
01409 }
01410 
/*
 * Returns a pointer to the start of character number nth (counting from
 * zero) within [p, e) under encoding enc.  The result never exceeds e.
 *
 * Single-byte and fixed-width encodings use pointer arithmetic;
 * ASCII-compatible encodings skip ASCII runs with search_nonascii();
 * all other encodings step one character at a time with rb_enc_mbclen().
 */
01411 char*
01412 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01413 {
01414     if (rb_enc_mbmaxlen(enc) == 1) {
01415         p += nth;
01416     }
01417     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01418         p += nth * rb_enc_mbmaxlen(enc);
01419     }
01420     else if (rb_enc_asciicompat(enc)) {
01421         const char *p2, *e2;
01422         int n;
01423
01424         while (p < e && 0 < nth) {
    /* e2 is where the target would be if every remaining character were
     * one byte long; the search below is bounded by it */
01425             e2 = p + nth;
01426             if (e < e2)
01427                 return (char *)e;
01428             if (ISASCII(*p)) {
    /* NOTE(review): a null result is taken to mean that everything up to
     * e2 is ASCII, so the target is exactly e2 -- confirm against
     * search_nonascii() */
01429                 p2 = search_nonascii(p, e2);
01430                 if (!p2)
01431                     return (char *)e2;
01432                 nth -= p2 - p;
01433                 p = p2;
01434             }
    /* step over one non-ASCII character */
01435             n = rb_enc_mbclen(p, e, enc);
01436             p += n;
01437             nth--;
01438         }
    /* ran out of bytes before reaching the requested character */
01439         if (nth != 0)
01440             return (char *)e;
01441         return (char *)p;
01442     }
01443     else {
01444         while (p<e && nth--) {
01445             p += rb_enc_mbclen(p, e, enc);
01446         }
01447     }
    /* clamp results of the arithmetic paths to the end of the buffer */
01448     if (p > e) p = e;
01449     return (char*)p;
01450 }
01451 
01452 static char*
01453 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01454 {
01455     if (singlebyte)
01456         p += nth;
01457     else {
01458         p = rb_enc_nth(p, e, nth, enc);
01459     }
01460     if (!p) return 0;
01461     if (p > e) p = e;
01462     return (char *)p;
01463 }
01464 
01465 /* char offset to byte offset */
01466 static long
01467 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01468 {
01469     const char *pp = str_nth(p, e, nth, enc, singlebyte);
01470     if (!pp) return e - p;
01471     return pp - p;
01472 }
01473 
01474 long
01475 rb_str_offset(VALUE str, long pos)
01476 {
01477     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01478                       STR_ENC_GET(str), single_byte_optimizable(str));
01479 }
01480 
01481 #ifdef NONASCII_MASK
01482 static char *
01483 str_utf8_nth(const char *p, const char *e, long nth)
01484 {
01485     if ((int)SIZEOF_VALUE < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01486         const VALUE *s, *t;
01487         const VALUE lowbits = sizeof(VALUE) - 1;
01488         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01489         t = (const VALUE*)(~lowbits & (VALUE)e);
01490         while (p < (const char *)s) {
01491             if (is_utf8_lead_byte(*p)) nth--;
01492             p++;
01493         }
01494         do {
01495             nth -= count_utf8_lead_bytes_with_word(s);
01496             s++;
01497         } while (s < t && (int)sizeof(VALUE) <= nth);
01498         p = (char *)s;
01499     }
01500     while (p < e) {
01501         if (is_utf8_lead_byte(*p)) {
01502             if (nth == 0) break;
01503             nth--;
01504         }
01505         p++;
01506     }
01507     return (char *)p;
01508 }
01509 
/* Byte distance from p to character number nth in the UTF-8 range [p, e). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *target = str_utf8_nth(p, e, nth);

    return target - p;
}
01516 #endif
01517 
01518 /* byte offset to char offset */
01519 long
01520 rb_str_sublen(VALUE str, long pos)
01521 {
01522     if (single_byte_optimizable(str) || pos < 0)
01523         return pos;
01524     else {
01525         char *p = RSTRING_PTR(str);
01526         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01527     }
01528 }
01529 
01530 VALUE
01531 rb_str_subseq(VALUE str, long beg, long len)
01532 {
01533     VALUE str2;
01534 
01535     if (RSTRING_LEN(str) == beg + len &&
01536         RSTRING_EMBED_LEN_MAX < len) {
01537         str2 = rb_str_new_shared(rb_str_new_frozen(str));
01538         rb_str_drop_bytes(str2, beg);
01539     }
01540     else {
01541         str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01542     }
01543 
01544     rb_enc_cr_str_copy_for_substr(str2, str);
01545     OBJ_INFECT(str2, str);
01546 
01547     return str2;
01548 }
01549 
/*
 * Returns the substring of str that starts at character index beg and
 * spans len characters, or Qnil when len is negative or beg lies outside
 * the string.  A negative beg counts from the end.
 *
 * beg and len arrive as character counts; by the time control reaches
 * the sub: label, p points at the first byte of the result and len has
 * been converted to a byte count.
 */
01550 VALUE
01551 rb_str_substr(VALUE str, long beg, long len)
01552 {
01553     rb_encoding *enc = STR_ENC_GET(str);
01554     VALUE str2;
01555     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01556
01557     if (len < 0) return Qnil;
01558     if (!RSTRING_LEN(str)) {
01559         len = 0;
01560     }
    /* single-byte strings: characters and bytes coincide */
01561     if (single_byte_optimizable(str)) {
01562         if (beg > RSTRING_LEN(str)) return Qnil;
01563         if (beg < 0) {
01564             beg += RSTRING_LEN(str);
01565             if (beg < 0) return Qnil;
01566         }
01567         if (beg + len > RSTRING_LEN(str))
01568             len = RSTRING_LEN(str) - beg;
01569         if (len <= 0) {
01570             len = 0;
01571             p = 0;
01572         }
01573         else
01574             p = s + beg;
01575         goto sub;
01576     }
01577     if (beg < 0) {
    /* the slice cannot extend past the end of the string */
01578         if (len > -beg) len = -beg;
    /* when the requested tail is short relative to the string, walk
     * backwards from the end instead of counting every character */
01579         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01580             beg = -beg;
    /* first move e back to the end of the slice ... */
01581             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01582             p = e;
01583             if (!p) return Qnil;
    /* ... then move p back len characters to the start of the slice */
01584             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01585             if (!p) return Qnil;
01586             len = e - p;
01587             goto sub;
01588         }
01589         else {
01590             beg += str_strlen(str, enc);
01591             if (beg < 0) return Qnil;
01592         }
01593     }
01594     else if (beg > 0 && beg > str_strlen(str, enc)) {
01595         return Qnil;
01596     }
01597     if (len == 0) {
01598         p = 0;
01599     }
01600 #ifdef NONASCII_MASK
    /* valid UTF-8: locate the start and the byte length by lead-byte counting */
01601     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01602         enc == rb_utf8_encoding()) {
01603         p = str_utf8_nth(s, e, beg);
01604         len = str_utf8_offset(p, e, len);
01605     }
01606 #endif
    /* fixed-width encoding: plain arithmetic, clamped to the buffer */
01607     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01608         int char_sz = rb_enc_mbmaxlen(enc);
01609
01610         p = s + beg * char_sz;
01611         if (p > e) {
01612             p = e;
01613             len = 0;
01614         }
01615         else if (len * char_sz > e - p)
01616             len = e - p;
01617         else
01618             len *= char_sz;
01619     }
01620     else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
01621         len = 0;
01622     }
01623     else {
01624         len = str_offset(p, e, len, enc, 0);
01625     }
01626   sub:
    /* NOTE(review): on the multibyte paths beg is still a character index
     * (or a leftover loop counter) while len is now in bytes, so the
     * "beg + len" comparison below mixes units -- confirm the intent */
01627     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
    /* long tail slice: build it from rb_str_new4()/str_new3() and point
     * it at the last len bytes */
01628         str2 = rb_str_new4(str);
01629         str2 = str_new3(rb_obj_class(str2), str2);
01630         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01631         RSTRING(str2)->as.heap.len = len;
01632     }
01633     else {
01634         str2 = rb_str_new5(str, p, len);
01635         rb_enc_cr_str_copy_for_substr(str2, str);
01636         OBJ_INFECT(str2, str);
01637     }
01638
01639     return str2;
01640 }
01641 
01642 VALUE
01643 rb_str_freeze(VALUE str)
01644 {
01645     if (STR_ASSOC_P(str)) {
01646         VALUE ary = RSTRING(str)->as.heap.aux.shared;
01647         OBJ_FREEZE(ary);
01648     }
01649     return rb_obj_freeze(str);
01650 }
01651 
/*
 * rb_str_dup_frozen is another name for rb_str_new_frozen.
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined elsewhere; it presumably
 * emits a real rb_str_dup_frozen symbol that forwards to
 * rb_str_new_frozen -- confirm.  The #define makes later uses in this
 * file refer to rb_str_new_frozen directly.
 */
01652 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01653 #define rb_str_dup_frozen rb_str_new_frozen
01654 
01655 VALUE
01656 rb_str_locktmp(VALUE str)
01657 {
01658     if (FL_TEST(str, STR_TMPLOCK)) {
01659         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01660     }
01661     FL_SET(str, STR_TMPLOCK);
01662     return str;
01663 }
01664 
01665 VALUE
01666 rb_str_unlocktmp(VALUE str)
01667 {
01668     if (!FL_TEST(str, STR_TMPLOCK)) {
01669         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01670     }
01671     FL_UNSET(str, STR_TMPLOCK);
01672     return str;
01673 }
01674 
01675 void
01676 rb_str_set_len(VALUE str, long len)
01677 {
01678     rb_str_modify(str);
01679     STR_SET_LEN(str, len);
01680     RSTRING_PTR(str)[len] = '\0';
01681 }
01682 
01683 VALUE
01684 rb_str_resize(VALUE str, long len)
01685 {
01686     long slen;
01687 
01688     if (len < 0) {
01689         rb_raise(rb_eArgError, "negative string size (or size too big)");
01690     }
01691 
01692     rb_str_modify(str);
01693     slen = RSTRING_LEN(str);
01694     if (len != slen) {
01695         if (STR_EMBED_P(str)) {
01696             char *ptr;
01697             if (len <= RSTRING_EMBED_LEN_MAX) {
01698                 STR_SET_EMBED_LEN(str, len);
01699                 RSTRING(str)->as.ary[len] = '\0';
01700                 return str;
01701             }
01702             ptr = ALLOC_N(char,len+1);
01703             MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
01704             RSTRING(str)->as.heap.ptr = ptr;
01705             STR_SET_NOEMBED(str);
01706         }
01707         else if (len <= RSTRING_EMBED_LEN_MAX) {
01708             char *ptr = RSTRING(str)->as.heap.ptr;
01709             STR_SET_EMBED(str);
01710             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, len);
01711             RSTRING(str)->as.ary[len] = '\0';
01712             STR_SET_EMBED_LEN(str, len);
01713             xfree(ptr);
01714             return str;
01715         }
01716         else if (slen < len || slen - len > 1024) {
01717             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01718         }
01719         if (!STR_NOCAPA_P(str)) {
01720             RSTRING(str)->as.heap.aux.capa = len;
01721         }
01722         RSTRING(str)->as.heap.len = len;
01723         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
01724     }
01725     return str;
01726 }
01727 
01728 static VALUE
01729 str_buf_cat(VALUE str, const char *ptr, long len)
01730 {
01731     long capa, total, off = -1;
01732 
01733     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01734         off = ptr - RSTRING_PTR(str);
01735     }
01736     rb_str_modify(str);
01737     if (len == 0) return 0;
01738     if (STR_ASSOC_P(str)) {
01739         FL_UNSET(str, STR_ASSOC);
01740         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01741     }
01742     else if (STR_EMBED_P(str)) {
01743         capa = RSTRING_EMBED_LEN_MAX;
01744     }
01745     else {
01746         capa = RSTRING(str)->as.heap.aux.capa;
01747     }
01748     if (RSTRING_LEN(str) >= LONG_MAX - len) {
01749         rb_raise(rb_eArgError, "string sizes too big");
01750     }
01751     total = RSTRING_LEN(str)+len;
01752     if (capa <= total) {
01753         while (total > capa) {
01754             if (capa + 1 >= LONG_MAX / 2) {
01755                 capa = (total + 4095) / 4096;
01756                 break;
01757             }
01758             capa = (capa + 1) * 2;
01759         }
01760         RESIZE_CAPA(str, capa);
01761     }
01762     if (off != -1) {
01763         ptr = RSTRING_PTR(str) + off;
01764     }
01765     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01766     STR_SET_LEN(str, total);
01767     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
01768 
01769     return str;
01770 }
01771 
/* Appends the NUL-terminated C string ptr to str via str_buf_cat().
 * ptr is expanded twice, so it must not have side effects. */
01772 #define str_buf_cat2(str, ptr) str_buf_cat(str, (ptr), strlen(ptr))
01773 
01774 VALUE
01775 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01776 {
01777     if (len == 0) return str;
01778     if (len < 0) {
01779         rb_raise(rb_eArgError, "negative string size (or size too big)");
01780     }
01781     return str_buf_cat(str, ptr, len);
01782 }
01783 
01784 VALUE
01785 rb_str_buf_cat2(VALUE str, const char *ptr)
01786 {
01787     return rb_str_buf_cat(str, ptr, strlen(ptr));
01788 }
01789 
01790 VALUE
01791 rb_str_cat(VALUE str, const char *ptr, long len)
01792 {
01793     if (len < 0) {
01794         rb_raise(rb_eArgError, "negative string size (or size too big)");
01795     }
01796     if (STR_ASSOC_P(str)) {
01797         rb_str_modify(str);
01798         if (STR_EMBED_P(str)) str_make_independent(str);
01799         REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
01800         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
01801         RSTRING(str)->as.heap.len += len;
01802         RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0'; /* sentinel */
01803         return str;
01804     }
01805 
01806     return rb_str_buf_cat(str, ptr, len);
01807 }
01808 
01809 VALUE
01810 rb_str_cat2(VALUE str, const char *ptr)
01811 {
01812     return rb_str_cat(str, ptr, strlen(ptr));
01813 }
01814 
/*
 * Appends len bytes at ptr, which are in the encoding with index
 * ptr_encindex and have code range ptr_cr, to str, then sets the
 * resulting encoding index and code range on str.
 *
 * ptr_cr may be ENC_CODERANGE_UNKNOWN, in which case the bytes are
 * scanned when the value is needed.  The code range finally used for
 * ptr is stored through ptr_cr_ret when that pointer is non-null.
 * Raises EncCompatError when the two encodings cannot be combined and
 * ArgumentError when len is negative.
 */
01815 static VALUE
01816 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01817     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01818 {
01819     int str_encindex = ENCODING_GET(str);
01820     int res_encindex;
01821     int str_cr, res_cr;
01822     int str_a8 = ENCODING_IS_ASCII8BIT(str);
    /* NOTE(review): treats encoding index 0 as ASCII-8BIT -- confirm */
01823     int ptr_a8 = ptr_encindex == 0;
01824
01825     str_cr = ENC_CODERANGE(str);
01826
01827     if (str_encindex == ptr_encindex) {
    /* same encoding: the result's range cannot be derived when str's own
     * range is unknown, so scanning ptr is skipped in that case */
01828         if (str_cr == ENC_CODERANGE_UNKNOWN ||
01829             (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
01830             ptr_cr = ENC_CODERANGE_UNKNOWN;
01831         }
01832         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01833             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01834         }
01835     }
01836     else {
01837         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01838         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
    /* an encoding that is not ASCII-compatible only combines with an
     * empty counterpart */
01839         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
01840             if (len == 0)
01841                 return str;
01842             if (RSTRING_LEN(str) == 0) {
01843                 rb_str_buf_cat(str, ptr, len);
01844                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01845                 return str;
01846             }
01847             goto incompatible;
01848         }
01849         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01850             ptr_cr = coderange_scan(ptr, len, ptr_enc);
01851         }
01852         if (str_cr == ENC_CODERANGE_UNKNOWN) {
01853             if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
01854                 str_cr = rb_enc_str_coderange(str);
01855             }
01856         }
01857     }
01858     if (ptr_cr_ret)
01859         *ptr_cr_ret = ptr_cr;
01860
    /* different encodings are only acceptable when at least one side is 7BIT */
01861     if (str_encindex != ptr_encindex &&
01862         str_cr != ENC_CODERANGE_7BIT &&
01863         ptr_cr != ENC_CODERANGE_7BIT) {
01864       incompatible:
01865         rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01866             rb_enc_name(rb_enc_from_index(str_encindex)),
01867             rb_enc_name(rb_enc_from_index(ptr_encindex)));
01868     }
01869
    /* choose the encoding index and code range of the result */
01870     if (str_cr == ENC_CODERANGE_UNKNOWN) {
01871         res_encindex = str_encindex;
01872         res_cr = ENC_CODERANGE_UNKNOWN;
01873     }
01874     else if (str_cr == ENC_CODERANGE_7BIT) {
01875         if (ptr_cr == ENC_CODERANGE_7BIT) {
01876             res_encindex = !str_a8 ? str_encindex : ptr_encindex;
01877             res_cr = ENC_CODERANGE_7BIT;
01878         }
01879         else {
01880             res_encindex = ptr_encindex;
01881             res_cr = ptr_cr;
01882         }
01883     }
01884     else if (str_cr == ENC_CODERANGE_VALID) {
01885         res_encindex = str_encindex;
01886         res_cr = str_cr;
01887     }
01888     else { /* str_cr == ENC_CODERANGE_BROKEN */
01889         res_encindex = str_encindex;
01890         res_cr = str_cr;
    /* a non-empty append to a broken string makes the range unknown again */
01891         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01892     }
01893
01894     if (len < 0) {
01895         rb_raise(rb_eArgError, "negative string size (or size too big)");
01896     }
01897     str_buf_cat(str, ptr, len);
01898     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01899     return str;
01900 }
01901 
01902 VALUE
01903 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01904 {
01905     return rb_enc_cr_str_buf_cat(str, ptr, len,
01906         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01907 }
01908 
01909 VALUE
01910 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
01911 {
01912     /* ptr must reference NUL terminated ASCII string. */
01913     int encindex = ENCODING_GET(str);
01914     rb_encoding *enc = rb_enc_from_index(encindex);
01915     if (rb_enc_asciicompat(enc)) {
01916         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
01917             encindex, ENC_CODERANGE_7BIT, 0);
01918     }
01919     else {
01920         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
01921         while (*ptr) {
01922             unsigned int c = (unsigned char)*ptr;
01923             int len = rb_enc_codelen(c, enc);
01924             rb_enc_mbcput(c, buf, enc);
01925             rb_enc_cr_str_buf_cat(str, buf, len,
01926                 encindex, ENC_CODERANGE_VALID, 0);
01927             ptr++;
01928         }
01929         return str;
01930     }
01931 }
01932 
01933 VALUE
01934 rb_str_buf_append(VALUE str, VALUE str2)
01935 {
01936     int str2_cr;
01937 
01938     str2_cr = ENC_CODERANGE(str2);
01939 
01940     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
01941         ENCODING_GET(str2), str2_cr, &str2_cr);
01942 
01943     OBJ_INFECT(str, str2);
01944     ENC_CODERANGE_SET(str2, str2_cr);
01945 
01946     return str;
01947 }
01948 
01949 VALUE
01950 rb_str_append(VALUE str, VALUE str2)
01951 {
01952     rb_encoding *enc;
01953     int cr, cr2;
01954 
01955     StringValue(str2);
01956     if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
01957         long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
01958         enc = rb_enc_check(str, str2);
01959         cr = ENC_CODERANGE(str);
01960         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
01961         rb_str_modify(str);
01962         REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01963         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
01964                RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
01965         RSTRING(str)->as.heap.len = len;
01966         rb_enc_associate(str, enc);
01967         ENC_CODERANGE_SET(str, cr);
01968         OBJ_INFECT(str, str2);
01969         return str;
01970     }
01971     return rb_str_buf_append(str, str2);
01972 }
01973 
01974 
01975 /*
01976  *  call-seq:
01977  *     str << integer       -> str
01978  *     str.concat(integer)  -> str
01979  *     str << obj           -> str
01980  *     str.concat(obj)      -> str
01981  *
01982  *  Append---Concatenates the given object to <i>str</i>. If the object is a
01983  *  <code>Integer</code>, it is considered as a codepoint, and is converted
01984  *  to a character before concatenation.
01985  *
01986  *     a = "hello "
01987  *     a << "world"   #=> "hello world"
01988  *     a.concat(33)   #=> "hello world!"
01989  */
01990 
01991 VALUE
01992 rb_str_concat(VALUE str1, VALUE str2)
01993 {
01994     SIGNED_VALUE lc;
01995 
01996     if (FIXNUM_P(str2)) {
01997         lc = FIX2LONG(str2);
01998         if (lc < 0)
01999             rb_raise(rb_eRangeError, "negative argument");
02000     }
02001     else if (TYPE(str2) == T_BIGNUM) {
02002         if (!RBIGNUM_SIGN(str2))
02003             rb_raise(rb_eRangeError, "negative argument");
02004         lc = rb_big2ulong(str2);
02005     }
02006     else {
02007         return rb_str_append(str1, str2);
02008     }
02009 #if SIZEOF_INT < SIZEOF_VALUE
02010     if ((VALUE)lc > UINT_MAX) {
02011         rb_raise(rb_eRangeError, "%"PRIuVALUE" out of char range", lc);
02012     }
02013 #endif
02014     {
02015         rb_encoding *enc = STR_ENC_GET(str1);
02016         long pos = RSTRING_LEN(str1);
02017         int cr = ENC_CODERANGE(str1);
02018         int c, len;
02019 
02020         if ((len = rb_enc_codelen(c = (int)lc, enc)) <= 0) {
02021             rb_raise(rb_eRangeError, "%u invalid char", c);
02022         }
02023         rb_str_resize(str1, pos+len);
02024         rb_enc_mbcput(c, RSTRING_PTR(str1)+pos, enc);
02025         ENC_CODERANGE_SET(str1, cr);
02026         return str1;
02027     }
02028 }
02029 
02030 st_index_t
02031 rb_memhash(const void *ptr, long len)
02032 {
02033     return st_hash(ptr, len, rb_hash_start(0));
02034 }
02035 
02036 st_index_t
02037 rb_str_hash(VALUE str)
02038 {
02039     int e = ENCODING_GET(str);
02040     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02041         e = 0;
02042     }
02043     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02044 }
02045 
02046 int
02047 rb_str_hash_cmp(VALUE str1, VALUE str2)
02048 {
02049     long len;
02050 
02051     if (!rb_str_comparable(str1, str2)) return 1;
02052     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02053         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02054         return 0;
02055     }
02056     return 1;
02057 }
02058 
02059 /*
02060  * call-seq:
02061  *    str.hash   -> fixnum
02062  *
02063  * Return a hash based on the string's length and content.
02064  */
02065 
02066 static VALUE
02067 rb_str_hash_m(VALUE str)
02068 {
02069     st_index_t hval = rb_str_hash(str);
02070     return INT2FIX(hval);
02071 }
02072 
/* Smaller of a and b; an argument may be evaluated twice, so pass no side effects. */
02073 #define lesser(a,b) (((a)>(b))?(b):(a))
02074 
02075 int
02076 rb_str_comparable(VALUE str1, VALUE str2)
02077 {
02078     int idx1, idx2;
02079     int rc1, rc2;
02080 
02081     if (RSTRING_LEN(str1) == 0) return TRUE;
02082     if (RSTRING_LEN(str2) == 0) return TRUE;
02083     idx1 = ENCODING_GET(str1);
02084     idx2 = ENCODING_GET(str2);
02085     if (idx1 == idx2) return TRUE;
02086     rc1 = rb_enc_str_coderange(str1);
02087     rc2 = rb_enc_str_coderange(str2);
02088     if (rc1 == ENC_CODERANGE_7BIT) {
02089         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02090         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02091             return TRUE;
02092     }
02093     if (rc2 == ENC_CODERANGE_7BIT) {
02094         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02095             return TRUE;
02096     }
02097     return FALSE;
02098 }
02099 
02100 int
02101 rb_str_cmp(VALUE str1, VALUE str2)
02102 {
02103     long len;
02104     int retval;
02105 
02106     len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
02107     retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
02108     if (retval == 0) {
02109         if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
02110             if (!rb_str_comparable(str1, str2)) {
02111                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02112                     return 1;
02113                 return -1;
02114             }
02115             return 0;
02116         }
02117         if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
02118         return -1;
02119     }
02120     if (retval > 0) return 1;
02121     return -1;
02122 }
02123 
02124 /* expect tail call optimization */
02125 static VALUE
02126 str_eql(const VALUE str1, const VALUE str2)
02127 {
02128     const long len = RSTRING_LEN(str1);
02129 
02130     if (len != RSTRING_LEN(str2)) return Qfalse;
02131     if (!rb_str_comparable(str1, str2)) return Qfalse;
02132     if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0)
02133         return Qtrue;
02134     return Qfalse;
02135 }
02136 /*
02137  *  call-seq:
02138  *     str == obj   -> true or false
02139  *
02140  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
02141  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
02142  *  <code><=></code> <i>obj</i> returns zero.
02143  */
02144 
02145 VALUE
02146 rb_str_equal(VALUE str1, VALUE str2)
02147 {
02148     if (str1 == str2) return Qtrue;
02149     if (TYPE(str2) != T_STRING) {
02150         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02151             return Qfalse;
02152         }
02153         return rb_equal(str2, str1);
02154     }
02155     return str_eql(str1, str2);
02156 }
02157 
02158 /*
02159  * call-seq:
02160  *   str.eql?(other)   -> true or false
02161  *
02162  * Two strings are equal if they have the same length and content.
02163  */
02164 
02165 static VALUE
02166 rb_str_eql(VALUE str1, VALUE str2)
02167 {
02168     if (TYPE(str2) != T_STRING) return Qfalse;
02169     return str_eql(str1, str2);
02170 }
02171 
02172 /*
02173  *  call-seq:
02174  *     str <=> other_str   -> -1, 0, +1 or nil
02175  *
02176  *  Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
02177  *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
02178  *  <i>str</i>. If the strings are of different lengths, and the strings are
02179  *  equal when compared up to the shortest length, then the longer string is
02180  *  considered greater than the shorter one. In older versions of Ruby, setting
02181  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
02182  *  in favor of using <code>String#casecmp</code>.
02183  *
02184  *  <code><=></code> is the basis for the methods <code><</code>,
02185  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
02186  *  included from module <code>Comparable</code>.  The method
02187  *  <code>String#==</code> does not use <code>Comparable#==</code>.
02188  *
02189  *     "abcdef" <=> "abcde"     #=> 1
02190  *     "abcdef" <=> "abcdef"    #=> 0
02191  *     "abcdef" <=> "abcdefg"   #=> -1
02192  *     "abcdef" <=> "ABCDEF"    #=> 1
02193  */
02194 
02195 static VALUE
02196 rb_str_cmp_m(VALUE str1, VALUE str2)
02197 {
02198     long result;
02199 
02200     if (TYPE(str2) != T_STRING) {
02201         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02202             return Qnil;
02203         }
02204         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02205             return Qnil;
02206         }
02207         else {
02208             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02209 
02210             if (NIL_P(tmp)) return Qnil;
02211             if (!FIXNUM_P(tmp)) {
02212                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02213             }
02214             result = -FIX2LONG(tmp);
02215         }
02216     }
02217     else {
02218         result = rb_str_cmp(str1, str2);
02219     }
02220     return LONG2NUM(result);
02221 }
02222 
02223 /*
02224  *  call-seq:
02225  *     str.casecmp(other_str)   -> -1, 0, +1 or nil
02226  *
02227  *  Case-insensitive version of <code>String#<=></code>.
02228  *
02229  *     "abcdef".casecmp("abcde")     #=> 1
02230  *     "aBcDeF".casecmp("abcdef")    #=> 0
02231  *     "abcdef".casecmp("abcdefg")   #=> -1
02232  *     "abcdef".casecmp("ABCDEF")    #=> 0
02233  */
02234 
02235 static VALUE
02236 rb_str_casecmp(VALUE str1, VALUE str2)
02237 {
02238     long len;
02239     rb_encoding *enc;
02240     char *p1, *p1end, *p2, *p2end;
02241 
02242     StringValue(str2);
02243     enc = rb_enc_compatible(str1, str2);
02244     if (!enc) {
02245         return Qnil;
02246     }
02247 
02248     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02249     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02250     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02251         while (p1 < p1end && p2 < p2end) {
02252             if (*p1 != *p2) {
02253                 unsigned int c1 = TOUPPER(*p1 & 0xff);
02254                 unsigned int c2 = TOUPPER(*p2 & 0xff);
02255                 if (c1 != c2)
02256                     return INT2FIX(c1 < c2 ? -1 : 1);
02257             }
02258             p1++;
02259             p2++;
02260         }
02261     }
02262     else {
02263         while (p1 < p1end && p2 < p2end) {
02264             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02265             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02266 
02267             if (0 <= c1 && 0 <= c2) {
02268                 c1 = TOUPPER(c1);
02269                 c2 = TOUPPER(c2);
02270                 if (c1 != c2)
02271                     return INT2FIX(c1 < c2 ? -1 : 1);
02272             }
02273             else {
02274                 int r;
02275                 l1 = rb_enc_mbclen(p1, p1end, enc);
02276                 l2 = rb_enc_mbclen(p2, p2end, enc);
02277                 len = l1 < l2 ? l1 : l2;
02278                 r = memcmp(p1, p2, len);
02279                 if (r != 0)
02280                     return INT2FIX(r < 0 ? -1 : 1);
02281                 if (l1 != l2)
02282                     return INT2FIX(l1 < l2 ? -1 : 1);
02283             }
02284             p1 += l1;
02285             p2 += l2;
02286         }
02287     }
02288     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02289     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02290     return INT2FIX(-1);
02291 }
02292 
02293 static long
02294 rb_str_index(VALUE str, VALUE sub, long offset)
02295 {
02296     long pos;
02297     char *s, *sptr, *e;
02298     long len, slen;
02299     rb_encoding *enc;
02300 
02301     enc = rb_enc_check(str, sub);
02302     if (is_broken_string(sub)) {
02303         return -1;
02304     }
02305     len = str_strlen(str, enc);
02306     slen = str_strlen(sub, enc);
02307     if (offset < 0) {
02308         offset += len;
02309         if (offset < 0) return -1;
02310     }
02311     if (len - offset < slen) return -1;
02312     s = RSTRING_PTR(str);
02313     e = s + RSTRING_LEN(str);
02314     if (offset) {
02315         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02316         s += offset;
02317     }
02318     if (slen == 0) return offset;
02319     /* need proceed one character at a time */
02320     sptr = RSTRING_PTR(sub);
02321     slen = RSTRING_LEN(sub);
02322     len = RSTRING_LEN(str) - offset;
02323     for (;;) {
02324         char *t;
02325         pos = rb_memsearch(sptr, slen, s, len, enc);
02326         if (pos < 0) return pos;
02327         t = rb_enc_right_char_head(s, s+pos, e, enc);
02328         if (t == s + pos) break;
02329         if ((len -= t - s) <= 0) return -1;
02330         offset += t - s;
02331         s = t;
02332     }
02333     return pos + offset;
02334 }
02335 
02336 
02337 /*
02338  *  call-seq:
02339  *     str.index(substring [, offset])   -> fixnum or nil
02340  *     str.index(regexp [, offset])      -> fixnum or nil
02341  *
02342  *  Returns the index of the first occurrence of the given <i>substring</i> or
02343  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02344  *  found. If the second parameter is present, it specifies the position in the
02345  *  string to begin the search.
02346  *
02347  *     "hello".index('e')             #=> 1
02348  *     "hello".index('lo')            #=> 3
02349  *     "hello".index('a')             #=> nil
02350  *     "hello".index(?e)              #=> 1
02351  *     "hello".index(/[aeiou]/, -3)   #=> 4
02352  */
02353 
02354 static VALUE
02355 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02356 {
02357     VALUE sub;
02358     VALUE initpos;
02359     long pos;
02360 
02361     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02362         pos = NUM2LONG(initpos);
02363     }
02364     else {
02365         pos = 0;
02366     }
02367     if (pos < 0) {
02368         pos += str_strlen(str, STR_ENC_GET(str));
02369         if (pos < 0) {
02370             if (TYPE(sub) == T_REGEXP) {
02371                 rb_backref_set(Qnil);
02372             }
02373             return Qnil;
02374         }
02375     }
02376 
02377     switch (TYPE(sub)) {
02378       case T_REGEXP:
02379         if (pos > str_strlen(str, STR_ENC_GET(str)))
02380             return Qnil;
02381         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02382                          rb_enc_check(str, sub), single_byte_optimizable(str));
02383 
02384         pos = rb_reg_search(sub, str, pos, 0);
02385         pos = rb_str_sublen(str, pos);
02386         break;
02387 
02388       default: {
02389         VALUE tmp;
02390 
02391         tmp = rb_check_string_type(sub);
02392         if (NIL_P(tmp)) {
02393             rb_raise(rb_eTypeError, "type mismatch: %s given",
02394                      rb_obj_classname(sub));
02395         }
02396         sub = tmp;
02397       }
02398         /* fall through */
02399       case T_STRING:
02400         pos = rb_str_index(str, sub, pos);
02401         pos = rb_str_sublen(str, pos);
02402         break;
02403     }
02404 
02405     if (pos == -1) return Qnil;
02406     return LONG2NUM(pos);
02407 }
02408 
02409 static long
02410 rb_str_rindex(VALUE str, VALUE sub, long pos)
02411 {
02412     long len, slen;
02413     char *s, *sbeg, *e, *t;
02414     rb_encoding *enc;
02415     int singlebyte = single_byte_optimizable(str);
02416 
02417     enc = rb_enc_check(str, sub);
02418     if (is_broken_string(sub)) {
02419         return -1;
02420     }
02421     len = str_strlen(str, enc);
02422     slen = str_strlen(sub, enc);
02423     /* substring longer than string */
02424     if (len < slen) return -1;
02425     if (len - pos < slen) {
02426         pos = len - slen;
02427     }
02428     if (len == 0) {
02429         return pos;
02430     }
02431     sbeg = RSTRING_PTR(str);
02432     e = RSTRING_END(str);
02433     t = RSTRING_PTR(sub);
02434     slen = RSTRING_LEN(sub);
02435     for (;;) {
02436         s = str_nth(sbeg, e, pos, enc, singlebyte);
02437         if (!s) return -1;
02438         if (memcmp(s, t, slen) == 0) {
02439             return pos;
02440         }
02441         if (pos == 0) break;
02442         pos--;
02443     }
02444     return -1;
02445 }
02446 
02447 
02448 /*
02449  *  call-seq:
02450  *     str.rindex(substring [, fixnum])   -> fixnum or nil
02451  *     str.rindex(regexp [, fixnum])   -> fixnum or nil
02452  *
02453  *  Returns the index of the last occurrence of the given <i>substring</i> or
02454  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02455  *  found. If the second parameter is present, it specifies the position in the
02456  *  string to end the search---characters beyond this point will not be
02457  *  considered.
02458  *
02459  *     "hello".rindex('e')             #=> 1
02460  *     "hello".rindex('l')             #=> 3
02461  *     "hello".rindex('a')             #=> nil
02462  *     "hello".rindex(?e)              #=> 1
02463  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
02464  */
02465 
02466 static VALUE
02467 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02468 {
02469     VALUE sub;
02470     VALUE vpos;
02471     rb_encoding *enc = STR_ENC_GET(str);
02472     long pos, len = str_strlen(str, enc);
02473 
02474     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02475         pos = NUM2LONG(vpos);
02476         if (pos < 0) {
02477             pos += len;
02478             if (pos < 0) {
02479                 if (TYPE(sub) == T_REGEXP) {
02480                     rb_backref_set(Qnil);
02481                 }
02482                 return Qnil;
02483             }
02484         }
02485         if (pos > len) pos = len;
02486     }
02487     else {
02488         pos = len;
02489     }
02490 
02491     switch (TYPE(sub)) {
02492       case T_REGEXP:
02493         /* enc = rb_get_check(str, sub); */
02494         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02495                          STR_ENC_GET(str), single_byte_optimizable(str));
02496 
02497         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02498             pos = rb_reg_search(sub, str, pos, 1);
02499             pos = rb_str_sublen(str, pos);
02500         }
02501         if (pos >= 0) return LONG2NUM(pos);
02502         break;
02503 
02504       default: {
02505         VALUE tmp;
02506 
02507         tmp = rb_check_string_type(sub);
02508         if (NIL_P(tmp)) {
02509             rb_raise(rb_eTypeError, "type mismatch: %s given",
02510                      rb_obj_classname(sub));
02511         }
02512         sub = tmp;
02513       }
02514         /* fall through */
02515       case T_STRING:
02516         pos = rb_str_rindex(str, sub, pos);
02517         if (pos >= 0) return LONG2NUM(pos);
02518         break;
02519     }
02520     return Qnil;
02521 }
02522 
02523 /*
02524  *  call-seq:
02525  *     str =~ obj   -> fixnum or nil
02526  *
02527  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
02528  *  against <i>str</i>,and returns the position the match starts, or
02529  *  <code>nil</code> if there is no match. Otherwise, invokes
02530  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
02531  *  <code>=~</code> in <code>Object</code> returns <code>false</code>.
02532  *
02533  *     "cat o' 9 tails" =~ /\d/   #=> 7
02534  *     "cat o' 9 tails" =~ 9      #=> nil
02535  */
02536 
02537 static VALUE
02538 rb_str_match(VALUE x, VALUE y)
02539 {
02540     switch (TYPE(y)) {
02541       case T_STRING:
02542         rb_raise(rb_eTypeError, "type mismatch: String given");
02543 
02544       case T_REGEXP:
02545         return rb_reg_match(y, x);
02546 
02547       default:
02548         return rb_funcall(y, rb_intern("=~"), 1, x);
02549     }
02550 }
02551 
02552 
/* Forward declaration of a file-local helper; rb_str_match_m() below calls it. */
02553 static VALUE get_pat(VALUE, int);
02554 
02555 
02556 /*
02557  *  call-seq:
02558  *     str.match(pattern)        -> matchdata or nil
02559  *     str.match(pattern, pos)   -> matchdata or nil
02560  *
02561  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
02562  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
02563  *  parameter is present, it specifies the position in the string to begin the
02564  *  search.
02565  *  If the second parameter is present, it specifies the position in the string
02566  *  to begin the search.
02567  *
02568  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
02569  *     'hello'.match('(.)\1')[0]   #=> "ll"
02570  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
02571  *     'hello'.match('xx')         #=> nil
02572  *
02573  *  If a block is given, invoke the block with MatchData if match succeed, so
02574  *  that you can write
02575  *
02576  *     str.match(pat) {|m| ...}
02577  *
02578  *  instead of
02579  *
02580  *     if m = str.match(pat)
02581  *       ...
02582  *     end
02583  *
02584  *  The return value is a value from block execution in this case.
02585  */
02586 
02587 static VALUE
02588 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02589 {
02590     VALUE re, result;
02591     if (argc < 1)
02592        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02593     re = argv[0];
02594     argv[0] = str;
02595     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02596     if (!NIL_P(result) && rb_block_given_p()) {
02597         return rb_yield(result);
02598     }
02599     return result;
02600 }
02601 
/* Result of trying to step a character to a neighboring one. */
02602 enum neighbor_char {
02603     NEIGHBOR_NOT_CHAR,   /* no usable neighbor: not alnum, or a one-character range */
02604     NEIGHBOR_FOUND,      /* the character was replaced by its neighbor */
02605     NEIGHBOR_WRAPPED     /* stepping ran past the end of the range or byte space */
02606 };
02607 
/*
 * Advances the len-byte sequence at p, in place, until
 * rb_enc_precise_mbclen() accepts it as one complete character of exactly
 * len bytes in enc.  Returns NEIGHBOR_FOUND in that case, or
 * NEIGHBOR_WRAPPED when every remaining byte was 0xff and the sequence
 * rolled over to all zeros.
 */
02608 static enum neighbor_char
02609 enc_succ_char(char *p, long len, rb_encoding *enc)
02610 {
02611     long i;
02612     int l;
02613     while (1) {
          /* increment like a big-endian counter: trailing 0xff bytes roll to 0 */
02614         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02615             p[i] = '\0';
02616         if (i < 0)
02617             return NEIGHBOR_WRAPPED;
02618         ++((unsigned char*)p)[i];
02619         l = rb_enc_precise_mbclen(p, p+len, enc);
02620         if (MBCLEN_CHARFOUND_P(l)) {
02621             l = MBCLEN_CHARFOUND_LEN(l);
02622             if (l == len) {
02623                 return NEIGHBOR_FOUND;
02624             }
02625             else {
                  /* a shorter character was found: max out the tail so the
                     next pass carries into the leading bytes */
02626                 memset(p+l, 0xff, len-l);
02627             }
02628         }
02629         if (MBCLEN_INVALID_P(l) && i < len-1) {
              /* invalid after bumping a non-final byte: look for the longest
                 prefix that is not invalid and max out the bytes after it */
02630             long len2;
02631             int l2;
02632             for (len2 = len-1; 0 < len2; len2--) {
02633                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02634                 if (!MBCLEN_INVALID_P(l2))
02635                     break;
02636             }
02637             memset(p+len2+1, 0xff, len-(len2+1));
02638         }
02639     }
02640 }
02641 
/*
 * Counterpart of enc_succ_char(): steps the len-byte sequence at p
 * backwards, in place, until rb_enc_precise_mbclen() accepts it as one
 * complete character of exactly len bytes in enc.  Returns NEIGHBOR_FOUND
 * in that case, or NEIGHBOR_WRAPPED when every remaining byte was 0 and
 * the sequence rolled over to all 0xff.
 */
02642 static enum neighbor_char
02643 enc_pred_char(char *p, long len, rb_encoding *enc)
02644 {
02645     long i;
02646     int l;
02647     while (1) {
          /* decrement like a big-endian counter: trailing 0 bytes roll to 0xff */
02648         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02649             p[i] = '\xff';
02650         if (i < 0)
02651             return NEIGHBOR_WRAPPED;
02652         --((unsigned char*)p)[i];
02653         l = rb_enc_precise_mbclen(p, p+len, enc);
02654         if (MBCLEN_CHARFOUND_P(l)) {
02655             l = MBCLEN_CHARFOUND_LEN(l);
02656             if (l == len) {
02657                 return NEIGHBOR_FOUND;
02658             }
02659             else {
                  /* a shorter character was found: zero the tail so the
                     next pass borrows from the leading bytes */
02660                 memset(p+l, 0, len-l);
02661             }
02662         }
02663         if (MBCLEN_INVALID_P(l) && i < len-1) {
              /* invalid after lowering a non-final byte: look for the longest
                 prefix that is not invalid and zero the bytes after it */
02664             long len2;
02665             int l2;
02666             for (len2 = len-1; 0 < len2; len2--) {
02667                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02668                 if (!MBCLEN_INVALID_P(l2))
02669                     break;
02670             }
02671             memset(p+len2+1, 0, len-(len2+1));
02672         }
02673     }
02674 }
02675 
02676 /*
02677   Overwrites the character at +p+ with the succeeding letter in +enc+ and
02678   returns NEIGHBOR_FOUND, or NEIGHBOR_WRAPPED when the successor leaves the
02679   character class (digit or alphabetic) of the original character.
02680   When NEIGHBOR_WRAPPED, +p+ holds the first character of the range and the
02681   carried-out letter is stored into +carry+.  Assumes each range is
02682   contiguous and that mbclen never changes within a range.
02683   NEIGHBOR_NOT_CHAR is returned if the character is neither a digit nor
        alphabetic, or if the range has only one character.
02684  */
02685 static enum neighbor_char
02686 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02687 {
02688     enum neighbor_char ret;
02689     unsigned int c;
02690     int ctype;
02691     int range;
02692     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02693 
      /* classify the character; only digits and alphabetics are handled */
02694     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02695     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02696         ctype = ONIGENC_CTYPE_DIGIT;
02697     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02698         ctype = ONIGENC_CTYPE_ALPHA;
02699     else
02700         return NEIGHBOR_NOT_CHAR;
02701 
      /* try the plain successor; accept it if it stays in the same class */
02702     MEMCPY(save, p, char, len);
02703     ret = enc_succ_char(p, len, enc);
02704     if (ret == NEIGHBOR_FOUND) {
02705         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02706         if (rb_enc_isctype(c, ctype, enc))
02707             return NEIGHBOR_FOUND;
02708     }
      /* otherwise restore and walk backwards to the first character of the
         range, counting its members in +range+ */
02709     MEMCPY(p, save, char, len);
02710     range = 1;
02711     while (1) {
02712         MEMCPY(save, p, char, len);
02713         ret = enc_pred_char(p, len, enc);
02714         if (ret == NEIGHBOR_FOUND) {
02715             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02716             if (!rb_enc_isctype(c, ctype, enc)) {
02717                 MEMCPY(p, save, char, len);
02718                 break;
02719             }
02720         }
02721         else {
02722             MEMCPY(p, save, char, len);
02723             break;
02724         }
02725         range++;
02726     }
02727     if (range == 1) {
02728         return NEIGHBOR_NOT_CHAR;
02729     }
02730 
      /* alphabetic: the carry is the first character of the range */
02731     if (ctype != ONIGENC_CTYPE_DIGIT) {
02732         MEMCPY(carry, p, char, len);
02733         return NEIGHBOR_WRAPPED;
02734     }
02735 
      /* digit: the carry is the successor of the first character of the range */
02736     MEMCPY(carry, p, char, len);
02737     enc_succ_char(carry, len, enc);
02738     return NEIGHBOR_WRAPPED;
02739 }
02740 
02741 
02742 /*
02743  *  call-seq:
02744  *     str.succ   -> new_str
02745  *     str.next   -> new_str
02746  *
02747  *  Returns the successor to <i>str</i>. The successor is calculated by
02748  *  incrementing characters starting from the rightmost alphanumeric (or
02749  *  the rightmost character if there are no alphanumerics) in the
02750  *  string. Incrementing a digit always results in another digit, and
02751  *  incrementing a letter results in another letter of the same case.
02752  *  Incrementing nonalphanumerics uses the underlying character set's
02753  *  collating sequence.
02754  *
02755  *  If the increment generates a ``carry,'' the character to the left of
02756  *  it is incremented. This process repeats until there is no carry,
02757  *  adding an additional character if necessary.
02758  *
02759  *     "abcd".succ        #=> "abce"
02760  *     "THX1138".succ     #=> "THX1139"
02761  *     "<<koala>>".succ   #=> "<<koalb>>"
02762  *     "1999zzz".succ     #=> "2000aaa"
02763  *     "ZZZ9999".succ     #=> "AAAA0000"
02764  *     "***".succ         #=> "**+"
02765  */
02766 
02767 VALUE
02768 rb_str_succ(VALUE orig)
02769 {
02770     rb_encoding *enc;
02771     VALUE str;
02772     char *sbeg, *s, *e, *last_alnum = 0;
      /* c stays -1 until some alphanumeric character has wrapped */
02773     int c = -1;
02774     long l;
      /* default carry is the single byte 0x01, inserted at offset 0 */
02775     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02776     long carry_pos = 0, carry_len = 1;
02777     enum neighbor_char neighbor = NEIGHBOR_FOUND;
02778 
      /* work on a copy of orig; an empty string is returned unchanged */
02779     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02780     rb_enc_cr_str_copy_for_substr(str, orig);
02781     OBJ_INFECT(str, orig);
02782     if (RSTRING_LEN(str) == 0) return str;
02783 
02784     enc = STR_ENC_GET(orig);
02785     sbeg = RSTRING_PTR(str);
02786     s = e = sbeg + RSTRING_LEN(str);
02787 
      /* first pass: walk the characters from the end, incrementing alphanumerics */
02788     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
          /* the previous character was not alnum and an alnum has already
             wrapped: if this character is of the other kind (digit vs.
             letter), stop and put the carry at the last wrapped alnum */
02789         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02790             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02791                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02792                 s = last_alnum;
02793                 break;
02794             }
02795         }
          /* skip positions where no complete character is found */
02796         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02797         neighbor = enc_succ_alnum_char(s, l, enc, carry);
02798         switch (neighbor) {
02799           case NEIGHBOR_NOT_CHAR:
02800             continue;
02801           case NEIGHBOR_FOUND:
02802             return str;
02803           case NEIGHBOR_WRAPPED:
02804             last_alnum = s;
02805             break;
02806         }
          /* wrapped: remember where the carry would have to be inserted */
02807         c = 1;
02808         carry_pos = s - sbeg;
02809         carry_len = l;
02810     }
02811     if (c == -1) {              /* str contains no alnum */
          /* second pass: increment the raw characters from the end instead */
02812         s = e;
02813         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02814             enum neighbor_char neighbor;
02815             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02816             neighbor = enc_succ_char(s, l, enc);
02817             if (neighbor == NEIGHBOR_FOUND)
02818                 return str;
02819             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02820                 /* wrapped to \0...\0.  search next valid char. */
02821                 enc_succ_char(s, l, enc);
02822             }
              /* for non-ASCII-compatible encodings the carry is a copy of
                 the wrapped character rather than the default byte */
02823             if (!rb_enc_asciicompat(enc)) {
02824                 MEMCPY(carry, s, char, l);
02825                 carry_len = l;
02826             }
02827             carry_pos = s - sbeg;
02828         }
02829     }
      /* every candidate wrapped: grow the string and insert the carry at carry_pos */
02830     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
02831     s = RSTRING_PTR(str) + carry_pos;
02832     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02833     memmove(s, carry, carry_len);
02834     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02835     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02836     rb_enc_str_coderange(str);
02837     return str;
02838 }
02839 
02840 
02841 /*
02842  *  call-seq:
02843  *     str.succ!   -> str
02844  *     str.next!   -> str
02845  *
02846  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
02847  *  place.
02848  */
02849 
02850 static VALUE
02851 rb_str_succ_bang(VALUE str)
02852 {
02853     rb_str_shared_replace(str, rb_str_succ(str));
02854 
02855     return str;
02856 }
02857 
02858 
02859 /*
02860  *  call-seq:
02861  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
02862  *     str.upto(other_str, exclusive=false)                -> an_enumerator
02863  *
02864  *  Iterates through successive values, starting at <i>str</i> and
02865  *  ending at <i>other_str</i> inclusive, passing each value in turn to
02866  *  the block. The <code>String#succ</code> method is used to generate
02867  *  each value.  If optional second argument exclusive is omitted or is false,
02868  *  the last value will be included; otherwise it will be excluded.
02869  *
02870  *  If no block is given, an enumerator is returned instead.
02871  *
02872  *     "a8".upto("b6") {|s| print s, ' ' }
02873  *     for s in "a8".."b6"
02874  *       print s, ' '
02875  *     end
02876  *
02877  *  <em>produces:</em>
02878  *
02879  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
02880  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
02881  *
02882  *  If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
02883  *  both are recognized as decimal numbers. In addition, the width of
02884  *  string (e.g. leading zeros) is handled appropriately.
02885  *
02886  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
02887  *     "25".upto("5").to_a   #=> []
02888  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
02889  */
02890 
02891 static VALUE
02892 rb_str_upto(int argc, VALUE *argv, VALUE beg)
02893 {
02894     VALUE end, exclusive;
02895     VALUE current, after_end;
02896     ID succ;
02897     int n, excl, ascii;
02898     rb_encoding *enc;
02899 
02900     rb_scan_args(argc, argv, "11", &end, &exclusive);
02901     RETURN_ENUMERATOR(beg, argc, argv);
02902     excl = RTEST(exclusive);
02903     CONST_ID(succ, "succ");
02904     StringValue(end);
02905     enc = rb_enc_check(beg, end);
02906     ascii = (is_ascii_string(beg) && is_ascii_string(end));
02907     /* single character */
02908     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
02909         char c = RSTRING_PTR(beg)[0];
02910         char e = RSTRING_PTR(end)[0];
02911 
02912         if (c > e || (excl && c == e)) return beg;
          /* step the byte directly instead of calling succ */
02913         for (;;) {
02914             rb_yield(rb_enc_str_new(&c, 1, enc));
02915             if (!excl && c == e) break;
02916             c++;
02917             if (excl && c == e) break;
02918         }
02919         return beg;
02920     }
02921     /* both edges are all digits */
02922     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
02923         char *s, *send;
02924         VALUE b, e;
02925         int width;
02926 
          /* width is the byte length of beg; it is used as the printf
             precision below, which keeps leading zeros */
02927         s = RSTRING_PTR(beg); send = RSTRING_END(beg);
02928         width = rb_long2int(send - s);
02929         while (s < send) {
02930             if (!ISDIGIT(*s)) goto no_digits;
02931             s++;
02932         }
02933         s = RSTRING_PTR(end); send = RSTRING_END(end);
02934         while (s < send) {
02935             if (!ISDIGIT(*s)) goto no_digits;
02936             s++;
02937         }
02938         b = rb_str_to_inum(beg, 10, FALSE);
02939         e = rb_str_to_inum(end, 10, FALSE);
02940         if (FIXNUM_P(b) && FIXNUM_P(e)) {
              /* both bounds fit in a Fixnum: count with C longs */
02941             long bi = FIX2LONG(b);
02942             long ei = FIX2LONG(e);
02943             rb_encoding *usascii = rb_usascii_encoding();
02944 
02945             while (bi <= ei) {
02946                 if (excl && bi == ei) break;
02947                 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
02948                 bi++;
02949             }
02950         }
02951         else {
              /* otherwise compare and step through method calls on the numbers */
02952             ID op = excl ? '<' : rb_intern("<=");
02953             VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
02954 
02955             args[0] = INT2FIX(width);
02956             while (rb_funcall(b, op, 1, e)) {
02957                 args[1] = b;
02958                 rb_yield(rb_str_format(numberof(args), args, fmt));
02959                 b = rb_funcall(b, succ, 0, 0);
02960             }
02961         }
02962         return beg;
02963     }
02964     /* normal case */
02965   no_digits:
02966     n = rb_str_cmp(beg, end);
02967     if (n > 0 || (excl && n == 0)) return beg;
02968 
      /* iterate with succ until the value after end, or end itself, is reached */
02969     after_end = rb_funcall(end, succ, 0, 0);
02970     current = rb_str_dup(beg);
02971     while (!rb_str_equal(current, after_end)) {
02972         VALUE next = Qnil;
          /* the successor is computed before yielding; next stays nil, which
             ends the loop, once current equals end in the inclusive case */
02973         if (excl || !rb_str_equal(current, end))
02974             next = rb_funcall(current, succ, 0, 0);
02975         rb_yield(current);
02976         if (NIL_P(next)) break;
02977         current = next;
02978         StringValue(current);
02979         if (excl && rb_str_equal(current, end)) break;
          /* stop once the value is longer than end or has become empty */
02980         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
02981             break;
02982     }
02983 
02984     return beg;
02985 }
02986 
02987 static VALUE
02988 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
02989 {
02990     if (rb_reg_search(re, str, 0, 0) >= 0) {
02991         VALUE match = rb_backref_get();
02992         int nth = rb_reg_backref_number(match, backref);
02993         return rb_reg_nth_match(nth, match);
02994     }
02995     return Qnil;
02996 }
02997 
02998 static VALUE
02999 rb_str_aref(VALUE str, VALUE indx)
03000 {
03001     long idx;
03002 
03003     switch (TYPE(indx)) {
03004       case T_FIXNUM:
03005         idx = FIX2LONG(indx);
03006 
03007       num_index:
03008         str = rb_str_substr(str, idx, 1);
03009         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03010         return str;
03011 
03012       case T_REGEXP:
03013         return rb_str_subpat(str, indx, INT2FIX(0));
03014 
03015       case T_STRING:
03016         if (rb_str_index(str, indx, 0) != -1)
03017             return rb_str_dup(indx);
03018         return Qnil;
03019 
03020       default:
03021         /* check if indx is Range */
03022         {
03023             long beg, len;
03024             VALUE tmp;
03025 
03026             len = str_strlen(str, STR_ENC_GET(str));
03027             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03028               case Qfalse:
03029                 break;
03030               case Qnil:
03031                 return Qnil;
03032               default:
03033                 tmp = rb_str_substr(str, beg, len);
03034                 return tmp;
03035             }
03036         }
03037         idx = NUM2LONG(indx);
03038         goto num_index;
03039     }
03040     return Qnil;                /* not reached */
03041 }
03042 
03043 
03044 /*
03045  *  call-seq:
03046  *     str[fixnum]                 -> new_str or nil
03047  *     str[fixnum, fixnum]         -> new_str or nil
03048  *     str[range]                  -> new_str or nil
03049  *     str[regexp]                 -> new_str or nil
03050  *     str[regexp, fixnum]         -> new_str or nil
03051  *     str[other_str]              -> new_str or nil
03052  *     str.slice(fixnum)           -> new_str or nil
03053  *     str.slice(fixnum, fixnum)   -> new_str or nil
03054  *     str.slice(range)            -> new_str or nil
03055  *     str.slice(regexp)           -> new_str or nil
03056  *     str.slice(regexp, fixnum)   -> new_str or nil
03057  *     str.slice(regexp, capname)  -> new_str or nil
03058  *     str.slice(other_str)        -> new_str or nil
03059  *
03060  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
03061  *  substring of one character at that position. If passed two <code>Fixnum</code>
03062  *  objects, returns a substring starting at the offset given by the first, and
03063  *  a length given by the second. If given a range, a substring containing
03064  *  characters at offsets given by the range is returned. In all three cases, if
03065  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
03066  *  <code>nil</code> if the initial offset falls outside the string, the length
03067  *  is negative, or the beginning of the range is greater than the end.
03068  *
03069  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
03070  *  returned. If a numeric or name parameter follows the regular expression, that
03071  *  component of the <code>MatchData</code> is returned instead. If a
03072  *  <code>String</code> is given, that string is returned if it occurs in
03073  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
03074  *  match.
03075  *
03076  *     a = "hello there"
03077  *     a[1]                   #=> "e"
03078  *     a[1,3]                 #=> "ell"
03079  *     a[1..3]                #=> "ell"
03080  *     a[-3,2]                #=> "er"
03081  *     a[-4..-2]              #=> "her"
03082  *     a[12..-1]              #=> nil
03083  *     a[-2..-4]              #=> ""
03084  *     a[/[aeiou](.)\1/]      #=> "ell"
03085  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
03086  *     a[/[aeiou](.)\1/, 1]   #=> "l"
03087  *     a[/[aeiou](.)\1/, 2]   #=> nil
03088  *     a["lo"]                #=> "lo"
03089  *     a["bye"]               #=> nil
03090  */
03091 
03092 static VALUE
03093 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03094 {
03095     if (argc == 2) {
03096         if (TYPE(argv[0]) == T_REGEXP) {
03097             return rb_str_subpat(str, argv[0], argv[1]);
03098         }
03099         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03100     }
03101     if (argc != 1) {
03102         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03103     }
03104     return rb_str_aref(str, argv[0]);
03105 }
03106 
/*
 * Removes the first len bytes of str in place and returns str.
 *
 * len is clamped to the current byte length; a negative len is not
 * checked here.  If what remains fits into the embedded buffer
 * (RSTRING_EMBED_LEN_MAX), str is switched to the embedded representation
 * and the remaining bytes are copied into it.  Otherwise the heap pointer
 * is advanced by len and the heap length reduced.  In both cases the
 * result is NUL-terminated and the cached code range is cleared.
 */
03107 VALUE
03108 rb_str_drop_bytes(VALUE str, long len)
03109 {
03110     char *ptr = RSTRING_PTR(str);
03111     long olen = RSTRING_LEN(str), nlen;
03112
    /* NOTE(review): presumably raises for strings that must not be
     * modified -- confirm against str_modifiable() */
03113     str_modifiable(str);
03114     if (len > olen) len = olen;    /* never drop more than str holds */
03115     nlen = olen - len;
03116     if (nlen <= RSTRING_EMBED_LEN_MAX) {
03117         char *oldptr = ptr;
        /* remember the storage flags before STR_SET_EMBED changes them */
03118         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03119         STR_SET_EMBED(str);
03120         STR_SET_EMBED_LEN(str, nlen);
03121         ptr = RSTRING(str)->as.ary;
        /* memmove: source and destination overlap when str was already
         * embedded (oldptr is then the embedded array itself) */
03122         memmove(ptr, oldptr + len, nlen);
        /* free the old buffer only when it was a heap buffer that was not
         * flagged ELTS_SHARED */
03123         if (fl == STR_NOEMBED) xfree(oldptr);
03124     }
03125     else {
        /* NOTE(review): rb_str_new4()'s result is discarded; it is
         * presumably called so that str ends up sharing its buffer before
         * the pointer is advanced -- confirm against rb_str_new4() */
03126         if (!STR_SHARED_P(str)) rb_str_new4(str);
03127         ptr = RSTRING(str)->as.heap.ptr += len;
03128         RSTRING(str)->as.heap.len = nlen;
03129     }
03130     ptr[nlen] = 0;
03131     ENC_CODERANGE_CLEAR(str);
03132     return str;
03133 }
03134 
/*
 * Byte-level splice: replaces the len bytes of str starting at byte offset
 * beg with the bytes of val, in place.  Callers in this file pass physical
 * (byte) positions: rb_str_splice() converts character positions first and
 * rb_str_subpat_set() passes match register offsets.
 *
 * Encoding and code range are not touched here; OBJ_INFECT() is applied
 * from val to str on every path.
 */
03135 static void
03136 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03137 {
    /* deleting a prefix (nothing to insert, starting at 0) is delegated to
     * rb_str_drop_bytes() */
03138     if (beg == 0 && RSTRING_LEN(val) == 0) {
03139         rb_str_drop_bytes(str, len);
03140         OBJ_INFECT(str, val);
03141         return;
03142     }
03143
03144     rb_str_modify(str);
03145     if (len < RSTRING_LEN(val)) {
03146         /* expand string */
03147         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03148     }
03149
    /* move the bytes that follow the replaced region to their new place;
     * nothing to move when the replacement has the same length */
03150     if (RSTRING_LEN(val) != len) {
03151         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03152                 RSTRING_PTR(str) + beg + len,
03153                 RSTRING_LEN(str) - (beg + len));
03154     }
    /* NOTE(review): only reachable with a negative len; rb_str_splice()
     * rejects negative lengths before calling -- confirm whether any
     * caller still relies on this zero fill */
03155     if (RSTRING_LEN(val) < beg && len < 0) {
03156         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03157     }
03158     if (RSTRING_LEN(val) > 0) {
        /* memmove rather than memcpy -- presumably because val's buffer
         * may overlap str's; confirm */
03159         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03160     }
03161     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03162     if (RSTRING_PTR(str)) {
03163         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03164     }
03165     OBJ_INFECT(str, val);
03166 }
03167 
03168 static void
03169 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03170 {
03171     long slen;
03172     char *p, *e;
03173     rb_encoding *enc;
03174     int singlebyte = single_byte_optimizable(str);
03175     int cr;
03176 
03177     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03178 
03179     StringValue(val);
03180     enc = rb_enc_check(str, val);
03181     slen = str_strlen(str, enc);
03182 
03183     if (slen < beg) {
03184       out_of_range:
03185         rb_raise(rb_eIndexError, "index %ld out of string", beg);
03186     }
03187     if (beg < 0) {
03188         if (-beg > slen) {
03189             goto out_of_range;
03190         }
03191         beg += slen;
03192     }
03193     if (slen < len || slen < beg + len) {
03194         len = slen - beg;
03195     }
03196     str_modify_keep_cr(str);
03197     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03198     if (!p) p = RSTRING_END(str);
03199     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03200     if (!e) e = RSTRING_END(str);
03201     /* error check */
03202     beg = p - RSTRING_PTR(str); /* physical position */
03203     len = e - p;                /* physical length */
03204     rb_str_splice_0(str, beg, len, val);
03205     rb_enc_associate(str, enc);
03206     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03207     if (cr != ENC_CODERANGE_BROKEN)
03208         ENC_CODERANGE_SET(str, cr);
03209 }
03210 
03211 void
03212 rb_str_update(VALUE str, long beg, long len, VALUE val)
03213 {
03214     rb_str_splice(str, beg, len, val);
03215 }
03216 
03217 static void
03218 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03219 {
03220     int nth;
03221     VALUE match;
03222     long start, end, len;
03223     rb_encoding *enc;
03224     struct re_registers *regs;
03225 
03226     if (rb_reg_search(re, str, 0, 0) < 0) {
03227         rb_raise(rb_eIndexError, "regexp not matched");
03228     }
03229     match = rb_backref_get();
03230     nth = rb_reg_backref_number(match, backref);
03231     regs = RMATCH_REGS(match);
03232     if (nth >= regs->num_regs) {
03233       out_of_range:
03234         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03235     }
03236     if (nth < 0) {
03237         if (-nth >= regs->num_regs) {
03238             goto out_of_range;
03239         }
03240         nth += regs->num_regs;
03241     }
03242 
03243     start = BEG(nth);
03244     if (start == -1) {
03245         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03246     }
03247     end = END(nth);
03248     len = end - start;
03249     StringValue(val);
03250     enc = rb_enc_check(str, val);
03251     rb_str_splice_0(str, start, len, val);
03252     rb_enc_associate(str, enc);
03253 }
03254 
03255 static VALUE
03256 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03257 {
03258     long idx, beg;
03259 
03260     switch (TYPE(indx)) {
03261       case T_FIXNUM:
03262         idx = FIX2LONG(indx);
03263       num_index:
03264         rb_str_splice(str, idx, 1, val);
03265         return val;
03266 
03267       case T_REGEXP:
03268         rb_str_subpat_set(str, indx, INT2FIX(0), val);
03269         return val;
03270 
03271       case T_STRING:
03272         beg = rb_str_index(str, indx, 0);
03273         if (beg < 0) {
03274             rb_raise(rb_eIndexError, "string not matched");
03275         }
03276         beg = rb_str_sublen(str, beg);
03277         rb_str_splice(str, beg, str_strlen(indx, 0), val);
03278         return val;
03279 
03280       default:
03281         /* check if indx is Range */
03282         {
03283             long beg, len;
03284             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03285                 rb_str_splice(str, beg, len, val);
03286                 return val;
03287             }
03288         }
03289         idx = NUM2LONG(indx);
03290         goto num_index;
03291     }
03292 }
03293 
03294 /*
03295  *  call-seq:
03296  *     str[fixnum] = new_str
03297  *     str[fixnum, fixnum] = new_str
03298  *     str[range] = aString
03299  *     str[regexp] = new_str
03300  *     str[regexp, fixnum] = new_str
03301  *     str[regexp, name] = new_str
03302  *     str[other_str] = new_str
03303  *
03304  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
03305  *  portion of the string affected is determined using the same criteria as
03306  *  <code>String#[]</code>. If the replacement string is not the same length as
03307  *  the text it is replacing, the string will be adjusted accordingly. If the
03308  *  regular expression or string used as the index doesn't match a position
03309  *  in the string, <code>IndexError</code> is raised. If the regular expression
03310  *  form is used, the optional second <code>Fixnum</code> allows you to specify
03311  *  which portion of the match to replace (effectively using the
03312  *  <code>MatchData</code> indexing rules). The forms that take a
03313  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
03314  *  out of range; the <code>Range</code> form will raise a
03315  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
03316  *  forms will raise an <code>IndexError</code> on negative match.
03317  */
03318 
03319 static VALUE
03320 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03321 {
03322     if (argc == 3) {
03323         if (TYPE(argv[0]) == T_REGEXP) {
03324             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03325         }
03326         else {
03327             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03328         }
03329         return argv[2];
03330     }
03331     if (argc != 2) {
03332         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03333     }
03334     return rb_str_aset(str, argv[0], argv[1]);
03335 }
03336 
03337 /*
03338  *  call-seq:
03339  *     str.insert(index, other_str)   -> str
03340  *
03341  *  Inserts <i>other_str</i> before the character at the given
03342  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
03343  *  end of the string, and insert <em>after</em> the given character.
03344  *  The intent is to insert <i>other_str</i> so that it starts at the given
03345  *  <i>index</i>.
03346  *
03347  *     "abcd".insert(0, 'X')    #=> "Xabcd"
03348  *     "abcd".insert(3, 'X')    #=> "abcXd"
03349  *     "abcd".insert(4, 'X')    #=> "abcdX"
03350  *     "abcd".insert(-3, 'X')   #=> "abXcd"
03351  *     "abcd".insert(-1, 'X')   #=> "abcdX"
03352  */
03353 
03354 static VALUE
03355 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03356 {
03357     long pos = NUM2LONG(idx);
03358 
03359     if (pos == -1) {
03360         return rb_str_append(str, str2);
03361     }
03362     else if (pos < 0) {
03363         pos++;
03364     }
03365     rb_str_splice(str, pos, 0, str2);
03366     return str;
03367 }
03368 
03369 
03370 /*
03371  *  call-seq:
03372  *     str.slice!(fixnum)           -> new_str or nil
03373  *     str.slice!(fixnum, fixnum)   -> new_str or nil
03374  *     str.slice!(range)            -> new_str or nil
03375  *     str.slice!(regexp)           -> new_str or nil
03376  *     str.slice!(other_str)        -> new_str or nil
03377  *
03378  *  Deletes the specified portion from <i>str</i>, and returns the portion
03379  *  deleted.
03380  *
03381  *     string = "this is a string"
03382  *     string.slice!(2)        #=> "i"
03383  *     string.slice!(3..6)     #=> " is "
03384  *     string.slice!(/s.*t/)   #=> "sa st"
03385  *     string.slice!("r")      #=> "r"
03386  *     string                  #=> "thing"
03387  */
03388 
03389 static VALUE
03390 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03391 {
03392     VALUE result;
03393     VALUE buf[3];
03394     int i;
03395 
03396     if (argc < 1 || 2 < argc) {
03397         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03398     }
03399     for (i=0; i<argc; i++) {
03400         buf[i] = argv[i];
03401     }
03402     str_modify_keep_cr(str);
03403     buf[i] = rb_str_new(0,0);
03404     result = rb_str_aref_m(argc, buf, str);
03405     if (!NIL_P(result)) {
03406         rb_str_aset_m(argc+1, buf, str);
03407     }
03408     return result;
03409 }
03410 
03411 static VALUE
03412 get_pat(VALUE pat, int quote)
03413 {
03414     VALUE val;
03415 
03416     switch (TYPE(pat)) {
03417       case T_REGEXP:
03418         return pat;
03419 
03420       case T_STRING:
03421         break;
03422 
03423       default:
03424         val = rb_check_string_type(pat);
03425         if (NIL_P(val)) {
03426             Check_Type(pat, T_REGEXP);
03427         }
03428         pat = val;
03429     }
03430 
03431     if (quote) {
03432         pat = rb_reg_quote(pat);
03433     }
03434 
03435     return rb_reg_regcomp(pat);
03436 }
03437 
03438 
03439 /*
03440  *  call-seq:
03441  *     str.sub!(pattern, replacement)          -> str or nil
03442  *     str.sub!(pattern) {|match| block }      -> str or nil
03443  *
03444  *  Performs the substitutions of <code>String#sub</code> in place,
03445  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
03446  *  performed.
03447  */
03448 
03449 static VALUE
03450 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03451 {
03452     VALUE pat, repl, hash = Qnil;
03453     int iter = 0;
03454     int tainted = 0;
03455     int untrusted = 0;
03456     long plen;
03457
    /* Argument forms: (pattern) with a block, or (pattern, replacement)
     * where replacement is a Hash (anything rb_check_convert_type() turns
     * into one) or is coerced to a String. */
03458     if (argc == 1 && rb_block_given_p()) {
03459         iter = 1;
03460     }
03461     else if (argc == 2) {
03462         repl = argv[1];
03463         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03464         if (NIL_P(hash)) {
03465             StringValue(repl);
03466         }
03467         if (OBJ_TAINTED(repl)) tainted = 1;
03468         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03469     }
03470     else {
03471         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03472     }
03473
    /* quote=1: a String pattern is escaped by get_pat() before compiling */
03474     pat = get_pat(argv[0], 1);
03475     str_modifiable(str);
03476     if (rb_reg_search(pat, str, 0, 0) >= 0) {
03477         rb_encoding *enc;
03478         int cr = ENC_CODERANGE(str);
03479         VALUE match = rb_backref_get();
03480         struct re_registers *regs = RMATCH_REGS(match);
        /* byte offsets of the whole match (register 0) */
03481         long beg0 = BEG(0);
03482         long end0 = END(0);
03483         char *p, *rp;
03484         long len, rlen;
03485
03486         if (iter || !NIL_P(hash)) {
            /* remember buffer and length so str_mod_check() can compare
             * them after the block / hash lookup has run */
03487             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03488
03489             if (iter) {
03490                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03491             }
03492             else {
03493                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03494                 repl = rb_obj_as_string(repl);
03495             }
            /* NOTE(review): presumably raise if str was changed or frozen
             * by the code that just ran -- confirm against the helpers */
03496             str_mod_check(str, p, len);
03497             str_frozen_check(str);
03498         }
03499         else {
            /* plain String replacement: build the text through
             * rb_reg_regsub() using the match registers */
03500             repl = rb_reg_regsub(repl, str, regs, pat);
03501         }
03502         enc = rb_enc_compatible(str, repl);
03503         if (!enc) {
            /* No common encoding.  Still accepted when the text before and
             * after the match scans as 7-bit; the result then takes
             * repl's encoding. */
03504             rb_encoding *str_enc = STR_ENC_GET(str);
03505             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03506             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03507                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03508                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03509                          rb_enc_name(str_enc),
03510                          rb_enc_name(STR_ENC_GET(repl)));
03511             }
03512             enc = STR_ENC_GET(repl);
03513         }
03514         rb_str_modify(str);
03515         rb_enc_associate(str, enc);
03516         if (OBJ_TAINTED(repl)) tainted = 1;
03517         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
        /* Only when str's range (sampled before modification) lies strictly
         * between UNKNOWN and BROKEN: a broken repl, or a 7-bit repl going
         * into a VALID str, makes the result UNKNOWN; otherwise the result
         * takes repl's range. */
03518         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03519             int cr2 = ENC_CODERANGE(repl);
03520             if (cr2 == ENC_CODERANGE_BROKEN ||
03521                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03522                 cr = ENC_CODERANGE_UNKNOWN;
03523             else
03524                 cr = cr2;
03525         }
03526         plen = end0 - beg0;    /* bytes being replaced */
03527         rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03528         len = RSTRING_LEN(str);
03529         if (rlen > plen) {
            /* grow only when the replacement is longer than the match */
03530             RESIZE_CAPA(str, len + rlen - plen);
03531         }
        /* re-read the pointer after the possible resize */
03532         p = RSTRING_PTR(str);
03533         if (rlen != plen) {
            /* move the text after the match to its new position */
03534             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03535         }
03536         memcpy(p + beg0, rp, rlen);
03537         len += rlen - plen;
03538         STR_SET_LEN(str, len);
03539         RSTRING_PTR(str)[len] = '\0';
03540         ENC_CODERANGE_SET(str, cr);
03541         if (tainted) OBJ_TAINT(str);
03542         if (untrusted) OBJ_UNTRUST(str);
03543
03544         return str;
03545     }
    /* no match: nothing substituted */
03546     return Qnil;
03547 }
03548 
03549 
03550 /*
03551  *  call-seq:
03552  *     str.sub(pattern, replacement)         -> new_str
03553  *     str.sub(pattern, hash)                -> new_str
03554  *     str.sub(pattern) {|match| block }     -> new_str
03555  *
03556  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
03557  *  <i>pattern</i> replaced by the second argument. The <i>pattern</i> is
03558  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03559  *  regular expression metacharacters it contains will be interpreted
03560  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03561  *  instead of a digit.
03562  *
03563  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03564  *  the matched text. It may contain back-references to the pattern's capture
03565  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03566  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03567  *  double-quoted string, both back-references must be preceded by an
03568  *  additional backslash. However, within <i>replacement</i> the special match
03569  *  variables, such as <code>$&</code>, will not refer to the current match.
03570  *
03571  *  If the second argument is a <code>Hash</code>, and the matched text is one
03572  *  of its keys, the corresponding value is the replacement string.
03573  *
03574  *  In the block form, the current match string is passed in as a parameter,
03575  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03576  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03577  *  returned by the block will be substituted for the match on each call.
03578  *
03579  *  The result inherits any tainting in the original string or any supplied
03580  *  replacement string.
03581  *
03582  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
03583  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
03584  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
03585  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
03586  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
03587  *      #=> "Is /bin/bash your preferred shell?"
03588  */
03589 
03590 static VALUE
03591 rb_str_sub(int argc, VALUE *argv, VALUE str)
03592 {
03593     str = rb_str_dup(str);
03594     rb_str_sub_bang(argc, argv, str);
03595     return str;
03596 }
03597 
/*
 * Shared implementation behind rb_str_gsub() (bang == 0) and
 * rb_str_gsub_bang() (bang == 1).
 *
 * With one argument the RETURN_ENUMERATOR macro is applied before the block
 * form is used; with two arguments the second is a Hash (or convertible to
 * one) or is coerced to a String.  Every match of the pattern is replaced
 * and the result is accumulated in a fresh buffer (dest).
 *
 * Returns nil (bang) or a dup of str (non-bang) when nothing matches.
 * Otherwise returns str after its contents were replaced by dest (bang),
 * or dest itself with str's class and OBJ_INFECT applied (non-bang).
 */
03598 static VALUE
03599 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03600 {
03601     VALUE pat, val, repl, match, dest, hash = Qnil;
03602     struct re_registers *regs;
03603     long beg, n;
03604     long beg0, end0;
03605     long offset, blen, slen, len, last;
03606     int iter = 0;
03607     char *sp, *cp;
03608     int tainted = 0;
03609     rb_encoding *str_enc;
03610
03611     switch (argc) {
03612       case 1:
03613         RETURN_ENUMERATOR(str, argc, argv);
03614         iter = 1;
03615         break;
03616       case 2:
03617         repl = argv[1];
03618         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03619         if (NIL_P(hash)) {
03620             StringValue(repl);
03621         }
03622         if (OBJ_TAINTED(repl)) tainted = 1;
03623         break;
03624       default:
03625         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03626     }
03627
    /* quote=1: a String pattern is escaped by get_pat() before compiling */
03628     pat = get_pat(argv[0], 1);
03629     beg = rb_reg_search(pat, str, 0, 0);
03630     if (beg < 0) {
03631         if (bang) return Qnil;  /* no match, no substitution */
03632         return rb_str_dup(str);
03633     }
03634
    /* offset: byte position in str up to which input has been consumed;
     * cp: pointer to that position; sp/slen: buffer and length as they
     * were at the start, handed to str_mod_check() after user code ran */
03635     offset = 0;
03636     n = 0;
03637     blen = RSTRING_LEN(str) + 30; /* len + margin */
03638     dest = rb_str_buf_new(blen);
03639     sp = RSTRING_PTR(str);
03640     slen = RSTRING_LEN(str);
03641     cp = sp;
03642     str_enc = STR_ENC_GET(str);
03643
03644     do {
03645         n++;
03646         match = rb_backref_get();
03647         regs = RMATCH_REGS(match);
        /* byte offsets of the whole match (register 0) */
03648         beg0 = BEG(0);
03649         end0 = END(0);
03650         if (iter || !NIL_P(hash)) {
03651             if (iter) {
03652                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03653             }
03654             else {
03655                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03656                 val = rb_obj_as_string(val);
03657             }
            /* NOTE(review): presumably raises if the block or hash lookup
             * modified str -- confirm against str_mod_check() */
03658             str_mod_check(str, sp, slen);
03659             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
03660                 rb_raise(rb_eRuntimeError, "block should not cheat");
03661             }
03662         }
03663         else {
03664             val = rb_reg_regsub(repl, str, regs, pat);
03665         }
03666
03667         if (OBJ_TAINTED(val)) tainted = 1;
03668
03669         len = beg - offset;     /* copy pre-match substr */
03670         if (len) {
03671             rb_enc_str_buf_cat(dest, cp, len, str_enc);
03672         }
03673
03674         rb_str_buf_append(dest, val);
03675
        /* last keeps the offset this iteration's search started from; it
         * is used for the final rb_reg_search() after the loop */
03676         last = offset;
03677         offset = end0;
03678         if (beg0 == end0) {
03679             /*
03680              * Always consume at least one character of the input string
03681              * in order to prevent infinite loops.
03682              */
03683             if (RSTRING_LEN(str) <= end0) break;
03684             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03685             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03686             offset = end0 + len;
03687         }
03688         cp = RSTRING_PTR(str) + offset;
03689         if (offset > RSTRING_LEN(str)) break;
03690         beg = rb_reg_search(pat, str, offset, 0);
03691     } while (beg >= 0);
    /* copy whatever follows the last match */
03692     if (RSTRING_LEN(str) > offset) {
03693         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03694     }
    /* NOTE(review): result ignored; presumably re-run so the back-reference
     * refers to the last successful match rather than the failed final
     * search -- confirm */
03695     rb_reg_search(pat, str, last, 0);
03696     if (bang) {
        /* NOTE(review): presumably str takes over dest's contents --
         * confirm against rb_str_shared_replace() */
03697         rb_str_shared_replace(str, dest);
03698     }
03699     else {
03700         RBASIC(dest)->klass = rb_obj_class(str);
03701         OBJ_INFECT(dest, str);
03702         str = dest;
03703     }
03704
03705     if (tainted) OBJ_TAINT(str);
03706     return str;
03707 }
03708 
03709 
03710 /*
03711  *  call-seq:
03712  *     str.gsub!(pattern, replacement)        -> str or nil
03713  *     str.gsub!(pattern) {|match| block }    -> str or nil
03714  *     str.gsub!(pattern)                     -> an_enumerator
03715  *
03716  *  Performs the substitutions of <code>String#gsub</code> in place, returning
03717  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
03718  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
03719  */
03720 
03721 static VALUE
03722 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03723 {
03724     str_modify_keep_cr(str);
03725     return str_gsub(argc, argv, str, 1);
03726 }
03727 
03728 
03729 /*
03730  *  call-seq:
03731  *     str.gsub(pattern, replacement)       -> new_str
03732  *     str.gsub(pattern, hash)              -> new_str
03733  *     str.gsub(pattern) {|match| block }   -> new_str
03734  *     str.gsub(pattern)                    -> enumerator
03735  *
03736  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
03737  *  <i>pattern</i> replaced by the second argument. The <i>pattern</i> is
03738  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03739  *  regular expression metacharacters it contains will be interpreted
03740  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03741  *  instead of a digit.
03742  *
03743  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03744  *  the matched text. It may contain back-references to the pattern's capture
03745  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03746  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03747  *  double-quoted string, both back-references must be preceded by an
03748  *  additional backslash. However, within <i>replacement</i> the special match
03749  *  variables, such as <code>$&</code>, will not refer to the current match.
03750  *
03751  *  If the second argument is a <code>Hash</code>, and the matched text is one
03752  *  of its keys, the corresponding value is the replacement string.
03753  *
03754  *  In the block form, the current match string is passed in as a parameter,
03755  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03756  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03757  *  returned by the block will be substituted for the match on each call.
03758  *
03759  *  The result inherits any tainting in the original string or any supplied
03760  *  replacement string.
03761  *
03762  *  When neither a block nor a second argument is supplied, an
03763  *  <code>Enumerator</code> is returned.
03764  *
03765  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
03766  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
03767  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
03768  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
03769  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
03770  */
03771 
03772 static VALUE
03773 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03774 {
03775     return str_gsub(argc, argv, str, 0);
03776 }
03777 
03778 
03779 /*
03780  *  call-seq:
03781  *     str.replace(other_str)   -> str
03782  *
03783  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
03784  *  values in <i>other_str</i>.
03785  *
03786  *     s = "hello"         #=> "hello"
03787  *     s.replace "world"   #=> "world"
03788  */
03789 
03790 VALUE
03791 rb_str_replace(VALUE str, VALUE str2)
03792 {
03793     str_modifiable(str);
03794     if (str == str2) return str;
03795 
03796     StringValue(str2);
03797     str_discard(str);
03798     return str_replace(str, str2);
03799 }
03800 
03801 /*
03802  *  call-seq:
03803  *     string.clear    ->  string
03804  *
03805  *  Makes string empty.
03806  *
03807  *     a = "abcde"
03808  *     a.clear    #=> ""
03809  */
03810 
03811 static VALUE
03812 rb_str_clear(VALUE str)
03813 {
03814     str_discard(str);
03815     STR_SET_EMBED(str);
03816     STR_SET_EMBED_LEN(str, 0);
03817     RSTRING_PTR(str)[0] = 0;
03818     if (rb_enc_asciicompat(STR_ENC_GET(str)))
03819         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03820     else
03821         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03822     return str;
03823 }
03824 
03825 /*
03826  *  call-seq:
03827  *     string.chr    ->  string
03828  *
03829  *  Returns a one-character string at the beginning of the string.
03830  *
03831  *     a = "abcde"
03832  *     a.chr    #=> "a"
03833  */
03834 
03835 static VALUE
03836 rb_str_chr(VALUE str)
03837 {
03838     return rb_str_substr(str, 0, 1);
03839 }
03840 
03841 /*
03842  *  call-seq:
03843  *     str.getbyte(index)          -> 0 .. 255
03844  *
03845  *  returns the <i>index</i>th byte as an integer.
03846  */
03847 static VALUE
03848 rb_str_getbyte(VALUE str, VALUE index)
03849 {
03850     long pos = NUM2LONG(index);
03851 
03852     if (pos < 0)
03853         pos += RSTRING_LEN(str);
03854     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
03855         return Qnil;
03856 
03857     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03858 }
03859 
03860 /*
03861  *  call-seq:
03862  *     str.setbyte(index, int) -> int
03863  *
03864  *  modifies the <i>index</i>th byte as <i>int</i>.
03865  */
03866 static VALUE
03867 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
03868 {
03869     long pos = NUM2LONG(index);
03870     int byte = NUM2INT(value);
03871 
03872     rb_str_modify(str);
03873 
03874     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
03875         rb_raise(rb_eIndexError, "index %ld out of string", pos);
03876     if (pos < 0)
03877         pos += RSTRING_LEN(str);
03878 
03879     RSTRING_PTR(str)[pos] = byte;
03880 
03881     return value;
03882 }
03883 
03884 /*
03885  *  call-seq:
03886  *     str.reverse   -> new_str
03887  *
03888  *  Returns a new string with the characters from <i>str</i> in reverse order.
03889  *
03890  *     "stressed".reverse   #=> "desserts"
03891  */
03892 
03893 static VALUE
03894 rb_str_reverse(VALUE str)
03895 {
03896     rb_encoding *enc;
03897     VALUE rev;
03898     char *s, *e, *p;
03899     int single = 1;
03900 
03901     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
03902     enc = STR_ENC_GET(str);
03903     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
03904     s = RSTRING_PTR(str); e = RSTRING_END(str);
03905     p = RSTRING_END(rev);
03906 
03907     if (RSTRING_LEN(str) > 1) {
03908         if (single_byte_optimizable(str)) {
03909             while (s < e) {
03910                 *--p = *s++;
03911             }
03912         }
03913         else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
03914             while (s < e) {
03915                 int clen = rb_enc_fast_mbclen(s, e, enc);
03916 
03917                 if (clen > 1 || (*s & 0x80)) single = 0;
03918                 p -= clen;
03919                 memcpy(p, s, clen);
03920                 s += clen;
03921             }
03922         }
03923         else {
03924             while (s < e) {
03925                 int clen = rb_enc_mbclen(s, e, enc);
03926 
03927                 if (clen > 1 || (*s & 0x80)) single = 0;
03928                 p -= clen;
03929                 memcpy(p, s, clen);
03930                 s += clen;
03931             }
03932         }
03933     }
03934     STR_SET_LEN(rev, RSTRING_LEN(str));
03935     OBJ_INFECT(rev, str);
03936     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
03937         if (single) {
03938             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03939         }
03940         else {
03941             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03942         }
03943     }
03944     rb_enc_cr_str_copy_for_substr(rev, str);
03945 
03946     return rev;
03947 }
03948 
03949 
03950 /*
03951  *  call-seq:
03952  *     str.reverse!   -> str
03953  *
03954  *  Reverses <i>str</i> in place.
03955  */
03956 
03957 static VALUE
03958 rb_str_reverse_bang(VALUE str)
03959 {
03960     if (RSTRING_LEN(str) > 1) {
03961         if (single_byte_optimizable(str)) {
03962             char *s, *e, c;
03963 
03964             str_modify_keep_cr(str);
03965             s = RSTRING_PTR(str);
03966             e = RSTRING_END(str) - 1;
03967             while (s < e) {
03968                 c = *s;
03969                 *s++ = *e;
03970                 *e-- = c;
03971             }
03972         }
03973         else {
03974             rb_str_shared_replace(str, rb_str_reverse(str));
03975         }
03976     }
03977     else {
03978         str_modify_keep_cr(str);
03979     }
03980     return str;
03981 }
03982 
03983 
03984 /*
03985  *  call-seq:
03986  *     str.include? other_str   -> true or false
03987  *
03988  *  Returns <code>true</code> if <i>str</i> contains the given string or
03989  *  character.
03990  *
03991  *     "hello".include? "lo"   #=> true
03992  *     "hello".include? "ol"   #=> false
03993  *     "hello".include? ?h     #=> true
03994  */
03995 
03996 static VALUE
03997 rb_str_include(VALUE str, VALUE arg)
03998 {
03999     long i;
04000 
04001     StringValue(arg);
04002     i = rb_str_index(str, arg, 0);
04003 
04004     if (i == -1) return Qfalse;
04005     return Qtrue;
04006 }
04007 
04008 
04009 /*
04010  *  call-seq:
04011  *     str.to_i(base=10)   -> integer
04012  *
04013  *  Returns the result of interpreting leading characters in <i>str</i> as an
04014  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
04015  *  end of a valid number are ignored. If there is not a valid number at the
04016  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
04017  *  exception.
04018  *
04019  *     "12345".to_i             #=> 12345
04020  *     "99 red balloons".to_i   #=> 99
04021  *     "0a".to_i                #=> 0
04022  *     "0a".to_i(16)            #=> 10
04023  *     "hello".to_i             #=> 0
04024  *     "1100101".to_i(2)        #=> 101
04025  *     "1100101".to_i(8)        #=> 294977
04026  *     "1100101".to_i(10)       #=> 1100101
04027  *     "1100101".to_i(16)       #=> 17826049
04028  */
04029 
04030 static VALUE
04031 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04032 {
04033     int base;
04034 
04035     if (argc == 0) base = 10;
04036     else {
04037         VALUE b;
04038 
04039         rb_scan_args(argc, argv, "01", &b);
04040         base = NUM2INT(b);
04041     }
04042     if (base < 0) {
04043         rb_raise(rb_eArgError, "invalid radix %d", base);
04044     }
04045     return rb_str_to_inum(str, base, FALSE);
04046 }
04047 
04048 
04049 /*
04050  *  call-seq:
04051  *     str.to_f   -> float
04052  *
04053  *  Returns the result of interpreting leading characters in <i>str</i> as a
04054  *  floating point number. Extraneous characters past the end of a valid number
04055  *  are ignored. If there is not a valid number at the start of <i>str</i>,
04056  *  <code>0.0</code> is returned. This method never raises an exception.
04057  *
04058  *     "123.45e1".to_f        #=> 1234.5
04059  *     "45.67 degrees".to_f   #=> 45.67
04060  *     "thx1138".to_f         #=> 0.0
04061  */
04062 
04063 static VALUE
04064 rb_str_to_f(VALUE str)
04065 {
04066     return DBL2NUM(rb_str_to_dbl(str, FALSE));
04067 }
04068 
04069 
04070 /*
04071  *  call-seq:
04072  *     str.to_s     -> str
04073  *     str.to_str   -> str
04074  *
04075  *  Returns the receiver.
04076  */
04077 
04078 static VALUE
04079 rb_str_to_s(VALUE str)
04080 {
04081     if (rb_obj_class(str) != rb_cString) {
04082         return str_duplicate(rb_cString, str);
04083     }
04084     return str;
04085 }
04086 
#if 0
/* Compiled out.  Encodes codepoint c in enc and appends the resulting
 * bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
04098 
04099 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
04100 
04101 int
04102 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04103 {
04104     char buf[CHAR_ESC_LEN + 1];
04105     int l;
04106 
04107 #if SIZEOF_INT > 4
04108     c &= 0xffffffff;
04109 #endif
04110     if (unicode_p) {
04111         if (c < 0x7F && ISPRINT(c)) {
04112             snprintf(buf, CHAR_ESC_LEN, "%c", c);
04113         }
04114         else if (c < 0x10000) {
04115             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04116         }
04117         else {
04118             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04119         }
04120     }
04121     else {
04122         if (c < 0x100) {
04123             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04124         }
04125         else {
04126             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04127         }
04128     }
04129     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
04130     rb_str_buf_cat(result, buf, l);
04131     return l;
04132 }
04133 
04134 /*
04135  * call-seq:
04136  *   str.inspect   -> string
04137  *
04138  * Returns a printable version of _str_, surrounded by quote marks,
04139  * with special characters escaped.
04140  *
04141  *    str = "hello"
04142  *    str[3] = "\b"
04143  *    str.inspect       #=> "\"hel\\bo\""
04144  */
04145 
04146 VALUE
04147 rb_str_inspect(VALUE str)
04148 {
04149     rb_encoding *enc = STR_ENC_GET(str);
04150     const char *p, *pend, *prev;
04151     char buf[CHAR_ESC_LEN + 1];
04152     VALUE result = rb_str_buf_new(0);
04153     rb_encoding *resenc = rb_default_internal_encoding();
04154     int unicode_p = rb_enc_unicode_p(enc);
04155     int asciicompat = rb_enc_asciicompat(enc);
04156 
04157     if (resenc == NULL) resenc = rb_default_external_encoding();
04158     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04159     rb_enc_associate(result, resenc);
04160     str_buf_cat2(result, "\"");
04161 
04162     p = RSTRING_PTR(str); pend = RSTRING_END(str);
04163     prev = p;
04164     while (p < pend) {
04165         unsigned int c, cc;
04166         int n;
04167 
04168         n = rb_enc_precise_mbclen(p, pend, enc);
04169         if (!MBCLEN_CHARFOUND_P(n)) {
04170             if (p > prev) str_buf_cat(result, prev, p - prev);
04171             n = rb_enc_mbminlen(enc);
04172             if (pend < p + n)
04173                 n = (int)(pend - p);
04174             while (n--) {
04175                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04176                 str_buf_cat(result, buf, strlen(buf));
04177                 prev = ++p;
04178             }
04179             continue;
04180         }
04181         n = MBCLEN_CHARFOUND_LEN(n);
04182         c = rb_enc_mbc_to_codepoint(p, pend, enc);
04183         p += n;
04184         if (c == '"'|| c == '\\' ||
04185             (c == '#' &&
04186              p < pend &&
04187              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04188              (cc = rb_enc_codepoint(p,pend,enc),
04189               (cc == '$' || cc == '@' || cc == '{')))) {
04190             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04191             str_buf_cat2(result, "\\");
04192             prev = p - n;
04193             continue;
04194         }
04195         switch (c) {
04196           case '\n': cc = 'n'; break;
04197           case '\r': cc = 'r'; break;
04198           case '\t': cc = 't'; break;
04199           case '\f': cc = 'f'; break;
04200           case '\013': cc = 'v'; break;
04201           case '\010': cc = 'b'; break;
04202           case '\007': cc = 'a'; break;
04203           case 033: cc = 'e'; break;
04204           default: cc = 0; break;
04205         }
04206         if (cc) {
04207             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04208             buf[0] = '\\';
04209             buf[1] = (char)cc;
04210             str_buf_cat(result, buf, 2);
04211             prev = p;
04212             continue;
04213         }
04214         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04215             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04216             continue;
04217         }
04218         else {
04219             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04220             rb_str_buf_cat_escaped_char(result, c, unicode_p);
04221             prev = p;
04222             continue;
04223         }
04224     }
04225     if (p > prev) str_buf_cat(result, prev, p - prev);
04226     str_buf_cat2(result, "\"");
04227 
04228     OBJ_INFECT(result, str);
04229     return result;
04230 }
04231 
04232 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04233 
04234 /*
04235  *  call-seq:
04236  *     str.dump   -> new_str
04237  *
04238  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
04239  *  <code>\nnn</code> notation and all special characters escaped.
04240  */
04241 
04242 VALUE
04243 rb_str_dump(VALUE str)
04244 {
04245     rb_encoding *enc = rb_enc_get(str);
04246     long len;
04247     const char *p, *pend;
04248     char *q, *qend;
04249     VALUE result;
04250     int u8 = (enc == rb_utf8_encoding());
04251 
04252     len = 2;                    /* "" */
04253     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04254     while (p < pend) {
04255         unsigned char c = *p++;
04256         switch (c) {
04257           case '"':  case '\\':
04258           case '\n': case '\r':
04259           case '\t': case '\f':
04260           case '\013': case '\010': case '\007': case '\033':
04261             len += 2;
04262             break;
04263 
04264           case '#':
04265             len += IS_EVSTR(p, pend) ? 2 : 1;
04266             break;
04267 
04268           default:
04269             if (ISPRINT(c)) {
04270                 len++;
04271             }
04272             else {
04273                 if (u8) {       /* \u{NN} */
04274                     char buf[32];
04275                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
04276                     if (MBCLEN_CHARFOUND_P(n)) {
04277                         int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04278                         sprintf(buf, "%x", cc);
04279                         len += strlen(buf)+4;
04280                         p += MBCLEN_CHARFOUND_LEN(n)-1;
04281                         break;
04282                     }
04283                 }
04284                 len += 4;       /* \xNN */
04285             }
04286             break;
04287         }
04288     }
04289     if (!rb_enc_asciicompat(enc)) {
04290         len += 19;              /* ".force_encoding('')" */
04291         len += strlen(enc->name);
04292     }
04293 
04294     result = rb_str_new5(str, 0, len);
04295     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04296     q = RSTRING_PTR(result); qend = q + len + 1;
04297 
04298     *q++ = '"';
04299     while (p < pend) {
04300         unsigned char c = *p++;
04301 
04302         if (c == '"' || c == '\\') {
04303             *q++ = '\\';
04304             *q++ = c;
04305         }
04306         else if (c == '#') {
04307             if (IS_EVSTR(p, pend)) *q++ = '\\';
04308             *q++ = '#';
04309         }
04310         else if (c == '\n') {
04311             *q++ = '\\';
04312             *q++ = 'n';
04313         }
04314         else if (c == '\r') {
04315             *q++ = '\\';
04316             *q++ = 'r';
04317         }
04318         else if (c == '\t') {
04319             *q++ = '\\';
04320             *q++ = 't';
04321         }
04322         else if (c == '\f') {
04323             *q++ = '\\';
04324             *q++ = 'f';
04325         }
04326         else if (c == '\013') {
04327             *q++ = '\\';
04328             *q++ = 'v';
04329         }
04330         else if (c == '\010') {
04331             *q++ = '\\';
04332             *q++ = 'b';
04333         }
04334         else if (c == '\007') {
04335             *q++ = '\\';
04336             *q++ = 'a';
04337         }
04338         else if (c == '\033') {
04339             *q++ = '\\';
04340             *q++ = 'e';
04341         }
04342         else if (ISPRINT(c)) {
04343             *q++ = c;
04344         }
04345         else {
04346             *q++ = '\\';
04347             if (u8) {
04348                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04349                 if (MBCLEN_CHARFOUND_P(n)) {
04350                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04351                     p += n;
04352                     snprintf(q, qend-q, "u{%x}", cc);
04353                     q += strlen(q);
04354                     continue;
04355                 }
04356             }
04357             snprintf(q, qend-q, "x%02X", c);
04358             q += 3;
04359         }
04360     }
04361     *q++ = '"';
04362     *q = '\0';
04363     if (!rb_enc_asciicompat(enc)) {
04364         snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04365         enc = rb_ascii8bit_encoding();
04366     }
04367     OBJ_INFECT(result, str);
04368     /* result from dump is ASCII */
04369     rb_enc_associate(result, enc);
04370     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04371     return result;
04372 }
04373 
04374 
04375 static void
04376 rb_str_check_dummy_enc(rb_encoding *enc)
04377 {
04378     if (rb_enc_dummy_p(enc)) {
04379         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04380                  rb_enc_name(enc));
04381     }
04382 }
04383 
04384 /*
04385  *  call-seq:
04386  *     str.upcase!   -> str or nil
04387  *
04388  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
04389  *  were made.
04390  *  Note: case replacement is effective only in ASCII region.
04391  */
04392 
04393 static VALUE
04394 rb_str_upcase_bang(VALUE str)
04395 {
04396     rb_encoding *enc;
04397     char *s, *send;
04398     int modify = 0;
04399     int n;
04400 
04401     str_modify_keep_cr(str);
04402     enc = STR_ENC_GET(str);
04403     rb_str_check_dummy_enc(enc);
04404     s = RSTRING_PTR(str); send = RSTRING_END(str);
04405     if (single_byte_optimizable(str)) {
04406         while (s < send) {
04407             unsigned int c = *(unsigned char*)s;
04408 
04409             if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04410                 *s = 'A' + (c - 'a');
04411                 modify = 1;
04412             }
04413             s++;
04414         }
04415     }
04416     else {
04417         int ascompat = rb_enc_asciicompat(enc);
04418 
04419         while (s < send) {
04420             unsigned int c;
04421 
04422             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04423                 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04424                     *s = 'A' + (c - 'a');
04425                     modify = 1;
04426                 }
04427                 s++;
04428             }
04429             else {
04430                 c = rb_enc_codepoint_len(s, send, &n, enc);
04431                 if (rb_enc_islower(c, enc)) {
04432                     /* assuming toupper returns codepoint with same size */
04433                     rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04434                     modify = 1;
04435                 }
04436                 s += n;
04437             }
04438         }
04439     }
04440 
04441     if (modify) return str;
04442     return Qnil;
04443 }
04444 
04445 
04446 /*
04447  *  call-seq:
04448  *     str.upcase   -> new_str
04449  *
04450  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
04451  *  uppercase counterparts. The operation is locale insensitive---only
04452  *  characters ``a'' to ``z'' are affected.
04453  *  Note: case replacement is effective only in ASCII region.
04454  *
04455  *     "hEllO".upcase   #=> "HELLO"
04456  */
04457 
04458 static VALUE
04459 rb_str_upcase(VALUE str)
04460 {
04461     str = rb_str_dup(str);
04462     rb_str_upcase_bang(str);
04463     return str;
04464 }
04465 
04466 
04467 /*
04468  *  call-seq:
04469  *     str.downcase!   -> str or nil
04470  *
04471  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
04472  *  changes were made.
04473  *  Note: case replacement is effective only in ASCII region.
04474  */
04475 
04476 static VALUE
04477 rb_str_downcase_bang(VALUE str)
04478 {
04479     rb_encoding *enc;
04480     char *s, *send;
04481     int modify = 0;
04482 
04483     str_modify_keep_cr(str);
04484     enc = STR_ENC_GET(str);
04485     rb_str_check_dummy_enc(enc);
04486     s = RSTRING_PTR(str); send = RSTRING_END(str);
04487     if (single_byte_optimizable(str)) {
04488         while (s < send) {
04489             unsigned int c = *(unsigned char*)s;
04490 
04491             if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04492                 *s = 'a' + (c - 'A');
04493                 modify = 1;
04494             }
04495             s++;
04496         }
04497     }
04498     else {
04499         int ascompat = rb_enc_asciicompat(enc);
04500 
04501         while (s < send) {
04502             unsigned int c;
04503             int n;
04504 
04505             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04506                 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04507                     *s = 'a' + (c - 'A');
04508                     modify = 1;
04509                 }
04510                 s++;
04511             }
04512             else {
04513                 c = rb_enc_codepoint_len(s, send, &n, enc);
04514                 if (rb_enc_isupper(c, enc)) {
04515                     /* assuming toupper returns codepoint with same size */
04516                     rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04517                     modify = 1;
04518                 }
04519                 s += n;
04520             }
04521         }
04522     }
04523 
04524     if (modify) return str;
04525     return Qnil;
04526 }
04527 
04528 
04529 /*
04530  *  call-seq:
04531  *     str.downcase   -> new_str
04532  *
04533  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
04534  *  lowercase counterparts. The operation is locale insensitive---only
04535  *  characters ``A'' to ``Z'' are affected.
04536  *  Note: case replacement is effective only in ASCII region.
04537  *
04538  *     "hEllO".downcase   #=> "hello"
04539  */
04540 
04541 static VALUE
04542 rb_str_downcase(VALUE str)
04543 {
04544     str = rb_str_dup(str);
04545     rb_str_downcase_bang(str);
04546     return str;
04547 }
04548 
04549 
04550 /*
04551  *  call-seq:
04552  *     str.capitalize!   -> str or nil
04553  *
04554  *  Modifies <i>str</i> by converting the first character to uppercase and the
04555  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
04556  *  Note: case conversion is effective only in ASCII region.
04557  *
04558  *     a = "hello"
04559  *     a.capitalize!   #=> "Hello"
04560  *     a               #=> "Hello"
04561  *     a.capitalize!   #=> nil
04562  */
04563 
04564 static VALUE
04565 rb_str_capitalize_bang(VALUE str)
04566 {
04567     rb_encoding *enc;
04568     char *s, *send;
04569     int modify = 0;
04570     unsigned int c;
04571     int n;
04572 
04573     str_modify_keep_cr(str);
04574     enc = STR_ENC_GET(str);
04575     rb_str_check_dummy_enc(enc);
04576     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04577     s = RSTRING_PTR(str); send = RSTRING_END(str);
04578 
04579     c = rb_enc_codepoint_len(s, send, &n, enc);
04580     if (rb_enc_islower(c, enc)) {
04581         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04582         modify = 1;
04583     }
04584     s += n;
04585     while (s < send) {
04586         c = rb_enc_codepoint_len(s, send, &n, enc);
04587         if (rb_enc_isupper(c, enc)) {
04588             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04589             modify = 1;
04590         }
04591         s += n;
04592     }
04593 
04594     if (modify) return str;
04595     return Qnil;
04596 }
04597 
04598 
04599 /*
04600  *  call-seq:
04601  *     str.capitalize   -> new_str
04602  *
04603  *  Returns a copy of <i>str</i> with the first character converted to uppercase
04604  *  and the remainder to lowercase.
04605  *  Note: case conversion is effective only in ASCII region.
04606  *
04607  *     "hello".capitalize    #=> "Hello"
04608  *     "HELLO".capitalize    #=> "Hello"
04609  *     "123ABC".capitalize   #=> "123abc"
04610  */
04611 
04612 static VALUE
04613 rb_str_capitalize(VALUE str)
04614 {
04615     str = rb_str_dup(str);
04616     rb_str_capitalize_bang(str);
04617     return str;
04618 }
04619 
04620 
04621 /*
04622  *  call-seq:
04623 *     str.swapcase!   -> str or nil
04624  *
04625  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
04626  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
04627  *  Note: case conversion is effective only in ASCII region.
04628  */
04629 
04630 static VALUE
04631 rb_str_swapcase_bang(VALUE str)
04632 {
04633     rb_encoding *enc;
04634     char *s, *send;
04635     int modify = 0;
04636     int n;
04637 
04638     str_modify_keep_cr(str);
04639     enc = STR_ENC_GET(str);
04640     rb_str_check_dummy_enc(enc);
04641     s = RSTRING_PTR(str); send = RSTRING_END(str);
04642     while (s < send) {
04643         unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04644 
04645         if (rb_enc_isupper(c, enc)) {
04646             /* assuming toupper returns codepoint with same size */
04647             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04648             modify = 1;
04649         }
04650         else if (rb_enc_islower(c, enc)) {
04651             /* assuming tolower returns codepoint with same size */
04652             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04653             modify = 1;
04654         }
04655         s += n;
04656     }
04657 
04658     if (modify) return str;
04659     return Qnil;
04660 }
04661 
04662 
04663 /*
04664  *  call-seq:
04665  *     str.swapcase   -> new_str
04666  *
04667  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
04668  *  to lowercase and lowercase characters converted to uppercase.
04669  *  Note: case conversion is effective only in ASCII region.
04670  *
04671  *     "Hello".swapcase          #=> "hELLO"
04672  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
04673  */
04674 
04675 static VALUE
04676 rb_str_swapcase(VALUE str)
04677 {
04678     str = rb_str_dup(str);
04679     rb_str_swapcase_bang(str);
04680     return str;
04681 }
04682 
typedef unsigned char *USTR;

/* Reading position in a tr-style character specification, as consumed
 * one codepoint at a time by trnext(). */
struct tr {
    int gen;            /* nonzero while a range "a-z" is being expanded */
    unsigned int now;   /* codepoint most recently produced */
    unsigned int max;   /* last codepoint of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
04690 
04691 static unsigned int
04692 trnext(struct tr *t, rb_encoding *enc)
04693 {
04694     int n;
04695 
04696     for (;;) {
04697         if (!t->gen) {
04698             if (t->p == t->pend) return -1;
04699             if (t->p < t->pend - 1 && *t->p == '\\') {
04700                 t->p++;
04701             }
04702             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04703             t->p += n;
04704             if (t->p < t->pend - 1 && *t->p == '-') {
04705                 t->p++;
04706                 if (t->p < t->pend) {
04707                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04708                     t->p += n;
04709                     if (t->now > c) {
04710                         if (t->now < 0x80 && c < 0x80) {
04711                             rb_raise(rb_eArgError,
04712                                      "invalid range \"%c-%c\" in string transliteration",
04713                                      t->now, c);
04714                         }
04715                         else {
04716                             rb_raise(rb_eArgError, "invalid range in string transliteration");
04717                         }
04718                         continue; /* not reached */
04719                     }
04720                     t->gen = 1;
04721                     t->max = c;
04722                 }
04723             }
04724             return t->now;
04725         }
04726         else if (++t->now < t->max) {
04727             return t->now;
04728         }
04729         else {
04730             t->gen = 0;
04731             return t->max;
04732         }
04733     }
04734 }
04735 
04736 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
04737 
04738 static VALUE
04739 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
04740 {
04741     const unsigned int errc = -1;
04742     unsigned int trans[256];
04743     rb_encoding *enc, *e1, *e2;
04744     struct tr trsrc, trrepl;
04745     int cflag = 0;
04746     unsigned int c, c0;
04747     int last = 0, modify = 0, i, l;
04748     char *s, *send;
04749     VALUE hash = 0;
04750     int singlebyte = single_byte_optimizable(str);
04751     int cr;
04752 
04753 #define CHECK_IF_ASCII(c) \
04754     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
04755            (cr = ENC_CODERANGE_VALID) : 0)
04756 
04757     StringValue(src);
04758     StringValue(repl);
04759     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04760     if (RSTRING_LEN(repl) == 0) {
04761         return rb_str_delete_bang(1, &src, str);
04762     }
04763 
04764     cr = ENC_CODERANGE(str);
04765     e1 = rb_enc_check(str, src);
04766     e2 = rb_enc_check(str, repl);
04767     if (e1 == e2) {
04768         enc = e1;
04769     }
04770     else {
04771         enc = rb_enc_check(src, repl);
04772     }
04773     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
04774     if (RSTRING_LEN(src) > 1 &&
04775         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
04776         trsrc.p + l < trsrc.pend) {
04777         cflag = 1;
04778         trsrc.p += l;
04779     }
04780     trrepl.p = RSTRING_PTR(repl);
04781     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
04782     trsrc.gen = trrepl.gen = 0;
04783     trsrc.now = trrepl.now = 0;
04784     trsrc.max = trrepl.max = 0;
04785 
04786     if (cflag) {
04787         for (i=0; i<256; i++) {
04788             trans[i] = 1;
04789         }
04790         while ((c = trnext(&trsrc, enc)) != errc) {
04791             if (c < 256) {
04792                 trans[c] = errc;
04793             }
04794             else {
04795                 if (!hash) hash = rb_hash_new();
04796                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
04797             }
04798         }
04799         while ((c = trnext(&trrepl, enc)) != errc)
04800             /* retrieve last replacer */;
04801         last = trrepl.now;
04802         for (i=0; i<256; i++) {
04803             if (trans[i] != errc) {
04804                 trans[i] = last;
04805             }
04806         }
04807     }
04808     else {
04809         unsigned int r;
04810 
04811         for (i=0; i<256; i++) {
04812             trans[i] = errc;
04813         }
04814         while ((c = trnext(&trsrc, enc)) != errc) {
04815             r = trnext(&trrepl, enc);
04816             if (r == errc) r = trrepl.now;
04817             if (c < 256) {
04818                 trans[c] = r;
04819                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
04820             }
04821             else {
04822                 if (!hash) hash = rb_hash_new();
04823                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
04824             }
04825         }
04826     }
04827 
04828     if (cr == ENC_CODERANGE_VALID)
04829         cr = ENC_CODERANGE_7BIT;
04830     str_modify_keep_cr(str);
04831     s = RSTRING_PTR(str); send = RSTRING_END(str);
04832     if (sflag) {
04833         int clen, tlen;
04834         long offset, max = RSTRING_LEN(str);
04835         unsigned int save = -1;
04836         char *buf = ALLOC_N(char, max), *t = buf;
04837 
04838         while (s < send) {
04839             int may_modify = 0;
04840 
04841             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04842             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04843 
04844             s += clen;
04845             if (c < 256) {
04846                 c = trans[c];
04847             }
04848             else if (hash) {
04849                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04850                 if (NIL_P(tmp)) {
04851                     if (cflag) c = last;
04852                     else c = errc;
04853                 }
04854                 else if (cflag) c = errc;
04855                 else c = NUM2INT(tmp);
04856             }
04857             else {
04858                 c = errc;
04859             }
04860             if (c != (unsigned int)-1) {
04861                 if (save == c) {
04862                     CHECK_IF_ASCII(c);
04863                     continue;
04864                 }
04865                 save = c;
04866                 tlen = rb_enc_codelen(c, enc);
04867                 modify = 1;
04868             }
04869             else {
04870                 save = -1;
04871                 c = c0;
04872                 if (enc != e1) may_modify = 1;
04873             }
04874             while (t - buf + tlen >= max) {
04875                 offset = t - buf;
04876                 max *= 2;
04877                 REALLOC_N(buf, char, max);
04878                 t = buf + offset;
04879             }
04880             rb_enc_mbcput(c, t, enc);
04881             if (may_modify && memcmp(s, t, tlen) != 0) {
04882                 modify = 1;
04883             }
04884             CHECK_IF_ASCII(c);
04885             t += tlen;
04886         }
04887         *t = '\0';
04888         RSTRING(str)->as.heap.ptr = buf;
04889         RSTRING(str)->as.heap.len = t - buf;
04890         STR_SET_NOEMBED(str);
04891         RSTRING(str)->as.heap.aux.capa = max;
04892     }
04893     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
04894         while (s < send) {
04895             c = (unsigned char)*s;
04896             if (trans[c] != errc) {
04897                 if (!cflag) {
04898                     c = trans[c];
04899                     *s = c;
04900                     modify = 1;
04901                 }
04902                 else {
04903                     *s = last;
04904                     modify = 1;
04905                 }
04906             }
04907             CHECK_IF_ASCII(c);
04908             s++;
04909         }
04910     }
04911     else {
04912         int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
04913         long offset;
04914         char *buf = ALLOC_N(char, max), *t = buf;
04915 
04916         while (s < send) {
04917             int may_modify = 0;
04918             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04919             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04920 
04921             if (c < 256) {
04922                 c = trans[c];
04923             }
04924             else if (hash) {
04925                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04926                 if (NIL_P(tmp)) {
04927                     if (cflag) c = last;
04928                     else c = errc;
04929                 }
04930                 else if (cflag) c = errc;
04931                 else c = NUM2INT(tmp);
04932             }
04933             else {
04934                 c = errc;
04935             }
04936             if (c != errc) {
04937                 tlen = rb_enc_codelen(c, enc);
04938                 modify = 1;
04939             }
04940             else {
04941                 c = c0;
04942                 if (enc != e1) may_modify = 1;
04943             }
04944             while (t - buf + tlen >= max) {
04945                 offset = t - buf;
04946                 max *= 2;
04947                 REALLOC_N(buf, char, max);
04948                 t = buf + offset;
04949             }
04950             if (s != t) {
04951                 rb_enc_mbcput(c, t, enc);
04952                 if (may_modify && memcmp(s, t, tlen) != 0) {
04953                     modify = 1;
04954                 }
04955             }
04956             CHECK_IF_ASCII(c);
04957             s += clen;
04958             t += tlen;
04959         }
04960         if (!STR_EMBED_P(str)) {
04961             xfree(RSTRING(str)->as.heap.ptr);
04962         }
04963         *t = '\0';
04964         RSTRING(str)->as.heap.ptr = buf;
04965         RSTRING(str)->as.heap.len = t - buf;
04966         STR_SET_NOEMBED(str);
04967         RSTRING(str)->as.heap.aux.capa = max;
04968     }
04969 
04970     if (modify) {
04971         if (cr != ENC_CODERANGE_BROKEN)
04972             ENC_CODERANGE_SET(str, cr);
04973         rb_enc_associate(str, enc);
04974         return str;
04975     }
04976     return Qnil;
04977 }
04978 
04979 
04980 /*
04981  *  call-seq:
04982  *     str.tr!(from_str, to_str)   -> str or nil
04983  *
04984  *  Translates <i>str</i> in place, using the same rules as
04985  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
04986  *  changes were made.
04987  */
04988 
04989 static VALUE
04990 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
04991 {
04992     return tr_trans(str, src, repl, 0);
04993 }
04994 
04995 
04996 /*
04997  *  call-seq:
04998  *     str.tr(from_str, to_str)   -> new_str
04999  *
05000  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i> replaced
05001  *  by the corresponding characters in <i>to_str</i>. If <i>to_str</i> is
05002  *  shorter than <i>from_str</i>, it is padded with its last character. Both
05003  *  strings may use the c1--c2 notation to denote ranges of characters, and
05004  *  <i>from_str</i> may start with a <code>^</code>, which denotes all
05005  *  characters except those listed.
05006  *
05007  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
05008  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
05009  *     "hello".tr('el', 'ip')      #=> "hippo"
05010  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
05011  */
05012 
05013 static VALUE
05014 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05015 {
05016     str = rb_str_dup(str);
05017     tr_trans(str, src, repl, 0);
05018     return str;
05019 }
05020 
05021 static void
05022 tr_setup_table(VALUE str, char stable[256], int first,
05023                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05024 {
05025     const unsigned int errc = -1;
05026     char buf[256];
05027     struct tr tr;
05028     unsigned int c;
05029     VALUE table = 0, ptable = 0;
05030     int i, l, cflag = 0;
05031 
05032     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05033     tr.gen = tr.now = tr.max = 0;
05034 
05035     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05036         cflag = 1;
05037         tr.p += l;
05038     }
05039     if (first) {
05040         for (i=0; i<256; i++) {
05041             stable[i] = 1;
05042         }
05043     }
05044     for (i=0; i<256; i++) {
05045         buf[i] = cflag;
05046     }
05047 
05048     while ((c = trnext(&tr, enc)) != errc) {
05049         if (c < 256) {
05050             buf[c & 0xff] = !cflag;
05051         }
05052         else {
05053             VALUE key = UINT2NUM(c);
05054 
05055             if (!table) {
05056                 table = rb_hash_new();
05057                 if (cflag) {
05058                     ptable = *ctablep;
05059                     *ctablep = table;
05060                 }
05061                 else {
05062                     ptable = *tablep;
05063                     *tablep = table;
05064                 }
05065             }
05066             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05067                 rb_hash_aset(table, key, Qtrue);
05068             }
05069         }
05070     }
05071     for (i=0; i<256; i++) {
05072         stable[i] = stable[i] && buf[i];
05073     }
05074 }
05075 
05076 
05077 static int
05078 tr_find(unsigned int c, char table[256], VALUE del, VALUE nodel)
05079 {
05080     if (c < 256) {
05081         return table[c] != 0;
05082     }
05083     else {
05084         VALUE v = UINT2NUM(c);
05085 
05086         if (del && !NIL_P(rb_hash_lookup(del, v))) {
05087             if (!nodel || NIL_P(rb_hash_lookup(nodel, v))) {
05088                 return TRUE;
05089             }
05090         }
05091         return FALSE;
05092     }
05093 }
05094 
05095 /*
05096  *  call-seq:
05097  *     str.delete!([other_str]+)   -> str or nil
05098  *
05099  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
05100  *  <code>nil</code> if <i>str</i> was not modified.
05101  */
05102 
05103 static VALUE
05104 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05105 {
05106     char squeez[256];
05107     rb_encoding *enc = 0;
05108     char *s, *send, *t;
05109     VALUE del = 0, nodel = 0;
05110     int modify = 0;
05111     int i, ascompat, cr;
05112 
05113     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05114     if (argc < 1) {
05115         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05116     }
05117     for (i=0; i<argc; i++) {
05118         VALUE s = argv[i];
05119 
05120         StringValue(s);
05121         enc = rb_enc_check(str, s);
05122         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05123     }
05124 
05125     str_modify_keep_cr(str);
05126     ascompat = rb_enc_asciicompat(enc);
05127     s = t = RSTRING_PTR(str);
05128     send = RSTRING_END(str);
05129     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05130     while (s < send) {
05131         unsigned int c;
05132         int clen;
05133 
05134         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05135             if (squeez[c]) {
05136                 modify = 1;
05137             }
05138             else {
05139                 if (t != s) *t = c;
05140                 t++;
05141             }
05142             s++;
05143         }
05144         else {
05145             c = rb_enc_codepoint_len(s, send, &clen, enc);
05146 
05147             if (tr_find(c, squeez, del, nodel)) {
05148                 modify = 1;
05149             }
05150             else {
05151                 if (t != s) rb_enc_mbcput(c, t, enc);
05152                 t += clen;
05153                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05154             }
05155             s += clen;
05156         }
05157     }
05158     *t = '\0';
05159     STR_SET_LEN(str, t - RSTRING_PTR(str));
05160     ENC_CODERANGE_SET(str, cr);
05161 
05162     if (modify) return str;
05163     return Qnil;
05164 }
05165 
05166 
05167 /*
05168  *  call-seq:
05169  *     str.delete([other_str]+)   -> new_str
05170  *
05171  *  Returns a copy of <i>str</i> with all characters in the intersection of its
05172  *  arguments deleted. Uses the same rules for building the set of characters as
05173  *  <code>String#count</code>.
05174  *
05175  *     "hello".delete "l","lo"        #=> "heo"
05176  *     "hello".delete "lo"            #=> "he"
05177  *     "hello".delete "aeiou", "^e"   #=> "hell"
05178  *     "hello".delete "ej-m"          #=> "ho"
05179  */
05180 
05181 static VALUE
05182 rb_str_delete(int argc, VALUE *argv, VALUE str)
05183 {
05184     str = rb_str_dup(str);
05185     rb_str_delete_bang(argc, argv, str);
05186     return str;
05187 }
05188 
05189 
05190 /*
05191  *  call-seq:
05192  *     str.squeeze!([other_str]*)   -> str or nil
05193  *
05194  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
05195  *  <code>nil</code> if no changes were made.
05196  */
05197 
05198 static VALUE
05199 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05200 {
05201     char squeez[256];
05202     rb_encoding *enc = 0;
05203     VALUE del = 0, nodel = 0;
05204     char *s, *send, *t;
05205     int i, modify = 0;
05206     int ascompat, singlebyte = single_byte_optimizable(str);
05207     unsigned int save;
05208 
05209     if (argc == 0) {
05210         enc = STR_ENC_GET(str);
05211     }
05212     else {
05213         for (i=0; i<argc; i++) {
05214             VALUE s = argv[i];
05215 
05216             StringValue(s);
05217             enc = rb_enc_check(str, s);
05218             if (singlebyte && !single_byte_optimizable(s))
05219                 singlebyte = 0;
05220             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05221         }
05222     }
05223 
05224     str_modify_keep_cr(str);
05225     s = t = RSTRING_PTR(str);
05226     if (!s || RSTRING_LEN(str) == 0) return Qnil;
05227     send = RSTRING_END(str);
05228     save = -1;
05229     ascompat = rb_enc_asciicompat(enc);
05230 
05231     if (singlebyte) {
05232         while (s < send) {
05233             unsigned int c = *(unsigned char*)s++;
05234             if (c != save || (argc > 0 && !squeez[c])) {
05235                 *t++ = save = c;
05236             }
05237         }
05238     } else {
05239         while (s < send) {
05240             unsigned int c;
05241             int clen;
05242 
05243             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05244                 if (c != save || (argc > 0 && !squeez[c])) {
05245                     *t++ = save = c;
05246                 }
05247                 s++;
05248             }
05249             else {
05250                 c = rb_enc_codepoint_len(s, send, &clen, enc);
05251 
05252                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05253                     if (t != s) rb_enc_mbcput(c, t, enc);
05254                     save = c;
05255                     t += clen;
05256                 }
05257                 s += clen;
05258             }
05259         }
05260     }
05261 
05262     *t = '\0';
05263     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05264         STR_SET_LEN(str, t - RSTRING_PTR(str));
05265         modify = 1;
05266     }
05267 
05268     if (modify) return str;
05269     return Qnil;
05270 }
05271 
05272 
05273 /*
05274  *  call-seq:
05275  *     str.squeeze([other_str]*)    -> new_str
05276  *
05277  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
05278  *  procedure described for <code>String#count</code>. Returns a new string
05279  *  where runs of the same character that occur in this set are replaced by a
05280  *  single character. If no arguments are given, all runs of identical
05281  *  characters are replaced by a single character.
05282  *
05283  *     "yellow moon".squeeze                  #=> "yelow mon"
05284  *     "  now   is  the".squeeze(" ")         #=> " now is the"
05285  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
05286  */
05287 
05288 static VALUE
05289 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05290 {
05291     str = rb_str_dup(str);
05292     rb_str_squeeze_bang(argc, argv, str);
05293     return str;
05294 }
05295 
05296 
05297 /*
05298  *  call-seq:
05299  *     str.tr_s!(from_str, to_str)   -> str or nil
05300  *
05301  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
05302  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
05303  */
05304 
05305 static VALUE
05306 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05307 {
05308     return tr_trans(str, src, repl, 1);
05309 }
05310 
05311 
05312 /*
05313  *  call-seq:
05314  *     str.tr_s(from_str, to_str)   -> new_str
05315  *
05316  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
05317  *  then removes duplicate characters in regions that were affected by the
05318  *  translation.
05319  *
05320  *     "hello".tr_s('l', 'r')     #=> "hero"
05321  *     "hello".tr_s('el', '*')    #=> "h*o"
05322  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
05323  */
05324 
05325 static VALUE
05326 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05327 {
05328     str = rb_str_dup(str);
05329     tr_trans(str, src, repl, 1);
05330     return str;
05331 }
05332 
05333 
05334 /*
05335  *  call-seq:
05336  *     str.count([other_str]+)   -> fixnum
05337  *
05338  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
05339  *  intersection of these sets defines the characters to count in
05340  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
05341  *  negated. The sequence c1--c2 means all characters between c1 and c2.
05342  *
05343  *     a = "hello world"
05344  *     a.count "lo"            #=> 5
05345  *     a.count "lo", "o"       #=> 2
05346  *     a.count "hello", "^l"   #=> 4
05347  *     a.count "ej-m"          #=> 4
05348  */
05349 
05350 static VALUE
05351 rb_str_count(int argc, VALUE *argv, VALUE str)
05352 {
05353     char table[256];
05354     rb_encoding *enc = 0;
05355     VALUE del = 0, nodel = 0;
05356     char *s, *send;
05357     int i;
05358     int ascompat;
05359 
05360     if (argc < 1) {
05361         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05362     }
05363     for (i=0; i<argc; i++) {
05364         VALUE tstr = argv[i];
05365         unsigned char c;
05366 
05367         StringValue(tstr);
05368         enc = rb_enc_check(str, tstr);
05369         if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05370             (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05371             int n = 0;
05372 
05373             s = RSTRING_PTR(str);
05374             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05375             send = RSTRING_END(str);
05376             while (s < send) {
05377                 if (*(unsigned char*)s++ == c) n++;
05378             }
05379             return INT2NUM(n);
05380         }
05381         tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05382     }
05383 
05384     s = RSTRING_PTR(str);
05385     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05386     send = RSTRING_END(str);
05387     ascompat = rb_enc_asciicompat(enc);
05388     i = 0;
05389     while (s < send) {
05390         unsigned int c;
05391         int clen;
05392 
05393         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05394             clen = 1;
05395             if (table[c]) {
05396                 i++;
05397             }
05398             s++;
05399         }
05400         else {
05401             c = rb_enc_codepoint_len(s, send, &clen, enc);
05402             if (tr_find(c, table, del, nodel)) {
05403                 i++;
05404             }
05405             s += clen;
05406         }
05407     }
05408 
05409     return INT2NUM(i);
05410 }
05411 
/*
 * Byte-indexed whitespace flags: 1 for HT, LF, VT, FF, CR (0x09-0x0d) and
 * SPACE (0x20), 0 for every other byte value. Entries after the last
 * explicit initializer are zero-initialized by the language.
 */
static const char isspacetable[256] = {
    /* 0x00 - 0x07 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0f */ 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1f */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20        */ 1
};

/* Locale-independent whitespace test on a single byte value. */
#define ascii_isspace(c) (isspacetable[(unsigned char)(c)])
05432 
05433 /*
05434  *  call-seq:
05435  *     str.split(pattern=$;, [limit])   -> anArray
05436  *
05437  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
05438  *  of these substrings.
05439  *
05440  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
05441  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
05442  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
05443  *  of contiguous whitespace characters ignored.
05444  *
05445  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
05446  *  pattern matches. Whenever the pattern matches a zero-length string,
05447  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
05448  *  groups, the respective matches will be returned in the array as well.
05449  *
05450  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
05451  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
05452  *  split on whitespace as if ` ' were specified.
05453  *
05454  *  If the <i>limit</i> parameter is omitted, trailing null fields are
05455  *  suppressed. If <i>limit</i> is a positive number, at most that number of
05456  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
05457  *  string is returned as the only entry in an array). If negative, there is no
05458  *  limit to the number of fields returned, and trailing null fields are not
05459  *  suppressed.
05460  *
05461  *     " now's  the time".split        #=> ["now's", "the", "time"]
05462  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
05463  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
05464  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
05465  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
05466  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
05467  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
05468  *
05469  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
05470  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
05471  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
05472  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
05473  */
05474 
05475 static VALUE
05476 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05477 {
05478     rb_encoding *enc;
05479     VALUE spat;
05480     VALUE limit;
05481     enum {awk, string, regexp} split_type;
05482     long beg, end, i = 0;
05483     int lim = 0;
05484     VALUE result, tmp;
05485 
05486     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05487         lim = NUM2INT(limit);
05488         if (lim <= 0) limit = Qnil;
05489         else if (lim == 1) {
05490             if (RSTRING_LEN(str) == 0)
05491                 return rb_ary_new2(0);
05492             return rb_ary_new3(1, str);
05493         }
05494         i = 1;
05495     }
05496 
05497     enc = STR_ENC_GET(str);
05498     if (NIL_P(spat)) {
05499         if (!NIL_P(rb_fs)) {
05500             spat = rb_fs;
05501             goto fs_set;
05502         }
05503         split_type = awk;
05504     }
05505     else {
05506       fs_set:
05507         if (TYPE(spat) == T_STRING) {
05508             rb_encoding *enc2 = STR_ENC_GET(spat);
05509 
05510             split_type = string;
05511             if (RSTRING_LEN(spat) == 0) {
05512                 /* Special case - split into chars */
05513                 spat = rb_reg_regcomp(spat);
05514                 split_type = regexp;
05515             }
05516             else if (rb_enc_asciicompat(enc2) == 1) {
05517                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05518                     split_type = awk;
05519                 }
05520             }
05521             else {
05522                 int l;
05523                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05524                     RSTRING_LEN(spat) == l) {
05525                     split_type = awk;
05526                 }
05527             }
05528         }
05529         else {
05530             spat = get_pat(spat, 1);
05531             split_type = regexp;
05532         }
05533     }
05534 
05535     result = rb_ary_new();
05536     beg = 0;
05537     if (split_type == awk) {
05538         char *ptr = RSTRING_PTR(str);
05539         char *eptr = RSTRING_END(str);
05540         char *bptr = ptr;
05541         int skip = 1;
05542         unsigned int c;
05543 
05544         end = beg;
05545         if (is_ascii_string(str)) {
05546             while (ptr < eptr) {
05547                 c = (unsigned char)*ptr++;
05548                 if (skip) {
05549                     if (ascii_isspace(c)) {
05550                         beg = ptr - bptr;
05551                     }
05552                     else {
05553                         end = ptr - bptr;
05554                         skip = 0;
05555                         if (!NIL_P(limit) && lim <= i) break;
05556                     }
05557                 }
05558                 else if (ascii_isspace(c)) {
05559                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05560                     skip = 1;
05561                     beg = ptr - bptr;
05562                     if (!NIL_P(limit)) ++i;
05563                 }
05564                 else {
05565                     end = ptr - bptr;
05566                 }
05567             }
05568         }
05569         else {
05570             while (ptr < eptr) {
05571                 int n;
05572 
05573                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05574                 ptr += n;
05575                 if (skip) {
05576                     if (rb_isspace(c)) {
05577                         beg = ptr - bptr;
05578                     }
05579                     else {
05580                         end = ptr - bptr;
05581                         skip = 0;
05582                         if (!NIL_P(limit) && lim <= i) break;
05583                     }
05584                 }
05585                 else if (rb_isspace(c)) {
05586                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05587                     skip = 1;
05588                     beg = ptr - bptr;
05589                     if (!NIL_P(limit)) ++i;
05590                 }
05591                 else {
05592                     end = ptr - bptr;
05593                 }
05594             }
05595         }
05596     }
05597     else if (split_type == string) {
05598         char *ptr = RSTRING_PTR(str);
05599         char *temp = ptr;
05600         char *eptr = RSTRING_END(str);
05601         char *sptr = RSTRING_PTR(spat);
05602         long slen = RSTRING_LEN(spat);
05603 
05604         if (is_broken_string(str)) {
05605             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05606         }
05607         if (is_broken_string(spat)) {
05608             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05609         }
05610         enc = rb_enc_check(str, spat);
05611         while (ptr < eptr &&
05612                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05613             /* Check we are at the start of a char */
05614             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05615             if (t != ptr + end) {
05616                 ptr = t;
05617                 continue;
05618             }
05619             rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05620             ptr += end + slen;
05621             if (!NIL_P(limit) && lim <= ++i) break;
05622         }
05623         beg = ptr - temp;
05624     }
05625     else {
05626         char *ptr = RSTRING_PTR(str);
05627         long len = RSTRING_LEN(str);
05628         long start = beg;
05629         long idx;
05630         int last_null = 0;
05631         struct re_registers *regs;
05632 
05633         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05634             regs = RMATCH_REGS(rb_backref_get());
05635             if (start == end && BEG(0) == END(0)) {
05636                 if (!ptr) {
05637                     rb_ary_push(result, str_new_empty(str));
05638                     break;
05639                 }
05640                 else if (last_null == 1) {
05641                     rb_ary_push(result, rb_str_subseq(str, beg,
05642                                                       rb_enc_fast_mbclen(ptr+beg,
05643                                                                          ptr+len,
05644                                                                          enc)));
05645                     beg = start;
05646                 }
05647                 else {
05648                     if (ptr+start == ptr+len)
05649                         start++;
05650                     else
05651                         start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05652                     last_null = 1;
05653                     continue;
05654                 }
05655             }
05656             else {
05657                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05658                 beg = start = END(0);
05659             }
05660             last_null = 0;
05661 
05662             for (idx=1; idx < regs->num_regs; idx++) {
05663                 if (BEG(idx) == -1) continue;
05664                 if (BEG(idx) == END(idx))
05665                     tmp = str_new_empty(str);
05666                 else
05667                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05668                 rb_ary_push(result, tmp);
05669             }
05670             if (!NIL_P(limit) && lim <= ++i) break;
05671         }
05672     }
05673     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05674         if (RSTRING_LEN(str) == beg)
05675             tmp = str_new_empty(str);
05676         else
05677             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05678         rb_ary_push(result, tmp);
05679     }
05680     if (NIL_P(limit) && lim == 0) {
05681         long len;
05682         while ((len = RARRAY_LEN(result)) > 0 &&
05683                (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05684             rb_ary_pop(result);
05685     }
05686 
05687     return result;
05688 }
05689 
05690 VALUE
05691 rb_str_split(VALUE str, const char *sep0)
05692 {
05693     VALUE sep;
05694 
05695     StringValue(str);
05696     sep = rb_str_new2(sep0);
05697     return rb_str_split_m(1, &sep, str);
05698 }
05699 
05700 
05701 /*
05702  *  call-seq:
05703  *     str.each_line(separator=$/) {|substr| block }   -> str
05704  *     str.each_line(separator=$/)                     -> an_enumerator
05705  *
05706  *     str.lines(separator=$/) {|substr| block }       -> str
05707  *     str.lines(separator=$/)                         -> an_enumerator
05708  *
05709  *  Splits <i>str</i> using the supplied parameter as the record separator
05710  *  (<code>$/</code> by default), passing each substring in turn to the supplied
05711  *  block. If a zero-length record separator is supplied, the string is split
05712  *  into paragraphs delimited by multiple successive newlines.
05713  *
05714  *  If no block is given, an enumerator is returned instead.
05715  *
05716  *     print "Example one\n"
05717  *     "hello\nworld".each_line {|s| p s}
05718  *     print "Example two\n"
05719  *     "hello\nworld".each_line('l') {|s| p s}
05720  *     print "Example three\n"
05721  *     "hello\n\n\nworld".each_line('') {|s| p s}
05722  *
05723  *  <em>produces:</em>
05724  *
05725  *     Example one
05726  *     "hello\n"
05727  *     "world"
05728  *     Example two
05729  *     "hel"
05730  *     "l"
05731  *     "o\nworl"
05732  *     "d"
05733  *     Example three
05734  *     "hello\n\n\n"
05735  *     "world"
05736  */
05737 
05738 static VALUE
05739 rb_str_each_line(int argc, VALUE *argv, VALUE str)
05740 {
05741     rb_encoding *enc;
05742     VALUE rs;
05743     unsigned int newline;
05744     const char *p, *pend, *s, *ptr;
05745     long len, rslen;
05746     VALUE line;
05747     int n;
05748     VALUE orig = str;
05749 
05750     if (argc == 0) {
05751         rs = rb_rs;
05752     }
05753     else {
05754         rb_scan_args(argc, argv, "01", &rs);
05755     }
05756     RETURN_ENUMERATOR(str, argc, argv);
05757     if (NIL_P(rs)) {
05758         rb_yield(str);
05759         return orig;
05760     }
05761     str = rb_str_new4(str);
05762     ptr = p = s = RSTRING_PTR(str);
05763     pend = p + RSTRING_LEN(str);
05764     len = RSTRING_LEN(str);
05765     StringValue(rs);
05766     if (rs == rb_default_rs) {
05767         enc = rb_enc_get(str);
05768         while (p < pend) {
05769             char *p0;
05770 
05771             p = memchr(p, '\n', pend - p);
05772             if (!p) break;
05773             p0 = rb_enc_left_char_head(s, p, pend, enc);
05774             if (!rb_enc_is_newline(p0, pend, enc)) {
05775                 p++;
05776                 continue;
05777             }
05778             p = p0 + rb_enc_mbclen(p0, pend, enc);
05779             line = rb_str_new5(str, s, p - s);
05780             OBJ_INFECT(line, str);
05781             rb_enc_cr_str_copy_for_substr(line, str);
05782             rb_yield(line);
05783             str_mod_check(str, ptr, len);
05784             s = p;
05785         }
05786         goto finish;
05787     }
05788 
05789     enc = rb_enc_check(str, rs);
05790     rslen = RSTRING_LEN(rs);
05791     if (rslen == 0) {
05792         newline = '\n';
05793     }
05794     else {
05795         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
05796     }
05797 
05798     while (p < pend) {
05799         unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
05800 
05801       again:
05802         if (rslen == 0 && c == newline) {
05803             p += n;
05804             if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
05805                 goto again;
05806             }
05807             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
05808                 p += n;
05809             }
05810             p -= n;
05811         }
05812         if (c == newline &&
05813             (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {
05814             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
05815             OBJ_INFECT(line, str);
05816             rb_enc_cr_str_copy_for_substr(line, str);
05817             rb_yield(line);
05818             str_mod_check(str, ptr, len);
05819             s = p + (rslen ? rslen : n);
05820         }
05821         p += n;
05822     }
05823 
05824   finish:
05825     if (s != pend) {
05826         line = rb_str_new5(str, s, pend - s);
05827         OBJ_INFECT(line, str);
05828         rb_enc_cr_str_copy_for_substr(line, str);
05829         rb_yield(line);
05830     }
05831 
05832     return orig;
05833 }
05834 
05835 
05836 /*
05837  *  call-seq:
05838  *     str.bytes {|fixnum| block }        -> str
05839  *     str.bytes                          -> an_enumerator
05840  *
05841  *     str.each_byte {|fixnum| block }    -> str
05842  *     str.each_byte                      -> an_enumerator
05843  *
05844  *  Passes each byte in <i>str</i> to the given block, or returns
05845  *  an enumerator if no block is given.
05846  *
05847  *     "hello".each_byte {|c| print c, ' ' }
05848  *
05849  *  <em>produces:</em>
05850  *
05851  *     104 101 108 108 111
05852  */
05853 
05854 static VALUE
05855 rb_str_each_byte(VALUE str)
05856 {
05857     long i;
05858 
05859     RETURN_ENUMERATOR(str, 0, 0);
05860     for (i=0; i<RSTRING_LEN(str); i++) {
05861         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
05862     }
05863     return str;
05864 }
05865 
05866 
05867 /*
05868  *  call-seq:
05869  *     str.chars {|cstr| block }        -> str
05870  *     str.chars                        -> an_enumerator
05871  *
05872  *     str.each_char {|cstr| block }    -> str
05873  *     str.each_char                    -> an_enumerator
05874  *
05875  *  Passes each character in <i>str</i> to the given block, or returns
05876  *  an enumerator if no block is given.
05877  *
05878  *     "hello".each_char {|c| print c, ' ' }
05879  *
05880  *  <em>produces:</em>
05881  *
05882  *     h e l l o
05883  */
05884 
05885 static VALUE
05886 rb_str_each_char(VALUE str)
05887 {
05888     VALUE orig = str;
05889     long i, len, n;
05890     const char *ptr;
05891     rb_encoding *enc;
05892 
05893     RETURN_ENUMERATOR(str, 0, 0);
05894     str = rb_str_new4(str);
05895     ptr = RSTRING_PTR(str);
05896     len = RSTRING_LEN(str);
05897     enc = rb_enc_get(str);
05898     switch (ENC_CODERANGE(str)) {
05899       case ENC_CODERANGE_VALID:
05900       case ENC_CODERANGE_7BIT:
05901         for (i = 0; i < len; i += n) {
05902             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
05903             rb_yield(rb_str_subseq(str, i, n));
05904         }
05905         break;
05906       default:
05907         for (i = 0; i < len; i += n) {
05908             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
05909             rb_yield(rb_str_subseq(str, i, n));
05910         }
05911     }
05912     return orig;
05913 }
05914 
05915 /*
05916  *  call-seq:
05917  *     str.codepoints {|integer| block }        -> str
05918  *     str.codepoints                           -> an_enumerator
05919  *
05920  *     str.each_codepoint {|integer| block }    -> str
05921  *     str.each_codepoint                       -> an_enumerator
05922  *
05923  *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
05924  *  also known as a <i>codepoint</i> when applied to Unicode strings to the
05925  *  given block.
05926  *
05927  *  If no block is given, an enumerator is returned instead.
05928  *
05929  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
05930  *
05931  *  <em>produces:</em>
05932  *
05933  *     104 101 108 108 111 1593
05934  */
05935 
05936 static VALUE
05937 rb_str_each_codepoint(VALUE str)
05938 {
05939     VALUE orig = str;
05940     long len;
05941     int n;
05942     unsigned int c;
05943     const char *ptr, *end;
05944     rb_encoding *enc;
05945 
05946     if (single_byte_optimizable(str)) return rb_str_each_byte(str);
05947     RETURN_ENUMERATOR(str, 0, 0);
05948     str = rb_str_new4(str);
05949     ptr = RSTRING_PTR(str);
05950     len = RSTRING_LEN(str);
05951     end = RSTRING_END(str);
05952     enc = STR_ENC_GET(str);
05953     while (ptr < end) {
05954         c = rb_enc_codepoint_len(ptr, end, &n, enc);
05955         rb_yield(UINT2NUM(c));
05956         ptr += n;
05957     }
05958     return orig;
05959 }
05960 
05961 static long
05962 chopped_length(VALUE str)
05963 {
05964     rb_encoding *enc = STR_ENC_GET(str);
05965     const char *p, *p2, *beg, *end;
05966 
05967     beg = RSTRING_PTR(str);
05968     end = beg + RSTRING_LEN(str);
05969     if (beg > end) return 0;
05970     p = rb_enc_prev_char(beg, end, end, enc);
05971     if (!p) return 0;
05972     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
05973         p2 = rb_enc_prev_char(beg, p, end, enc);
05974         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
05975     }
05976     return p - beg;
05977 }
05978 
05979 /*
05980  *  call-seq:
05981  *     str.chop!   -> str or nil
05982  *
05983  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
05984  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
05985  *  <code>String#chomp!</code>.
05986  */
05987 
05988 static VALUE
05989 rb_str_chop_bang(VALUE str)
05990 {
05991     str_modify_keep_cr(str);
05992     if (RSTRING_LEN(str) > 0) {
05993         long len;
05994         len = chopped_length(str);
05995         STR_SET_LEN(str, len);
05996         RSTRING_PTR(str)[len] = '\0';
05997         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
05998             ENC_CODERANGE_CLEAR(str);
05999         }
06000         return str;
06001     }
06002     return Qnil;
06003 }
06004 
06005 
06006 /*
06007  *  call-seq:
06008  *     str.chop   -> new_str
06009  *
06010  *  Returns a new <code>String</code> with the last character removed.  If the
06011  *  string ends with <code>\r\n</code>, both characters are removed. Applying
06012  *  <code>chop</code> to an empty string returns an empty
06013  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
06014  *  the string unchanged if it doesn't end in a record separator.
06015  *
06016  *     "string\r\n".chop   #=> "string"
06017  *     "string\n\r".chop   #=> "string\n"
06018  *     "string\n".chop     #=> "string"
06019  *     "string".chop       #=> "strin"
06020  *     "x".chop.chop       #=> ""
06021  */
06022 
06023 static VALUE
06024 rb_str_chop(VALUE str)
06025 {
06026     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06027     rb_enc_cr_str_copy_for_substr(str2, str);
06028     OBJ_INFECT(str2, str);
06029     return str2;
06030 }
06031 
06032 
06033 /*
06034  *  call-seq:
06035  *     str.chomp!(separator=$/)   -> str or nil
06036  *
06037  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
06038  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
06039  */
06040 
06041 static VALUE
06042 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06043 {
06044     rb_encoding *enc;
06045     VALUE rs;
06046     int newline;
06047     char *p, *pp, *e;
06048     long len, rslen;
06049 
06050     str_modify_keep_cr(str);
06051     len = RSTRING_LEN(str);
06052     if (len == 0) return Qnil;
06053     p = RSTRING_PTR(str);
06054     e = p + len;
06055     if (argc == 0) {
06056         rs = rb_rs;
06057         if (rs == rb_default_rs) {
06058           smart_chomp:
06059             enc = rb_enc_get(str);
06060             if (rb_enc_mbminlen(enc) > 1) {
06061                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06062                 if (rb_enc_is_newline(pp, e, enc)) {
06063                     e = pp;
06064                 }
06065                 pp = e - rb_enc_mbminlen(enc);
06066                 if (pp >= p) {
06067                     pp = rb_enc_left_char_head(p, pp, e, enc);
06068                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06069                         e = pp;
06070                     }
06071                 }
06072                 if (e == RSTRING_END(str)) {
06073                     return Qnil;
06074                 }
06075                 len = e - RSTRING_PTR(str);
06076                 STR_SET_LEN(str, len);
06077             }
06078             else {
06079                 if (RSTRING_PTR(str)[len-1] == '\n') {
06080                     STR_DEC_LEN(str);
06081                     if (RSTRING_LEN(str) > 0 &&
06082                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06083                         STR_DEC_LEN(str);
06084                     }
06085                 }
06086                 else if (RSTRING_PTR(str)[len-1] == '\r') {
06087                     STR_DEC_LEN(str);
06088                 }
06089                 else {
06090                     return Qnil;
06091                 }
06092             }
06093             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06094             return str;
06095         }
06096     }
06097     else {
06098         rb_scan_args(argc, argv, "01", &rs);
06099     }
06100     if (NIL_P(rs)) return Qnil;
06101     StringValue(rs);
06102     rslen = RSTRING_LEN(rs);
06103     if (rslen == 0) {
06104         while (len>0 && p[len-1] == '\n') {
06105             len--;
06106             if (len>0 && p[len-1] == '\r')
06107                 len--;
06108         }
06109         if (len < RSTRING_LEN(str)) {
06110             STR_SET_LEN(str, len);
06111             RSTRING_PTR(str)[len] = '\0';
06112             return str;
06113         }
06114         return Qnil;
06115     }
06116     if (rslen > len) return Qnil;
06117     newline = RSTRING_PTR(rs)[rslen-1];
06118     if (rslen == 1 && newline == '\n')
06119         goto smart_chomp;
06120 
06121     enc = rb_enc_check(str, rs);
06122     if (is_broken_string(rs)) {
06123         return Qnil;
06124     }
06125     pp = e - rslen;
06126     if (p[len-1] == newline &&
06127         (rslen <= 1 ||
06128          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
06129         if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06130             return Qnil;
06131         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06132             ENC_CODERANGE_CLEAR(str);
06133         }
06134         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06135         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06136         return str;
06137     }
06138     return Qnil;
06139 }
06140 
06141 
06142 /*
06143  *  call-seq:
06144  *     str.chomp(separator=$/)   -> new_str
06145  *
06146  *  Returns a new <code>String</code> with the given record separator removed
06147  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
06148  *  changed from the default Ruby record separator, then <code>chomp</code> also
06149  *  removes carriage return characters (that is it will remove <code>\n</code>,
06150  *  <code>\r</code>, and <code>\r\n</code>).
06151  *
06152  *     "hello".chomp            #=> "hello"
06153  *     "hello\n".chomp          #=> "hello"
06154  *     "hello\r\n".chomp        #=> "hello"
06155  *     "hello\n\r".chomp        #=> "hello\n"
06156  *     "hello\r".chomp          #=> "hello"
06157  *     "hello \n there".chomp   #=> "hello \n there"
06158  *     "hello".chomp("llo")     #=> "he"
06159  */
06160 
06161 static VALUE
06162 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06163 {
06164     str = rb_str_dup(str);
06165     rb_str_chomp_bang(argc, argv, str);
06166     return str;
06167 }
06168 
06169 /*
06170  *  call-seq:
06171  *     str.lstrip!   -> self or nil
06172  *
06173  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
06174  *  change was made. See also <code>String#rstrip!</code> and
06175  *  <code>String#strip!</code>.
06176  *
06177  *     "  hello  ".lstrip!  #=> "hello  "
06178  *     "hello".lstrip!      #=> nil
06179  */
06180 
06181 static VALUE
06182 rb_str_lstrip_bang(VALUE str)
06183 {
06184     rb_encoding *enc;
06185     char *s, *t, *e;
06186 
06187     str_modify_keep_cr(str);
06188     enc = STR_ENC_GET(str);
06189     s = RSTRING_PTR(str);
06190     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06191     e = t = RSTRING_END(str);
06192     /* remove spaces at head */
06193     while (s < e) {
06194         int n;
06195         unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06196 
06197         if (!rb_isspace(cc)) break;
06198         s += n;
06199     }
06200 
06201     if (s > RSTRING_PTR(str)) {
06202         STR_SET_LEN(str, t-s);
06203         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06204         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06205         return str;
06206     }
06207     return Qnil;
06208 }
06209 
06210 
06211 /*
06212  *  call-seq:
06213  *     str.lstrip   -> new_str
06214  *
06215  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
06216  *  <code>String#rstrip</code> and <code>String#strip</code>.
06217  *
06218  *     "  hello  ".lstrip   #=> "hello  "
06219  *     "hello".lstrip       #=> "hello"
06220  */
06221 
06222 static VALUE
06223 rb_str_lstrip(VALUE str)
06224 {
06225     str = rb_str_dup(str);
06226     rb_str_lstrip_bang(str);
06227     return str;
06228 }
06229 
06230 
06231 /*
06232  *  call-seq:
06233  *     str.rstrip!   -> self or nil
06234  *
06235  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
06236  *  no change was made. See also <code>String#lstrip!</code> and
06237  *  <code>String#strip!</code>.
06238  *
06239  *     "  hello  ".rstrip!  #=> "  hello"
06240  *     "hello".rstrip!      #=> nil
06241  */
06242 
06243 static VALUE
06244 rb_str_rstrip_bang(VALUE str)
06245 {
06246     rb_encoding *enc;
06247     char *s, *t, *e;
06248 
06249     str_modify_keep_cr(str);
06250     enc = STR_ENC_GET(str);
06251     rb_str_check_dummy_enc(enc);
06252     s = RSTRING_PTR(str);
06253     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06254     t = e = RSTRING_END(str);
06255 
06256     /* remove trailing spaces or '\0's */
06257     if (single_byte_optimizable(str)) {
06258         unsigned char c;
06259         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06260     }
06261     else {
06262         char *tp;
06263 
06264         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06265             unsigned int c = rb_enc_codepoint(tp, e, enc);
06266             if (c && !rb_isspace(c)) break;
06267             t = tp;
06268         }
06269     }
06270     if (t < e) {
06271         long len = t-RSTRING_PTR(str);
06272 
06273         STR_SET_LEN(str, len);
06274         RSTRING_PTR(str)[len] = '\0';
06275         return str;
06276     }
06277     return Qnil;
06278 }
06279 
06280 
06281 /*
06282  *  call-seq:
06283  *     str.rstrip   -> new_str
06284  *
06285  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
06286  *  <code>String#lstrip</code> and <code>String#strip</code>.
06287  *
06288  *     "  hello  ".rstrip   #=> "  hello"
06289  *     "hello".rstrip       #=> "hello"
06290  */
06291 
06292 static VALUE
06293 rb_str_rstrip(VALUE str)
06294 {
06295     str = rb_str_dup(str);
06296     rb_str_rstrip_bang(str);
06297     return str;
06298 }
06299 
06300 
06301 /*
06302  *  call-seq:
06303  *     str.strip!   -> str or nil
06304  *
06305  *  Removes leading and trailing whitespace from <i>str</i>. Returns
06306  *  <code>nil</code> if <i>str</i> was not altered.
06307  */
06308 
06309 static VALUE
06310 rb_str_strip_bang(VALUE str)
06311 {
06312     VALUE l = rb_str_lstrip_bang(str);
06313     VALUE r = rb_str_rstrip_bang(str);
06314 
06315     if (NIL_P(l) && NIL_P(r)) return Qnil;
06316     return str;
06317 }
06318 
06319 
06320 /*
06321  *  call-seq:
06322  *     str.strip   -> new_str
06323  *
06324  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
06325  *
06326  *     "    hello    ".strip   #=> "hello"
06327  *     "\tgoodbye\r\n".strip   #=> "goodbye"
06328  */
06329 
06330 static VALUE
06331 rb_str_strip(VALUE str)
06332 {
06333     str = rb_str_dup(str);
06334     rb_str_strip_bang(str);
06335     return str;
06336 }
06337 
06338 static VALUE
06339 scan_once(VALUE str, VALUE pat, long *start)
06340 {
06341     VALUE result, match;
06342     struct re_registers *regs;
06343     int i;
06344 
06345     if (rb_reg_search(pat, str, *start, 0) >= 0) {
06346         match = rb_backref_get();
06347         regs = RMATCH_REGS(match);
06348         if (BEG(0) == END(0)) {
06349             rb_encoding *enc = STR_ENC_GET(str);
06350             /*
06351              * Always consume at least one character of the input string
06352              */
06353             if (RSTRING_LEN(str) > END(0))
06354                 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06355                                                    RSTRING_END(str), enc);
06356             else
06357                 *start = END(0)+1;
06358         }
06359         else {
06360             *start = END(0);
06361         }
06362         if (regs->num_regs == 1) {
06363             return rb_reg_nth_match(0, match);
06364         }
06365         result = rb_ary_new2(regs->num_regs);
06366         for (i=1; i < regs->num_regs; i++) {
06367             rb_ary_push(result, rb_reg_nth_match(i, match));
06368         }
06369 
06370         return result;
06371     }
06372     return Qnil;
06373 }
06374 
06375 
06376 /*
06377  *  call-seq:
06378  *     str.scan(pattern)                         -> array
06379  *     str.scan(pattern) {|match, ...| block }   -> str
06380  *
06381  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
06382  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
06383  *  generated and either added to the result array or passed to the block. If
06384  *  the pattern contains no groups, each individual result consists of the
06385  *  matched string, <code>$&</code>.  If the pattern contains groups, each
06386  *  individual result is itself an array containing one entry per group.
06387  *
06388  *     a = "cruel world"
06389  *     a.scan(/\w+/)        #=> ["cruel", "world"]
06390  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
06391  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
06392  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
06393  *
06394  *  And the block form:
06395  *
06396  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
06397  *     print "\n"
06398  *     a.scan(/(.)(.)/) {|x,y| print y, x }
06399  *     print "\n"
06400  *
06401  *  <em>produces:</em>
06402  *
06403  *     <<cruel>> <<world>>
06404  *     rceu lowlr
06405  */
06406 
06407 static VALUE
06408 rb_str_scan(VALUE str, VALUE pat)
06409 {
06410     VALUE result;
06411     long start = 0;
06412     long last = -1, prev = 0;
06413     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06414 
06415     pat = get_pat(pat, 1);
06416     if (!rb_block_given_p()) {
06417         VALUE ary = rb_ary_new();
06418 
06419         while (!NIL_P(result = scan_once(str, pat, &start))) {
06420             last = prev;
06421             prev = start;
06422             rb_ary_push(ary, result);
06423         }
06424         if (last >= 0) rb_reg_search(pat, str, last, 0);
06425         return ary;
06426     }
06427 
06428     while (!NIL_P(result = scan_once(str, pat, &start))) {
06429         last = prev;
06430         prev = start;
06431         rb_yield(result);
06432         str_mod_check(str, p, len);
06433     }
06434     if (last >= 0) rb_reg_search(pat, str, last, 0);
06435     return str;
06436 }
06437 
06438 
06439 /*
06440  *  call-seq:
06441  *     str.hex   -> integer
06442  *
06443  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
06444  *  (with an optional sign and an optional <code>0x</code>) and returns the
06445  *  corresponding number. Zero is returned on error.
06446  *
06447  *     "0x0a".hex     #=> 10
06448  *     "-1234".hex    #=> -4660
06449  *     "0".hex        #=> 0
06450  *     "wombat".hex   #=> 0
06451  */
06452 
06453 static VALUE
06454 rb_str_hex(VALUE str)
06455 {
06456     rb_encoding *enc = rb_enc_get(str);
06457 
06458     if (!rb_enc_asciicompat(enc)) {
06459         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06460     }
06461     return rb_str_to_inum(str, 16, FALSE);
06462 }
06463 
06464 
06465 /*
06466  *  call-seq:
06467  *     str.oct   -> integer
06468  *
06469  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
06470  *  optional sign) and returns the corresponding number.  Returns 0 if the
06471  *  conversion fails.
06472  *
06473  *     "123".oct       #=> 83
06474  *     "-377".oct      #=> -255
06475  *     "bad".oct       #=> 0
06476  *     "0377bad".oct   #=> 255
06477  */
06478 
06479 static VALUE
06480 rb_str_oct(VALUE str)
06481 {
06482     rb_encoding *enc = rb_enc_get(str);
06483 
06484     if (!rb_enc_asciicompat(enc)) {
06485         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06486     }
06487     return rb_str_to_inum(str, -8, FALSE);
06488 }
06489 
06490 
06491 /*
06492  *  call-seq:
06493  *     str.crypt(other_str)   -> new_str
06494  *
06495  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
06496  *  library function <code>crypt</code>. The argument is the salt string, which
06497  *  should be two characters long, each character drawn from
06498  *  <code>[a-zA-Z0-9./]</code>.
06499  */
06500 
06501 static VALUE
06502 rb_str_crypt(VALUE str, VALUE salt)
06503 {
06504     extern char *crypt(const char *, const char *);
06505     VALUE result;
06506     const char *s, *saltp;
06507 #ifdef BROKEN_CRYPT
06508     char salt_8bit_clean[3];
06509 #endif
06510 
06511     StringValue(salt);
06512     if (RSTRING_LEN(salt) < 2)
06513         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06514 
06515     s = RSTRING_PTR(str);
06516     if (!s) s = "";
06517     saltp = RSTRING_PTR(salt);
06518 #ifdef BROKEN_CRYPT
06519     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06520         salt_8bit_clean[0] = saltp[0] & 0x7f;
06521         salt_8bit_clean[1] = saltp[1] & 0x7f;
06522         salt_8bit_clean[2] = '\0';
06523         saltp = salt_8bit_clean;
06524     }
06525 #endif
06526     result = rb_str_new2(crypt(s, saltp));
06527     OBJ_INFECT(result, str);
06528     OBJ_INFECT(result, salt);
06529     return result;
06530 }
06531 
06532 
06533 /*
06534  *  call-seq:
06535  *     str.intern   -> symbol
06536  *     str.to_sym   -> symbol
06537  *
06538  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
06539  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
06540  *
06541  *     "Koala".intern         #=> :Koala
06542  *     s = 'cat'.to_sym       #=> :cat
06543  *     s == :cat              #=> true
06544  *     s = '@cat'.to_sym      #=> :@cat
06545  *     s == :@cat             #=> true
06546  *
06547  *  This can also be used to create symbols that cannot be represented using the
06548  *  <code>:xxx</code> notation.
06549  *
06550  *     'cat and dog'.to_sym   #=> :"cat and dog"
06551  */
06552 
06553 VALUE
06554 rb_str_intern(VALUE s)
06555 {
06556     VALUE str = RB_GC_GUARD(s);
06557     ID id;
06558 
06559     id = rb_intern_str(str);
06560     return ID2SYM(id);
06561 }
06562 
06563 
06564 /*
06565  *  call-seq:
06566  *     str.ord   -> integer
06567  *
06568  *  Return the <code>Integer</code> ordinal of a one-character string.
06569  *
06570  *     "a".ord         #=> 97
06571  */
06572 
06573 VALUE
06574 rb_str_ord(VALUE s)
06575 {
06576     unsigned int c;
06577 
06578     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06579     return UINT2NUM(c);
06580 }
06581 /*
06582  *  call-seq:
06583  *     str.sum(n=16)   -> integer
06584  *
06585  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
06586  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
06587  *  to 16. The result is simply the sum of the binary value of each character in
06588  *  <i>str</i> modulo <code>2**n</code>. This is not a particularly good
06589  *  checksum.
06590  */
06591 
06592 static VALUE
06593 rb_str_sum(int argc, VALUE *argv, VALUE str)
06594 {
06595     VALUE vbits;
06596     int bits;
06597     char *ptr, *p, *pend;
06598     long len;
06599     VALUE sum = INT2FIX(0);
06600     unsigned long sum0 = 0;
06601 
06602     if (argc == 0) {
06603         bits = 16;
06604     }
06605     else {
06606         rb_scan_args(argc, argv, "01", &vbits);
06607         bits = NUM2INT(vbits);
06608     }
06609     ptr = p = RSTRING_PTR(str);
06610     len = RSTRING_LEN(str);
06611     pend = p + len;
06612 
06613     while (p < pend) {
06614         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06615             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06616             str_mod_check(str, ptr, len);
06617             sum0 = 0;
06618         }
06619         sum0 += (unsigned char)*p;
06620         p++;
06621     }
06622 
06623     if (bits == 0) {
06624         if (sum0) {
06625             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06626         }
06627     }
06628     else {
06629         if (sum == INT2FIX(0)) {
06630             if (bits < (int)sizeof(long)*CHAR_BIT) {
06631                 sum0 &= (((unsigned long)1)<<bits)-1;
06632             }
06633             sum = LONG2FIX(sum0);
06634         }
06635         else {
06636             VALUE mod;
06637 
06638             if (sum0) {
06639                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06640             }
06641 
06642             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06643             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06644             sum = rb_funcall(sum, '&', 1, mod);
06645         }
06646     }
06647     return sum;
06648 }
06649 
06650 static VALUE
06651 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06652 {
06653     rb_encoding *enc;
06654     VALUE w;
06655     long width, len, flen = 1, fclen = 1;
06656     VALUE res;
06657     char *p;
06658     const char *f = " ";
06659     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06660     volatile VALUE pad;
06661     int singlebyte = 1, cr;
06662 
06663     rb_scan_args(argc, argv, "11", &w, &pad);
06664     enc = STR_ENC_GET(str);
06665     width = NUM2LONG(w);
06666     if (argc == 2) {
06667         StringValue(pad);
06668         enc = rb_enc_check(str, pad);
06669         f = RSTRING_PTR(pad);
06670         flen = RSTRING_LEN(pad);
06671         fclen = str_strlen(pad, enc);
06672         singlebyte = single_byte_optimizable(pad);
06673         if (flen == 0 || fclen == 0) {
06674             rb_raise(rb_eArgError, "zero width padding");
06675         }
06676     }
06677     len = str_strlen(str, enc);
06678     if (width < 0 || len >= width) return rb_str_dup(str);
06679     n = width - len;
06680     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06681     rlen = n - llen;
06682     cr = ENC_CODERANGE(str);
06683     if (flen > 1) {
06684        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06685        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06686     }
06687     size = RSTRING_LEN(str);
06688     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06689        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06690        (len += llen2 + rlen2) >= LONG_MAX - size) {
06691        rb_raise(rb_eArgError, "argument too big");
06692     }
06693     len += size;
06694     res = rb_str_new5(str, 0, len);
06695     p = RSTRING_PTR(res);
06696     if (flen <= 1) {
06697        memset(p, *f, llen);
06698        p += llen;
06699     }
06700     else {
06701        while (llen >= fclen) {
06702             memcpy(p,f,flen);
06703             p += flen;
06704             llen -= fclen;
06705         }
06706        if (llen > 0) {
06707            memcpy(p, f, llen2);
06708            p += llen2;
06709         }
06710     }
06711     memcpy(p, RSTRING_PTR(str), size);
06712     p += size;
06713     if (flen <= 1) {
06714        memset(p, *f, rlen);
06715        p += rlen;
06716     }
06717     else {
06718        while (rlen >= fclen) {
06719             memcpy(p,f,flen);
06720             p += flen;
06721             rlen -= fclen;
06722         }
06723        if (rlen > 0) {
06724            memcpy(p, f, rlen2);
06725            p += rlen2;
06726         }
06727     }
06728     *p = '\0';
06729     STR_SET_LEN(res, p-RSTRING_PTR(res));
06730     OBJ_INFECT(res, str);
06731     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
06732     rb_enc_associate(res, enc);
06733     if (argc == 2)
06734         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
06735     if (cr != ENC_CODERANGE_BROKEN)
06736         ENC_CODERANGE_SET(res, cr);
06737     return res;
06738 }
06739 
06740 
06741 /*
06742  *  call-seq:
06743  *     str.ljust(integer, padstr=' ')   -> new_str
06744  *
06745  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
06746  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
06747  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
06748  *
06749  *     "hello".ljust(4)            #=> "hello"
06750  *     "hello".ljust(20)           #=> "hello               "
06751  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
06752  */
06753 
06754 static VALUE
06755 rb_str_ljust(int argc, VALUE *argv, VALUE str)
06756 {
06757     return rb_str_justify(argc, argv, str, 'l');
06758 }
06759 
06760 
06761 /*
06762  *  call-seq:
06763  *     str.rjust(integer, padstr=' ')   -> new_str
06764  *
06765  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
06766  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
06767  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
06768  *
06769  *     "hello".rjust(4)            #=> "hello"
06770  *     "hello".rjust(20)           #=> "               hello"
06771  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
06772  */
06773 
06774 static VALUE
06775 rb_str_rjust(int argc, VALUE *argv, VALUE str)
06776 {
06777     return rb_str_justify(argc, argv, str, 'r');
06778 }
06779 
06780 
06781 /*
06782  *  call-seq:
06783  *     str.center(integer, padstr=' ')   -> new_str
06784  *
06785  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
06786  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
06787  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
06788  *
06789  *     "hello".center(4)         #=> "hello"
06790  *     "hello".center(20)        #=> "       hello        "
06791  *     "hello".center(20, '123') #=> "1231231hello12312312"
06792  */
06793 
06794 static VALUE
06795 rb_str_center(int argc, VALUE *argv, VALUE str)
06796 {
06797     return rb_str_justify(argc, argv, str, 'c');
06798 }
06799 
06800 /*
06801  *  call-seq:
06802  *     str.partition(sep)              -> [head, sep, tail]
06803  *     str.partition(regexp)           -> [head, match, tail]
06804  *
06805  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
06806  *  and returns the part before it, the match, and the part
06807  *  after it.
06808  *  If it is not found, returns two empty strings and <i>str</i>.
06809  *
06810  *     "hello".partition("l")         #=> ["he", "l", "lo"]
06811  *     "hello".partition("x")         #=> ["hello", "", ""]
06812  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
06813  */
06814 
06815 static VALUE
06816 rb_str_partition(VALUE str, VALUE sep)
06817 {
06818     long pos;
06819     int regex = FALSE;
06820 
06821     if (TYPE(sep) == T_REGEXP) {
06822         pos = rb_reg_search(sep, str, 0, 0);
06823         regex = TRUE;
06824     }
06825     else {
06826         VALUE tmp;
06827 
06828         tmp = rb_check_string_type(sep);
06829         if (NIL_P(tmp)) {
06830             rb_raise(rb_eTypeError, "type mismatch: %s given",
06831                      rb_obj_classname(sep));
06832         }
06833         sep = tmp;
06834         pos = rb_str_index(str, sep, 0);
06835     }
06836     if (pos < 0) {
06837       failed:
06838         return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
06839     }
06840     if (regex) {
06841         sep = rb_str_subpat(str, sep, INT2FIX(0));
06842         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
06843     }
06844     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
06845                           sep,
06846                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
06847                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
06848 }
06849 
06850 /*
06851  *  call-seq:
06852  *     str.rpartition(sep)             -> [head, sep, tail]
06853  *     str.rpartition(regexp)          -> [head, match, tail]
06854  *
06855  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
06856  *  of the string, and returns the part before it, the match, and the part
06857  *  after it.
06858  *  If it is not found, returns two empty strings and <i>str</i>.
06859  *
06860  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
06861  *     "hello".rpartition("x")         #=> ["", "", "hello"]
06862  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
06863  */
06864 
06865 static VALUE
06866 rb_str_rpartition(VALUE str, VALUE sep)
06867 {
06868     long pos = RSTRING_LEN(str);
06869     int regex = FALSE;
06870 
06871     if (TYPE(sep) == T_REGEXP) {
06872         pos = rb_reg_search(sep, str, pos, 1);
06873         regex = TRUE;
06874     }
06875     else {
06876         VALUE tmp;
06877 
06878         tmp = rb_check_string_type(sep);
06879         if (NIL_P(tmp)) {
06880             rb_raise(rb_eTypeError, "type mismatch: %s given",
06881                      rb_obj_classname(sep));
06882         }
06883         sep = tmp;
06884         pos = rb_str_sublen(str, pos);
06885         pos = rb_str_rindex(str, sep, pos);
06886     }
06887     if (pos < 0) {
06888         return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
06889     }
06890     if (regex) {
06891         sep = rb_reg_nth_match(0, rb_backref_get());
06892     }
06893     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
06894                           sep,
06895                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
06896 }
06897 
06898 /*
06899  *  call-seq:
06900  *     str.start_with?([prefix]+)   -> true or false
06901  *
06902  *  Returns true if <i>str</i> starts with a prefix given.
06903  *
06904  *    p "hello".start_with?("hell")               #=> true
06905  *
06906  *    # returns true if one of prefix matches.
06907  *    p "hello".start_with?("heaven", "hell")     #=> true
06908  *    p "hello".start_with?("heaven", "paradice") #=> false
06909  *
06910  *
06911  *
06912  */
06913 
06914 static VALUE
06915 rb_str_start_with(int argc, VALUE *argv, VALUE str)
06916 {
06917     int i;
06918 
06919     for (i=0; i<argc; i++) {
06920         VALUE tmp = rb_check_string_type(argv[i]);
06921         if (NIL_P(tmp)) continue;
06922         rb_enc_check(str, tmp);
06923         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06924         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06925             return Qtrue;
06926     }
06927     return Qfalse;
06928 }
06929 
06930 /*
06931  *  call-seq:
06932  *     str.end_with?([suffix]+)   -> true or false
06933  *
06934  *  Returns true if <i>str</i> ends with a suffix given.
06935  */
06936 
06937 static VALUE
06938 rb_str_end_with(int argc, VALUE *argv, VALUE str)
06939 {
06940     int i;
06941     char *p, *s, *e;
06942     rb_encoding *enc;
06943 
06944     for (i=0; i<argc; i++) {
06945         VALUE tmp = rb_check_string_type(argv[i]);
06946         if (NIL_P(tmp)) continue;
06947         enc = rb_enc_check(str, tmp);
06948         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06949         p = RSTRING_PTR(str);
06950         e = p + RSTRING_LEN(str);
06951         s = e - RSTRING_LEN(tmp);
06952         if (rb_enc_left_char_head(p, s, e, enc) != s)
06953             continue;
06954         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06955             return Qtrue;
06956     }
06957     return Qfalse;
06958 }
06959 
06960 void
06961 rb_str_setter(VALUE val, ID id, VALUE *var)
06962 {
06963     if (!NIL_P(val) && TYPE(val) != T_STRING) {
06964         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
06965     }
06966     *var = val;
06967 }
06968 
06969 
06970 /*
06971  *  call-seq:
06972  *     str.force_encoding(encoding)   -> str
06973  *
06974  *  Changes the encoding to +encoding+ and returns self.
06975  */
06976 
06977 static VALUE
06978 rb_str_force_encoding(VALUE str, VALUE enc)
06979 {
06980     str_modifiable(str);
06981     rb_enc_associate(str, rb_to_encoding(enc));
06982     ENC_CODERANGE_CLEAR(str);
06983     return str;
06984 }
06985 
06986 /*
06987  *  call-seq:
06988  *     str.valid_encoding?  -> true or false
06989  *
06990  *  Returns true for a string which encoded correctly.
06991  *
06992  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
06993  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
06994  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
06995  */
06996 
06997 static VALUE
06998 rb_str_valid_encoding_p(VALUE str)
06999 {
07000     int cr = rb_enc_str_coderange(str);
07001 
07002     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07003 }
07004 
07005 /*
07006  *  call-seq:
07007  *     str.ascii_only?  -> true or false
07008  *
07009  *  Returns true for a string which has only ASCII characters.
07010  *
07011  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
07012  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
07013  */
07014 
07015 static VALUE
07016 rb_str_is_ascii_only_p(VALUE str)
07017 {
07018     int cr = rb_enc_str_coderange(str);
07019 
07020     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07021 }
07022 
07023 /**********************************************************************
07024  * Document-class: Symbol
07025  *
07026  *  <code>Symbol</code> objects represent names and some strings
07027  *  inside the Ruby
07028  *  interpreter. They are generated using the <code>:name</code> and
07029  *  <code>:"string"</code> literals
07030  *  syntax, and by the various <code>to_sym</code> methods. The same
07031  *  <code>Symbol</code> object will be created for a given name or string
07032  *  for the duration of a program's execution, regardless of the context
07033  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
07034  *  one context, a method in another, and a class in a third, the
07035  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
07036  *  all three contexts.
07037  *
07038  *     module One
07039  *       class Fred
07040  *       end
07041  *       $f1 = :Fred
07042  *     end
07043  *     module Two
07044  *       Fred = 1
07045  *       $f2 = :Fred
07046  *     end
07047  *     def Fred()
07048  *     end
07049  *     $f3 = :Fred
07050  *     $f1.object_id   #=> 2514190
07051  *     $f2.object_id   #=> 2514190
07052  *     $f3.object_id   #=> 2514190
07053  *
07054  */
07055 
07056 
07057 /*
07058  *  call-seq:
07059  *     sym == obj   -> true or false
07060  *
07061  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
07062  *  symbol, returns <code>true</code>.
07063  */
07064 
07065 static VALUE
07066 sym_equal(VALUE sym1, VALUE sym2)
07067 {
07068     if (sym1 == sym2) return Qtrue;
07069     return Qfalse;
07070 }
07071 
07072 
07073 static int
07074 sym_printable(const char *s, const char *send, rb_encoding *enc)
07075 {
07076     while (s < send) {
07077         int n;
07078         int c = rb_enc_codepoint_len(s, send, &n, enc);
07079 
07080         if (!rb_enc_isprint(c, enc)) return FALSE;
07081         s += n;
07082     }
07083     return TRUE;
07084 }
07085 
07086 /*
07087  *  call-seq:
07088  *     sym.inspect    -> string
07089  *
07090  *  Returns the representation of <i>sym</i> as a symbol literal.
07091  *
07092  *     :fred.inspect   #=> ":fred"
07093  */
07094 
07095 static VALUE
07096 sym_inspect(VALUE sym)
07097 {
07098     VALUE str;
07099     ID id = SYM2ID(sym);
07100     rb_encoding *enc;
07101     const char *ptr;
07102     long len;
07103     char *dest;
07104     rb_encoding *resenc = rb_default_internal_encoding();
07105 
07106     if (resenc == NULL) resenc = rb_default_external_encoding();
07107     sym = rb_id2str(id);
07108     enc = STR_ENC_GET(sym);
07109     ptr = RSTRING_PTR(sym);
07110     len = RSTRING_LEN(sym);
07111     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07112         !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07113         str = rb_str_inspect(sym);
07114         len = RSTRING_LEN(str);
07115         rb_str_resize(str, len + 1);
07116         dest = RSTRING_PTR(str);
07117         memmove(dest + 1, dest, len);
07118         dest[0] = ':';
07119     }
07120     else {
07121         char *dest;
07122         str = rb_enc_str_new(0, len + 1, enc);
07123         dest = RSTRING_PTR(str);
07124         dest[0] = ':';
07125         memcpy(dest + 1, ptr, len);
07126     }
07127     return str;
07128 }
07129 
07130 
07131 /*
07132  *  call-seq:
07133  *     sym.id2name   -> string
07134  *     sym.to_s      -> string
07135  *
07136  *  Returns the name or string corresponding to <i>sym</i>.
07137  *
07138  *     :fred.id2name   #=> "fred"
07139  */
07140 
07141 
07142 VALUE
07143 rb_sym_to_s(VALUE sym)
07144 {
07145     ID id = SYM2ID(sym);
07146 
07147     return str_new3(rb_cString, rb_id2str(id));
07148 }
07149 
07150 
07151 /*
07152  * call-seq:
07153  *   sym.to_sym   -> sym
07154  *   sym.intern   -> sym
07155  *
07156  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
07157  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
07158  * in this case.
07159  */
07160 
07161 static VALUE
07162 sym_to_sym(VALUE sym)
07163 {
07164     return sym;
07165 }
07166 
07167 static VALUE
07168 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
07169 {
07170     VALUE obj;
07171 
07172     if (argc < 1) {
07173         rb_raise(rb_eArgError, "no receiver given");
07174     }
07175     obj = argv[0];
07176     return rb_funcall3(obj, (ID)sym, argc - 1, argv + 1);
07177 }
07178 
07179 /*
07180  * call-seq:
07181  *   sym.to_proc
07182  *
07183  * Returns a _Proc_ object which respond to the given method by _sym_.
07184  *
07185  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
07186  */
07187 
07188 static VALUE
07189 sym_to_proc(VALUE sym)
07190 {
07191     static VALUE sym_proc_cache = Qfalse;
07192     enum {SYM_PROC_CACHE_SIZE = 67};
07193     VALUE proc;
07194     long id, index;
07195     VALUE *aryp;
07196 
07197     if (!sym_proc_cache) {
07198         sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07199         rb_gc_register_mark_object(sym_proc_cache);
07200         rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07201     }
07202 
07203     id = SYM2ID(sym);
07204     index = (id % SYM_PROC_CACHE_SIZE) << 1;
07205 
07206     aryp = RARRAY_PTR(sym_proc_cache);
07207     if (aryp[index] == sym) {
07208         return aryp[index + 1];
07209     }
07210     else {
07211         proc = rb_proc_new(sym_call, (VALUE)id);
07212         aryp[index] = sym;
07213         aryp[index + 1] = proc;
07214         return proc;
07215     }
07216 }
07217 
07218 /*
07219  * call-seq:
07220  *
07221  *   sym.succ
07222  *
07223  * Same as <code>sym.to_s.succ.intern</code>.
07224  */
07225 
07226 static VALUE
07227 sym_succ(VALUE sym)
07228 {
07229     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07230 }
07231 
07232 /*
07233  * call-seq:
07234  *
07235  *   str <=> other       -> -1, 0, +1 or nil
07236  *
07237  * Compares _sym_ with _other_ in string form.
07238  */
07239 
07240 static VALUE
07241 sym_cmp(VALUE sym, VALUE other)
07242 {
07243     if (!SYMBOL_P(other)) {
07244         return Qnil;
07245     }
07246     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07247 }
07248 
07249 /*
07250  * call-seq:
07251  *
07252  *   sym.casecmp(other)  -> -1, 0, +1 or nil
07253  *
07254  * Case-insensitive version of <code>Symbol#<=></code>.
07255  */
07256 
07257 static VALUE
07258 sym_casecmp(VALUE sym, VALUE other)
07259 {
07260     if (!SYMBOL_P(other)) {
07261         return Qnil;
07262     }
07263     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07264 }
07265 
07266 /*
07267  * call-seq:
07268  *   sym =~ obj   -> fixnum or nil
07269  *
07270  * Returns <code>sym.to_s =~ obj</code>.
07271  */
07272 
07273 static VALUE
07274 sym_match(VALUE sym, VALUE other)
07275 {
07276     return rb_str_match(rb_sym_to_s(sym), other);
07277 }
07278 
07279 /*
07280  * call-seq:
07281  *   sym[idx]      -> char
07282  *   sym[b, n]     -> char
07283  *
07284  * Returns <code>sym.to_s[]</code>.
07285  */
07286 
07287 static VALUE
07288 sym_aref(int argc, VALUE *argv, VALUE sym)
07289 {
07290     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07291 }
07292 
07293 /*
07294  * call-seq:
07295  *   sym.length    -> integer
07296  *
07297  * Same as <code>sym.to_s.length</code>.
07298  */
07299 
07300 static VALUE
07301 sym_length(VALUE sym)
07302 {
07303     return rb_str_length(rb_id2str(SYM2ID(sym)));
07304 }
07305 
07306 /*
07307  * call-seq:
07308  *   sym.empty?   -> true or false
07309  *
07310  * Returns that _sym_ is :"" or not.
07311  */
07312 
07313 static VALUE
07314 sym_empty(VALUE sym)
07315 {
07316     return rb_str_empty(rb_id2str(SYM2ID(sym)));
07317 }
07318 
07319 /*
07320  * call-seq:
07321  *   sym.upcase    -> symbol
07322  *
07323  * Same as <code>sym.to_s.upcase.intern</code>.
07324  */
07325 
07326 static VALUE
07327 sym_upcase(VALUE sym)
07328 {
07329     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07330 }
07331 
07332 /*
07333  * call-seq:
07334  *   sym.downcase  -> symbol
07335  *
07336  * Same as <code>sym.to_s.downcase.intern</code>.
07337  */
07338 
07339 static VALUE
07340 sym_downcase(VALUE sym)
07341 {
07342     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07343 }
07344 
07345 /*
07346  * call-seq:
07347  *   sym.capitalize  -> symbol
07348  *
07349  * Same as <code>sym.to_s.capitalize.intern</code>.
07350  */
07351 
07352 static VALUE
07353 sym_capitalize(VALUE sym)
07354 {
07355     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07356 }
07357 
07358 /*
07359  * call-seq:
07360  *   sym.swapcase  -> symbol
07361  *
07362  * Same as <code>sym.to_s.swapcase.intern</code>.
07363  */
07364 
07365 static VALUE
07366 sym_swapcase(VALUE sym)
07367 {
07368     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07369 }
07370 
07371 /*
07372  * call-seq:
07373  *   sym.encoding   -> encoding
07374  *
07375  * Returns the Encoding object that represents the encoding of _sym_.
07376  */
07377 
07378 static VALUE
07379 sym_encoding(VALUE sym)
07380 {
07381     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07382 }
07383 
07384 ID
07385 rb_to_id(VALUE name)
07386 {
07387     VALUE tmp;
07388     ID id;
07389 
07390     switch (TYPE(name)) {
07391       default:
07392         tmp = rb_check_string_type(name);
07393         if (NIL_P(tmp)) {
07394             tmp = rb_inspect(name);
07395             rb_raise(rb_eTypeError, "%s is not a symbol",
07396                      RSTRING_PTR(tmp));
07397         }
07398         name = tmp;
07399         /* fall through */
07400       case T_STRING:
07401         name = rb_str_intern(name);
07402         /* fall through */
07403       case T_SYMBOL:
07404         return SYM2ID(name);
07405     }
07406     return id;
07407 }
07408 
07409 /*
07410  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
07411  *  bytes, typically representing characters. String objects may be created
07412  *  using <code>String::new</code> or as literals.
07413  *
07414  *  Because of aliasing issues, users of strings should be aware of the methods
07415  *  that modify the contents of a <code>String</code> object.  Typically,
07416  *  methods with names ending in ``!'' modify their receiver, while those
07417  *  without a ``!'' return a new <code>String</code>.  However, there are
07418  *  exceptions, such as <code>String#[]=</code>.
07419  *
07420  */
07421 
07422 void
07423 Init_String(void)
07424 {   /* Defines the String and Symbol classes and registers their methods and related globals. */
07425 #undef rb_intern
07426 #define rb_intern(str) rb_intern_const(str) /* from here on rb_intern(...) expands to rb_intern_const(...) */
07427     /* String: subclass of Object that includes Comparable and is allocated by str_alloc */
07428     rb_cString  = rb_define_class("String", rb_cObject);
07429     rb_include_module(rb_cString, rb_mComparable);
07430     rb_define_alloc_func(rb_cString, str_alloc);
07431     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07432     rb_define_method(rb_cString, "initialize", rb_str_init, -1); /* -1: presumably the (argc, argv, self) C signature, as rb_str_ljust in this file has */
07433     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1); /* same function as "replace" */
07434     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07435     rb_define_method(rb_cString, "==", rb_str_equal, 1);
07436     rb_define_method(rb_cString, "===", rb_str_equal, 1); /* same function as "==" */
07437     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07438     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07439     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07440     rb_define_method(rb_cString, "+", rb_str_plus, 1);
07441     rb_define_method(rb_cString, "*", rb_str_times, 1);
07442     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07443     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07444     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07445     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07446     rb_define_method(rb_cString, "length", rb_str_length, 0);
07447     rb_define_method(rb_cString, "size", rb_str_length, 0); /* same function as "length" */
07448     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07449     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07450     rb_define_method(rb_cString, "=~", rb_str_match, 1);
07451     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07452     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07453     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07454     rb_define_method(rb_cString, "next", rb_str_succ, 0); /* same function as "succ" */
07455     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0); /* same function as "succ!" */
07456     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07457     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07458     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07459     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07460     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07461     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07462     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07463     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07464     /* conversions and inspection */
07465     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07466     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07467     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07468     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0); /* same function as "to_s" */
07469     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07470     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07471     /* case conversion */
07472     rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07473     rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07474     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07475     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07476     /* bang (!) counterparts of the case conversions */
07477     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07478     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07479     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07480     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07481 
07482     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07483     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07484     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07485     rb_define_method(rb_cString, "lines", rb_str_each_line, -1); /* same function as "each_line" */
07486     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0); /* same function as "each_byte" */
07487     rb_define_method(rb_cString, "chars", rb_str_each_char, 0); /* same function as "each_char" */
07488     rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0); /* same function as "each_codepoint" */
07489     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07490     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07491     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07492     rb_define_method(rb_cString, "<<", rb_str_concat, 1); /* same function as "concat" */
07493     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07494     rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07495     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* same function as "intern" */
07496     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07497     /* substring, prefix and suffix predicates */
07498     rb_define_method(rb_cString, "include?", rb_str_include, 1);
07499     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07500     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07501 
07502     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07503     /* justification: the three functions call rb_str_justify with 'l', 'r' and 'c' */
07504     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07505     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07506     rb_define_method(rb_cString, "center", rb_str_center, -1);
07507     /* substitution and trimming */
07508     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07509     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07510     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07511     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07512     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07513     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07514     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07515     /* bang (!) counterparts of substitution and trimming */
07516     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07517     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07518     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07519     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07520     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07521     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07522     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07523     /* character-set operations */
07524     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07525     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07526     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07527     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07528     rb_define_method(rb_cString, "count", rb_str_count, -1);
07529     /* bang (!) counterparts of the character-set operations */
07530     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07531     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07532     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07533     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07534     /* iteration */
07535     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07536     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07537     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07538     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07539 
07540     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07541 
07542     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1); /* same function as "[]" */
07543     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07544 
07545     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07546     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07547     /* encoding queries and changes */
07548     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
07549     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07550     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07551     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07552     /* keep the ID of "to_s" in id_to_s */
07553     id_to_s = rb_intern("to_s");
07554     /* rb_fs starts out as nil and is exposed under both $; and $-F */
07555     rb_fs = Qnil;
07556     rb_define_variable("$;", &rb_fs);
07557     rb_define_variable("$-F", &rb_fs);
07558     /* Symbol: subclass of Object that includes Comparable; its allocator and Symbol.new are undefined */
07559     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07560     rb_include_module(rb_cSymbol, rb_mComparable);
07561     rb_undef_alloc_func(rb_cSymbol);
07562     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07563     rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
07564 
07565     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07566     rb_define_method(rb_cSymbol, "===", sym_equal, 1); /* same function as "==" */
07567     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07568     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07569     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0); /* same function as "to_s" */
07570     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07571     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0); /* same function as "intern" */
07572     rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07573     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07574     rb_define_method(rb_cSymbol, "next", sym_succ, 0); /* same function as "succ" */
07575     /* comparison and matching */
07576     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07577     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07578     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07579     /* String-like accessors */
07580     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07581     rb_define_method(rb_cSymbol, "slice", sym_aref, -1); /* same function as "[]" */
07582     rb_define_method(rb_cSymbol, "length", sym_length, 0);
07583     rb_define_method(rb_cSymbol, "size", sym_length, 0); /* same function as "length" */
07584     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07585     rb_define_method(rb_cSymbol, "match", sym_match, 1); /* same function as "=~" */
07586     /* case conversion */
07587     rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07588     rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07589     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07590     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07591 
07592     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07593 }
07594 

Generated on Wed Sep 8 2010 09:56:16 for Ruby by  doxygen 1.7.1