00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include <assert.h>
00018
00019 #define BEG(no) regs->beg[no]
00020 #define END(no) regs->end[no]
00021
00022 #include <math.h>
00023 #include <ctype.h>
00024
00025 #ifdef HAVE_UNISTD_H
00026 #include <unistd.h>
00027 #endif
00028
00029 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00030
00031 #undef rb_str_new_cstr
00032 #undef rb_tainted_str_new_cstr
00033 #undef rb_usascii_str_new_cstr
00034 #undef rb_external_str_new_cstr
00035 #undef rb_locale_str_new_cstr
00036 #undef rb_str_new2
00037 #undef rb_str_new3
00038 #undef rb_str_new4
00039 #undef rb_str_new5
00040 #undef rb_tainted_str_new2
00041 #undef rb_usascii_str_new2
00042 #undef rb_str_dup_frozen
00043 #undef rb_str_buf_new_cstr
00044 #undef rb_str_buf_new2
00045 #undef rb_str_buf_cat2
00046 #undef rb_str_cat2
00047
00048 VALUE rb_cString;
00049 VALUE rb_cSymbol;
00050
00051 #define RUBY_MAX_CHAR_LEN 16
00052 #define STR_TMPLOCK FL_USER7
00053 #define STR_NOEMBED FL_USER1
00054 #define STR_SHARED FL_USER2
00055 #define STR_ASSOC FL_USER3
00056 #define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
00057 #define STR_ASSOC_P(s) FL_ALL(s, STR_NOEMBED|STR_ASSOC)
00058 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
00059 #define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
/* Clear both ELTS_SHARED and STR_ASSOC on s, but only when s carries the
 * STR_NOEMBED flag; otherwise s is left untouched. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST(s, STR_NOEMBED)) {\
        FL_UNSET(s, (ELTS_SHARED|STR_ASSOC));\
    }\
} while (0)
00063
00064
/* Flag str as STR_NOEMBED and reset the embedded-length bits of its flags
 * word to zero. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)
00069 #define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
00070 #define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
/* Store n in the embedded-length bit field of str's flags word.
 * n is evaluated exactly once; str is evaluated twice. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long str_embed_len_ = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= str_embed_len_ << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
00076
/* Set the length of str to n: heap strings store it in as.heap.len,
 * embedded strings store it in the flags word via STR_SET_EMBED_LEN. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN(str, n);\
    }\
} while (0)
00085
/* Decrease the length of str by one, using whichever representation
 * (heap length field or embedded-length flag bits) the string currently has. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        STR_SET_EMBED_LEN(str, RSTRING_LEN(str) - 1);\
    }\
} while (0)
00096
/* Resize the storage of str so it can hold `capacity` bytes plus a
 * terminating byte.
 *
 * - Embedded string: nothing happens unless capacity exceeds
 *   RSTRING_EMBED_LEN_MAX; in that case a heap buffer is allocated, the
 *   current contents are copied into it and str is switched to the
 *   non-embedded representation with aux.capa = capacity.
 * - Heap string: the buffer is reallocated; aux.capa is only updated when
 *   the string is not STR_NOCAPA_P (i.e. aux is not used for something else).
 *
 * Every use of `capacity` is parenthesized so that an argument containing
 * low-precedence operators (e.g. a conditional expression) is sized
 * correctly. Note that `capacity` and `str` are still evaluated more than
 * once, so arguments must be free of side effects. */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00114
00115 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00116 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00117
00118 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00119
00120 static inline int
00121 single_byte_optimizable(VALUE str)
00122 {
00123 rb_encoding *enc;
00124
00125
00126 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00127 return 1;
00128
00129 enc = STR_ENC_GET(str);
00130 if (rb_enc_mbmaxlen(enc) == 1)
00131 return 1;
00132
00133
00134
00135 return 0;
00136 }
00137
00138 VALUE rb_fs;
00139
/* Return a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when there is none.
 *
 * When SIZEOF_VALUE is 4 or 8 and the range is longer than two VALUEs,
 * the VALUE-aligned middle part is tested one whole word at a time
 * against NONASCII_MASK; the word loop only locates the first suspicious
 * word, the exact byte is then found by the trailing byte loop. */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first aligned word at or after p, and last aligned boundary at or before e */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE*)(~align_mask & (VALUE)e);

        /* unaligned head, byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* aligned body: stop at the first word with a high bit set
         * (the length guard above guarantees word <= word_end here) */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        p = (const char *)word;
    }
#endif
    /* remaining bytes (tail, or the whole range when no word scan was done) */
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00176
00177 static int
00178 coderange_scan(const char *p, long len, rb_encoding *enc)
00179 {
00180 const char *e = p + len;
00181
00182 if (rb_enc_to_index(enc) == 0) {
00183
00184 p = search_nonascii(p, e);
00185 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00186 }
00187
00188 if (rb_enc_asciicompat(enc)) {
00189 p = search_nonascii(p, e);
00190 if (!p) {
00191 return ENC_CODERANGE_7BIT;
00192 }
00193 while (p < e) {
00194 int ret = rb_enc_precise_mbclen(p, e, enc);
00195 if (!MBCLEN_CHARFOUND_P(ret)) {
00196 return ENC_CODERANGE_BROKEN;
00197 }
00198 p += MBCLEN_CHARFOUND_LEN(ret);
00199 if (p < e) {
00200 p = search_nonascii(p, e);
00201 if (!p) {
00202 return ENC_CODERANGE_VALID;
00203 }
00204 }
00205 }
00206 if (e < p) {
00207 return ENC_CODERANGE_BROKEN;
00208 }
00209 return ENC_CODERANGE_VALID;
00210 }
00211
00212 while (p < e) {
00213 int ret = rb_enc_precise_mbclen(p, e, enc);
00214
00215 if (!MBCLEN_CHARFOUND_P(ret)) {
00216 return ENC_CODERANGE_BROKEN;
00217 }
00218 p += MBCLEN_CHARFOUND_LEN(ret);
00219 }
00220 if (e < p) {
00221 return ENC_CODERANGE_BROKEN;
00222 }
00223 return ENC_CODERANGE_VALID;
00224 }
00225
00226 long
00227 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00228 {
00229 const char *p = s;
00230
00231 if (*cr == ENC_CODERANGE_BROKEN)
00232 return e - s;
00233
00234 if (rb_enc_to_index(enc) == 0) {
00235
00236 p = search_nonascii(p, e);
00237 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00238 return e - s;
00239 }
00240 else if (rb_enc_asciicompat(enc)) {
00241 p = search_nonascii(p, e);
00242 if (!p) {
00243 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00244 return e - s;
00245 }
00246 while (p < e) {
00247 int ret = rb_enc_precise_mbclen(p, e, enc);
00248 if (!MBCLEN_CHARFOUND_P(ret)) {
00249 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00250 return p - s;
00251 }
00252 p += MBCLEN_CHARFOUND_LEN(ret);
00253 if (p < e) {
00254 p = search_nonascii(p, e);
00255 if (!p) {
00256 *cr = ENC_CODERANGE_VALID;
00257 return e - s;
00258 }
00259 }
00260 }
00261 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00262 return p - s;
00263 }
00264 else {
00265 while (p < e) {
00266 int ret = rb_enc_precise_mbclen(p, e, enc);
00267 if (!MBCLEN_CHARFOUND_P(ret)) {
00268 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00269 return p - s;
00270 }
00271 p += MBCLEN_CHARFOUND_LEN(ret);
00272 }
00273 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00274 return p - s;
00275 }
00276 }
00277
00278 static inline void
00279 str_enc_copy(VALUE str1, VALUE str2)
00280 {
00281 rb_enc_set_index(str1, ENCODING_GET(str2));
00282 }
00283
00284 static void
00285 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00286 {
00287
00288
00289
00290 str_enc_copy(dest, src);
00291 switch (ENC_CODERANGE(src)) {
00292 case ENC_CODERANGE_7BIT:
00293 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00294 break;
00295 case ENC_CODERANGE_VALID:
00296 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00297 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00298 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00299 else
00300 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00301 break;
00302 default:
00303 if (RSTRING_LEN(dest) == 0) {
00304 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00305 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00306 else
00307 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00308 }
00309 break;
00310 }
00311 }
00312
00313 static void
00314 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00315 {
00316 str_enc_copy(dest, src);
00317 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00318 }
00319
00320 int
00321 rb_enc_str_coderange(VALUE str)
00322 {
00323 int cr = ENC_CODERANGE(str);
00324
00325 if (cr == ENC_CODERANGE_UNKNOWN) {
00326 rb_encoding *enc = STR_ENC_GET(str);
00327 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00328 ENC_CODERANGE_SET(str, cr);
00329 }
00330 return cr;
00331 }
00332
00333 int
00334 rb_enc_str_asciionly_p(VALUE str)
00335 {
00336 rb_encoding *enc = STR_ENC_GET(str);
00337
00338 if (!rb_enc_asciicompat(enc))
00339 return FALSE;
00340 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00341 return TRUE;
00342 return FALSE;
00343 }
00344
00345 static inline void
00346 str_mod_check(VALUE s, const char *p, long len)
00347 {
00348 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00349 rb_raise(rb_eRuntimeError, "string modified");
00350 }
00351 }
00352
00353 static inline void
00354 str_frozen_check(VALUE s)
00355 {
00356 if (OBJ_FROZEN(s)) {
00357 rb_raise(rb_eRuntimeError, "string frozen");
00358 }
00359 }
00360
00361 size_t
00362 rb_str_capacity(VALUE str)
00363 {
00364 if (STR_EMBED_P(str)) {
00365 return RSTRING_EMBED_LEN_MAX;
00366 }
00367 else if (STR_NOCAPA_P(str)) {
00368 return RSTRING(str)->as.heap.len;
00369 }
00370 else {
00371 return RSTRING(str)->as.heap.aux.capa;
00372 }
00373 }
00374
00375 static inline VALUE
00376 str_alloc(VALUE klass)
00377 {
00378 NEWOBJ(str, struct RString);
00379 OBJSETUP(str, klass, T_STRING);
00380
00381 str->as.heap.ptr = 0;
00382 str->as.heap.len = 0;
00383 str->as.heap.aux.capa = 0;
00384
00385 return (VALUE)str;
00386 }
00387
00388 static VALUE
00389 str_new(VALUE klass, const char *ptr, long len)
00390 {
00391 VALUE str;
00392
00393 if (len < 0) {
00394 rb_raise(rb_eArgError, "negative string size (or size too big)");
00395 }
00396
00397 str = str_alloc(klass);
00398 if (len > RSTRING_EMBED_LEN_MAX) {
00399 RSTRING(str)->as.heap.aux.capa = len;
00400 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00401 STR_SET_NOEMBED(str);
00402 }
00403 else if (len == 0) {
00404 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00405 }
00406 if (ptr) {
00407 memcpy(RSTRING_PTR(str), ptr, len);
00408 }
00409 STR_SET_LEN(str, len);
00410 RSTRING_PTR(str)[len] = '\0';
00411 return str;
00412 }
00413
00414 VALUE
00415 rb_str_new(const char *ptr, long len)
00416 {
00417 return str_new(rb_cString, ptr, len);
00418 }
00419
00420 VALUE
00421 rb_usascii_str_new(const char *ptr, long len)
00422 {
00423 VALUE str = rb_str_new(ptr, len);
00424 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00425 return str;
00426 }
00427
00428 VALUE
00429 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00430 {
00431 VALUE str = rb_str_new(ptr, len);
00432 rb_enc_associate(str, enc);
00433 return str;
00434 }
00435
00436 VALUE
00437 rb_str_new_cstr(const char *ptr)
00438 {
00439 if (!ptr) {
00440 rb_raise(rb_eArgError, "NULL pointer given");
00441 }
00442 return rb_str_new(ptr, strlen(ptr));
00443 }
00444
00445 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00446 #define rb_str_new2 rb_str_new_cstr
00447
00448 VALUE
00449 rb_usascii_str_new_cstr(const char *ptr)
00450 {
00451 VALUE str = rb_str_new2(ptr);
00452 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00453 return str;
00454 }
00455
00456 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00457 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00458
00459 VALUE
00460 rb_tainted_str_new(const char *ptr, long len)
00461 {
00462 VALUE str = rb_str_new(ptr, len);
00463
00464 OBJ_TAINT(str);
00465 return str;
00466 }
00467
00468 VALUE
00469 rb_tainted_str_new_cstr(const char *ptr)
00470 {
00471 VALUE str = rb_str_new2(ptr);
00472
00473 OBJ_TAINT(str);
00474 return str;
00475 }
00476
00477 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00478 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00479
00480 VALUE
00481 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00482 {
00483 rb_econv_t *ec;
00484 rb_econv_result_t ret;
00485 long len;
00486 VALUE newstr;
00487 const unsigned char *sp;
00488 unsigned char *dp;
00489
00490 if (!to) return str;
00491 if (from == to) return str;
00492 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00493 to == rb_ascii8bit_encoding()) {
00494 if (STR_ENC_GET(str) != to) {
00495 str = rb_str_dup(str);
00496 rb_enc_associate(str, to);
00497 }
00498 return str;
00499 }
00500
00501 len = RSTRING_LEN(str);
00502 newstr = rb_str_new(0, len);
00503
00504 retry:
00505 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00506 if (!ec) return str;
00507
00508 sp = (unsigned char*)RSTRING_PTR(str);
00509 dp = (unsigned char*)RSTRING_PTR(newstr);
00510 ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00511 &dp, (unsigned char*)RSTRING_END(newstr), 0);
00512 rb_econv_close(ec);
00513 switch (ret) {
00514 case econv_destination_buffer_full:
00515
00516 len = len < 2 ? 2 : len * 2;
00517 rb_str_resize(newstr, len);
00518 goto retry;
00519
00520 case econv_finished:
00521 len = dp - (unsigned char*)RSTRING_PTR(newstr);
00522 rb_str_set_len(newstr, len);
00523 rb_enc_associate(newstr, to);
00524 return newstr;
00525
00526 default:
00527
00528 return str;
00529 }
00530 }
00531
00532 VALUE
00533 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00534 {
00535 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00536 }
00537
00538 VALUE
00539 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00540 {
00541 VALUE str;
00542
00543 str = rb_tainted_str_new(ptr, len);
00544 if (eenc == rb_usascii_encoding() &&
00545 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00546 rb_enc_associate(str, rb_ascii8bit_encoding());
00547 return str;
00548 }
00549 rb_enc_associate(str, eenc);
00550 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00551 }
00552
00553 VALUE
00554 rb_external_str_new(const char *ptr, long len)
00555 {
00556 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00557 }
00558
00559 VALUE
00560 rb_external_str_new_cstr(const char *ptr)
00561 {
00562 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00563 }
00564
00565 VALUE
00566 rb_locale_str_new(const char *ptr, long len)
00567 {
00568 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00569 }
00570
00571 VALUE
00572 rb_locale_str_new_cstr(const char *ptr)
00573 {
00574 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00575 }
00576
00577 VALUE
00578 rb_filesystem_str_new(const char *ptr, long len)
00579 {
00580 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00581 }
00582
00583 VALUE
00584 rb_filesystem_str_new_cstr(const char *ptr)
00585 {
00586 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00587 }
00588
00589 VALUE
00590 rb_str_export(VALUE str)
00591 {
00592 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00593 }
00594
00595 VALUE
00596 rb_str_export_locale(VALUE str)
00597 {
00598 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00599 }
00600
00601 VALUE
00602 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00603 {
00604 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00605 }
00606
00607 static VALUE
00608 str_replace_shared(VALUE str2, VALUE str)
00609 {
00610 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00611 STR_SET_EMBED(str2);
00612 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00613 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00614 }
00615 else {
00616 str = rb_str_new_frozen(str);
00617 FL_SET(str2, STR_NOEMBED);
00618 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00619 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00620 RSTRING(str2)->as.heap.aux.shared = str;
00621 FL_SET(str2, ELTS_SHARED);
00622 }
00623 rb_enc_cr_str_exact_copy(str2, str);
00624
00625 return str2;
00626 }
00627
00628 static VALUE
00629 str_new_shared(VALUE klass, VALUE str)
00630 {
00631 return str_replace_shared(str_alloc(klass), str);
00632 }
00633
00634 static VALUE
00635 str_new3(VALUE klass, VALUE str)
00636 {
00637 return str_new_shared(klass, str);
00638 }
00639
00640 VALUE
00641 rb_str_new_shared(VALUE str)
00642 {
00643 VALUE str2 = str_new3(rb_obj_class(str), str);
00644
00645 OBJ_INFECT(str2, str);
00646 return str2;
00647 }
00648
00649 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00650 #define rb_str_new3 rb_str_new_shared
00651
00652 static VALUE
00653 str_new4(VALUE klass, VALUE str)
00654 {
00655 VALUE str2;
00656
00657 str2 = str_alloc(klass);
00658 STR_SET_NOEMBED(str2);
00659 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00660 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00661 if (STR_SHARED_P(str)) {
00662 VALUE shared = RSTRING(str)->as.heap.aux.shared;
00663 assert(OBJ_FROZEN(shared));
00664 FL_SET(str2, ELTS_SHARED);
00665 RSTRING(str2)->as.heap.aux.shared = shared;
00666 }
00667 else {
00668 FL_SET(str, ELTS_SHARED);
00669 RSTRING(str)->as.heap.aux.shared = str2;
00670 }
00671 rb_enc_cr_str_exact_copy(str2, str);
00672 OBJ_INFECT(str2, str);
00673 return str2;
00674 }
00675
00676 VALUE
00677 rb_str_new_frozen(VALUE orig)
00678 {
00679 VALUE klass, str;
00680
00681 if (OBJ_FROZEN(orig)) return orig;
00682 klass = rb_obj_class(orig);
00683 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00684 long ofs;
00685 assert(OBJ_FROZEN(str));
00686 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00687 if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00688 (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00689 ENCODING_GET(str) != ENCODING_GET(orig)) {
00690 str = str_new3(klass, str);
00691 RSTRING(str)->as.heap.ptr += ofs;
00692 RSTRING(str)->as.heap.len -= ofs;
00693 rb_enc_cr_str_exact_copy(str, orig);
00694 OBJ_INFECT(str, orig);
00695 }
00696 }
00697 else if (STR_EMBED_P(orig)) {
00698 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00699 rb_enc_cr_str_exact_copy(str, orig);
00700 OBJ_INFECT(str, orig);
00701 }
00702 else if (STR_ASSOC_P(orig)) {
00703 VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00704 FL_UNSET(orig, STR_ASSOC);
00705 str = str_new4(klass, orig);
00706 FL_SET(str, STR_ASSOC);
00707 RSTRING(str)->as.heap.aux.shared = assoc;
00708 }
00709 else {
00710 str = str_new4(klass, orig);
00711 }
00712 OBJ_FREEZE(str);
00713 return str;
00714 }
00715
00716 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00717 #define rb_str_new4 rb_str_new_frozen
00718
00719 VALUE
00720 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00721 {
00722 return str_new(rb_obj_class(obj), ptr, len);
00723 }
00724
00725 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00726 rb_str_new_with_class, (obj, ptr, len))
00727 #define rb_str_new5 rb_str_new_with_class
00728
00729 static VALUE
00730 str_new_empty(VALUE str)
00731 {
00732 VALUE v = rb_str_new5(str, 0, 0);
00733 OBJ_INFECT(v, str);
00734 return v;
00735 }
00736
00737 #define STR_BUF_MIN_SIZE 128
00738
00739 VALUE
00740 rb_str_buf_new(long capa)
00741 {
00742 VALUE str = str_alloc(rb_cString);
00743
00744 if (capa < STR_BUF_MIN_SIZE) {
00745 capa = STR_BUF_MIN_SIZE;
00746 }
00747 FL_SET(str, STR_NOEMBED);
00748 RSTRING(str)->as.heap.aux.capa = capa;
00749 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00750 RSTRING(str)->as.heap.ptr[0] = '\0';
00751
00752 return str;
00753 }
00754
00755 VALUE
00756 rb_str_buf_new_cstr(const char *ptr)
00757 {
00758 VALUE str;
00759 long len = strlen(ptr);
00760
00761 str = rb_str_buf_new(len);
00762 rb_str_buf_cat(str, ptr, len);
00763
00764 return str;
00765 }
00766
00767 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00768 #define rb_str_buf_new2 rb_str_buf_new_cstr
00769
00770 VALUE
00771 rb_str_tmp_new(long len)
00772 {
00773 return str_new(0, 0, len);
00774 }
00775
00776 void
00777 rb_str_free(VALUE str)
00778 {
00779 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00780 xfree(RSTRING(str)->as.heap.ptr);
00781 }
00782 }
00783
00784 size_t
00785 rb_str_memsize(VALUE str)
00786 {
00787 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00788 return RSTRING(str)->as.heap.aux.capa;
00789 }
00790 else {
00791 return 0;
00792 }
00793 }
00794
00795 VALUE
00796 rb_str_to_str(VALUE str)
00797 {
00798 return rb_convert_type(str, T_STRING, "String", "to_str");
00799 }
00800
00801 static inline void str_discard(VALUE str);
00802
00803 void
00804 rb_str_shared_replace(VALUE str, VALUE str2)
00805 {
00806 rb_encoding *enc;
00807 int cr;
00808 if (str == str2) return;
00809 enc = STR_ENC_GET(str2);
00810 cr = ENC_CODERANGE(str2);
00811 str_discard(str);
00812 OBJ_INFECT(str, str2);
00813 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00814 STR_SET_EMBED(str);
00815 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00816 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00817 rb_enc_associate(str, enc);
00818 ENC_CODERANGE_SET(str, cr);
00819 return;
00820 }
00821 STR_SET_NOEMBED(str);
00822 STR_UNSET_NOCAPA(str);
00823 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00824 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00825 if (STR_NOCAPA_P(str2)) {
00826 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00827 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00828 }
00829 else {
00830 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00831 }
00832 STR_SET_EMBED(str2);
00833 RSTRING_PTR(str2)[0] = 0;
00834 STR_SET_EMBED_LEN(str2, 0);
00835 rb_enc_associate(str, enc);
00836 ENC_CODERANGE_SET(str, cr);
00837 }
00838
00839 static ID id_to_s;
00840
00841 VALUE
00842 rb_obj_as_string(VALUE obj)
00843 {
00844 VALUE str;
00845
00846 if (TYPE(obj) == T_STRING) {
00847 return obj;
00848 }
00849 str = rb_funcall(obj, id_to_s, 0);
00850 if (TYPE(str) != T_STRING)
00851 return rb_any_to_s(obj);
00852 if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00853 return str;
00854 }
00855
00856 static VALUE
00857 str_replace(VALUE str, VALUE str2)
00858 {
00859 long len;
00860
00861 len = RSTRING_LEN(str2);
00862 if (STR_ASSOC_P(str2)) {
00863 str2 = rb_str_new4(str2);
00864 }
00865 if (STR_SHARED_P(str2)) {
00866 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00867 assert(OBJ_FROZEN(shared));
00868 STR_SET_NOEMBED(str);
00869 RSTRING(str)->as.heap.len = len;
00870 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00871 FL_SET(str, ELTS_SHARED);
00872 FL_UNSET(str, STR_ASSOC);
00873 RSTRING(str)->as.heap.aux.shared = shared;
00874 }
00875 else {
00876 str_replace_shared(str, str2);
00877 }
00878
00879 OBJ_INFECT(str, str2);
00880 rb_enc_cr_str_exact_copy(str, str2);
00881 return str;
00882 }
00883
00884 static VALUE
00885 str_duplicate(VALUE klass, VALUE str)
00886 {
00887 VALUE dup = str_alloc(klass);
00888 str_replace(dup, str);
00889 return dup;
00890 }
00891
00892 VALUE
00893 rb_str_dup(VALUE str)
00894 {
00895 return str_duplicate(rb_obj_class(str), str);
00896 }
00897
00898 VALUE
00899 rb_str_resurrect(VALUE str)
00900 {
00901 return str_replace(str_alloc(rb_cString), str);
00902 }
00903
00904
00905
00906
00907
00908
00909
00910
00911 static VALUE
00912 rb_str_init(int argc, VALUE *argv, VALUE str)
00913 {
00914 VALUE orig;
00915
00916 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00917 rb_str_replace(str, orig);
00918 return str;
00919 }
00920
00921 static inline long
00922 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00923 {
00924 long c;
00925 const char *q;
00926
00927 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00928 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00929 }
00930 else if (rb_enc_asciicompat(enc)) {
00931 c = 0;
00932 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00933 while (p < e) {
00934 if (ISASCII(*p)) {
00935 q = search_nonascii(p, e);
00936 if (!q)
00937 return c + (e - p);
00938 c += q - p;
00939 p = q;
00940 }
00941 p += rb_enc_fast_mbclen(p, e, enc);
00942 c++;
00943 }
00944 }
00945 else {
00946 while (p < e) {
00947 if (ISASCII(*p)) {
00948 q = search_nonascii(p, e);
00949 if (!q)
00950 return c + (e - p);
00951 c += q - p;
00952 p = q;
00953 }
00954 p += rb_enc_mbclen(p, e, enc);
00955 c++;
00956 }
00957 }
00958 return c;
00959 }
00960
00961 for (c=0; p<e; c++) {
00962 p += rb_enc_mbclen(p, e, enc);
00963 }
00964 return c;
00965 }
00966
00967 long
00968 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00969 {
00970 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00971 }
00972
00973 long
00974 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00975 {
00976 long c;
00977 const char *q;
00978 int ret;
00979
00980 *cr = 0;
00981 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00982 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00983 }
00984 else if (rb_enc_asciicompat(enc)) {
00985 c = 0;
00986 while (p < e) {
00987 if (ISASCII(*p)) {
00988 q = search_nonascii(p, e);
00989 if (!q) {
00990 if (!*cr) *cr = ENC_CODERANGE_7BIT;
00991 return c + (e - p);
00992 }
00993 c += q - p;
00994 p = q;
00995 }
00996 ret = rb_enc_precise_mbclen(p, e, enc);
00997 if (MBCLEN_CHARFOUND_P(ret)) {
00998 *cr |= ENC_CODERANGE_VALID;
00999 p += MBCLEN_CHARFOUND_LEN(ret);
01000 }
01001 else {
01002 *cr = ENC_CODERANGE_BROKEN;
01003 p++;
01004 }
01005 c++;
01006 }
01007 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01008 return c;
01009 }
01010
01011 for (c=0; p<e; c++) {
01012 ret = rb_enc_precise_mbclen(p, e, enc);
01013 if (MBCLEN_CHARFOUND_P(ret)) {
01014 *cr |= ENC_CODERANGE_VALID;
01015 p += MBCLEN_CHARFOUND_LEN(ret);
01016 }
01017 else {
01018 *cr = ENC_CODERANGE_BROKEN;
01019 if (p + rb_enc_mbminlen(enc) <= e)
01020 p += rb_enc_mbminlen(enc);
01021 else
01022 p = e;
01023 }
01024 }
01025 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01026 return c;
01027 }
01028
#ifdef NONASCII_MASK
/* True for every byte that is not of the form 10xxxxxx. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/* Count, in the VALUE-sized word at s, the bytes that are not of the form
 * 10xxxxxx.
 *
 * Step 1 leaves a 1 in the lowest bit of each byte whose bit 6 is set or
 * whose bit 7 is clear (i.e. anything but 10xxxxxx). Step 2 adds the
 * per-byte bits together; the total ends up in the low nibble. */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE w = *s;

    /* step 1: one flag bit per byte */
    w = ((w | ~(w >> 1)) >> 6) & (NONASCII_MASK >> 7);

    /* step 2: horizontal sum of the flag bits */
    w += (w >> 8);
    w += (w >> 16);
#if SIZEOF_VALUE == 8
    w += (w >> 32);
#endif
    return (w & 0xF);
}
#endif
01046
01047 static long
01048 str_strlen(VALUE str, rb_encoding *enc)
01049 {
01050 const char *p, *e;
01051 long n;
01052 int cr;
01053
01054 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01055 if (!enc) enc = STR_ENC_GET(str);
01056 p = RSTRING_PTR(str);
01057 e = RSTRING_END(str);
01058 cr = ENC_CODERANGE(str);
01059 #ifdef NONASCII_MASK
01060 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01061 enc == rb_utf8_encoding()) {
01062
01063 VALUE len = 0;
01064 if ((int)sizeof(VALUE) * 2 < e - p) {
01065 const VALUE *s, *t;
01066 const VALUE lowbits = sizeof(VALUE) - 1;
01067 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01068 t = (const VALUE*)(~lowbits & (VALUE)e);
01069 while (p < (const char *)s) {
01070 if (is_utf8_lead_byte(*p)) len++;
01071 p++;
01072 }
01073 while (s < t) {
01074 len += count_utf8_lead_bytes_with_word(s);
01075 s++;
01076 }
01077 p = (const char *)s;
01078 }
01079 while (p < e) {
01080 if (is_utf8_lead_byte(*p)) len++;
01081 p++;
01082 }
01083 return (long)len;
01084 }
01085 #endif
01086 n = rb_enc_strlen_cr(p, e, enc, &cr);
01087 if (cr) {
01088 ENC_CODERANGE_SET(str, cr);
01089 }
01090 return n;
01091 }
01092
01093 long
01094 rb_str_strlen(VALUE str)
01095 {
01096 return str_strlen(str, STR_ENC_GET(str));
01097 }
01098
01099
01100
01101
01102
01103
01104
01105
01106
01107 VALUE
01108 rb_str_length(VALUE str)
01109 {
01110 long len;
01111
01112 len = str_strlen(str, STR_ENC_GET(str));
01113 return LONG2NUM(len);
01114 }
01115
01116
01117
01118
01119
01120
01121
01122
01123 static VALUE
01124 rb_str_bytesize(VALUE str)
01125 {
01126 return INT2NUM(RSTRING_LEN(str));
01127 }
01128
01129
01130
01131
01132
01133
01134
01135
01136
01137
01138
01139 static VALUE
01140 rb_str_empty(VALUE str)
01141 {
01142 if (RSTRING_LEN(str) == 0)
01143 return Qtrue;
01144 return Qfalse;
01145 }
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155
01156
01157 VALUE
01158 rb_str_plus(VALUE str1, VALUE str2)
01159 {
01160 VALUE str3;
01161 rb_encoding *enc;
01162
01163 StringValue(str2);
01164 enc = rb_enc_check(str1, str2);
01165 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01166 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01167 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01168 RSTRING_PTR(str2), RSTRING_LEN(str2));
01169 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01170
01171 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01172 OBJ_TAINT(str3);
01173 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01174 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01175 return str3;
01176 }
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188 VALUE
01189 rb_str_times(VALUE str, VALUE times)
01190 {
01191 VALUE str2;
01192 long n, len;
01193 char *ptr2;
01194
01195 len = NUM2LONG(times);
01196 if (len < 0) {
01197 rb_raise(rb_eArgError, "negative argument");
01198 }
01199 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
01200 rb_raise(rb_eArgError, "argument too big");
01201 }
01202
01203 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01204 ptr2 = RSTRING_PTR(str2);
01205 if (len) {
01206 n = RSTRING_LEN(str);
01207 memcpy(ptr2, RSTRING_PTR(str), n);
01208 while (n <= len/2) {
01209 memcpy(ptr2 + n, ptr2, n);
01210 n *= 2;
01211 }
01212 memcpy(ptr2 + n, ptr2, len-n);
01213 }
01214 ptr2[RSTRING_LEN(str2)] = '\0';
01215 OBJ_INFECT(str2, str);
01216 rb_enc_cr_str_copy_for_substr(str2, str);
01217
01218 return str2;
01219 }
01220
01221
01222
01223
01224
01225
01226
01227
01228
01229
01230
01231
01232
01233
01234
01235 static VALUE
01236 rb_str_format_m(VALUE str, VALUE arg)
01237 {
01238 volatile VALUE tmp = rb_check_array_type(arg);
01239
01240 if (!NIL_P(tmp)) {
01241 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01242 }
01243 return rb_str_format(1, &arg, str);
01244 }
01245
01246 static inline void
01247 str_modifiable(VALUE str)
01248 {
01249 if (FL_TEST(str, STR_TMPLOCK)) {
01250 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01251 }
01252 if (OBJ_FROZEN(str)) rb_error_frozen("string");
01253 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01254 rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01255 }
01256
01257 static inline int
01258 str_independent(VALUE str)
01259 {
01260 str_modifiable(str);
01261 if (!STR_SHARED_P(str)) return 1;
01262 if (STR_EMBED_P(str)) return 1;
01263 return 0;
01264 }
01265
01266 static void
01267 str_make_independent(VALUE str)
01268 {
01269 char *ptr;
01270 long len = RSTRING_LEN(str);
01271
01272 ptr = ALLOC_N(char, len+1);
01273 if (RSTRING_PTR(str)) {
01274 memcpy(ptr, RSTRING_PTR(str), len);
01275 }
01276 STR_SET_NOEMBED(str);
01277 ptr[len] = 0;
01278 RSTRING(str)->as.heap.ptr = ptr;
01279 RSTRING(str)->as.heap.len = len;
01280 RSTRING(str)->as.heap.aux.capa = len;
01281 STR_UNSET_NOCAPA(str);
01282 }
01283
01284 void
01285 rb_str_modify(VALUE str)
01286 {
01287 if (!str_independent(str))
01288 str_make_independent(str);
01289 ENC_CODERANGE_CLEAR(str);
01290 }
01291
01292
01293 static void
01294 str_modify_keep_cr(VALUE str)
01295 {
01296 if (!str_independent(str))
01297 str_make_independent(str);
01298 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01299
01300 ENC_CODERANGE_CLEAR(str);
01301 }
01302
01303 static inline void
01304 str_discard(VALUE str)
01305 {
01306 str_modifiable(str);
01307 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01308 xfree(RSTRING_PTR(str));
01309 RSTRING(str)->as.heap.ptr = 0;
01310 RSTRING(str)->as.heap.len = 0;
01311 }
01312 }
01313
01314 void
01315 rb_str_associate(VALUE str, VALUE add)
01316 {
01317
01318 if (OBJ_FROZEN(str)) rb_error_frozen("string");
01319 if (STR_ASSOC_P(str)) {
01320
01321 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01322 }
01323 else {
01324 if (STR_SHARED_P(str)) {
01325 VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01326 str_make_independent(str);
01327 if (STR_ASSOC_P(assoc)) {
01328 assoc = RSTRING(assoc)->as.heap.aux.shared;
01329 rb_ary_concat(assoc, add);
01330 add = assoc;
01331 }
01332 }
01333 else if (STR_EMBED_P(str)) {
01334 str_make_independent(str);
01335 }
01336 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01337 RESIZE_CAPA(str, RSTRING_LEN(str));
01338 }
01339 FL_SET(str, STR_ASSOC);
01340 RBASIC(add)->klass = 0;
01341 RSTRING(str)->as.heap.aux.shared = add;
01342 }
01343 }
01344
01345 VALUE
01346 rb_str_associated(VALUE str)
01347 {
01348 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01349 if (STR_ASSOC_P(str)) {
01350 return RSTRING(str)->as.heap.aux.shared;
01351 }
01352 return Qfalse;
01353 }
01354
01355 VALUE
01356 rb_string_value(volatile VALUE *ptr)
01357 {
01358 VALUE s = *ptr;
01359 if (TYPE(s) != T_STRING) {
01360 s = rb_str_to_str(s);
01361 *ptr = s;
01362 }
01363 return s;
01364 }
01365
01366 char *
01367 rb_string_value_ptr(volatile VALUE *ptr)
01368 {
01369 VALUE str = rb_string_value(ptr);
01370 return RSTRING_PTR(str);
01371 }
01372
01373 char *
01374 rb_string_value_cstr(volatile VALUE *ptr)
01375 {
01376 VALUE str = rb_string_value(ptr);
01377 char *s = RSTRING_PTR(str);
01378 long len = RSTRING_LEN(str);
01379
01380 if (!s || memchr(s, 0, len)) {
01381 rb_raise(rb_eArgError, "string contains null byte");
01382 }
01383 if (s[len]) rb_str_modify(str);
01384 return s;
01385 }
01386
01387 VALUE
01388 rb_check_string_type(VALUE str)
01389 {
01390 str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01391 return str;
01392 }
01393
01394
01395
01396
01397
01398
01399
01400
01401
01402
01403
01404
01405 static VALUE
01406 rb_str_s_try_convert(VALUE dummy, VALUE str)
01407 {
01408 return rb_check_string_type(str);
01409 }
01410
01411 char*
01412 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01413 {
01414 if (rb_enc_mbmaxlen(enc) == 1) {
01415 p += nth;
01416 }
01417 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01418 p += nth * rb_enc_mbmaxlen(enc);
01419 }
01420 else if (rb_enc_asciicompat(enc)) {
01421 const char *p2, *e2;
01422 int n;
01423
01424 while (p < e && 0 < nth) {
01425 e2 = p + nth;
01426 if (e < e2)
01427 return (char *)e;
01428 if (ISASCII(*p)) {
01429 p2 = search_nonascii(p, e2);
01430 if (!p2)
01431 return (char *)e2;
01432 nth -= p2 - p;
01433 p = p2;
01434 }
01435 n = rb_enc_mbclen(p, e, enc);
01436 p += n;
01437 nth--;
01438 }
01439 if (nth != 0)
01440 return (char *)e;
01441 return (char *)p;
01442 }
01443 else {
01444 while (p<e && nth--) {
01445 p += rb_enc_mbclen(p, e, enc);
01446 }
01447 }
01448 if (p > e) p = e;
01449 return (char*)p;
01450 }
01451
01452 static char*
01453 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01454 {
01455 if (singlebyte)
01456 p += nth;
01457 else {
01458 p = rb_enc_nth(p, e, nth, enc);
01459 }
01460 if (!p) return 0;
01461 if (p > e) p = e;
01462 return (char *)p;
01463 }
01464
01465
01466 static long
01467 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01468 {
01469 const char *pp = str_nth(p, e, nth, enc, singlebyte);
01470 if (!pp) return e - p;
01471 return pp - p;
01472 }
01473
01474 long
01475 rb_str_offset(VALUE str, long pos)
01476 {
01477 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01478 STR_ENC_GET(str), single_byte_optimizable(str));
01479 }
01480
01481 #ifdef NONASCII_MASK
01482 static char *
01483 str_utf8_nth(const char *p, const char *e, long nth)
01484 {
01485 if ((int)SIZEOF_VALUE < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01486 const VALUE *s, *t;
01487 const VALUE lowbits = sizeof(VALUE) - 1;
01488 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01489 t = (const VALUE*)(~lowbits & (VALUE)e);
01490 while (p < (const char *)s) {
01491 if (is_utf8_lead_byte(*p)) nth--;
01492 p++;
01493 }
01494 do {
01495 nth -= count_utf8_lead_bytes_with_word(s);
01496 s++;
01497 } while (s < t && (int)sizeof(VALUE) <= nth);
01498 p = (char *)s;
01499 }
01500 while (p < e) {
01501 if (is_utf8_lead_byte(*p)) {
01502 if (nth == 0) break;
01503 nth--;
01504 }
01505 p++;
01506 }
01507 return (char *)p;
01508 }
01509
01510 static long
01511 str_utf8_offset(const char *p, const char *e, long nth)
01512 {
01513 const char *pp = str_utf8_nth(p, e, nth);
01514 return pp - p;
01515 }
01516 #endif
01517
01518
01519 long
01520 rb_str_sublen(VALUE str, long pos)
01521 {
01522 if (single_byte_optimizable(str) || pos < 0)
01523 return pos;
01524 else {
01525 char *p = RSTRING_PTR(str);
01526 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01527 }
01528 }
01529
01530 VALUE
01531 rb_str_subseq(VALUE str, long beg, long len)
01532 {
01533 VALUE str2;
01534
01535 if (RSTRING_LEN(str) == beg + len &&
01536 RSTRING_EMBED_LEN_MAX < len) {
01537 str2 = rb_str_new_shared(rb_str_new_frozen(str));
01538 rb_str_drop_bytes(str2, beg);
01539 }
01540 else {
01541 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01542 }
01543
01544 rb_enc_cr_str_copy_for_substr(str2, str);
01545 OBJ_INFECT(str2, str);
01546
01547 return str2;
01548 }
01549
01550 VALUE
01551 rb_str_substr(VALUE str, long beg, long len)
01552 {
01553 rb_encoding *enc = STR_ENC_GET(str);
01554 VALUE str2;
01555 char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01556
01557 if (len < 0) return Qnil;
01558 if (!RSTRING_LEN(str)) {
01559 len = 0;
01560 }
01561 if (single_byte_optimizable(str)) {
01562 if (beg > RSTRING_LEN(str)) return Qnil;
01563 if (beg < 0) {
01564 beg += RSTRING_LEN(str);
01565 if (beg < 0) return Qnil;
01566 }
01567 if (beg + len > RSTRING_LEN(str))
01568 len = RSTRING_LEN(str) - beg;
01569 if (len <= 0) {
01570 len = 0;
01571 p = 0;
01572 }
01573 else
01574 p = s + beg;
01575 goto sub;
01576 }
01577 if (beg < 0) {
01578 if (len > -beg) len = -beg;
01579 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01580 beg = -beg;
01581 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01582 p = e;
01583 if (!p) return Qnil;
01584 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01585 if (!p) return Qnil;
01586 len = e - p;
01587 goto sub;
01588 }
01589 else {
01590 beg += str_strlen(str, enc);
01591 if (beg < 0) return Qnil;
01592 }
01593 }
01594 else if (beg > 0 && beg > str_strlen(str, enc)) {
01595 return Qnil;
01596 }
01597 if (len == 0) {
01598 p = 0;
01599 }
01600 #ifdef NONASCII_MASK
01601 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01602 enc == rb_utf8_encoding()) {
01603 p = str_utf8_nth(s, e, beg);
01604 len = str_utf8_offset(p, e, len);
01605 }
01606 #endif
01607 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01608 int char_sz = rb_enc_mbmaxlen(enc);
01609
01610 p = s + beg * char_sz;
01611 if (p > e) {
01612 p = e;
01613 len = 0;
01614 }
01615 else if (len * char_sz > e - p)
01616 len = e - p;
01617 else
01618 len *= char_sz;
01619 }
01620 else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
01621 len = 0;
01622 }
01623 else {
01624 len = str_offset(p, e, len, enc, 0);
01625 }
01626 sub:
01627 if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
01628 str2 = rb_str_new4(str);
01629 str2 = str_new3(rb_obj_class(str2), str2);
01630 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01631 RSTRING(str2)->as.heap.len = len;
01632 }
01633 else {
01634 str2 = rb_str_new5(str, p, len);
01635 rb_enc_cr_str_copy_for_substr(str2, str);
01636 OBJ_INFECT(str2, str);
01637 }
01638
01639 return str2;
01640 }
01641
01642 VALUE
01643 rb_str_freeze(VALUE str)
01644 {
01645 if (STR_ASSOC_P(str)) {
01646 VALUE ary = RSTRING(str)->as.heap.aux.shared;
01647 OBJ_FREEZE(ary);
01648 }
01649 return rb_obj_freeze(str);
01650 }
01651
01652 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01653 #define rb_str_dup_frozen rb_str_new_frozen
01654
01655 VALUE
01656 rb_str_locktmp(VALUE str)
01657 {
01658 if (FL_TEST(str, STR_TMPLOCK)) {
01659 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01660 }
01661 FL_SET(str, STR_TMPLOCK);
01662 return str;
01663 }
01664
01665 VALUE
01666 rb_str_unlocktmp(VALUE str)
01667 {
01668 if (!FL_TEST(str, STR_TMPLOCK)) {
01669 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01670 }
01671 FL_UNSET(str, STR_TMPLOCK);
01672 return str;
01673 }
01674
01675 void
01676 rb_str_set_len(VALUE str, long len)
01677 {
01678 rb_str_modify(str);
01679 STR_SET_LEN(str, len);
01680 RSTRING_PTR(str)[len] = '\0';
01681 }
01682
01683 VALUE
01684 rb_str_resize(VALUE str, long len)
01685 {
01686 long slen;
01687
01688 if (len < 0) {
01689 rb_raise(rb_eArgError, "negative string size (or size too big)");
01690 }
01691
01692 rb_str_modify(str);
01693 slen = RSTRING_LEN(str);
01694 if (len != slen) {
01695 if (STR_EMBED_P(str)) {
01696 char *ptr;
01697 if (len <= RSTRING_EMBED_LEN_MAX) {
01698 STR_SET_EMBED_LEN(str, len);
01699 RSTRING(str)->as.ary[len] = '\0';
01700 return str;
01701 }
01702 ptr = ALLOC_N(char,len+1);
01703 MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
01704 RSTRING(str)->as.heap.ptr = ptr;
01705 STR_SET_NOEMBED(str);
01706 }
01707 else if (len <= RSTRING_EMBED_LEN_MAX) {
01708 char *ptr = RSTRING(str)->as.heap.ptr;
01709 STR_SET_EMBED(str);
01710 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, len);
01711 RSTRING(str)->as.ary[len] = '\0';
01712 STR_SET_EMBED_LEN(str, len);
01713 xfree(ptr);
01714 return str;
01715 }
01716 else if (slen < len || slen - len > 1024) {
01717 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01718 }
01719 if (!STR_NOCAPA_P(str)) {
01720 RSTRING(str)->as.heap.aux.capa = len;
01721 }
01722 RSTRING(str)->as.heap.len = len;
01723 RSTRING(str)->as.heap.ptr[len] = '\0';
01724 }
01725 return str;
01726 }
01727
01728 static VALUE
01729 str_buf_cat(VALUE str, const char *ptr, long len)
01730 {
01731 long capa, total, off = -1;
01732
01733 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01734 off = ptr - RSTRING_PTR(str);
01735 }
01736 rb_str_modify(str);
01737 if (len == 0) return 0;
01738 if (STR_ASSOC_P(str)) {
01739 FL_UNSET(str, STR_ASSOC);
01740 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01741 }
01742 else if (STR_EMBED_P(str)) {
01743 capa = RSTRING_EMBED_LEN_MAX;
01744 }
01745 else {
01746 capa = RSTRING(str)->as.heap.aux.capa;
01747 }
01748 if (RSTRING_LEN(str) >= LONG_MAX - len) {
01749 rb_raise(rb_eArgError, "string sizes too big");
01750 }
01751 total = RSTRING_LEN(str)+len;
01752 if (capa <= total) {
01753 while (total > capa) {
01754 if (capa + 1 >= LONG_MAX / 2) {
01755 capa = (total + 4095) / 4096;
01756 break;
01757 }
01758 capa = (capa + 1) * 2;
01759 }
01760 RESIZE_CAPA(str, capa);
01761 }
01762 if (off != -1) {
01763 ptr = RSTRING_PTR(str) + off;
01764 }
01765 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01766 STR_SET_LEN(str, total);
01767 RSTRING_PTR(str)[total] = '\0';
01768
01769 return str;
01770 }
01771
/* Append the NUL-terminated C string ptr to str; note that ptr is evaluated twice. */
#define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01773
01774 VALUE
01775 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01776 {
01777 if (len == 0) return str;
01778 if (len < 0) {
01779 rb_raise(rb_eArgError, "negative string size (or size too big)");
01780 }
01781 return str_buf_cat(str, ptr, len);
01782 }
01783
01784 VALUE
01785 rb_str_buf_cat2(VALUE str, const char *ptr)
01786 {
01787 return rb_str_buf_cat(str, ptr, strlen(ptr));
01788 }
01789
01790 VALUE
01791 rb_str_cat(VALUE str, const char *ptr, long len)
01792 {
01793 if (len < 0) {
01794 rb_raise(rb_eArgError, "negative string size (or size too big)");
01795 }
01796 if (STR_ASSOC_P(str)) {
01797 rb_str_modify(str);
01798 if (STR_EMBED_P(str)) str_make_independent(str);
01799 REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
01800 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
01801 RSTRING(str)->as.heap.len += len;
01802 RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0';
01803 return str;
01804 }
01805
01806 return rb_str_buf_cat(str, ptr, len);
01807 }
01808
01809 VALUE
01810 rb_str_cat2(VALUE str, const char *ptr)
01811 {
01812 return rb_str_cat(str, ptr, strlen(ptr));
01813 }
01814
01815 static VALUE
01816 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01817 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01818 {
01819 int str_encindex = ENCODING_GET(str);
01820 int res_encindex;
01821 int str_cr, res_cr;
01822 int str_a8 = ENCODING_IS_ASCII8BIT(str);
01823 int ptr_a8 = ptr_encindex == 0;
01824
01825 str_cr = ENC_CODERANGE(str);
01826
01827 if (str_encindex == ptr_encindex) {
01828 if (str_cr == ENC_CODERANGE_UNKNOWN ||
01829 (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
01830 ptr_cr = ENC_CODERANGE_UNKNOWN;
01831 }
01832 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01833 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01834 }
01835 }
01836 else {
01837 rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01838 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
01839 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
01840 if (len == 0)
01841 return str;
01842 if (RSTRING_LEN(str) == 0) {
01843 rb_str_buf_cat(str, ptr, len);
01844 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01845 return str;
01846 }
01847 goto incompatible;
01848 }
01849 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01850 ptr_cr = coderange_scan(ptr, len, ptr_enc);
01851 }
01852 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01853 if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
01854 str_cr = rb_enc_str_coderange(str);
01855 }
01856 }
01857 }
01858 if (ptr_cr_ret)
01859 *ptr_cr_ret = ptr_cr;
01860
01861 if (str_encindex != ptr_encindex &&
01862 str_cr != ENC_CODERANGE_7BIT &&
01863 ptr_cr != ENC_CODERANGE_7BIT) {
01864 incompatible:
01865 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01866 rb_enc_name(rb_enc_from_index(str_encindex)),
01867 rb_enc_name(rb_enc_from_index(ptr_encindex)));
01868 }
01869
01870 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01871 res_encindex = str_encindex;
01872 res_cr = ENC_CODERANGE_UNKNOWN;
01873 }
01874 else if (str_cr == ENC_CODERANGE_7BIT) {
01875 if (ptr_cr == ENC_CODERANGE_7BIT) {
01876 res_encindex = !str_a8 ? str_encindex : ptr_encindex;
01877 res_cr = ENC_CODERANGE_7BIT;
01878 }
01879 else {
01880 res_encindex = ptr_encindex;
01881 res_cr = ptr_cr;
01882 }
01883 }
01884 else if (str_cr == ENC_CODERANGE_VALID) {
01885 res_encindex = str_encindex;
01886 res_cr = str_cr;
01887 }
01888 else {
01889 res_encindex = str_encindex;
01890 res_cr = str_cr;
01891 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01892 }
01893
01894 if (len < 0) {
01895 rb_raise(rb_eArgError, "negative string size (or size too big)");
01896 }
01897 str_buf_cat(str, ptr, len);
01898 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01899 return str;
01900 }
01901
01902 VALUE
01903 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01904 {
01905 return rb_enc_cr_str_buf_cat(str, ptr, len,
01906 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01907 }
01908
01909 VALUE
01910 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
01911 {
01912
01913 int encindex = ENCODING_GET(str);
01914 rb_encoding *enc = rb_enc_from_index(encindex);
01915 if (rb_enc_asciicompat(enc)) {
01916 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
01917 encindex, ENC_CODERANGE_7BIT, 0);
01918 }
01919 else {
01920 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
01921 while (*ptr) {
01922 unsigned int c = (unsigned char)*ptr;
01923 int len = rb_enc_codelen(c, enc);
01924 rb_enc_mbcput(c, buf, enc);
01925 rb_enc_cr_str_buf_cat(str, buf, len,
01926 encindex, ENC_CODERANGE_VALID, 0);
01927 ptr++;
01928 }
01929 return str;
01930 }
01931 }
01932
01933 VALUE
01934 rb_str_buf_append(VALUE str, VALUE str2)
01935 {
01936 int str2_cr;
01937
01938 str2_cr = ENC_CODERANGE(str2);
01939
01940 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
01941 ENCODING_GET(str2), str2_cr, &str2_cr);
01942
01943 OBJ_INFECT(str, str2);
01944 ENC_CODERANGE_SET(str2, str2_cr);
01945
01946 return str;
01947 }
01948
01949 VALUE
01950 rb_str_append(VALUE str, VALUE str2)
01951 {
01952 rb_encoding *enc;
01953 int cr, cr2;
01954
01955 StringValue(str2);
01956 if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
01957 long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
01958 enc = rb_enc_check(str, str2);
01959 cr = ENC_CODERANGE(str);
01960 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
01961 rb_str_modify(str);
01962 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01963 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
01964 RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
01965 RSTRING(str)->as.heap.len = len;
01966 rb_enc_associate(str, enc);
01967 ENC_CODERANGE_SET(str, cr);
01968 OBJ_INFECT(str, str2);
01969 return str;
01970 }
01971 return rb_str_buf_append(str, str2);
01972 }
01973
01974
01975
01976
01977
01978
01979
01980
01981
01982
01983
01984
01985
01986
01987
01988
01989
01990
01991 VALUE
01992 rb_str_concat(VALUE str1, VALUE str2)
01993 {
01994 SIGNED_VALUE lc;
01995
01996 if (FIXNUM_P(str2)) {
01997 lc = FIX2LONG(str2);
01998 if (lc < 0)
01999 rb_raise(rb_eRangeError, "negative argument");
02000 }
02001 else if (TYPE(str2) == T_BIGNUM) {
02002 if (!RBIGNUM_SIGN(str2))
02003 rb_raise(rb_eRangeError, "negative argument");
02004 lc = rb_big2ulong(str2);
02005 }
02006 else {
02007 return rb_str_append(str1, str2);
02008 }
02009 #if SIZEOF_INT < SIZEOF_VALUE
02010 if ((VALUE)lc > UINT_MAX) {
02011 rb_raise(rb_eRangeError, "%"PRIuVALUE" out of char range", lc);
02012 }
02013 #endif
02014 {
02015 rb_encoding *enc = STR_ENC_GET(str1);
02016 long pos = RSTRING_LEN(str1);
02017 int cr = ENC_CODERANGE(str1);
02018 int c, len;
02019
02020 if ((len = rb_enc_codelen(c = (int)lc, enc)) <= 0) {
02021 rb_raise(rb_eRangeError, "%u invalid char", c);
02022 }
02023 rb_str_resize(str1, pos+len);
02024 rb_enc_mbcput(c, RSTRING_PTR(str1)+pos, enc);
02025 ENC_CODERANGE_SET(str1, cr);
02026 return str1;
02027 }
02028 }
02029
02030 st_index_t
02031 rb_memhash(const void *ptr, long len)
02032 {
02033 return st_hash(ptr, len, rb_hash_start(0));
02034 }
02035
02036 st_index_t
02037 rb_str_hash(VALUE str)
02038 {
02039 int e = ENCODING_GET(str);
02040 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02041 e = 0;
02042 }
02043 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02044 }
02045
02046 int
02047 rb_str_hash_cmp(VALUE str1, VALUE str2)
02048 {
02049 long len;
02050
02051 if (!rb_str_comparable(str1, str2)) return 1;
02052 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02053 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02054 return 0;
02055 }
02056 return 1;
02057 }
02058
02059
02060
02061
02062
02063
02064
02065
02066 static VALUE
02067 rb_str_hash_m(VALUE str)
02068 {
02069 st_index_t hval = rb_str_hash(str);
02070 return INT2FIX(hval);
02071 }
02072
/* Smaller of two values; each argument may be evaluated twice. */
#define lesser(a,b) (((a) < (b)) ? (a) : (b))
02074
02075 int
02076 rb_str_comparable(VALUE str1, VALUE str2)
02077 {
02078 int idx1, idx2;
02079 int rc1, rc2;
02080
02081 if (RSTRING_LEN(str1) == 0) return TRUE;
02082 if (RSTRING_LEN(str2) == 0) return TRUE;
02083 idx1 = ENCODING_GET(str1);
02084 idx2 = ENCODING_GET(str2);
02085 if (idx1 == idx2) return TRUE;
02086 rc1 = rb_enc_str_coderange(str1);
02087 rc2 = rb_enc_str_coderange(str2);
02088 if (rc1 == ENC_CODERANGE_7BIT) {
02089 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02090 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02091 return TRUE;
02092 }
02093 if (rc2 == ENC_CODERANGE_7BIT) {
02094 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02095 return TRUE;
02096 }
02097 return FALSE;
02098 }
02099
02100 int
02101 rb_str_cmp(VALUE str1, VALUE str2)
02102 {
02103 long len;
02104 int retval;
02105
02106 len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
02107 retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
02108 if (retval == 0) {
02109 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
02110 if (!rb_str_comparable(str1, str2)) {
02111 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02112 return 1;
02113 return -1;
02114 }
02115 return 0;
02116 }
02117 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
02118 return -1;
02119 }
02120 if (retval > 0) return 1;
02121 return -1;
02122 }
02123
02124
02125 static VALUE
02126 str_eql(const VALUE str1, const VALUE str2)
02127 {
02128 const long len = RSTRING_LEN(str1);
02129
02130 if (len != RSTRING_LEN(str2)) return Qfalse;
02131 if (!rb_str_comparable(str1, str2)) return Qfalse;
02132 if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0)
02133 return Qtrue;
02134 return Qfalse;
02135 }
02136
02137
02138
02139
02140
02141
02142
02143
02144
02145 VALUE
02146 rb_str_equal(VALUE str1, VALUE str2)
02147 {
02148 if (str1 == str2) return Qtrue;
02149 if (TYPE(str2) != T_STRING) {
02150 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02151 return Qfalse;
02152 }
02153 return rb_equal(str2, str1);
02154 }
02155 return str_eql(str1, str2);
02156 }
02157
02158
02159
02160
02161
02162
02163
02164
02165 static VALUE
02166 rb_str_eql(VALUE str1, VALUE str2)
02167 {
02168 if (TYPE(str2) != T_STRING) return Qfalse;
02169 return str_eql(str1, str2);
02170 }
02171
02172
02173
02174
02175
02176
02177
02178
02179
02180
02181
02182
02183
02184
02185
02186
02187
02188
02189
02190
02191
02192
02193
02194
02195 static VALUE
02196 rb_str_cmp_m(VALUE str1, VALUE str2)
02197 {
02198 long result;
02199
02200 if (TYPE(str2) != T_STRING) {
02201 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02202 return Qnil;
02203 }
02204 else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02205 return Qnil;
02206 }
02207 else {
02208 VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02209
02210 if (NIL_P(tmp)) return Qnil;
02211 if (!FIXNUM_P(tmp)) {
02212 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02213 }
02214 result = -FIX2LONG(tmp);
02215 }
02216 }
02217 else {
02218 result = rb_str_cmp(str1, str2);
02219 }
02220 return LONG2NUM(result);
02221 }
02222
02223
02224
02225
02226
02227
02228
02229
02230
02231
02232
02233
02234
02235 static VALUE
02236 rb_str_casecmp(VALUE str1, VALUE str2)
02237 {
02238 long len;
02239 rb_encoding *enc;
02240 char *p1, *p1end, *p2, *p2end;
02241
02242 StringValue(str2);
02243 enc = rb_enc_compatible(str1, str2);
02244 if (!enc) {
02245 return Qnil;
02246 }
02247
02248 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02249 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02250 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02251 while (p1 < p1end && p2 < p2end) {
02252 if (*p1 != *p2) {
02253 unsigned int c1 = TOUPPER(*p1 & 0xff);
02254 unsigned int c2 = TOUPPER(*p2 & 0xff);
02255 if (c1 != c2)
02256 return INT2FIX(c1 < c2 ? -1 : 1);
02257 }
02258 p1++;
02259 p2++;
02260 }
02261 }
02262 else {
02263 while (p1 < p1end && p2 < p2end) {
02264 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02265 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02266
02267 if (0 <= c1 && 0 <= c2) {
02268 c1 = TOUPPER(c1);
02269 c2 = TOUPPER(c2);
02270 if (c1 != c2)
02271 return INT2FIX(c1 < c2 ? -1 : 1);
02272 }
02273 else {
02274 int r;
02275 l1 = rb_enc_mbclen(p1, p1end, enc);
02276 l2 = rb_enc_mbclen(p2, p2end, enc);
02277 len = l1 < l2 ? l1 : l2;
02278 r = memcmp(p1, p2, len);
02279 if (r != 0)
02280 return INT2FIX(r < 0 ? -1 : 1);
02281 if (l1 != l2)
02282 return INT2FIX(l1 < l2 ? -1 : 1);
02283 }
02284 p1 += l1;
02285 p2 += l2;
02286 }
02287 }
02288 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02289 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02290 return INT2FIX(-1);
02291 }
02292
02293 static long
02294 rb_str_index(VALUE str, VALUE sub, long offset)
02295 {
02296 long pos;
02297 char *s, *sptr, *e;
02298 long len, slen;
02299 rb_encoding *enc;
02300
02301 enc = rb_enc_check(str, sub);
02302 if (is_broken_string(sub)) {
02303 return -1;
02304 }
02305 len = str_strlen(str, enc);
02306 slen = str_strlen(sub, enc);
02307 if (offset < 0) {
02308 offset += len;
02309 if (offset < 0) return -1;
02310 }
02311 if (len - offset < slen) return -1;
02312 s = RSTRING_PTR(str);
02313 e = s + RSTRING_LEN(str);
02314 if (offset) {
02315 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02316 s += offset;
02317 }
02318 if (slen == 0) return offset;
02319
02320 sptr = RSTRING_PTR(sub);
02321 slen = RSTRING_LEN(sub);
02322 len = RSTRING_LEN(str) - offset;
02323 for (;;) {
02324 char *t;
02325 pos = rb_memsearch(sptr, slen, s, len, enc);
02326 if (pos < 0) return pos;
02327 t = rb_enc_right_char_head(s, s+pos, e, enc);
02328 if (t == s + pos) break;
02329 if ((len -= t - s) <= 0) return -1;
02330 offset += t - s;
02331 s = t;
02332 }
02333 return pos + offset;
02334 }
02335
02336
02337
02338
02339
02340
02341
02342
02343
02344
02345
02346
02347
02348
02349
02350
02351
02352
02353
02354 static VALUE
02355 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02356 {
02357 VALUE sub;
02358 VALUE initpos;
02359 long pos;
02360
02361 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02362 pos = NUM2LONG(initpos);
02363 }
02364 else {
02365 pos = 0;
02366 }
02367 if (pos < 0) {
02368 pos += str_strlen(str, STR_ENC_GET(str));
02369 if (pos < 0) {
02370 if (TYPE(sub) == T_REGEXP) {
02371 rb_backref_set(Qnil);
02372 }
02373 return Qnil;
02374 }
02375 }
02376
02377 switch (TYPE(sub)) {
02378 case T_REGEXP:
02379 if (pos > str_strlen(str, STR_ENC_GET(str)))
02380 return Qnil;
02381 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02382 rb_enc_check(str, sub), single_byte_optimizable(str));
02383
02384 pos = rb_reg_search(sub, str, pos, 0);
02385 pos = rb_str_sublen(str, pos);
02386 break;
02387
02388 default: {
02389 VALUE tmp;
02390
02391 tmp = rb_check_string_type(sub);
02392 if (NIL_P(tmp)) {
02393 rb_raise(rb_eTypeError, "type mismatch: %s given",
02394 rb_obj_classname(sub));
02395 }
02396 sub = tmp;
02397 }
02398
02399 case T_STRING:
02400 pos = rb_str_index(str, sub, pos);
02401 pos = rb_str_sublen(str, pos);
02402 break;
02403 }
02404
02405 if (pos == -1) return Qnil;
02406 return LONG2NUM(pos);
02407 }
02408
02409 static long
02410 rb_str_rindex(VALUE str, VALUE sub, long pos)
02411 {
02412 long len, slen;
02413 char *s, *sbeg, *e, *t;
02414 rb_encoding *enc;
02415 int singlebyte = single_byte_optimizable(str);
02416
02417 enc = rb_enc_check(str, sub);
02418 if (is_broken_string(sub)) {
02419 return -1;
02420 }
02421 len = str_strlen(str, enc);
02422 slen = str_strlen(sub, enc);
02423
02424 if (len < slen) return -1;
02425 if (len - pos < slen) {
02426 pos = len - slen;
02427 }
02428 if (len == 0) {
02429 return pos;
02430 }
02431 sbeg = RSTRING_PTR(str);
02432 e = RSTRING_END(str);
02433 t = RSTRING_PTR(sub);
02434 slen = RSTRING_LEN(sub);
02435 for (;;) {
02436 s = str_nth(sbeg, e, pos, enc, singlebyte);
02437 if (!s) return -1;
02438 if (memcmp(s, t, slen) == 0) {
02439 return pos;
02440 }
02441 if (pos == 0) break;
02442 pos--;
02443 }
02444 return -1;
02445 }
02446
02447
02448
02449
02450
02451
02452
02453
02454
02455
02456
02457
02458
02459
02460
02461
02462
02463
02464
02465
02466 static VALUE
02467 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02468 {
02469 VALUE sub;
02470 VALUE vpos;
02471 rb_encoding *enc = STR_ENC_GET(str);
02472 long pos, len = str_strlen(str, enc);
02473
02474 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02475 pos = NUM2LONG(vpos);
02476 if (pos < 0) {
02477 pos += len;
02478 if (pos < 0) {
02479 if (TYPE(sub) == T_REGEXP) {
02480 rb_backref_set(Qnil);
02481 }
02482 return Qnil;
02483 }
02484 }
02485 if (pos > len) pos = len;
02486 }
02487 else {
02488 pos = len;
02489 }
02490
02491 switch (TYPE(sub)) {
02492 case T_REGEXP:
02493
02494 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02495 STR_ENC_GET(str), single_byte_optimizable(str));
02496
02497 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02498 pos = rb_reg_search(sub, str, pos, 1);
02499 pos = rb_str_sublen(str, pos);
02500 }
02501 if (pos >= 0) return LONG2NUM(pos);
02502 break;
02503
02504 default: {
02505 VALUE tmp;
02506
02507 tmp = rb_check_string_type(sub);
02508 if (NIL_P(tmp)) {
02509 rb_raise(rb_eTypeError, "type mismatch: %s given",
02510 rb_obj_classname(sub));
02511 }
02512 sub = tmp;
02513 }
02514
02515 case T_STRING:
02516 pos = rb_str_rindex(str, sub, pos);
02517 if (pos >= 0) return LONG2NUM(pos);
02518 break;
02519 }
02520 return Qnil;
02521 }
02522
02523
02524
02525
02526
02527
02528
02529
02530
02531
02532
02533
02534
02535
02536
02537 static VALUE
02538 rb_str_match(VALUE x, VALUE y)
02539 {
02540 switch (TYPE(y)) {
02541 case T_STRING:
02542 rb_raise(rb_eTypeError, "type mismatch: String given");
02543
02544 case T_REGEXP:
02545 return rb_reg_match(y, x);
02546
02547 default:
02548 return rb_funcall(y, rb_intern("=~"), 1, x);
02549 }
02550 }
02551
02552
02553 static VALUE get_pat(VALUE, int);
02554
02555
02556
02557
02558
02559
02560
02561
02562
02563
02564
02565
02566
02567
02568
02569
02570
02571
02572
02573
02574
02575
02576
02577
02578
02579
02580
02581
02582
02583
02584
02585
02586
02587 static VALUE
02588 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02589 {
02590 VALUE re, result;
02591 if (argc < 1)
02592 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02593 re = argv[0];
02594 argv[0] = str;
02595 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02596 if (!NIL_P(result) && rb_block_given_p()) {
02597 return rb_yield(result);
02598 }
02599 return result;
02600 }
02601
/* Outcome of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbouring character */
    NEIGHBOR_FOUND    = 1,  /* buffer now holds the neighbouring character */
    NEIGHBOR_WRAPPED  = 2   /* the value wrapped around */
};
02607
02608 static enum neighbor_char
02609 enc_succ_char(char *p, long len, rb_encoding *enc)
02610 {
02611 long i;
02612 int l;
02613 while (1) {
02614 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02615 p[i] = '\0';
02616 if (i < 0)
02617 return NEIGHBOR_WRAPPED;
02618 ++((unsigned char*)p)[i];
02619 l = rb_enc_precise_mbclen(p, p+len, enc);
02620 if (MBCLEN_CHARFOUND_P(l)) {
02621 l = MBCLEN_CHARFOUND_LEN(l);
02622 if (l == len) {
02623 return NEIGHBOR_FOUND;
02624 }
02625 else {
02626 memset(p+l, 0xff, len-l);
02627 }
02628 }
02629 if (MBCLEN_INVALID_P(l) && i < len-1) {
02630 long len2;
02631 int l2;
02632 for (len2 = len-1; 0 < len2; len2--) {
02633 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02634 if (!MBCLEN_INVALID_P(l2))
02635 break;
02636 }
02637 memset(p+len2+1, 0xff, len-(len2+1));
02638 }
02639 }
02640 }
02641
02642 static enum neighbor_char
02643 enc_pred_char(char *p, long len, rb_encoding *enc)
02644 {
02645 long i;
02646 int l;
02647 while (1) {
02648 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02649 p[i] = '\xff';
02650 if (i < 0)
02651 return NEIGHBOR_WRAPPED;
02652 --((unsigned char*)p)[i];
02653 l = rb_enc_precise_mbclen(p, p+len, enc);
02654 if (MBCLEN_CHARFOUND_P(l)) {
02655 l = MBCLEN_CHARFOUND_LEN(l);
02656 if (l == len) {
02657 return NEIGHBOR_FOUND;
02658 }
02659 else {
02660 memset(p+l, 0, len-l);
02661 }
02662 }
02663 if (MBCLEN_INVALID_P(l) && i < len-1) {
02664 long len2;
02665 int l2;
02666 for (len2 = len-1; 0 < len2; len2--) {
02667 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02668 if (!MBCLEN_INVALID_P(l2))
02669 break;
02670 }
02671 memset(p+len2+1, 0, len-(len2+1));
02672 }
02673 }
02674 }
02675
02676
02677
02678
02679
02680
02681
02682
02683
02684
02685 static enum neighbor_char
02686 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02687 {
02688 enum neighbor_char ret;
02689 unsigned int c;
02690 int ctype;
02691 int range;
02692 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02693
02694 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02695 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02696 ctype = ONIGENC_CTYPE_DIGIT;
02697 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02698 ctype = ONIGENC_CTYPE_ALPHA;
02699 else
02700 return NEIGHBOR_NOT_CHAR;
02701
02702 MEMCPY(save, p, char, len);
02703 ret = enc_succ_char(p, len, enc);
02704 if (ret == NEIGHBOR_FOUND) {
02705 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02706 if (rb_enc_isctype(c, ctype, enc))
02707 return NEIGHBOR_FOUND;
02708 }
02709 MEMCPY(p, save, char, len);
02710 range = 1;
02711 while (1) {
02712 MEMCPY(save, p, char, len);
02713 ret = enc_pred_char(p, len, enc);
02714 if (ret == NEIGHBOR_FOUND) {
02715 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02716 if (!rb_enc_isctype(c, ctype, enc)) {
02717 MEMCPY(p, save, char, len);
02718 break;
02719 }
02720 }
02721 else {
02722 MEMCPY(p, save, char, len);
02723 break;
02724 }
02725 range++;
02726 }
02727 if (range == 1) {
02728 return NEIGHBOR_NOT_CHAR;
02729 }
02730
02731 if (ctype != ONIGENC_CTYPE_DIGIT) {
02732 MEMCPY(carry, p, char, len);
02733 return NEIGHBOR_WRAPPED;
02734 }
02735
02736 MEMCPY(carry, p, char, len);
02737 enc_succ_char(carry, len, enc);
02738 return NEIGHBOR_WRAPPED;
02739 }
02740
02741
02742
02743
02744
02745
02746
02747
02748
02749
02750
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763
02764
02765
02766
/*
 * Return a new string holding the successor of orig; orig itself is not
 * modified (all stepping is done on the copy `str`).
 *
 * Pass 1 walks the characters from the end and steps digit/alphabetic
 * characters with enc_succ_alnum_char().  If no such character wrapped,
 * pass 2 steps raw characters with enc_succ_char().  When the leftmost
 * stepped character wrapped, `carry` is inserted at `carry_pos`.
 */
02767 VALUE
02768 rb_str_succ(VALUE orig)
02769 {
02770 rb_encoding *enc;
02771 VALUE str;
02772 char *sbeg, *s, *e, *last_alnum = 0;
/* c stays -1 until an alnum character has wrapped in pass 1 */
02773 int c = -1;
02774 long l;
/* default carry is the single byte 0x01; replaced when a character wraps */
02775 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02776 long carry_pos = 0, carry_len = 1;
02777 enum neighbor_char neighbor = NEIGHBOR_FOUND;
02778
/* work on a copy that shares orig's encoding/coderange and taint state */
02779 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02780 rb_enc_cr_str_copy_for_substr(str, orig);
02781 OBJ_INFECT(str, orig);
/* the successor of an empty string is the (empty) copy */
02782 if (RSTRING_LEN(str) == 0) return str;
02783
02784 enc = STR_ENC_GET(orig);
02785 sbeg = RSTRING_PTR(str);
02786 s = e = sbeg + RSTRING_LEN(str);
02787
/* pass 1: right-to-left over characters, stepping alnum characters only */
02788 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
/* the previous character was skipped as non-alnum and an alnum to its right
 * already wrapped: stop carrying if this character is of the other class
 * (letter vs digit) than the one that wrapped */
02789 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02790 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02791 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02792 s = last_alnum;
02793 break;
02794 }
02795 }
/* skip positions where no complete character is found */
02796 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02797 neighbor = enc_succ_alnum_char(s, l, enc, carry);
02798 switch (neighbor) {
02799 case NEIGHBOR_NOT_CHAR:
02800 continue;
02801 case NEIGHBOR_FOUND:
/* stepped without wrapping: nothing to carry, done */
02802 return str;
02803 case NEIGHBOR_WRAPPED:
02804 last_alnum = s;
02805 break;
02806 }
/* remember where a carry would have to be inserted */
02807 c = 1;
02808 carry_pos = s - sbeg;
02809 carry_len = l;
02810 }
/* pass 2: no alnum character wrapped in pass 1, so step raw characters */
02811 if (c == -1) {
02812 s = e;
02813 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02814 enum neighbor_char neighbor;
02815 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02816 neighbor = enc_succ_char(s, l, enc);
02817 if (neighbor == NEIGHBOR_FOUND)
02818 return str;
/* the wrapped bytes are not one character of length l: step once more */
02819 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02820
02821 enc_succ_char(s, l, enc);
02822 }
/* for non-ASCII-compatible encodings the carry is a copy of this character */
02823 if (!rb_enc_asciicompat(enc)) {
02824 MEMCPY(carry, s, char, l);
02825 carry_len = l;
02826 }
02827 carry_pos = s - sbeg;
02828 }
02829 }
/* insert the carry: grow the buffer, shift the tail, copy the carry in */
02830 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
02831 s = RSTRING_PTR(str) + carry_pos;
02832 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02833 memmove(s, carry, carry_len);
02834 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02835 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02836 rb_enc_str_coderange(str);
02837 return str;
02838 }
02839
02840
02841
02842
02843
02844
02845
02846
02847
02848
02849
02850 static VALUE
02851 rb_str_succ_bang(VALUE str)
02852 {
02853 rb_str_shared_replace(str, rb_str_succ(str));
02854
02855 return str;
02856 }
02857
02858
02859
02860
02861
02862
02863
02864
02865
02866
02867
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
/*
 * Yield successive strings from beg up to end.
 *
 * argv holds the end string and an optional "exclusive" flag; when the flag
 * is truthy the end value itself is not yielded.  Always returns beg.
 * NOTE(review): RETURN_ENUMERATOR presumably returns an enumerator when no
 * block is given -- confirm against its definition.
 *
 * Three strategies are used:
 *   1. both strings are one ASCII byte: iterate over byte values;
 *   2. both strings are all ASCII digits: iterate numerically, formatting
 *      each value with the width of beg;
 *   3. otherwise: repeatedly call #succ on the current string.
 */
02891 static VALUE
02892 rb_str_upto(int argc, VALUE *argv, VALUE beg)
02893 {
02894 VALUE end, exclusive;
02895 VALUE current, after_end;
02896 ID succ;
02897 int n, excl, ascii;
02898 rb_encoding *enc;
02899
02900 rb_scan_args(argc, argv, "11", &end, &exclusive);
02901 RETURN_ENUMERATOR(beg, argc, argv);
02902 excl = RTEST(exclusive);
02903 CONST_ID(succ, "succ");
02904 StringValue(end);
02905 enc = rb_enc_check(beg, end);
02906 ascii = (is_ascii_string(beg) && is_ascii_string(end));
02907
/* strategy 1: single ASCII byte on both sides */
02908 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
02909 char c = RSTRING_PTR(beg)[0];
02910 char e = RSTRING_PTR(end)[0];
02911
/* empty range: beg is past end, or equal to it in exclusive mode */
02912 if (c > e || (excl && c == e)) return beg;
02913 for (;;) {
02914 rb_yield(rb_enc_str_new(&c, 1, enc));
02915 if (!excl && c == e) break;
02916 c++;
02917 if (excl && c == e) break;
02918 }
02919 return beg;
02920 }
02921
/* strategy 2: both strings start with a digit; verified below to be all digits */
02922 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
02923 char *s, *send;
02924 VALUE b, e;
02925 int width;
02926
02927 s = RSTRING_PTR(beg); send = RSTRING_END(beg);
/* byte length of beg is used as the precision of every yielded number */
02928 width = rb_long2int(send - s);
02929 while (s < send) {
02930 if (!ISDIGIT(*s)) goto no_digits;
02931 s++;
02932 }
02933 s = RSTRING_PTR(end); send = RSTRING_END(end);
02934 while (s < send) {
02935 if (!ISDIGIT(*s)) goto no_digits;
02936 s++;
02937 }
02938 b = rb_str_to_inum(beg, 10, FALSE);
02939 e = rb_str_to_inum(end, 10, FALSE);
/* both bounds are Fixnums: iterate with C longs */
02940 if (FIXNUM_P(b) && FIXNUM_P(e)) {
02941 long bi = FIX2LONG(b);
02942 long ei = FIX2LONG(e);
02943 rb_encoding *usascii = rb_usascii_encoding();
02944
02945 while (bi <= ei) {
02946 if (excl && bi == ei) break;
02947 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
02948 bi++;
02949 }
02950 }
/* otherwise compare and increment through method calls on the integer objects */
02951 else {
02952 ID op = excl ? '<' : rb_intern("<=");
02953 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
02954
02955 args[0] = INT2FIX(width);
/* the VALUE returned by the comparison is used directly as the C condition */
02956 while (rb_funcall(b, op, 1, e)) {
02957 args[1] = b;
02958 rb_yield(rb_str_format(numberof(args), args, fmt));
02959 b = rb_funcall(b, succ, 0, 0);
02960 }
02961 }
02962 return beg;
02963 }
02964
/* strategy 3: generic iteration via #succ */
02965 no_digits:
02966 n = rb_str_cmp(beg, end);
02967 if (n > 0 || (excl && n == 0)) return beg;
02968
02969 after_end = rb_funcall(end, succ, 0, 0);
02970 current = rb_str_dup(beg);
02971 while (!rb_str_equal(current, after_end)) {
02972 VALUE next = Qnil;
/* compute the successor before yielding; it stays nil when current is the
 * inclusive end, which terminates the loop after the yield */
02973 if (excl || !rb_str_equal(current, end))
02974 next = rb_funcall(current, succ, 0, 0);
02975 rb_yield(current);
02976 if (NIL_P(next)) break;
02977 current = next;
02978 StringValue(current);
02979 if (excl && rb_str_equal(current, end)) break;
/* stop once the current string is longer than end, or empty */
02980 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
02981 break;
02982 }
02983
02984 return beg;
02985 }
02986
02987 static VALUE
02988 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
02989 {
02990 if (rb_reg_search(re, str, 0, 0) >= 0) {
02991 VALUE match = rb_backref_get();
02992 int nth = rb_reg_backref_number(match, backref);
02993 return rb_reg_nth_match(nth, match);
02994 }
02995 return Qnil;
02996 }
02997
02998 static VALUE
02999 rb_str_aref(VALUE str, VALUE indx)
03000 {
03001 long idx;
03002
03003 switch (TYPE(indx)) {
03004 case T_FIXNUM:
03005 idx = FIX2LONG(indx);
03006
03007 num_index:
03008 str = rb_str_substr(str, idx, 1);
03009 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03010 return str;
03011
03012 case T_REGEXP:
03013 return rb_str_subpat(str, indx, INT2FIX(0));
03014
03015 case T_STRING:
03016 if (rb_str_index(str, indx, 0) != -1)
03017 return rb_str_dup(indx);
03018 return Qnil;
03019
03020 default:
03021
03022 {
03023 long beg, len;
03024 VALUE tmp;
03025
03026 len = str_strlen(str, STR_ENC_GET(str));
03027 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03028 case Qfalse:
03029 break;
03030 case Qnil:
03031 return Qnil;
03032 default:
03033 tmp = rb_str_substr(str, beg, len);
03034 return tmp;
03035 }
03036 }
03037 idx = NUM2LONG(indx);
03038 goto num_index;
03039 }
03040 return Qnil;
03041 }
03042
03043
03044
03045
03046
03047
03048
03049
03050
03051
03052
03053
03054
03055
03056
03057
03058
03059
03060
03061
03062
03063
03064
03065
03066
03067
03068
03069
03070
03071
03072
03073
03074
03075
03076
03077
03078
03079
03080
03081
03082
03083
03084
03085
03086
03087
03088
03089
03090
03091
03092 static VALUE
03093 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03094 {
03095 if (argc == 2) {
03096 if (TYPE(argv[0]) == T_REGEXP) {
03097 return rb_str_subpat(str, argv[0], argv[1]);
03098 }
03099 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03100 }
03101 if (argc != 1) {
03102 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03103 }
03104 return rb_str_aref(str, argv[0]);
03105 }
03106
03107 VALUE
03108 rb_str_drop_bytes(VALUE str, long len)
03109 {
03110 char *ptr = RSTRING_PTR(str);
03111 long olen = RSTRING_LEN(str), nlen;
03112
03113 str_modifiable(str);
03114 if (len > olen) len = olen;
03115 nlen = olen - len;
03116 if (nlen <= RSTRING_EMBED_LEN_MAX) {
03117 char *oldptr = ptr;
03118 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03119 STR_SET_EMBED(str);
03120 STR_SET_EMBED_LEN(str, nlen);
03121 ptr = RSTRING(str)->as.ary;
03122 memmove(ptr, oldptr + len, nlen);
03123 if (fl == STR_NOEMBED) xfree(oldptr);
03124 }
03125 else {
03126 if (!STR_SHARED_P(str)) rb_str_new4(str);
03127 ptr = RSTRING(str)->as.heap.ptr += len;
03128 RSTRING(str)->as.heap.len = nlen;
03129 }
03130 ptr[nlen] = 0;
03131 ENC_CODERANGE_CLEAR(str);
03132 return str;
03133 }
03134
03135 static void
03136 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03137 {
03138 if (beg == 0 && RSTRING_LEN(val) == 0) {
03139 rb_str_drop_bytes(str, len);
03140 OBJ_INFECT(str, val);
03141 return;
03142 }
03143
03144 rb_str_modify(str);
03145 if (len < RSTRING_LEN(val)) {
03146
03147 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03148 }
03149
03150 if (RSTRING_LEN(val) != len) {
03151 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03152 RSTRING_PTR(str) + beg + len,
03153 RSTRING_LEN(str) - (beg + len));
03154 }
03155 if (RSTRING_LEN(val) < beg && len < 0) {
03156 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03157 }
03158 if (RSTRING_LEN(val) > 0) {
03159 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03160 }
03161 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03162 if (RSTRING_PTR(str)) {
03163 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03164 }
03165 OBJ_INFECT(str, val);
03166 }
03167
03168 static void
03169 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03170 {
03171 long slen;
03172 char *p, *e;
03173 rb_encoding *enc;
03174 int singlebyte = single_byte_optimizable(str);
03175 int cr;
03176
03177 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03178
03179 StringValue(val);
03180 enc = rb_enc_check(str, val);
03181 slen = str_strlen(str, enc);
03182
03183 if (slen < beg) {
03184 out_of_range:
03185 rb_raise(rb_eIndexError, "index %ld out of string", beg);
03186 }
03187 if (beg < 0) {
03188 if (-beg > slen) {
03189 goto out_of_range;
03190 }
03191 beg += slen;
03192 }
03193 if (slen < len || slen < beg + len) {
03194 len = slen - beg;
03195 }
03196 str_modify_keep_cr(str);
03197 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03198 if (!p) p = RSTRING_END(str);
03199 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03200 if (!e) e = RSTRING_END(str);
03201
03202 beg = p - RSTRING_PTR(str);
03203 len = e - p;
03204 rb_str_splice_0(str, beg, len, val);
03205 rb_enc_associate(str, enc);
03206 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03207 if (cr != ENC_CODERANGE_BROKEN)
03208 ENC_CODERANGE_SET(str, cr);
03209 }
03210
/*
 * Externally visible wrapper around rb_str_splice(): forwards str, beg,
 * len and val unchanged and returns nothing.
 */
03211 void
03212 rb_str_update(VALUE str, long beg, long len, VALUE val)
03213 {
03214 rb_str_splice(str, beg, len, val);
03215 }
03216
03217 static void
03218 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03219 {
03220 int nth;
03221 VALUE match;
03222 long start, end, len;
03223 rb_encoding *enc;
03224 struct re_registers *regs;
03225
03226 if (rb_reg_search(re, str, 0, 0) < 0) {
03227 rb_raise(rb_eIndexError, "regexp not matched");
03228 }
03229 match = rb_backref_get();
03230 nth = rb_reg_backref_number(match, backref);
03231 regs = RMATCH_REGS(match);
03232 if (nth >= regs->num_regs) {
03233 out_of_range:
03234 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03235 }
03236 if (nth < 0) {
03237 if (-nth >= regs->num_regs) {
03238 goto out_of_range;
03239 }
03240 nth += regs->num_regs;
03241 }
03242
03243 start = BEG(nth);
03244 if (start == -1) {
03245 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03246 }
03247 end = END(nth);
03248 len = end - start;
03249 StringValue(val);
03250 enc = rb_enc_check(str, val);
03251 rb_str_splice_0(str, start, len, val);
03252 rb_enc_associate(str, enc);
03253 }
03254
03255 static VALUE
03256 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03257 {
03258 long idx, beg;
03259
03260 switch (TYPE(indx)) {
03261 case T_FIXNUM:
03262 idx = FIX2LONG(indx);
03263 num_index:
03264 rb_str_splice(str, idx, 1, val);
03265 return val;
03266
03267 case T_REGEXP:
03268 rb_str_subpat_set(str, indx, INT2FIX(0), val);
03269 return val;
03270
03271 case T_STRING:
03272 beg = rb_str_index(str, indx, 0);
03273 if (beg < 0) {
03274 rb_raise(rb_eIndexError, "string not matched");
03275 }
03276 beg = rb_str_sublen(str, beg);
03277 rb_str_splice(str, beg, str_strlen(indx, 0), val);
03278 return val;
03279
03280 default:
03281
03282 {
03283 long beg, len;
03284 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03285 rb_str_splice(str, beg, len, val);
03286 return val;
03287 }
03288 }
03289 idx = NUM2LONG(indx);
03290 goto num_index;
03291 }
03292 }
03293
03294
03295
03296
03297
03298
03299
03300
03301
03302
03303
03304
03305
03306
03307
03308
03309
03310
03311
03312
03313
03314
03315
03316
03317
03318
03319 static VALUE
03320 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03321 {
03322 if (argc == 3) {
03323 if (TYPE(argv[0]) == T_REGEXP) {
03324 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03325 }
03326 else {
03327 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03328 }
03329 return argv[2];
03330 }
03331 if (argc != 2) {
03332 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03333 }
03334 return rb_str_aset(str, argv[0], argv[1]);
03335 }
03336
03337
03338
03339
03340
03341
03342
03343
03344
03345
03346
03347
03348
03349
03350
03351
03352
03353
03354 static VALUE
03355 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03356 {
03357 long pos = NUM2LONG(idx);
03358
03359 if (pos == -1) {
03360 return rb_str_append(str, str2);
03361 }
03362 else if (pos < 0) {
03363 pos++;
03364 }
03365 rb_str_splice(str, pos, 0, str2);
03366 return str;
03367 }
03368
03369
03370
03371
03372
03373
03374
03375
03376
03377
03378
03379
03380
03381
03382
03383
03384
03385
03386
03387
03388
03389 static VALUE
03390 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03391 {
03392 VALUE result;
03393 VALUE buf[3];
03394 int i;
03395
03396 if (argc < 1 || 2 < argc) {
03397 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03398 }
03399 for (i=0; i<argc; i++) {
03400 buf[i] = argv[i];
03401 }
03402 str_modify_keep_cr(str);
03403 buf[i] = rb_str_new(0,0);
03404 result = rb_str_aref_m(argc, buf, str);
03405 if (!NIL_P(result)) {
03406 rb_str_aset_m(argc+1, buf, str);
03407 }
03408 return result;
03409 }
03410
03411 static VALUE
03412 get_pat(VALUE pat, int quote)
03413 {
03414 VALUE val;
03415
03416 switch (TYPE(pat)) {
03417 case T_REGEXP:
03418 return pat;
03419
03420 case T_STRING:
03421 break;
03422
03423 default:
03424 val = rb_check_string_type(pat);
03425 if (NIL_P(val)) {
03426 Check_Type(pat, T_REGEXP);
03427 }
03428 pat = val;
03429 }
03430
03431 if (quote) {
03432 pat = rb_reg_quote(pat);
03433 }
03434
03435 return rb_reg_regcomp(pat);
03436 }
03437
03438
03439
03440
03441
03442
03443
03444
03445
03446
03447
03448
/*
 * Replace the first match of the pattern argv[0] in str, in place.
 *
 * The replacement comes from the block (one argument + block), from a hash
 * looked up with the matched text, or from the replacement string argv[1]
 * expanded by rb_reg_regsub().  Returns str when a substitution was made
 * and Qnil when the pattern did not match.  Raises ArgumentError for any
 * other argument combination.
 */
03449 static VALUE
03450 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03451 {
03452 VALUE pat, repl, hash = Qnil;
03453 int iter = 0;
03454 int tainted = 0;
03455 int untrusted = 0;
03456 long plen;
03457
03458 if (argc == 1 && rb_block_given_p()) {
03459 iter = 1;
03460 }
03461 else if (argc == 2) {
03462 repl = argv[1];
/* a hash replacement is detected first; otherwise repl must be a string */
03463 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03464 if (NIL_P(hash)) {
03465 StringValue(repl);
03466 }
03467 if (OBJ_TAINTED(repl)) tainted = 1;
03468 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03469 }
03470 else {
03471 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03472 }
03473
03474 pat = get_pat(argv[0], 1);
03475 str_modifiable(str);
03476 if (rb_reg_search(pat, str, 0, 0) >= 0) {
03477 rb_encoding *enc;
03478 int cr = ENC_CODERANGE(str);
03479 VALUE match = rb_backref_get();
/* named `regs` because the BEG()/END() macros expand to it */
03480 struct re_registers *regs = RMATCH_REGS(match);
03481 long beg0 = BEG(0);
03482 long end0 = END(0);
03483 char *p, *rp;
03484 long len, rlen;
03485
03486 if (iter || !NIL_P(hash)) {
/* remember buffer and length so str_mod_check() can be given them after
 * user code (block or hash lookup) has run */
03487 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03488
03489 if (iter) {
03490 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03491 }
03492 else {
03493 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03494 repl = rb_obj_as_string(repl);
03495 }
03496 str_mod_check(str, p, len);
03497 str_frozen_check(str);
03498 }
03499 else {
03500 repl = rb_reg_regsub(repl, str, regs, pat);
03501 }
03502 enc = rb_enc_compatible(str, repl);
03503 if (!enc) {
/* incompatible encodings are tolerated only if everything outside the
 * match scans as 7-bit; the result then takes repl's encoding */
03504 rb_encoding *str_enc = STR_ENC_GET(str);
03505 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03506 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03507 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03508 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03509 rb_enc_name(str_enc),
03510 rb_enc_name(STR_ENC_GET(repl)));
03511 }
03512 enc = STR_ENC_GET(repl);
03513 }
03514 rb_str_modify(str);
03515 rb_enc_associate(str, enc);
03516 if (OBJ_TAINTED(repl)) tainted = 1;
03517 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
/* combine the coderange saved before modification with repl's coderange;
 * fall back to "unknown" when the combination cannot be decided here */
03518 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03519 int cr2 = ENC_CODERANGE(repl);
03520 if (cr2 == ENC_CODERANGE_BROKEN ||
03521 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03522 cr = ENC_CODERANGE_UNKNOWN;
03523 else
03524 cr = cr2;
03525 }
/* plen: byte length of the match, rlen: byte length of the replacement */
03526 plen = end0 - beg0;
03527 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03528 len = RSTRING_LEN(str);
03529 if (rlen > plen) {
03530 RESIZE_CAPA(str, len + rlen - plen);
03531 }
03532 p = RSTRING_PTR(str);
/* move the tail after the match, then copy the replacement into the gap */
03533 if (rlen != plen) {
03534 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03535 }
03536 memcpy(p + beg0, rp, rlen);
03537 len += rlen - plen;
03538 STR_SET_LEN(str, len);
03539 RSTRING_PTR(str)[len] = '\0';
03540 ENC_CODERANGE_SET(str, cr);
03541 if (tainted) OBJ_TAINT(str);
03542 if (untrusted) OBJ_UNTRUST(str);
03543
03544 return str;
03545 }
03546 return Qnil;
03547 }
03548
03549
03550
03551
03552
03553
03554
03555
03556
03557
03558
03559
03560
03561
03562
03563
03564
03565
03566
03567
03568
03569
03570
03571
03572
03573
03574
03575
03576
03577
03578
03579
03580
03581
03582
03583
03584
03585
03586
03587
03588
03589
03590 static VALUE
03591 rb_str_sub(int argc, VALUE *argv, VALUE str)
03592 {
03593 str = rb_str_dup(str);
03594 rb_str_sub_bang(argc, argv, str);
03595 return str;
03596 }
03597
/*
 * Shared implementation of global substitution.
 *
 * Replaces every match of the pattern argv[0] in str.  The replacement
 * comes from the block (one argument), from a hash looked up with the
 * matched text, or from the replacement string argv[1] expanded by
 * rb_reg_regsub().  The result is assembled in a fresh buffer `dest`.
 *
 * bang != 0: str's contents are replaced by dest; returns str, or Qnil
 *            when nothing matched.
 * bang == 0: returns dest (given str's class), or a duplicate of str when
 *            nothing matched.
 */
03598 static VALUE
03599 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03600 {
03601 VALUE pat, val, repl, match, dest, hash = Qnil;
03602 struct re_registers *regs;
03603 long beg, n;
03604 long beg0, end0;
03605 long offset, blen, slen, len, last;
03606 int iter = 0;
03607 char *sp, *cp;
03608 int tainted = 0;
03609 rb_encoding *str_enc;
03610
03611 switch (argc) {
03612 case 1:
03613 RETURN_ENUMERATOR(str, argc, argv);
03614 iter = 1;
03615 break;
03616 case 2:
03617 repl = argv[1];
03618 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03619 if (NIL_P(hash)) {
03620 StringValue(repl);
03621 }
03622 if (OBJ_TAINTED(repl)) tainted = 1;
03623 break;
03624 default:
03625 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03626 }
03627
03628 pat = get_pat(argv[0], 1);
03629 beg = rb_reg_search(pat, str, 0, 0);
03630 if (beg < 0) {
03631 if (bang) return Qnil;
03632 return rb_str_dup(str);
03633 }
03634
/* offset: byte position in str up to which input has been consumed;
 * cp: pointer to that position; sp/slen: snapshot for str_mod_check() */
03635 offset = 0;
03636 n = 0;
03637 blen = RSTRING_LEN(str) + 30;
03638 dest = rb_str_buf_new(blen);
03639 sp = RSTRING_PTR(str);
03640 slen = RSTRING_LEN(str);
03641 cp = sp;
03642 str_enc = STR_ENC_GET(str);
03643
03644 do {
03645 n++;
03646 match = rb_backref_get();
03647 regs = RMATCH_REGS(match);
03648 beg0 = BEG(0);
03649 end0 = END(0);
03650 if (iter || !NIL_P(hash)) {
03651 if (iter) {
03652 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03653 }
03654 else {
03655 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03656 val = rb_obj_as_string(val);
03657 }
/* user code ran: verify str against the snapshot and reject a replacement
 * that is the output buffer itself */
03658 str_mod_check(str, sp, slen);
03659 if (val == dest) {
03660 rb_raise(rb_eRuntimeError, "block should not cheat");
03661 }
03662 }
03663 else {
03664 val = rb_reg_regsub(repl, str, regs, pat);
03665 }
03666
03667 if (OBJ_TAINTED(val)) tainted = 1;
03668
/* copy the unmatched text between the previous position and this match */
03669 len = beg - offset;
03670 if (len) {
03671 rb_enc_str_buf_cat(dest, cp, len, str_enc);
03672 }
03673
03674 rb_str_buf_append(dest, val);
03675
03676 last = offset;
03677 offset = end0;
/* empty match: copy one character verbatim and step past it so the search
 * advances; stop if the match was at the end of the string */
03678 if (beg0 == end0) {
03679
03680
03681
03682
03683 if (RSTRING_LEN(str) <= end0) break;
03684 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03685 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03686 offset = end0 + len;
03687 }
03688 cp = RSTRING_PTR(str) + offset;
03689 if (offset > RSTRING_LEN(str)) break;
03690 beg = rb_reg_search(pat, str, offset, 0);
03691 } while (beg >= 0);
/* append whatever follows the last match */
03692 if (RSTRING_LEN(str) > offset) {
03693 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03694 }
/* search once more from `last` -- NOTE(review): presumably to leave the
 * backref pointing at the last successful match; confirm */
03695 rb_reg_search(pat, str, last, 0);
03696 if (bang) {
03697 rb_str_shared_replace(str, dest);
03698 }
03699 else {
03700 RBASIC(dest)->klass = rb_obj_class(str);
03701 OBJ_INFECT(dest, str);
03702 str = dest;
03703 }
03704
03705 if (tainted) OBJ_TAINT(str);
03706 return str;
03707 }
03708
03709
03710
03711
03712
03713
03714
03715
03716
03717
03718
03719
03720
03721 static VALUE
03722 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03723 {
03724 str_modify_keep_cr(str);
03725 return str_gsub(argc, argv, str, 1);
03726 }
03727
03728
03729
03730
03731
03732
03733
03734
03735
03736
03737
03738
03739
03740
03741
03742
03743
03744
03745
03746
03747
03748
03749
03750
03751
03752
03753
03754
03755
03756
03757
03758
03759
03760
03761
03762
03763
03764
03765
03766
03767
03768
03769
03770
03771
03772 static VALUE
03773 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03774 {
03775 return str_gsub(argc, argv, str, 0);
03776 }
03777
03778
03779
03780
03781
03782
03783
03784
03785
03786
03787
03788
03789
03790 VALUE
03791 rb_str_replace(VALUE str, VALUE str2)
03792 {
03793 str_modifiable(str);
03794 if (str == str2) return str;
03795
03796 StringValue(str2);
03797 str_discard(str);
03798 return str_replace(str, str2);
03799 }
03800
03801
03802
03803
03804
03805
03806
03807
03808
03809
03810
03811 static VALUE
03812 rb_str_clear(VALUE str)
03813 {
03814 str_discard(str);
03815 STR_SET_EMBED(str);
03816 STR_SET_EMBED_LEN(str, 0);
03817 RSTRING_PTR(str)[0] = 0;
03818 if (rb_enc_asciicompat(STR_ENC_GET(str)))
03819 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03820 else
03821 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03822 return str;
03823 }
03824
03825
03826
03827
03828
03829
03830
03831
03832
03833
03834
03835 static VALUE
03836 rb_str_chr(VALUE str)
03837 {
03838 return rb_str_substr(str, 0, 1);
03839 }
03840
03841
03842
03843
03844
03845
03846
03847 static VALUE
03848 rb_str_getbyte(VALUE str, VALUE index)
03849 {
03850 long pos = NUM2LONG(index);
03851
03852 if (pos < 0)
03853 pos += RSTRING_LEN(str);
03854 if (pos < 0 || RSTRING_LEN(str) <= pos)
03855 return Qnil;
03856
03857 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03858 }
03859
03860
03861
03862
03863
03864
03865
03866 static VALUE
03867 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
03868 {
03869 long pos = NUM2LONG(index);
03870 int byte = NUM2INT(value);
03871
03872 rb_str_modify(str);
03873
03874 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
03875 rb_raise(rb_eIndexError, "index %ld out of string", pos);
03876 if (pos < 0)
03877 pos += RSTRING_LEN(str);
03878
03879 RSTRING_PTR(str)[pos] = byte;
03880
03881 return value;
03882 }
03883
03884
03885
03886
03887
03888
03889
03890
03891
03892
03893 static VALUE
03894 rb_str_reverse(VALUE str)
03895 {
03896 rb_encoding *enc;
03897 VALUE rev;
03898 char *s, *e, *p;
03899 int single = 1;
03900
03901 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
03902 enc = STR_ENC_GET(str);
03903 rev = rb_str_new5(str, 0, RSTRING_LEN(str));
03904 s = RSTRING_PTR(str); e = RSTRING_END(str);
03905 p = RSTRING_END(rev);
03906
03907 if (RSTRING_LEN(str) > 1) {
03908 if (single_byte_optimizable(str)) {
03909 while (s < e) {
03910 *--p = *s++;
03911 }
03912 }
03913 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
03914 while (s < e) {
03915 int clen = rb_enc_fast_mbclen(s, e, enc);
03916
03917 if (clen > 1 || (*s & 0x80)) single = 0;
03918 p -= clen;
03919 memcpy(p, s, clen);
03920 s += clen;
03921 }
03922 }
03923 else {
03924 while (s < e) {
03925 int clen = rb_enc_mbclen(s, e, enc);
03926
03927 if (clen > 1 || (*s & 0x80)) single = 0;
03928 p -= clen;
03929 memcpy(p, s, clen);
03930 s += clen;
03931 }
03932 }
03933 }
03934 STR_SET_LEN(rev, RSTRING_LEN(str));
03935 OBJ_INFECT(rev, str);
03936 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
03937 if (single) {
03938 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03939 }
03940 else {
03941 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03942 }
03943 }
03944 rb_enc_cr_str_copy_for_substr(rev, str);
03945
03946 return rev;
03947 }
03948
03949
03950
03951
03952
03953
03954
03955
03956
03957 static VALUE
03958 rb_str_reverse_bang(VALUE str)
03959 {
03960 if (RSTRING_LEN(str) > 1) {
03961 if (single_byte_optimizable(str)) {
03962 char *s, *e, c;
03963
03964 str_modify_keep_cr(str);
03965 s = RSTRING_PTR(str);
03966 e = RSTRING_END(str) - 1;
03967 while (s < e) {
03968 c = *s;
03969 *s++ = *e;
03970 *e-- = c;
03971 }
03972 }
03973 else {
03974 rb_str_shared_replace(str, rb_str_reverse(str));
03975 }
03976 }
03977 else {
03978 str_modify_keep_cr(str);
03979 }
03980 return str;
03981 }
03982
03983
03984
03985
03986
03987
03988
03989
03990
03991
03992
03993
03994
03995
03996 static VALUE
03997 rb_str_include(VALUE str, VALUE arg)
03998 {
03999 long i;
04000
04001 StringValue(arg);
04002 i = rb_str_index(str, arg, 0);
04003
04004 if (i == -1) return Qfalse;
04005 return Qtrue;
04006 }
04007
04008
04009
04010
04011
04012
04013
04014
04015
04016
04017
04018
04019
04020
04021
04022
04023
04024
04025
04026
04027
04028
04029
04030 static VALUE
04031 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04032 {
04033 int base;
04034
04035 if (argc == 0) base = 10;
04036 else {
04037 VALUE b;
04038
04039 rb_scan_args(argc, argv, "01", &b);
04040 base = NUM2INT(b);
04041 }
04042 if (base < 0) {
04043 rb_raise(rb_eArgError, "invalid radix %d", base);
04044 }
04045 return rb_str_to_inum(str, base, FALSE);
04046 }
04047
04048
04049
04050
04051
04052
04053
04054
04055
04056
04057
04058
04059
04060
04061
04062
04063 static VALUE
04064 rb_str_to_f(VALUE str)
04065 {
04066 return DBL2NUM(rb_str_to_dbl(str, FALSE));
04067 }
04068
04069
04070
04071
04072
04073
04074
04075
04076
04077
04078 static VALUE
04079 rb_str_to_s(VALUE str)
04080 {
04081 if (rb_obj_class(str) != rb_cString) {
04082 return str_duplicate(rb_cString, str);
04083 }
04084 return str;
04085 }
04086
/*
 * Disabled helper (compiled out by "#if 0"): would encode code point c in
 * enc into a local buffer and append the resulting bytes to str.
 */
04087 #if 0
04088 static void
04089 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
04090 {
04091 char s[RUBY_MAX_CHAR_LEN];
/* number of bytes c occupies in enc */
04092 int n = rb_enc_codelen(c, enc);
04093
04094 rb_enc_mbcput(c, s, enc);
04095 rb_enc_str_buf_cat(str, s, n, enc);
04096 }
04097 #endif
04098
04099 #define CHAR_ESC_LEN 13
04100
04101 int
04102 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04103 {
04104 char buf[CHAR_ESC_LEN + 1];
04105 int l;
04106
04107 #if SIZEOF_INT > 4
04108 c &= 0xffffffff;
04109 #endif
04110 if (unicode_p) {
04111 if (c < 0x7F && ISPRINT(c)) {
04112 snprintf(buf, CHAR_ESC_LEN, "%c", c);
04113 }
04114 else if (c < 0x10000) {
04115 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04116 }
04117 else {
04118 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04119 }
04120 }
04121 else {
04122 if (c < 0x100) {
04123 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04124 }
04125 else {
04126 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04127 }
04128 }
04129 l = (int)strlen(buf);
04130 rb_str_buf_cat(result, buf, l);
04131 return l;
04132 }
04133
04134
04135
04136
04137
04138
04139
04140
04141
04142
04143
04144
04145
/*
 * Build a double-quoted, escaped, printable representation of str.
 *
 * The result's encoding is the default internal encoding, falling back to
 * the default external encoding when no internal one is set, and to
 * US-ASCII when the chosen one is not ASCII-compatible.  Taint state of
 * str is propagated to the result.
 *
 * The loop keeps `prev` pointing at the start of the run of bytes that can
 * be copied verbatim; the run is flushed to the result whenever something
 * has to be escaped.
 */
04146 VALUE
04147 rb_str_inspect(VALUE str)
04148 {
04149 rb_encoding *enc = STR_ENC_GET(str);
04150 const char *p, *pend, *prev;
04151 char buf[CHAR_ESC_LEN + 1];
04152 VALUE result = rb_str_buf_new(0);
04153 rb_encoding *resenc = rb_default_internal_encoding();
04154 int unicode_p = rb_enc_unicode_p(enc);
04155 int asciicompat = rb_enc_asciicompat(enc);
04156
04157 if (resenc == NULL) resenc = rb_default_external_encoding();
04158 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04159 rb_enc_associate(result, resenc);
04160 str_buf_cat2(result, "\"");
04161
04162 p = RSTRING_PTR(str); pend = RSTRING_END(str);
04163 prev = p;
04164 while (p < pend) {
04165 unsigned int c, cc;
04166 int n;
04167
04168 n = rb_enc_precise_mbclen(p, pend, enc);
/* no complete character here: emit up to mbminlen bytes as \xNN escapes */
04169 if (!MBCLEN_CHARFOUND_P(n)) {
04170 if (p > prev) str_buf_cat(result, prev, p - prev);
04171 n = rb_enc_mbminlen(enc);
04172 if (pend < p + n)
04173 n = (int)(pend - p);
04174 while (n--) {
04175 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04176 str_buf_cat(result, buf, strlen(buf));
04177 prev = ++p;
04178 }
04179 continue;
04180 }
04181 n = MBCLEN_CHARFOUND_LEN(n);
04182 c = rb_enc_mbc_to_codepoint(p, pend, enc);
04183 p += n;
/* a double quote, a backslash, or '#' followed by '$', '@' or '{' gets a
 * backslash in front; prev is set to the character itself so that it is
 * copied with the next verbatim run */
04184 if (c == '"'|| c == '\\' ||
04185 (c == '#' &&
04186 p < pend &&
04187 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04188 (cc = rb_enc_codepoint(p,pend,enc),
04189 (cc == '$' || cc == '@' || cc == '{')))) {
04190 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04191 str_buf_cat2(result, "\\");
04192 prev = p - n;
04193 continue;
04194 }
/* control characters that have a one-letter escape */
04195 switch (c) {
04196 case '\n': cc = 'n'; break;
04197 case '\r': cc = 'r'; break;
04198 case '\t': cc = 't'; break;
04199 case '\f': cc = 'f'; break;
04200 case '\013': cc = 'v'; break;
04201 case '\010': cc = 'b'; break;
04202 case '\007': cc = 'a'; break;
04203 case 033: cc = 'e'; break;
04204 default: cc = 0; break;
04205 }
04206 if (cc) {
04207 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04208 buf[0] = '\\';
04209 buf[1] = (char)cc;
04210 str_buf_cat(result, buf, 2);
04211 prev = p;
04212 continue;
04213 }
/* printable in the result encoding, or printable ASCII: stays in the
 * verbatim run */
04214 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04215 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04216 continue;
04217 }
/* everything else is written as a numeric escape */
04218 else {
04219 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04220 rb_str_buf_cat_escaped_char(result, c, unicode_p);
04221 prev = p;
04222 continue;
04223 }
04224 }
/* flush the final verbatim run and close the quote */
04225 if (p > prev) str_buf_cat(result, prev, p - prev);
04226 str_buf_cat2(result, "\"");
04227
04228 OBJ_INFECT(result, str);
04229 return result;
04230 }
04231
04232 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04233
04234
04235
04236
04237
04238
04239
04240
04241
04242 VALUE
04243 rb_str_dump(VALUE str)
04244 {
04245 rb_encoding *enc = rb_enc_get(str);
04246 long len;
04247 const char *p, *pend;
04248 char *q, *qend;
04249 VALUE result;
04250 int u8 = (enc == rb_utf8_encoding());
04251
04252 len = 2;
04253 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04254 while (p < pend) {
04255 unsigned char c = *p++;
04256 switch (c) {
04257 case '"': case '\\':
04258 case '\n': case '\r':
04259 case '\t': case '\f':
04260 case '\013': case '\010': case '\007': case '\033':
04261 len += 2;
04262 break;
04263
04264 case '#':
04265 len += IS_EVSTR(p, pend) ? 2 : 1;
04266 break;
04267
04268 default:
04269 if (ISPRINT(c)) {
04270 len++;
04271 }
04272 else {
04273 if (u8) {
04274 char buf[32];
04275 int n = rb_enc_precise_mbclen(p-1, pend, enc);
04276 if (MBCLEN_CHARFOUND_P(n)) {
04277 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04278 sprintf(buf, "%x", cc);
04279 len += strlen(buf)+4;
04280 p += MBCLEN_CHARFOUND_LEN(n)-1;
04281 break;
04282 }
04283 }
04284 len += 4;
04285 }
04286 break;
04287 }
04288 }
04289 if (!rb_enc_asciicompat(enc)) {
04290 len += 19;
04291 len += strlen(enc->name);
04292 }
04293
04294 result = rb_str_new5(str, 0, len);
04295 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04296 q = RSTRING_PTR(result); qend = q + len + 1;
04297
04298 *q++ = '"';
04299 while (p < pend) {
04300 unsigned char c = *p++;
04301
04302 if (c == '"' || c == '\\') {
04303 *q++ = '\\';
04304 *q++ = c;
04305 }
04306 else if (c == '#') {
04307 if (IS_EVSTR(p, pend)) *q++ = '\\';
04308 *q++ = '#';
04309 }
04310 else if (c == '\n') {
04311 *q++ = '\\';
04312 *q++ = 'n';
04313 }
04314 else if (c == '\r') {
04315 *q++ = '\\';
04316 *q++ = 'r';
04317 }
04318 else if (c == '\t') {
04319 *q++ = '\\';
04320 *q++ = 't';
04321 }
04322 else if (c == '\f') {
04323 *q++ = '\\';
04324 *q++ = 'f';
04325 }
04326 else if (c == '\013') {
04327 *q++ = '\\';
04328 *q++ = 'v';
04329 }
04330 else if (c == '\010') {
04331 *q++ = '\\';
04332 *q++ = 'b';
04333 }
04334 else if (c == '\007') {
04335 *q++ = '\\';
04336 *q++ = 'a';
04337 }
04338 else if (c == '\033') {
04339 *q++ = '\\';
04340 *q++ = 'e';
04341 }
04342 else if (ISPRINT(c)) {
04343 *q++ = c;
04344 }
04345 else {
04346 *q++ = '\\';
04347 if (u8) {
04348 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04349 if (MBCLEN_CHARFOUND_P(n)) {
04350 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04351 p += n;
04352 snprintf(q, qend-q, "u{%x}", cc);
04353 q += strlen(q);
04354 continue;
04355 }
04356 }
04357 snprintf(q, qend-q, "x%02X", c);
04358 q += 3;
04359 }
04360 }
04361 *q++ = '"';
04362 *q = '\0';
04363 if (!rb_enc_asciicompat(enc)) {
04364 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04365 enc = rb_ascii8bit_encoding();
04366 }
04367 OBJ_INFECT(result, str);
04368
04369 rb_enc_associate(result, enc);
04370 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04371 return result;
04372 }
04373
04374
04375 static void
04376 rb_str_check_dummy_enc(rb_encoding *enc)
04377 {
04378 if (rb_enc_dummy_p(enc)) {
04379 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04380 rb_enc_name(enc));
04381 }
04382 }
04383
04384
04385
04386
04387
04388
04389
04390
04391
04392
04393 static VALUE
04394 rb_str_upcase_bang(VALUE str)
04395 {
04396 rb_encoding *enc;
04397 char *s, *send;
04398 int modify = 0;
04399 int n;
04400
04401 str_modify_keep_cr(str);
04402 enc = STR_ENC_GET(str);
04403 rb_str_check_dummy_enc(enc);
04404 s = RSTRING_PTR(str); send = RSTRING_END(str);
04405 if (single_byte_optimizable(str)) {
04406 while (s < send) {
04407 unsigned int c = *(unsigned char*)s;
04408
04409 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04410 *s = 'A' + (c - 'a');
04411 modify = 1;
04412 }
04413 s++;
04414 }
04415 }
04416 else {
04417 int ascompat = rb_enc_asciicompat(enc);
04418
04419 while (s < send) {
04420 unsigned int c;
04421
04422 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04423 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04424 *s = 'A' + (c - 'a');
04425 modify = 1;
04426 }
04427 s++;
04428 }
04429 else {
04430 c = rb_enc_codepoint_len(s, send, &n, enc);
04431 if (rb_enc_islower(c, enc)) {
04432
04433 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04434 modify = 1;
04435 }
04436 s += n;
04437 }
04438 }
04439 }
04440
04441 if (modify) return str;
04442 return Qnil;
04443 }
04444
04445
04446
04447
04448
04449
04450
04451
04452
04453
04454
04455
04456
04457
04458 static VALUE
04459 rb_str_upcase(VALUE str)
04460 {
04461 str = rb_str_dup(str);
04462 rb_str_upcase_bang(str);
04463 return str;
04464 }
04465
04466
04467
04468
04469
04470
04471
04472
04473
04474
04475
04476 static VALUE
04477 rb_str_downcase_bang(VALUE str)
04478 {
04479 rb_encoding *enc;
04480 char *s, *send;
04481 int modify = 0;
04482
04483 str_modify_keep_cr(str);
04484 enc = STR_ENC_GET(str);
04485 rb_str_check_dummy_enc(enc);
04486 s = RSTRING_PTR(str); send = RSTRING_END(str);
04487 if (single_byte_optimizable(str)) {
04488 while (s < send) {
04489 unsigned int c = *(unsigned char*)s;
04490
04491 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04492 *s = 'a' + (c - 'A');
04493 modify = 1;
04494 }
04495 s++;
04496 }
04497 }
04498 else {
04499 int ascompat = rb_enc_asciicompat(enc);
04500
04501 while (s < send) {
04502 unsigned int c;
04503 int n;
04504
04505 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04506 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04507 *s = 'a' + (c - 'A');
04508 modify = 1;
04509 }
04510 s++;
04511 }
04512 else {
04513 c = rb_enc_codepoint_len(s, send, &n, enc);
04514 if (rb_enc_isupper(c, enc)) {
04515
04516 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04517 modify = 1;
04518 }
04519 s += n;
04520 }
04521 }
04522 }
04523
04524 if (modify) return str;
04525 return Qnil;
04526 }
04527
04528
04529
04530
04531
04532
04533
04534
04535
04536
04537
04538
04539
04540
04541 static VALUE
04542 rb_str_downcase(VALUE str)
04543 {
04544 str = rb_str_dup(str);
04545 rb_str_downcase_bang(str);
04546 return str;
04547 }
04548
04549
04550
04551
04552
04553
04554
04555
04556
04557
04558
04559
04560
04561
04562
04563
04564 static VALUE
04565 rb_str_capitalize_bang(VALUE str)
04566 {
04567 rb_encoding *enc;
04568 char *s, *send;
04569 int modify = 0;
04570 unsigned int c;
04571 int n;
04572
04573 str_modify_keep_cr(str);
04574 enc = STR_ENC_GET(str);
04575 rb_str_check_dummy_enc(enc);
04576 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04577 s = RSTRING_PTR(str); send = RSTRING_END(str);
04578
04579 c = rb_enc_codepoint_len(s, send, &n, enc);
04580 if (rb_enc_islower(c, enc)) {
04581 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04582 modify = 1;
04583 }
04584 s += n;
04585 while (s < send) {
04586 c = rb_enc_codepoint_len(s, send, &n, enc);
04587 if (rb_enc_isupper(c, enc)) {
04588 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04589 modify = 1;
04590 }
04591 s += n;
04592 }
04593
04594 if (modify) return str;
04595 return Qnil;
04596 }
04597
04598
04599
04600
04601
04602
04603
04604
04605
04606
04607
04608
04609
04610
04611
04612 static VALUE
04613 rb_str_capitalize(VALUE str)
04614 {
04615 str = rb_str_dup(str);
04616 rb_str_capitalize_bang(str);
04617 return str;
04618 }
04619
04620
04621
04622
04623
04624
04625
04626
04627
04628
04629
04630 static VALUE
04631 rb_str_swapcase_bang(VALUE str)
04632 {
04633 rb_encoding *enc;
04634 char *s, *send;
04635 int modify = 0;
04636 int n;
04637
04638 str_modify_keep_cr(str);
04639 enc = STR_ENC_GET(str);
04640 rb_str_check_dummy_enc(enc);
04641 s = RSTRING_PTR(str); send = RSTRING_END(str);
04642 while (s < send) {
04643 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04644
04645 if (rb_enc_isupper(c, enc)) {
04646
04647 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04648 modify = 1;
04649 }
04650 else if (rb_enc_islower(c, enc)) {
04651
04652 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04653 modify = 1;
04654 }
04655 s += n;
04656 }
04657
04658 if (modify) return str;
04659 return Qnil;
04660 }
04661
04662
04663
04664
04665
04666
04667
04668
04669
04670
04671
04672
04673
04674
04675 static VALUE
04676 rb_str_swapcase(VALUE str)
04677 {
04678 str = rb_str_dup(str);
04679 rb_str_swapcase_bang(str);
04680 return str;
04681 }
04682
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification, advanced by trnext().
 */
struct tr {
    int gen;            /* non-zero while a range (a-z) is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
04690
04691 static unsigned int
04692 trnext(struct tr *t, rb_encoding *enc)
04693 {
04694 int n;
04695
04696 for (;;) {
04697 if (!t->gen) {
04698 if (t->p == t->pend) return -1;
04699 if (t->p < t->pend - 1 && *t->p == '\\') {
04700 t->p++;
04701 }
04702 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04703 t->p += n;
04704 if (t->p < t->pend - 1 && *t->p == '-') {
04705 t->p++;
04706 if (t->p < t->pend) {
04707 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04708 t->p += n;
04709 if (t->now > c) {
04710 if (t->now < 0x80 && c < 0x80) {
04711 rb_raise(rb_eArgError,
04712 "invalid range \"%c-%c\" in string transliteration",
04713 t->now, c);
04714 }
04715 else {
04716 rb_raise(rb_eArgError, "invalid range in string transliteration");
04717 }
04718 continue;
04719 }
04720 t->gen = 1;
04721 t->max = c;
04722 }
04723 }
04724 return t->now;
04725 }
04726 else if (++t->now < t->max) {
04727 return t->now;
04728 }
04729 else {
04730 t->gen = 0;
04731 return t->max;
04732 }
04733 }
04734 }
04735
04736 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
04737
04738 static VALUE
04739 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
04740 {
04741 const unsigned int errc = -1;
04742 unsigned int trans[256];
04743 rb_encoding *enc, *e1, *e2;
04744 struct tr trsrc, trrepl;
04745 int cflag = 0;
04746 unsigned int c, c0;
04747 int last = 0, modify = 0, i, l;
04748 char *s, *send;
04749 VALUE hash = 0;
04750 int singlebyte = single_byte_optimizable(str);
04751 int cr;
04752
04753 #define CHECK_IF_ASCII(c) \
04754 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
04755 (cr = ENC_CODERANGE_VALID) : 0)
04756
04757 StringValue(src);
04758 StringValue(repl);
04759 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04760 if (RSTRING_LEN(repl) == 0) {
04761 return rb_str_delete_bang(1, &src, str);
04762 }
04763
04764 cr = ENC_CODERANGE(str);
04765 e1 = rb_enc_check(str, src);
04766 e2 = rb_enc_check(str, repl);
04767 if (e1 == e2) {
04768 enc = e1;
04769 }
04770 else {
04771 enc = rb_enc_check(src, repl);
04772 }
04773 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
04774 if (RSTRING_LEN(src) > 1 &&
04775 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
04776 trsrc.p + l < trsrc.pend) {
04777 cflag = 1;
04778 trsrc.p += l;
04779 }
04780 trrepl.p = RSTRING_PTR(repl);
04781 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
04782 trsrc.gen = trrepl.gen = 0;
04783 trsrc.now = trrepl.now = 0;
04784 trsrc.max = trrepl.max = 0;
04785
04786 if (cflag) {
04787 for (i=0; i<256; i++) {
04788 trans[i] = 1;
04789 }
04790 while ((c = trnext(&trsrc, enc)) != errc) {
04791 if (c < 256) {
04792 trans[c] = errc;
04793 }
04794 else {
04795 if (!hash) hash = rb_hash_new();
04796 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
04797 }
04798 }
04799 while ((c = trnext(&trrepl, enc)) != errc)
04800 ;
04801 last = trrepl.now;
04802 for (i=0; i<256; i++) {
04803 if (trans[i] != errc) {
04804 trans[i] = last;
04805 }
04806 }
04807 }
04808 else {
04809 unsigned int r;
04810
04811 for (i=0; i<256; i++) {
04812 trans[i] = errc;
04813 }
04814 while ((c = trnext(&trsrc, enc)) != errc) {
04815 r = trnext(&trrepl, enc);
04816 if (r == errc) r = trrepl.now;
04817 if (c < 256) {
04818 trans[c] = r;
04819 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
04820 }
04821 else {
04822 if (!hash) hash = rb_hash_new();
04823 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
04824 }
04825 }
04826 }
04827
04828 if (cr == ENC_CODERANGE_VALID)
04829 cr = ENC_CODERANGE_7BIT;
04830 str_modify_keep_cr(str);
04831 s = RSTRING_PTR(str); send = RSTRING_END(str);
04832 if (sflag) {
04833 int clen, tlen;
04834 long offset, max = RSTRING_LEN(str);
04835 unsigned int save = -1;
04836 char *buf = ALLOC_N(char, max), *t = buf;
04837
04838 while (s < send) {
04839 int may_modify = 0;
04840
04841 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04842 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04843
04844 s += clen;
04845 if (c < 256) {
04846 c = trans[c];
04847 }
04848 else if (hash) {
04849 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04850 if (NIL_P(tmp)) {
04851 if (cflag) c = last;
04852 else c = errc;
04853 }
04854 else if (cflag) c = errc;
04855 else c = NUM2INT(tmp);
04856 }
04857 else {
04858 c = errc;
04859 }
04860 if (c != (unsigned int)-1) {
04861 if (save == c) {
04862 CHECK_IF_ASCII(c);
04863 continue;
04864 }
04865 save = c;
04866 tlen = rb_enc_codelen(c, enc);
04867 modify = 1;
04868 }
04869 else {
04870 save = -1;
04871 c = c0;
04872 if (enc != e1) may_modify = 1;
04873 }
04874 while (t - buf + tlen >= max) {
04875 offset = t - buf;
04876 max *= 2;
04877 REALLOC_N(buf, char, max);
04878 t = buf + offset;
04879 }
04880 rb_enc_mbcput(c, t, enc);
04881 if (may_modify && memcmp(s, t, tlen) != 0) {
04882 modify = 1;
04883 }
04884 CHECK_IF_ASCII(c);
04885 t += tlen;
04886 }
04887 *t = '\0';
04888 RSTRING(str)->as.heap.ptr = buf;
04889 RSTRING(str)->as.heap.len = t - buf;
04890 STR_SET_NOEMBED(str);
04891 RSTRING(str)->as.heap.aux.capa = max;
04892 }
04893 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
04894 while (s < send) {
04895 c = (unsigned char)*s;
04896 if (trans[c] != errc) {
04897 if (!cflag) {
04898 c = trans[c];
04899 *s = c;
04900 modify = 1;
04901 }
04902 else {
04903 *s = last;
04904 modify = 1;
04905 }
04906 }
04907 CHECK_IF_ASCII(c);
04908 s++;
04909 }
04910 }
04911 else {
04912 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
04913 long offset;
04914 char *buf = ALLOC_N(char, max), *t = buf;
04915
04916 while (s < send) {
04917 int may_modify = 0;
04918 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04919 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04920
04921 if (c < 256) {
04922 c = trans[c];
04923 }
04924 else if (hash) {
04925 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04926 if (NIL_P(tmp)) {
04927 if (cflag) c = last;
04928 else c = errc;
04929 }
04930 else if (cflag) c = errc;
04931 else c = NUM2INT(tmp);
04932 }
04933 else {
04934 c = errc;
04935 }
04936 if (c != errc) {
04937 tlen = rb_enc_codelen(c, enc);
04938 modify = 1;
04939 }
04940 else {
04941 c = c0;
04942 if (enc != e1) may_modify = 1;
04943 }
04944 while (t - buf + tlen >= max) {
04945 offset = t - buf;
04946 max *= 2;
04947 REALLOC_N(buf, char, max);
04948 t = buf + offset;
04949 }
04950 if (s != t) {
04951 rb_enc_mbcput(c, t, enc);
04952 if (may_modify && memcmp(s, t, tlen) != 0) {
04953 modify = 1;
04954 }
04955 }
04956 CHECK_IF_ASCII(c);
04957 s += clen;
04958 t += tlen;
04959 }
04960 if (!STR_EMBED_P(str)) {
04961 xfree(RSTRING(str)->as.heap.ptr);
04962 }
04963 *t = '\0';
04964 RSTRING(str)->as.heap.ptr = buf;
04965 RSTRING(str)->as.heap.len = t - buf;
04966 STR_SET_NOEMBED(str);
04967 RSTRING(str)->as.heap.aux.capa = max;
04968 }
04969
04970 if (modify) {
04971 if (cr != ENC_CODERANGE_BROKEN)
04972 ENC_CODERANGE_SET(str, cr);
04973 rb_enc_associate(str, enc);
04974 return str;
04975 }
04976 return Qnil;
04977 }
04978
04979
04980
04981
04982
04983
04984
04985
04986
04987
04988
04989 static VALUE
04990 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
04991 {
04992 return tr_trans(str, src, repl, 0);
04993 }
04994
04995
04996
04997
04998
04999
05000
05001
05002
05003
05004
05005
05006
05007
05008
05009
05010
05011
05012
05013 static VALUE
05014 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05015 {
05016 str = rb_str_dup(str);
05017 tr_trans(str, src, repl, 0);
05018 return str;
05019 }
05020
05021 static void
05022 tr_setup_table(VALUE str, char stable[256], int first,
05023 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05024 {
05025 const unsigned int errc = -1;
05026 char buf[256];
05027 struct tr tr;
05028 unsigned int c;
05029 VALUE table = 0, ptable = 0;
05030 int i, l, cflag = 0;
05031
05032 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05033 tr.gen = tr.now = tr.max = 0;
05034
05035 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05036 cflag = 1;
05037 tr.p += l;
05038 }
05039 if (first) {
05040 for (i=0; i<256; i++) {
05041 stable[i] = 1;
05042 }
05043 }
05044 for (i=0; i<256; i++) {
05045 buf[i] = cflag;
05046 }
05047
05048 while ((c = trnext(&tr, enc)) != errc) {
05049 if (c < 256) {
05050 buf[c & 0xff] = !cflag;
05051 }
05052 else {
05053 VALUE key = UINT2NUM(c);
05054
05055 if (!table) {
05056 table = rb_hash_new();
05057 if (cflag) {
05058 ptable = *ctablep;
05059 *ctablep = table;
05060 }
05061 else {
05062 ptable = *tablep;
05063 *tablep = table;
05064 }
05065 }
05066 if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05067 rb_hash_aset(table, key, Qtrue);
05068 }
05069 }
05070 }
05071 for (i=0; i<256; i++) {
05072 stable[i] = stable[i] && buf[i];
05073 }
05074 }
05075
05076
05077 static int
05078 tr_find(unsigned int c, char table[256], VALUE del, VALUE nodel)
05079 {
05080 if (c < 256) {
05081 return table[c] != 0;
05082 }
05083 else {
05084 VALUE v = UINT2NUM(c);
05085
05086 if (del && !NIL_P(rb_hash_lookup(del, v))) {
05087 if (!nodel || NIL_P(rb_hash_lookup(nodel, v))) {
05088 return TRUE;
05089 }
05090 }
05091 return FALSE;
05092 }
05093 }
05094
05095
05096
05097
05098
05099
05100
05101
05102
05103 static VALUE
05104 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05105 {
05106 char squeez[256];
05107 rb_encoding *enc = 0;
05108 char *s, *send, *t;
05109 VALUE del = 0, nodel = 0;
05110 int modify = 0;
05111 int i, ascompat, cr;
05112
05113 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05114 if (argc < 1) {
05115 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05116 }
05117 for (i=0; i<argc; i++) {
05118 VALUE s = argv[i];
05119
05120 StringValue(s);
05121 enc = rb_enc_check(str, s);
05122 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05123 }
05124
05125 str_modify_keep_cr(str);
05126 ascompat = rb_enc_asciicompat(enc);
05127 s = t = RSTRING_PTR(str);
05128 send = RSTRING_END(str);
05129 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05130 while (s < send) {
05131 unsigned int c;
05132 int clen;
05133
05134 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05135 if (squeez[c]) {
05136 modify = 1;
05137 }
05138 else {
05139 if (t != s) *t = c;
05140 t++;
05141 }
05142 s++;
05143 }
05144 else {
05145 c = rb_enc_codepoint_len(s, send, &clen, enc);
05146
05147 if (tr_find(c, squeez, del, nodel)) {
05148 modify = 1;
05149 }
05150 else {
05151 if (t != s) rb_enc_mbcput(c, t, enc);
05152 t += clen;
05153 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05154 }
05155 s += clen;
05156 }
05157 }
05158 *t = '\0';
05159 STR_SET_LEN(str, t - RSTRING_PTR(str));
05160 ENC_CODERANGE_SET(str, cr);
05161
05162 if (modify) return str;
05163 return Qnil;
05164 }
05165
05166
05167
05168
05169
05170
05171
05172
05173
05174
05175
05176
05177
05178
05179
05180
05181 static VALUE
05182 rb_str_delete(int argc, VALUE *argv, VALUE str)
05183 {
05184 str = rb_str_dup(str);
05185 rb_str_delete_bang(argc, argv, str);
05186 return str;
05187 }
05188
05189
05190
05191
05192
05193
05194
05195
05196
05197
05198 static VALUE
05199 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05200 {
05201 char squeez[256];
05202 rb_encoding *enc = 0;
05203 VALUE del = 0, nodel = 0;
05204 char *s, *send, *t;
05205 int i, modify = 0;
05206 int ascompat, singlebyte = single_byte_optimizable(str);
05207 unsigned int save;
05208
05209 if (argc == 0) {
05210 enc = STR_ENC_GET(str);
05211 }
05212 else {
05213 for (i=0; i<argc; i++) {
05214 VALUE s = argv[i];
05215
05216 StringValue(s);
05217 enc = rb_enc_check(str, s);
05218 if (singlebyte && !single_byte_optimizable(s))
05219 singlebyte = 0;
05220 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05221 }
05222 }
05223
05224 str_modify_keep_cr(str);
05225 s = t = RSTRING_PTR(str);
05226 if (!s || RSTRING_LEN(str) == 0) return Qnil;
05227 send = RSTRING_END(str);
05228 save = -1;
05229 ascompat = rb_enc_asciicompat(enc);
05230
05231 if (singlebyte) {
05232 while (s < send) {
05233 unsigned int c = *(unsigned char*)s++;
05234 if (c != save || (argc > 0 && !squeez[c])) {
05235 *t++ = save = c;
05236 }
05237 }
05238 } else {
05239 while (s < send) {
05240 unsigned int c;
05241 int clen;
05242
05243 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05244 if (c != save || (argc > 0 && !squeez[c])) {
05245 *t++ = save = c;
05246 }
05247 s++;
05248 }
05249 else {
05250 c = rb_enc_codepoint_len(s, send, &clen, enc);
05251
05252 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05253 if (t != s) rb_enc_mbcput(c, t, enc);
05254 save = c;
05255 t += clen;
05256 }
05257 s += clen;
05258 }
05259 }
05260 }
05261
05262 *t = '\0';
05263 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05264 STR_SET_LEN(str, t - RSTRING_PTR(str));
05265 modify = 1;
05266 }
05267
05268 if (modify) return str;
05269 return Qnil;
05270 }
05271
05272
05273
05274
05275
05276
05277
05278
05279
05280
05281
05282
05283
05284
05285
05286
05287
05288 static VALUE
05289 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05290 {
05291 str = rb_str_dup(str);
05292 rb_str_squeeze_bang(argc, argv, str);
05293 return str;
05294 }
05295
05296
05297
05298
05299
05300
05301
05302
05303
05304
05305 static VALUE
05306 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05307 {
05308 return tr_trans(str, src, repl, 1);
05309 }
05310
05311
05312
05313
05314
05315
05316
05317
05318
05319
05320
05321
05322
05323
05324
05325 static VALUE
05326 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05327 {
05328 str = rb_str_dup(str);
05329 tr_trans(str, src, repl, 1);
05330 return str;
05331 }
05332
05333
05334
05335
05336
05337
05338
05339
05340
05341
05342
05343
05344
05345
05346
05347
05348
05349
05350 static VALUE
05351 rb_str_count(int argc, VALUE *argv, VALUE str)
05352 {
05353 char table[256];
05354 rb_encoding *enc = 0;
05355 VALUE del = 0, nodel = 0;
05356 char *s, *send;
05357 int i;
05358 int ascompat;
05359
05360 if (argc < 1) {
05361 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05362 }
05363 for (i=0; i<argc; i++) {
05364 VALUE tstr = argv[i];
05365 unsigned char c;
05366
05367 StringValue(tstr);
05368 enc = rb_enc_check(str, tstr);
05369 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05370 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05371 int n = 0;
05372
05373 s = RSTRING_PTR(str);
05374 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05375 send = RSTRING_END(str);
05376 while (s < send) {
05377 if (*(unsigned char*)s++ == c) n++;
05378 }
05379 return INT2NUM(n);
05380 }
05381 tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05382 }
05383
05384 s = RSTRING_PTR(str);
05385 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05386 send = RSTRING_END(str);
05387 ascompat = rb_enc_asciicompat(enc);
05388 i = 0;
05389 while (s < send) {
05390 unsigned int c;
05391 int clen;
05392
05393 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05394 clen = 1;
05395 if (table[c]) {
05396 i++;
05397 }
05398 s++;
05399 }
05400 else {
05401 c = rb_enc_codepoint_len(s, send, &clen, enc);
05402 if (tr_find(c, table, del, nodel)) {
05403 i++;
05404 }
05405 s += clen;
05406 }
05407 }
05408
05409 return INT2NUM(i);
05410 }
05411
/*
 * Lookup table for ascii_isspace(): entry i is 1 when byte value i is one
 * of \t \n \v \f \r or space, 0 otherwise.  Only the leading entries are
 * spelled out; the remaining ones are zero-initialized by the language.
 */
static const char isspacetable[256] = {
    /* 0x00-0x08 */
    0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x09-0x0d: \t \n \v \f \r */
    1, 1, 1, 1, 1,
    /* 0x0e-0x1f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20: space */
    1
    /* 0x21-0xff: implicitly 0 */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05432
05433
05434
05435
05436
05437
05438
05439
05440
05441
05442
05443
05444
05445
05446
05447
05448
05449
05450
05451
05452
05453
05454
05455
05456
05457
05458
05459
05460
05461
05462
05463
05464
05465
05466
05467
05468
05469
05470
05471
05472
05473
05474
/*
 * Splits str into an Array of substrings.
 *
 * Optional arguments (parsed with rb_scan_args "02"): a pattern and a limit.
 *   - limit == 1 returns a one-element Array holding str itself (or an
 *     empty Array when str is empty);
 *   - limit <= 0 is treated as "no limit" for the splitting loops; a
 *     negative value additionally keeps trailing empty fields (see the
 *     lim < 0 test and the final trimming step);
 *   - a nil pattern falls back to rb_fs when that is set, otherwise to
 *     whitespace ("awk") splitting.
 *
 * Three strategies are used, selected through split_type:
 *   awk    - split on runs of whitespace, ignoring leading whitespace;
 *   string - split on a literal separator found with rb_memsearch();
 *   regexp - split on regexp matches (also used for an empty String
 *            pattern, which is compiled with rb_reg_regcomp()).
 */
05475 static VALUE
05476 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05477 {
05478 rb_encoding *enc;
05479 VALUE spat;
05480 VALUE limit;
05481 enum {awk, string, regexp} split_type;
05482 long beg, end, i = 0;
05483 int lim = 0;
05484 VALUE result, tmp;
05485
05486 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05487 lim = NUM2INT(limit);
05488 if (lim <= 0) limit = Qnil;
05489 else if (lim == 1) {
05490 if (RSTRING_LEN(str) == 0)
05491 return rb_ary_new2(0);
05492 return rb_ary_new3(1, str);
05493 }
/* i counts the fields produced so far; it starts at 1 when a limit was given */
05494 i = 1;
05495 }
05496
/* Decide which of the three split strategies applies. */
05497 enc = STR_ENC_GET(str);
05498 if (NIL_P(spat)) {
05499 if (!NIL_P(rb_fs)) {
05500 spat = rb_fs;
05501 goto fs_set;
05502 }
05503 split_type = awk;
05504 }
05505 else {
05506 fs_set:
05507 if (TYPE(spat) == T_STRING) {
05508 rb_encoding *enc2 = STR_ENC_GET(spat);
05509
05510 split_type = string;
05511 if (RSTRING_LEN(spat) == 0) {
05512
/* empty separator: compile it to a regexp and use the regexp strategy */
05513 spat = rb_reg_regcomp(spat);
05514 split_type = regexp;
05515 }
05516 else if (rb_enc_asciicompat(enc2) == 1) {
/* a separator that is exactly one space selects awk-style splitting */
05517 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05518 split_type = awk;
05519 }
05520 }
05521 else {
/* same single-space test for encodings that are not ASCII compatible */
05522 int l;
05523 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05524 RSTRING_LEN(spat) == l) {
05525 split_type = awk;
05526 }
05527 }
05528 }
05529 else {
05530 spat = get_pat(spat, 1);
05531 split_type = regexp;
05532 }
05533 }
05534
05535 result = rb_ary_new();
05536 beg = 0;
05537 if (split_type == awk) {
/*
 * awk mode.  skip is non-zero while whitespace before a field is being
 * skipped; beg and end are byte offsets delimiting the current field.
 */
05538 char *ptr = RSTRING_PTR(str);
05539 char *eptr = RSTRING_END(str);
05540 char *bptr = ptr;
05541 int skip = 1;
05542 unsigned int c;
05543
05544 end = beg;
05545 if (is_ascii_string(str)) {
/* ASCII strings: scan byte by byte using the isspacetable lookup */
05546 while (ptr < eptr) {
05547 c = (unsigned char)*ptr++;
05548 if (skip) {
05549 if (ascii_isspace(c)) {
05550 beg = ptr - bptr;
05551 }
05552 else {
05553 end = ptr - bptr;
05554 skip = 0;
05555 if (!NIL_P(limit) && lim <= i) break;
05556 }
05557 }
05558 else if (ascii_isspace(c)) {
05559 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05560 skip = 1;
05561 beg = ptr - bptr;
05562 if (!NIL_P(limit)) ++i;
05563 }
05564 else {
05565 end = ptr - bptr;
05566 }
05567 }
05568 }
05569 else {
/* other strings: same state machine, decoding one code point at a time */
05570 while (ptr < eptr) {
05571 int n;
05572
05573 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05574 ptr += n;
05575 if (skip) {
05576 if (rb_isspace(c)) {
05577 beg = ptr - bptr;
05578 }
05579 else {
05580 end = ptr - bptr;
05581 skip = 0;
05582 if (!NIL_P(limit) && lim <= i) break;
05583 }
05584 }
05585 else if (rb_isspace(c)) {
05586 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05587 skip = 1;
05588 beg = ptr - bptr;
05589 if (!NIL_P(limit)) ++i;
05590 }
05591 else {
05592 end = ptr - bptr;
05593 }
05594 }
05595 }
05596 }
05597 else if (split_type == string) {
/*
 * string mode.  Both the receiver and the separator must be valid in
 * their encodings; otherwise rb_eArgError is raised.
 */
05598 char *ptr = RSTRING_PTR(str);
05599 char *temp = ptr;
05600 char *eptr = RSTRING_END(str);
05601 char *sptr = RSTRING_PTR(spat);
05602 long slen = RSTRING_LEN(spat);
05603
05604 if (is_broken_string(str)) {
05605 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05606 }
05607 if (is_broken_string(spat)) {
05608 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05609 }
05610 enc = rb_enc_check(str, spat);
05611 while (ptr < eptr &&
05612 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05613
/* reject a byte-level hit that does not start on a character boundary */
05614 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05615 if (t != ptr + end) {
05616 ptr = t;
05617 continue;
05618 }
05619 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05620 ptr += end + slen;
05621 if (!NIL_P(limit) && lim <= ++i) break;
05622 }
05623 beg = ptr - temp;
05624 }
05625 else {
/*
 * regexp mode.  last_null records that the previous search produced an
 * empty match at the search start, in which case a single character is
 * emitted as a field on the following iteration.
 */
05626 char *ptr = RSTRING_PTR(str);
05627 long len = RSTRING_LEN(str);
05628 long start = beg;
05629 long idx;
05630 int last_null = 0;
05631 struct re_registers *regs;
05632
05633 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05634 regs = RMATCH_REGS(rb_backref_get());
05635 if (start == end && BEG(0) == END(0)) {
05636 if (!ptr) {
05637 rb_ary_push(result, str_new_empty(str));
05638 break;
05639 }
05640 else if (last_null == 1) {
05641 rb_ary_push(result, rb_str_subseq(str, beg,
05642 rb_enc_fast_mbclen(ptr+beg,
05643 ptr+len,
05644 enc)));
05645 beg = start;
05646 }
05647 else {
/* first empty match here: step over one character and search again */
05648 if (ptr+start == ptr+len)
05649 start++;
05650 else
05651 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05652 last_null = 1;
05653 continue;
05654 }
05655 }
05656 else {
05657 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05658 beg = start = END(0);
05659 }
05660 last_null = 0;
05661
/* capture groups that took part in the match are appended as fields too */
05662 for (idx=1; idx < regs->num_regs; idx++) {
05663 if (BEG(idx) == -1) continue;
05664 if (BEG(idx) == END(idx))
05665 tmp = str_new_empty(str);
05666 else
05667 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05668 rb_ary_push(result, tmp);
05669 }
05670 if (!NIL_P(limit) && lim <= ++i) break;
05671 }
05672 }
/* Append whatever follows the last separator as the final field. */
05673 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05674 if (RSTRING_LEN(str) == beg)
05675 tmp = str_new_empty(str);
05676 else
05677 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05678 rb_ary_push(result, tmp);
05679 }
/* Without a limit (or with limit 0) trailing empty fields are dropped. */
05680 if (NIL_P(limit) && lim == 0) {
05681 long len;
05682 while ((len = RARRAY_LEN(result)) > 0 &&
05683 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05684 rb_ary_pop(result);
05685 }
05686
05687 return result;
05688 }
05689
05690 VALUE
05691 rb_str_split(VALUE str, const char *sep0)
05692 {
05693 VALUE sep;
05694
05695 StringValue(str);
05696 sep = rb_str_new2(sep0);
05697 return rb_str_split_m(1, &sep, str);
05698 }
05699
05700
05701
05702
05703
05704
05705
05706
05707
05708
05709
05710
05711
05712
05713
05714
05715
05716
05717
05718
05719
05720
05721
05722
05723
05724
05725
05726
05727
05728
05729
05730
05731
05732
05733
05734
05735
05736
05737
/*
 * Yields str line by line and returns the original receiver.
 *
 * The record separator is the optional first argument, or rb_rs when no
 * argument is given.  A nil separator yields the whole string once.  An
 * empty separator selects the rslen == 0 branch below, where a line ends
 * at a run of consecutive newlines.
 *
 * Iteration runs over the value returned by rb_str_new4(str) (presumably a
 * frozen snapshot of the receiver -- confirm against rb_str_new4), and
 * str_mod_check() is called after every yield.
 */
05738 static VALUE
05739 rb_str_each_line(int argc, VALUE *argv, VALUE str)
05740 {
05741 rb_encoding *enc;
05742 VALUE rs;
05743 unsigned int newline;
05744 const char *p, *pend, *s, *ptr;
05745 long len, rslen;
05746 VALUE line;
05747 int n;
05748 VALUE orig = str;
05749
05750 if (argc == 0) {
05751 rs = rb_rs;
05752 }
05753 else {
05754 rb_scan_args(argc, argv, "01", &rs);
05755 }
05756 RETURN_ENUMERATOR(str, argc, argv);
05757 if (NIL_P(rs)) {
05758 rb_yield(str);
05759 return orig;
05760 }
/* s marks the start of the current line, p is the scan position */
05761 str = rb_str_new4(str);
05762 ptr = p = s = RSTRING_PTR(str);
05763 pend = p + RSTRING_LEN(str);
05764 len = RSTRING_LEN(str);
05765 StringValue(rs);
05766 if (rs == rb_default_rs) {
/*
 * Default separator: locate '\n' bytes with memchr() and accept a hit
 * only when the character containing it is a newline in enc.
 */
05767 enc = rb_enc_get(str);
05768 while (p < pend) {
05769 char *p0;
05770
05771 p = memchr(p, '\n', pend - p);
05772 if (!p) break;
05773 p0 = rb_enc_left_char_head(s, p, pend, enc);
05774 if (!rb_enc_is_newline(p0, pend, enc)) {
05775 p++;
05776 continue;
05777 }
05778 p = p0 + rb_enc_mbclen(p0, pend, enc);
05779 line = rb_str_new5(str, s, p - s);
05780 OBJ_INFECT(line, str);
05781 rb_enc_cr_str_copy_for_substr(line, str);
05782 rb_yield(line);
05783 str_mod_check(str, ptr, len);
05784 s = p;
05785 }
05786 goto finish;
05787 }
05788
/*
 * Custom separator: newline holds the first code point of rs, or '\n'
 * when rs is empty.
 */
05789 enc = rb_enc_check(str, rs);
05790 rslen = RSTRING_LEN(rs);
05791 if (rslen == 0) {
05792 newline = '\n';
05793 }
05794 else {
05795 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
05796 }
05797
05798 while (p < pend) {
05799 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
05800
05801 again:
05802 if (rslen == 0 && c == newline) {
/* empty rs: a single newline does not end the line; swallow the whole run */
05803 p += n;
05804 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
05805 goto again;
05806 }
05807 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
05808 p += n;
05809 }
05810 p -= n;
05811 }
/* the first code point matched; for multi-byte rs compare the full bytes */
05812 if (c == newline &&
05813 (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {
05814 line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
05815 OBJ_INFECT(line, str);
05816 rb_enc_cr_str_copy_for_substr(line, str);
05817 rb_yield(line);
05818 str_mod_check(str, ptr, len);
05819 s = p + (rslen ? rslen : n);
05820 }
05821 p += n;
05822 }
05823
05824 finish:
/* yield the remainder after the last separator, if any */
05825 if (s != pend) {
05826 line = rb_str_new5(str, s, pend - s);
05827 OBJ_INFECT(line, str);
05828 rb_enc_cr_str_copy_for_substr(line, str);
05829 rb_yield(line);
05830 }
05831
05832 return orig;
05833 }
05834
05835
05836
05837
05838
05839
05840
05841
05842
05843
05844
05845
05846
05847
05848
05849
05850
05851
05852
05853
05854 static VALUE
05855 rb_str_each_byte(VALUE str)
05856 {
05857 long i;
05858
05859 RETURN_ENUMERATOR(str, 0, 0);
05860 for (i=0; i<RSTRING_LEN(str); i++) {
05861 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
05862 }
05863 return str;
05864 }
05865
05866
05867
05868
05869
05870
05871
05872
05873
05874
05875
05876
05877
05878
05879
05880
05881
05882
05883
05884
05885 static VALUE
05886 rb_str_each_char(VALUE str)
05887 {
05888 VALUE orig = str;
05889 long i, len, n;
05890 const char *ptr;
05891 rb_encoding *enc;
05892
05893 RETURN_ENUMERATOR(str, 0, 0);
05894 str = rb_str_new4(str);
05895 ptr = RSTRING_PTR(str);
05896 len = RSTRING_LEN(str);
05897 enc = rb_enc_get(str);
05898 switch (ENC_CODERANGE(str)) {
05899 case ENC_CODERANGE_VALID:
05900 case ENC_CODERANGE_7BIT:
05901 for (i = 0; i < len; i += n) {
05902 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
05903 rb_yield(rb_str_subseq(str, i, n));
05904 }
05905 break;
05906 default:
05907 for (i = 0; i < len; i += n) {
05908 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
05909 rb_yield(rb_str_subseq(str, i, n));
05910 }
05911 }
05912 return orig;
05913 }
05914
05915
05916
05917
05918
05919
05920
05921
05922
05923
05924
05925
05926
05927
05928
05929
05930
05931
05932
05933
05934
05935
05936 static VALUE
05937 rb_str_each_codepoint(VALUE str)
05938 {
05939 VALUE orig = str;
05940 long len;
05941 int n;
05942 unsigned int c;
05943 const char *ptr, *end;
05944 rb_encoding *enc;
05945
05946 if (single_byte_optimizable(str)) return rb_str_each_byte(str);
05947 RETURN_ENUMERATOR(str, 0, 0);
05948 str = rb_str_new4(str);
05949 ptr = RSTRING_PTR(str);
05950 len = RSTRING_LEN(str);
05951 end = RSTRING_END(str);
05952 enc = STR_ENC_GET(str);
05953 while (ptr < end) {
05954 c = rb_enc_codepoint_len(ptr, end, &n, enc);
05955 rb_yield(UINT2NUM(c));
05956 ptr += n;
05957 }
05958 return orig;
05959 }
05960
05961 static long
05962 chopped_length(VALUE str)
05963 {
05964 rb_encoding *enc = STR_ENC_GET(str);
05965 const char *p, *p2, *beg, *end;
05966
05967 beg = RSTRING_PTR(str);
05968 end = beg + RSTRING_LEN(str);
05969 if (beg > end) return 0;
05970 p = rb_enc_prev_char(beg, end, end, enc);
05971 if (!p) return 0;
05972 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
05973 p2 = rb_enc_prev_char(beg, p, end, enc);
05974 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
05975 }
05976 return p - beg;
05977 }
05978
05979
05980
05981
05982
05983
05984
05985
05986
05987
05988 static VALUE
05989 rb_str_chop_bang(VALUE str)
05990 {
05991 str_modify_keep_cr(str);
05992 if (RSTRING_LEN(str) > 0) {
05993 long len;
05994 len = chopped_length(str);
05995 STR_SET_LEN(str, len);
05996 RSTRING_PTR(str)[len] = '\0';
05997 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
05998 ENC_CODERANGE_CLEAR(str);
05999 }
06000 return str;
06001 }
06002 return Qnil;
06003 }
06004
06005
06006
06007
06008
06009
06010
06011
06012
06013
06014
06015
06016
06017
06018
06019
06020
06021
06022
06023 static VALUE
06024 rb_str_chop(VALUE str)
06025 {
06026 VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06027 rb_enc_cr_str_copy_for_substr(str2, str);
06028 OBJ_INFECT(str2, str);
06029 return str2;
06030 }
06031
06032
06033
06034
06035
06036
06037
06038
06039
06040
/*
 * In-place chomp: removes a trailing record separator from +str+.
 * Returns +str+ when something was removed and nil otherwise (including
 * for an empty receiver or a nil separator).
 *
 * The separator is the optional argument, or rb_rs when none is given:
 *
 *   - rb_default_rs, or an explicit one-byte "\n": "smart" chomp, which
 *     removes one trailing "\n", "\r\n" or "\r";
 *   - empty string: removes every trailing "\n" / "\r\n";
 *   - anything else: removed once if +str+ ends with exactly that byte
 *     sequence and it starts on a character boundary.
 */
06041 static VALUE
06042 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06043 {
06044 rb_encoding *enc;
06045 VALUE rs;
06046 int newline;
06047 char *p, *pp, *e;
06048 long len, rslen;
06049
06050 str_modify_keep_cr(str);
06051 len = RSTRING_LEN(str);
06052 if (len == 0) return Qnil;
/* p / e delimit the current contents; e is pulled back as characters are
 * dropped in the wide-encoding branch below. */
06053 p = RSTRING_PTR(str);
06054 e = p + len;
06055 if (argc == 0) {
06056 rs = rb_rs;
06057 if (rs == rb_default_rs) {
/* Also entered by goto from below when the explicit separator is "\n". */
06058 smart_chomp:
06059 enc = rb_enc_get(str);
06060 if (rb_enc_mbminlen(enc) > 1) {
/* Encodings whose minimum character width exceeds one byte: inspect whole
 * characters instead of single bytes. */
06061 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06062 if (rb_enc_is_newline(pp, e, enc)) {
06063 e = pp;
06064 }
06065 pp = e - rb_enc_mbminlen(enc);
06066 if (pp >= p) {
06067 pp = rb_enc_left_char_head(p, pp, e, enc);
06068 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06069 e = pp;
06070 }
06071 }
/* e did not move: nothing to remove. */
06072 if (e == RSTRING_END(str)) {
06073 return Qnil;
06074 }
06075 len = e - RSTRING_PTR(str);
06076 STR_SET_LEN(str, len);
06077 }
06078 else {
/* Byte-wise case: drop "\n" (and a "\r" right before it), or a lone "\r". */
06079 if (RSTRING_PTR(str)[len-1] == '\n') {
06080 STR_DEC_LEN(str);
06081 if (RSTRING_LEN(str) > 0 &&
06082 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06083 STR_DEC_LEN(str);
06084 }
06085 }
06086 else if (RSTRING_PTR(str)[len-1] == '\r') {
06087 STR_DEC_LEN(str);
06088 }
06089 else {
06090 return Qnil;
06091 }
06092 }
06093 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06094 return str;
06095 }
06096 }
06097 else {
06098 rb_scan_args(argc, argv, "01", &rs);
06099 }
06100 if (NIL_P(rs)) return Qnil;
06101 StringValue(rs);
06102 rslen = RSTRING_LEN(rs);
06103 if (rslen == 0) {
/* Empty separator: strip all trailing "\n" and "\r\n". */
06104 while (len>0 && p[len-1] == '\n') {
06105 len--;
06106 if (len>0 && p[len-1] == '\r')
06107 len--;
06108 }
06109 if (len < RSTRING_LEN(str)) {
06110 STR_SET_LEN(str, len);
06111 RSTRING_PTR(str)[len] = '\0';
06112 return str;
06113 }
06114 return Qnil;
06115 }
06116 if (rslen > len) return Qnil;
/* Last byte of the separator, used as a cheap pre-check before memcmp. */
06117 newline = RSTRING_PTR(rs)[rslen-1];
06118 if (rslen == 1 && newline == '\n')
06119 goto smart_chomp;
06120
06121 enc = rb_enc_check(str, rs);
06122 if (is_broken_string(rs)) {
06123 return Qnil;
06124 }
06125 pp = e - rslen;
06126 if (p[len-1] == newline &&
06127 (rslen <= 1 ||
06128 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
/* Refuse to cut in the middle of a multibyte character. */
06129 if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06130 return Qnil;
06131 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06132 ENC_CODERANGE_CLEAR(str);
06133 }
06134 STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06135 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06136 return str;
06137 }
06138 return Qnil;
06139 }
06140
06141
06142
06143
06144
06145
06146
06147
06148
06149
06150
06151
06152
06153
06154
06155
06156
06157
06158
06159
06160
06161 static VALUE
06162 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06163 {
06164 str = rb_str_dup(str);
06165 rb_str_chomp_bang(argc, argv, str);
06166 return str;
06167 }
06168
06169
06170
06171
06172
06173
06174
06175
06176
06177
06178
06179
06180
06181 static VALUE
06182 rb_str_lstrip_bang(VALUE str)
06183 {
06184 rb_encoding *enc;
06185 char *s, *t, *e;
06186
06187 str_modify_keep_cr(str);
06188 enc = STR_ENC_GET(str);
06189 s = RSTRING_PTR(str);
06190 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06191 e = t = RSTRING_END(str);
06192
06193 while (s < e) {
06194 int n;
06195 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06196
06197 if (!rb_isspace(cc)) break;
06198 s += n;
06199 }
06200
06201 if (s > RSTRING_PTR(str)) {
06202 STR_SET_LEN(str, t-s);
06203 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06204 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06205 return str;
06206 }
06207 return Qnil;
06208 }
06209
06210
06211
06212
06213
06214
06215
06216
06217
06218
06219
06220
06221
06222 static VALUE
06223 rb_str_lstrip(VALUE str)
06224 {
06225 str = rb_str_dup(str);
06226 rb_str_lstrip_bang(str);
06227 return str;
06228 }
06229
06230
06231
06232
06233
06234
06235
06236
06237
06238
06239
06240
06241
06242
06243 static VALUE
06244 rb_str_rstrip_bang(VALUE str)
06245 {
06246 rb_encoding *enc;
06247 char *s, *t, *e;
06248
06249 str_modify_keep_cr(str);
06250 enc = STR_ENC_GET(str);
06251 rb_str_check_dummy_enc(enc);
06252 s = RSTRING_PTR(str);
06253 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06254 t = e = RSTRING_END(str);
06255
06256
06257 if (single_byte_optimizable(str)) {
06258 unsigned char c;
06259 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06260 }
06261 else {
06262 char *tp;
06263
06264 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06265 unsigned int c = rb_enc_codepoint(tp, e, enc);
06266 if (c && !rb_isspace(c)) break;
06267 t = tp;
06268 }
06269 }
06270 if (t < e) {
06271 long len = t-RSTRING_PTR(str);
06272
06273 STR_SET_LEN(str, len);
06274 RSTRING_PTR(str)[len] = '\0';
06275 return str;
06276 }
06277 return Qnil;
06278 }
06279
06280
06281
06282
06283
06284
06285
06286
06287
06288
06289
06290
06291
06292 static VALUE
06293 rb_str_rstrip(VALUE str)
06294 {
06295 str = rb_str_dup(str);
06296 rb_str_rstrip_bang(str);
06297 return str;
06298 }
06299
06300
06301
06302
06303
06304
06305
06306
06307
06308
06309 static VALUE
06310 rb_str_strip_bang(VALUE str)
06311 {
06312 VALUE l = rb_str_lstrip_bang(str);
06313 VALUE r = rb_str_rstrip_bang(str);
06314
06315 if (NIL_P(l) && NIL_P(r)) return Qnil;
06316 return str;
06317 }
06318
06319
06320
06321
06322
06323
06324
06325
06326
06327
06328
06329
06330 static VALUE
06331 rb_str_strip(VALUE str)
06332 {
06333 str = rb_str_dup(str);
06334 rb_str_strip_bang(str);
06335 return str;
06336 }
06337
06338 static VALUE
06339 scan_once(VALUE str, VALUE pat, long *start)
06340 {
06341 VALUE result, match;
06342 struct re_registers *regs;
06343 int i;
06344
06345 if (rb_reg_search(pat, str, *start, 0) >= 0) {
06346 match = rb_backref_get();
06347 regs = RMATCH_REGS(match);
06348 if (BEG(0) == END(0)) {
06349 rb_encoding *enc = STR_ENC_GET(str);
06350
06351
06352
06353 if (RSTRING_LEN(str) > END(0))
06354 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06355 RSTRING_END(str), enc);
06356 else
06357 *start = END(0)+1;
06358 }
06359 else {
06360 *start = END(0);
06361 }
06362 if (regs->num_regs == 1) {
06363 return rb_reg_nth_match(0, match);
06364 }
06365 result = rb_ary_new2(regs->num_regs);
06366 for (i=1; i < regs->num_regs; i++) {
06367 rb_ary_push(result, rb_reg_nth_match(i, match));
06368 }
06369
06370 return result;
06371 }
06372 return Qnil;
06373 }
06374
06375
06376
06377
06378
06379
06380
06381
06382
06383
06384
06385
06386
06387
06388
06389
06390
06391
06392
06393
06394
06395
06396
06397
06398
06399
06400
06401
06402
06403
06404
06405
06406
06407 static VALUE
06408 rb_str_scan(VALUE str, VALUE pat)
06409 {
06410 VALUE result;
06411 long start = 0;
06412 long last = -1, prev = 0;
06413 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06414
06415 pat = get_pat(pat, 1);
06416 if (!rb_block_given_p()) {
06417 VALUE ary = rb_ary_new();
06418
06419 while (!NIL_P(result = scan_once(str, pat, &start))) {
06420 last = prev;
06421 prev = start;
06422 rb_ary_push(ary, result);
06423 }
06424 if (last >= 0) rb_reg_search(pat, str, last, 0);
06425 return ary;
06426 }
06427
06428 while (!NIL_P(result = scan_once(str, pat, &start))) {
06429 last = prev;
06430 prev = start;
06431 rb_yield(result);
06432 str_mod_check(str, p, len);
06433 }
06434 if (last >= 0) rb_reg_search(pat, str, last, 0);
06435 return str;
06436 }
06437
06438
06439
06440
06441
06442
06443
06444
06445
06446
06447
06448
06449
06450
06451
06452
06453 static VALUE
06454 rb_str_hex(VALUE str)
06455 {
06456 rb_encoding *enc = rb_enc_get(str);
06457
06458 if (!rb_enc_asciicompat(enc)) {
06459 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06460 }
06461 return rb_str_to_inum(str, 16, FALSE);
06462 }
06463
06464
06465
06466
06467
06468
06469
06470
06471
06472
06473
06474
06475
06476
06477
06478
06479 static VALUE
06480 rb_str_oct(VALUE str)
06481 {
06482 rb_encoding *enc = rb_enc_get(str);
06483
06484 if (!rb_enc_asciicompat(enc)) {
06485 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06486 }
06487 return rb_str_to_inum(str, -8, FALSE);
06488 }
06489
06490
06491
06492
06493
06494
06495
06496
06497
06498
06499
06500
06501 static VALUE
06502 rb_str_crypt(VALUE str, VALUE salt)
06503 {
06504 extern char *crypt(const char *, const char *);
06505 VALUE result;
06506 const char *s, *saltp;
06507 #ifdef BROKEN_CRYPT
06508 char salt_8bit_clean[3];
06509 #endif
06510
06511 StringValue(salt);
06512 if (RSTRING_LEN(salt) < 2)
06513 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06514
06515 s = RSTRING_PTR(str);
06516 if (!s) s = "";
06517 saltp = RSTRING_PTR(salt);
06518 #ifdef BROKEN_CRYPT
06519 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06520 salt_8bit_clean[0] = saltp[0] & 0x7f;
06521 salt_8bit_clean[1] = saltp[1] & 0x7f;
06522 salt_8bit_clean[2] = '\0';
06523 saltp = salt_8bit_clean;
06524 }
06525 #endif
06526 result = rb_str_new2(crypt(s, saltp));
06527 OBJ_INFECT(result, str);
06528 OBJ_INFECT(result, salt);
06529 return result;
06530 }
06531
06532
06533
06534
06535
06536
06537
06538
06539
06540
06541
06542
06543
06544
06545
06546
06547
06548
06549
06550
06551
06552
06553 VALUE
06554 rb_str_intern(VALUE s)
06555 {
06556 VALUE str = RB_GC_GUARD(s);
06557 ID id;
06558
06559 id = rb_intern_str(str);
06560 return ID2SYM(id);
06561 }
06562
06563
06564
06565
06566
06567
06568
06569
06570
06571
06572
06573 VALUE
06574 rb_str_ord(VALUE s)
06575 {
06576 unsigned int c;
06577
06578 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06579 return UINT2NUM(c);
06580 }
06581
06582
06583
06584
06585
06586
06587
06588
06589
06590
06591
06592 static VALUE
06593 rb_str_sum(int argc, VALUE *argv, VALUE str)
06594 {
06595 VALUE vbits;
06596 int bits;
06597 char *ptr, *p, *pend;
06598 long len;
06599 VALUE sum = INT2FIX(0);
06600 unsigned long sum0 = 0;
06601
06602 if (argc == 0) {
06603 bits = 16;
06604 }
06605 else {
06606 rb_scan_args(argc, argv, "01", &vbits);
06607 bits = NUM2INT(vbits);
06608 }
06609 ptr = p = RSTRING_PTR(str);
06610 len = RSTRING_LEN(str);
06611 pend = p + len;
06612
06613 while (p < pend) {
06614 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06615 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06616 str_mod_check(str, ptr, len);
06617 sum0 = 0;
06618 }
06619 sum0 += (unsigned char)*p;
06620 p++;
06621 }
06622
06623 if (bits == 0) {
06624 if (sum0) {
06625 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06626 }
06627 }
06628 else {
06629 if (sum == INT2FIX(0)) {
06630 if (bits < (int)sizeof(long)*CHAR_BIT) {
06631 sum0 &= (((unsigned long)1)<<bits)-1;
06632 }
06633 sum = LONG2FIX(sum0);
06634 }
06635 else {
06636 VALUE mod;
06637
06638 if (sum0) {
06639 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06640 }
06641
06642 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06643 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06644 sum = rb_funcall(sum, '&', 1, mod);
06645 }
06646 }
06647 return sum;
06648 }
06649
/*
 * Shared implementation of ljust / rjust / center.
 *
 * Arguments (argc/argv): a required width and an optional pad string
 * (default " ").  +jflag+ selects where the padding goes: 'l' puts it all
 * on the right, 'r' all on the left, anything else splits it with the
 * smaller half on the left.
 *
 * Returns a duplicate of +str+ when width is negative or not larger than
 * the string's character length; otherwise a new string padded to +width+
 * characters.  Raises ArgumentError for an empty pad string
 * ("zero width padding") or when the result size would overflow
 * ("argument too big").
 */
06650 static VALUE
06651 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06652 {
06653 rb_encoding *enc;
06654 VALUE w;
/* flen: pad length in bytes; fclen: pad length in characters. */
06655 long width, len, flen = 1, fclen = 1;
06656 VALUE res;
06657 char *p;
06658 const char *f = " ";
/* llen/rlen: characters of padding on each side; llen2/rlen2: byte length
 * of the trailing partial copy of the pad string on each side. */
06659 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06660 volatile VALUE pad;
06661 int singlebyte = 1, cr;
06662
06663 rb_scan_args(argc, argv, "11", &w, &pad);
06664 enc = STR_ENC_GET(str);
06665 width = NUM2LONG(w);
06666 if (argc == 2) {
06667 StringValue(pad);
06668 enc = rb_enc_check(str, pad);
06669 f = RSTRING_PTR(pad);
06670 flen = RSTRING_LEN(pad);
06671 fclen = str_strlen(pad, enc);
06672 singlebyte = single_byte_optimizable(pad);
06673 if (flen == 0 || fclen == 0) {
06674 rb_raise(rb_eArgError, "zero width padding");
06675 }
06676 }
06677 len = str_strlen(str, enc);
06678 if (width < 0 || len >= width) return rb_str_dup(str);
/* n: total number of pad characters needed. */
06679 n = width - len;
06680 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06681 rlen = n - llen;
06682 cr = ENC_CODERANGE(str);
06683 if (flen > 1) {
06684 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06685 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06686 }
06687 size = RSTRING_LEN(str);
/* Compute the result's byte size step by step, checking each step against
 * LONG_MAX before performing it. */
06688 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06689 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06690 (len += llen2 + rlen2) >= LONG_MAX - size) {
06691 rb_raise(rb_eArgError, "argument too big");
06692 }
06693 len += size;
06694 res = rb_str_new5(str, 0, len);
06695 p = RSTRING_PTR(res);
/* Left padding: a single-byte pad is a plain memset; otherwise whole
 * copies of the pad string followed by a partial copy. */
06696 if (flen <= 1) {
06697 memset(p, *f, llen);
06698 p += llen;
06699 }
06700 else {
06701 while (llen >= fclen) {
06702 memcpy(p,f,flen);
06703 p += flen;
06704 llen -= fclen;
06705 }
06706 if (llen > 0) {
06707 memcpy(p, f, llen2);
06708 p += llen2;
06709 }
06710 }
06711 memcpy(p, RSTRING_PTR(str), size);
06712 p += size;
/* Right padding, same scheme as the left side. */
06713 if (flen <= 1) {
06714 memset(p, *f, rlen);
06715 p += rlen;
06716 }
06717 else {
06718 while (rlen >= fclen) {
06719 memcpy(p,f,flen);
06720 p += flen;
06721 rlen -= fclen;
06722 }
06723 if (rlen > 0) {
06724 memcpy(p, f, rlen2);
06725 p += rlen2;
06726 }
06727 }
06728 *p = '\0';
06729 STR_SET_LEN(res, p-RSTRING_PTR(res));
06730 OBJ_INFECT(res, str);
06731 if (!NIL_P(pad)) OBJ_INFECT(res, pad);
06732 rb_enc_associate(res, enc);
/* Combine the receiver's code range with the pad's; a BROKEN result is
 * not recorded on the new string. */
06733 if (argc == 2)
06734 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
06735 if (cr != ENC_CODERANGE_BROKEN)
06736 ENC_CODERANGE_SET(res, cr);
06737 return res;
06738 }
06739
06740
06741
06742
06743
06744
06745
06746
06747
06748
06749
06750
06751
06752
06753
06754 static VALUE
06755 rb_str_ljust(int argc, VALUE *argv, VALUE str)
06756 {
06757 return rb_str_justify(argc, argv, str, 'l');
06758 }
06759
06760
06761
06762
06763
06764
06765
06766
06767
06768
06769
06770
06771
06772
06773
06774 static VALUE
06775 rb_str_rjust(int argc, VALUE *argv, VALUE str)
06776 {
06777 return rb_str_justify(argc, argv, str, 'r');
06778 }
06779
06780
06781
06782
06783
06784
06785
06786
06787
06788
06789
06790
06791
06792
06793
06794 static VALUE
06795 rb_str_center(int argc, VALUE *argv, VALUE str)
06796 {
06797 return rb_str_justify(argc, argv, str, 'c');
06798 }
06799
06800
06801
06802
06803
06804
06805
06806
06807
06808
06809
06810
06811
06812
06813
06814
06815 static VALUE
06816 rb_str_partition(VALUE str, VALUE sep)
06817 {
06818 long pos;
06819 int regex = FALSE;
06820
06821 if (TYPE(sep) == T_REGEXP) {
06822 pos = rb_reg_search(sep, str, 0, 0);
06823 regex = TRUE;
06824 }
06825 else {
06826 VALUE tmp;
06827
06828 tmp = rb_check_string_type(sep);
06829 if (NIL_P(tmp)) {
06830 rb_raise(rb_eTypeError, "type mismatch: %s given",
06831 rb_obj_classname(sep));
06832 }
06833 sep = tmp;
06834 pos = rb_str_index(str, sep, 0);
06835 }
06836 if (pos < 0) {
06837 failed:
06838 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
06839 }
06840 if (regex) {
06841 sep = rb_str_subpat(str, sep, INT2FIX(0));
06842 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
06843 }
06844 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
06845 sep,
06846 rb_str_subseq(str, pos+RSTRING_LEN(sep),
06847 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
06848 }
06849
06850
06851
06852
06853
06854
06855
06856
06857
06858
06859
06860
06861
06862
06863
06864
06865 static VALUE
06866 rb_str_rpartition(VALUE str, VALUE sep)
06867 {
06868 long pos = RSTRING_LEN(str);
06869 int regex = FALSE;
06870
06871 if (TYPE(sep) == T_REGEXP) {
06872 pos = rb_reg_search(sep, str, pos, 1);
06873 regex = TRUE;
06874 }
06875 else {
06876 VALUE tmp;
06877
06878 tmp = rb_check_string_type(sep);
06879 if (NIL_P(tmp)) {
06880 rb_raise(rb_eTypeError, "type mismatch: %s given",
06881 rb_obj_classname(sep));
06882 }
06883 sep = tmp;
06884 pos = rb_str_sublen(str, pos);
06885 pos = rb_str_rindex(str, sep, pos);
06886 }
06887 if (pos < 0) {
06888 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
06889 }
06890 if (regex) {
06891 sep = rb_reg_nth_match(0, rb_backref_get());
06892 }
06893 return rb_ary_new3(3, rb_str_substr(str, 0, pos),
06894 sep,
06895 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
06896 }
06897
06898
06899
06900
06901
06902
06903
06904
06905
06906
06907
06908
06909
06910
06911
06912
06913
06914 static VALUE
06915 rb_str_start_with(int argc, VALUE *argv, VALUE str)
06916 {
06917 int i;
06918
06919 for (i=0; i<argc; i++) {
06920 VALUE tmp = rb_check_string_type(argv[i]);
06921 if (NIL_P(tmp)) continue;
06922 rb_enc_check(str, tmp);
06923 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06924 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06925 return Qtrue;
06926 }
06927 return Qfalse;
06928 }
06929
06930
06931
06932
06933
06934
06935
06936
06937 static VALUE
06938 rb_str_end_with(int argc, VALUE *argv, VALUE str)
06939 {
06940 int i;
06941 char *p, *s, *e;
06942 rb_encoding *enc;
06943
06944 for (i=0; i<argc; i++) {
06945 VALUE tmp = rb_check_string_type(argv[i]);
06946 if (NIL_P(tmp)) continue;
06947 enc = rb_enc_check(str, tmp);
06948 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06949 p = RSTRING_PTR(str);
06950 e = p + RSTRING_LEN(str);
06951 s = e - RSTRING_LEN(tmp);
06952 if (rb_enc_left_char_head(p, s, e, enc) != s)
06953 continue;
06954 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06955 return Qtrue;
06956 }
06957 return Qfalse;
06958 }
06959
06960 void
06961 rb_str_setter(VALUE val, ID id, VALUE *var)
06962 {
06963 if (!NIL_P(val) && TYPE(val) != T_STRING) {
06964 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
06965 }
06966 *var = val;
06967 }
06968
06969
06970
06971
06972
06973
06974
06975
06976
06977 static VALUE
06978 rb_str_force_encoding(VALUE str, VALUE enc)
06979 {
06980 str_modifiable(str);
06981 rb_enc_associate(str, rb_to_encoding(enc));
06982 ENC_CODERANGE_CLEAR(str);
06983 return str;
06984 }
06985
06986
06987
06988
06989
06990
06991
06992
06993
06994
06995
06996
06997 static VALUE
06998 rb_str_valid_encoding_p(VALUE str)
06999 {
07000 int cr = rb_enc_str_coderange(str);
07001
07002 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07003 }
07004
07005
07006
07007
07008
07009
07010
07011
07012
07013
07014
07015 static VALUE
07016 rb_str_is_ascii_only_p(VALUE str)
07017 {
07018 int cr = rb_enc_str_coderange(str);
07019
07020 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07021 }
07022
07023
07024
07025
07026
07027
07028
07029
07030
07031
07032
07033
07034
07035
07036
07037
07038
07039
07040
07041
07042
07043
07044
07045
07046
07047
07048
07049
07050
07051
07052
07053
07054
07055
07056
07057
07058
07059
07060
07061
07062
07063
07064
07065 static VALUE
07066 sym_equal(VALUE sym1, VALUE sym2)
07067 {
07068 if (sym1 == sym2) return Qtrue;
07069 return Qfalse;
07070 }
07071
07072
07073 static int
07074 sym_printable(const char *s, const char *send, rb_encoding *enc)
07075 {
07076 while (s < send) {
07077 int n;
07078 int c = rb_enc_codepoint_len(s, send, &n, enc);
07079
07080 if (!rb_enc_isprint(c, enc)) return FALSE;
07081 s += n;
07082 }
07083 return TRUE;
07084 }
07085
07086
07087
07088
07089
07090
07091
07092
07093
07094
07095 static VALUE
07096 sym_inspect(VALUE sym)
07097 {
07098 VALUE str;
07099 ID id = SYM2ID(sym);
07100 rb_encoding *enc;
07101 const char *ptr;
07102 long len;
07103 char *dest;
07104 rb_encoding *resenc = rb_default_internal_encoding();
07105
07106 if (resenc == NULL) resenc = rb_default_external_encoding();
07107 sym = rb_id2str(id);
07108 enc = STR_ENC_GET(sym);
07109 ptr = RSTRING_PTR(sym);
07110 len = RSTRING_LEN(sym);
07111 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07112 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07113 str = rb_str_inspect(sym);
07114 len = RSTRING_LEN(str);
07115 rb_str_resize(str, len + 1);
07116 dest = RSTRING_PTR(str);
07117 memmove(dest + 1, dest, len);
07118 dest[0] = ':';
07119 }
07120 else {
07121 char *dest;
07122 str = rb_enc_str_new(0, len + 1, enc);
07123 dest = RSTRING_PTR(str);
07124 dest[0] = ':';
07125 memcpy(dest + 1, ptr, len);
07126 }
07127 return str;
07128 }
07129
07130
07131
07132
07133
07134
07135
07136
07137
07138
07139
07140
07141
07142 VALUE
07143 rb_sym_to_s(VALUE sym)
07144 {
07145 ID id = SYM2ID(sym);
07146
07147 return str_new3(rb_cString, rb_id2str(id));
07148 }
07149
07150
07151
07152
07153
07154
07155
07156
07157
07158
07159
07160
/*
 * Identity conversion: returns the receiver +sym+ unchanged.
 */
07161 static VALUE
07162 sym_to_sym(VALUE sym)
07163 {
07164 return sym;
07165 }
07166
07167 static VALUE
07168 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
07169 {
07170 VALUE obj;
07171
07172 if (argc < 1) {
07173 rb_raise(rb_eArgError, "no receiver given");
07174 }
07175 obj = argv[0];
07176 return rb_funcall3(obj, (ID)sym, argc - 1, argv + 1);
07177 }
07178
07179
07180
07181
07182
07183
07184
07185
07186
07187
07188 static VALUE
07189 sym_to_proc(VALUE sym)
07190 {
07191 static VALUE sym_proc_cache = Qfalse;
07192 enum {SYM_PROC_CACHE_SIZE = 67};
07193 VALUE proc;
07194 long id, index;
07195 VALUE *aryp;
07196
07197 if (!sym_proc_cache) {
07198 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07199 rb_gc_register_mark_object(sym_proc_cache);
07200 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07201 }
07202
07203 id = SYM2ID(sym);
07204 index = (id % SYM_PROC_CACHE_SIZE) << 1;
07205
07206 aryp = RARRAY_PTR(sym_proc_cache);
07207 if (aryp[index] == sym) {
07208 return aryp[index + 1];
07209 }
07210 else {
07211 proc = rb_proc_new(sym_call, (VALUE)id);
07212 aryp[index] = sym;
07213 aryp[index + 1] = proc;
07214 return proc;
07215 }
07216 }
07217
07218
07219
07220
07221
07222
07223
07224
07225
07226 static VALUE
07227 sym_succ(VALUE sym)
07228 {
07229 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07230 }
07231
07232
07233
07234
07235
07236
07237
07238
07239
07240 static VALUE
07241 sym_cmp(VALUE sym, VALUE other)
07242 {
07243 if (!SYMBOL_P(other)) {
07244 return Qnil;
07245 }
07246 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07247 }
07248
07249
07250
07251
07252
07253
07254
07255
07256
07257 static VALUE
07258 sym_casecmp(VALUE sym, VALUE other)
07259 {
07260 if (!SYMBOL_P(other)) {
07261 return Qnil;
07262 }
07263 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07264 }
07265
07266
07267
07268
07269
07270
07271
07272
07273 static VALUE
07274 sym_match(VALUE sym, VALUE other)
07275 {
07276 return rb_str_match(rb_sym_to_s(sym), other);
07277 }
07278
07279
07280
07281
07282
07283
07284
07285
07286
07287 static VALUE
07288 sym_aref(int argc, VALUE *argv, VALUE sym)
07289 {
07290 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07291 }
07292
07293
07294
07295
07296
07297
07298
07299
07300 static VALUE
07301 sym_length(VALUE sym)
07302 {
07303 return rb_str_length(rb_id2str(SYM2ID(sym)));
07304 }
07305
07306
07307
07308
07309
07310
07311
07312
07313 static VALUE
07314 sym_empty(VALUE sym)
07315 {
07316 return rb_str_empty(rb_id2str(SYM2ID(sym)));
07317 }
07318
07319
07320
07321
07322
07323
07324
07325
07326 static VALUE
07327 sym_upcase(VALUE sym)
07328 {
07329 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07330 }
07331
07332
07333
07334
07335
07336
07337
07338
07339 static VALUE
07340 sym_downcase(VALUE sym)
07341 {
07342 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07343 }
07344
07345
07346
07347
07348
07349
07350
07351
07352 static VALUE
07353 sym_capitalize(VALUE sym)
07354 {
07355 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07356 }
07357
07358
07359
07360
07361
07362
07363
07364
07365 static VALUE
07366 sym_swapcase(VALUE sym)
07367 {
07368 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07369 }
07370
07371
07372
07373
07374
07375
07376
07377
07378 static VALUE
07379 sym_encoding(VALUE sym)
07380 {
07381 return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07382 }
07383
07384 ID
07385 rb_to_id(VALUE name)
07386 {
07387 VALUE tmp;
07388 ID id;
07389
07390 switch (TYPE(name)) {
07391 default:
07392 tmp = rb_check_string_type(name);
07393 if (NIL_P(tmp)) {
07394 tmp = rb_inspect(name);
07395 rb_raise(rb_eTypeError, "%s is not a symbol",
07396 RSTRING_PTR(tmp));
07397 }
07398 name = tmp;
07399
07400 case T_STRING:
07401 name = rb_str_intern(name);
07402
07403 case T_SYMBOL:
07404 return SYM2ID(name);
07405 }
07406 return id;
07407 }
07408
07409
07410
07411
07412
07413
07414
07415
07416
07417
07418
07419
07420
07421
07422 void
07423 Init_String(void)
07424 {
07425 #undef rb_intern
07426 #define rb_intern(str) rb_intern_const(str)
07427
07428 rb_cString = rb_define_class("String", rb_cObject);
07429 rb_include_module(rb_cString, rb_mComparable);
07430 rb_define_alloc_func(rb_cString, str_alloc);
07431 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07432 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07433 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07434 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07435 rb_define_method(rb_cString, "==", rb_str_equal, 1);
07436 rb_define_method(rb_cString, "===", rb_str_equal, 1);
07437 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07438 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07439 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07440 rb_define_method(rb_cString, "+", rb_str_plus, 1);
07441 rb_define_method(rb_cString, "*", rb_str_times, 1);
07442 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07443 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07444 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07445 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07446 rb_define_method(rb_cString, "length", rb_str_length, 0);
07447 rb_define_method(rb_cString, "size", rb_str_length, 0);
07448 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07449 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07450 rb_define_method(rb_cString, "=~", rb_str_match, 1);
07451 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07452 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07453 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07454 rb_define_method(rb_cString, "next", rb_str_succ, 0);
07455 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07456 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07457 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07458 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07459 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07460 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07461 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07462 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07463 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07464
07465 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07466 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07467 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07468 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07469 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07470 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07471
07472 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07473 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07474 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07475 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07476
07477 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07478 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07479 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07480 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07481
07482 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07483 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07484 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07485 rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07486 rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07487 rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07488 rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07489 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07490 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07491 rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07492 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07493 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07494 rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07495 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07496 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07497
07498 rb_define_method(rb_cString, "include?", rb_str_include, 1);
07499 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07500 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07501
07502 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07503
07504 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07505 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07506 rb_define_method(rb_cString, "center", rb_str_center, -1);
07507
07508 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07509 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07510 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07511 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07512 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07513 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07514 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07515
07516 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07517 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07518 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07519 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07520 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07521 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07522 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07523
07524 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07525 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07526 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07527 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07528 rb_define_method(rb_cString, "count", rb_str_count, -1);
07529
07530 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07531 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07532 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07533 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07534
07535 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07536 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07537 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07538 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07539
07540 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07541
07542 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07543 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07544
07545 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07546 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07547
07548 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0);
07549 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07550 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07551 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07552
07553 id_to_s = rb_intern("to_s");
07554
07555 rb_fs = Qnil;
07556 rb_define_variable("$;", &rb_fs);
07557 rb_define_variable("$-F", &rb_fs);
07558
07559 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07560 rb_include_module(rb_cSymbol, rb_mComparable);
07561 rb_undef_alloc_func(rb_cSymbol);
07562 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07563 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0);
07564
07565 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07566 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07567 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07568 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07569 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07570 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07571 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07572 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07573 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07574 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07575
07576 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07577 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07578 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07579
07580 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07581 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07582 rb_define_method(rb_cSymbol, "length", sym_length, 0);
07583 rb_define_method(rb_cSymbol, "size", sym_length, 0);
07584 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07585 rb_define_method(rb_cSymbol, "match", sym_match, 1);
07586
07587 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07588 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07589 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07590 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07591
07592 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07593 }
07594