00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby/ruby.h"
00013 #include "ruby/encoding.h"
00014 #include "transcode_data.h"
00015 #include <ctype.h>
00016
00017
/*
 * Global Ruby object handles used by the transcoder.  They are only declared
 * here; nothing in this part of the file assigns them.
 * NOTE(review): the rb_e and rb_c prefixes suggest exception classes and the
 * converter class respectively -- confirm against the initialisation code.
 */
00018 VALUE rb_eUndefinedConversionError;
00019 VALUE rb_eInvalidByteSequenceError;
00020 VALUE rb_eConverterNotFoundError;
00021
00022 VALUE rb_cEncodingConverter;
00023
/* File-local VALUE slots; presumably cached option symbols (TODO confirm where they are set). */
00024 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
00025 static VALUE sym_xml, sym_text, sym_attr;
00026 static VALUE sym_universal_newline;
00027 static VALUE sym_crlf_newline;
00028 static VALUE sym_cr_newline;
00029 static VALUE sym_partial_input;
00030
/* File-local VALUE slots whose names mirror the rb_econv_result_t values (econv_*) used below. */
00031 static VALUE sym_invalid_byte_sequence;
00032 static VALUE sym_undefined_conversion;
00033 static VALUE sym_destination_buffer_full;
00034 static VALUE sym_source_buffer_empty;
00035 static VALUE sym_finished;
00036 static VALUE sym_after_output;
00037 static VALUE sym_incomplete_input;
00038
/*
 * Forward declaration only; the definition is not part of this section.
 * NOTE(review): by its signature it converts `len` bytes at `str` from
 * encoding `sname` to `dname`, may use the caller-supplied buffer, and
 * reports the result length through `dst_len_ptr` -- confirm at the definition.
 */
00039 static unsigned char *
00040 allocate_converted_string(const char *sname, const char *dname,
00041 const unsigned char *str, size_t len,
00042 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
00043 size_t *dst_len_ptr);
00044
00045
00046
/*
 * Run-time state of one transcoder instance.  It is created by
 * rb_transcoding_open_by_transcoder(), driven by transcode_restartable0()
 * and released by rb_transcoding_close().
 */
00047 typedef struct rb_transcoding {
/* The static transcoder description (tables, callbacks, size limits) this state belongs to. */
00048 const rb_transcoder *transcoder;
00049
00050 int flags;
00051
/* Where transcode_restartable0() continues on its next call: 0 = from the top, N = resume_labelN. */
00052 int resume_position;
/* The next four members hold the conversion-loop variables across suspensions. */
00053 unsigned int next_table;
00054 VALUE next_info;
00055 unsigned char next_byte;
00056 unsigned int output_index;
00057
/* Number of bytes of the current input character already saved in readbuf. */
00058 ssize_t recognized_len;
/* Number of bytes stored after the recognized ones that must be fed through the converter again. */
00059 ssize_t readagain_len;
/* Input save area: `ary` is used when transcoder->max_input fits in it, otherwise `ptr` (see TRANSCODING_READBUF). */
00060 union {
00061 unsigned char ary[8];
00062 unsigned char *ptr;
00063 } readbuf;
00064
/* Read offset into, and number of valid bytes in, the pending-output buffer below. */
00065 ssize_t writebuf_off;
00066 ssize_t writebuf_len;
/* Pending output: `ary` is used when transcoder->max_output fits in it, otherwise `ptr` (see TRANSCODING_WRITEBUF). */
00067 union {
00068 unsigned char ary[8];
00069 unsigned char *ptr;
00070 } writebuf;
00071
/* Transcoder-private state: embedded in `ary` when state_size fits, otherwise allocated and reached via `ptr`. */
00072 union rb_transcoding_state_t {
00073 void *ptr;
00074 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
00075 double dummy_for_alignment;
00076 } state;
00077 } rb_transcoding;
/* Address of the input save area: the embedded array or the heap pointer, chosen by max_input. */
00078 #define TRANSCODING_READBUF(tc) \
00079 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
00080 (tc)->readbuf.ary : \
00081 (tc)->readbuf.ptr)
/* Address of the pending-output buffer: the embedded array or the heap pointer, chosen by max_output. */
00082 #define TRANSCODING_WRITEBUF(tc) \
00083 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
00084 (tc)->writebuf.ary : \
00085 (tc)->writebuf.ptr)
/* Capacity of the buffer selected by TRANSCODING_WRITEBUF. */
00086 #define TRANSCODING_WRITEBUF_SIZE(tc) \
00087 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
00088 sizeof((tc)->writebuf.ary) : \
00089 (size_t)(tc)->transcoder->max_output)
/* Largest state_size that is stored inside the struct instead of on the heap. */
00090 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* Address of the transcoder-private state: embedded bytes or the heap pointer, chosen by state_size. */
00091 #define TRANSCODING_STATE(tc) \
00092 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
00093 (tc)->state.ary : \
00094 (tc)->state.ptr)
00095
/*
 * One stage of a conversion pipeline: a transcoding plus the buffer that
 * receives its output.  rb_econv_add_transcoder_at() sets out_buf_start and
 * out_buf_end to the bounds of the allocated buffer and starts with
 * out_data_start == out_data_end == out_buf_start (no buffered data).
 */
00096 typedef struct {
00097 struct rb_transcoding *tc;
00098 unsigned char *out_buf_start;
00099 unsigned char *out_data_start;
00100 unsigned char *out_data_end;
00101 unsigned char *out_buf_end;
/* Result of the most recent rb_transcoding_convert() call for this stage (stored by trans_sweep). */
00102 rb_econv_result_t last_result;
00103 } rb_econv_elem_t;
00104
/*
 * A converter: an ordered array of pipeline stages (`elems`) plus
 * bookkeeping.  rb_econv_alloc() initialises every member; the names and
 * flags are filled in by rb_econv_open0().
 */
00105 struct rb_econv_t {
00106 int flags;
00107 const char *source_encoding_name;
00108 const char *destination_encoding_name;
00109
00110 int started;
00111
/* Replacement string settings; all start out empty/NULL in rb_econv_alloc(). */
00112 const unsigned char *replacement_str;
00113 size_t replacement_len;
00114 const char *replacement_enc;
00115 int replacement_allocated;
00116
/* Input-side buffer pointers; all NULL after rb_econv_alloc(). */
00117 unsigned char *in_buf_start;
00118 unsigned char *in_data_start;
00119 unsigned char *in_data_end;
00120 unsigned char *in_buf_end;
/* Stage array: num_allocated is its capacity, num_trans the number of stages in use. */
00121 rb_econv_elem_t *elems;
00122 int num_allocated;
00123 int num_trans;
/* trans_sweep compares this with a stage index to decide whether that stage gets ECONV_PARTIAL_INPUT. */
00124 int num_finished;
/* Set by rb_econv_add_transcoder_at() to a stage whose transcoder is not a decorator. */
00125 struct rb_transcoding *last_tc;
00126
00127
/* Details of the last reported error; reset to "source buffer empty / no data" by rb_econv_alloc(). */
00128 struct {
00129 rb_econv_result_t result;
00130 struct rb_transcoding *error_tc;
00131 const char *source_encoding;
00132 const char *destination_encoding;
00133 const unsigned char *error_bytes_start;
00134 size_t error_bytes_len;
00135 size_t readagain_len;
00136 } last_error;
00137
00138
00139
/* Encoding objects; NULL after rb_econv_alloc() and not assigned anywhere in this section. */
00140 rb_encoding *source_encoding;
00141 rb_encoding *destination_encoding;
00142 };
00143
00144
00145
00146
00147
/* True when the source encoding name is the empty string, which is how a decorator is identified here (dname is unused). */
00148 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
00149
/*
 * Registry record for one (source, destination) encoding pair.
 * `lib` names a library to load on demand (see load_transcoder_entry) and
 * `transcoder` is the registered implementation; both start out NULL.
 */
00150 typedef struct {
00151 const char *sname;
00152 const char *dname;
00153 const char *lib;
00154 const rb_transcoder *transcoder;
00155 } transcoder_entry_t;
00156
/* Two-level registry: source name -> (st_table *) mapping destination name -> transcoder_entry_t *. */
00157 static st_table *transcoder_table;
00158
00159 static transcoder_entry_t *
00160 make_transcoder_entry(const char *sname, const char *dname)
00161 {
00162 st_data_t val;
00163 st_table *table2;
00164
00165 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00166 val = (st_data_t)st_init_strcasetable();
00167 st_add_direct(transcoder_table, (st_data_t)sname, val);
00168 }
00169 table2 = (st_table *)val;
00170 if (!st_lookup(table2, (st_data_t)dname, &val)) {
00171 transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
00172 entry->sname = sname;
00173 entry->dname = dname;
00174 entry->lib = NULL;
00175 entry->transcoder = NULL;
00176 val = (st_data_t)entry;
00177 st_add_direct(table2, (st_data_t)dname, val);
00178 }
00179 return (transcoder_entry_t *)val;
00180 }
00181
00182 static transcoder_entry_t *
00183 get_transcoder_entry(const char *sname, const char *dname)
00184 {
00185 st_data_t val;
00186 st_table *table2;
00187
00188 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00189 return NULL;
00190 }
00191 table2 = (st_table *)val;
00192 if (!st_lookup(table2, (st_data_t)dname, &val)) {
00193 return NULL;
00194 }
00195 return (transcoder_entry_t *)val;
00196 }
00197
00198 void
00199 rb_register_transcoder(const rb_transcoder *tr)
00200 {
00201 const char *const sname = tr->src_encoding;
00202 const char *const dname = tr->dst_encoding;
00203
00204 transcoder_entry_t *entry;
00205
00206 entry = make_transcoder_entry(sname, dname);
00207 if (entry->transcoder) {
00208 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
00209 sname, dname);
00210 }
00211
00212 entry->transcoder = tr;
00213 }
00214
00215 static void
00216 declare_transcoder(const char *sname, const char *dname, const char *lib)
00217 {
00218 transcoder_entry_t *entry;
00219
00220 entry = make_transcoder_entry(sname, dname);
00221 entry->lib = lib;
00222 }
00223
00224 #define MAX_TRANSCODER_LIBNAME_LEN 64
00225 static const char transcoder_lib_prefix[] = "enc/trans/";
00226
00227 void
00228 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
00229 {
00230 if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) {
00231 rb_raise(rb_eArgError, "invalid library name - %s",
00232 lib ? lib : "(null)");
00233 }
00234 declare_transcoder(enc1, enc2, lib);
00235 }
00236
/* Encoding names are compared case-insensitively (STRCASECMP). */
00237 #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)
00238
/* Singly linked FIFO node holding one encoding name waiting to be expanded by the path search. */
00239 typedef struct search_path_queue_tag {
00240 struct search_path_queue_tag *next;
00241 const char *enc;
00242 } search_path_queue_t;
00243
/*
 * Working state of the breadth-first search in transcode_search_path():
 *   visited        - maps an encoding name to the name it was reached from
 *                    (NULL for the starting encoding);
 *   queue          - head of the FIFO of names still to expand;
 *   queue_last_ptr - address of the link where the next node is appended;
 *   base_enc       - the name currently being expanded, recorded as the
 *                    predecessor of every newly discovered name.
 */
00244 typedef struct {
00245 st_table *visited;
00246 search_path_queue_t *queue;
00247 search_path_queue_t **queue_last_ptr;
00248 const char *base_enc;
00249 } search_path_bfs_t;
00250
00251 static int
00252 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
00253 {
00254 const char *dname = (const char *)key;
00255 search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
00256 search_path_queue_t *q;
00257
00258 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
00259 return ST_CONTINUE;
00260 }
00261
00262 q = ALLOC(search_path_queue_t);
00263 q->enc = dname;
00264 q->next = NULL;
00265 *bfs->queue_last_ptr = q;
00266 bfs->queue_last_ptr = &q->next;
00267
00268 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
00269 return ST_CONTINUE;
00270 }
00271
00272 static int
00273 transcode_search_path(const char *sname, const char *dname,
00274 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
00275 void *arg)
00276 {
00277 search_path_bfs_t bfs;
00278 search_path_queue_t *q;
00279 st_data_t val;
00280 st_table *table2;
00281 int found;
00282 int pathlen = -1;
00283
00284 if (encoding_equal(sname, dname))
00285 return -1;
00286
00287 q = ALLOC(search_path_queue_t);
00288 q->enc = sname;
00289 q->next = NULL;
00290 bfs.queue_last_ptr = &q->next;
00291 bfs.queue = q;
00292
00293 bfs.visited = st_init_strcasetable();
00294 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
00295
00296 while (bfs.queue) {
00297 q = bfs.queue;
00298 bfs.queue = q->next;
00299 if (!bfs.queue)
00300 bfs.queue_last_ptr = &bfs.queue;
00301
00302 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
00303 xfree(q);
00304 continue;
00305 }
00306 table2 = (st_table *)val;
00307
00308 if (st_lookup(table2, (st_data_t)dname, &val)) {
00309 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
00310 xfree(q);
00311 found = 1;
00312 goto cleanup;
00313 }
00314
00315 bfs.base_enc = q->enc;
00316 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
00317 bfs.base_enc = NULL;
00318
00319 xfree(q);
00320 }
00321 found = 0;
00322
00323 cleanup:
00324 while (bfs.queue) {
00325 q = bfs.queue;
00326 bfs.queue = q->next;
00327 xfree(q);
00328 }
00329
00330 if (found) {
00331 const char *enc = dname;
00332 int depth;
00333 pathlen = 0;
00334 while (1) {
00335 st_lookup(bfs.visited, (st_data_t)enc, &val);
00336 if (!val)
00337 break;
00338 pathlen++;
00339 enc = (const char *)val;
00340 }
00341 depth = pathlen;
00342 enc = dname;
00343 while (1) {
00344 st_lookup(bfs.visited, (st_data_t)enc, &val);
00345 if (!val)
00346 break;
00347 callback((const char *)val, enc, --depth, arg);
00348 enc = (const char *)val;
00349 }
00350 }
00351
00352 st_free_table(bfs.visited);
00353
00354 return pathlen;
00355 }
00356
00357 static const rb_transcoder *
00358 load_transcoder_entry(transcoder_entry_t *entry)
00359 {
00360 if (entry->transcoder)
00361 return entry->transcoder;
00362
00363 if (entry->lib) {
00364 const char *lib = entry->lib;
00365 size_t len = strlen(lib);
00366 char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN];
00367
00368 entry->lib = NULL;
00369
00370 if (len > MAX_TRANSCODER_LIBNAME_LEN)
00371 return NULL;
00372 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
00373 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
00374 if (!rb_require(path))
00375 return NULL;
00376 }
00377
00378 if (entry->transcoder)
00379 return entry->transcoder;
00380
00381 return NULL;
00382 }
00383
/*
 * Pick the default replacement character for encoding `encname`.
 * For "UTF-8" (compared case-insensitively) this is the 3-byte sequence
 * EF BF BD, reported as encoding "UTF-8"; for everything else it is "?",
 * reported as "US-ASCII".  The length and the encoding name are returned
 * through `len_ret` and `repl_encname_ptr`.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
00398
00399
00400
00401
00402
00403 static const unsigned char *
00404 transcode_char_start(rb_transcoding *tc,
00405 const unsigned char *in_start,
00406 const unsigned char *inchar_start,
00407 const unsigned char *in_p,
00408 size_t *char_len_ptr)
00409 {
00410 const unsigned char *ptr;
00411 if (inchar_start - in_start < tc->recognized_len) {
00412 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
00413 inchar_start, unsigned char, in_p - inchar_start);
00414 ptr = TRANSCODING_READBUF(tc);
00415 }
00416 else {
00417 ptr = inchar_start - tc->recognized_len;
00418 }
00419 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
00420 return ptr;
00421 }
00422
/*
 * Core conversion loop, written as a resumable state machine.
 *
 * Reads bytes from *in_pos up to in_stop, walks the transcoder's lookup
 * tables and writes converted bytes from *out_pos up to out_stop.  Whenever
 * it has to stop (output full, input exhausted, error, after-output request,
 * finished) it records a resume point in tc->resume_position, stores the
 * current cursors back into *in_pos / *out_pos and returns the reason.  The
 * next call jumps straight back to the statement after that return.
 *
 * opt flags tested here: ECONV_PARTIAL_INPUT (more input may follow, so an
 * empty input suspends instead of ending the conversion) and
 * ECONV_AFTER_OUTPUT (suspend once some output has been produced).
 */
00423 static rb_econv_result_t
00424 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
00425 const unsigned char *in_stop, unsigned char *out_stop,
00426 rb_transcoding *tc,
00427 const int opt)
00428 {
00429 const rb_transcoder *tr = tc->transcoder;
00430 int unitlen = tr->input_unit_length;
00431 ssize_t readagain_len = 0;
00432
00433 const unsigned char *inchar_start;
00434 const unsigned char *in_p;
00435
00436 unsigned char *out_p;
00437
00438 in_p = inchar_start = *in_pos;
00439
00440 out_p = *out_pos;
00441
/*
 * SUSPEND(ret, num): return `ret` to the caller and arrange for the next
 * call to continue at resume_label<num>.  Before returning it saves the
 * bytes consumed for the current character (inchar_start..in_p) into the
 * read buffer after the already recognized ones, publishes in_p / out_p to
 * the caller and updates tc->recognized_len.  When the local readagain_len
 * is non-zero, that many bytes are taken off recognized_len and recorded in
 * tc->readagain_len.  Every `num` must be unique because it names a label.
 */
00442 #define SUSPEND(ret, num) \
00443 do { \
00444 tc->resume_position = (num); \
00445 if (0 < in_p - inchar_start) \
00446 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
00447 inchar_start, unsigned char, in_p - inchar_start); \
00448 *in_pos = in_p; \
00449 *out_pos = out_p; \
00450 tc->recognized_len += in_p - inchar_start; \
00451 if (readagain_len) { \
00452 tc->recognized_len -= readagain_len; \
00453 tc->readagain_len = readagain_len; \
00454 } \
00455 return ret; \
00456 resume_label ## num:; \
00457 } while (0)
/* SUSPEND_OBUF(num): keep suspending with econv_destination_buffer_full until at least one output byte fits. */
00458 #define SUSPEND_OBUF(num) \
00459 do { \
00460 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
00461 } while (0)
00462
/* SUSPEND_AFTER_OUTPUT(num): when ECONV_AFTER_OUTPUT is set and this call has produced output, suspend with econv_after_output. */
00463 #define SUSPEND_AFTER_OUTPUT(num) \
00464 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
00465 SUSPEND(econv_after_output, num); \
00466 }
00467
/* The loop variables below live in `tc` so that their values survive a suspension. */
00468 #define next_table (tc->next_table)
00469 #define next_info (tc->next_info)
00470 #define next_byte (tc->next_byte)
00471 #define writebuf_len (tc->writebuf_len)
00472 #define writebuf_off (tc->writebuf_off)
00473
/* Re-entry dispatch: jump to the label left behind by the SUSPEND that returned last time. */
00474 switch (tc->resume_position) {
00475 case 0: break;
00476 case 1: goto resume_label1;
00477 case 2: goto resume_label2;
00478 case 3: goto resume_label3;
00479 case 4: goto resume_label4;
00480 case 5: goto resume_label5;
00481 case 6: goto resume_label6;
00482 case 7: goto resume_label7;
00483 case 8: goto resume_label8;
00484 case 9: goto resume_label9;
00485 case 10: goto resume_label10;
00486 case 11: goto resume_label11;
00487 case 12: goto resume_label12;
00488 case 13: goto resume_label13;
00489 case 14: goto resume_label14;
00490 case 15: goto resume_label15;
00491 case 16: goto resume_label16;
00492 case 17: goto resume_label17;
00493 case 18: goto resume_label18;
00494 case 19: goto resume_label19;
00495 case 20: goto resume_label20;
00496 case 21: goto resume_label21;
00497 case 22: goto resume_label22;
00498 case 23: goto resume_label23;
00499 case 24: goto resume_label24;
00500 case 25: goto resume_label25;
00501 case 26: goto resume_label26;
00502 case 27: goto resume_label27;
00503 case 28: goto resume_label28;
00504 case 29: goto resume_label29;
00505 case 30: goto resume_label30;
00506 case 31: goto resume_label31;
00507 case 32: goto resume_label32;
00508 case 33: goto resume_label33;
00509 case 34: goto resume_label34;
00510 }
00511
/* Each iteration converts one input character, starting from the root of the lookup tree. */
00512 while (1) {
00513 inchar_start = in_p;
00514 tc->recognized_len = 0;
00515 next_table = tr->conv_tree_start;
00516
00517 SUSPEND_AFTER_OUTPUT(24);
00518
/* No input left at a character boundary: finish unless the caller promised more input. */
00519 if (in_stop <= in_p) {
00520 if (!(opt & ECONV_PARTIAL_INPUT))
00521 break;
00522 SUSPEND(econv_source_buffer_empty, 7);
00523 continue;
00524 }
00525
/*
 * Table access helpers.  The node at next_table names a byte table
 * (BL_BASE: [0] = smallest byte, [1] = largest byte, [2..] = per-byte
 * offsets) and an info table (BL_INFO) indexed by those offsets.
 */
00526 #define BYTE_ADDR(index) (tr->byte_array + (index))
00527 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
00528 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
00529 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
00530 #define BL_MIN_BYTE (BL_BASE[0])
00531 #define BL_MAX_BYTE (BL_BASE[1])
00532 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
00533 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
00534
00535 next_byte = (unsigned char)*in_p++;
/* Look next_byte up in the current table; a byte outside [min, max] is invalid. */
00536 follow_byte:
00537 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
00538 next_info = INVALID;
00539 else {
00540 next_info = (VALUE)BL_ACTION(next_byte);
00541 }
/* Dispatch on the action encoded in the low five bits of next_info. */
00542 follow_info:
00543 switch (next_info & 0x1F) {
/* NOMAP: copy the input bytes of this character to the output unchanged, via the write buffer. */
00544 case NOMAP:
00545 {
00546 const unsigned char *p = inchar_start;
00547 writebuf_off = 0;
00548 while (p < in_p) {
00549 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
00550 }
00551 writebuf_len = writebuf_off;
00552 writebuf_off = 0;
00553 while (writebuf_off < writebuf_len) {
00554 SUSPEND_OBUF(3);
00555 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00556 }
00557 }
00558 continue;
/* Low two bits clear: next_info is the next table; fetch another byte and descend.  Running out of input here without ECONV_PARTIAL_INPUT means the character is incomplete. */
00559 case 0x00: case 0x04: case 0x08: case 0x0C:
00560 case 0x10: case 0x14: case 0x18: case 0x1C:
00561 SUSPEND_AFTER_OUTPUT(25);
00562 while (in_p >= in_stop) {
00563 if (!(opt & ECONV_PARTIAL_INPUT))
00564 goto incomplete;
00565 SUSPEND(econv_source_buffer_empty, 5);
00566 }
00567 next_byte = (unsigned char)*in_p++;
00568 next_table = (unsigned int)next_info;
00569 goto follow_byte;
/* ZERObt: the character produces no output. */
00570 case ZERObt:
00571 continue;
/* ONEbt .. FOURbt / GB4bt: the output bytes are packed inside next_info; emit them one at a time, suspending whenever the output is full. */
00572 case ONEbt:
00573 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
00574 continue;
00575 case TWObt:
00576 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
00577 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
00578 continue;
00579 case THREEbt:
00580 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
00581 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
00582 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
00583 continue;
00584 case FOURbt:
00585 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
00586 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
00587 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
00588 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
00589 continue;
00590 case GB4bt:
00591 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
00592 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
00593 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
00594 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
00595 continue;
/* STR1: the output is a string stored in byte_array (length via STR1_LENGTH, data from offset 1); tc->output_index tracks progress across suspensions. */
00596 case STR1:
00597 tc->output_index = 0;
00598 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
00599 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
00600 tc->output_index++;
00601 }
00602 continue;
/* FUNii / FUNsi: a transcoder callback computes a new info value (from the info, or from the character's bytes), which is dispatched again. */
00603 case FUNii:
00604 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
00605 goto follow_info;
00606 case FUNsi:
00607 {
00608 const unsigned char *char_start;
00609 size_t char_len;
00610 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00611 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
00612 goto follow_info;
00613 }
/*
 * FUNio / FUNso / FUNsio: a transcoder callback writes the output.  When at
 * least max_output bytes are free it writes straight into the output;
 * otherwise it writes into the write buffer, which is then drained byte by
 * byte.  The `break` leaves the switch and reaches the `continue` after it.
 */
00614 case FUNio:
00615 SUSPEND_OBUF(13);
00616 if (tr->max_output <= out_stop - out_p)
00617 out_p += tr->func_io(TRANSCODING_STATE(tc),
00618 next_info, out_p, out_stop - out_p);
00619 else {
00620 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
00621 next_info,
00622 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00623 writebuf_off = 0;
00624 while (writebuf_off < writebuf_len) {
00625 SUSPEND_OBUF(20);
00626 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00627 }
00628 }
00629 break;
00630 case FUNso:
00631 {
00632 const unsigned char *char_start;
00633 size_t char_len;
00634 SUSPEND_OBUF(14);
00635 if (tr->max_output <= out_stop - out_p) {
00636 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00637 out_p += tr->func_so(TRANSCODING_STATE(tc),
00638 char_start, (size_t)char_len,
00639 out_p, out_stop - out_p);
00640 }
00641 else {
00642 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00643 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
00644 char_start, (size_t)char_len,
00645 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00646 writebuf_off = 0;
00647 while (writebuf_off < writebuf_len) {
00648 SUSPEND_OBUF(22);
00649 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00650 }
00651 }
00652 break;
00653 }
00654 case FUNsio:
00655 {
00656 const unsigned char *char_start;
00657 size_t char_len;
00658 SUSPEND_OBUF(33);
00659 if (tr->max_output <= out_stop - out_p) {
00660 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00661 out_p += tr->func_sio(TRANSCODING_STATE(tc),
00662 char_start, (size_t)char_len, next_info,
00663 out_p, out_stop - out_p);
00664 }
00665 else {
00666 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00667 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
00668 char_start, (size_t)char_len, next_info,
00669 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00670 writebuf_off = 0;
00671 while (writebuf_off < writebuf_len) {
00672 SUSPEND_OBUF(34);
00673 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00674 }
00675 }
00676 break;
00677 }
/*
 * INVALID: decide how many bytes the error covers, in multiples of the
 * input unit length.  If the bytes seen so far fit within one unit, in_p is
 * advanced so the error covers a whole unit (or all remaining input when
 * less is available, waiting for more under ECONV_PARTIAL_INPUT).
 * Otherwise everything except the bytes after the last full unit boundary
 * is reported, and those trailing readagain_len bytes are scheduled to be
 * converted again.
 */
00678 case INVALID:
00679 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
00680 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
00681 SUSPEND_AFTER_OUTPUT(26);
00682 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
00683 in_p = in_stop;
00684 SUSPEND(econv_source_buffer_empty, 8);
00685 }
00686 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
00687 in_p = in_stop;
00688 }
00689 else {
00690 in_p = inchar_start + (unitlen - tc->recognized_len);
00691 }
00692 }
00693 else {
00694 ssize_t invalid_len;
00695 ssize_t discard_len;
00696 invalid_len = tc->recognized_len + (in_p - inchar_start);
00697 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
00698 readagain_len = invalid_len - discard_len;
00699 }
00700 goto invalid;
00701 case UNDEF:
00702 goto undef;
00703 default:
00704 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
00705 }
00706 continue;
00707
/* Error exits: report the condition; when the caller resumes, conversion continues with the next character. */
00708 invalid:
00709 SUSPEND(econv_invalid_byte_sequence, 1);
00710 continue;
00711
00712 incomplete:
00713 SUSPEND(econv_incomplete_input, 27);
00714 continue;
00715
00716 undef:
00717 SUSPEND(econv_undefined_conversion, 2);
00718 continue;
00719 }
00720
00721
/* Input finished: let the transcoder emit any trailing output, directly or through the write buffer. */
00722 if (tr->finish_func) {
00723 SUSPEND_OBUF(4);
00724 if (tr->max_output <= out_stop - out_p) {
00725 out_p += tr->finish_func(TRANSCODING_STATE(tc),
00726 out_p, out_stop - out_p);
00727 }
00728 else {
00729 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
00730 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00731 writebuf_off = 0;
00732 while (writebuf_off < writebuf_len) {
00733 SUSPEND_OBUF(23);
00734 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00735 }
00736 }
00737 }
/* Once finished, every further call reports econv_finished again. */
00738 while (1)
00739 SUSPEND(econv_finished, 6);
/* NOTE(review): SUSPEND_OBUF, SUSPEND_AFTER_OUTPUT and the BYTE_ADDR/BL_* helpers are not #undef'ed here -- confirm nothing later relies on or collides with them. */
00740 #undef SUSPEND
00741 #undef next_table
00742 #undef next_info
00743 #undef next_byte
00744 #undef writebuf_len
00745 #undef writebuf_off
00746 }
00747
00748 static rb_econv_result_t
00749 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
00750 const unsigned char *in_stop, unsigned char *out_stop,
00751 rb_transcoding *tc,
00752 const int opt)
00753 {
00754 if (tc->readagain_len) {
00755 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
00756 const unsigned char *readagain_pos = readagain_buf;
00757 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
00758 rb_econv_result_t res;
00759
00760 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
00761 unsigned char, tc->readagain_len);
00762 tc->readagain_len = 0;
00763 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
00764 if (res != econv_source_buffer_empty) {
00765 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
00766 readagain_pos, unsigned char, readagain_stop - readagain_pos);
00767 tc->readagain_len += readagain_stop - readagain_pos;
00768 return res;
00769 }
00770 }
00771 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
00772 }
00773
00774 static rb_transcoding *
00775 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
00776 {
00777 rb_transcoding *tc;
00778
00779 tc = ALLOC(rb_transcoding);
00780 tc->transcoder = tr;
00781 tc->flags = flags;
00782 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00783 tc->state.ptr = xmalloc(tr->state_size);
00784 if (tr->state_init_func) {
00785 (tr->state_init_func)(TRANSCODING_STATE(tc));
00786 }
00787 tc->resume_position = 0;
00788 tc->recognized_len = 0;
00789 tc->readagain_len = 0;
00790 tc->writebuf_len = 0;
00791 tc->writebuf_off = 0;
00792 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00793 tc->readbuf.ptr = xmalloc(tr->max_input);
00794 }
00795 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00796 tc->writebuf.ptr = xmalloc(tr->max_output);
00797 }
00798 return tc;
00799 }
00800
00801 static rb_econv_result_t
00802 rb_transcoding_convert(rb_transcoding *tc,
00803 const unsigned char **input_ptr, const unsigned char *input_stop,
00804 unsigned char **output_ptr, unsigned char *output_stop,
00805 int flags)
00806 {
00807 return transcode_restartable(
00808 input_ptr, output_ptr,
00809 input_stop, output_stop,
00810 tc, flags);
00811 }
00812
00813 static void
00814 rb_transcoding_close(rb_transcoding *tc)
00815 {
00816 const rb_transcoder *tr = tc->transcoder;
00817 if (tr->state_fini_func) {
00818 (tr->state_fini_func)(TRANSCODING_STATE(tc));
00819 }
00820 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00821 xfree(tc->state.ptr);
00822 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
00823 xfree(tc->readbuf.ptr);
00824 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
00825 xfree(tc->writebuf.ptr);
00826 xfree(tc);
00827 }
00828
00829 static size_t
00830 rb_transcoding_memsize(rb_transcoding *tc)
00831 {
00832 size_t size = sizeof(rb_transcoding);
00833 const rb_transcoder *tr = tc->transcoder;
00834
00835 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
00836 size += tr->state_size;
00837 }
00838 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00839 size += tr->max_input;
00840 }
00841 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00842 size += tr->max_output;
00843 }
00844 return size;
00845 }
00846
00847 static rb_econv_t *
00848 rb_econv_alloc(int n_hint)
00849 {
00850 rb_econv_t *ec;
00851
00852 if (n_hint <= 0)
00853 n_hint = 1;
00854
00855 ec = ALLOC(rb_econv_t);
00856 ec->flags = 0;
00857 ec->source_encoding_name = NULL;
00858 ec->destination_encoding_name = NULL;
00859 ec->started = 0;
00860 ec->replacement_str = NULL;
00861 ec->replacement_len = 0;
00862 ec->replacement_enc = NULL;
00863 ec->replacement_allocated = 0;
00864 ec->in_buf_start = NULL;
00865 ec->in_data_start = NULL;
00866 ec->in_data_end = NULL;
00867 ec->in_buf_end = NULL;
00868 ec->num_allocated = n_hint;
00869 ec->num_trans = 0;
00870 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
00871 ec->num_finished = 0;
00872 ec->last_tc = NULL;
00873 ec->last_error.result = econv_source_buffer_empty;
00874 ec->last_error.error_tc = NULL;
00875 ec->last_error.source_encoding = NULL;
00876 ec->last_error.destination_encoding = NULL;
00877 ec->last_error.error_bytes_start = NULL;
00878 ec->last_error.error_bytes_len = 0;
00879 ec->last_error.readagain_len = 0;
00880 ec->source_encoding = NULL;
00881 ec->destination_encoding = NULL;
00882 return ec;
00883 }
00884
00885 static int
00886 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
00887 {
00888 int n, j;
00889 int bufsize = 4096;
00890 unsigned char *p;
00891
00892 if (ec->num_trans == ec->num_allocated) {
00893 n = ec->num_allocated * 2;
00894 REALLOC_N(ec->elems, rb_econv_elem_t, n);
00895 ec->num_allocated = n;
00896 }
00897
00898 p = xmalloc(bufsize);
00899
00900 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
00901
00902 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
00903 ec->elems[i].out_buf_start = p;
00904 ec->elems[i].out_buf_end = p + bufsize;
00905 ec->elems[i].out_data_start = p;
00906 ec->elems[i].out_data_end = p;
00907 ec->elems[i].last_result = econv_source_buffer_empty;
00908
00909 ec->num_trans++;
00910
00911 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
00912 for (j = ec->num_trans-1; i <= j; j--) {
00913 rb_transcoding *tc = ec->elems[j].tc;
00914 const rb_transcoder *tr2 = tc->transcoder;
00915 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
00916 ec->last_tc = tc;
00917 break;
00918 }
00919 }
00920
00921 return 0;
00922 }
00923
00924 static rb_econv_t *
00925 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
00926 {
00927 rb_econv_t *ec;
00928 int i, ret;
00929
00930 for (i = 0; i < n; i++) {
00931 const rb_transcoder *tr;
00932 tr = load_transcoder_entry(entries[i]);
00933 if (!tr)
00934 return NULL;
00935 }
00936
00937 ec = rb_econv_alloc(n);
00938
00939 for (i = 0; i < n; i++) {
00940 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
00941 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
00942 if (ret == -1) {
00943 rb_econv_close(ec);
00944 return NULL;
00945 }
00946 }
00947
00948 return ec;
00949 }
00950
00951 struct trans_open_t {
00952 transcoder_entry_t **entries;
00953 int num_additional;
00954 };
00955
00956 static void
00957 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
00958 {
00959 struct trans_open_t *toarg = arg;
00960
00961 if (!toarg->entries) {
00962 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
00963 }
00964 toarg->entries[depth] = get_transcoder_entry(sname, dname);
00965 }
00966
00967 static rb_econv_t *
00968 rb_econv_open0(const char *sname, const char *dname, int ecflags)
00969 {
00970 transcoder_entry_t **entries = NULL;
00971 int num_trans;
00972 rb_econv_t *ec;
00973
00974 rb_encoding *senc, *denc;
00975 int sidx, didx;
00976
00977 senc = NULL;
00978 if (*sname) {
00979 sidx = rb_enc_find_index(sname);
00980 if (0 <= sidx) {
00981 senc = rb_enc_from_index(sidx);
00982 }
00983 }
00984
00985 denc = NULL;
00986 if (*dname) {
00987 didx = rb_enc_find_index(dname);
00988 if (0 <= didx) {
00989 denc = rb_enc_from_index(didx);
00990 }
00991 }
00992
00993 if (*sname == '\0' && *dname == '\0') {
00994 num_trans = 0;
00995 entries = NULL;
00996 }
00997 else {
00998 struct trans_open_t toarg;
00999 toarg.entries = NULL;
01000 toarg.num_additional = 0;
01001 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
01002 entries = toarg.entries;
01003 if (num_trans < 0) {
01004 xfree(entries);
01005 return NULL;
01006 }
01007 }
01008
01009 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
01010 xfree(entries);
01011 if (!ec)
01012 return NULL;
01013
01014 ec->flags = ecflags;
01015 ec->source_encoding_name = sname;
01016 ec->destination_encoding_name = dname;
01017
01018 return ec;
01019 }
01020
01021 #define MAX_ECFLAGS_DECORATORS 32
01022
01023 static int
01024 decorator_names(int ecflags, const char **decorators_ret)
01025 {
01026 int num_decorators;
01027
01028 if ((ecflags & ECONV_CRLF_NEWLINE_DECORATOR) &&
01029 (ecflags & ECONV_CR_NEWLINE_DECORATOR))
01030 return -1;
01031
01032 if ((ecflags & (ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR)) &&
01033 (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR))
01034 return -1;
01035
01036 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
01037 (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
01038 return -1;
01039
01040 num_decorators = 0;
01041
01042 if (ecflags & ECONV_XML_TEXT_DECORATOR)
01043 decorators_ret[num_decorators++] = "xml_text_escape";
01044 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
01045 decorators_ret[num_decorators++] = "xml_attr_content_escape";
01046 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
01047 decorators_ret[num_decorators++] = "xml_attr_quote";
01048
01049 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
01050 decorators_ret[num_decorators++] = "crlf_newline";
01051 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
01052 decorators_ret[num_decorators++] = "cr_newline";
01053 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
01054 decorators_ret[num_decorators++] = "universal_newline";
01055
01056 return num_decorators;
01057 }
01058
01059 rb_econv_t *
01060 rb_econv_open(const char *sname, const char *dname, int ecflags)
01061 {
01062 rb_econv_t *ec;
01063 int num_decorators;
01064 const char *decorators[MAX_ECFLAGS_DECORATORS];
01065 int i;
01066
01067 num_decorators = decorator_names(ecflags, decorators);
01068 if (num_decorators == -1)
01069 return NULL;
01070
01071 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
01072 if (!ec)
01073 return NULL;
01074
01075 for (i = 0; i < num_decorators; i++)
01076 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
01077 rb_econv_close(ec);
01078 return NULL;
01079 }
01080
01081 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
01082
01083 return ec;
01084 }
01085
/*
 * Run passes over the transcoder chain of ec, beginning at element index
 * `start`.
 *
 * Element 0 reads from the caller's input range; every later element
 * reads from the previous element's output buffer.  The last element
 * writes to the caller's output range; every earlier element writes to
 * its own output buffer.  Passes are repeated for as long as at least one
 * element moved its input or output pointer during the previous pass.
 *
 * Returns the index of the element that reported
 * econv_invalid_byte_sequence, econv_incomplete_input,
 * econv_undefined_conversion or econv_after_output, or -1 once a whole
 * pass made no progress.  Each element's result is also stored in its
 * last_result field.
 */
01086 static int
01087 trans_sweep(rb_econv_t *ec,
01088 const unsigned char **input_ptr, const unsigned char *input_stop,
01089 unsigned char **output_ptr, unsigned char *output_stop,
01090 int flags,
01091 int start)
01092 {
01093 int try;
01094 int i, f;
01095
01096 const unsigned char **ipp, *is, *iold;
01097 unsigned char **opp, *os, *oold;
01098 rb_econv_result_t res;
01099
01100 try = 1;
01101 while (try) {
01102 try = 0;
01103 for (i = start; i < ec->num_trans; i++) {
01104 rb_econv_elem_t *te = &ec->elems[i];
01105
/* Select the input range: caller's input for element 0, otherwise the previous element's pending output. */
01106 if (i == 0) {
01107 ipp = input_ptr;
01108 is = input_stop;
01109 }
01110 else {
01111 rb_econv_elem_t *prev_te = &ec->elems[i-1];
01112 ipp = (const unsigned char **)&prev_te->out_data_start;
01113 is = prev_te->out_data_end;
01114 }
01115
/* Select the output range: caller's output for the last element, otherwise this element's own buffer. */
01116 if (i == ec->num_trans-1) {
01117 opp = output_ptr;
01118 os = output_stop;
01119 }
01120 else {
/* Slide unread data to the front of the buffer so all free space is at the tail. */
01121 if (te->out_buf_start != te->out_data_start) {
01122 ssize_t len = te->out_data_end - te->out_data_start;
01123 ssize_t off = te->out_data_start - te->out_buf_start;
01124 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
01125 te->out_data_start = te->out_buf_start;
01126 te->out_data_end -= off;
01127 }
01128 opp = &te->out_data_end;
01129 os = te->out_buf_end;
01130 }
01131
/*
 * Per-element flags.  num_finished is set to i+1 when element i reports
 * econv_finished (see the switch below), so ECONV_PARTIAL_INPUT is added
 * unless num_finished equals this element's index.
 */
01132 f = flags;
01133 if (ec->num_finished != i)
01134 f |= ECONV_PARTIAL_INPUT;
/*
 * ECONV_AFTER_OUTPUT reaches element 0 only, and only once: after that,
 * later passes start at element 1 and the flag is cleared.
 */
01135 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
01136 start = 1;
01137 flags &= ~ECONV_AFTER_OUTPUT;
01138 }
01139 if (i != 0)
01140 f &= ~ECONV_AFTER_OUTPUT;
/* Remember both positions so progress can be detected after the call. */
01141 iold = *ipp;
01142 oold = *opp;
01143 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
01144 if (iold != *ipp || oold != *opp)
01145 try = 1;
01146
01147 switch (res) {
/* Results the caller must look at: stop the sweep and report the element index. */
01148 case econv_invalid_byte_sequence:
01149 case econv_incomplete_input:
01150 case econv_undefined_conversion:
01151 case econv_after_output:
01152 return i;
01153
01154 case econv_destination_buffer_full:
01155 case econv_source_buffer_empty:
01156 break;
01157
01158 case econv_finished:
01159 ec->num_finished = i+1;
01160 break;
01161 }
01162 }
01163 }
01164 return -1;
01165 }
01166
/*
 * Drive the transcoder chain of ec for one conversion step.
 *
 * A NULL input_ptr or output_ptr is replaced by a zero-length range.
 * The function decides where to resume from the last_result values left
 * by the previous call, sweeps the chain with trans_sweep(), and returns
 * the last_result of the last-most element whose result is not
 * econv_source_buffer_empty (or econv_source_buffer_empty if there is no
 * such element).
 *
 * If result_position_ptr is non-NULL it receives the index of the element
 * whose result is returned, or -1 when no element had anything to report.
 */
01167 static rb_econv_result_t
01168 rb_trans_conv(rb_econv_t *ec,
01169 const unsigned char **input_ptr, const unsigned char *input_stop,
01170 unsigned char **output_ptr, unsigned char *output_stop,
01171 int flags,
01172 int *result_position_ptr)
01173 {
01174 int i;
01175 int needreport_index;
01176 int sweep_start;
01177
/* Never read or written through: start and stop pointers are equal, so it serves as a zero-length range. */
01178 unsigned char empty_buf;
01179 unsigned char *empty_ptr = &empty_buf;
01180
01181 if (!input_ptr) {
01182 input_ptr = (const unsigned char **)&empty_ptr;
01183 input_stop = empty_ptr;
01184 }
01185
01186 if (!output_ptr) {
01187 output_ptr = &empty_ptr;
01188 output_stop = empty_ptr;
01189 }
01190
/* An econv_after_output left on element 0 is cleared before the resume scan below. */
01191 if (ec->elems[0].last_result == econv_after_output)
01192 ec->elems[0].last_result = econv_source_buffer_empty;
01193
/*
 * Scan from the last element backwards for one whose stored result is an
 * error, econv_after_output or econv_finished; the sweep resumes with the
 * element right after it.
 */
01194 needreport_index = -1;
01195 for (i = ec->num_trans-1; 0 <= i; i--) {
01196 switch (ec->elems[i].last_result) {
01197 case econv_invalid_byte_sequence:
01198 case econv_incomplete_input:
01199 case econv_undefined_conversion:
01200 case econv_after_output:
01201 case econv_finished:
01202 sweep_start = i+1;
01203 needreport_index = i;
01204 goto found_needreport;
01205
01206 case econv_destination_buffer_full:
01207 case econv_source_buffer_empty:
01208 break;
01209
01210 default:
01211 rb_bug("unexpected transcode last result");
01212 }
01213 }
01214
01215
01216
/*
 * No element to resume after.  If the last element stopped on a full
 * destination and the caller asked for ECONV_AFTER_OUTPUT, first run the
 * chain with no new input (ECONV_PARTIAL_INPUT set, ECONV_AFTER_OUTPUT
 * cleared); econv_source_buffer_empty from that run is reported as
 * econv_after_output.
 */
01217 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
01218 (flags & ECONV_AFTER_OUTPUT)) {
01219 rb_econv_result_t res;
01220
01221 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
01222 (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
01223 result_position_ptr);
01224
01225 if (res == econv_source_buffer_empty)
01226 return econv_after_output;
01227 return res;
01228 }
01229
01230 sweep_start = 0;
01231
01232 found_needreport:
01233
/*
 * Sweep repeatedly, each time restarting after the element that reported,
 * until nothing reports (-1) or the reporting element is the last one.
 */
01234 do {
01235 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
01236 sweep_start = needreport_index + 1;
01237 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
01238
/*
 * Return the result of the last-most element that has something other
 * than econv_source_buffer_empty.  Error and econv_after_output results
 * are reset to econv_source_buffer_empty as they are returned;
 * econv_finished and econv_destination_buffer_full are left in place.
 */
01239 for (i = ec->num_trans-1; 0 <= i; i--) {
01240 if (ec->elems[i].last_result != econv_source_buffer_empty) {
01241 rb_econv_result_t res = ec->elems[i].last_result;
01242 if (res == econv_invalid_byte_sequence ||
01243 res == econv_incomplete_input ||
01244 res == econv_undefined_conversion ||
01245 res == econv_after_output) {
01246 ec->elems[i].last_result = econv_source_buffer_empty;
01247 }
01248 if (result_position_ptr)
01249 *result_position_ptr = i;
01250 return res;
01251 }
01252 }
01253 if (result_position_ptr)
01254 *result_position_ptr = -1;
01255 return econv_source_buffer_empty;
01256 }
01257
/*
 * Perform one conversion step without any error recovery.
 *
 * The result is stored in ec->last_error.result.  For
 * econv_invalid_byte_sequence, econv_incomplete_input and
 * econv_undefined_conversion the remaining ec->last_error fields are
 * filled in from the transcoding that reported the problem.
 *
 * Unlike rb_econv_convert(), this function dereferences input_ptr and
 * output_ptr unconditionally, so both must be non-NULL.
 */
01258 static rb_econv_result_t
01259 rb_econv_convert0(rb_econv_t *ec,
01260 const unsigned char **input_ptr, const unsigned char *input_stop,
01261 unsigned char **output_ptr, unsigned char *output_stop,
01262 int flags)
01263 {
01264 rb_econv_result_t res;
01265 int result_position;
01266 int has_output = 0;
01267
01268 memset(&ec->last_error, 0, sizeof(ec->last_error));
01269
/* No transcoders at all: bytes are copied from input to output unchanged. */
01270 if (ec->num_trans == 0) {
01271 size_t len;
/*
 * Bytes pending in ec->in_buf go out first (rb_econv_insert_output()
 * stores inserted bytes there when there are no transcoders).
 */
01272 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
01273 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
01274 len = output_stop - *output_ptr;
01275 memcpy(*output_ptr, ec->in_data_start, len);
01276 *output_ptr = output_stop;
01277 ec->in_data_start += len;
01278 res = econv_destination_buffer_full;
01279 goto gotresult;
01280 }
01281 len = ec->in_data_end - ec->in_data_start;
01282 memcpy(*output_ptr, ec->in_data_start, len);
01283 *output_ptr += len;
01284 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
01285 if (flags & ECONV_AFTER_OUTPUT) {
01286 res = econv_after_output;
01287 goto gotresult;
01288 }
01289 }
/* Copy as much caller input as fits into the remaining output space. */
01290 if (output_stop - *output_ptr < input_stop - *input_ptr) {
01291 len = output_stop - *output_ptr;
01292 }
01293 else {
01294 len = input_stop - *input_ptr;
01295 }
/* With ECONV_AFTER_OUTPUT, copy a single byte and return right away. */
01296 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
01297 *(*output_ptr)++ = *(*input_ptr)++;
01298 res = econv_after_output;
01299 goto gotresult;
01300 }
01301 memcpy(*output_ptr, *input_ptr, len);
01302 *output_ptr += len;
01303 *input_ptr += len;
01304 if (*input_ptr != input_stop)
01305 res = econv_destination_buffer_full;
01306 else if (flags & ECONV_PARTIAL_INPUT)
01307 res = econv_source_buffer_empty;
01308 else
01309 res = econv_finished;
01310 goto gotresult;
01311 }
01312
/* Deliver output still sitting in the last element's own buffer before converting anything new. */
01313 if (ec->elems[ec->num_trans-1].out_data_start) {
01314 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
01315 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
01316 if (data_start != data_end) {
01317 size_t len;
01318 if (output_stop - *output_ptr < data_end - data_start) {
01319 len = output_stop - *output_ptr;
01320 memcpy(*output_ptr, data_start, len);
01321 *output_ptr = output_stop;
01322 ec->elems[ec->num_trans-1].out_data_start += len;
01323 res = econv_destination_buffer_full;
01324 goto gotresult;
01325 }
01326 len = data_end - data_start;
01327 memcpy(*output_ptr, data_start, len);
01328 *output_ptr += len;
01329 ec->elems[ec->num_trans-1].out_data_start =
01330 ec->elems[ec->num_trans-1].out_data_end =
01331 ec->elems[ec->num_trans-1].out_buf_start;
01332 has_output = 1;
01333 }
01334 }
01335
/* Bytes pending in ec->in_buf are converted, as partial input, before the caller's input. */
01336 if (ec->in_buf_start &&
01337 ec->in_data_start != ec->in_data_end) {
01338 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
01339 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
01340 if (res != econv_source_buffer_empty)
01341 goto gotresult;
01342 }
01343
/*
 * Output was already delivered above and the caller wants
 * ECONV_AFTER_OUTPUT: run with an empty input range so no caller input is
 * consumed, and report econv_after_output if nothing else comes up.
 */
01344 if (has_output &&
01345 (flags & ECONV_AFTER_OUTPUT) &&
01346 *input_ptr != input_stop) {
01347 input_stop = *input_ptr;
01348 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01349 if (res == econv_source_buffer_empty)
01350 res = econv_after_output;
01351 }
01352 else if ((flags & ECONV_AFTER_OUTPUT) ||
01353 ec->num_trans == 1) {
01354 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01355 }
01356 else {
/*
 * Several transcoders and no ECONV_AFTER_OUTPUT from the caller: the flag
 * is set internally and the chain is re-run until it returns something
 * other than econv_after_output.
 */
01357 flags |= ECONV_AFTER_OUTPUT;
01358 do {
01359 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01360 } while (res == econv_after_output);
01361 }
01362
01363 gotresult:
01364 ec->last_error.result = res;
/*
 * result_position is read only for the three error results; every goto
 * that skips rb_trans_conv() assigns a non-error result.
 */
01365 if (res == econv_invalid_byte_sequence ||
01366 res == econv_incomplete_input ||
01367 res == econv_undefined_conversion) {
01368 rb_transcoding *error_tc = ec->elems[result_position].tc;
01369 ec->last_error.error_tc = error_tc;
01370 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
01371 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
01372 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
01373 ec->last_error.error_bytes_len = error_tc->recognized_len;
01374 ec->last_error.readagain_len = error_tc->readagain_len;
01375 }
01376
01377 return res;
01378 }
01379
/* Forward declaration: rb_econv_convert() below calls this function; its definition is not in this part of the file. */
01380 static int output_replacement_character(rb_econv_t *ec);
01381
01382 static int
01383 output_hex_charref(rb_econv_t *ec)
01384 {
01385 int ret;
01386 unsigned char utfbuf[1024];
01387 const unsigned char *utf;
01388 size_t utf_len;
01389 int utf_allocated = 0;
01390 char charef_buf[16];
01391 const unsigned char *p;
01392
01393 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
01394 utf = ec->last_error.error_bytes_start;
01395 utf_len = ec->last_error.error_bytes_len;
01396 }
01397 else {
01398 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
01399 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
01400 utfbuf, sizeof(utfbuf),
01401 &utf_len);
01402 if (!utf)
01403 return -1;
01404 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
01405 utf_allocated = 1;
01406 }
01407
01408 if (utf_len % 4 != 0)
01409 goto fail;
01410
01411 p = utf;
01412 while (4 <= utf_len) {
01413 unsigned int u = 0;
01414 u += p[0] << 24;
01415 u += p[1] << 16;
01416 u += p[2] << 8;
01417 u += p[3];
01418 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
01419
01420 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
01421 if (ret == -1)
01422 goto fail;
01423
01424 p += 4;
01425 utf_len -= 4;
01426 }
01427
01428 if (utf_allocated)
01429 xfree((void *)utf);
01430 return 0;
01431
01432 fail:
01433 if (utf_allocated)
01434 xfree((void *)utf);
01435 return -1;
01436 }
01437
01438 rb_econv_result_t
01439 rb_econv_convert(rb_econv_t *ec,
01440 const unsigned char **input_ptr, const unsigned char *input_stop,
01441 unsigned char **output_ptr, unsigned char *output_stop,
01442 int flags)
01443 {
01444 rb_econv_result_t ret;
01445
01446 unsigned char empty_buf;
01447 unsigned char *empty_ptr = &empty_buf;
01448
01449 ec->started = 1;
01450
01451 if (!input_ptr) {
01452 input_ptr = (const unsigned char **)&empty_ptr;
01453 input_stop = empty_ptr;
01454 }
01455
01456 if (!output_ptr) {
01457 output_ptr = &empty_ptr;
01458 output_stop = empty_ptr;
01459 }
01460
01461 resume:
01462 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
01463
01464 if (ret == econv_invalid_byte_sequence ||
01465 ret == econv_incomplete_input) {
01466
01467
01468 switch (ec->flags & ECONV_INVALID_MASK) {
01469 case ECONV_INVALID_REPLACE:
01470 if (output_replacement_character(ec) == 0)
01471 goto resume;
01472 }
01473 }
01474
01475 if (ret == econv_undefined_conversion) {
01476
01477
01478
01479 switch (ec->flags & ECONV_UNDEF_MASK) {
01480 case ECONV_UNDEF_REPLACE:
01481 if (output_replacement_character(ec) == 0)
01482 goto resume;
01483 break;
01484
01485 case ECONV_UNDEF_HEX_CHARREF:
01486 if (output_hex_charref(ec) == 0)
01487 goto resume;
01488 break;
01489 }
01490 }
01491
01492 return ret;
01493 }
01494
01495 const char *
01496 rb_econv_encoding_to_insert_output(rb_econv_t *ec)
01497 {
01498 rb_transcoding *tc = ec->last_tc;
01499 const rb_transcoder *tr;
01500
01501 if (tc == NULL)
01502 return "";
01503
01504 tr = tc->transcoder;
01505
01506 if (tr->asciicompat_type == asciicompat_encoder)
01507 return tr->src_encoding;
01508 return tr->dst_encoding;
01509 }
01510
01511 static unsigned char *
01512 allocate_converted_string(const char *sname, const char *dname,
01513 const unsigned char *str, size_t len,
01514 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
01515 size_t *dst_len_ptr)
01516 {
01517 unsigned char *dst_str;
01518 size_t dst_len;
01519 size_t dst_bufsize;
01520
01521 rb_econv_t *ec;
01522 rb_econv_result_t res;
01523
01524 const unsigned char *sp;
01525 unsigned char *dp;
01526
01527 if (caller_dst_buf)
01528 dst_bufsize = caller_dst_bufsize;
01529 else if (len == 0)
01530 dst_bufsize = 1;
01531 else
01532 dst_bufsize = len;
01533
01534 ec = rb_econv_open(sname, dname, 0);
01535 if (ec == NULL)
01536 return NULL;
01537 if (caller_dst_buf)
01538 dst_str = caller_dst_buf;
01539 else
01540 dst_str = xmalloc(dst_bufsize);
01541 dst_len = 0;
01542 sp = str;
01543 dp = dst_str+dst_len;
01544 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01545 dst_len = dp - dst_str;
01546 while (res == econv_destination_buffer_full) {
01547 if (SIZE_MAX/2 < dst_bufsize) {
01548 goto fail;
01549 }
01550 dst_bufsize *= 2;
01551 if (dst_str == caller_dst_buf) {
01552 unsigned char *tmp;
01553 tmp = xmalloc(dst_bufsize);
01554 memcpy(tmp, dst_str, dst_bufsize/2);
01555 dst_str = tmp;
01556 }
01557 else {
01558 dst_str = xrealloc(dst_str, dst_bufsize);
01559 }
01560 dp = dst_str+dst_len;
01561 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01562 dst_len = dp - dst_str;
01563 }
01564 if (res != econv_finished) {
01565 goto fail;
01566 }
01567 rb_econv_close(ec);
01568 *dst_len_ptr = dst_len;
01569 return dst_str;
01570
01571 fail:
01572 if (dst_str != caller_dst_buf)
01573 xfree(dst_str);
01574 rb_econv_close(ec);
01575 return NULL;
01576 }
01577
01578
01579 int
01580 rb_econv_insert_output(rb_econv_t *ec,
01581 const unsigned char *str, size_t len, const char *str_encoding)
01582 {
01583 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
01584 unsigned char insert_buf[4096];
01585 const unsigned char *insert_str = NULL;
01586 size_t insert_len;
01587
01588 int last_trans_index;
01589 rb_transcoding *tc;
01590
01591 unsigned char **buf_start_p;
01592 unsigned char **data_start_p;
01593 unsigned char **data_end_p;
01594 unsigned char **buf_end_p;
01595
01596 size_t need;
01597
01598 ec->started = 1;
01599
01600 if (len == 0)
01601 return 0;
01602
01603 if (encoding_equal(insert_encoding, str_encoding)) {
01604 insert_str = str;
01605 insert_len = len;
01606 }
01607 else {
01608 insert_str = allocate_converted_string(str_encoding, insert_encoding,
01609 str, len, insert_buf, sizeof(insert_buf), &insert_len);
01610 if (insert_str == NULL)
01611 return -1;
01612 }
01613
01614 need = insert_len;
01615
01616 last_trans_index = ec->num_trans-1;
01617 if (ec->num_trans == 0) {
01618 tc = NULL;
01619 buf_start_p = &ec->in_buf_start;
01620 data_start_p = &ec->in_data_start;
01621 data_end_p = &ec->in_data_end;
01622 buf_end_p = &ec->in_buf_end;
01623 }
01624 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
01625 tc = ec->elems[last_trans_index].tc;
01626 need += tc->readagain_len;
01627 if (need < insert_len)
01628 goto fail;
01629 if (last_trans_index == 0) {
01630 buf_start_p = &ec->in_buf_start;
01631 data_start_p = &ec->in_data_start;
01632 data_end_p = &ec->in_data_end;
01633 buf_end_p = &ec->in_buf_end;
01634 }
01635 else {
01636 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
01637 buf_start_p = &ee->out_buf_start;
01638 data_start_p = &ee->out_data_start;
01639 data_end_p = &ee->out_data_end;
01640 buf_end_p = &ee->out_buf_end;
01641 }
01642 }
01643 else {
01644 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
01645 buf_start_p = &ee->out_buf_start;
01646 data_start_p = &ee->out_data_start;
01647 data_end_p = &ee->out_data_end;
01648 buf_end_p = &ee->out_buf_end;
01649 tc = ec->elems[last_trans_index].tc;
01650 }
01651
01652 if (*buf_start_p == NULL) {
01653 unsigned char *buf = xmalloc(need);
01654 *buf_start_p = buf;
01655 *data_start_p = buf;
01656 *data_end_p = buf;
01657 *buf_end_p = buf+need;
01658 }
01659 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
01660 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
01661 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
01662 *data_start_p = *buf_start_p;
01663 if ((size_t)(*buf_end_p - *data_end_p) < need) {
01664 unsigned char *buf;
01665 size_t s = (*data_end_p - *buf_start_p) + need;
01666 if (s < need)
01667 goto fail;
01668 buf = xrealloc(*buf_start_p, s);
01669 *data_start_p = buf;
01670 *data_end_p = buf + (*data_end_p - *buf_start_p);
01671 *buf_start_p = buf;
01672 *buf_end_p = buf + s;
01673 }
01674 }
01675
01676 memcpy(*data_end_p, insert_str, insert_len);
01677 *data_end_p += insert_len;
01678 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
01679 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
01680 *data_end_p += tc->readagain_len;
01681 tc->readagain_len = 0;
01682 }
01683
01684 if (insert_str != str && insert_str != insert_buf)
01685 xfree((void*)insert_str);
01686 return 0;
01687
01688 fail:
01689 if (insert_str != str && insert_str != insert_buf)
01690 xfree((void*)insert_str);
01691 return -1;
01692 }
01693
01694 void
01695 rb_econv_close(rb_econv_t *ec)
01696 {
01697 int i;
01698
01699 if (ec->replacement_allocated) {
01700 xfree((void *)ec->replacement_str);
01701 }
01702 for (i = 0; i < ec->num_trans; i++) {
01703 rb_transcoding_close(ec->elems[i].tc);
01704 if (ec->elems[i].out_buf_start)
01705 xfree(ec->elems[i].out_buf_start);
01706 }
01707 xfree(ec->in_buf_start);
01708 xfree(ec->elems);
01709 xfree(ec);
01710 }
01711
01712 size_t
01713 rb_econv_memsize(rb_econv_t *ec)
01714 {
01715 size_t size = sizeof(rb_econv_t);
01716 int i;
01717
01718 if (ec->replacement_allocated) {
01719 size += ec->replacement_len;
01720 }
01721 for (i = 0; i < ec->num_trans; i++) {
01722 size += rb_transcoding_memsize(ec->elems[i].tc);
01723
01724 if (ec->elems[i].out_buf_start) {
01725 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
01726 }
01727 }
01728 size += ec->in_buf_end - ec->in_buf_start;
01729 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
01730
01731 return size;
01732 }
01733
01734 int
01735 rb_econv_putbackable(rb_econv_t *ec)
01736 {
01737 if (ec->num_trans == 0)
01738 return 0;
01739 #if SIZEOF_SIZE_T > SIZEOF_INT
01740 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
01741 #endif
01742 return (int)ec->elems[0].tc->readagain_len;
01743 }
01744
01745 void
01746 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
01747 {
01748 rb_transcoding *tc;
01749 if (ec->num_trans == 0 || n == 0)
01750 return;
01751 tc = ec->elems[0].tc;
01752 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
01753 tc->readagain_len -= n;
01754 }
01755
/* Data shared between rb_econv_asciicompat_encoding() and its st_foreach callback asciicompat_encoding_i(). */
01756 struct asciicompat_encoding_t {
/* Result slot: the callback stores the destination encoding of the matching transcoder here. */
01757 const char *ascii_compat_name;
/* The encoding name being looked up; filled in by rb_econv_asciicompat_encoding(). */
01758 const char *ascii_incompat_name;
01759 };
01760
01761 static int
01762 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
01763 {
01764 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
01765 transcoder_entry_t *entry = (transcoder_entry_t *)val;
01766 const rb_transcoder *tr;
01767
01768 if (DECORATOR_P(entry->sname, entry->dname))
01769 return ST_CONTINUE;
01770 tr = load_transcoder_entry(entry);
01771 if (tr && tr->asciicompat_type == asciicompat_decoder) {
01772 data->ascii_compat_name = tr->dst_encoding;
01773 return ST_STOP;
01774 }
01775 return ST_CONTINUE;
01776 }
01777
01778 const char *
01779 rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
01780 {
01781 st_data_t v;
01782 st_table *table2;
01783 struct asciicompat_encoding_t data;
01784
01785 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
01786 return NULL;
01787 table2 = (st_table *)v;
01788
01789
01790
01791
01792
01793
01794
01795
01796 if (table2->num_entries != 1)
01797 return NULL;
01798
01799 data.ascii_incompat_name = ascii_incompat_name;
01800 data.ascii_compat_name = NULL;
01801 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
01802 return data.ascii_compat_name;
01803 }
01804
01805 VALUE
01806 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
01807 {
01808 unsigned const char *ss, *sp, *se;
01809 unsigned char *ds, *dp, *de;
01810 rb_econv_result_t res;
01811 int max_output;
01812
01813 if (NIL_P(dst)) {
01814 dst = rb_str_buf_new(len);
01815 if (ec->destination_encoding)
01816 rb_enc_associate(dst, ec->destination_encoding);
01817 }
01818
01819 if (ec->last_tc)
01820 max_output = ec->last_tc->transcoder->max_output;
01821 else
01822 max_output = 1;
01823
01824 res = econv_destination_buffer_full;
01825 while (res == econv_destination_buffer_full) {
01826 long dlen = RSTRING_LEN(dst);
01827 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
01828 unsigned long new_capa = (unsigned long)dlen + len + max_output;
01829 if (LONG_MAX < new_capa)
01830 rb_raise(rb_eArgError, "too long string");
01831 rb_str_resize(dst, new_capa);
01832 rb_str_set_len(dst, dlen);
01833 }
01834 ss = sp = (const unsigned char *)RSTRING_PTR(src) + off;
01835 se = ss + len;
01836 ds = (unsigned char *)RSTRING_PTR(dst);
01837 de = ds + rb_str_capacity(dst);
01838 dp = ds += dlen;
01839 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
01840 off += sp - ss;
01841 len -= sp - ss;
01842 rb_str_set_len(dst, dlen + (dp - ds));
01843 rb_econv_check_error(ec);
01844 }
01845
01846 return dst;
01847 }
01848
01849 VALUE
01850 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
01851 {
01852 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
01853 }
01854
01855 VALUE
01856 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
01857 {
01858 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
01859 }
01860
01861 VALUE
01862 rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
01863 {
01864 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
01865 }
01866
01867 static int
01868 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
01869 {
01870 transcoder_entry_t *entry;
01871 const rb_transcoder *tr;
01872
01873 if (ec->started != 0)
01874 return -1;
01875
01876 entry = get_transcoder_entry(sname, dname);
01877 if (!entry)
01878 return -1;
01879
01880 tr = load_transcoder_entry(entry);
01881
01882 return rb_econv_add_transcoder_at(ec, tr, n);
01883 }
01884
01885 static int
01886 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
01887 {
01888 return rb_econv_add_converter(ec, "", decorator_name, n);
01889 }
01890
01891 int
01892 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
01893 {
01894 const rb_transcoder *tr;
01895
01896 if (ec->num_trans == 0)
01897 return rb_econv_decorate_at(ec, decorator_name, 0);
01898
01899 tr = ec->elems[0].tc->transcoder;
01900
01901 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01902 tr->asciicompat_type == asciicompat_decoder)
01903 return rb_econv_decorate_at(ec, decorator_name, 1);
01904
01905 return rb_econv_decorate_at(ec, decorator_name, 0);
01906 }
01907
01908 int
01909 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
01910 {
01911 const rb_transcoder *tr;
01912
01913 if (ec->num_trans == 0)
01914 return rb_econv_decorate_at(ec, decorator_name, 0);
01915
01916 tr = ec->elems[ec->num_trans-1].tc->transcoder;
01917
01918 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01919 tr->asciicompat_type == asciicompat_encoder)
01920 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
01921
01922 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
01923 }
01924
01925 void
01926 rb_econv_binmode(rb_econv_t *ec)
01927 {
01928 const rb_transcoder *trs[3];
01929 int n, i, j;
01930 transcoder_entry_t *entry;
01931 int num_trans;
01932
01933 n = 0;
01934 if (ec->flags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
01935 entry = get_transcoder_entry("", "universal_newline");
01936 if (entry->transcoder)
01937 trs[n++] = entry->transcoder;
01938 }
01939 if (ec->flags & ECONV_CRLF_NEWLINE_DECORATOR) {
01940 entry = get_transcoder_entry("", "crlf_newline");
01941 if (entry->transcoder)
01942 trs[n++] = entry->transcoder;
01943 }
01944 if (ec->flags & ECONV_CR_NEWLINE_DECORATOR) {
01945 entry = get_transcoder_entry("", "cr_newline");
01946 if (entry->transcoder)
01947 trs[n++] = entry->transcoder;
01948 }
01949
01950 num_trans = ec->num_trans;
01951 j = 0;
01952 for (i = 0; i < num_trans; i++) {
01953 int k;
01954 for (k = 0; k < n; k++)
01955 if (trs[k] == ec->elems[i].tc->transcoder)
01956 break;
01957 if (k == n) {
01958 ec->elems[j] = ec->elems[i];
01959 j++;
01960 }
01961 else {
01962 rb_transcoding_close(ec->elems[i].tc);
01963 xfree(ec->elems[i].out_buf_start);
01964 ec->num_trans--;
01965 }
01966 }
01967
01968 ec->flags &= ~(ECONV_UNIVERSAL_NEWLINE_DECORATOR|ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR);
01969
01970 }
01971
01972 static VALUE
01973 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
01974 {
01975 int has_description = 0;
01976
01977 if (NIL_P(mesg))
01978 mesg = rb_str_new(NULL, 0);
01979
01980 if (*sname != '\0' || *dname != '\0') {
01981 if (*sname == '\0')
01982 rb_str_cat2(mesg, dname);
01983 else if (*dname == '\0')
01984 rb_str_cat2(mesg, sname);
01985 else
01986 rb_str_catf(mesg, "%s to %s", sname, dname);
01987 has_description = 1;
01988 }
01989
01990 if (ecflags & (ECONV_UNIVERSAL_NEWLINE_DECORATOR|
01991 ECONV_CRLF_NEWLINE_DECORATOR|
01992 ECONV_CR_NEWLINE_DECORATOR|
01993 ECONV_XML_TEXT_DECORATOR|
01994 ECONV_XML_ATTR_CONTENT_DECORATOR|
01995 ECONV_XML_ATTR_QUOTE_DECORATOR)) {
01996 const char *pre = "";
01997 if (has_description)
01998 rb_str_cat2(mesg, " with ");
01999 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
02000 rb_str_cat2(mesg, pre); pre = ",";
02001 rb_str_cat2(mesg, "universal_newline");
02002 }
02003 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
02004 rb_str_cat2(mesg, pre); pre = ",";
02005 rb_str_cat2(mesg, "crlf_newline");
02006 }
02007 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
02008 rb_str_cat2(mesg, pre); pre = ",";
02009 rb_str_cat2(mesg, "cr_newline");
02010 }
02011 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
02012 rb_str_cat2(mesg, pre); pre = ",";
02013 rb_str_cat2(mesg, "xml_text");
02014 }
02015 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
02016 rb_str_cat2(mesg, pre); pre = ",";
02017 rb_str_cat2(mesg, "xml_attr_content");
02018 }
02019 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
02020 rb_str_cat2(mesg, pre); pre = ",";
02021 rb_str_cat2(mesg, "xml_attr_quote");
02022 }
02023 has_description = 1;
02024 }
02025 if (!has_description) {
02026 rb_str_cat2(mesg, "no-conversion");
02027 }
02028
02029 return mesg;
02030 }
02031
02032 VALUE
02033 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
02034 {
02035 VALUE mesg, exc;
02036 mesg = rb_str_new_cstr("code converter not found (");
02037 econv_description(sname, dname, ecflags, mesg);
02038 rb_str_cat2(mesg, ")");
02039 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
02040 return exc;
02041 }
02042
02043 static VALUE
02044 make_econv_exception(rb_econv_t *ec)
02045 {
02046 VALUE mesg, exc;
02047 if (ec->last_error.result == econv_invalid_byte_sequence ||
02048 ec->last_error.result == econv_incomplete_input) {
02049 const char *err = (const char *)ec->last_error.error_bytes_start;
02050 size_t error_len = ec->last_error.error_bytes_len;
02051 VALUE bytes = rb_str_new(err, error_len);
02052 VALUE dumped = rb_str_dump(bytes);
02053 size_t readagain_len = ec->last_error.readagain_len;
02054 VALUE bytes2 = Qnil;
02055 VALUE dumped2;
02056 int idx;
02057 if (ec->last_error.result == econv_incomplete_input) {
02058 mesg = rb_sprintf("incomplete %s on %s",
02059 StringValueCStr(dumped),
02060 ec->last_error.source_encoding);
02061 }
02062 else if (readagain_len) {
02063 bytes2 = rb_str_new(err+error_len, readagain_len);
02064 dumped2 = rb_str_dump(bytes2);
02065 mesg = rb_sprintf("%s followed by %s on %s",
02066 StringValueCStr(dumped),
02067 StringValueCStr(dumped2),
02068 ec->last_error.source_encoding);
02069 }
02070 else {
02071 mesg = rb_sprintf("%s on %s",
02072 StringValueCStr(dumped),
02073 ec->last_error.source_encoding);
02074 }
02075
02076 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
02077 rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
02078 rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
02079 rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
02080
02081 set_encs:
02082 rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
02083 rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
02084 idx = rb_enc_find_index(ec->last_error.source_encoding);
02085 if (0 <= idx)
02086 rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02087 idx = rb_enc_find_index(ec->last_error.destination_encoding);
02088 if (0 <= idx)
02089 rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02090 return exc;
02091 }
02092 if (ec->last_error.result == econv_undefined_conversion) {
02093 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
02094 ec->last_error.error_bytes_len);
02095 VALUE dumped = Qnil;
02096 int idx;
02097 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
02098 rb_encoding *utf8 = rb_utf8_encoding();
02099 const char *start, *end;
02100 int n;
02101 start = (const char *)ec->last_error.error_bytes_start;
02102 end = start + ec->last_error.error_bytes_len;
02103 n = rb_enc_precise_mbclen(start, end, utf8);
02104 if (MBCLEN_CHARFOUND_P(n) &&
02105 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
02106 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
02107 dumped = rb_sprintf("U+%04X", cc);
02108 }
02109 }
02110 if (dumped == Qnil)
02111 dumped = rb_str_dump(bytes);
02112 if (strcmp(ec->last_error.source_encoding,
02113 ec->source_encoding_name) == 0 &&
02114 strcmp(ec->last_error.destination_encoding,
02115 ec->destination_encoding_name) == 0) {
02116 mesg = rb_sprintf("%s from %s to %s",
02117 StringValueCStr(dumped),
02118 ec->last_error.source_encoding,
02119 ec->last_error.destination_encoding);
02120 }
02121 else {
02122 int i;
02123 mesg = rb_sprintf("%s to %s in conversion from %s",
02124 StringValueCStr(dumped),
02125 ec->last_error.destination_encoding,
02126 ec->source_encoding_name);
02127 for (i = 0; i < ec->num_trans; i++) {
02128 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
02129 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
02130 rb_str_catf(mesg, " to %s",
02131 ec->elems[i].tc->transcoder->dst_encoding);
02132 }
02133 }
02134 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
02135 idx = rb_enc_find_index(ec->last_error.source_encoding);
02136 if (0 <= idx)
02137 rb_enc_associate_index(bytes, idx);
02138 rb_ivar_set(exc, rb_intern("error_char"), bytes);
02139 goto set_encs;
02140 }
02141 return Qnil;
02142 }
02143
02144 static void
02145 more_output_buffer(
02146 VALUE destination,
02147 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02148 int max_output,
02149 unsigned char **out_start_ptr,
02150 unsigned char **out_pos,
02151 unsigned char **out_stop_ptr)
02152 {
02153 size_t len = (*out_pos - *out_start_ptr);
02154 size_t new_len = (len + max_output) * 2;
02155 *out_start_ptr = resize_destination(destination, len, new_len);
02156 *out_pos = *out_start_ptr + len;
02157 *out_stop_ptr = *out_start_ptr + new_len;
02158 }
02159
02160 static int
02161 make_replacement(rb_econv_t *ec)
02162 {
02163 rb_transcoding *tc;
02164 const rb_transcoder *tr;
02165 rb_encoding *enc;
02166 const unsigned char *replacement;
02167 const char *repl_enc;
02168 const char *ins_enc;
02169 size_t len;
02170
02171 if (ec->replacement_str)
02172 return 0;
02173
02174 ins_enc = rb_econv_encoding_to_insert_output(ec);
02175
02176 tc = ec->last_tc;
02177 if (*ins_enc) {
02178 tr = tc->transcoder;
02179 enc = rb_enc_find(tr->dst_encoding);
02180 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
02181 }
02182 else {
02183 replacement = (unsigned char *)"?";
02184 len = 1;
02185 repl_enc = "";
02186 }
02187
02188 ec->replacement_str = replacement;
02189 ec->replacement_len = len;
02190 ec->replacement_enc = repl_enc;
02191 ec->replacement_allocated = 0;
02192 return 0;
02193 }
02194
02195 int
02196 rb_econv_set_replacement(rb_econv_t *ec,
02197 const unsigned char *str, size_t len, const char *encname)
02198 {
02199 unsigned char *str2;
02200 size_t len2;
02201 const char *encname2;
02202
02203 encname2 = rb_econv_encoding_to_insert_output(ec);
02204
02205 if (encoding_equal(encname, encname2)) {
02206 str2 = xmalloc(len);
02207 MEMCPY(str2, str, unsigned char, len);
02208 len2 = len;
02209 encname2 = encname;
02210 }
02211 else {
02212 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
02213 if (!str2)
02214 return -1;
02215 }
02216
02217 if (ec->replacement_allocated) {
02218 xfree((void *)ec->replacement_str);
02219 }
02220 ec->replacement_allocated = 1;
02221 ec->replacement_str = str2;
02222 ec->replacement_len = len2;
02223 ec->replacement_enc = encname2;
02224 return 0;
02225 }
02226
02227 static int
02228 output_replacement_character(rb_econv_t *ec)
02229 {
02230 int ret;
02231
02232 if (make_replacement(ec) == -1)
02233 return -1;
02234
02235 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
02236 if (ret == -1)
02237 return -1;
02238
02239 return 0;
02240 }
02241
02242 #if 1
02243 static void
02244 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02245 const unsigned char *in_stop, unsigned char *out_stop,
02246 VALUE destination,
02247 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02248 const char *src_encoding,
02249 const char *dst_encoding,
02250 int ecflags,
02251 VALUE ecopts)
02252 {
02253 rb_econv_t *ec;
02254 rb_transcoding *last_tc;
02255 rb_econv_result_t ret;
02256 unsigned char *out_start = *out_pos;
02257 int max_output;
02258 VALUE exc;
02259 VALUE fallback = Qnil;
02260
02261 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02262 if (!ec)
02263 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02264
02265 if (!NIL_P(ecopts) && TYPE(ecopts) == T_HASH)
02266 fallback = rb_hash_aref(ecopts, sym_fallback);
02267 last_tc = ec->last_tc;
02268 max_output = last_tc ? last_tc->transcoder->max_output : 1;
02269
02270 resume:
02271 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
02272
02273 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
02274 VALUE rep = rb_enc_str_new(
02275 (const char *)ec->last_error.error_bytes_start,
02276 ec->last_error.error_bytes_len,
02277 rb_enc_find(ec->last_error.source_encoding));
02278 rep = rb_hash_lookup2(fallback, rep, Qundef);
02279 if (rep != Qundef) {
02280 StringValue(rep);
02281 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
02282 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
02283 if ((int)ret == -1) {
02284 rb_raise(rb_eArgError, "too big fallback string");
02285 }
02286 goto resume;
02287 }
02288 }
02289
02290 if (ret == econv_invalid_byte_sequence ||
02291 ret == econv_incomplete_input ||
02292 ret == econv_undefined_conversion) {
02293 exc = make_econv_exception(ec);
02294 rb_econv_close(ec);
02295 rb_exc_raise(exc);
02296 }
02297
02298 if (ret == econv_destination_buffer_full) {
02299 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02300 goto resume;
02301 }
02302
02303 rb_econv_close(ec);
02304 return;
02305 }
02306 #else
02307
02308 static void
02309 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02310 const unsigned char *in_stop, unsigned char *out_stop,
02311 VALUE destination,
02312 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02313 const char *src_encoding,
02314 const char *dst_encoding,
02315 int ecflags,
02316 VALUE ecopts)
02317 {
02318 rb_econv_t *ec;
02319 rb_transcoding *last_tc;
02320 rb_econv_result_t ret;
02321 unsigned char *out_start = *out_pos;
02322 const unsigned char *ptr;
02323 int max_output;
02324 VALUE exc;
02325
02326 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02327 if (!ec)
02328 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02329
02330 last_tc = ec->last_tc;
02331 max_output = last_tc ? last_tc->transcoder->max_output : 1;
02332
02333 ret = econv_source_buffer_empty;
02334 ptr = *in_pos;
02335 while (ret != econv_finished) {
02336 unsigned char input_byte;
02337 const unsigned char *p = &input_byte;
02338
02339 if (ret == econv_source_buffer_empty) {
02340 if (ptr < in_stop) {
02341 input_byte = *ptr;
02342 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02343 }
02344 else {
02345 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
02346 }
02347 }
02348 else {
02349 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02350 }
02351 if (&input_byte != p)
02352 ptr += p - &input_byte;
02353 switch (ret) {
02354 case econv_invalid_byte_sequence:
02355 case econv_incomplete_input:
02356 case econv_undefined_conversion:
02357 exc = make_econv_exception(ec);
02358 rb_econv_close(ec);
02359 rb_exc_raise(exc);
02360 break;
02361
02362 case econv_destination_buffer_full:
02363 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02364 break;
02365
02366 case econv_source_buffer_empty:
02367 break;
02368
02369 case econv_finished:
02370 break;
02371 }
02372 }
02373 rb_econv_close(ec);
02374 *in_pos = in_stop;
02375 return;
02376 }
02377 #endif
02378
02379
02380
02381
02382
02383
02384 static unsigned char *
02385 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
02386 {
02387 rb_str_resize(destination, new_len);
02388 return (unsigned char *)RSTRING_PTR(destination);
02389 }
02390
02391 static int
02392 econv_opts(VALUE opt)
02393 {
02394 VALUE v;
02395 int ecflags = 0;
02396
02397 v = rb_hash_aref(opt, sym_invalid);
02398 if (NIL_P(v)) {
02399 }
02400 else if (v==sym_replace) {
02401 ecflags |= ECONV_INVALID_REPLACE;
02402 }
02403 else {
02404 rb_raise(rb_eArgError, "unknown value for invalid character option");
02405 }
02406
02407 v = rb_hash_aref(opt, sym_undef);
02408 if (NIL_P(v)) {
02409 }
02410 else if (v==sym_replace) {
02411 ecflags |= ECONV_UNDEF_REPLACE;
02412 }
02413 else {
02414 rb_raise(rb_eArgError, "unknown value for undefined character option");
02415 }
02416
02417 v = rb_hash_aref(opt, sym_replace);
02418 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
02419 ecflags |= ECONV_UNDEF_REPLACE;
02420 }
02421
02422 v = rb_hash_aref(opt, sym_xml);
02423 if (!NIL_P(v)) {
02424 if (v==sym_text) {
02425 ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02426 }
02427 else if (v==sym_attr) {
02428 ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02429 }
02430 else if (TYPE(v) == T_SYMBOL) {
02431 rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
02432 }
02433 else {
02434 rb_raise(rb_eArgError, "unexpected value for xml option");
02435 }
02436 }
02437
02438 v = rb_hash_aref(opt, sym_universal_newline);
02439 if (RTEST(v))
02440 ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02441
02442 v = rb_hash_aref(opt, sym_crlf_newline);
02443 if (RTEST(v))
02444 ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02445
02446 v = rb_hash_aref(opt, sym_cr_newline);
02447 if (RTEST(v))
02448 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
02449
02450 return ecflags;
02451 }
02452
02453 int
02454 rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
02455 {
02456 int ecflags;
02457 VALUE newhash = Qnil;
02458 VALUE v;
02459
02460 if (NIL_P(opthash)) {
02461 *opts = Qnil;
02462 return 0;
02463 }
02464 ecflags = econv_opts(opthash);
02465
02466 v = rb_hash_aref(opthash, sym_replace);
02467 if (!NIL_P(v)) {
02468 StringValue(v);
02469 if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
02470 VALUE dumped = rb_str_dump(v);
02471 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
02472 StringValueCStr(dumped),
02473 rb_enc_name(rb_enc_get(v)));
02474 }
02475 v = rb_str_new_frozen(v);
02476 newhash = rb_hash_new();
02477 rb_hash_aset(newhash, sym_replace, v);
02478 }
02479
02480 v = rb_hash_aref(opthash, sym_fallback);
02481 if (!NIL_P(v)) {
02482 v = rb_convert_type(v, T_HASH, "Hash", "to_hash");
02483 if (!NIL_P(v)) {
02484 if (NIL_P(newhash))
02485 newhash = rb_hash_new();
02486 rb_hash_aset(newhash, sym_fallback, v);
02487 }
02488 }
02489
02490 if (!NIL_P(newhash))
02491 rb_hash_freeze(newhash);
02492 *opts = newhash;
02493
02494 return ecflags;
02495 }
02496
02497 rb_econv_t *
02498 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
02499 {
02500 rb_econv_t *ec;
02501 VALUE replacement;
02502
02503 if (NIL_P(opthash)) {
02504 replacement = Qnil;
02505 }
02506 else {
02507 if (TYPE(opthash) != T_HASH || !OBJ_FROZEN(opthash))
02508 rb_bug("rb_econv_open_opts called with invalid opthash");
02509 replacement = rb_hash_aref(opthash, sym_replace);
02510 }
02511
02512 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
02513 if (!ec)
02514 return ec;
02515
02516 if (!NIL_P(replacement)) {
02517 int ret;
02518 rb_encoding *enc = rb_enc_get(replacement);
02519
02520 ret = rb_econv_set_replacement(ec,
02521 (const unsigned char *)RSTRING_PTR(replacement),
02522 RSTRING_LEN(replacement),
02523 rb_enc_name(enc));
02524 if (ret == -1) {
02525 rb_econv_close(ec);
02526 return NULL;
02527 }
02528 }
02529 return ec;
02530 }
02531
02532 static int
02533 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
02534 {
02535 rb_encoding *enc;
02536 const char *n;
02537 int encidx;
02538 VALUE encval;
02539
02540 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
02541 !(enc = rb_enc_from_index(encidx))) {
02542 enc = NULL;
02543 encidx = 0;
02544 n = StringValueCStr(*arg);
02545 }
02546 else {
02547 n = rb_enc_name(enc);
02548 }
02549
02550 *name_p = n;
02551 *enc_p = enc;
02552
02553 return encidx;
02554 }
02555
02556 static int
02557 str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
02558 const char **sname_p, rb_encoding **senc_p,
02559 const char **dname_p, rb_encoding **denc_p)
02560 {
02561 rb_encoding *senc, *denc;
02562 const char *sname, *dname;
02563 int sencidx, dencidx;
02564
02565 dencidx = enc_arg(arg1, &dname, &denc);
02566
02567 if (NIL_P(*arg2)) {
02568 sencidx = rb_enc_get_index(str);
02569 senc = rb_enc_from_index(sencidx);
02570 sname = rb_enc_name(senc);
02571 }
02572 else {
02573 sencidx = enc_arg(arg2, &sname, &senc);
02574 }
02575
02576 *sname_p = sname;
02577 *senc_p = senc;
02578 *dname_p = dname;
02579 *denc_p = denc;
02580 return dencidx;
02581 }
02582
02583 static int
02584 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
02585 {
02586 VALUE dest;
02587 VALUE str = *self;
02588 volatile VALUE arg1, arg2;
02589 long blen, slen;
02590 unsigned char *buf, *bp, *sp;
02591 const unsigned char *fromp;
02592 rb_encoding *senc, *denc;
02593 const char *sname, *dname;
02594 int dencidx;
02595
02596 if (argc <0 || argc > 2) {
02597 rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
02598 }
02599
02600 if (argc == 0) {
02601 arg1 = rb_enc_default_internal();
02602 if (NIL_P(arg1)) {
02603 if (!ecflags) return -1;
02604 arg1 = rb_obj_encoding(str);
02605 }
02606 ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE;
02607 }
02608 else {
02609 arg1 = argv[0];
02610 }
02611 arg2 = argc<=1 ? Qnil : argv[1];
02612 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
02613
02614 if ((ecflags & (ECONV_UNIVERSAL_NEWLINE_DECORATOR|
02615 ECONV_CRLF_NEWLINE_DECORATOR|
02616 ECONV_CR_NEWLINE_DECORATOR|
02617 ECONV_XML_TEXT_DECORATOR|
02618 ECONV_XML_ATTR_CONTENT_DECORATOR|
02619 ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
02620 if (senc && senc == denc) {
02621 return NIL_P(arg2) ? -1 : dencidx;
02622 }
02623 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
02624 if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02625 return dencidx;
02626 }
02627 }
02628 if (encoding_equal(sname, dname)) {
02629 return NIL_P(arg2) ? -1 : dencidx;
02630 }
02631 }
02632 else {
02633 if (encoding_equal(sname, dname)) {
02634 sname = "";
02635 dname = "";
02636 }
02637 }
02638
02639 fromp = sp = (unsigned char *)RSTRING_PTR(str);
02640 slen = RSTRING_LEN(str);
02641 blen = slen + 30;
02642 dest = rb_str_tmp_new(blen);
02643 bp = (unsigned char *)RSTRING_PTR(dest);
02644
02645 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
02646 if (fromp != sp+slen) {
02647 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
02648 }
02649 buf = (unsigned char *)RSTRING_PTR(dest);
02650 *bp = '\0';
02651 rb_str_set_len(dest, bp - buf);
02652
02653
02654 if (!denc) {
02655 dencidx = rb_define_dummy_encoding(dname);
02656 }
02657 *self = dest;
02658
02659 return dencidx;
02660 }
02661
02662 static int
02663 str_transcode(int argc, VALUE *argv, VALUE *self)
02664 {
02665 VALUE opt;
02666 int ecflags = 0;
02667 VALUE ecopts = Qnil;
02668
02669 if (0 < argc) {
02670 opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
02671 if (!NIL_P(opt)) {
02672 argc--;
02673 ecflags = rb_econv_prepare_opts(opt, &ecopts);
02674 }
02675 }
02676 return str_transcode0(argc, argv, self, ecflags, ecopts);
02677 }
02678
02679 static inline VALUE
02680 str_encode_associate(VALUE str, int encidx)
02681 {
02682 int cr = 0;
02683
02684 rb_enc_associate_index(str, encidx);
02685
02686
02687 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
02688 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
02689 }
02690 else {
02691 cr = ENC_CODERANGE_VALID;
02692 }
02693 ENC_CODERANGE_SET(str, cr);
02694 return str;
02695 }
02696
02697
02698
02699
02700
02701
02702
02703
02704
02705
02706
02707
02708
02709
02710
02711 static VALUE
02712 str_encode_bang(int argc, VALUE *argv, VALUE str)
02713 {
02714 VALUE newstr;
02715 int encidx;
02716
02717 if (OBJ_FROZEN(str)) {
02718 rb_raise(rb_eRuntimeError, "string frozen");
02719 }
02720
02721 newstr = str;
02722 encidx = str_transcode(argc, argv, &newstr);
02723
02724 if (encidx < 0) return str;
02725 rb_str_shared_replace(str, newstr);
02726 return str_encode_associate(str, encidx);
02727 }
02728
02729
02730
02731
02732
02733
02734
02735
02736
02737
02738
02739
02740
02741
02742
02743
02744
02745
02746
02747
02748
02749
02750
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763
02764
02765
02766
02767
02768
02769
02770
02771
02772
02773
02774
02775
02776
02777
02778
02779
02780
02781
02782
02783 static VALUE
02784 str_encode(int argc, VALUE *argv, VALUE str)
02785 {
02786 VALUE newstr = str;
02787 int encidx = str_transcode(argc, argv, &newstr);
02788
02789 if (encidx < 0) return rb_str_dup(str);
02790 if (newstr == str) {
02791 newstr = rb_str_dup(str);
02792 }
02793 else {
02794 RBASIC(newstr)->klass = rb_obj_class(str);
02795 }
02796 return str_encode_associate(newstr, encidx);
02797 }
02798
02799 VALUE
02800 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
02801 {
02802 int argc = 1;
02803 VALUE *argv = &to;
02804 VALUE newstr = str;
02805 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
02806
02807 if (encidx < 0) return rb_str_dup(str);
02808 if (newstr == str) {
02809 newstr = rb_str_dup(str);
02810 }
02811 else {
02812 RBASIC(newstr)->klass = rb_obj_class(str);
02813 }
02814 return str_encode_associate(newstr, encidx);
02815 }
02816
02817 static void
02818 econv_free(void *ptr)
02819 {
02820 rb_econv_t *ec = ptr;
02821 rb_econv_close(ec);
02822 }
02823
02824 static size_t
02825 econv_memsize(const void *ptr)
02826 {
02827 return ptr ? sizeof(rb_econv_t) : 0;
02828 }
02829
02830 static const rb_data_type_t econv_data_type = {
02831 "econv",
02832 NULL, econv_free, econv_memsize,
02833 };
02834
02835 static VALUE
02836 econv_s_allocate(VALUE klass)
02837 {
02838 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
02839 }
02840
02841 static rb_encoding *
02842 make_dummy_encoding(const char *name)
02843 {
02844 rb_encoding *enc;
02845 int idx;
02846 idx = rb_define_dummy_encoding(name);
02847 enc = rb_enc_from_index(idx);
02848 return enc;
02849 }
02850
02851 static rb_encoding *
02852 make_encoding(const char *name)
02853 {
02854 rb_encoding *enc;
02855 enc = rb_enc_find(name);
02856 if (!enc)
02857 enc = make_dummy_encoding(name);
02858 return enc;
02859 }
02860
02861 static VALUE
02862 make_encobj(const char *name)
02863 {
02864 return rb_enc_from_encoding(make_encoding(name));
02865 }
02866
02867
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885 static VALUE
02886 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
02887 {
02888 const char *arg_name, *result_name;
02889 rb_encoding *arg_enc, *result_enc;
02890
02891 enc_arg(&arg, &arg_name, &arg_enc);
02892
02893 result_name = rb_econv_asciicompat_encoding(arg_name);
02894
02895 if (result_name == NULL)
02896 return Qnil;
02897
02898 result_enc = make_encoding(result_name);
02899
02900 return rb_enc_from_encoding(result_enc);
02901 }
02902
02903 static void
02904 econv_args(int argc, VALUE *argv,
02905 volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
02906 const char **sname_p, const char **dname_p,
02907 rb_encoding **senc_p, rb_encoding **denc_p,
02908 int *ecflags_p,
02909 VALUE *ecopts_p)
02910 {
02911 VALUE opt, opthash, flags_v, ecopts;
02912 int sidx, didx;
02913 const char *sname, *dname;
02914 rb_encoding *senc, *denc;
02915 int ecflags;
02916
02917 rb_scan_args(argc, argv, "21", snamev_p, dnamev_p, &opt);
02918
02919 if (NIL_P(opt)) {
02920 ecflags = 0;
02921 ecopts = Qnil;
02922 }
02923 else if (!NIL_P(flags_v = rb_check_to_integer(opt, "to_int"))) {
02924 ecflags = NUM2INT(flags_v);
02925 ecopts = Qnil;
02926 }
02927 else {
02928 opthash = rb_convert_type(opt, T_HASH, "Hash", "to_hash");
02929 ecflags = rb_econv_prepare_opts(opthash, &ecopts);
02930 }
02931
02932 senc = NULL;
02933 sidx = rb_to_encoding_index(*snamev_p);
02934 if (0 <= sidx) {
02935 senc = rb_enc_from_index(sidx);
02936 }
02937 else {
02938 StringValue(*snamev_p);
02939 }
02940
02941 denc = NULL;
02942 didx = rb_to_encoding_index(*dnamev_p);
02943 if (0 <= didx) {
02944 denc = rb_enc_from_index(didx);
02945 }
02946 else {
02947 StringValue(*dnamev_p);
02948 }
02949
02950 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
02951 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
02952
02953 *sname_p = sname;
02954 *dname_p = dname;
02955 *senc_p = senc;
02956 *denc_p = denc;
02957 *ecflags_p = ecflags;
02958 *ecopts_p = ecopts;
02959 }
02960
02961 static int
02962 decorate_convpath(VALUE convpath, int ecflags)
02963 {
02964 int num_decorators;
02965 const char *decorators[MAX_ECFLAGS_DECORATORS];
02966 int i;
02967 int n, len;
02968
02969 num_decorators = decorator_names(ecflags, decorators);
02970 if (num_decorators == -1)
02971 return -1;
02972
02973 len = n = RARRAY_LENINT(convpath);
02974 if (n != 0) {
02975 VALUE pair = RARRAY_PTR(convpath)[n-1];
02976 if (TYPE(pair) == T_ARRAY) {
02977 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0]));
02978 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1]));
02979 transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
02980 const rb_transcoder *tr = load_transcoder_entry(entry);
02981 if (!tr)
02982 return -1;
02983 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
02984 tr->asciicompat_type == asciicompat_encoder) {
02985 n--;
02986 rb_ary_store(convpath, len + num_decorators - 1, pair);
02987 }
02988 }
02989 else {
02990 rb_ary_store(convpath, len + num_decorators - 1, pair);
02991 }
02992 }
02993
02994 for (i = 0; i < num_decorators; i++)
02995 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
02996
02997 return 0;
02998 }
02999
03000 static void
03001 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03002 {
03003 VALUE *ary_p = arg;
03004 VALUE v;
03005
03006 if (*ary_p == Qnil) {
03007 *ary_p = rb_ary_new();
03008 }
03009
03010 if (DECORATOR_P(sname, dname)) {
03011 v = rb_str_new_cstr(dname);
03012 }
03013 else {
03014 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
03015 }
03016 rb_ary_store(*ary_p, depth, v);
03017 }
03018
03019
03020
03021
03022
03023
03024
03025
03026
03027
03028
03029
03030
03031
03032
03033
03034
03035
03036
03037
03038
03039
03040 static VALUE
03041 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
03042 {
03043 volatile VALUE snamev, dnamev;
03044 const char *sname, *dname;
03045 rb_encoding *senc, *denc;
03046 int ecflags;
03047 VALUE ecopts;
03048 VALUE convpath;
03049
03050 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03051
03052 convpath = Qnil;
03053 transcode_search_path(sname, dname, search_convpath_i, &convpath);
03054
03055 if (NIL_P(convpath))
03056 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03057
03058 if (decorate_convpath(convpath, ecflags) == -1)
03059 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03060
03061 return convpath;
03062 }
03063
03064
03065
03066
03067
03068
03069 int
03070 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
03071 {
03072 VALUE convpath = Qnil;
03073 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
03074 &convpath);
03075 return RTEST(convpath);
03076 }
03077
03078 struct rb_econv_init_by_convpath_t {
03079 rb_econv_t *ec;
03080 int index;
03081 int ret;
03082 };
03083
03084 static void
03085 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03086 {
03087 struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
03088 int ret;
03089
03090 if (a->ret == -1)
03091 return;
03092
03093 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
03094
03095 a->ret = ret;
03096 return;
03097 }
03098
03099 static rb_econv_t *
03100 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
03101 const char **sname_p, const char **dname_p,
03102 rb_encoding **senc_p, rb_encoding**denc_p)
03103 {
03104 rb_econv_t *ec;
03105 long i;
03106 int ret, first=1;
03107 VALUE elt;
03108 rb_encoding *senc = 0, *denc = 0;
03109 const char *sname, *dname;
03110
03111 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
03112 DATA_PTR(self) = ec;
03113
03114 for (i = 0; i < RARRAY_LEN(convpath); i++) {
03115 volatile VALUE snamev, dnamev;
03116 VALUE pair;
03117 elt = rb_ary_entry(convpath, i);
03118 if (!NIL_P(pair = rb_check_array_type(elt))) {
03119 if (RARRAY_LEN(pair) != 2)
03120 rb_raise(rb_eArgError, "not a 2-element array in convpath");
03121 snamev = rb_ary_entry(pair, 0);
03122 enc_arg(&snamev, &sname, &senc);
03123 dnamev = rb_ary_entry(pair, 1);
03124 enc_arg(&dnamev, &dname, &denc);
03125 }
03126 else {
03127 sname = "";
03128 dname = StringValueCStr(elt);
03129 }
03130 if (DECORATOR_P(sname, dname)) {
03131 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
03132 if (ret == -1)
03133 rb_raise(rb_eArgError, "decoration failed: %s", dname);
03134 }
03135 else {
03136 int j = ec->num_trans;
03137 struct rb_econv_init_by_convpath_t arg;
03138 arg.ec = ec;
03139 arg.index = ec->num_trans;
03140 arg.ret = 0;
03141 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
03142 if (ret == -1 || arg.ret == -1)
03143 rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
03144 if (first) {
03145 first = 0;
03146 *senc_p = senc;
03147 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
03148 }
03149 *denc_p = denc;
03150 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
03151 }
03152 }
03153
03154 if (first) {
03155 *senc_p = NULL;
03156 *denc_p = NULL;
03157 *sname_p = "";
03158 *dname_p = "";
03159 }
03160
03161 ec->source_encoding_name = *sname_p;
03162 ec->destination_encoding_name = *dname_p;
03163
03164 return ec;
03165 }
03166
03167
03168
03169
03170
03171
03172
03173
03174
03175
03176
03177
03178
03179
03180
03181
03182
03183
03184
03185
03186
03187
03188
03189
03190
03191
03192
03193
03194
03195
03196
03197
03198
03199
03200
03201
03202
03203
03204
03205
03206
03207
03208
03209
03210
03211
03212
03213
03214
03215
03216
03217
03218
03219
03220
03221
03222
03223
03224
03225
03226
03227
03228
03229
03230
03231
03232
03233
03234
03235
03236
03237
03238
03239
03240
03241
03242
03243
03244
03245
03246
03247
03248
03249
03250
03251
03252
03253
03254
03255
03256
03257
03258
03259
03260
03261
03262
03263
03264
03265
03266
03267
03268
03269
03270 static VALUE
03271 econv_init(int argc, VALUE *argv, VALUE self)
03272 {
03273 VALUE ecopts;
03274 volatile VALUE snamev, dnamev;
03275 const char *sname, *dname;
03276 rb_encoding *senc, *denc;
03277 rb_econv_t *ec;
03278 int ecflags;
03279 VALUE convpath;
03280
03281 if (rb_check_typeddata(self, &econv_data_type)) {
03282 rb_raise(rb_eTypeError, "already initialized");
03283 }
03284
03285 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
03286 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
03287 ecflags = 0;
03288 ecopts = Qnil;
03289 }
03290 else {
03291 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03292 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
03293 }
03294
03295 if (!ec) {
03296 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03297 }
03298
03299 if (!DECORATOR_P(sname, dname)) {
03300 if (!senc)
03301 senc = make_dummy_encoding(sname);
03302 if (!denc)
03303 denc = make_dummy_encoding(dname);
03304 }
03305
03306 ec->source_encoding = senc;
03307 ec->destination_encoding = denc;
03308
03309 DATA_PTR(self) = ec;
03310
03311 return self;
03312 }
03313
03314
03315
03316
03317
03318
03319
03320
03321
03322
03323
03324 static VALUE
03325 econv_inspect(VALUE self)
03326 {
03327 const char *cname = rb_obj_classname(self);
03328 rb_econv_t *ec;
03329
03330 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03331 if (!ec)
03332 return rb_sprintf("#<%s: uninitialized>", cname);
03333 else {
03334 const char *sname = ec->source_encoding_name;
03335 const char *dname = ec->destination_encoding_name;
03336 VALUE str;
03337 str = rb_sprintf("#<%s: ", cname);
03338 econv_description(sname, dname, ec->flags, str);
03339 rb_str_cat2(str, ">");
03340 return str;
03341 }
03342 }
03343
03344 static rb_econv_t *
03345 check_econv(VALUE self)
03346 {
03347 rb_econv_t *ec;
03348
03349 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03350 if (!ec) {
03351 rb_raise(rb_eTypeError, "uninitialized encoding converter");
03352 }
03353 return ec;
03354 }
03355
03356
03357
03358
03359
03360
03361
03362 static VALUE
03363 econv_source_encoding(VALUE self)
03364 {
03365 rb_econv_t *ec = check_econv(self);
03366 if (!ec->source_encoding)
03367 return Qnil;
03368 return rb_enc_from_encoding(ec->source_encoding);
03369 }
03370
03371
03372
03373
03374
03375
03376
03377 static VALUE
03378 econv_destination_encoding(VALUE self)
03379 {
03380 rb_econv_t *ec = check_econv(self);
03381 if (!ec->destination_encoding)
03382 return Qnil;
03383 return rb_enc_from_encoding(ec->destination_encoding);
03384 }
03385
03386
03387
03388
03389
03390
03391
03392
03393
03394
03395
03396
03397
03398
03399
03400
03401
03402
03403
03404
03405
03406
03407
03408 static VALUE
03409 econv_convpath(VALUE self)
03410 {
03411 rb_econv_t *ec = check_econv(self);
03412 VALUE result;
03413 int i;
03414
03415 result = rb_ary_new();
03416 for (i = 0; i < ec->num_trans; i++) {
03417 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
03418 VALUE v;
03419 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
03420 v = rb_str_new_cstr(tr->dst_encoding);
03421 else
03422 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
03423 rb_ary_push(result, v);
03424 }
03425 return result;
03426 }
03427
03428 static VALUE
03429 econv_result_to_symbol(rb_econv_result_t res)
03430 {
03431 switch (res) {
03432 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
03433 case econv_incomplete_input: return sym_incomplete_input;
03434 case econv_undefined_conversion: return sym_undefined_conversion;
03435 case econv_destination_buffer_full: return sym_destination_buffer_full;
03436 case econv_source_buffer_empty: return sym_source_buffer_empty;
03437 case econv_finished: return sym_finished;
03438 case econv_after_output: return sym_after_output;
03439 default: return INT2NUM(res);
03440 }
03441 }
03442
03443
03444
03445
03446
03447
03448
03449
03450
03451
03452
03453
03454
03455
03456
03457
03458
03459
03460
03461
03462
03463
03464
03465
03466
03467
03468
03469
03470
03471
03472
03473
03474
03475
03476
03477
03478
03479
03480
03481
03482
03483
03484
03485
03486
03487
03488
03489
03490
03491
03492
03493
03494
03495
03496
03497
03498
03499
03500
03501
03502
03503
03504
03505
03506
03507
03508
03509
03510
03511
03512
03513
03514
03515
03516
03517
03518
03519
03520
03521
03522
03523
03524
03525
03526
03527
03528
03529
03530
03531
03532
03533
03534 static VALUE
03535 econv_primitive_convert(int argc, VALUE *argv, VALUE self)
03536 {
03537 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
03538 rb_econv_t *ec = check_econv(self);
03539 rb_econv_result_t res;
03540 const unsigned char *ip, *is;
03541 unsigned char *op, *os;
03542 long output_byteoffset, output_bytesize;
03543 unsigned long output_byteend;
03544 int flags;
03545
03546 rb_scan_args(argc, argv, "23", &input, &output, &output_byteoffset_v, &output_bytesize_v, &opt);
03547
03548 if (NIL_P(output_byteoffset_v))
03549 output_byteoffset = 0;
03550 else
03551 output_byteoffset = NUM2LONG(output_byteoffset_v);
03552
03553 if (NIL_P(output_bytesize_v))
03554 output_bytesize = 0;
03555 else
03556 output_bytesize = NUM2LONG(output_bytesize_v);
03557
03558 if (NIL_P(opt)) {
03559 flags = 0;
03560 }
03561 else if (!NIL_P(flags_v = rb_check_to_integer(opt, "to_int"))) {
03562 flags = NUM2INT(flags_v);
03563 }
03564 else {
03565 VALUE v;
03566 opt = rb_convert_type(opt, T_HASH, "Hash", "to_hash");
03567 flags = 0;
03568 v = rb_hash_aref(opt, sym_partial_input);
03569 if (RTEST(v))
03570 flags |= ECONV_PARTIAL_INPUT;
03571 v = rb_hash_aref(opt, sym_after_output);
03572 if (RTEST(v))
03573 flags |= ECONV_AFTER_OUTPUT;
03574 }
03575
03576 StringValue(output);
03577 if (!NIL_P(input))
03578 StringValue(input);
03579 rb_str_modify(output);
03580
03581 if (NIL_P(output_bytesize_v)) {
03582 output_bytesize = RSTRING_EMBED_LEN_MAX;
03583 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
03584 output_bytesize = RSTRING_LEN(input);
03585 }
03586
03587 retry:
03588
03589 if (NIL_P(output_byteoffset_v))
03590 output_byteoffset = RSTRING_LEN(output);
03591
03592 if (output_byteoffset < 0)
03593 rb_raise(rb_eArgError, "negative output_byteoffset");
03594
03595 if (RSTRING_LEN(output) < output_byteoffset)
03596 rb_raise(rb_eArgError, "output_byteoffset too big");
03597
03598 if (output_bytesize < 0)
03599 rb_raise(rb_eArgError, "negative output_bytesize");
03600
03601 output_byteend = (unsigned long)output_byteoffset +
03602 (unsigned long)output_bytesize;
03603
03604 if (output_byteend < (unsigned long)output_byteoffset ||
03605 LONG_MAX < output_byteend)
03606 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
03607
03608 if (rb_str_capacity(output) < output_byteend)
03609 rb_str_resize(output, output_byteend);
03610
03611 if (NIL_P(input)) {
03612 ip = is = NULL;
03613 }
03614 else {
03615 ip = (const unsigned char *)RSTRING_PTR(input);
03616 is = ip + RSTRING_LEN(input);
03617 }
03618
03619 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
03620 os = op + output_bytesize;
03621
03622 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
03623 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
03624 if (!NIL_P(input))
03625 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
03626
03627 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
03628 if (LONG_MAX / 2 < output_bytesize)
03629 rb_raise(rb_eArgError, "too long conversion result");
03630 output_bytesize *= 2;
03631 output_byteoffset_v = Qnil;
03632 goto retry;
03633 }
03634
03635 if (ec->destination_encoding) {
03636 rb_enc_associate(output, ec->destination_encoding);
03637 }
03638
03639 return econv_result_to_symbol(res);
03640 }
03641
03642
03643
03644
03645
03646
03647
03648
03649
03650
03651
03652
03653
03654
03655
03656
03657
03658
03659
03660
03661
03662
03663
03664
03665
03666
03667
03668
03669
03670
03671
03672
03673
03674
03675
03676 static VALUE
03677 econv_convert(VALUE self, VALUE source_string)
03678 {
03679 VALUE ret, dst;
03680 VALUE av[5];
03681 int ac;
03682 rb_econv_t *ec = check_econv(self);
03683
03684 StringValue(source_string);
03685
03686 dst = rb_str_new(NULL, 0);
03687
03688 av[0] = rb_str_dup(source_string);
03689 av[1] = dst;
03690 av[2] = Qnil;
03691 av[3] = Qnil;
03692 av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
03693 ac = 5;
03694
03695 ret = econv_primitive_convert(ac, av, self);
03696
03697 if (ret == sym_invalid_byte_sequence ||
03698 ret == sym_undefined_conversion ||
03699 ret == sym_incomplete_input) {
03700 VALUE exc = make_econv_exception(ec);
03701 rb_exc_raise(exc);
03702 }
03703
03704 if (ret == sym_finished) {
03705 rb_raise(rb_eArgError, "converter already finished");
03706 }
03707
03708 if (ret != sym_source_buffer_empty) {
03709 rb_bug("unexpected result of econv_primitive_convert");
03710 }
03711
03712 return dst;
03713 }
03714
03715
03716
03717
03718
03719
03720
03721
03722
03723
03724
03725
03726 static VALUE
03727 econv_finish(VALUE self)
03728 {
03729 VALUE ret, dst;
03730 VALUE av[5];
03731 int ac;
03732 rb_econv_t *ec = check_econv(self);
03733
03734 dst = rb_str_new(NULL, 0);
03735
03736 av[0] = Qnil;
03737 av[1] = dst;
03738 av[2] = Qnil;
03739 av[3] = Qnil;
03740 av[4] = INT2NUM(0);
03741 ac = 5;
03742
03743 ret = econv_primitive_convert(ac, av, self);
03744
03745 if (ret == sym_invalid_byte_sequence ||
03746 ret == sym_undefined_conversion ||
03747 ret == sym_incomplete_input) {
03748 VALUE exc = make_econv_exception(ec);
03749 rb_exc_raise(exc);
03750 }
03751
03752 if (ret != sym_finished) {
03753 rb_bug("unexpected result of econv_primitive_convert");
03754 }
03755
03756 return dst;
03757 }
03758
03759
03760
03761
03762
03763
03764
03765
03766
03767
03768
03769
03770
03771
03772
03773
03774
03775
03776
03777
03778
03779
03780
03781
03782
03783
03784
03785
03786
03787
03788
03789
03790
03791
03792
03793
03794
03795
03796
03797
03798
03799
03800
03801
03802
03803
03804
03805
03806
03807
03808
03809
03810
03811
03812
03813
03814
03815
03816
03817
03818
03819
03820
03821
03822
03823
03824
03825
03826
03827
03828
03829
03830
03831
03832
03833
03834 static VALUE
03835 econv_primitive_errinfo(VALUE self)
03836 {
03837 rb_econv_t *ec = check_econv(self);
03838
03839 VALUE ary;
03840
03841 ary = rb_ary_new2(5);
03842
03843 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
03844 rb_ary_store(ary, 4, Qnil);
03845
03846 if (ec->last_error.source_encoding)
03847 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
03848
03849 if (ec->last_error.destination_encoding)
03850 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
03851
03852 if (ec->last_error.error_bytes_start) {
03853 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
03854 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
03855 }
03856
03857 return ary;
03858 }
03859
03860
03861
03862
03863
03864
03865
03866
03867
03868
03869
03870
03871
03872
03873
03874
03875
03876
03877
03878
03879
03880
03881
03882
03883
03884
03885
03886
03887
03888
03889
03890
03891
03892 static VALUE
03893 econv_insert_output(VALUE self, VALUE string)
03894 {
03895 const char *insert_enc;
03896
03897 int ret;
03898
03899 rb_econv_t *ec = check_econv(self);
03900
03901 StringValue(string);
03902 insert_enc = rb_econv_encoding_to_insert_output(ec);
03903 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
03904
03905 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
03906 if (ret == -1) {
03907 rb_raise(rb_eArgError, "too big string");
03908 }
03909
03910 return Qnil;
03911 }
03912
03913
03914
03915
03916
03917
03918
03919
03920
03921
03922
03923
03924
03925
03926
03927
03928
03929
03930
03931
03932
03933
03934
03935
03936
03937 static VALUE
03938 econv_putback(int argc, VALUE *argv, VALUE self)
03939 {
03940 rb_econv_t *ec = check_econv(self);
03941 int n;
03942 int putbackable;
03943 VALUE str, max;
03944
03945 rb_scan_args(argc, argv, "01", &max);
03946
03947 if (NIL_P(max))
03948 n = rb_econv_putbackable(ec);
03949 else {
03950 n = NUM2INT(max);
03951 putbackable = rb_econv_putbackable(ec);
03952 if (putbackable < n)
03953 n = putbackable;
03954 }
03955
03956 str = rb_str_new(NULL, n);
03957 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
03958
03959 if (ec->source_encoding) {
03960 rb_enc_associate(str, ec->source_encoding);
03961 }
03962
03963 return str;
03964 }
03965
03966
03967
03968
03969
03970
03971
03972
03973
03974
03975
03976
03977
03978
03979
03980
03981
03982
03983
03984
03985
03986 static VALUE
03987 econv_last_error(VALUE self)
03988 {
03989 rb_econv_t *ec = check_econv(self);
03990 VALUE exc;
03991
03992 exc = make_econv_exception(ec);
03993 if (NIL_P(exc))
03994 return Qnil;
03995 return exc;
03996 }
03997
03998
03999
04000
04001
04002
04003
04004
04005
04006
04007
04008
04009
04010 static VALUE
04011 econv_get_replacement(VALUE self)
04012 {
04013 rb_econv_t *ec = check_econv(self);
04014 int ret;
04015 rb_encoding *enc;
04016
04017 ret = make_replacement(ec);
04018 if (ret == -1) {
04019 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04020 }
04021
04022 enc = rb_enc_find(ec->replacement_enc);
04023 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
04024 }
04025
04026
04027
04028
04029
04030
04031
04032
04033
04034
04035
04036 static VALUE
04037 econv_set_replacement(VALUE self, VALUE arg)
04038 {
04039 rb_econv_t *ec = check_econv(self);
04040 VALUE string = arg;
04041 int ret;
04042 rb_encoding *enc;
04043
04044 StringValue(string);
04045 enc = rb_enc_get(string);
04046
04047 ret = rb_econv_set_replacement(ec,
04048 (const unsigned char *)RSTRING_PTR(string),
04049 RSTRING_LEN(string),
04050 rb_enc_name(enc));
04051
04052 if (ret == -1) {
04053
04054 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04055 }
04056
04057 return arg;
04058 }
04059
04060 VALUE
04061 rb_econv_make_exception(rb_econv_t *ec)
04062 {
04063 return make_econv_exception(ec);
04064 }
04065
04066 void
04067 rb_econv_check_error(rb_econv_t *ec)
04068 {
04069 VALUE exc;
04070
04071 exc = make_econv_exception(ec);
04072 if (NIL_P(exc))
04073 return;
04074 rb_exc_raise(exc);
04075 }
04076
04077
04078
04079
04080
04081
04082
04083 static VALUE
04084 ecerr_source_encoding_name(VALUE self)
04085 {
04086 return rb_attr_get(self, rb_intern("source_encoding_name"));
04087 }
04088
04089
04090
04091
04092
04093
04094
04095
04096
04097
04098
04099
04100
04101
04102
04103
04104
04105
04106
04107
04108
04109 static VALUE
04110 ecerr_source_encoding(VALUE self)
04111 {
04112 return rb_attr_get(self, rb_intern("source_encoding"));
04113 }
04114
04115
04116
04117
04118
04119
04120
04121 static VALUE
04122 ecerr_destination_encoding_name(VALUE self)
04123 {
04124 return rb_attr_get(self, rb_intern("destination_encoding_name"));
04125 }
04126
04127
04128
04129
04130
04131
04132
04133 static VALUE
04134 ecerr_destination_encoding(VALUE self)
04135 {
04136 return rb_attr_get(self, rb_intern("destination_encoding"));
04137 }
04138
04139
04140
04141
04142
04143
04144
04145
04146
04147
04148
04149
04150
04151
04152
04153
04154 static VALUE
04155 ecerr_error_char(VALUE self)
04156 {
04157 return rb_attr_get(self, rb_intern("error_char"));
04158 }
04159
04160
04161
04162
04163
04164
04165
04166
04167
04168
04169
04170
04171
04172
04173
04174
04175 static VALUE
04176 ecerr_error_bytes(VALUE self)
04177 {
04178 return rb_attr_get(self, rb_intern("error_bytes"));
04179 }
04180
04181
04182
04183
04184
04185
04186
04187 static VALUE
04188 ecerr_readagain_bytes(VALUE self)
04189 {
04190 return rb_attr_get(self, rb_intern("readagain_bytes"));
04191 }
04192
04193
04194
04195
04196
04197
04198
04199
04200
04201
04202
04203
04204
04205
04206
04207
04208
04209
04210
04211
04212
04213
04214
04215
04216
04217 static VALUE
04218 ecerr_incomplete_input(VALUE self)
04219 {
04220 return rb_attr_get(self, rb_intern("incomplete_input"));
04221 }
04222
04223 extern void Init_newline(void);
04224
04225
04226
04227
04228
04229
04230
04231
04232
04233
04234
04235
04236
04237
04238
04239
04240
04241
04242
04243
04244
04245
04246
04247 void
04248 Init_transcode(void)
04249 {
04250 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
04251 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
04252 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
04253
04254 transcoder_table = st_init_strcasetable();
04255
04256 sym_invalid = ID2SYM(rb_intern("invalid"));
04257 sym_undef = ID2SYM(rb_intern("undef"));
04258 sym_replace = ID2SYM(rb_intern("replace"));
04259 sym_fallback = ID2SYM(rb_intern("fallback"));
04260 sym_xml = ID2SYM(rb_intern("xml"));
04261 sym_text = ID2SYM(rb_intern("text"));
04262 sym_attr = ID2SYM(rb_intern("attr"));
04263
04264 sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
04265 sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
04266 sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
04267 sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
04268 sym_finished = ID2SYM(rb_intern("finished"));
04269 sym_after_output = ID2SYM(rb_intern("after_output"));
04270 sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
04271 sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
04272 sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
04273 sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
04274 sym_partial_input = ID2SYM(rb_intern("partial_input"));
04275
04276 rb_define_method(rb_cString, "encode", str_encode, -1);
04277 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
04278
04279 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
04280 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
04281 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
04282 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
04283 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
04284 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
04285 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
04286 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
04287 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
04288 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
04289 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
04290 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
04291 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
04292 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
04293 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
04294 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
04295 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
04296 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
04297
04298 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
04299 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
04300 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
04301 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
04302 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
04303 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
04304 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
04305 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
04306 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
04307 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
04308 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
04309 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
04310 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
04311
04312 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
04313 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
04314 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
04315 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
04316 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
04317
04318 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
04319 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
04320 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
04321 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
04322 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
04323 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
04324 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
04325
04326 Init_newline();
04327 }
04328