00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "regenc.h"
00031
/*
 * Surrogate classification on a single byte value `c`.
 *
 * Each test masks the low bits of `c` and compares the rest:
 *   - FIRST  is true for 0xd8..0xdb
 *   - SECOND is true for 0xdc..0xdf
 *   - the combined test is true for 0xd8..0xdf
 * In this file the macros are applied to the first (most significant)
 * byte of a big-endian 16-bit code unit.
 */
#define UTF16_IS_SURROGATE_FIRST(c)    (((c) & 0xfc) == 0xd8)
#define UTF16_IS_SURROGATE_SECOND(c)   (((c) & 0xfc) == 0xdc)
#define UTF16_IS_SURROGATE(c)          (((c) & 0xf8) == 0xd8)
00035
/*
 * 256-entry length table indexed by a byte value.
 *
 * Every entry is 2 except indices 0xd8..0xdb, which are 4.  In the part
 * of the file visible to this review the table is only indexed with the
 * first byte of a character (see the disabled utf16be_is_mbc_ambiguous).
 */
static const int EncLen_UTF16[] = {
  /* 0x00 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
00054
00055 static int
00056 utf16be_mbc_enc_len(const UChar* p, const OnigUChar* e ARG_UNUSED,
00057 OnigEncoding enc ARG_UNUSED)
00058 {
00059 int byte = p[0];
00060 if (!UTF16_IS_SURROGATE(byte)) {
00061 if (2 <= e-p)
00062 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
00063 else
00064 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
00065 }
00066 if (UTF16_IS_SURROGATE_FIRST(byte)) {
00067 switch (e-p) {
00068 case 1: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(3);
00069 case 2: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2);
00070 case 3:
00071 if (UTF16_IS_SURROGATE_SECOND(p[2]))
00072 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
00073 break;
00074 default:
00075 if (UTF16_IS_SURROGATE_SECOND(p[2]))
00076 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
00077 break;
00078 }
00079 }
00080 return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00081 }
00082
00083 static int
00084 utf16be_is_mbc_newline(const UChar* p, const UChar* end,
00085 OnigEncoding enc)
00086 {
00087 if (p + 1 < end) {
00088 if (*(p+1) == 0x0a && *p == 0x00)
00089 return 1;
00090 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
00091 if ((
00092 #ifndef USE_CRNL_AS_LINE_TERMINATOR
00093 *(p+1) == 0x0d ||
00094 #endif
00095 *(p+1) == 0x85) && *p == 0x00)
00096 return 1;
00097 if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28))
00098 return 1;
00099 #endif
00100 }
00101 return 0;
00102 }
00103
00104 static OnigCodePoint
00105 utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
00106 OnigEncoding enc)
00107 {
00108 OnigCodePoint code;
00109
00110 if (UTF16_IS_SURROGATE_FIRST(*p)) {
00111 code = ((((p[0] << 8) + p[1]) & 0x03ff) << 10)
00112 + (((p[2] << 8) + p[3]) & 0x03ff) + 0x10000;
00113 }
00114 else {
00115 code = p[0] * 256 + p[1];
00116 }
00117 return code;
00118 }
00119
00120 static int
00121 utf16be_code_to_mbclen(OnigCodePoint code,
00122 OnigEncoding enc)
00123 {
00124 return (code > 0xffff ? 4 : 2);
00125 }
00126
00127 static int
00128 utf16be_code_to_mbc(OnigCodePoint code, UChar *buf,
00129 OnigEncoding enc)
00130 {
00131 UChar* p = buf;
00132
00133 if (code > 0xffff) {
00134 unsigned int high = (code >> 10) + 0xD7C0;
00135 unsigned int low = (code & 0x3FF) + 0xDC00;
00136 *p++ = (high >> 8) & 0xFF;
00137 *p++ = high & 0xFF;
00138 *p++ = (low >> 8) & 0xFF;
00139 *p++ = low & 0xFF;
00140 return 4;
00141 }
00142 else {
00143 *p++ = (UChar )((code & 0xff00) >> 8);
00144 *p++ = (UChar )(code & 0xff);
00145 return 2;
00146 }
00147 }
00148
00149 static int
00150 utf16be_mbc_case_fold(OnigCaseFoldType flag,
00151 const UChar** pp, const UChar* end, UChar* fold,
00152 OnigEncoding enc)
00153 {
00154 const UChar* p = *pp;
00155
00156 if (ONIGENC_IS_ASCII_CODE(*(p+1)) && *p == 0) {
00157 p++;
00158 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
00159 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
00160 if (*p == 0x49) {
00161 *fold++ = 0x01;
00162 *fold = 0x31;
00163 (*pp) += 2;
00164 return 2;
00165 }
00166 }
00167 #endif
00168
00169 *fold++ = 0;
00170 *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00171 *pp += 2;
00172 return 2;
00173 }
00174 else
00175 return onigenc_unicode_mbc_case_fold(enc, flag,
00176 pp, end, fold);
00177 }
00178
#if 0
/*
 * Disabled code (compiled out by the surrounding #if 0).
 *
 * Advances *pp by EncLen_UTF16[first byte] and, for characters whose
 * first byte is zero, decides from the second byte whether the character
 * is treated as case-ambiguous.  Characters with a non-zero first byte
 * always yield FALSE.  `end` is not used.
 *
 * NOTE(review): the test `(ctype_bits | BIT_CTYPE_LOWER) != 0` is true
 * whenever BIT_CTYPE_LOWER is non-zero, which would make the final
 * return unreachable.  It looks like `&` may have been intended, but the
 * helper macro's return shape is not visible here -- confirm before
 * re-enabling this function.
 */
static int
utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
{
  const UChar* head = *pp;
  int low, ctype_bits;

  (*pp) += EncLen_UTF16[*head];

  if (*head != 0)
    return FALSE;

  low = head[1];
  if (low == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0)
    return TRUE;

  ctype_bits = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(low,
                   (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));

  if ((ctype_bits | BIT_CTYPE_LOWER) != 0) {
    /* Second bytes in 0xaa..0xba are reported as not ambiguous. */
    if (low >= 0xaa && low <= 0xba)
      return FALSE;
    return TRUE;
  }

  return (ctype_bits != 0 ? TRUE : FALSE);
}
#endif
00212
00213 static UChar*
00214 utf16be_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
00215 OnigEncoding enc ARG_UNUSED)
00216 {
00217 if (s <= start) return (UChar* )s;
00218
00219 if ((s - start) % 2 == 1) {
00220 s--;
00221 }
00222
00223 if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1)
00224 s -= 2;
00225
00226 return (UChar* )s;
00227 }
00228
00229 static int
00230 utf16be_get_case_fold_codes_by_str(OnigCaseFoldType flag,
00231 const OnigUChar* p, const OnigUChar* end,
00232 OnigCaseFoldCodeItem items[],
00233 OnigEncoding enc)
00234 {
00235 return onigenc_unicode_get_case_fold_codes_by_str(enc,
00236 flag, p, end, items);
00237 }
00238
00239 OnigEncodingDefine(utf_16be, UTF_16BE) = {
00240 utf16be_mbc_enc_len,
00241 "UTF-16BE",
00242 4,
00243 2,
00244 utf16be_is_mbc_newline,
00245 utf16be_mbc_to_code,
00246 utf16be_code_to_mbclen,
00247 utf16be_code_to_mbc,
00248 utf16be_mbc_case_fold,
00249 onigenc_unicode_apply_all_case_fold,
00250 utf16be_get_case_fold_codes_by_str,
00251 onigenc_unicode_property_name_to_ctype,
00252 onigenc_unicode_is_code_ctype,
00253 onigenc_utf16_32_get_ctype_code_range,
00254 utf16be_left_adjust_char_head,
00255 onigenc_always_false_is_allowed_reverse_match
00256 };
00257 ENC_ALIAS("UCS-2BE", "UTF-16BE")
00258