• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

enc/utf_16be.c

Go to the documentation of this file.
00001 /**********************************************************************
00002   utf_16be.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * Surrogate tests on the high-order byte of a 16-bit code unit (in this
 * big-endian encoding, the first byte of the unit).
 *
 *   0xD8..0xDB  first (lead) surrogate
 *   0xDC..0xDF  second (trail) surrogate
 *   0xD8..0xDF  any surrogate
 */
#define UTF16_IS_SURROGATE_FIRST(c)    (((c) & 0xFC) == 0xD8)
#define UTF16_IS_SURROGATE_SECOND(c)   (((c) & 0xFC) == 0xDC)
#define UTF16_IS_SURROGATE(c)          (((c) & 0xF8) == 0xD8)
00035 
/*
 * Character length in bytes, indexed by the first byte of a character.
 * Only the lead-surrogate bytes 0xD8..0xDB start a 4-byte character;
 * every other first byte maps to 2.
 */
static const int EncLen_UTF16[] = {
  /* 0x00 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2,  4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2
};
00054 
00055 static int
00056 utf16be_mbc_enc_len(const UChar* p, const OnigUChar* e ARG_UNUSED,
00057                     OnigEncoding enc ARG_UNUSED)
00058 {
00059   int byte = p[0];
00060   if (!UTF16_IS_SURROGATE(byte)) {
00061     if (2 <= e-p)
00062       return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
00063     else
00064       return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
00065   }
00066   if (UTF16_IS_SURROGATE_FIRST(byte)) {
00067     switch (e-p) {
00068       case 1: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(3);
00069       case 2: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2);
00070       case 3:
00071         if (UTF16_IS_SURROGATE_SECOND(p[2]))
00072           return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
00073         break;
00074       default:
00075         if (UTF16_IS_SURROGATE_SECOND(p[2]))
00076           return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
00077         break;
00078     }
00079   }
00080   return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00081 }
00082 
00083 static int
00084 utf16be_is_mbc_newline(const UChar* p, const UChar* end,
00085                        OnigEncoding enc)
00086 {
00087   if (p + 1 < end) {
00088     if (*(p+1) == 0x0a && *p == 0x00)
00089       return 1;
00090 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
00091     if ((
00092 #ifndef USE_CRNL_AS_LINE_TERMINATOR
00093          *(p+1) == 0x0d ||
00094 #endif
00095          *(p+1) == 0x85) && *p == 0x00)
00096       return 1;
00097     if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28))
00098       return 1;
00099 #endif
00100   }
00101   return 0;
00102 }
00103 
00104 static OnigCodePoint
00105 utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
00106                     OnigEncoding enc)
00107 {
00108   OnigCodePoint code;
00109 
00110   if (UTF16_IS_SURROGATE_FIRST(*p)) {
00111     code = ((((p[0] << 8) + p[1]) & 0x03ff) << 10)
00112          + (((p[2] << 8) + p[3]) & 0x03ff) + 0x10000;
00113   }
00114   else {
00115     code = p[0] * 256 + p[1];
00116   }
00117   return code;
00118 }
00119 
00120 static int
00121 utf16be_code_to_mbclen(OnigCodePoint code,
00122                        OnigEncoding enc)
00123 {
00124   return (code > 0xffff ? 4 : 2);
00125 }
00126 
00127 static int
00128 utf16be_code_to_mbc(OnigCodePoint code, UChar *buf,
00129                     OnigEncoding enc)
00130 {
00131   UChar* p = buf;
00132 
00133   if (code > 0xffff) {
00134     unsigned int high = (code >> 10) + 0xD7C0;
00135     unsigned int low = (code & 0x3FF) + 0xDC00;
00136     *p++ = (high >> 8) & 0xFF;
00137     *p++ = high & 0xFF;
00138     *p++ = (low >> 8) & 0xFF;
00139     *p++ = low & 0xFF;
00140     return 4;
00141   }
00142   else {
00143     *p++ = (UChar )((code & 0xff00) >> 8);
00144     *p++ = (UChar )(code & 0xff);
00145     return 2;
00146   }
00147 }
00148 
00149 static int
00150 utf16be_mbc_case_fold(OnigCaseFoldType flag,
00151                       const UChar** pp, const UChar* end, UChar* fold,
00152                       OnigEncoding enc)
00153 {
00154   const UChar* p = *pp;
00155 
00156   if (ONIGENC_IS_ASCII_CODE(*(p+1)) && *p == 0) {
00157     p++;
00158 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
00159     if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
00160       if (*p == 0x49) {
00161         *fold++ = 0x01;
00162         *fold   = 0x31;
00163         (*pp) += 2;
00164         return 2;
00165       }
00166     }
00167 #endif
00168 
00169     *fold++ = 0;
00170     *fold   = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00171     *pp += 2;
00172     return 2;
00173   }
00174   else
00175     return onigenc_unicode_mbc_case_fold(enc, flag,
00176                                          pp, end, fold);
00177 }
00178 
#if 0
/*
 * Disabled code, kept for reference.
 *
 * Advance *pp past the character at its current position and report
 * whether that character is case-ambiguous.  Only units with a zero high
 * byte (U+0000..U+00FF) are examined; everything else returns FALSE.
 *
 * U+00DF is ambiguous only when INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR is
 * set in flag.  Otherwise a character is ambiguous when it is an upper-
 * or lower-case letter, except for lower-case letters in 0xaa..0xba,
 * which have no convertible counterpart.
 */
static int
utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  (*pp) += EncLen_UTF16[*p];

  if (*p == 0) {
    int c, v;

    p++;
    if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      return TRUE;
    }

    c = *p;
    v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
                (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));

    /* Test the LOWER bit with '&'.  The previous '|' made this condition
       always true, which left the final return unreachable and reported
       non-letters as ambiguous. */
    if ((v & BIT_CTYPE_LOWER) != 0) {
      /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
      if (c >= 0xaa && c <= 0xba)
        return FALSE;
      else
        return TRUE;
    }
    return (v != 0 ? TRUE : FALSE);
  }

  return FALSE;
}
#endif
00212 
00213 static UChar*
00214 utf16be_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
00215                               OnigEncoding enc ARG_UNUSED)
00216 {
00217   if (s <= start) return (UChar* )s;
00218 
00219   if ((s - start) % 2 == 1) {
00220     s--;
00221   }
00222 
00223   if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1)
00224     s -= 2;
00225 
00226   return (UChar* )s;
00227 }
00228 
00229 static int
00230 utf16be_get_case_fold_codes_by_str(OnigCaseFoldType flag,
00231                                    const OnigUChar* p, const OnigUChar* end,
00232                                    OnigCaseFoldCodeItem items[],
00233                                    OnigEncoding enc)
00234 {
00235   return onigenc_unicode_get_case_fold_codes_by_str(enc,
00236                                                     flag, p, end, items);
00237 }
00238 
00239 OnigEncodingDefine(utf_16be, UTF_16BE) = {
00240   utf16be_mbc_enc_len,
00241   "UTF-16BE",   /* name */
00242   4,            /* max byte length */
00243   2,            /* min byte length */
00244   utf16be_is_mbc_newline,
00245   utf16be_mbc_to_code,
00246   utf16be_code_to_mbclen,
00247   utf16be_code_to_mbc,
00248   utf16be_mbc_case_fold,
00249   onigenc_unicode_apply_all_case_fold,
00250   utf16be_get_case_fold_codes_by_str,
00251   onigenc_unicode_property_name_to_ctype,
00252   onigenc_unicode_is_code_ctype,
00253   onigenc_utf16_32_get_ctype_code_range,
00254   utf16be_left_adjust_char_head,
00255   onigenc_always_false_is_allowed_reverse_match
00256 };
00257 ENC_ALIAS("UCS-2BE", "UTF-16BE")
00258 

Generated on Wed Sep 8 2010 09:54:33 for Ruby by  doxygen 1.7.1