• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

enc/euc_jp.c

Go to the documentation of this file.
00001 /**********************************************************************
00002   euc_jp.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regint.h"
00031 
00032 
/* Non-zero when byte c lies OUTSIDE 0xa1..0xfe.  The difference is truncated
 * to UChar, so (assuming UChar is an 8-bit unsigned type) values below 0xa1
 * wrap around to large values and also compare greater than (0xfe - 0xa1).
 * Despite the name, this is true for ASCII bytes, 0x8e and 0x8f as well;
 * left_adjust_char_head uses it to stop its backward scan. */
00033 #define eucjp_islead(c)    ((UChar )((c) - 0xa1) > 0xfe - 0xa1)
00034 
/* Character length in bytes, indexed by the first byte of a character:
 * 0x8e -> 2, 0x8f -> 3, 0xa1..0xfe -> 2, every other byte -> 1.
 * mbc_enc_len uses it to report how many bytes are still missing when the
 * input ends in the middle of a character. */
00035 static const int EncLen_EUCJP[] = {
00036   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00037   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00038   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00039   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00040   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00041   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00042   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00043   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00044   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
00045   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00046   1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00047   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00048   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00049   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00050   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00051   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
00052 };
00053 
/* Byte-level state machine used by mbc_enc_len to validate one character.
 *
 * trans[state][byte] yields the next state, or a negative verdict:
 *   ACCEPT  (A) - the byte completes a valid character
 *   FAILURE (F) - the byte cannot appear in this position
 *
 * States (a bare 1 or 2 in the table means S1 or S2):
 *   S0 - at the first byte: 0x00..0x7f accept immediately; 0x8e and
 *        0xa1..0xfe go to S1; 0x8f goes to S2; everything else fails.
 *   S1 - one trailing byte expected: 0xa1..0xfe accepts, anything else fails.
 *   S2 - two trailing bytes expected: 0xa1..0xfe goes to S1, anything else
 *        fails.
 *
 * A and F are short aliases that exist only while the table is being defined.
 */
00054 typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t;
00055 #define A ACCEPT
00056 #define F FAILURE
00057 static const signed char trans[][0x100] = {
00058   { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
00059     /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00060     /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00061     /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00062     /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00063     /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00064     /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00065     /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00066     /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00067     /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
00068     /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00069     /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00070     /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00071     /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00072     /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00073     /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00074     /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 
00075   },
00076   { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
00077     /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00078     /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00079     /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00080     /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00081     /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00082     /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00083     /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00084     /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00085     /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00086     /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00087     /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00088     /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00089     /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00090     /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00091     /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00092     /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F 
00093   },
00094   { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
00095     /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00096     /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00097     /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00098     /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00099     /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00100     /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00101     /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00102     /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00103     /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00104     /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00105     /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00106     /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00107     /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00108     /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00109     /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00110     /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 
00111   },
00112 
00113 };
00114 #undef A
00115 #undef F
00116 
00117 static int
00118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00119 {
00120   int firstbyte = *p++;
00121   state_t s;
00122   s = trans[0][firstbyte];
00123   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
00124                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00125   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
00126   s = trans[s][*p++];
00127   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
00128                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00129   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
00130   s = trans[s][*p++];
00131   return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
00132                        ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00133 }
00134 
00135 static OnigCodePoint
00136 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00137 {
00138   int c, i, len;
00139   OnigCodePoint n;
00140 
00141   len = enclen(enc, p, end);
00142   n = (OnigCodePoint )*p++;
00143   if (len == 1) return n;
00144 
00145   for (i = 1; i < len; i++) {
00146     if (p >= end) break;
00147     c = *p++;
00148     n <<= 8;  n += c;
00149   }
00150   return n;
00151 }
00152 
00153 static int
00154 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
00155 {
00156   if (ONIGENC_IS_CODE_ASCII(code)) return 1;
00157   else if (code > 0xffffff) return 0;
00158   else if ((code & 0xff0000) >= 0x800000) return 3;
00159   else if ((code &   0xff00) >= 0x8000) return 2;
00160   else
00161     return ONIGERR_INVALID_CODE_POINT_VALUE;
00162 }
00163 
#if 0
/*
 * Disabled helper: return the most significant non-zero byte among the low
 * three bytes of `code`, or `code` itself when bits 8..23 are all zero.
 */
static int
code_to_mbc_first(OnigCodePoint code)
{
  if ((code & 0xff0000) != 0)
    return (int )((code >> 16) & 0xff);

  if ((code & 0xff00) != 0)
    return (int )((code >> 8) & 0xff);

  return (int )code;
}
#endif
00182 
00183 static int
00184 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00185 {
00186   UChar *p = buf;
00187 
00188   if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
00189   if ((code &   0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
00190   *p++ = (UChar )(code & 0xff);
00191 
00192 #if 1
00193   if (enclen(enc, buf, p) != (p - buf))
00194     return ONIGERR_INVALID_CODE_POINT_VALUE;
00195 #endif  
00196   return p - buf;
00197 }
00198 
00199 static int
00200 mbc_case_fold(OnigCaseFoldType flag,
00201               const UChar** pp, const UChar* end, UChar* lower,
00202               OnigEncoding enc)
00203 {
00204   int len;
00205   const UChar* p = *pp;
00206 
00207   if (ONIGENC_IS_MBC_ASCII(p)) {
00208     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00209     (*pp)++;
00210     return 1;
00211   }
00212   else {
00213     int i;
00214 
00215     len = enclen(enc, p, end);
00216     for (i = 0; i < len; i++) {
00217       *lower++ = *p++;
00218     }
00219     (*pp) += len;
00220     return len; /* return byte length of converted char to lower */
00221   }
00222 }
00223 
00224 static UChar*
00225 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00226 {
00227   /* In this encoding
00228      mb-trail bytes doesn't mix with single bytes.
00229   */
00230   const UChar *p;
00231   int len;
00232 
00233   if (s <= start) return (UChar* )s;
00234   p = s;
00235 
00236   while (!eucjp_islead(*p) && p > start) p--;
00237   len = enclen(enc, p, end);
00238   if (p + len > s) return (UChar* )p;
00239   p += len;
00240   return (UChar* )(p + ((s - p) & ~1));
00241 }
00242 
00243 static int
00244 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
00245 {
00246   const UChar c = *s;
00247   if (c <= 0x7e || c == 0x8e || c == 0x8f)
00248     return TRUE;
00249   else
00250     return FALSE;
00251 }
00252 
00253 
/* File-local state for the named character properties of this encoding.
 * PropertyInited is set to 1 by init_property_list; PropertyList is indexed
 * by (ctype - (ONIGENC_MAX_STD_CTYPE + 1)) and bounded by PropertyListNum;
 * PropertyNameTable is what property_name_to_ctype looks names up in.
 * NOTE(review): the PROPERTY_LIST_* macros are defined elsewhere and
 * presumably read and update these variables (including PropertyListSize,
 * which is not referenced directly in this file) - confirm. */
00254 static int PropertyInited = 0;
00255 static const OnigCodePoint** PropertyList;
00256 static int PropertyListNum;
00257 static int PropertyListSize;
00258 static hash_table_type* PropertyNameTable;
00259 
/* Code ranges registered under the property name "Hiragana".  The leading
 * element appears to be the number of (first, last) pairs that follow; the
 * values are code points in the packed form produced by mbc_to_code. */
00260 static const OnigCodePoint CR_Hiragana[] = {
00261   1,
00262   0xa4a1, 0xa4f3
00263 }; /* CR_Hiragana */
00264 
/* Code ranges registered under the property name "Katakana", in the same
 * layout: a pair count followed by that many (first, last) pairs. */
00265 static const OnigCodePoint CR_Katakana[] = {
00266   3,
00267   0xa5a1, 0xa5f6,
00268   0xaaa6, 0xaaaf,
00269   0xaab1, 0xaadd
00270 }; /* CR_Katakana */
00271 
/* Register the "Hiragana" and "Katakana" code ranges and mark the property
 * list as initialized.  Returns r.
 * NOTE(review): r is never assigned in the visible code and the `end` label
 * has no visible goto; PROPERTY_LIST_ADD_PROP (defined elsewhere) presumably
 * assigns r and jumps to `end` on failure - confirm against the macro. */
00272 static int
00273 init_property_list(void)
00274 {
00275   int r;
00276 
00277   PROPERTY_LIST_ADD_PROP("Hiragana", CR_Hiragana);
00278   PROPERTY_LIST_ADD_PROP("Katakana", CR_Katakana);
  /* Set only when the registrations above fall through to this line. */
00279   PropertyInited = 1;
00280 
00281  end:
00282   return r;
00283 }
00284 
00285 static int
00286 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
00287 {
00288   st_data_t ctype;
00289 
00290   PROPERTY_LIST_INIT_CHECK;
00291 
00292   if (onig_st_lookup_strend(PropertyNameTable, p, end, &ctype) == 0) {
00293     return onigenc_minimum_property_name_to_ctype(enc, p, end);
00294   }
00295 
00296   return ctype;
00297 }
00298 
00299 static int
00300 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
00301 {
00302   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00303     if (code < 128)
00304       return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00305     else {
00306       if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
00307         return (code_to_mbclen(code, enc) > 1 ? TRUE : FALSE);
00308       }
00309     }
00310   }
00311   else {
00312     PROPERTY_LIST_INIT_CHECK;
00313 
00314     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00315     if (ctype >= (unsigned int )PropertyListNum)
00316       return ONIGERR_TYPE_BUG;
00317 
00318     return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
00319   }
00320 
00321   return FALSE;
00322 }
00323 
00324 static int
00325 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
00326                      const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
00327 {
00328   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00329     return ONIG_NO_SUPPORT_CONFIG;
00330   }
00331   else {
00332     *sb_out = 0x80;
00333 
00334     PROPERTY_LIST_INIT_CHECK;
00335 
00336     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00337     if (ctype >= (OnigCtype )PropertyListNum)
00338       return ONIGERR_TYPE_BUG;
00339 
00340     *ranges = PropertyList[ctype];
00341     return 0;
00342   }
00343 }
00344 
00345 
/* Encoding descriptor for EUC-JP.  It lists, in order, the functions defined
 * in this file together with shared onigenc_* helpers for newline detection
 * and ASCII-only case folding.
 * NOTE(review): the slot meanings follow the encoding struct declared
 * elsewhere; the purpose of the final 0 is not visible here - confirm. */
00346 OnigEncodingDefine(euc_jp, EUC_JP) = {
00347   mbc_enc_len,
00348   "EUC-JP",   /* name */
00349   3,          /* max enc length */
00350   1,          /* min enc length */
00351   onigenc_is_mbc_newline_0x0a,
00352   mbc_to_code,
00353   code_to_mbclen,
00354   code_to_mbc,
00355   mbc_case_fold,
00356   onigenc_ascii_apply_all_case_fold,
00357   onigenc_ascii_get_case_fold_codes_by_str,
00358   property_name_to_ctype,
00359   is_code_ctype,
00360   get_ctype_code_range,
00361   left_adjust_char_head,
00362   is_allowed_reverse_match,
00363   0
00364 };
/* Additional names associated with the EUC-JP definition above.
 * NOTE(review): ENC_ALIAS and ENC_REPLICATE are defined elsewhere;
 * presumably the former adds a second name for an existing encoding and the
 * latter declares a separate encoding that reuses its implementation -
 * confirm against their definitions. */
00365 /*
00366  * Name: EUC-JP
00367  * MIBenum: 18
00368  * Link: http://www.iana.org/assignments/character-sets
00369  * Link: http://home.m05.itscom.net/numa/cde/sjis-euc/sjis-euc.html
00370  */
00371 ENC_ALIAS("eucJP", "EUC-JP") /* UI-OSF Application Platform Profile for Japanese Environment Version 1.1 */
00372 
00373 /*
00374  * Name: eucJP-ms
00375  * Link: http://home.m05.itscom.net/numa/cde/ucs-conv/ucs-conv.html
00376  * Link: http://www2d.biglobe.ne.jp/~msyk/charcode/cp932/eucJP-ms.html
00377  * Link: http://ja.wikipedia.org/wiki/EUC-JP
00378  */
00379 ENC_REPLICATE("eucJP-ms", "EUC-JP") /* TOG/JVC CDE/Motif Technical WG */
00380 ENC_ALIAS("euc-jp-ms", "eucJP-ms")
00381 
00382 /*
00383  * Name: CP51932
00384  * Link: http://search.cpan.org/src/NARUSE/Encode-EUCJPMS-0.07/ucm/cp51932.ucm
00385  * Link: http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
00386  * Link: http://msyk.at.webry.info/200511/article_2.html
00387  */
00388 ENC_REPLICATE("CP51932", "EUC-JP")
00389 

Generated on Wed Sep 8 2010 09:52:25 for Ruby by  doxygen 1.7.1