|
Rapicorn - Experimental UI Toolkit - Source Code 10.08.1
|
00001 /* RapicornUtf8 - UTF-8 utilities 00002 * Copyright (C) 2006 Tim Janik 00003 * 00004 * This library is free software; you can redistribute it and/or 00005 * modify it under the terms of the GNU Lesser General Public 00006 * License as published by the Free Software Foundation; either 00007 * version 2.1 of the License, or (at your option) any later version. 00008 * 00009 * This library is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00012 * Lesser General Public License for more details. 00013 * 00014 * A copy of the GNU Lesser General Public License should ship along 00015 * with this library; if not, see http://www.gnu.org/copyleft/. 00016 */ 00017 #ifndef __RAPICORN_UTF8_HH__ 00018 #define __RAPICORN_UTF8_HH__ 00019 00020 #include <rcore/utilities.hh> 00021 00022 namespace Rapicorn { 00023 00024 namespace Unichar { 00025 inline bool isvalid (unichar uc) RAPICORN_CONST; 00026 bool isalnum (unichar uc) RAPICORN_CONST; 00027 bool isalpha (unichar uc) RAPICORN_CONST; 00028 bool iscntrl (unichar uc) RAPICORN_CONST; 00029 bool isdigit (unichar uc) RAPICORN_CONST; 00030 int digit_value (unichar uc) RAPICORN_CONST; 00031 bool isgraph (unichar uc) RAPICORN_CONST; 00032 bool islower (unichar uc) RAPICORN_CONST; 00033 unichar tolower (unichar uc) RAPICORN_CONST; 00034 bool isprint (unichar uc) RAPICORN_CONST; 00035 bool ispunct (unichar uc) RAPICORN_CONST; 00036 bool isspace (unichar uc) RAPICORN_CONST; 00037 bool isupper (unichar uc) RAPICORN_CONST; 00038 unichar toupper (unichar uc) RAPICORN_CONST; 00039 bool isxdigit (unichar uc) RAPICORN_CONST; 00040 int xdigit_value (unichar uc) RAPICORN_CONST; 00041 bool istitle (unichar uc) RAPICORN_CONST; 00042 unichar totitle (unichar uc) RAPICORN_CONST; 00043 bool isdefined (unichar uc) RAPICORN_CONST; 00044 bool iswide (unichar uc) RAPICORN_CONST; 00045 bool iswide_cjk (unichar uc) RAPICORN_CONST; 00046 typedef enum { 00047 CONTROL, FORMAT, UNASSIGNED, 00048 PRIVATE_USE, SURROGATE, LOWERCASE_LETTER, 00049 MODIFIER_LETTER, OTHER_LETTER, TITLECASE_LETTER, 00050 UPPERCASE_LETTER, COMBINING_MARK, ENCLOSING_MARK, 00051 NON_SPACING_MARK, DECIMAL_NUMBER, LETTER_NUMBER, 00052 OTHER_NUMBER, CONNECT_PUNCTUATION, DASH_PUNCTUATION, 00053 CLOSE_PUNCTUATION, FINAL_PUNCTUATION, INITIAL_PUNCTUATION, 00054 OTHER_PUNCTUATION, OPEN_PUNCTUATION, CURRENCY_SYMBOL, 00055 MODIFIER_SYMBOL, MATH_SYMBOL, OTHER_SYMBOL, 00056 LINE_SEPARATOR, PARAGRAPH_SEPARATOR, SPACE_SEPARATOR 00057 } Type; 00058 Type get_type (unichar uc) RAPICORN_CONST; 00059 typedef enum { 00060 BREAK_MANDATORY, BREAK_CARRIAGE_RETURN, BREAK_LINE_FEED, 00061 BREAK_COMBINING_MARK, BREAK_SURROGATE, BREAK_ZERO_WIDTH_SPACE, 00062 BREAK_INSEPARABLE, BREAK_NON_BREAKING_GLUE, BREAK_CONTINGENT, 00063 BREAK_SPACE, BREAK_AFTER, BREAK_BEFORE, 00064 BREAK_BEFORE_AND_AFTER, BREAK_HYPHEN, BREAK_NON_STARTER, 00065 BREAK_OPEN_PUNCTUATION, BREAK_CLOSE_PUNCTUATION, BREAK_QUOTATION, 00066 BREAK_EXCLAMATION, BREAK_IDEOGRAPHIC, BREAK_NUMERIC, 00067 BREAK_INFIX_SEPARATOR, BREAK_SYMBOL, BREAK_ALPHABETIC, 00068 BREAK_PREFIX, BREAK_POSTFIX, BREAK_COMPLEX_CONTEXT, 00069 BREAK_AMBIGUOUS, BREAK_UNKNOWN, BREAK_NEXT_LINE, 00070 BREAK_WORD_JOINER, BREAK_HANGUL_L_JAMO, BREAK_HANGUL_V_JAMO, 00071 BREAK_HANGUL_T_JAMO, BREAK_HANGUL_LV_SYLLABLE, BREAK_HANGUL_LVT_SYLLABLE 00072 } BreakType; 00073 BreakType get_break (unichar uc) RAPICORN_CONST; 00074 00075 } // Unichar 00076 00077 /* --- UTF-8 movement --- */ 00078 inline const char* utf8_next (const char *c); 00079 inline char* utf8_next (char *c); 00080 inline const char* utf8_prev (const char *c); 00081 inline char* utf8_prev (char *c); 00082 inline const char* utf8_find_next (const char *c, 00083 const char *bound = NULL); 00084 inline char* utf8_find_next (char *current, 00085 const char *bound = NULL); 00086 inline const char* utf8_find_prev (const char *start, 00087 const char *current); 00088 inline char* utf8_find_prev (const char *start, 00089 char *currrent); 00090 inline const char* utf8_align (const char *start, 00091 const char *current); 00092 inline char* utf8_align (const char *start, 00093 char *current); 00094 inline bool utf8_aligned (const char *c); 00095 unichar utf8_to_unichar (const char *str); 00096 int utf8_from_unichar (unichar uc, 00097 char str[8]); 00098 bool utf8_validate (const String &string, 00099 int *bound = NULL); 00100 00101 /* --- implementation bits --- */ 00102 namespace Unichar { 00103 inline bool 00104 isvalid (unichar uc) 00105 { 00106 if (RAPICORN_UNLIKELY (uc > 0xfdcf && uc < 0xfdf0)) 00107 return false; 00108 if (RAPICORN_UNLIKELY ((uc & 0xfffe) == 0xfffe)) 00109 return false; 00110 if (RAPICORN_UNLIKELY (uc > 0x10ffff)) 00111 return false; 00112 if (RAPICORN_UNLIKELY ((uc & 0xfffff800) == 0xd800)) 00113 return false; 00114 return true; 00115 } 00116 } // Unichar 00117 00118 extern const int8 utf8_skip_table[256]; 00119 00120 inline const char* 00121 utf8_next (const char *c) 00122 { 00123 return c + utf8_skip_table[(uint8) *c]; 00124 } 00125 00126 inline char* 00127 utf8_next (char *c) 00128 { 00129 return c + utf8_skip_table[(uint8) *c]; 00130 } 00131 00132 inline const char* 00133 utf8_prev (const char *c) 00134 { 00135 do 00136 c--; 00137 while ((*c & 0xc0) == 0x80); 00138 return c; 00139 } 00140 00141 inline char* 00142 utf8_prev (char *c) 00143 { 00144 do 00145 c--; 00146 while ((*c & 0xc0) == 0x80); 00147 return c; 00148 } 00149 00150 inline const char* 00151 utf8_find_next (const char *c, 00152 const char *bound) 00153 { 00154 if (*c) 00155 do 00156 c++; 00157 while ((!bound || c < bound) && (*c & 0xc0) == 0x80); 00158 return !bound || c < bound ? c : NULL; 00159 } 00160 00161 inline char* 00162 utf8_find_next (char *c, 00163 const char *bound) 00164 { 00165 return const_cast<char*> (utf8_find_next (const_cast<const char*> (c), bound)); 00166 } 00167 00168 inline const char* 00169 utf8_find_prev (const char *start, 00170 const char *current) 00171 { 00172 do 00173 current--; 00174 while (current >= start && (*current & 0xc0) == 0x80); 00175 return current >= start ? current : NULL; 00176 } 00177 00178 inline char* 00179 utf8_find_prev (const char *start, 00180 char *current) 00181 { 00182 return const_cast<char*> (utf8_find_prev (start, const_cast<const char*> (current))); 00183 } 00184 00185 inline const char* 00186 utf8_align (const char *start, 00187 const char *current) 00188 { 00189 while (current > start && (*current & 0xc0) == 0x80) 00190 current--; 00191 return current; 00192 } 00193 00194 inline char* 00195 utf8_align (const char *start, 00196 char *current) 00197 { 00198 return const_cast<char*> (utf8_align (start, const_cast<const char*> (current))); 00199 } 00200 00201 inline bool 00202 utf8_aligned (const char *c) 00203 { 00204 return (*c & 0xc0) == 0x80; 00205 } 00206 00207 } // Rapicorn 00208 00209 #endif /* __RAPICORN_UTF8_HH__ */ 00210 /* vim:set ts=8 sts=2 sw=2: */
1.7.4