Rapicorn - Experimental UI Toolkit - Source Code 10.08.1
rapicornutf8.hh
Go to the documentation of this file.
00001 /* RapicornUtf8 - UTF-8 utilities
00002  * Copyright (C) 2006 Tim Janik
00003  *
00004  * This library is free software; you can redistribute it and/or
00005  * modify it under the terms of the GNU Lesser General Public
00006  * License as published by the Free Software Foundation; either
00007  * version 2.1 of the License, or (at your option) any later version.
00008  *
00009  * This library is distributed in the hope that it will be useful,
00010  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012  * Lesser General Public License for more details.
00013  *
00014  * A copy of the GNU Lesser General Public License should ship along
00015  * with this library; if not, see http://www.gnu.org/copyleft/.
00016  */
00017 #ifndef __RAPICORN_UTF8_HH__
00018 #define __RAPICORN_UTF8_HH__
00019 
00020 #include <rcore/utilities.hh>
00021 
00022 namespace Rapicorn {
00023 
00024 namespace Unichar {
00025 inline bool isvalid      (unichar uc) RAPICORN_CONST;
00026 bool        isalnum      (unichar uc) RAPICORN_CONST;
00027 bool        isalpha      (unichar uc) RAPICORN_CONST;
00028 bool        iscntrl      (unichar uc) RAPICORN_CONST;
00029 bool        isdigit      (unichar uc) RAPICORN_CONST;
00030 int         digit_value  (unichar uc) RAPICORN_CONST;
00031 bool        isgraph      (unichar uc) RAPICORN_CONST;
00032 bool        islower      (unichar uc) RAPICORN_CONST;
00033 unichar     tolower      (unichar uc) RAPICORN_CONST;
00034 bool        isprint      (unichar uc) RAPICORN_CONST;
00035 bool        ispunct      (unichar uc) RAPICORN_CONST;
00036 bool        isspace      (unichar uc) RAPICORN_CONST;
00037 bool        isupper      (unichar uc) RAPICORN_CONST;
00038 unichar     toupper      (unichar uc) RAPICORN_CONST;
00039 bool        isxdigit     (unichar uc) RAPICORN_CONST;
00040 int         xdigit_value (unichar uc) RAPICORN_CONST;
00041 bool        istitle      (unichar uc) RAPICORN_CONST;
00042 unichar     totitle      (unichar uc) RAPICORN_CONST;
00043 bool        isdefined    (unichar uc) RAPICORN_CONST;
00044 bool        iswide       (unichar uc) RAPICORN_CONST;
00045 bool        iswide_cjk   (unichar uc) RAPICORN_CONST;
00046 typedef enum {
00047   CONTROL,              FORMAT,                 UNASSIGNED,
00048   PRIVATE_USE,          SURROGATE,              LOWERCASE_LETTER,
00049   MODIFIER_LETTER,      OTHER_LETTER,           TITLECASE_LETTER,
00050   UPPERCASE_LETTER,     COMBINING_MARK,         ENCLOSING_MARK,
00051   NON_SPACING_MARK,     DECIMAL_NUMBER,         LETTER_NUMBER,
00052   OTHER_NUMBER,         CONNECT_PUNCTUATION,    DASH_PUNCTUATION,
00053   CLOSE_PUNCTUATION,    FINAL_PUNCTUATION,      INITIAL_PUNCTUATION,
00054   OTHER_PUNCTUATION,    OPEN_PUNCTUATION,       CURRENCY_SYMBOL,
00055   MODIFIER_SYMBOL,      MATH_SYMBOL,            OTHER_SYMBOL,
00056   LINE_SEPARATOR,       PARAGRAPH_SEPARATOR,    SPACE_SEPARATOR
00057 } Type;
00058 Type    get_type     (unichar uc) RAPICORN_CONST;
00059 typedef enum {
00060   BREAK_MANDATORY,        BREAK_CARRIAGE_RETURN,    BREAK_LINE_FEED,
00061   BREAK_COMBINING_MARK,   BREAK_SURROGATE,          BREAK_ZERO_WIDTH_SPACE,
00062   BREAK_INSEPARABLE,      BREAK_NON_BREAKING_GLUE,  BREAK_CONTINGENT,
00063   BREAK_SPACE,            BREAK_AFTER,              BREAK_BEFORE,
00064   BREAK_BEFORE_AND_AFTER, BREAK_HYPHEN,             BREAK_NON_STARTER,
00065   BREAK_OPEN_PUNCTUATION, BREAK_CLOSE_PUNCTUATION,  BREAK_QUOTATION,
00066   BREAK_EXCLAMATION,      BREAK_IDEOGRAPHIC,        BREAK_NUMERIC,
00067   BREAK_INFIX_SEPARATOR,  BREAK_SYMBOL,             BREAK_ALPHABETIC,
00068   BREAK_PREFIX,           BREAK_POSTFIX,            BREAK_COMPLEX_CONTEXT,
00069   BREAK_AMBIGUOUS,        BREAK_UNKNOWN,            BREAK_NEXT_LINE,
00070   BREAK_WORD_JOINER,      BREAK_HANGUL_L_JAMO,      BREAK_HANGUL_V_JAMO,
00071   BREAK_HANGUL_T_JAMO,    BREAK_HANGUL_LV_SYLLABLE, BREAK_HANGUL_LVT_SYLLABLE
00072 } BreakType;
00073 BreakType get_break  (unichar uc) RAPICORN_CONST;
00074 
00075 } // Unichar
00076 
00077 /* --- UTF-8 movement --- */
00078 inline const char*    utf8_next         (const char     *c);
00079 inline char*          utf8_next         (char           *c);
00080 inline const char*    utf8_prev         (const char     *c);
00081 inline char*          utf8_prev         (char           *c);
00082 inline const char*    utf8_find_next    (const char     *c,
00083                                          const char     *bound = NULL);
00084 inline char*          utf8_find_next    (char           *current,
00085                                          const char     *bound = NULL);
00086 inline const char*    utf8_find_prev    (const char     *start,
00087                                          const char     *current);
00088 inline char*          utf8_find_prev    (const char     *start,
00089                                          char           *currrent);
00090 inline const char*    utf8_align        (const char     *start,
00091                                          const char     *current);
00092 inline char*          utf8_align        (const char     *start,
00093                                          char           *current);
00094 inline bool           utf8_aligned      (const char     *c);
00095 unichar               utf8_to_unichar   (const char     *str);
00096 int                   utf8_from_unichar (unichar         uc,
00097                                          char            str[8]);
00098 bool                  utf8_validate     (const String   &string,
00099                                          int            *bound = NULL);
00100 
00101 /* --- implementation bits --- */
00102 namespace Unichar {
00103 inline bool
00104 isvalid (unichar uc)
00105 {
00106   if (RAPICORN_UNLIKELY (uc > 0xfdcf && uc < 0xfdf0))
00107     return false;
00108   if (RAPICORN_UNLIKELY ((uc & 0xfffe) == 0xfffe))
00109     return false;
00110   if (RAPICORN_UNLIKELY (uc > 0x10ffff))
00111     return false;
00112   if (RAPICORN_UNLIKELY ((uc & 0xfffff800) == 0xd800))
00113     return false;
00114   return true;
00115 }
00116 } // Unichar
00117 
00118 extern const int8 utf8_skip_table[256];
00119 
00120 inline const char*
00121 utf8_next (const char *c)
00122 {
00123   return c + utf8_skip_table[(uint8) *c];
00124 }
00125 
00126 inline char*
00127 utf8_next (char *c)
00128 {
00129   return c + utf8_skip_table[(uint8) *c];
00130 }
00131 
/* Walk backward from c (exclusive) to the closest byte whose top two bits
 * are not 10, and return its address. At least one step is always taken and
 * there is no lower bound, so the caller must guarantee such a byte exists.
 */
inline const char*
utf8_prev (const char *c)
{
  --c;
  while ((*c & 0xc0) == 0x80)
    --c;
  return c;
}
00140 
/* Mutable-pointer variant: walk backward from c (exclusive) to the closest
 * byte whose top two bits are not 10. No lower bound is checked.
 */
inline char*
utf8_prev (char *c)
{
  for (--c; (*c & 0xc0) == 0x80; --c)
    ; /* skip bytes of the form 10xxxxxx */
  return c;
}
00149 
/* Search forward from c for the next byte that is not of the form 10xxxxxx.
 * If the byte at c is 0 the position is not advanced at all.
 * A NULL bound disables the limit; otherwise the scan stops at bound and
 * NULL is returned when the resulting position is not below bound.
 */
inline const char*
utf8_find_next (const char *c,
                const char *bound)
{
  const bool unbounded = !bound;
  if (*c != 0)
    {
      ++c; /* always leave the current byte */
      while ((unbounded || c < bound) && (*c & 0xc0) == 0x80)
        ++c;
    }
  if (unbounded || c < bound)
    return c;
  return NULL;
}
00160 
00161 inline char*
00162 utf8_find_next (char       *c,
00163                 const char *bound)
00164 {
00165   return const_cast<char*> (utf8_find_next (const_cast<const char*> (c), bound));
00166 }
00167 
/* Search backward from current (exclusive) for a byte that is not of the
 * form 10xxxxxx, never looking below start.
 * Returns the address of that byte, or NULL when no such byte exists in
 * [start, current) - which includes the case current <= start.
 */
inline const char*
utf8_find_prev (const char     *start,
                const char     *current)
{
  /* Compare before decrementing so the pointer never drops below start:
   * forming a pointer in front of the first array element is undefined
   * behaviour, even if it is never dereferenced.
   */
  while (current > start)
    {
      --current;
      if ((*current & 0xc0) != 0x80)
        return current;
    }
  return NULL;
}
00177 
00178 inline char*
00179 utf8_find_prev (const char     *start,
00180                 char           *current)
00181 {
00182   return const_cast<char*> (utf8_find_prev (start, const_cast<const char*> (current)));
00183 }
00184 
/* Move current backward until it points at a byte that is not of the form
 * 10xxxxxx, or until it reaches start, whichever comes first.
 * The byte at start itself is never inspected.
 */
inline const char*
utf8_align (const char     *start,
            const char     *current)
{
  for (; current > start; --current)
    if ((*current & 0xc0) != 0x80)
      break;
  return current;
}
00193 
00194 inline char*
00195 utf8_align (const char *start,
00196             char       *current)
00197 {
00198   return const_cast<char*> (utf8_align (start, const_cast<const char*> (current)));
00199 }
00200 
/* Tell whether c points at the start of a character, i.e. at a byte that is
 * not a UTF-8 continuation byte (10xxxxxx). A 0 byte counts as aligned.
 *
 * The previous implementation tested "== 0x80" and therefore returned true
 * exactly when c was NOT aligned; that contradicted utf8_align(), which
 * stops at a byte that is not of the form 10xxxxxx.
 */
inline bool
utf8_aligned (const char *c)
{
  return (*c & 0xc0) != 0x80;
}
00206 
00207 } // Rapicorn
00208 
00209 #endif /* __RAPICORN_UTF8_HH__ */
00210 /* vim:set ts=8 sts=2 sw=2: */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines