diff options
| author | engelsman <engelsman> | 2009-04-18 11:51:32 +0000 |
|---|---|---|
| committer | engelsman <engelsman> | 2009-04-18 11:51:32 +0000 |
| commit | 78da588135ba3704ef110486af0c80104166c056 (patch) | |
| tree | f10df628b6cba55d0e63dca27a809a43d7ef6311 | |
| parent | 982f297d334b2e0182e7ee2b380b51f698893125 (diff) | |
grouped similar functions and added summary information in unicode.dox
corrected mismatched parameter names and typos on fl_utf8.h and fl_utf8.cxx
git-svn-id: file:///fltk/svn/fltk/branches/branch-1.3@6769 ea41ed52-d2ee-0310-a9c1-e6b18d33e121
| -rw-r--r-- | FL/fl_utf8.h | 6 | ||||
| -rw-r--r-- | documentation/src/unicode.dox | 231 | ||||
| -rw-r--r-- | src/fl_utf8.cxx | 8 |
3 files changed, 212 insertions, 33 deletions
diff --git a/FL/fl_utf8.h b/FL/fl_utf8.h index fc1b653c3..3440bd727 100644 --- a/FL/fl_utf8.h +++ b/FL/fl_utf8.h @@ -107,16 +107,16 @@ FL_EXPORT int fl_utf8len(char c); FL_EXPORT int fl_utf_nb_char(const unsigned char *buf, int len); /* F2: Convert the next UTF8 char-sequence into a Unicode value (and say how many bytes were used) */ -FL_EXPORT unsigned fl_utf8decode(const char* start, const char* end, int* len); +FL_EXPORT unsigned fl_utf8decode(const char* p, const char* end, int* len); /* F2: Encode a Unicode value into a UTF8 sequence, return the number of bytes used */ FL_EXPORT int fl_utf8encode(unsigned ucs, char* buf); /* F2: Move forward to the next valid UTF8 sequence start betwen start and end */ -FL_EXPORT const char* fl_utf8fwd(const char* pos, const char* start, const char* end); +FL_EXPORT const char* fl_utf8fwd(const char* p, const char* start, const char* end); /* F2: Move backward to the previous valid UTF8 sequence start */ -FL_EXPORT const char* fl_utf8back(const char* pos, const char* start, const char* end); +FL_EXPORT const char* fl_utf8back(const char* p, const char* start, const char* end); /* F2: Convert a UTF8 string into UTF16 */ FL_EXPORT unsigned fl_utf8toUtf16(const char* src, unsigned srclen, unsigned short* dst, unsigned dstlen); diff --git a/documentation/src/unicode.dox b/documentation/src/unicode.dox index a3e05c21f..7346d246e 100644 --- a/documentation/src/unicode.dox +++ b/documentation/src/unicode.dox @@ -147,6 +147,10 @@ Unicode and UTF-8 in FLTK involves three important areas: The current implementation of Unicode / UTF-8 in FLTK will impose the following limitations: +- An implementation note in the code says that all functions are + LIMITED to 24 bit Unicode values, but also says that only 16 bits + are really used under linux and win32. 
+ - FLTK will only handle single characters, so composed characters consisting of a base character and floating accent characters will be treated as multiple characters; @@ -155,56 +159,229 @@ the following limitations: and not on a general Unicode character basis; - FLTK will not handle right-to-left or bi-directional text; + + \todo + Verify 16/24 bit Unicode limit for different character sets? + OksiD's code appears limited to 16-bit whereas the FLTK2 code + appears to handle a wider set. What about illegal characters? + See comments in fl_utf8fromwc() and fl_utf8toUtf16(). + \section unicode_fltk_calls FLTK Unicode and UTF8 functions -- unsigned int fl_nonspacing(unsigned int ucs) - \b OksiD -- int fl_tolower(unsigned int ucs) +This section currently provides a brief overview of the functions. +For more details, consult the main text for each function via its link. + +int fl_utf8locale() + \b FLTK2 + <br> +\par +\p %fl_utf8locale() returns true if the "locale" seems to indicate +that UTF-8 encoding is used. +\par +<i>It is highly recommended that you change your system so this does return +true!</i> + + +int fl_utf8test(const char *src, unsigned len) + \b FLTK2 + <br> +\par +\p %fl_utf8test() examines the first \p len bytes of \p src. +It returns 0 if there are any illegal UTF-8 sequences; +1 if \p src contains plain ASCII or if \p len is zero; +or 2, 3 or 4 to indicate the range of Unicode characters found. + + +int fl_utf_nb_char(const unsigned char *buf, int len) \b OksiD -- int fl_toupper(unsigned int ucs) + <br> +\par +Returns the number of UTF-8 characters in the first \p len bytes of \p buf. + + +int fl_unichar_to_utf8_size(Fl_Unichar) + <br> +int fl_utf8bytes(unsigned ucs) + <br> +\par +Returns the number of bytes needed to encode \p ucs in UTF-8. 
+ + +int fl_utf8len(char c) \b OksiD -- int fl_unichar_to_utf8_size(Fl_Unichar) -- char* fl_utf2mbcs (const char *src) + <br> +\par +If \p c is a valid first byte of a UTF-8 encoded character sequence, +\p %fl_utf8len() will return the number of bytes in that sequence. +It returns -1 if \p c is not a valid first byte. + + +unsigned int fl_nonspacing(unsigned int ucs) \b OksiD -- const char* fl_utf8back(const char *pos, const char *start, const char *end) + <br> +\par +Returns true if \p ucs is a non-spacing character. +<b>[What are non-spacing characters?]</b> + + +const char* fl_utf8back(const char *p, const char *start, const char *end) \b FLTK2 -- int fl_utf8bytes(unsigned ucs) -- unsigned int fl_utf8decode(const char *start, const char *end, int *len) + <br> +const char* fl_utf8fwd(const char *p, const char *start, const char *end) \b FLTK2 -- int fl_utf8encode(unsigned ucs, char *buf) -- unsigned int fl_utf8from_mb(char *dst, unsigned dstlen, const char *src, unsigned srclen) + <br> +\par +If \p p already points to the start of a UTF-8 character sequence, +these functions will return \p p. +Otherwise \p %fl_utf8back() searches backwards from \p p +and \p %fl_utf8fwd() searches forwards from \p p, +within the \p start and \p end limits, +looking for the start of a UTF-8 character. + + +unsigned int fl_utf8decode(const char *p, const char *end, int *len) \b FLTK2 -- unsigned int fl_utf8froma(char *dst, unsigned dstlen, const char *src, unsigned srclen) + <br> +int fl_utf8encode(unsigned ucs, char *buf) \b FLTK2 -- unsigned int fl_utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src, unsigned srclen) + <br> +\par +\p %fl_utf8decode() attempts to decode the UTF-8 character that starts +at \p p and may not extend past \p end. +It returns the Unicode value, and the length of the UTF-8 character sequence +is returned via the \p len argument. +\p %fl_utf8encode() writes the UTF-8 encoding of \p ucs into \p buf +and returns the number of bytes in the sequence. 
+See the main documentation for the treatment of illegal Unicode +and UTF-8 sequences. + + +unsigned int fl_utf8froma(char *dst, unsigned dstlen, const char *src, unsigned srclen) \b FLTK2 -- const char* fl_utf8fwd(const char *pos, const char *start, const char *end) + <br> +unsigned int fl_utf8toa(const char *src, unsigned srclen, char *dst, unsigned dstlen) \b FLTK2 -- int fl_utf8len(char c) -- int fl_utf8locale() + <br> +\par +\p %fl_utf8froma() converts a character string containing single bytes +per character (i.e. ASCII or ISO-8859-1) into UTF-8. +If the \p src string contains only ASCII characters, the return value will +be the same as \p srclen. +\par +\p %fl_utf8toa() converts a string containing UTF-8 characters into +single byte characters. UTF-8 characters that do not correspond to ASCII +or ISO-8859-1 characters below 0xFF are replaced with '?'. + +\par +Both functions return the number of bytes that would be written, not +counting the null terminator. +\p dstlen provides a means of limiting the number of bytes written, +so setting \p dstlen to zero is a means of measuring how much storage +would be needed before doing the real conversion. + + +char* fl_utf2mbcs(const char *src) + \b OksiD + <br> +\par +converts a UTF-8 string to a local multi-byte character string. 
+<b>[More info required here!]</b> + +unsigned int fl_utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src, unsigned srclen) \b FLTK2 -- int fl_utf8test(const char *src, unsigned len) + <br> +unsigned int fl_utf8towc(const char *src, unsigned srclen, wchar_t *dst, unsigned dstlen) \b FLTK2 -- unsigned int fl_utf8to_mb(const char *src, unsigned srclen, char *dst, unsigned dstlen) + <br> +unsigned int fl_utf8toUtf16(const char *src, unsigned srclen, unsigned short *dst, unsigned dstlen) \b FLTK2 -- unsigned int fl_utf8toa(const char *src, unsigned srclen, char *dst, unsigned dstlen) -- unsigned int fl_utf8toUtf16(const char *src, unsigned srclen, unsigned short *dst, unsigned dstlen) + <br> +\par +These routines convert between UTF-8 and \p wchar_t or "wide character" +strings. +The difficulty lies in the fact that \p sizeof(wchar_t) is 2 on Windows +and 4 on Linux and most other systems. +Therefore some "wide characters" on Windows may be represented +as "surrogate pairs" of more than one \p wchar_t. + +\par +\p %fl_utf8fromwc() converts from a "wide character" string to UTF-8. +Note that \p srclen is the number of \p wchar_t elements in the source +string, and on Windows this might be larger than the number of characters. +\p dstlen specifies the maximum number of \b bytes to copy, including +the null terminator. + +\par +\p %fl_utf8towc() converts a UTF-8 string into a "wide character" string. +Note that on Windows, some "wide characters" might result in "surrogate +pairs" and therefore the return value might be more than the number of +characters. +\p dstlen specifies the maximum number of \b wchar_t elements to copy, +including a zero terminating element. 
+<b>[Is this all worded correctly?]</b> + +\par +These routines all return the number of elements that would be required +for a full conversion of the \p src string, including the zero terminator. +Therefore setting \p dstlen to zero is a way of measuring how much storage +would be needed before doing the real conversion. + + +unsigned int fl_utf8from_mb(char *dst, unsigned dstlen, const char *src, unsigned srclen) \b FLTK2 -- unsigned int fl_utf8towc(const char *src, unsigned srclen, wchar_t *dst, unsigned dstlen) + <br> +unsigned int fl_utf8to_mb(const char *src, unsigned srclen, char *dst, unsigned dstlen) \b FLTK2 -- int fl_utf_nb_char(const unsigned char *buf, int len) + <br> +\par +These functions convert between UTF-8 and the locale-specific multi-byte +encodings used on some systems for filenames, etc. +If fl_utf8locale() returns true, these functions don't do anything useful. +<b>[Is this all worded correctly?]</b> + + +int fl_tolower(unsigned int ucs) \b OksiD -- int fl_utf_strcasecmp(const char *s1, const char *s2) + <br> +int fl_toupper(unsigned int ucs) \b OksiD -- int fl_utf_strncasecmp(const char *s1, const char *s2, int n) + <br> +int fl_utf_tolower(const unsigned char *str, int len, char *buf) \b OksiD -- int fl_utf_tolower(const unsigned char *str, int len, char *buf) + <br> +int fl_utf_toupper(const unsigned char *str, int len, char *buf) \b OksiD -- int fl_utf_toupper(const unsigned char *str, int len, char *buf) + <br> +\par +\p %fl_tolower() and \p %fl_toupper() convert a single Unicode character +from upper to lower case, and vice versa. +\p %fl_utf_tolower() and \p %fl_utf_toupper() convert a string of bytes, +some of which may be multi-byte UTF-8 encodings of Unicode characters, +from upper to lower case, and vice versa. 
+\par +Warning: to be safe, \p buf length must be at least \p 3*len +[for 16-bit Unicode] + + +int fl_utf_strcasecmp(const char *s1, const char *s2) \b OksiD -- int fl_utf8len(char c) + <br> +int fl_utf_strncasecmp(const char *s1, const char *s2, int n) \b OksiD + <br> +\par +\p %fl_utf_strcasecmp() is a UTF-8 aware string comparison function that +converts the strings to lower case Unicode as part of the comparison. +\p %fl_utf_strncasecmp() only compares the first \p n characters [bytes?] + \section unicode_system_calls FLTK Unicode versions of system calls diff --git a/src/fl_utf8.cxx b/src/fl_utf8.cxx index 83e70bca4..7634ca93b 100644 --- a/src/fl_utf8.cxx +++ b/src/fl_utf8.cxx @@ -111,7 +111,8 @@ Toupper( } /** - returns the byte length of the first UTF-8 char sequence or -1 is not valid. + return the byte length of the UTF-8 sequence with first byte \p c, + or -1 if \p c is not valid. */ int fl_utf8len(char c) { @@ -174,6 +175,7 @@ fl_utf_nb_char( UTF-8 aware strncasecmp - converts to lower case Unicode and tests. \todo Correct the incorrect logic where length of strings tested + \todo Clarify whether n means number of bytes, or characters. */ int fl_utf_strncasecmp(const char *s1, const char *s2, int n) { @@ -256,7 +258,7 @@ int fl_toupper(unsigned int ucs) /** converts the str string to the lower case equivalent into buf. - Warning: to be safe buf length must be at least 3 * len [for 24-bit Unicode] + Warning: to be safe buf length must be at least 3 * len [for 16-bit Unicode] */ int fl_utf_tolower(const unsigned char *str, int len, char *buf) { @@ -287,7 +289,7 @@ int fl_utf_tolower(const unsigned char *str, int len, char *buf) /** converts the str string to the upper case equivalent into buf. - Warning: to be safe buf length must be at least 3 * len [for 24-bit Unicode] + Warning: to be safe buf length must be at least 3 * len [for 16-bit Unicode] */ int fl_utf_toupper(const unsigned char *str, int len, char *buf) { |
