From 418689548fe72cbbb0214d72de732e80b53ee465 Mon Sep 17 00:00:00 2001 From: Matthias Melcher Date: Sat, 1 Nov 2025 02:12:32 +0100 Subject: Add more Unicode documentation (2/2) (#125) --- src/fl_utf8.cxx | 196 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 109 insertions(+), 87 deletions(-) (limited to 'src') diff --git a/src/fl_utf8.cxx b/src/fl_utf8.cxx index c7409acae..3fa29aeab 100644 --- a/src/fl_utf8.cxx +++ b/src/fl_utf8.cxx @@ -894,8 +894,10 @@ static unsigned short cp1252[32] = { }; #endif -/** Decode a single UTF-8 encoded character starting at \e p. The - resulting Unicode value (in the range 0-0x10ffff) is returned, +/** + Decode a single UTF-8 encoded character starting at \e p. + + The resulting Unicode value (in the range 0-0x10ffff) is returned, and \e len is set to the number of bytes in the UTF-8 encoding (adding \e len to \e p will point at the next character). @@ -924,6 +926,11 @@ static unsigned short cp1252[32] = { Direct testing for the 1-byte case (as shown above) will also speed up the scanning of strings where the majority of characters are ASCII. + + \param[in] p pointer to a UTF-8 encoded character + \param[in] end if set, points after the last character that may be read + \param[out] len if set, returns the length of the input UTF-8 sequence + \return 32 bit Unicode character, or Unicode REPLACEMENT CHARACTER */ unsigned fl_utf8decode(const char* p, const char* end, int* len) { @@ -1004,18 +1011,20 @@ unsigned fl_utf8decode(const char* p, const char* end, int* len) is returned unchanged. Any UTF-8 errors are treated as though each byte of the error is an individual character. - \e start is the start of the string and is used to limit the - backwards search for the start of a UTF-8 character. - - \e end is the end of the string and is assumed to be a break - between characters. It is assumed to be greater than p. 
- This function is for moving a pointer that was jumped to the middle of a string, such as when doing a binary search for a position. You should use either this or fl_utf8back() depending on which direction your algorithm can handle the pointer moving. Do not use this to scan strings, use fl_utf8decode() instead. + + \param[in] p points somewhere into a UTF-8 encoded string, need not be on + a UTF-8 sequence start or end. + \param[in] start is the start of the string and is used to limit the + backwards search for the start of a UTF-8 character. + \param[in] end is the end of the string and is assumed to be a break + between characters. It is assumed to be greater than p. + \return pointer to the start of a UTF-8 sequence or pointer to terminating NUL. */ const char* fl_utf8fwd(const char* p, const char* start, const char* end) { @@ -1040,13 +1049,14 @@ const char* fl_utf8fwd(const char* p, const char* start, const char* end) is returned unchanged. Any UTF-8 errors are treated as though each byte of the error is an individual character. - \e start is the start of the string and is used to limit the - backwards search for the start of a UTF-8 character. - - \e end is the end of the string and is assumed to be a break - between characters. It is assumed to be greater than p. - - If you wish to decrement a UTF-8 pointer, pass p-1 to this. + \param[in] p points somewhere into a UTF-8 encoded string, need not be on + a UTF-8 sequence start or end. If you wish to decrement a UTF-8 pointer, + pass p-1 to this. + \param[in] start is the start of the string and is used to limit the + backwards search for the start of a UTF-8 character. + \param[in] end is the end of the string and is assumed to be a break + between characters. It is assumed to be greater than p. + \return pointer to the start of a UTF-8 sequence. 
*/ const char* fl_utf8back(const char* p, const char* start, const char* end) { @@ -1067,6 +1077,9 @@ const char* fl_utf8back(const char* p, const char* start, const char* end) /** Returns number of bytes that utf8encode() will use to encode the character \p ucs. + + \param[in] ucs 32 bit Unicode character + \return number of bytes for UTF-8 encoded sequence. */ int fl_utf8bytes(unsigned ucs) { if (ucs < 0x000080U) { @@ -1097,6 +1110,11 @@ int fl_utf8bytes(unsigned ucs) { 0xffff). However I encode these as though they are legal, so that utf8encode/fl_utf8decode will be the identity for all codes between 0 and 0x10ffff. + + \param[in] ucs 32 bit Unicode character + \param[out] buf a buffer of at least four bytes to receive the UTF-8 byte + sequence. No terminating NUL is added. + \return number of bytes in UTF-8 sequence. */ int fl_utf8encode(unsigned ucs, char* buf) { if (ucs < 0x000080U) { @@ -1129,29 +1147,27 @@ int fl_utf8encode(unsigned ucs, char* buf) { /** Convert a single 32-bit Unicode codepoint into an array of 16-bit characters. These are used by some system calls, especially on Windows. - \p ucs is the value to convert. - - \p dst points at an array to write, and \p dstlen is the number of - locations in this array. At most \p dstlen words will be - written, and a 0 terminating word will be added if \p dstlen is - large enough. Thus this function will never overwrite the buffer - and will attempt return a zero-terminated string if space permits. - If \p dstlen is zero then \p dst can be set to NULL and no data - is written, but the length is returned. - - The return value is the number of 16-bit words that \e would be written - to \p dst if it is large enough, not counting any terminating - zero. - - If the return value is greater than \p dstlen it indicates truncation, - you should then allocate a new array of size return+1 and call this again. 
- Unicode characters in the range 0x10000 to 0x10ffff are converted to "surrogate pairs" which take two words each (in UTF-16 encoding). Typically, setting \p dstlen to 2 will ensure that any valid Unicode value can be converted, and setting \p dstlen to 3 or more will allow a NULL terminated sequence to be returned. -*/ + + \param[in] ucs is the value to convert. + \param[out] dst points at an array to write, and + \param[in] dstlen is the number of + locations in this array. At most \p dstlen words will be + written, and a 0 terminating word will be added if \p dstlen is + large enough. Thus this function will never overwrite the buffer + and will attempt to return a zero-terminated string if space permits. + If \p dstlen is zero then \p dst can be set to NULL and no data + is written, but the length is returned. + \return The return value is the number of 16-bit words that \e would be + written to \p dst if it is large enough, not counting any terminating + zero. If the return value is greater than \p dstlen it indicates + truncation, you should then allocate a new array of size return+1 + and call this again. + */ unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const unsigned dstlen) { /* The rule for direct conversion from UCS to UTF16 is: @@ -1196,22 +1212,6 @@ unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const unsigned /** Convert a UTF-8 sequence into an array of 16-bit characters. These are used by some system calls, especially on Windows. - \p src points at the UTF-8, and \p srclen is the number of bytes to - convert. - - \p dst points at an array to write, and \p dstlen is the number of - locations in this array. At most \p dstlen-1 words will be - written there, plus a 0 terminating word. Thus this function - will never overwrite the buffer and will always return a - zero-terminated string. If \p dstlen is zero then \p dst can be - null and no data is written, but the length is returned. 
- - The return value is the number of 16-bit words that \e would be written - to \p dst if it were long enough, not counting the terminating - zero. If the return value is greater or equal to \p dstlen it - indicates truncation, you can then allocate a new array of size - return+1 and call this again. - Errors in the UTF-8 are converted as though each byte in the erroneous string is in the Microsoft CP1252 encoding. This allows ISO-8859-1 text mistakenly identified as UTF-8 to be printed @@ -1220,6 +1220,21 @@ unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const unsigned Unicode characters in the range 0x10000 to 0x10ffff are converted to "surrogate pairs" which take two words each (this is called UTF-16 encoding). + + \param[in] src points at the UTF-8, and + \param[in] srclen is the number of bytes to convert. + \param[out] dst points at an array to write, and + \param[in] dstlen is the number of + locations in this array. At most \p dstlen-1 words will be + written there, plus a 0 terminating word. Thus this function + will never overwrite the buffer and will always return a + zero-terminated string. If \p dstlen is zero then \p dst can be + null and no data is written, but the length is returned. + \return The return value is the number of 16-bit words that \e would be + written to \p dst if it were long enough, not counting the terminating + zero. If the return value is greater or equal to \p dstlen it + indicates truncation, you can then allocate a new array of size + return+1 and call this again. */ unsigned fl_utf8toUtf16(const char* src, unsigned srclen, unsigned short* dst, unsigned dstlen) @@ -1268,16 +1283,16 @@ unsigned fl_utf8toUtf16(const char* src, unsigned srclen, fl_utf8decode() does. This allows ISO-8859-1 text mistakenly identified as UTF-8 to be printed correctly (and possibly CP1252 on Windows). - \p src points at the UTF-8 sequence, and \p srclen is the number of - bytes to convert. 
- - Up to \p dstlen bytes are written to \p dst, including a null - terminator. The return value is the number of bytes that would be - written, not counting the null terminator. If greater or equal to - \p dstlen then if you malloc a new array of size n+1 you will have - the space needed for the entire string. If \p dstlen is zero then - nothing is written and this call just measures the storage space - needed. + \param[in] src points at the UTF-8 sequence, and + \param[in] srclen is the number of bytes to convert. + \param[out] dst Up to \p dstlen bytes are written to \p dst, including a null + terminator. The return value is the number of bytes that would be + written, not counting the null terminator. If greater or equal to... + \param[in] dstlen then if you malloc a new array of size n+1 you will have + the space needed for the entire string. If \p dstlen is zero then + nothing is written and this call just measures the storage space + needed. + \return number of characters converted. */ unsigned fl_utf8toa(const char* src, unsigned srclen, char* dst, unsigned dstlen) @@ -1320,19 +1335,18 @@ unsigned fl_utf8toa(const char* src, unsigned srclen, instead. This would translate the codes in the range 0x80-0x9f to different characters. Currently it does not do this. - Up to \p dstlen bytes are written to \p dst, including a null - terminator. The return value is the number of bytes that would be - written, not counting the null terminator. If greater or equal to - \p dstlen then if you malloc a new array of size n+1 you will have - the space needed for the entire string. If \p dstlen is zero then - nothing is written and this call just measures the storage space - needed. - - \p srclen is the number of bytes in \p src to convert. - - If the return value equals \p srclen then this indicates that - no conversion is necessary, as only ASCII characters are in the - string. + \param[out] dst Up to \p dstlen bytes are written to \p dst, including a null + terminator. 
The return value is the number of bytes that would be + written, not counting the null terminator. If greater or equal to... + \param[in] dstlen then if you malloc a new array of size n+1 you will have + the space needed for the entire string. If \p dstlen is zero then + nothing is written and this call just measures the storage space + needed. + \param[in] src pointer to ISO-8859-1 string. + \param[in] srclen is the number of bytes in \p src to convert. + \return Number of bytes that would be written, not counting the null terminator. If the return value equals \p srclen then + this indicates that no conversion is necessary, as only ASCII characters + are in the string. */ unsigned fl_utf8froma(char* dst, unsigned dstlen, const char* src, unsigned srclen) { @@ -1384,6 +1398,10 @@ unsigned fl_utf8froma(char* dst, unsigned dstlen, if it is UTF-8 or in the locale encoding. My hope is that if this is done we will be able to cleanly transition to a locale-less encoding. + + \param[in] src pointer to string of unknown encoding + \param[in] srclen number of bytes to compare, must not be -1 + \return 0 if this is probably not a UTF-8 encoded string */ int fl_utf8test(const char* src, unsigned srclen) { int ret = 1; @@ -1455,19 +1473,6 @@ int fl_wcwidth(const char* src) { on Windows where it is equivalent to fl_utf8toUtf16 and returns UTF-16. - \p src points at the UTF-8, and \p srclen is the number of bytes to - convert. - - \p dst points at an array to write, and \p dstlen is the number of - locations in this array. At most \p dstlen-1 wchar_t will be - written there, plus a 0 terminating wchar_t. - - The return value is the number of wchar_t that \e would be written - to \p dst if it were long enough, not counting the terminating - zero. If the return value is greater or equal to \p dstlen it - indicates truncation, you can then allocate a new array of size - return+1 and call this again. - Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux and most other systems. 
Where wchar_t is 16 bits, Unicode characters in the range 0x10000 to 0x10ffff are converted to @@ -1475,8 +1480,19 @@ int fl_wcwidth(const char* src) { encoding). If wchar_t is 32 bits this rather nasty problem is avoided. - Note that Windows includes Cygwin, i.e. compiled with Cygwin's POSIX + \note Windows includes Cygwin, i.e. compiled with Cygwin's POSIX layer (cygwin1.dll, --enable-cygwin), either native (GDI) or X11. + + \param[in] src points at the UTF-8, and + \param[in] srclen is the number of bytes to convert. + \param[out] dst points at an array to write, and \p dstlen is the number of + locations in this array. At most \p dstlen-1 wchar_t will be + written there, plus a 0 terminating wchar_t. + \return The return value is the number of wchar_t that \e would be written + to \p dst if it were long enough, not counting the terminating + zero. If the return value is greater or equal to \p dstlen it + indicates truncation, you can then allocate a new array of size + return+1 and call this again. */ unsigned fl_utf8towc(const char* src, unsigned srclen, wchar_t* dst, unsigned dstlen) @@ -1511,6 +1527,12 @@ unsigned fl_utf8towc(const char* src, unsigned srclen, On Windows "surrogate pairs" are converted to a single character and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate pairs are converted as though they are individual characters. + + \param[out] dst a destination buffer provided by the caller + \param[in] dstlen size of dst buffer + \param[in] src pointer to Windows wide char string + \param[in] srclen number of characters to convert + \return number of bytes written, not including the terminating NUL */ unsigned fl_utf8fromwc(char* dst, unsigned dstlen, const wchar_t* src, unsigned srclen) { @@ -1522,7 +1544,7 @@ unsigned fl_utf8fromwc(char* dst, unsigned dstlen, const wchar_t* src, unsigned is used. If true the fl_utf8to_mb and fl_utf8from_mb don't do anything useful. 
- It is highly recommended that you change your system so this + \note It is highly recommended that you change your system so this does return true. On Windows this is done by setting the "codepage" to CP_UTF8. On Unix this is done by setting $LC_CTYPE to a string containing the letters "utf" or "UTF" in it, or by -- cgit v1.2.3