From bed38ba3f5ba653ab41bd5abb92c923591442c8f Mon Sep 17 00:00:00 2001 From: ManoloFLTK <41016272+ManoloFLTK@users.noreply.github.com> Date: Thu, 29 Jan 2026 13:04:35 +0100 Subject: Emojis: add support of keycap emoji sequences. - It is expected that all emojis listed in the Wikipedia "emoji" article as of early 2026 are recognized as single glyphs by FLTK text widgets. - Document functions fl_utf8_{next|previous}_composed_char() relatively to the notion of "emoji sequence". - Remove signed/unsigned comparison compilation warnings. --- src/Fl_Text_Buffer.cxx | 2 +- src/fl_utf8.cxx | 54 +++++++++++++++++++++++++++++--------------------- 2 files changed, 32 insertions(+), 24 deletions(-) (limited to 'src') diff --git a/src/Fl_Text_Buffer.cxx b/src/Fl_Text_Buffer.cxx index 848cacd9a..81fa36d55 100644 --- a/src/Fl_Text_Buffer.cxx +++ b/src/Fl_Text_Buffer.cxx @@ -2127,7 +2127,7 @@ int Fl_Text_Buffer::prev_char(int pos) const int Fl_Text_Buffer::next_char(int pos) const { IS_UTF8_ALIGNED2(this, (pos)) - int l = fl_utf8len1(byte_at(pos)); + unsigned l = fl_utf8len1(byte_at(pos)); if (l > 2) { // test for composed character only if pos is at long codepoint int p = pos, ll, b; char t[40]; // crazyest composed characters I know use 28 bytes in UTF8 (e.g., 🏴󠁧󠁢󠁷󠁬󠁳󠁿) diff --git a/src/fl_utf8.cxx b/src/fl_utf8.cxx index 157f1dc54..62d393094 100644 --- a/src/fl_utf8.cxx +++ b/src/fl_utf8.cxx @@ -1637,32 +1637,36 @@ unsigned fl_utf8from_mb(char* dst, unsigned dstlen, const char* src, unsigned sr /** - Returns pointer to beginning of character after given location in UTF8 string accounting for composed characters. - Some unicode characters (examples: 👩‍✈️ "woman pilot", 🇸🇲 "San Marino flag") are composed of several unicode points. - They may pair two successive codepoints with U+200D (zero-width joiner) and may qualify any component with - variation selectors or Fitzpatrick emoji modifiers. Most flag emojis are composed of two successive - "regional indicator symbols", each in range [U+1F1E6 , U+1F1FF]. + Returns pointer to beginning of character after given location in UTF8 string accounting for emoji sequences. + Unicode encodes some emojis (examples: 👩‍✈️ "woman pilot", 🇸🇲 "San Marino flag", 9️⃣ "keycap 9") + via an emoji sequence, that is, they are represented by sequences of consecutive unicode points. + An emoji sequence may pair two successive codepoints with "zero-width joiner" and may qualify + any component with "variation selectors" or "Fitzpatrick emoji modifiers". Most flag emojis are encoded with two successive + "regional indicator symbols". Keycap emojis are encoded with key + "emoji variation selector" + "combining enclosing keycap". \param from points to a location within a UTF8 string. If this location is inside the UTF8 encoding of a codepoint or is an invalid byte, this function returns \p from + 1. \param end points past last codepoint of the string. - \return pointer to beginning of first codepoint after possibly composed character that begins at \p from. + \return pointer to beginning of first codepoint after character, possibly an emoji sequence, that begins at \p from. */ const char *fl_utf8_next_composed_char(const char *from, const char *end) { - int skip = fl_utf8len1(*from); - if (skip <= 2) return from + skip; - unsigned u = fl_utf8decode(from, end, NULL); - if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 1st regional indicator symbol can be a flag - u = fl_utf8decode(from + skip, end, NULL); - if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 2nd regional indicator symbol gives a flag - return from + 2 * skip; + int skip = fl_utf8len(*from); + if (skip == -1) return from + 1; + unsigned u; + if (skip >= 4) { + u = fl_utf8decode(from, end, NULL); + if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 1st regional indicator symbol can be a flag + u = fl_utf8decode(from + skip, end, NULL); + if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 2nd regional indicator symbol gives a flag + return from + 2 * skip; + } + } else if (u == 0x1F3F4) { // “waving black flag” may start subdivision flags (e.g. 🏴󠁧󠁢󠁷󠁬󠁳󠁿) + const char *next = from + skip; + do { + u = fl_utf8decode(next, end, NULL); + next += fl_utf8len1(*next); + if (u == 0xE007F) return next; // ends with "cancel tag" + } while (u >= 0xE0020 && u <= 0xE007E); // any series of "tag components" } - } else if (u == 0x1F3F4) { // “waving black flag” may start subdivision flags (e.g. 🏴󠁧󠁢󠁷󠁬󠁳󠁿) - const char *next = from + skip; - do { - u = fl_utf8decode(next, end, NULL); - next += fl_utf8len1(*next); - if (u == 0xE007F) return next; // ends with "cancel tag" - } while (u >= 0xE0020 && u <= 0xE007E); // any series of "tag components" } from += skip; // skip 1st codepoint while (from < end) { @@ -1674,6 +1678,8 @@ const char *fl_utf8_next_composed_char(const char *from, const char *end) { from += fl_utf8len(*from); // skip variation selector } else if (u >= 0x1F3FB && u <= 0x1F3FF) { // EMOJI MODIFIER FITZPATRICK from += fl_utf8len(*from); // skip modifier + } else if (u == 0x20E3) { // combining enclosing keycap (e.g., 9️⃣*️⃣#️⃣9︎⃣) + from += fl_utf8len(*from); // skip it } else break; } return from; @@ -1681,12 +1687,12 @@ const char *fl_utf8_next_composed_char(const char *from, const char *end) { /** - Returns pointer to beginning of character before given location in UTF8 string accounting for composed characters. - See fl_utf8_next_composed_char() for a hint about what is a composed unicode character. + Returns pointer to beginning of character before given location in UTF8 string accounting for emoji sequences. + See fl_utf8_next_composed_char() for a hint about what is an emoji sequence. \param from points to a location within a UTF8 string. If this location is inside the UTF8 encoding of a codepoint or is an invalid byte, this function returns \p from - 1. \param begin points to start of first codepoint of the string. - \return pointer to beginning of first possibly composed character before the codepoint that begins at \p from. + \return pointer to beginning of first character, possibly an emoji sequence, before the codepoint that begins at \p from. */ const char *fl_utf8_previous_composed_char(const char *from, const char *begin) { int l = fl_utf8len(*from); @@ -1715,6 +1721,8 @@ const char *fl_utf8_previous_composed_char(const char *from, const char *begin) from = fl_utf8back(from - 1, begin, NULL); } else if (u >= 0x1F3FB && u <= 0x1F3FF) { // EMOJI MODIFIER FITZPATRICK from = fl_utf8back(from - 1, begin, NULL); + } else if (u == 0x20E3) { // combining enclosing keycap + from = fl_utf8back(from - 1, begin, NULL); } else if (from > begin) { keep = fl_utf8back(from - 1, begin, NULL); u = fl_utf8decode(keep, from, NULL); -- cgit v1.2.3