summaryrefslogtreecommitdiff
path: root/src/fl_utf8.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'src/fl_utf8.cxx')
-rw-r--r--src/fl_utf8.cxx60
1 files changed, 60 insertions, 0 deletions
diff --git a/src/fl_utf8.cxx b/src/fl_utf8.cxx
index 6a69e0780..df4c6d423 100644
--- a/src/fl_utf8.cxx
+++ b/src/fl_utf8.cxx
@@ -1629,4 +1629,64 @@ unsigned fl_utf8from_mb(char* dst, unsigned dstlen, const char* src, unsigned sr
return Fl::system_driver()->utf8from_mb(dst, dstlen, src, srclen);
}
+
+/**
+ Returns pointer to beginning of next Unicode character after potentially composed character.
+ Some Unicode characters (example: 👩‍✈️ "woman pilot") are composed of several Unicode codepoints. They may pair two successive
+ codepoints with U+200D (zero-width joiner) and may qualify any component with variation selectors or Fitzpatrick emoji modifiers.
+ \param from points to a location within a UTF8 string. If this location is inside the UTF8
+ encoding of a codepoint or is an invalid byte, this function returns \p from + 1.
+ \param end points past last codepoint of the string.
+ \return pointer to start of first codepoint after potentially composed character beginning at \p from.
+ When \p from is before \p end, the result is never beyond \p end: a codepoint cut short by the end of
+ the string, or a zero-width joiner that is the last codepoint of the string, makes the function return \p end.
+ An invalid byte following a zero-width joiner is stepped over as a single byte.
+ */
+const char *fl_utf8_next_composed_char(const char *from, const char *end) {
+  int skip = fl_utf8len(*from);
+  if (skip == -1) return from + 1;
+  if (from < end && skip > end - from) return end; // 1st codepoint is cut short by end of string
+  from += skip; // skip 1st codepoint
+  while (from < end) {
+    unsigned u = fl_utf8decode(from, end, NULL);
+    if (u == 0x200D) { // zero-width joiner
+      from += fl_utf8len(*from); // skip joiner
+      if (from >= end) break; // joiner ends the string: there is nothing left to join
+      skip = fl_utf8len(*from);
+      if (skip == -1) skip = 1; // invalid byte: step over it alone, never move backwards
+      if (skip > end - from) skip = (int)(end - from); // joined codepoint is cut short by end of string
+      from += skip; // skip joined codepoint
+    } else if ((u >= 0xFE00 && u <= 0xFE0F) ||     // a variation selector
+               (u >= 0x1F3FB && u <= 0x1F3FF)) {   // EMOJI MODIFIER FITZPATRICK
+      // the codepoint was successfully decoded inside [from, end), so its lead byte gives a valid length
+      from += fl_utf8len(*from); // skip selector or modifier
+    } else break;
+  }
+  return from;
+}
+
+
+/**
+ Returns pointer to beginning of previous potentially composed character before given Unicode character.
+ A composed character is a sequence of codepoints that may be paired by U+200D (zero-width joiner) and
+ qualified by variation selectors (U+FE00...U+FE0F) or Fitzpatrick emoji modifiers (U+1F3FB...U+1F3FF).
+ \param from points to a location within a UTF8 string. If this location is inside the UTF8
+ encoding of a codepoint or is an invalid byte, this function returns \p from - 1.
+ The same value, \p from - 1, is returned when \p from is not after \p begin.
+ \param begin points to start of first codepoint of the string.
+ \return pointer to start of first potentially composed character before the codepoint beginning at \p from.
+ When \p from is after \p begin and starts a codepoint, the result is never before \p begin, and no byte
+ before \p begin is read, even if the string starts with a variation selector, an emoji modifier
+ or a zero-width joiner.
+ */
+const char *fl_utf8_previous_composed_char(const char *from, const char *begin) {
+  if (from <= begin || fl_utf8len(*from) == -1) return from - 1;
+  const char *keep = from; // upper bound used when decoding the codepoint at 'from'
+  from = fl_utf8back(from - 1, begin, NULL); // start of codepoint preceding the given location
+  while (from >= begin) {
+    unsigned u = fl_utf8decode(from, keep, NULL);
+    if ((u >= 0xFE00 && u <= 0xFE0F) ||     // a variation selector
+        (u >= 0x1F3FB && u <= 0x1F3FF)) {   // EMOJI MODIFIER FITZPATRICK
+      if (from == begin) break; // string starts with a qualifier: nothing before it to qualify
+      from = fl_utf8back(from - 1, begin, NULL); // step back to the qualified codepoint
+    } else if (from > begin) {
+      // look at the codepoint just before 'from': a joiner there glues 'from' to what precedes it
+      keep = fl_utf8back(from - 1, begin, NULL);
+      u = fl_utf8decode(keep, from, NULL);
+      if (u == 0x200D && keep > begin) { // zero-width joiner with something to join on its left
+        from = fl_utf8back(keep - 1, begin, NULL);
+        continue;
+      }
+      // no joiner, or a joiner that is the very first codepoint of the string
+      return from;
+    } else break;
+  }
+  return from;
+}
+
/** @} */