From fba4d58509e337c3f868c7ae68b3566f85ba86f5 Mon Sep 17 00:00:00 2001 From: Albrecht Schlosser Date: Fri, 8 Jan 2021 14:56:31 +0100 Subject: Fix Fl_Help_View::find() (issue #179) Fix search (string comparison) which had a few different issues. Document the function, arguments, and details about string matching. To do: correctly match complex HTML entities like "&euro;" with Unicode code points >= U+0080 (UTF-8 multi byte encoding). --- src/Fl_Help_View.cxx | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) (limited to 'src/Fl_Help_View.cxx') diff --git a/src/Fl_Help_View.cxx b/src/Fl_Help_View.cxx index e27f81e37..5041d6f58 100644 --- a/src/Fl_Help_View.cxx +++ b/src/Fl_Help_View.cxx @@ -1177,10 +1177,27 @@ Fl_Help_View::draw() } // draw() - /** Finds the specified string \p s at starting position \p p. - \return the matching position or -1 if not found + The argument \p p and the return value are offsets in Fl_Help_View::value(), + counting from 0. If \p p is out of range, 0 is used. + + The string comparison is simple but honors some special cases: + - the specified string \p s must be in UTF-8 encoding + - HTML tags in value() are filtered (not compared as such, they never match) + - HTML entities like '\&lt;' or '\&#x20ac;' are converted to Unicode (UTF-8) + - ASCII characters (7-bit, \< 0x80) are compared case insensitive + - every newline (LF, '\\n') in value() is treated like a single space + - all other strings are compared as-is (byte by byte) + + \todo complex HTML entities for Unicode code points \> 0x80 are currently treated + like one byte (not character!) 
and do not (yet) match correctly ("&lt;" matches "<" + but "&euro;" doesn't match "€", and "&uuml;" doesn't match "ü") + + \param[in] s search string in UTF-8 encoding + \param[in] p starting position for search (0,...), Default = 0 + + \return the matching position or -1 if not found */ int // O - Matching position or -1 if not found Fl_Help_View::find(const char *s, // I - String to find @@ -1193,27 +1210,28 @@ Fl_Help_View::find(const char *s, // I - String to find *bs, // Start of current comparison *sp; // Search string pointer - DEBUG_FUNCTION(__LINE__,__FUNCTION__); // Range check input and value... if (!s || !value_) return -1; if (p < 0 || p >= (int)strlen(value_)) p = 0; - else if (p > 0) p ++; // Look for the string... - for (i = nblocks_, b = blocks_; i > 0; i --, b ++) { + for (i = nblocks_, b = blocks_; i > 0; i--, b++) { if (b->end < (value_ + p)) continue; if (b->start < (value_ + p)) bp = value_ + p; else bp = b->start; - for (sp = s, bs = bp; *sp && *bp && bp < b->end; bp ++) { + for (sp = s, bs = bp; *sp && *bp && bp < b->end; bp++) { if (*bp == '<') { // skip to end of element... - while (*bp && bp < b->end && *bp != '>') bp ++; + while (*bp && bp < b->end && *bp != '>') bp++; + // no match, so reset to start of search... + sp = s; + bs = bp + 1; continue; } else if (*bp == '&') { // decode HTML entity... @@ -1221,6 +1239,8 @@ Fl_Help_View::find(const char *s, // I - String to find else bp = strchr(bp + 1, ';') + 1; } else c = *bp; + if (c == '\n') c = ' '; // treat newline as a single space + // *FIXME* *UTF-8* (A.S. 02/14/2016) // At this point c may be an arbitrary Unicode Code Point corresponding // to a quoted character (see above), i.e. it _can_ be a multi byte @@ -1229,19 +1249,18 @@ Fl_Help_View::find(const char *s, // I - String to find // For instance: "&euro;" == 0x20ac -> 0xe2 0x82 0xac (UTF-8: 3 bytes). // Hint: use fl_utf8encode() [see below] - if (tolower(*sp) == tolower(c)) sp ++; - else { - // No match, so reset to start of search... 
+ if (c > 0x20 && c < 0x80 && tolower(*sp) == tolower(c)) sp++; + else if (*sp == c) sp++; + else { // No match, so reset to start of search... sp = s; - bs ++; bp = bs; + bs++; } } - if (!*sp) { - // Found a match! + if (!*sp) { // Found a match! topline(b->y - b->h); - return (int) (b->end - value_); + return int(bs - value_); } } -- cgit v1.2.3