api: add argument "length" in function utf8_is_valid()

2015-08-18 07:36:48 +02:00 · 2015-08-18 07:36:48 +02:00 · 46a9d17ac3
commit 46a9d17ac3
parent fd1886e883
13 changed files with 188 additions and 46 deletions
--- a/ChangeLog.asciidoc
+++ b/ChangeLog.asciidoc
@ -15,6 +15,14 @@ https://weechat.org/files/releasenotes/ReleaseNotes-devel.html[release notes]
 (file 'ReleaseNotes.asciidoc' in sources).


+== Version 1.4 (under dev)
+
+=== New features
+
+* api: add argument "length" in function utf8_is_valid()
+
+=== Bugs fixed
+
 == Version 1.3 (2015-08-16)

 === New features
--- a/doc/en/weechat_plugin_api.en.asciidoc
+++ b/doc/en/weechat_plugin_api.en.asciidoc
@ -2151,18 +2151,22 @@ This function is not available in scripting API.

 ==== utf8_is_valid

+_Updated in 1.4._
+
 Check if a string is UTF-8 valid.

 Prototype:

 [source,C]
 ----
-int weechat_utf8_is_valid (const char *string, char **error);
+int weechat_utf8_is_valid (const char *string, int length, char **error);
 ----

 Arguments:

 * 'string': string
+* 'length': max number of UTF-8 chars to check; if ≤ 0, the whole string is
+  checked _(WeeChat ≥ 1.4)_
 * 'error': if not NULL, '*error*' is set with pointer to first non valid UTF-8
  char in string, if any

@ -2175,7 +2179,7 @@ C example:
 [source,C]
 ----
 char *error;
-if (weechat_utf8_is_valid (string, &error))
+if (weechat_utf8_is_valid (string, -1, &error))
 {
    /* ... */
 }
--- a/doc/fr/weechat_plugin_api.fr.asciidoc
+++ b/doc/fr/weechat_plugin_api.fr.asciidoc
@ -2193,18 +2193,22 @@ Cette fonction n'est pas disponible dans l'API script.

 ==== utf8_is_valid

+_Mis à jour dans la 1.4._
+
 Vérifier si une chaîne est valide UTF-8.

 Prototype :

 [source,C]
 ----
-int weechat_utf8_is_valid (const char *string, char **error);
+int weechat_utf8_is_valid (const char *string, int length, char **error);
 ----

 Paramètres :

 * 'string' : chaîne
+* 'length' : nombre maximum de caractères UTF-8 à vérifier ; si ≤ 0, la chaîne
+  complète est vérifiée _(WeeChat ≥ 1.4)_
 * 'error' : si non NULL, '*error' est alimenté avec le pointeur vers le premier
  caractère non valide dans la chaîne, s'il y en a

@ -2217,7 +2221,7 @@ Exemple en C :
 [source,C]
 ----
 char *error;
-if (weechat_utf8_is_valid (string, &error))
+if (weechat_utf8_is_valid (string, -1, &error))
 {
    /* ... */
 }
--- a/doc/it/weechat_plugin_api.it.asciidoc
+++ b/doc/it/weechat_plugin_api.it.asciidoc
@ -2226,18 +2226,24 @@ Questa funzione non è disponibile nelle API per lo scripting.

 ==== utf8_is_valid

+// TRANSLATION MISSING
+_Updated in 1.4._
+
 Verifica che una stringa sia valida in UTF-8.

 Prototipo:

 [source,C]
 ----
-int weechat_utf8_is_valid (const char *string, char **error);
+int weechat_utf8_is_valid (const char *string, int length, char **error);
 ----

 Argomenti:

 * 'string': stringa
+// TRANSLATION MISSING
+* 'length': max number of UTF-8 chars to check; if ≤ 0, the whole string is
+  checked _(WeeChat ≥ 1.4)_
 * 'error': se non NULL, '*error*' è impostato con il puntatore al primo
  carattere UTF-8 non valido nella stringa, se esiste

@ -2250,7 +2256,7 @@ Esempio in C:
 [source,C]
 ----
 char *error;
-if (weechat_utf8_is_valid (string, &error))
+if (weechat_utf8_is_valid (string, -1, &error))
 {
    /* ... */
 }
--- a/doc/ja/weechat_plugin_api.ja.asciidoc
+++ b/doc/ja/weechat_plugin_api.ja.asciidoc
@ -2152,18 +2152,23 @@ if (weechat_utf8_has_8bits (string))

 ==== utf8_is_valid

+_バージョン 1.4 で更新。_
+
 文字列が妥当な UTF-8 表現か確認。

 プロトタイプ:

 [source,C]
 ----
-int weechat_utf8_is_valid (const char *string, char **error);
+int weechat_utf8_is_valid (const char *string, int length, char **error);
 ----

 引数:

 * 'string': 文字列
+// TRANSLATION MISSING
+* 'length': max number of UTF-8 chars to check; if ≤ 0, the whole string is
+  checked _(WeeChat ≥ 1.4)_
 * 'error': NULL でない場合は '*error*'
  は文字列に含まれる最初の妥当でない UTF-8 文字へのポインタ

@ -2176,7 +2181,7 @@ C 言語での使用例:
 [source,C]
 ----
 char *error;
-if (weechat_utf8_is_valid (string, &error))
+if (weechat_utf8_is_valid (string, -1, &error))
 {
    /* ... */
 }
--- a/src/core/wee-string.c
+++ b/src/core/wee-string.c
@ -2307,7 +2307,7 @@ string_iconv_to_internal (const char *charset, const char *string)
    if (local_utf8 && (!charset || !charset[0]))
        return input;

-    if (utf8_has_8bits (input) && utf8_is_valid (input, NULL))
+    if (utf8_has_8bits (input) && utf8_is_valid (input, -1, NULL))
        return input;

    output = string_iconv (0,
--- a/src/core/wee-utf8.c
+++ b/src/core/wee-utf8.c
@ -70,18 +70,24 @@ utf8_has_8bits (const char *string)
 /*
 * Checks if a string is UTF-8 valid.
 *
+ * If length is <= 0, checks whole string.
+ * If length is > 0, checks only this number of chars (not bytes).
+ *
 * Returns:
 *   1: string is UTF-8 valid
- *   0: string it not UTF-8 valid, and then if error is not NULL, it is set with
- *      first non valid UTF-8 char in string
+ *   0: string is not UTF-8 valid, and then if error is not NULL, it is set
+ *      with first non valid UTF-8 char in string
 */

 int
-utf8_is_valid (const char *string, char **error)
+utf8_is_valid (const char *string, int length, char **error)
 {
-    int code_point;
+    int code_point, current_char;

-    while (string && string[0])
+    current_char = 0;
+
+    while (string && string[0]
+           && ((length <= 0) || (current_char < length)))
    {
        /*
         * UTF-8, 2 bytes, should be: 110vvvvv 10vvvvvv
@ -142,6 +148,7 @@ utf8_is_valid (const char *string, char **error)
            goto invalid;
        else
            string++;
+        current_char++;
    }
    if (error)
        *error = NULL;
@ -165,7 +172,7 @@ utf8_normalize (char *string, char replacement)

    while (string && string[0])
    {
-        if (utf8_is_valid (string, &error))
+        if (utf8_is_valid (string, -1, &error))
            return;
        error[0] = replacement;
        string = error + 1;
--- a/src/core/wee-utf8.h
+++ b/src/core/wee-utf8.h
@ -30,7 +30,7 @@ extern int local_utf8;

 extern void utf8_init ();
 extern int utf8_has_8bits (const char *string);
-extern int utf8_is_valid (const char *string, char **error);
+extern int utf8_is_valid (const char *string, int length, char **error);
 extern void utf8_normalize (char *string, char replacement);
 extern const char *utf8_prev_char (const char *string_start,
                                   const char *string);
--- a/src/gui/curses/gui-curses-key.c
+++ b/src/gui/curses/gui-curses-key.c
@ -378,7 +378,7 @@ gui_key_flush (int paste)
                ptr_char = key_str;
                while (ptr_char && ptr_char[0])
                {
-                    (void) utf8_is_valid (ptr_char, &ptr_error);
+                    (void) utf8_is_valid (ptr_char, -1, &ptr_error);
                    if (!ptr_error)
                        break;
                    next_char = (char *)utf8_next_char (ptr_error);
--- a/src/gui/curses/gui-curses-mouse.c
+++ b/src/gui/curses/gui-curses-mouse.c
@ -265,7 +265,7 @@ gui_mouse_event_code2key (const char *code)
     * mouse code must have at least:
     *   one code (for event) + X + Y == 3 bytes or 3 UTF-8 chars
     */
-    code_utf8 = utf8_is_valid (code, NULL);
+    code_utf8 = utf8_is_valid (code, -1, NULL);
    length = (code_utf8) ? utf8_strlen (code) : (int)strlen (code);
    if (length < 3)
        return NULL;
--- a/src/gui/gui-key.c
+++ b/src/gui/gui-key.c
@ -214,7 +214,7 @@ gui_key_grab_end_timer_cb (void *data, int remaining_calls)
         * but some mouse codes can return ISO chars (for coordinates),
         * then we will convert them to UTF-8 string
         */
-        if (!utf8_is_valid (expanded_key, NULL))
+        if (!utf8_is_valid (expanded_key, -1, NULL))
        {
            expanded_key2 = string_iconv_to_internal ("iso-8859-1",
                                                      expanded_key);
--- a/src/plugins/weechat-plugin.h
+++ b/src/plugins/weechat-plugin.h
@ -57,7 +57,7 @@ struct timeval;
 * please change the date with current one; for a second change at same
 * date, increment the 01, otherwise please keep 01.
 */
-#define WEECHAT_PLUGIN_API_VERSION "20150704-02"
+#define WEECHAT_PLUGIN_API_VERSION "20150818-01"

 /* macros for defining plugin infos */
 #define WEECHAT_PLUGIN_NAME(__name)                                     \
@ -317,7 +317,7 @@ struct t_weechat_plugin

    /* UTF-8 strings */
    int (*utf8_has_8bits) (const char *string);
-    int (*utf8_is_valid) (const char *string, char **error);
+    int (*utf8_is_valid) (const char *string, int length, char **error);
    void (*utf8_normalize) (char *string, char replacement);
    const char *(*utf8_prev_char) (const char *string_start,
                                   const char *string);
@ -1110,8 +1110,8 @@ extern int weechat_plugin_end (struct t_weechat_plugin *plugin);
 /* UTF-8 strings */
 #define weechat_utf8_has_8bits(__string)                                \
    (weechat_plugin->utf8_has_8bits)(__string)
-#define weechat_utf8_is_valid(__string, __error)                        \
-    (weechat_plugin->utf8_is_valid)(__string, __error)
+#define weechat_utf8_is_valid(__string, __length, __error)              \
+    (weechat_plugin->utf8_is_valid)(__string, __length, __error)
 #define weechat_utf8_normalize(__string, __char)                        \
    (weechat_plugin->utf8_normalize)(__string, __char)
 #define weechat_utf8_prev_char(__start, __string)                       \
--- a/tests/unit/core/test-utf8.cpp
+++ b/tests/unit/core/test-utf8.cpp
@ -59,38 +59,146 @@ TEST(Utf8, Validity)
    LONGS_EQUAL(1, utf8_has_8bits ("no\xc3\xabl"));

    /* check validity */
-    LONGS_EQUAL(1, utf8_is_valid (NULL, NULL));
-    LONGS_EQUAL(1, utf8_is_valid (NULL, &error));
-    LONGS_EQUAL(1, utf8_is_valid ("", NULL));
-    LONGS_EQUAL(1, utf8_is_valid ("", &error));
-    LONGS_EQUAL(1, utf8_is_valid ("abc", &error));
+    LONGS_EQUAL(1, utf8_is_valid (NULL, -1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid (NULL, 0, NULL));
+    LONGS_EQUAL(1, utf8_is_valid (NULL, 1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid (NULL, -1, &error));
+    LONGS_EQUAL(1, utf8_is_valid (NULL, 0, &error));
+    LONGS_EQUAL(1, utf8_is_valid (NULL, 1, &error));
+    LONGS_EQUAL(1, utf8_is_valid ("", -1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("", 0, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("", 1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("", -1, &error));
+    LONGS_EQUAL(1, utf8_is_valid ("", 0, &error));
+    LONGS_EQUAL(1, utf8_is_valid ("", 1, &error));
+    LONGS_EQUAL(1, utf8_is_valid ("abc", -1, &error));
    POINTERS_EQUAL(NULL, error);
-    LONGS_EQUAL(1, utf8_is_valid (noel_valid, &error));
+    LONGS_EQUAL(1, utf8_is_valid ("abc", 0, &error));
    POINTERS_EQUAL(NULL, error);
-    LONGS_EQUAL(0, utf8_is_valid (noel_invalid, &error));
+    LONGS_EQUAL(1, utf8_is_valid ("abc", 1, &error));
+    POINTERS_EQUAL(NULL, error);
+    LONGS_EQUAL(1, utf8_is_valid (noel_valid, -1, &error));
+    POINTERS_EQUAL(NULL, error);
+    LONGS_EQUAL(1, utf8_is_valid (noel_valid, 0, &error));
+    POINTERS_EQUAL(NULL, error);
+    LONGS_EQUAL(1, utf8_is_valid (noel_valid, 1, &error));
+    POINTERS_EQUAL(NULL, error);
+    LONGS_EQUAL(0, utf8_is_valid (noel_invalid, -1, &error));
+    POINTERS_EQUAL(noel_invalid + 2, error);
+    LONGS_EQUAL(0, utf8_is_valid (noel_invalid, 0, &error));
+    POINTERS_EQUAL(noel_invalid + 2, error);
+    LONGS_EQUAL(1, utf8_is_valid (noel_invalid, 1, &error));
+    POINTERS_EQUAL(NULL, error);
+    LONGS_EQUAL(1, utf8_is_valid (noel_invalid, 2, &error));
+    POINTERS_EQUAL(NULL, error);
+    LONGS_EQUAL(0, utf8_is_valid (noel_invalid, 3, &error));
+    POINTERS_EQUAL(noel_invalid + 2, error);
+    LONGS_EQUAL(0, utf8_is_valid (noel_invalid, 4, &error));
+    POINTERS_EQUAL(noel_invalid + 2, error);
+    LONGS_EQUAL(0, utf8_is_valid (noel_invalid, 5, &error));
    POINTERS_EQUAL(noel_invalid + 2, error);

    /* 2 bytes: code point must be in range U+0080-07FF */
-    LONGS_EQUAL(0, utf8_is_valid ("\xc0\x80", NULL));  /* U+0   */
-    LONGS_EQUAL(0, utf8_is_valid ("\xc1\xbf", NULL));  /* U+7F  */
-    LONGS_EQUAL(1, utf8_is_valid ("\xc2\x80", NULL));  /* U+80  */
-    LONGS_EQUAL(1, utf8_is_valid ("\xdf\xbf", NULL));  /* U+7FF */
+
+    /* U+0 */
+    LONGS_EQUAL(0, utf8_is_valid ("\xc0\x80", -1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xc0\x80", 0, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xc0\x80", 1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xc0\x80", 2, NULL));
+
+    /* U+7F */
+    LONGS_EQUAL(0, utf8_is_valid ("\xc1\xbf", -1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xc1\xbf", 0, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xc1\xbf", 1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xc1\xbf", 2, NULL));
+
+    /* U+80 */
+    LONGS_EQUAL(1, utf8_is_valid ("\xc2\x80", -1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xc2\x80", 0, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xc2\x80", 1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xc2\x80", 2, NULL));
+
+    /* U+7FF */
+    LONGS_EQUAL(1, utf8_is_valid ("\xdf\xbf", -1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xdf\xbf", 0, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xdf\xbf", 1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xdf\xbf", 2, NULL));

    /* 3 bytes: code point must be in range: U+0800-FFFF */
-    LONGS_EQUAL(0, utf8_is_valid ("\xe0\x80\x80", NULL));  /* U+0    */
-    LONGS_EQUAL(0, utf8_is_valid ("\xe0\x9f\xbf", NULL));  /* U+7FF  */
-    LONGS_EQUAL(0, utf8_is_valid ("\xed\xa0\x80", NULL));  /* U+D800 */
-    LONGS_EQUAL(0, utf8_is_valid ("\xed\xbf\xbf", NULL));  /* U+DFFF */
-    LONGS_EQUAL(1, utf8_is_valid ("\xe0\xa0\x80", NULL));  /* U+800  */
-    LONGS_EQUAL(1, utf8_is_valid ("\xed\x9f\xbf", NULL));  /* U+D7FF */
-    LONGS_EQUAL(1, utf8_is_valid ("\xe7\x80\x80", NULL));  /* U+E000 */
-    LONGS_EQUAL(1, utf8_is_valid ("\xef\xbf\xbf", NULL));  /* U+FFFF */
+
+    /* U+0 */
+    LONGS_EQUAL(0, utf8_is_valid ("\xe0\x80\x80", -1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xe0\x80\x80", 0, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xe0\x80\x80", 1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xe0\x80\x80", 2, NULL));
+
+    /* U+7FF  */
+    LONGS_EQUAL(0, utf8_is_valid ("\xe0\x9f\xbf", -1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xe0\x9f\xbf", 0, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xe0\x9f\xbf", 1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xe0\x9f\xbf", 2, NULL));
+
+    /* U+D800 */
+    LONGS_EQUAL(0, utf8_is_valid ("\xed\xa0\x80", -1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xed\xa0\x80", 0, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xed\xa0\x80", 1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xed\xa0\x80", 2, NULL));
+
+    /* U+DFFF */
+    LONGS_EQUAL(0, utf8_is_valid ("\xed\xbf\xbf", -1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xed\xbf\xbf", 0, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xed\xbf\xbf", 1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xed\xbf\xbf", 2, NULL));
+
+    /* U+800  */
+    LONGS_EQUAL(1, utf8_is_valid ("\xe0\xa0\x80", -1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xe0\xa0\x80", 0, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xe0\xa0\x80", 1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xe0\xa0\x80", 2, NULL));
+
+    /* U+D7FF */
+    LONGS_EQUAL(1, utf8_is_valid ("\xed\x9f\xbf", -1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xed\x9f\xbf", 0, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xed\x9f\xbf", 1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xed\x9f\xbf", 2, NULL));
+
+    /* U+7000 (U+E000 would be "\xee\x80\x80") */
+    LONGS_EQUAL(1, utf8_is_valid ("\xe7\x80\x80", -1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xe7\x80\x80", 0, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xe7\x80\x80", 1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xe7\x80\x80", 2, NULL));
+
+    /* U+FFFF */
+    LONGS_EQUAL(1, utf8_is_valid ("\xef\xbf\xbf", -1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xef\xbf\xbf", 0, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xef\xbf\xbf", 1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xef\xbf\xbf", 2, NULL));

    /* 4 bytes: code point must be in range: U+10000-1FFFFF */
-    LONGS_EQUAL(0, utf8_is_valid ("\xf0\x80\x80\x80", NULL));  /* U+0      */
-    LONGS_EQUAL(0, utf8_is_valid ("\xf0\x8f\xbf\xbf", NULL));  /* U+FFFF   */
-    LONGS_EQUAL(1, utf8_is_valid ("\xf0\x90\x80\x80", NULL));  /* U+10000  */
-    LONGS_EQUAL(1, utf8_is_valid ("\xf7\xbf\xbf\xbf", NULL));  /* U+1FFFFF */
+
+    /* U+0 */
+    LONGS_EQUAL(0, utf8_is_valid ("\xf0\x80\x80\x80", -1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xf0\x80\x80\x80", 0, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xf0\x80\x80\x80", 1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xf0\x80\x80\x80", 2, NULL));
+
+    /* U+FFFF   */
+    LONGS_EQUAL(0, utf8_is_valid ("\xf0\x8f\xbf\xbf", -1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xf0\x8f\xbf\xbf", 0, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xf0\x8f\xbf\xbf", 1, NULL));
+    LONGS_EQUAL(0, utf8_is_valid ("\xf0\x8f\xbf\xbf", 2, NULL));
+
+    /* U+10000  */
+    LONGS_EQUAL(1, utf8_is_valid ("\xf0\x90\x80\x80", -1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xf0\x90\x80\x80", 0, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xf0\x90\x80\x80", 1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xf0\x90\x80\x80", 2, NULL));
+
+    /* U+1FFFFF */
+    LONGS_EQUAL(1, utf8_is_valid ("\xf7\xbf\xbf\xbf", -1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xf7\xbf\xbf\xbf", 0, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xf7\xbf\xbf\xbf", 1, NULL));
+    LONGS_EQUAL(1, utf8_is_valid ("\xf7\xbf\xbf\xbf", 2, NULL));
 }

 /*