From f67a97edfbaaf9b7328f9fcbab18588faca95bfe Mon Sep 17 00:00:00 2001 From: nick black Date: Mon, 25 Nov 2019 21:11:27 -0500 Subject: [PATCH] correctly handle ASCII + combining chars, add unit tests #36 --- include/notcurses.h | 4 ++++ src/bin/demo.c | 13 ++++++++++--- src/lib/egcpool.h | 26 ++++++++++++------------- src/lib/notcurses.c | 23 ++++++++++++++-------- tests/egcpool.cpp | 12 ++++++++---- tests/ncplane.cpp | 47 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 97 insertions(+), 28 deletions(-) diff --git a/include/notcurses.h b/include/notcurses.h index b73a577a9..20bf72f38 100644 --- a/include/notcurses.h +++ b/include/notcurses.h @@ -240,6 +240,10 @@ cell_init(cell* c){ // Breaks the UTF-8 string in 'gcluster' down, setting up the cell 'c'. int cell_load(struct ncplane* n, cell* c, const char* gcluster); +// Duplicate 'c' into 'targ'. Not intended for external use; exposed for the +// benefit of unit tests. +int cell_duplicate(struct ncplane* n, cell* targ, const cell* c); + // Release resources held by the cell 'c'. void cell_release(struct ncplane* n, cell* c); diff --git a/src/bin/demo.c b/src/bin/demo.c index a925aadc1..79a6ad655 100644 --- a/src/bin/demo.c +++ b/src/bin/demo.c @@ -100,16 +100,23 @@ int main(int argc, char** argv){ goto err; } sleep(1); + const char s1[] = " Die Welt ist alles, was der Fall ist. "; const char str[] = " Wovon man nicht sprechen kann, darüber muss man schweigen. "; - if(ncplane_cursor_move_yx(ncp, rows / 2, (cols - strlen(str) + 4) / 2)){ - goto err; - } if(ncplane_fg_rgb8(ncp, 176, 121, 176)){ goto err; } if(ncplane_bg_rgb8(ncp, 100, 100, 100)){ goto err; } + if(ncplane_cursor_move_yx(ncp, rows / 2 - 1, (cols - strlen(s1) + 4) / 2)){ + goto err; + } + if(ncplane_putstr(ncp, s1) != (int)strlen(s1)){ + goto err; + } + if(ncplane_cursor_move_yx(ncp, rows / 2, (cols - strlen(str) + 4) / 2)){ + goto err; + } if(ncplane_putstr(ncp, str) != (int)strlen(str)){ goto err; } diff --git a/src/lib/egcpool.h b/src/lib/egcpool.h index 5c79c1add..35fbe8dbb 100644 --- a/src/lib/egcpool.h +++ b/src/lib/egcpool.h @@ -17,10 +17,10 @@ extern "C" { // recognizable as use for another cell. typedef struct egcpool { - char* pool; // ringbuffer of attached extension storage - size_t poolsize; // total number of bytes in pool - size_t poolused; // bytes actively used, grow when this gets too large - size_t poolwrite; // next place to *look for* a place to write + char* pool; // ringbuffer of attached extension storage + int poolsize; // total number of bytes in pool + int poolused; // bytes actively used, grow when this gets too large + int poolwrite; // next place to *look for* a place to write } egcpool; static inline void @@ -37,8 +37,8 @@ int egcpool_grow(egcpool* pool, size_t len); // consumed, not including any NUL terminator. Note that neither the number // of bytes nor columns is necessarily equivalent to the number of decoded code // points. Such are the ways of Unicode. -static inline size_t -utf8_gce_len(const char* gcluster, int* colcount){ +static inline int +utf8_egc_len(const char* gcluster, int* colcount){ size_t ret = 0; *colcount = 0; wchar_t wc; @@ -64,8 +64,8 @@ utf8_gce_len(const char* gcluster, int* colcount){ // if we're inserting a EGC of |len| bytes, ought we proactively realloc? static inline bool -egcpool_alloc_justified(const egcpool* pool, size_t len){ - const size_t poolfree = pool->poolsize - pool->poolused; +egcpool_alloc_justified(const egcpool* pool, int len){ + const int poolfree = pool->poolsize - pool->poolused; // proactively get more space if we have less than 10% free. this doesn't // guarantee that we'll have enough space to insert the string -- we could // theoretically have every 10th byte free, and be unable to write even a @@ -83,7 +83,7 @@ egcpool_alloc_justified(const egcpool* pool, size_t len){ // columns is stored to '*cols'. static inline int egcpool_stash(egcpool* pool, const char* egc, size_t* ulen, int* cols){ - size_t len = utf8_gce_len(egc, cols) + 1; // count the NUL terminator + int len = utf8_egc_len(egc, cols) + 1; // count the NUL terminator if(len <= 2){ // should never be empty, nor a single byte + NUL return -1; } @@ -111,7 +111,7 @@ egcpool_stash(egcpool* pool, const char* egc, size_t* ulen, int* cols){ // row. starting at pool->poolwrite, look for such a range of unused // memory. if we find it, write it out, and update used count. if we come // back to where we started, force a growth and try again. - size_t curpos = pool->poolwrite; + int curpos = pool->poolwrite; do{ if(curpos == pool->poolsize){ curpos = 0; @@ -124,7 +124,7 @@ egcpool_stash(egcpool* pool, const char* egc, size_t* ulen, int* cols){ } curpos = 0; // can this skip pool->poolwrite? }else{ // promising! let's see if there's enough space - size_t need = len; + int need = len; size_t trial = curpos; while(--need){ if(pool->pool[++trial]){ // alas, not enough space here @@ -144,7 +144,7 @@ egcpool_stash(egcpool* pool, const char* egc, size_t* ulen, int* cols){ } curpos += len - need; } - }while(curpos != pool->poolwrite); + }while(curpos != pool->poolwrite); }while( (searched = !searched) ); free(duplicated); return -1; // should never get here @@ -154,7 +154,7 @@ egcpool_stash(egcpool* pool, const char* egc, size_t* ulen, int* cols){ // we find a zero (our own NUL terminator). remove that number of bytes from // the usedcount. static inline void -egcpool_release(egcpool* pool, size_t offset){ +egcpool_release(egcpool* pool, int offset){ size_t freed = 1; // account for free(d) NUL terminator while(pool->pool[offset]){ pool->pool[offset] = '\0'; diff --git a/src/lib/notcurses.c b/src/lib/notcurses.c index fdd23f739..3d8a13aad 100644 --- a/src/lib/notcurses.c +++ b/src/lib/notcurses.c @@ -555,9 +555,15 @@ term_movyx(int y, int x){ // is it a single ASCII byte, wholly contained within the cell? static inline bool simple_gcluster_p(const char* gcluster){ - return *gcluster == '\0' || - // FIXME need to ensure next character is not a nonspacer! - (*(unsigned char*)gcluster < 0x80); + if(*gcluster == '\0'){ + return true; + } + if(*(unsigned char*)gcluster >= 0x80){ + return false; + } + // we might be a simple ASCII, if the next character is *not* a nonspacing + // combining character + return false; // FIXME } static inline bool @@ -677,8 +683,7 @@ ncplane_cursor_stuck(const ncplane* n){ return (n->x == n->lenx && n->y == n->leny); } -static int -cell_duplicate(ncplane* n, cell* targ, const cell* c){ +int cell_duplicate(ncplane* n, cell* targ, const cell* c){ cell_release(n, targ); targ->attrword = c->attrword; targ->channels = c->channels; @@ -688,7 +693,7 @@ cell_duplicate(ncplane* n, cell* targ, const cell* c){ } size_t ulen; int cols; - // FIXME insert colcount into cell... + // FIXME insert colcount into cell...if it's ever valid, anyway int eoffset = egcpool_stash(&n->pool, extended_gcluster(n, c), &ulen, &cols); if(eoffset < 0){ return -1; @@ -733,12 +738,14 @@ void cell_release(ncplane* n, cell* c){ // bytes copied out of 'gcluster', or -1 on failure. int cell_load(ncplane* n, cell* c, const char* gcluster){ cell_release(n, c); - if(simple_gcluster_p(gcluster)){ + int bytes; + int cols; + if((bytes = utf8_egc_len(gcluster, &cols)) >= 0 && bytes <= 1){ c->gcluster = *gcluster; return !!c->gcluster; } size_t ulen; - int cols; + // FIXME feed in already-calculated lengths from prior utf8_egc_len()! int eoffset = egcpool_stash(&n->pool, gcluster, &ulen, &cols); if(eoffset < 0){ return -1; diff --git a/tests/egcpool.cpp b/tests/egcpool.cpp index 02b3e40c4..fe41d9024 100644 --- a/tests/egcpool.cpp +++ b/tests/egcpool.cpp @@ -26,7 +26,7 @@ TEST_F(EGCPoolTest, Initialized) { TEST_F(EGCPoolTest, UTF8EGC) { const char* wstr = "☢"; int c; - auto ulen = utf8_gce_len(wstr, &c); + auto ulen = utf8_egc_len(wstr, &c); ASSERT_LT(0, ulen); EXPECT_LT(0, c); } @@ -36,13 +36,17 @@ TEST_F(EGCPoolTest, UTF8EGC) { TEST_F(EGCPoolTest, UTF8EGCCombining) { const char* w1 = "à"; // U+00E0, U+0000 (c3 a0) const char* w2 = "à"; // U+0061, U+0300, U+0000 (61 cc 80) - int c1, c2; - auto u1 = utf8_gce_len(w1, &c1); - auto u2 = utf8_gce_len(w2, &c2); + const char* w3 = "a"; // U+0061, U+0000 (61) + int c1, c2, c3; + auto u1 = utf8_egc_len(w1, &c1); + auto u2 = utf8_egc_len(w2, &c2); + auto u3 = utf8_egc_len(w3, &c3); ASSERT_EQ(2, u1); ASSERT_EQ(3, u2); + ASSERT_EQ(1, u3); ASSERT_EQ(1, c1); ASSERT_EQ(1, c2); + ASSERT_EQ(1, c3); } TEST_F(EGCPoolTest, AddAndRemove) { diff --git a/tests/ncplane.cpp b/tests/ncplane.cpp index e632a75e8..a1624747a 100644 --- a/tests/ncplane.cpp +++ b/tests/ncplane.cpp @@ -209,3 +209,50 @@ TEST_F(NcplaneTest, PerimeterBox) { TEST_F(NcplaneTest, EraseScreen) { ncplane_erase(n_); } + +// we're gonna run both a composed latin a with grave, and then a latin a with +// a combining nonspacing grave +TEST_F(NcplaneTest, CellLoadCombining) { + const char* w1 = "à"; // U+00E0, U+0000 (c3 a0) + const char* w2 = "à"; // U+0061, U+0300, U+0000 (61 cc 80) + const char* w3 = "a"; // U+0061, U+0000 (61) + cell cell1 = CELL_TRIVIAL_INITIALIZER; + cell cell2 = CELL_TRIVIAL_INITIALIZER; + cell cell3 = CELL_TRIVIAL_INITIALIZER; + auto u1 = cell_load(n_, &cell1, w1); + auto u2 = cell_load(n_, &cell2, w2); + auto u3 = cell_load(n_, &cell3, w3); + ASSERT_EQ(2, u1); + ASSERT_EQ(3, u2); + ASSERT_EQ(1, u3); + cell_release(n_, &cell1); + cell_release(n_, &cell2); + cell_release(n_, &cell3); +} + +TEST_F(NcplaneTest, CellDuplicateCombining) { + const char* w1 = "à"; // U+00E0, U+0000 (c3 a0) + const char* w2 = "à"; // U+0061, U+0300, U+0000 (61 cc 80) + const char* w3 = "a"; // U+0061, U+0000 (61) + cell cell1 = CELL_TRIVIAL_INITIALIZER; + cell cell2 = CELL_TRIVIAL_INITIALIZER; + cell cell3 = CELL_TRIVIAL_INITIALIZER; + auto u1 = cell_load(n_, &cell1, w1); + auto u2 = cell_load(n_, &cell2, w2); + auto u3 = cell_load(n_, &cell3, w3); + ASSERT_EQ(2, u1); + ASSERT_EQ(3, u2); + ASSERT_EQ(1, u3); + cell cell4 = CELL_TRIVIAL_INITIALIZER; + cell cell5 = CELL_TRIVIAL_INITIALIZER; + cell cell6 = CELL_TRIVIAL_INITIALIZER; + EXPECT_EQ(2, cell_duplicate(n_, &cell4, &cell1)); + EXPECT_EQ(3, cell_duplicate(n_, &cell5, &cell2)); + EXPECT_EQ(1, cell_duplicate(n_, &cell6, &cell3)); + cell_release(n_, &cell1); + cell_release(n_, &cell2); + cell_release(n_, &cell3); + cell_release(n_, &cell4); + cell_release(n_, &cell5); + cell_release(n_, &cell6); +}