From f67a97edfbaaf9b7328f9fcbab18588faca95bfe Mon Sep 17 00:00:00 2001
From: nick black <dankamongmen@gmail.com>
Date: Mon, 25 Nov 2019 21:11:27 -0500
Subject: [PATCH] correctly handle ASCII + combining chars, add unit tests #36

---
 include/notcurses.h |  4 ++++
 src/bin/demo.c      | 13 ++++++++++---
 src/lib/egcpool.h   | 26 ++++++++++++-------------
 src/lib/notcurses.c | 23 ++++++++++++++--------
 tests/egcpool.cpp   | 12 ++++++++----
 tests/ncplane.cpp   | 47 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 97 insertions(+), 28 deletions(-)

diff --git a/include/notcurses.h b/include/notcurses.h
index b73a577a9..20bf72f38 100644
--- a/include/notcurses.h
+++ b/include/notcurses.h
@@ -240,6 +240,10 @@ cell_init(cell* c){
 // Breaks the UTF-8 string in 'gcluster' down, setting up the cell 'c'.
 int cell_load(struct ncplane* n, cell* c, const char* gcluster);
 
+// Duplicate 'c' into 'targ'. Not intended for external use; exposed for the
+// benefit of unit tests.
+int cell_duplicate(struct ncplane* n, cell* targ, const cell* c);
+
 // Release resources held by the cell 'c'.
 void cell_release(struct ncplane* n, cell* c);
 
diff --git a/src/bin/demo.c b/src/bin/demo.c
index a925aadc1..79a6ad655 100644
--- a/src/bin/demo.c
+++ b/src/bin/demo.c
@@ -100,16 +100,23 @@ int main(int argc, char** argv){
     goto err;
   }
   sleep(1);
+  const char s1[] = " Die Welt ist alles, was der Fall ist. ";
   const char str[] = " Wovon man nicht sprechen kann, darüber muss man schweigen. ";
-  if(ncplane_cursor_move_yx(ncp, rows / 2, (cols - strlen(str) + 4) / 2)){
-    goto err;
-  }
   if(ncplane_fg_rgb8(ncp, 176, 121, 176)){
     goto err;
   }
   if(ncplane_bg_rgb8(ncp, 100, 100, 100)){
     goto err;
   }
+  if(ncplane_cursor_move_yx(ncp, rows / 2 - 1, (cols - strlen(s1) + 4) / 2)){
+    goto err;
+  }
+  if(ncplane_putstr(ncp, s1) != (int)strlen(s1)){
+    goto err;
+  }
+  if(ncplane_cursor_move_yx(ncp, rows / 2, (cols - strlen(str) + 4) / 2)){
+    goto err;
+  }
   if(ncplane_putstr(ncp, str) != (int)strlen(str)){
     goto err;
   }
diff --git a/src/lib/egcpool.h b/src/lib/egcpool.h
index 5c79c1add..35fbe8dbb 100644
--- a/src/lib/egcpool.h
+++ b/src/lib/egcpool.h
@@ -17,10 +17,10 @@ extern "C" {
 // recognizable as use for another cell.
 
 typedef struct egcpool {
-  char* pool;            // ringbuffer of attached extension storage
-  size_t poolsize;       // total number of bytes in pool
-  size_t poolused;       // bytes actively used, grow when this gets too large
-  size_t poolwrite;      // next place to *look for* a place to write
+  char* pool;         // ringbuffer of attached extension storage
+  int poolsize;       // total number of bytes in pool
+  int poolused;       // bytes actively used, grow when this gets too large
+  int poolwrite;      // next place to *look for* a place to write
 } egcpool;
 
 static inline void
@@ -37,8 +37,8 @@ int egcpool_grow(egcpool* pool, size_t len);
 // consumed, not including any NUL terminator. Note that neither the number
 // of bytes nor columns is necessarily equivalent to the number of decoded code
 // points. Such are the ways of Unicode.
-static inline size_t
-utf8_gce_len(const char* gcluster, int* colcount){
+static inline int
+utf8_egc_len(const char* gcluster, int* colcount){
   size_t ret = 0;
   *colcount = 0;
   wchar_t wc;
@@ -64,8 +64,8 @@ utf8_gce_len(const char* gcluster, int* colcount){
 
 // if we're inserting a EGC of |len| bytes, ought we proactively realloc?
 static inline bool
-egcpool_alloc_justified(const egcpool* pool, size_t len){
-  const size_t poolfree = pool->poolsize - pool->poolused;
+egcpool_alloc_justified(const egcpool* pool, int len){
+  const int poolfree = pool->poolsize - pool->poolused;
   // proactively get more space if we have less than 10% free. this doesn't
   // guarantee that we'll have enough space to insert the string -- we could
   // theoretically have every 10th byte free, and be unable to write even a
@@ -83,7 +83,7 @@ egcpool_alloc_justified(const egcpool* pool, size_t len){
 // columns is stored to '*cols'.
 static inline int
 egcpool_stash(egcpool* pool, const char* egc, size_t* ulen, int* cols){
-  size_t len = utf8_gce_len(egc, cols) + 1; // count the NUL terminator
+  int len = utf8_egc_len(egc, cols) + 1; // count the NUL terminator
   if(len <= 2){ // should never be empty, nor a single byte + NUL
     return -1;
   }
@@ -111,7 +111,7 @@ egcpool_stash(egcpool* pool, const char* egc, size_t* ulen, int* cols){
     // row. starting at pool->poolwrite, look for such a range of unused
     // memory. if we find it, write it out, and update used count. if we come
     // back to where we started, force a growth and try again.
-    size_t curpos = pool->poolwrite; 
+    int curpos = pool->poolwrite;
     do{
       if(curpos == pool->poolsize){
         curpos = 0;
@@ -124,7 +124,7 @@ egcpool_stash(egcpool* pool, const char* egc, size_t* ulen, int* cols){
         }
         curpos = 0; // can this skip pool->poolwrite?
       }else{ // promising! let's see if there's enough space
-        size_t need = len;
+        int need = len;
         size_t trial = curpos;
         while(--need){
           if(pool->pool[++trial]){ // alas, not enough space here
@@ -144,7 +144,7 @@ egcpool_stash(egcpool* pool, const char* egc, size_t* ulen, int* cols){
         }
         curpos += len - need;
       }
-    }while(curpos != pool->poolwrite); 
+    }while(curpos != pool->poolwrite);
   }while( (searched = !searched) );
   free(duplicated);
   return -1; // should never get here
@@ -154,7 +154,7 @@ egcpool_stash(egcpool* pool, const char* egc, size_t* ulen, int* cols){
 // we find a zero (our own NUL terminator). remove that number of bytes from
 // the usedcount.
 static inline void
-egcpool_release(egcpool* pool, size_t offset){
+egcpool_release(egcpool* pool, int offset){
   size_t freed = 1; // account for free(d) NUL terminator
   while(pool->pool[offset]){
     pool->pool[offset] = '\0';
diff --git a/src/lib/notcurses.c b/src/lib/notcurses.c
index fdd23f739..3d8a13aad 100644
--- a/src/lib/notcurses.c
+++ b/src/lib/notcurses.c
@@ -555,9 +555,15 @@ term_movyx(int y, int x){
 // is it a single ASCII byte, wholly contained within the cell?
 static inline bool
 simple_gcluster_p(const char* gcluster){
-  return *gcluster == '\0' ||
-         // FIXME need to ensure next character is not a nonspacer!
-         (*(unsigned char*)gcluster < 0x80);
+  if(*gcluster == '\0'){
+    return true;
+  }
+  if(*(unsigned char*)gcluster >= 0x80){
+    return false;
+  }
+  // we might be a simple ASCII, if the next character is *not* a nonspacing
+  // combining character
+  return false; // FIXME
 }
 
 static inline bool
@@ -677,8 +683,7 @@ ncplane_cursor_stuck(const ncplane* n){
   return (n->x == n->lenx && n->y == n->leny);
 }
 
-static int
-cell_duplicate(ncplane* n, cell* targ, const cell* c){
+int cell_duplicate(ncplane* n, cell* targ, const cell* c){
   cell_release(n, targ);
   targ->attrword = c->attrword;
   targ->channels = c->channels;
@@ -688,7 +693,7 @@ cell_duplicate(ncplane* n, cell* targ, const cell* c){
   }
   size_t ulen;
   int cols;
-  // FIXME insert colcount into cell...
+  // FIXME insert colcount into cell...if it's ever valid, anyway
   int eoffset = egcpool_stash(&n->pool, extended_gcluster(n, c), &ulen, &cols);
   if(eoffset < 0){
     return -1;
@@ -733,12 +738,14 @@ void cell_release(ncplane* n, cell* c){
 // bytes copied out of 'gcluster', or -1 on failure.
 int cell_load(ncplane* n, cell* c, const char* gcluster){
   cell_release(n, c);
-  if(simple_gcluster_p(gcluster)){
+  int bytes;
+  int cols;
+  if((bytes = utf8_egc_len(gcluster, &cols)) >= 0 && bytes <= 1){
     c->gcluster = *gcluster;
     return !!c->gcluster;
   }
   size_t ulen;
-  int cols;
+  // FIXME feed in already-calculated lengths from prior utf8_egc_len()!
   int eoffset = egcpool_stash(&n->pool, gcluster, &ulen, &cols);
   if(eoffset < 0){
     return -1;
diff --git a/tests/egcpool.cpp b/tests/egcpool.cpp
index 02b3e40c4..fe41d9024 100644
--- a/tests/egcpool.cpp
+++ b/tests/egcpool.cpp
@@ -26,7 +26,7 @@ TEST_F(EGCPoolTest, Initialized) {
 TEST_F(EGCPoolTest, UTF8EGC) {
   const char* wstr = "☢";
   int c;
-  auto ulen = utf8_gce_len(wstr, &c);
+  auto ulen = utf8_egc_len(wstr, &c);
   ASSERT_LT(0, ulen);
   EXPECT_LT(0, c);
 }
@@ -36,13 +36,17 @@ TEST_F(EGCPoolTest, UTF8EGC) {
 TEST_F(EGCPoolTest, UTF8EGCCombining) {
   const char* w1 = "à"; // U+00E0, U+0000         (c3 a0)
   const char* w2 = "à"; // U+0061, U+0300, U+0000 (61 cc 80)
-  int c1, c2;
-  auto u1 = utf8_gce_len(w1, &c1);
-  auto u2 = utf8_gce_len(w2, &c2);
+  const char* w3 = "a"; // U+0061, U+0000         (61)
+  int c1, c2, c3;
+  auto u1 = utf8_egc_len(w1, &c1);
+  auto u2 = utf8_egc_len(w2, &c2);
+  auto u3 = utf8_egc_len(w3, &c3);
   ASSERT_EQ(2, u1);
   ASSERT_EQ(3, u2);
+  ASSERT_EQ(1, u3);
   ASSERT_EQ(1, c1);
   ASSERT_EQ(1, c2);
+  ASSERT_EQ(1, c3);
 }
 
 TEST_F(EGCPoolTest, AddAndRemove) {
diff --git a/tests/ncplane.cpp b/tests/ncplane.cpp
index e632a75e8..a1624747a 100644
--- a/tests/ncplane.cpp
+++ b/tests/ncplane.cpp
@@ -209,3 +209,50 @@ TEST_F(NcplaneTest, PerimeterBox) {
 TEST_F(NcplaneTest, EraseScreen) {
   ncplane_erase(n_);
 }
+
+// we're gonna run both a composed latin a with grave, and then a latin a with
+// a combining nonspacing grave
+TEST_F(NcplaneTest, CellLoadCombining) {
+  const char* w1 = "à"; // U+00E0, U+0000         (c3 a0)
+  const char* w2 = "à"; // U+0061, U+0300, U+0000 (61 cc 80)
+  const char* w3 = "a"; // U+0061, U+0000         (61)
+  cell cell1 = CELL_TRIVIAL_INITIALIZER;
+  cell cell2 = CELL_TRIVIAL_INITIALIZER;
+  cell cell3 = CELL_TRIVIAL_INITIALIZER;
+  auto u1 = cell_load(n_, &cell1, w1);
+  auto u2 = cell_load(n_, &cell2, w2);
+  auto u3 = cell_load(n_, &cell3, w3);
+  ASSERT_EQ(2, u1);
+  ASSERT_EQ(3, u2);
+  ASSERT_EQ(1, u3);
+  cell_release(n_, &cell1);
+  cell_release(n_, &cell2);
+  cell_release(n_, &cell3);
+}
+
+TEST_F(NcplaneTest, CellDuplicateCombining) {
+  const char* w1 = "à"; // U+00E0, U+0000         (c3 a0)
+  const char* w2 = "à"; // U+0061, U+0300, U+0000 (61 cc 80)
+  const char* w3 = "a"; // U+0061, U+0000         (61)
+  cell cell1 = CELL_TRIVIAL_INITIALIZER;
+  cell cell2 = CELL_TRIVIAL_INITIALIZER;
+  cell cell3 = CELL_TRIVIAL_INITIALIZER;
+  auto u1 = cell_load(n_, &cell1, w1);
+  auto u2 = cell_load(n_, &cell2, w2);
+  auto u3 = cell_load(n_, &cell3, w3);
+  ASSERT_EQ(2, u1);
+  ASSERT_EQ(3, u2);
+  ASSERT_EQ(1, u3);
+  cell cell4 = CELL_TRIVIAL_INITIALIZER;
+  cell cell5 = CELL_TRIVIAL_INITIALIZER;
+  cell cell6 = CELL_TRIVIAL_INITIALIZER;
+  EXPECT_EQ(2, cell_duplicate(n_, &cell4, &cell1));
+  EXPECT_EQ(3, cell_duplicate(n_, &cell5, &cell2));
+  EXPECT_EQ(1, cell_duplicate(n_, &cell6, &cell3));
+  cell_release(n_, &cell1);
+  cell_release(n_, &cell2);
+  cell_release(n_, &cell3);
+  cell_release(n_, &cell4);
+  cell_release(n_, &cell5);
+  cell_release(n_, &cell6);
+}