From 98c2b6490259c2845e66931e066acdfbce12ef83 Mon Sep 17 00:00:00 2001
From: joshhwuu <joshuawu2004@gmail.com>
Date: Thu, 4 Apr 2024 02:49:36 -0700
Subject: [PATCH 1/7] changed char.trunc to better handle full-width and
 combining characters

---
 R/print.data.table.R | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/R/print.data.table.R b/R/print.data.table.R
index 6588ca458f..0acd666df9 100644
--- a/R/print.data.table.R
+++ b/R/print.data.table.R
@@ -232,8 +232,11 @@ format_list_item.default = function(x, ...) {
 char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) {
   trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE)
   if (!is.character(x) || trunc.char <= 0L) return(x)
-  idx = which(nchar(x) > trunc.char)
-  x[idx] = paste0(substr(x[idx], 1L, as.integer(trunc.char)), "...")
+  nchar_width = nchar(x, 'width') # Test whether string is full-width or half-width, #5096 
+  nchar_chars = nchar(x, 'char')
+  is_full_width = nchar_width > nchar_chars
+  idx = pmin(nchar_width, nchar_chars) > trunc.char
+  x[idx] = paste0(strtrim(x[idx], trunc.char * fifelse(is_full_width[idx], 2L, 1L)), "...")
   x
 }
 

From 725c03700994a8475d5054024e258186855a7200 Mon Sep 17 00:00:00 2001
From: joshhwuu <joshuawu2004@gmail.com>
Date: Thu, 4 Apr 2024 03:32:57 -0700
Subject: [PATCH 2/7] Tests added, make pretty later

---
 R/print.data.table.R  |  2 +-
 inst/tests/tests.Rraw | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/R/print.data.table.R b/R/print.data.table.R
index 0acd666df9..7a76155206 100644
--- a/R/print.data.table.R
+++ b/R/print.data.table.R
@@ -232,7 +232,7 @@ format_list_item.default = function(x, ...) {
 char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) {
   trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE)
   if (!is.character(x) || trunc.char <= 0L) return(x)
-  nchar_width = nchar(x, 'width') # Test whether string is full-width or half-width, #5096 
+  nchar_width = nchar(x, 'width') # Check whether string is full-width or half-width, #5096 
   nchar_chars = nchar(x, 'char')
   is_full_width = nchar_width > nchar_chars
   idx = pmin(nchar_width, nchar_chars) > trunc.char
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 287d36713f..11d33dd97f 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18439,3 +18439,38 @@ dt = data.table(a = 1L)
 test(2252.1, dt[, b:=2L], error = "\\[ was called on a data.table.*not data.table-aware.*':='")
 test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data.table-aware.*'let'")
 rm(.datatable.aware)
+
+# tests for char.trunc handling of wide (full-width) and combining characters, #5096
+accented_a = "\u0061\u0301"
+ja_ichi = "\u4E00"
+ja_ni = "\u4E8C"
+ja_ko = "\u3053"
+ja_n = "\u3093"
+dots = "..."
+clean_regex = "^1:\\s+" # cleans "1:    " from beginning of output
+# Tests for combining character latin a and acute accent, single row
+DT = data.table(strrep(accented_a, 4L))
+test(2253.01, gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 4L), options=list(datatable.prettyprint.char = 4L))
+test(2253.02, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(accented_a, 3L), dots), options=list(datatable.prettyprint.char = 3L))
+test(2253.03, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(accented_a, 1L), dots), options=list(datatable.prettyprint.char = 1L))
+# Tests for full-width japanese character ichi, single row
+DT = data.table(strrep(ja_ichi, 4L))
+test(2253.04, gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 4L), options=list(datatable.prettyprint.char = 4L))
+test(2253.05, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(ja_ichi, 3L), dots), options=list(datatable.prettyprint.char = 3L))
+test(2253.06, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(ja_ichi, 1L), dots), options=list(datatable.prettyprint.char = 1L))
+# Tests for multiple, different length combining character rows
+DT = data.table(strrep(accented_a, 1L:4L))
+test(2253.07, DT, output="     V1\n1:    á\n2:   áá\n3:  ááá\n4: áááá", options=list(datatable.prettyprint.char = 4L))
+test(2253.08, DT, output="       V1\n1:      á\n2:     áá\n3:    ááá\n4: ááá...", options=list(datatable.prettyprint.char = 3L))
+test(2253.09, DT, output="     V1\n1:    á\n2: á...\n3: á...\n4: á...", options=list(datatable.prettyprint.char = 1L))
+# Tests for multiple, different length full-width characters
+DT = data.table(strrep(ja_ichi, 1L:4L))
+test(2253.10, DT, output="         V1\n1:       一\n2:     一一\n3:   一一一\n4: 一一一一", options=list(datatable.prettyprint.char = 4L))
+test(2253.11, DT, output="          V1\n1:        一\n2:      一一\n3:    一一一\n4: 一一一...", options=list(datatable.prettyprint.char = 3L))
+test(2253.12, DT, output="      V1\n1:    一\n2: 一...\n3: 一...\n4: 一...", options=list(datatable.prettyprint.char = 1L))
+# Tests for combined characters
+DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")
+test(2253.13, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 4L))
+test(2253.14, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 3L))
+test(2253.15, capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...", options=list(datatable.prettyprint.char = 2L))
+test(2253.16, capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...", options=list(datatable.prettyprint.char = 1L))

From 03d75fbb74306dc5a2afa782060169d86e263876 Mon Sep 17 00:00:00 2001
From: joshhwuu <joshuawu2004@gmail.com>
Date: Thu, 4 Apr 2024 11:58:38 -0700
Subject: [PATCH 3/7] Added comment to char.trunc for future issues/suggestions

---
 R/print.data.table.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/print.data.table.R b/R/print.data.table.R
index 7a76155206..919c8aaeda 100644
--- a/R/print.data.table.R
+++ b/R/print.data.table.R
@@ -229,6 +229,8 @@ format_list_item.default = function(x, ...) {
 
 # FR #1091 for pretty printing of character
 # TODO: maybe instead of doing "this is...", we could do "this ... test"?
+# The current implementation may mishandle strings that mix full-width and half-width characters;
+# if this becomes a problem in the future, we could consider string traversal instead.
 char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) {
   trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE)
   if (!is.character(x) || trunc.char <= 0L) return(x)

From fd89e81c54bdfac7306bd024fc00eff68f8ffcdc Mon Sep 17 00:00:00 2001
From: joshhwuu <joshuawu2004@gmail.com>
Date: Thu, 4 Apr 2024 11:59:07 -0700
Subject: [PATCH 4/7] Refactored tests for readability, added multiple
 rows/columns tests

---
 inst/tests/tests.Rraw | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 11d33dd97f..1035ed6bcc 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18447,7 +18447,7 @@ ja_ni = "\u4E8C"
 ja_ko = "\u3053"
 ja_n = "\u3093"
 dots = "..."
-clean_regex = "^1:\\s+" # cleans "1:    " from beginning of output
+clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output
 # Tests for combining character latin a and acute accent, single row
 DT = data.table(strrep(accented_a, 4L))
 test(2253.01, gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 4L), options=list(datatable.prettyprint.char = 4L))
@@ -18460,17 +18460,22 @@ test(2253.05, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strr
 test(2253.06, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(ja_ichi, 1L), dots), options=list(datatable.prettyprint.char = 1L))
 # Tests for multiple, different length combining character rows
 DT = data.table(strrep(accented_a, 1L:4L))
-test(2253.07, DT, output="     V1\n1:    á\n2:   áá\n3:  ááá\n4: áááá", options=list(datatable.prettyprint.char = 4L))
-test(2253.08, DT, output="       V1\n1:      á\n2:     áá\n3:    ááá\n4: ááá...", options=list(datatable.prettyprint.char = 3L))
-test(2253.09, DT, output="     V1\n1:    á\n2: á...\n3: á...\n4: á...", options=list(datatable.prettyprint.char = 1L))
+test(2253.07, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "áááá"), options=list(datatable.prettyprint.char = 4L))
+test(2253.08, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "ááá..."), options=list(datatable.prettyprint.char = 3L))
+test(2253.09, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "á...", "á...", "á..."), options=list(datatable.prettyprint.char = 1L))
 # Tests for multiple, different length full-width characters
 DT = data.table(strrep(ja_ichi, 1L:4L))
-test(2253.10, DT, output="         V1\n1:       一\n2:     一一\n3:   一一一\n4: 一一一一", options=list(datatable.prettyprint.char = 4L))
-test(2253.11, DT, output="          V1\n1:        一\n2:      一一\n3:    一一一\n4: 一一一...", options=list(datatable.prettyprint.char = 3L))
-test(2253.12, DT, output="      V1\n1:    一\n2: 一...\n3: 一...\n4: 一...", options=list(datatable.prettyprint.char = 1L))
-# Tests for combined characters
+test(2253.10, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一一"), options=list(datatable.prettyprint.char = 4L))
+test(2253.11, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一..."), options=list(datatable.prettyprint.char = 3L))
+test(2253.12, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一...", "一...", "一..."), options=list(datatable.prettyprint.char = 1L))
+# Tests for combined characters, multiple columns
 DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")
 test(2253.13, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 4L))
 test(2253.14, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 3L))
 test(2253.15, capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...", options=list(datatable.prettyprint.char = 2L))
 test(2253.16, capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...", options=list(datatable.prettyprint.char = 1L))
+# Tests for multiple columns, multiple rows
+DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3))
+test(2253.17, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ     んん ááá", "ここ   んんん ááá", "こここ んんんん ááá"), options=list(datatable.prettyprint.char = 4L))
+test(2253.18, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ      んん ááá", "ここ    んんん ááá", "こここ んんん... ááá"), options=list(datatable.prettyprint.char = 3L))
+test(2253.19, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... á..."), options=list(datatable.prettyprint.char = 1L))

From da9cd01232dc793362bde90c208d9dce37097513 Mon Sep 17 00:00:00 2001
From: joshhwuu <joshuawu2004@gmail.com>
Date: Thu, 4 Apr 2024 11:59:15 -0700
Subject: [PATCH 5/7] Updated NEWS.md

---
 NEWS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 902f2fecc2..8a3a1a965b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -56,6 +56,8 @@
 
 8. OpenMP detection when building from source on Mac is improved, [#4348](https://github.com/Rdatatable/data.table/issues/4348). Thanks @jameshester and @kevinushey for the request and @kevinushey for the PR, @jameslamb for the advice and @s-u of R-core for ensuring CRAN machines are configured to support the uxpected setup.
 
+9. `print.data.table` now handles combining and full-width (multibyte) characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix.
+
 # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29)  (30 Jan 2024)
 
 ## BREAKING CHANGE

From e94d863fc89e5b6c10ef61478c710b6d57abd73f Mon Sep 17 00:00:00 2001
From: joshhwuu <joshuawu2004@gmail.com>
Date: Thu, 4 Apr 2024 11:59:28 -0700
Subject: [PATCH 6/7] Added myself as contributor in DESCRIPTION

---
 DESCRIPTION | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 6b29cb848a..9e00eb8f15 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -83,5 +83,6 @@ Authors@R: c(
   person("Dereck","de Mezquita",   role="ctb"),
   person("Michael","Czekanski",    role="ctb"),
   person("Dmitry", "Shemetov",     role="ctb"),
-  person("Nitish", "Jha",          role="ctb")
+  person("Nitish", "Jha",          role="ctb"),
+  person("Joshua", "Wu",           role="ctb")
   )

From 2bb5da392f3d94a3cb10bf34e475e9ed26a8dca1 Mon Sep 17 00:00:00 2001
From: joshhwuu <joshuawu2004@gmail.com>
Date: Thu, 4 Apr 2024 15:58:33 -0700
Subject: [PATCH 7/7] changed test style to have options in front

---
 inst/tests/tests.Rraw | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 1035ed6bcc..b17e2f7d78 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18450,32 +18450,32 @@ dots = "..."
 clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output
 # Tests for combining character latin a and acute accent, single row
 DT = data.table(strrep(accented_a, 4L))
-test(2253.01, gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 4L), options=list(datatable.prettyprint.char = 4L))
-test(2253.02, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(accented_a, 3L), dots), options=list(datatable.prettyprint.char = 3L))
-test(2253.03, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(accented_a, 1L), dots), options=list(datatable.prettyprint.char = 1L))
+test(2253.01, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L))
+test(2253.02, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots))
+test(2253.03, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots))
 # Tests for full-width japanese character ichi, single row
 DT = data.table(strrep(ja_ichi, 4L))
-test(2253.04, gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 4L), options=list(datatable.prettyprint.char = 4L))
-test(2253.05, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(ja_ichi, 3L), dots), options=list(datatable.prettyprint.char = 3L))
-test(2253.06, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(ja_ichi, 1L), dots), options=list(datatable.prettyprint.char = 1L))
+test(2253.04, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L))
+test(2253.05, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots))
+test(2253.06, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots))
 # Tests for multiple, different length combining character rows
 DT = data.table(strrep(accented_a, 1L:4L))
-test(2253.07, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "áááá"), options=list(datatable.prettyprint.char = 4L))
-test(2253.08, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "ááá..."), options=list(datatable.prettyprint.char = 3L))
-test(2253.09, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "á...", "á...", "á..."), options=list(datatable.prettyprint.char = 1L))
+test(2253.07, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "áááá"))
+test(2253.08, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "ááá..."))
+test(2253.09, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "á...", "á...", "á..."))
 # Tests for multiple, different length full-width characters
 DT = data.table(strrep(ja_ichi, 1L:4L))
-test(2253.10, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一一"), options=list(datatable.prettyprint.char = 4L))
-test(2253.11, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一..."), options=list(datatable.prettyprint.char = 3L))
-test(2253.12, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一...", "一...", "一..."), options=list(datatable.prettyprint.char = 1L))
+test(2253.10, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一一"))
+test(2253.11, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一..."))
+test(2253.12, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一...", "一...", "一..."))
 # Tests for combined characters, multiple columns
 DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")
-test(2253.13, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 4L))
-test(2253.14, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 3L))
-test(2253.15, capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...", options=list(datatable.prettyprint.char = 2L))
-test(2253.16, capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...", options=list(datatable.prettyprint.char = 1L))
+test(2253.13, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa")
+test(2253.14, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa")
+test(2253.15, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...")
+test(2253.16, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...")
 # Tests for multiple columns, multiple rows
 DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3))
-test(2253.17, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ     んん ááá", "ここ   んんん ááá", "こここ んんんん ááá"), options=list(datatable.prettyprint.char = 4L))
-test(2253.18, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ      んん ááá", "ここ    んんん ááá", "こここ んんん... ááá"), options=list(datatable.prettyprint.char = 3L))
-test(2253.19, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... á..."), options=list(datatable.prettyprint.char = 1L))
+test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ     んん ááá", "ここ   んんん ááá", "こここ んんんん ááá"))
+test(2253.18, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ      んん ááá", "ここ    んんん ááá", "こここ んんん... ááá"))
+test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... á..."))