From 98c2b6490259c2845e66931e066acdfbce12ef83 Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Thu, 4 Apr 2024 02:49:36 -0700 Subject: [PATCH 1/7] changed char.trunc to better handle full-width and combining characters --- R/print.data.table.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/print.data.table.R b/R/print.data.table.R index 6588ca458f..0acd666df9 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -232,8 +232,11 @@ format_list_item.default = function(x, ...) { char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) { trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) if (!is.character(x) || trunc.char <= 0L) return(x) - idx = which(nchar(x) > trunc.char) - x[idx] = paste0(substr(x[idx], 1L, as.integer(trunc.char)), "...") + nchar_width = nchar(x, 'width') # Test whether string is full-width or half-width, #5096 + nchar_chars = nchar(x, 'char') + is_full_width = nchar_width > nchar_chars + idx = !is.na(x) & pmin(nchar_width, nchar_chars) > trunc.char # never truncate NA: nchar(NA_character_, 'char') is NA, so the comparison alone would yield an NA subscript + x[idx] = paste0(strtrim(x[idx], trunc.char * fifelse(is_full_width[idx], 2L, 1L)), "...") x } From 725c03700994a8475d5054024e258186855a7200 Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Thu, 4 Apr 2024 03:32:57 -0700 Subject: [PATCH 2/7] Tests added, make pretty later --- R/print.data.table.R | 2 +- inst/tests/tests.Rraw | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/R/print.data.table.R b/R/print.data.table.R index 0acd666df9..7a76155206 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -232,7 +232,7 @@ format_list_item.default = function(x, ...) 
{ char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) { trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) if (!is.character(x) || trunc.char <= 0L) return(x) - nchar_width = nchar(x, 'width') # Test whether string is full-width or half-width, #5096 + nchar_width = nchar(x, 'width') # Check whether string is full-width or half-width, #5096 nchar_chars = nchar(x, 'char') is_full_width = nchar_width > nchar_chars idx = !is.na(x) & pmin(nchar_width, nchar_chars) > trunc.char # never truncate NA: nchar(NA_character_, 'char') is NA, so the comparison alone would yield an NA subscript diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 287d36713f..11d33dd97f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18439,3 +18439,38 @@ dt = data.table(a = 1L) test(2252.1, dt[, b:=2L], error = "\\[ was called on a data.table.*not data.table-aware.*':='") test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data.table-aware.*'let'") rm(.datatable.aware) + +# tests for trunc.char handling wide characters # 5096 +accented_a = "\u0061\u0301" +ja_ichi = "\u4E00" +ja_ni = "\u4E8C" +ja_ko = "\u3053" +ja_n = "\u3093" +dots = "..." 
+clean_regex = "^1:\\s+" # cleans "1: " from beginning of output +# Tests for combining character latin a and acute accent, single row +DT = data.table(strrep(accented_a, 4L)) +test(2253.01, gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 4L), options=list(datatable.prettyprint.char = 4L)) +test(2253.02, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(accented_a, 3L), dots), options=list(datatable.prettyprint.char = 3L)) +test(2253.03, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(accented_a, 1L), dots), options=list(datatable.prettyprint.char = 1L)) +# Tests for full-width japanese character ichi, single row +DT = data.table(strrep(ja_ichi, 4L)) +test(2253.04, gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 4L), options=list(datatable.prettyprint.char = 4L)) +test(2253.05, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(ja_ichi, 3L), dots), options=list(datatable.prettyprint.char = 3L)) +test(2253.06, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(ja_ichi, 1L), dots), options=list(datatable.prettyprint.char = 1L)) +# Tests for multiple, different length combining character rows +DT = data.table(strrep(accented_a, 1L:4L)) +test(2253.07, DT, output=" V1\n1: á\n2: áá\n3: ááá\n4: áááá", options=list(datatable.prettyprint.char = 4L)) +test(2253.08, DT, output=" V1\n1: á\n2: áá\n3: ááá\n4: ááá...", options=list(datatable.prettyprint.char = 3L)) +test(2253.09, DT, output=" V1\n1: á\n2: á...\n3: á...\n4: á...", options=list(datatable.prettyprint.char = 1L)) +# Tests for multiple, different length full-width characters +DT = data.table(strrep(ja_ichi, 1L:4L)) +test(2253.10, DT, output=" V1\n1: 一\n2: 一一\n3: 一一一\n4: 一一一一", options=list(datatable.prettyprint.char = 4L)) +test(2253.11, DT, output=" V1\n1: 一\n2: 一一\n3: 一一一\n4: 一一一...", options=list(datatable.prettyprint.char = 3L)) +test(2253.12, DT, output=" V1\n1: 一\n2: 一...\n3: 一...\n4: 
一...", options=list(datatable.prettyprint.char = 1L)) +# Tests for combined characters +DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa") +test(2253.13, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 4L)) +test(2253.14, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 3L)) +test(2253.15, capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...", options=list(datatable.prettyprint.char = 2L)) +test(2253.16, capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...", options=list(datatable.prettyprint.char = 1L)) From 03d75fbb74306dc5a2afa782060169d86e263876 Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Thu, 4 Apr 2024 11:58:38 -0700 Subject: [PATCH 3/7] Added comment to char.trunc for future issues/suggestions --- R/print.data.table.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/print.data.table.R b/R/print.data.table.R index 7a76155206..919c8aaeda 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -229,6 +229,8 @@ format_list_item.default = function(x, ...) { # FR #1091 for pretty printing of character # TODO: maybe instead of doing "this is...", we could do "this ... test"? +# Current implementation may have issues when dealing with strings that have combinations of full-width and half-width characters, +# if this becomes a problem in the future, we could consider string traversal instead. 
char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) { trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) if (!is.character(x) || trunc.char <= 0L) return(x) From fd89e81c54bdfac7306bd024fc00eff68f8ffcdc Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Thu, 4 Apr 2024 11:59:07 -0700 Subject: [PATCH 4/7] Refactored tests for readability, added multiple rows/columns tests --- inst/tests/tests.Rraw | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 11d33dd97f..1035ed6bcc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18447,7 +18447,7 @@ ja_ni = "\u4E8C" ja_ko = "\u3053" ja_n = "\u3093" dots = "..." -clean_regex = "^1:\\s+" # cleans "1: " from beginning of output +clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output # Tests for combining character latin a and acute accent, single row DT = data.table(strrep(accented_a, 4L)) test(2253.01, gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 4L), options=list(datatable.prettyprint.char = 4L)) @@ -18460,17 +18460,22 @@ test(2253.05, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strr test(2253.06, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(ja_ichi, 1L), dots), options=list(datatable.prettyprint.char = 1L)) # Tests for multiple, different length combining character rows DT = data.table(strrep(accented_a, 1L:4L)) -test(2253.07, DT, output=" V1\n1: á\n2: áá\n3: ááá\n4: áááá", options=list(datatable.prettyprint.char = 4L)) -test(2253.08, DT, output=" V1\n1: á\n2: áá\n3: ááá\n4: ááá...", options=list(datatable.prettyprint.char = 3L)) -test(2253.09, DT, output=" V1\n1: á\n2: á...\n3: á...\n4: á...", options=list(datatable.prettyprint.char = 1L)) +test(2253.07, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "áááá"), options=list(datatable.prettyprint.char = 
4L)) +test(2253.08, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "ááá..."), options=list(datatable.prettyprint.char = 3L)) +test(2253.09, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "á...", "á...", "á..."), options=list(datatable.prettyprint.char = 1L)) # Tests for multiple, different length full-width characters DT = data.table(strrep(ja_ichi, 1L:4L)) -test(2253.10, DT, output=" V1\n1: 一\n2: 一一\n3: 一一一\n4: 一一一一", options=list(datatable.prettyprint.char = 4L)) -test(2253.11, DT, output=" V1\n1: 一\n2: 一一\n3: 一一一\n4: 一一一...", options=list(datatable.prettyprint.char = 3L)) -test(2253.12, DT, output=" V1\n1: 一\n2: 一...\n3: 一...\n4: 一...", options=list(datatable.prettyprint.char = 1L)) -# Tests for combined characters +test(2253.10, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一一"), options=list(datatable.prettyprint.char = 4L)) +test(2253.11, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一..."), options=list(datatable.prettyprint.char = 3L)) +test(2253.12, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一...", "一...", "一..."), options=list(datatable.prettyprint.char = 1L)) +# Tests for combined characters, multiple columns DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa") test(2253.13, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 4L)) test(2253.14, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 3L)) test(2253.15, capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...", options=list(datatable.prettyprint.char = 2L)) test(2253.16, capture.output(print(DT))[-1L], "1: 一 二... こ... á... 
a...", options=list(datatable.prettyprint.char = 1L)) +# Tests for multiple columns, multiple rows +DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3)) +test(2253.17, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんんん ááá"), options=list(datatable.prettyprint.char = 4L)) +test(2253.18, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんん... ááá"), options=list(datatable.prettyprint.char = 3L)) +test(2253.19, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... á..."), options=list(datatable.prettyprint.char = 1L)) From da9cd01232dc793362bde90c208d9dce37097513 Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Thu, 4 Apr 2024 11:59:15 -0700 Subject: [PATCH 5/7] Updated NEWS.md --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 902f2fecc2..8a3a1a965b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -56,6 +56,8 @@ 8. OpenMP detection when building from source on Mac is improved, [#4348](https://github.com/Rdatatable/data.table/issues/4348). Thanks @jameshester and @kevinushey for the request and @kevinushey for the PR, @jameslamb for the advice and @s-u of R-core for ensuring CRAN machines are configured to support the uxpected setup. +9. `print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix. 
+ # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE From e94d863fc89e5b6c10ef61478c710b6d57abd73f Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Thu, 4 Apr 2024 11:59:28 -0700 Subject: [PATCH 6/7] Added myself as contributor in DESCRIPTION --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6b29cb848a..9e00eb8f15 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -83,5 +83,6 @@ Authors@R: c( person("Dereck","de Mezquita", role="ctb"), person("Michael","Czekanski", role="ctb"), person("Dmitry", "Shemetov", role="ctb"), - person("Nitish", "Jha", role="ctb") + person("Nitish", "Jha", role="ctb"), + person("Joshua", "Wu", role="ctb") ) From 2bb5da392f3d94a3cb10bf34e475e9ed26a8dca1 Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Thu, 4 Apr 2024 15:58:33 -0700 Subject: [PATCH 7/7] changed test style to have options in front --- inst/tests/tests.Rraw | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1035ed6bcc..b17e2f7d78 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18450,32 +18450,32 @@ dots = "..." 
clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output # Tests for combining character latin a and acute accent, single row DT = data.table(strrep(accented_a, 4L)) -test(2253.01, gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 4L), options=list(datatable.prettyprint.char = 4L)) -test(2253.02, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(accented_a, 3L), dots), options=list(datatable.prettyprint.char = 3L)) -test(2253.03, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(accented_a, 1L), dots), options=list(datatable.prettyprint.char = 1L)) +test(2253.01, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L)) +test(2253.02, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots)) +test(2253.03, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots)) # Tests for full-width japanese character ichi, single row DT = data.table(strrep(ja_ichi, 4L)) -test(2253.04, gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 4L), options=list(datatable.prettyprint.char = 4L)) -test(2253.05, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(ja_ichi, 3L), dots), options=list(datatable.prettyprint.char = 3L)) -test(2253.06, gsub(clean_regex, "", capture.output(print(DT))[-1L]), paste0(strrep(ja_ichi, 1L), dots), options=list(datatable.prettyprint.char = 1L)) +test(2253.04, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L)) +test(2253.05, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots)) +test(2253.06, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots)) # Tests for multiple, different length combining character rows DT = data.table(strrep(accented_a, 1L:4L)) -test(2253.07, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", 
"áááá"), options=list(datatable.prettyprint.char = 4L)) -test(2253.08, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "ááá..."), options=list(datatable.prettyprint.char = 3L)) -test(2253.09, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "á...", "á...", "á..."), options=list(datatable.prettyprint.char = 1L)) +test(2253.07, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "áááá")) +test(2253.08, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "ááá...")) +test(2253.09, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "á...", "á...", "á...")) # Tests for multiple, different length full-width characters DT = data.table(strrep(ja_ichi, 1L:4L)) -test(2253.10, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一一"), options=list(datatable.prettyprint.char = 4L)) -test(2253.11, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一..."), options=list(datatable.prettyprint.char = 3L)) -test(2253.12, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一...", "一...", "一..."), options=list(datatable.prettyprint.char = 1L)) +test(2253.10, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一一")) +test(2253.11, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一...")) +test(2253.12, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一...", "一...", "一...")) # Tests for combined characters, multiple columns DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa") -test(2253.13, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá 
aaa", options=list(datatable.prettyprint.char = 4L)) -test(2253.14, capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa", options=list(datatable.prettyprint.char = 3L)) -test(2253.15, capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...", options=list(datatable.prettyprint.char = 2L)) -test(2253.16, capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...", options=list(datatable.prettyprint.char = 1L)) +test(2253.13, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa") +test(2253.14, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa") +test(2253.15, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...") +test(2253.16, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...") # Tests for multiple columns, multiple rows DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3)) -test(2253.17, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんんん ááá"), options=list(datatable.prettyprint.char = 4L)) -test(2253.18, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんん... ááá"), options=list(datatable.prettyprint.char = 3L)) -test(2253.19, gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... á..."), options=list(datatable.prettyprint.char = 1L)) +test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんんん ááá")) +test(2253.18, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんん... ááá")) +test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... 
á..."))