From d70de6bb8779e4c47889bdcfc5e07947acb77c96 Mon Sep 17 00:00:00 2001 From: Jack Dockerty Date: Fri, 13 Dec 2024 14:10:54 +0000 Subject: [PATCH 1/4] fix: add `null_buffer` check for `LargeStringArray` Add a safety check to ensure that the alignment of buffers cannot be overflowed. This introduces a panic if they are not aligned through a runtime assertion. --- datafusion/functions/src/strings.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index d2fb5d58519e5..0d2e62bdd801e 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -335,7 +335,27 @@ impl LargeStringArrayBuilder { unsafe { self.offsets_buffer.push_unchecked(next_offset) }; } + /// Finalise the builder into a concrete [`LargeStringArray`]. + /// + /// # Panics + /// + /// This method can panic when: + /// + /// - the provided `null_buffer` is not the same length as the `offsets_buffer`. + /// - the provided `null_buffer` is not the same length as the `value_buffer`. pub fn finish(self, null_buffer: Option) -> LargeStringArray { + if let Some(ref null_buffer) = null_buffer { + assert_eq!( + null_buffer.len(), + self.offsets_buffer.len() / size_of::() - 1, + "Null buffer and offsets buffer must be the same length" + ); + assert_eq!( + null_buffer.len(), + self.value_buffer.len(), + "Null buffer and value buffer must be the same length" + ); + } let array_builder = ArrayDataBuilder::new(DataType::LargeUtf8) .len(self.offsets_buffer.len() / size_of::() - 1) .add_buffer(self.offsets_buffer.into()) From 19b538ae6a381b5f3fe613820e745240223839c8 Mon Sep 17 00:00:00 2001 From: Jack Dockerty Date: Fri, 13 Dec 2024 15:04:04 +0000 Subject: [PATCH 2/4] fix: remove value_buffer assertion These buffers can be misaligned and it is not problematic, it is the `null_buffer` which we care about being of the same length. --- datafusion/functions/src/strings.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index 0d2e62bdd801e..b6a6914f9b71e 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -342,7 +342,6 @@ impl LargeStringArrayBuilder { /// This method can panic when: /// /// - the provided `null_buffer` is not the same length as the `offsets_buffer`. - /// - the provided `null_buffer` is not the same length as the `value_buffer`. pub fn finish(self, null_buffer: Option) -> LargeStringArray { if let Some(ref null_buffer) = null_buffer { assert_eq!( @@ -350,11 +349,6 @@ impl LargeStringArrayBuilder { self.offsets_buffer.len() / size_of::() - 1, "Null buffer and offsets buffer must be the same length" ); - assert_eq!( - null_buffer.len(), - self.value_buffer.len(), - "Null buffer and value buffer must be the same length" - ); } let array_builder = ArrayDataBuilder::new(DataType::LargeUtf8) .len(self.offsets_buffer.len() / size_of::() - 1) From 3cdddc042adef12037cbfebafe4473fe0b23b072 Mon Sep 17 00:00:00 2001 From: Jack Dockerty Date: Fri, 13 Dec 2024 15:04:45 +0000 Subject: [PATCH 3/4] feat: add `null_buffer` check to `StringArray` This is in a similar vein to `LargeStringArray`, as the code is the same, except for `i32`'s instead of `i64`. --- datafusion/functions/src/strings.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index b6a6914f9b71e..420bc1cfa2d33 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -185,7 +185,21 @@ impl StringArrayBuilder { unsafe { self.offsets_buffer.push_unchecked(next_offset) }; } + /// Finalise the builder into a concrete [`StringArray`]. + /// + /// # Panics + /// + /// This method can panic when: + /// + /// - the provided `null_buffer` is not the same length as the `offsets_buffer`. pub fn finish(self, null_buffer: Option) -> StringArray { + if let Some(ref null_buffer) = null_buffer { + assert_eq!( + null_buffer.len(), + self.offsets_buffer.len() / size_of::() - 1, + "Null buffer and offsets buffer must be the same length" + ); + } let array_builder = ArrayDataBuilder::new(DataType::Utf8) .len(self.offsets_buffer.len() / size_of::() - 1) .add_buffer(self.offsets_buffer.into()) From a8a15fce1125e9ab0b531d51765e4ab3e50dd3d6 Mon Sep 17 00:00:00 2001 From: Jack Dockerty Date: Fri, 13 Dec 2024 16:02:22 +0000 Subject: [PATCH 4/4] feat: use `row_count` var to avoid drift --- datafusion/functions/src/strings.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index 420bc1cfa2d33..caafbae6ba5fe 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -193,15 +193,16 @@ impl StringArrayBuilder { /// /// - the provided `null_buffer` is not the same length as the `offsets_buffer`. pub fn finish(self, null_buffer: Option) -> StringArray { + let row_count = self.offsets_buffer.len() / size_of::() - 1; if let Some(ref null_buffer) = null_buffer { assert_eq!( null_buffer.len(), - self.offsets_buffer.len() / size_of::() - 1, + row_count, "Null buffer and offsets buffer must be the same length" ); } let array_builder = ArrayDataBuilder::new(DataType::Utf8) - .len(self.offsets_buffer.len() / size_of::() - 1) + .len(row_count) .add_buffer(self.offsets_buffer.into()) .add_buffer(self.value_buffer.into()) .nulls(null_buffer); @@ -357,15 +358,16 @@ impl LargeStringArrayBuilder { /// /// - the provided `null_buffer` is not the same length as the `offsets_buffer`. pub fn finish(self, null_buffer: Option) -> LargeStringArray { + let row_count = self.offsets_buffer.len() / size_of::() - 1; if let Some(ref null_buffer) = null_buffer { assert_eq!( null_buffer.len(), - self.offsets_buffer.len() / size_of::() - 1, + row_count, "Null buffer and offsets buffer must be the same length" ); } let array_builder = ArrayDataBuilder::new(DataType::LargeUtf8) - .len(self.offsets_buffer.len() / size_of::() - 1) + .len(row_count) .add_buffer(self.offsets_buffer.into()) .add_buffer(self.value_buffer.into()) .nulls(null_buffer);