larray-project · gdementen · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/doc/source/changes/version_0_35_1.rst.inc b/doc/source/changes/version_0_35_1.rst.inc
@@ -70,3 +70,7 @@ Fixes
   (closes :issue:`1166`).
 
 * fixed `sequence()` when both `inc` and `mult` are defined.
+
+* fixed taking the subset of an Axis with mixed-type labels (or of an Array
+  using such an axis) when specifying the labels as a numpy array
+  (closes :issue:`1194`).
diff --git a/larray/core/axis.py b/larray/core/axis.py
@@ -244,6 +244,8 @@ def _mapping(self) -> Dict[Scalar, int]:
 
     def _update_key_values(self) -> Tuple[np.ndarray, np.ndarray]:
         mapping = self._mapping
+        assert self.dtype.kind != 'O', ("Axis with object dtype should not use "
+                                        "the sorted_keys/values code path")
         if mapping:
             sorted_keys, sorted_values = tuple(zip(*sorted(mapping.items())))
         else:
@@ -1100,7 +1102,10 @@ def index(self, key) -> Union[int, np.ndarray, slice]:
             # stop is inclusive in the input key and exclusive in the output !
             stop = mapping[key.stop] + 1 if key.stop is not None else None
             return slice(start, stop, key.step)
-        elif isinstance(key, (tuple, list, OrderedSet)):
+        elif (isinstance(key, (tuple, list, OrderedSet)) or
+              # object axis labels can contain mixed-types and those are not
+              # supported by the array_lookup2 code path
+              (isinstance(key, np.ndarray) and self.dtype.kind == 'O')):
             # TODO: the result should be cached
             # Note that this is faster than array_lookup(np.array(key), mapping)
             res = np.empty(len(key), int)

diff --git a/larray/core/group.py b/larray/core/group.py
@@ -334,10 +334,6 @@ def _can_have_groups(seq) -> bool:
     return _is_object_array(seq) or isinstance(seq, (tuple, list))
 
 
-def _contain_group_ticks(ticks) -> bool:
-    return _can_have_groups(ticks) and any(isinstance(tick, Group) for tick in ticks)
-
-
 def _seq_group_to_name(seq) -> Sequence[Any]:
     if _can_have_groups(seq):
         return [v.name if isinstance(v, Group) else v for v in seq]
@@ -387,7 +383,7 @@ def _to_tick(v) -> Scalar:
         return str(v)
 
 
-def _to_ticks(s, parse_single_int=False) -> Iterable[Scalar]:
+def _to_ticks(s, parse_single_int=False) -> np.ndarray:
     r"""
     Make a (list of) value(s) usable as the collection of labels for an Axis (ie hashable).
 

diff --git a/larray/inout/pandas.py b/larray/inout/pandas.py
@@ -39,27 +39,55 @@ def parse(s):
         return s
 
 
-def index_to_labels(idx, sort=True): # -> list[np.ndarray]:
+def simple_index_to_labels(idx: pd.Index, sort=True, keep_object=True) -> np.ndarray:
+    r"""
+    Return the labels of a simple index as a numpy array (sorted if sort is True)
+
+    keep_object is an option which should not exist (it should always be True).
+    Using keep_object=False is a bug, but the option was introduced on purpose
+    to avoid breaking our users' existing code in a bug-fix release (see #1193).
+    """
+    if sort:
+        dtype = 'O' if keep_object and idx.dtype.kind == 'O' else None
+        # sorted() will fail for mixed-type labels (as does np.sort(idx.to_numpy()))
+        labels = np.asarray(sorted(idx.to_list()), dtype=dtype)
+    else:
+        if keep_object:
+            labels = idx.to_numpy()
+        else:
+            # this is NOT the same as idx.to_numpy() (which we should always
+            # use) because np.asarray converts mixed str-numbers object indexes
+            # to a single str type.
+            labels = np.asarray(idx.to_list())
+    # converting datetime labels to str is a bug kept on purpose for backward
+    # compatibility (see issue #1187)
+    if isinstance(idx, pd.DatetimeIndex):
+        labels = labels.astype(str)
+    return labels
+
+
+def index_to_labels(idx, sort=True, keep_object=True): # -> list[np.ndarray]:
     r"""
     Return unique labels for each dimension as a list of numpy arrays
+
+    keep_object means that object dtype indexes will be returned as object
+    dtype arrays, even if they contain only strings or numbers (see #1193).
     """
     if isinstance(idx, pd.MultiIndex):
         if sort:
-            return [level.to_numpy() for level in idx.levels]
+            # idx.levels is a FrozenList of Index objects (which are already
+            # sorted)
+            return [simple_index_to_labels(idx_for_level, sort=False,
+                                           keep_object=keep_object)
+                    for idx_for_level in idx.levels]
         else:
             # requires Pandas >= 0.23 (and it does NOT sort the values)
-            return [idx.unique(level_num).to_numpy()
+            return [simple_index_to_labels(idx.unique(level_num), sort=False,
+                                           keep_object=keep_object)
                     for level_num in range(idx.nlevels)]
     else:
         assert isinstance(idx, pd.Index)
-        if sort:
-            # TODO: we should probably only pass via Python when the dtype is
-            #       object
-            # For object arrays, it is often faster to sort the labels
-            # in Python than to use np.sort, which is very slow in that case
-            return [np.asarray(sorted(idx.to_list()))]
-        else:
-            return [idx.to_numpy()]
+        return [simple_index_to_labels(idx, sort=sort, keep_object=keep_object)]
 
 
 def product_index(idx, sort=False):
@@ -330,8 +358,11 @@ def from_frame(df,
         if sort_rows or sort_columns:
             raise ValueError('sort_rows and sort_columns cannot not be used when cartesian_prod is set to False. '
                              'Please call the method sort_labels on the returned array to sort rows or columns')
-        index_labels = index_to_labels(df.index, sort=False)
-        column_labels = index_to_labels(df.columns, sort=False)
+        # keep_object=False is an intentional bug to avoid breaking backwards
+        # compatibility (see issue #1193)
+        index_labels = index_to_labels(df.index, sort=False, keep_object=False)
+        column_labels = index_to_labels(df.columns, sort=False,
+                                        keep_object=False)
         axes_labels = index_labels + column_labels
 
     # Pandas treats column labels as column names (strings) so we need to convert them to values

diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py
@@ -4134,6 +4134,8 @@ def test_read_excel_pandas():
 
 
 def test_from_lists():
+    from datetime import datetime
+
     expected = ndtest((2, 2, 3))
 
     # simple
@@ -4200,6 +4202,15 @@ def test_from_lists():
                              ['c1', 'FO', 0, 0, 2]], sort_columns=True)
     assert_larray_equal(sorted_arr, expected)
 
+    # with datetime labels
+    res = from_lists([['str', 'date', 'value'],
+                      ['abc', datetime.now(), 1]],
+                     nb_axes=3)
+    # this is what we SHOULD return but we do not so far to avoid breaking
+    # backward compatibility (see issue #1187)
+    # assert res.axes[1].dtype == 'datetime64[ns]'
+    assert res.axes[1].dtype.kind == 'U'
+
 
 def test_to_series():
     # simple test
@@ -4852,6 +4863,17 @@ def test_from_frame():
     assert_larray_equal(res, expected)
     assert res.data.flags.writeable
 
+    # test that a mixed-type (str-number) multi-index level comes out as a
+    # mixed-type labels axis (see issue #1193)
+    df = pd.DataFrame([['s0', 'a', 0], ['s0', 1, 1]],
+                      columns=['str', 'obj', 'value']).set_index(['str', 'obj'])
+    res = from_frame(df)
+    expected_axes = [Axis('str=s0'),
+                     Axis(np.array(['a', 1], dtype=object), 'obj'),
+                     Axis(['value'])]
+    expected = Array([[[0], [1]]], expected_axes)
+    assert_larray_equal(res, expected)
+
 
 def test_asarray():
     series = pd.Series([0, 1, 2], ['a0', 'a1', 'a2'], name='a')

diff --git a/larray/tests/test_axis.py b/larray/tests/test_axis.py
@@ -122,11 +122,7 @@ def test_getitem():
 
 
 def test_index():
-    # an axis with labels having the object dtype
-    a = Axis(np.array(["a0", "a1"], dtype=object), 'a')
-    assert a.index('a1') == 1
-    assert a.index('a1 >> A1') == 1
-
+    # a normal axis
     time = Axis([2007, 2009], 'time')
     res = time.index(time.i[1])
     assert res == 1
@@ -137,6 +133,19 @@ def test_index():
     res = time.index('time.i[1]')
     assert res == 1
 
+    # an axis with labels having the object dtype (but homogeneous types)
+    a = Axis(np.array(["a0", "a1"], dtype=object), 'a')
+    assert a.index('a1') == 1
+    assert a.index('a1 >> A1') == 1
+
+    # an axis with labels having the object dtype and mixed types
+    a = Axis(np.array(["a0", 1], dtype=object), 'a')
+    assert a.index('a0') == 0
+    assert a.index(['a0']) == [0]
+    assert a.index(1) == 1
+    # issue #1194
+    assert a.index(np.array(['a0'])) == [0]
+
 
 def test_astype():
     arr = ndtest(Axis('time=2015..2020,total')).drop('total')