Skip to content
Merged

0.35.1 #1196

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/changes/version_0_35_1.rst.inc
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,7 @@ Fixes
(closes :issue:`1166`).

* fixed `sequence()` when both `inc` and `mult` are defined.

* fixed taking the subset of an Axis with mixed type labels (or of an Array
using such an axis) when specifying the labels as a numpy array
(closes :issue:`1194`).
7 changes: 6 additions & 1 deletion larray/core/axis.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,8 @@ def _mapping(self) -> Dict[Scalar, int]:

def _update_key_values(self) -> Tuple[np.ndarray, np.ndarray]:
mapping = self._mapping
assert self.dtype.kind != 'O', ("Axis with object dtype should not use "
"the sorted_keys/values code path")
if mapping:
sorted_keys, sorted_values = tuple(zip(*sorted(mapping.items())))
else:
Expand Down Expand Up @@ -1100,7 +1102,10 @@ def index(self, key) -> Union[int, np.ndarray, slice]:
# stop is inclusive in the input key and exclusive in the output !
stop = mapping[key.stop] + 1 if key.stop is not None else None
return slice(start, stop, key.step)
elif isinstance(key, (tuple, list, OrderedSet)):
elif (isinstance(key, (tuple, list, OrderedSet)) or
# object axis labels can contain mixed-types and those are not
# supported by the array_lookup2 code path
(isinstance(key, np.ndarray) and self.dtype.kind == 'O')):
# TODO: the result should be cached
# Note that this is faster than array_lookup(np.array(key), mapping)
res = np.empty(len(key), int)
Expand Down
6 changes: 1 addition & 5 deletions larray/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,10 +334,6 @@ def _can_have_groups(seq) -> bool:
return _is_object_array(seq) or isinstance(seq, (tuple, list))


def _contain_group_ticks(ticks) -> bool:
    r"""
    Return whether `ticks` is a sequence which can hold groups and actually
    contains at least one Group instance.
    """
    # sequences which cannot hold groups are never iterated over
    if not _can_have_groups(ticks):
        return False
    return any(isinstance(label, Group) for label in ticks)


def _seq_group_to_name(seq) -> Sequence[Any]:
if _can_have_groups(seq):
return [v.name if isinstance(v, Group) else v for v in seq]
Expand Down Expand Up @@ -387,7 +383,7 @@ def _to_tick(v) -> Scalar:
return str(v)


def _to_ticks(s, parse_single_int=False) -> Iterable[Scalar]:
def _to_ticks(s, parse_single_int=False) -> np.ndarray:
r"""
Make a (list of) value(s) usable as the collection of labels for an Axis (ie hashable).

Expand Down
57 changes: 44 additions & 13 deletions larray/inout/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,27 +39,55 @@ def parse(s):
return s


def index_to_labels(idx, sort=True): # -> list[np.ndarray]:
def simple_index_to_labels(idx: pd.Index, sort=True, keep_object=True) -> np.ndarray:
    r"""
    Return the labels of a simple (non multi) index as a numpy array.

    Labels are not de-duplicated here: callers are expected to pass an index
    which already contains unique labels.

    Parameters
    ----------
    idx : pd.Index
        Index to extract the labels from.
    sort : bool, optional
        Whether to sort the labels (using Python's `sorted`). Defaults to True.
    keep_object : bool, optional
        Whether an object dtype index is returned as an object dtype array.
        This option should not exist (it should always be True): using
        keep_object=False is a bug, but the option was introduced on purpose
        to avoid breaking existing user code in a bug-fix release (see #1193).
        Defaults to True.

    Returns
    -------
    np.ndarray
        The labels of the index.
    """
    if sort:
        # dtype=None lets numpy infer the dtype from the sorted labels
        dtype = 'O' if keep_object and idx.dtype.kind == 'O' else None
        # this will fail for mixed-type labels (as does np.sort(idx.to_numpy()))
        labels = np.asarray(sorted(idx.to_list()), dtype=dtype)
    elif keep_object:
        labels = idx.to_numpy()
    else:
        # this is NOT the same as idx.to_numpy() (which we should always
        # use) because it converts mixed str-numbers object indexes to a
        # single str type.
        labels = np.asarray(idx.to_list())

    # NOTE(review): the nesting level of this check was reconstructed from a
    # flattened source; it is assumed to apply to every code path -- confirm.
    # this is a bug introduced on purpose to keep backward compatibility
    # (see issue #1187)
    if isinstance(idx, pd.DatetimeIndex):
        labels = labels.astype(str)
    return labels


def index_to_labels(idx, sort=True, keep_object=True): # -> list[np.ndarray]:
r"""
Return unique labels for each dimension as a list of numpy arrays

keep_object means that object dtype indexes will be returned as object
dtype arrays, even if they contain only strings or numbers (see #1193).
"""
if isinstance(idx, pd.MultiIndex):
if sort:
return [level.to_numpy() for level in idx.levels]
# idx.levels is a FrozenList of Index objects (which are already
# sorted)
return [simple_index_to_labels(idx_for_level, sort=False,
keep_object=keep_object)
for idx_for_level in idx.levels]
else:
# requires Pandas >= 0.23 (and it does NOT sort the values)
return [idx.unique(level_num).to_numpy()
return [simple_index_to_labels(idx.unique(level_num), sort=False,
keep_object=keep_object)
for level_num in range(idx.nlevels)]
else:
assert isinstance(idx, pd.Index)
if sort:
# TODO: we should probably only pass via Python when the dtype is
# object
# For object arrays, it is often faster to sort the labels
# in Python than to use np.sort, which is very slow in that case
return [np.asarray(sorted(idx.to_list()))]
else:
return [idx.to_numpy()]
return [simple_index_to_labels(idx, sort=sort, keep_object=keep_object)]


def product_index(idx, sort=False):
Expand Down Expand Up @@ -330,8 +358,11 @@ def from_frame(df,
if sort_rows or sort_columns:
raise ValueError('sort_rows and sort_columns cannot not be used when cartesian_prod is set to False. '
'Please call the method sort_labels on the returned array to sort rows or columns')
index_labels = index_to_labels(df.index, sort=False)
column_labels = index_to_labels(df.columns, sort=False)
# keep_object=False is an intentional bug to avoid breaking backwards
# compatibility (see issue #1193)
index_labels = index_to_labels(df.index, sort=False, keep_object=False)
column_labels = index_to_labels(df.columns, sort=False,
keep_object=False)
axes_labels = index_labels + column_labels

# Pandas treats column labels as column names (strings) so we need to convert them to values
Expand Down
22 changes: 22 additions & 0 deletions larray/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -4134,6 +4134,8 @@ def test_read_excel_pandas():


def test_from_lists():
from datetime import datetime

expected = ndtest((2, 2, 3))

# simple
Expand Down Expand Up @@ -4200,6 +4202,15 @@ def test_from_lists():
['c1', 'FO', 0, 0, 2]], sort_columns=True)
assert_larray_equal(sorted_arr, expected)

# with datetime labels
res = from_lists([['str', 'date', 'value'],
['abc', datetime.now(), 1]],
nb_axes=3)
# this is what we SHOULD return but we do not so far to avoid breaking
# backward compatibility (see issue #1187)
# assert res.axes[1].dtype == 'datetime64[ns]'
assert res.axes[1].dtype.kind == 'U'


def test_to_series():
# simple test
Expand Down Expand Up @@ -4852,6 +4863,17 @@ def test_from_frame():
assert_larray_equal(res, expected)
assert res.data.flags.writeable

# test that a mixed-type (str-number) multi-index level comes out as an axis
# with mixed-type labels (see issue #1193)
df = pd.DataFrame([['s0', 'a', 0], ['s0', 1, 1]],
columns=['str', 'obj', 'value']).set_index(['str', 'obj'])
res = from_frame(df)
expected_axes = [Axis('str=s0'),
Axis(np.array(['a', 1], dtype=object), 'obj'),
Axis(['value'])]
expected = Array([[[0], [1]]], expected_axes)
assert_larray_equal(res, expected)


def test_asarray():
series = pd.Series([0, 1, 2], ['a0', 'a1', 'a2'], name='a')
Expand Down
19 changes: 14 additions & 5 deletions larray/tests/test_axis.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,7 @@ def test_getitem():


def test_index():
# an axis with labels having the object dtype
a = Axis(np.array(["a0", "a1"], dtype=object), 'a')
assert a.index('a1') == 1
assert a.index('a1 >> A1') == 1

# a normal axis
time = Axis([2007, 2009], 'time')
res = time.index(time.i[1])
assert res == 1
Expand All @@ -137,6 +133,19 @@ def test_index():
res = time.index('time.i[1]')
assert res == 1

# an axis with labels having the object dtype (but homogeneous types)
a = Axis(np.array(["a0", "a1"], dtype=object), 'a')
assert a.index('a1') == 1
assert a.index('a1 >> A1') == 1

# an axis with labels having the object dtype and mixed types
a = Axis(np.array(["a0", 1], dtype=object), 'a')
assert a.index('a0') == 0
assert a.index(['a0']) == [0]
assert a.index(1) == 1
# issue #1194
assert a.index(np.array(['a0'])) == [0]


def test_astype():
arr = ndtest(Axis('time=2015..2020,total')).drop('total')
Expand Down
Loading