Skip to content

Commit d049ddf

Browse files
authored
BUG: ensure to always return new objects in Index set operations (avoid metadata mutation) (#63174)
1 parent 1fc9072 commit d049ddf

File tree

5 files changed

+93
-13
lines changed

5 files changed

+93
-13
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1227,6 +1227,7 @@ Indexing
12271227
- Bug in :meth:`DataFrame.loc.__getitem__` and :meth:`DataFrame.iloc.__getitem__` with a :class:`CategoricalDtype` column with integer categories raising when trying to index a row containing a ``NaN`` entry (:issue:`58954`)
12281228
- Bug in :meth:`Index.__getitem__` incorrectly raising with a 0-dim ``np.ndarray`` key (:issue:`55601`)
12291229
- Bug in :meth:`Index.get_indexer` not casting missing values correctly for new string datatype (:issue:`55833`)
1230+
- Bug in :meth:`Index.intersection`, :meth:`Index.union`, :meth:`MultiIndex.intersection`, and :meth:`MultiIndex.union` returning the original object itself instead of a new instance when the result equals one of the operands (for example, when operating on identical indexes), so that modifying the metadata (such as ``name``) of either the result or the original also changed the other (:issue:`63169`)
12301231
- Bug in adding new rows with :meth:`DataFrame.loc.__setitem__` or :meth:`Series.loc.__setitem__` which failed to retain dtype on the object's index in some cases (:issue:`41626`)
12311232
- Bug in indexing on a :class:`DatetimeIndex` with a ``timestamp[pyarrow]`` dtype or on a :class:`TimedeltaIndex` with a ``duration[pyarrow]`` dtype (:issue:`62277`)
12321233

pandas/core/indexes/base.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2984,13 +2984,12 @@ def __bool__(self) -> NoReturn:
29842984
def _get_reconciled_name_object(self, other):
29852985
"""
29862986
If the result of a set operation will be self,
2987-
return self, unless the name changes, in which
2988-
case make a shallow copy of self.
2987+
return a shallow copy of self.
29892988
"""
29902989
name = get_op_result_name(self, other)
29912990
if self.name is not name:
29922991
return self.rename(name)
2993-
return self
2992+
return self.copy(deep=False)
29942993

29952994
@final
29962995
def _validate_sort_keyword(self, sort) -> None:

pandas/core/indexes/multi.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4099,13 +4099,12 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
40994099
def _get_reconciled_name_object(self, other) -> MultiIndex:
41004100
"""
41014101
If the result of a set operation will be self,
4102-
return self, unless the names change, in which
4103-
case make a shallow copy of self.
4102+
return a shallow copy of self.
41044103
"""
41054104
names = self._maybe_match_names(other)
41064105
if self.names != names:
41074106
return self.rename(names)
4108-
return self
4107+
return self.copy(deep=False)
41094108

41104109
def _maybe_match_names(self, other):
41114110
"""

pandas/tests/indexes/test_setops.py

Lines changed: 85 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,7 @@ def test_intersection(self, index, sort):
727727

728728
# Corner cases
729729
inter = first.intersection(first, sort=sort)
730-
assert inter is first
730+
assert inter is not first
731731

732732
@pytest.mark.parametrize(
733733
"index2_name,keeps_name",
@@ -812,16 +812,16 @@ def test_union_identity(self, index, sort):
812812
first = index[5:20]
813813

814814
union = first.union(first, sort=sort)
815-
# i.e. identity is not preserved when sort is True
816-
assert (union is first) is (not sort)
815+
# GH#63169 - identity is not preserved to prevent shared mutable state
816+
assert union is not first
817817

818818
# This should no longer be the same object, since [] is not consistent,
819819
# both objects will be recast to dtype('O')
820820
union = first.union(Index([], dtype=first.dtype), sort=sort)
821-
assert (union is first) is (not sort)
821+
assert union is not first
822822

823823
union = Index([], dtype=first.dtype).union(first, sort=sort)
824-
assert (union is first) is (not sort)
824+
assert union is not first
825825

826826
@pytest.mark.parametrize("index", ["string"], indirect=True)
827827
@pytest.mark.parametrize("second_name,expected", [(None, None), ("name", "name")])
@@ -984,3 +984,83 @@ def test_union_pyarrow_timestamp(self):
984984
res = left.union(right)
985985
expected = Index(["2020-01-01", "2020-01-02"], dtype=left.dtype)
986986
tm.assert_index_equal(res, expected)
987+
988+
989+
def test_intersection_mutation_safety():
990+
# GH#63169
991+
index1 = Index([0, 1], name="original")
992+
index2 = Index([0, 1], name="original")
993+
994+
result = index1.intersection(index2)
995+
996+
assert result is not index1
997+
assert result is not index2
998+
999+
tm.assert_index_equal(result, index1)
1000+
assert result.name == "original"
1001+
1002+
index1.name = "changed"
1003+
1004+
assert result.name == "original"
1005+
assert index1.name == "changed"
1006+
1007+
1008+
def test_union_mutation_safety():
1009+
# GH#63169
1010+
index1 = Index([0, 1], name="original")
1011+
index2 = Index([0, 1], name="original")
1012+
1013+
result = index1.union(index2)
1014+
1015+
assert result is not index1
1016+
assert result is not index2
1017+
1018+
tm.assert_index_equal(result, index1)
1019+
assert result.name == "original"
1020+
1021+
index1.name = "changed"
1022+
1023+
assert result.name == "original"
1024+
assert index1.name == "changed"
1025+
1026+
1027+
def test_union_mutation_safety_other():
1028+
# GH#63169
1029+
index1 = Index([0, 1], name="original")
1030+
index2 = Index([0, 1], name="original")
1031+
1032+
result = index1.union(index2)
1033+
1034+
assert result is not index2
1035+
1036+
tm.assert_index_equal(result, index2)
1037+
assert result.name == "original"
1038+
1039+
index2.name = "changed"
1040+
1041+
assert result.name == "original"
1042+
assert index2.name == "changed"
1043+
1044+
1045+
def test_multiindex_intersection_mutation_safety():
1046+
# GH#63169
1047+
mi1 = MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["x", "y"])
1048+
mi2 = MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["x", "y"])
1049+
1050+
result = mi1.intersection(mi2)
1051+
assert result is not mi1
1052+
1053+
mi1.names = ["changed1", "changed2"]
1054+
assert result.names == ["x", "y"]
1055+
1056+
1057+
def test_multiindex_union_mutation_safety():
1058+
# GH#63169
1059+
mi1 = MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["x", "y"])
1060+
mi2 = MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["x", "y"])
1061+
1062+
result = mi1.union(mi2)
1063+
assert result is not mi1
1064+
1065+
mi1.names = ["changed1", "changed2"]
1066+
assert result.names == ["x", "y"]

pandas/tests/indexes/timedeltas/test_setops.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def test_intersection_bug_1708(self):
114114

115115
def test_intersection_equal(self, sort):
116116
# GH 24471 Test intersection outcome given the sort keyword
117-
# for equal indices intersection should return the original index
117+
# GH#63169 intersection returns a copy to prevent shared mutable state
118118
first = timedelta_range("1 day", periods=4, freq="h")
119119
second = timedelta_range("1 day", periods=4, freq="h")
120120
intersect = first.intersection(second, sort=sort)
@@ -124,7 +124,8 @@ def test_intersection_equal(self, sort):
124124

125125
# Corner cases
126126
inter = first.intersection(first, sort=sort)
127-
assert inter is first
127+
assert inter is not first
128+
tm.assert_index_equal(inter, first)
128129

129130
@pytest.mark.parametrize("period_1, period_2", [(0, 4), (4, 0)])
130131
def test_intersection_zero_length(self, period_1, period_2, sort):

0 commit comments

Comments
 (0)