python-scikit-learn/backport-CVE-2024-5206.patch

From 70ca21f106b603b611da73012c9ade7cd8e438b8 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Mon, 22 Apr 2024 15:10:46 +0200
Subject: [PATCH] FIX remove the computed stop_words_ attribute of text
 vectorizer (#28823)

---
 doc/whats_new/v1.4.rst                        | 18 ++++++++
 sklearn/feature_extraction/tests/test_text.py | 42 -------------------
 sklearn/feature_extraction/text.py            | 36 +---------------
 3 files changed, 20 insertions(+), 76 deletions(-)

diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index ad3cc40..321db3b 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -14,6 +14,24 @@ For a short description of the main highlights of the release, please refer to
 
 .. include:: changelog_legend.inc
 
+Security
+--------
+
+- |Fix| :class:`feature_extraction.text.CountVectorizer` and
+  :class:`feature_extraction.text.TfidfVectorizer` no longer store discarded
+  tokens from the training set in their `stop_words_` attribute. This attribute
+  would hold too frequent (above `max_df`) but also too rare tokens (below
+  `min_df`). This fixes a potential security issue (data leak) if the discarded
+  rare tokens hold sensitive information from the training set without the
+  model developer's knowledge.
+
+  Note: users of those classes are encouraged to either retrain their pipelines
+  with the new scikit-learn version or to manually clear the `stop_words_`
+  attribute from previously trained instances of those transformers. This
+  attribute was designed only for model inspection purposes and has no impact
+  on the behavior of the transformers.
+  :pr:`28823` by :user:`Olivier Grisel <ogrisel>`.
+
 Changed models
 --------------
 
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 7c7cac8..b784716 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -757,21 +757,11 @@ def test_feature_names():
 @pytest.mark.parametrize("Vectorizer", (CountVectorizer, TfidfVectorizer))
 def test_vectorizer_max_features(Vectorizer):
     expected_vocabulary = {"burger", "beer", "salad", "pizza"}
-    expected_stop_words = {
-        "celeri",
-        "tomato",
-        "copyright",
-        "coke",
-        "sparkling",
-        "water",
-        "the",
-    }
 
     # test bounded number of extracted features
     vectorizer = Vectorizer(max_df=0.6, max_features=4)
     vectorizer.fit(ALL_FOOD_DOCS)
     assert set(vectorizer.vocabulary_) == expected_vocabulary
-    assert vectorizer.stop_words_ == expected_stop_words
 
 
 def test_count_vectorizer_max_features():
@@ -806,21 +796,16 @@ def test_vectorizer_max_df():
     vect.fit(test_data)
     assert "a" in vect.vocabulary_.keys()
     assert len(vect.vocabulary_.keys()) == 6
-    assert len(vect.stop_words_) == 0
 
     vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
     vect.fit(test_data)
     assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
     assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
-    assert "a" in vect.stop_words_
-    assert len(vect.stop_words_) == 2
 
     vect.max_df = 1
     vect.fit(test_data)
     assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
     assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
-    assert "a" in vect.stop_words_
-    assert len(vect.stop_words_) == 2
 
 
 def test_vectorizer_min_df():
@@ -829,21 +814,16 @@ def test_vectorizer_min_df():
     vect.fit(test_data)
     assert "a" in vect.vocabulary_.keys()
     assert len(vect.vocabulary_.keys()) == 6
-    assert len(vect.stop_words_) == 0
 
     vect.min_df = 2
     vect.fit(test_data)
     assert "c" not in vect.vocabulary_.keys()  # {bcdt} ignored
     assert len(vect.vocabulary_.keys()) == 2  # {ae} remain
-    assert "c" in vect.stop_words_
-    assert len(vect.stop_words_) == 4
 
     vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
     vect.fit(test_data)
     assert "c" not in vect.vocabulary_.keys()  # {bcdet} ignored
     assert len(vect.vocabulary_.keys()) == 1  # {a} remains
-    assert "c" in vect.stop_words_
-    assert len(vect.stop_words_) == 5
 
 
 def test_count_binary_occurrences():
@@ -1156,28 +1136,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():
         )
 
 
-def test_stop_words_removal():
-    # Ensure that deleting the stop_words_ attribute doesn't affect transform
-
-    fitted_vectorizers = (
-        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
-        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
-        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
-    )
-
-    for vect in fitted_vectorizers:
-        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
-        vect.stop_words_ = None
-        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
-        delattr(vect, "stop_words_")
-        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
-        assert_array_equal(stop_None_transform, vect_transform)
-        assert_array_equal(stop_del_transform, vect_transform)
-
-
 def test_pickling_transformer():
     X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
     orig = TfidfTransformer().fit(X)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 29104c2..e9727ae 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1081,15 +1081,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
         True if a fixed vocabulary of term to indices mapping
         is provided by the user.
 
-    stop_words_ : set
-        Terms that were ignored because they either:
-
-          - occurred in too many documents (`max_df`)
-          - occurred in too few documents (`min_df`)
-          - were cut off by feature selection (`max_features`).
-
-        This is only available if no vocabulary was given.
-
     See Also
     --------
     HashingVectorizer : Convert a collection of text documents to a
@@ -1098,12 +1089,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
     TfidfVectorizer : Convert a collection of raw documents to a matrix
         of TF-IDF features.
 
-    Notes
-    -----
-    The ``stop_words_`` attribute can get large and increase the model size
-    when pickling. This attribute is provided only for introspection and can
-    be safely removed using delattr or set to None before pickling.
-
     Examples
     --------
     >>> from sklearn.feature_extraction.text import CountVectorizer
@@ -1242,19 +1227,17 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
             mask = new_mask
 
         new_indices = np.cumsum(mask) - 1  # maps old indices to new
-        removed_terms = set()
         for term, old_index in list(vocabulary.items()):
             if mask[old_index]:
                 vocabulary[term] = new_indices[old_index]
             else:
                 del vocabulary[term]
-                removed_terms.add(term)
         kept_indices = np.where(mask)[0]
         if len(kept_indices) == 0:
             raise ValueError(
                 "After pruning, no terms remain. Try a lower min_df or a higher max_df."
             )
-        return X[:, kept_indices], removed_terms
+        return X[:, kept_indices]
 
     def _count_vocab(self, raw_documents, fixed_vocab):
         """Create sparse feature matrix, and vocabulary where fixed_vocab=False"""
@@ -1399,7 +1382,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
                 raise ValueError("max_df corresponds to < documents than min_df")
             if max_features is not None:
                 X = self._sort_features(X, vocabulary)
-            X, self.stop_words_ = self._limit_features(
+            X = self._limit_features(
                 X, vocabulary, max_doc_count, min_doc_count, max_features
             )
             if max_features is None:
@@ -1932,15 +1915,6 @@ class TfidfVectorizer(CountVectorizer):
         The inverse document frequency (IDF) vector; only defined
         if ``use_idf`` is True.
 
-    stop_words_ : set
-        Terms that were ignored because they either:
-
-          - occurred in too many documents (`max_df`)
-          - occurred in too few documents (`min_df`)
-          - were cut off by feature selection (`max_features`).
-
-        This is only available if no vocabulary was given.
-
     See Also
     --------
     CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
@@ -1948,12 +1922,6 @@ class TfidfVectorizer(CountVectorizer):
     TfidfTransformer : Performs the TF-IDF transformation from a provided
         matrix of counts.
 
-    Notes
-    -----
-    The ``stop_words_`` attribute can get large and increase the model size
-    when pickling. This attribute is provided only for introspection and can
-    be safely removed using delattr or set to None before pickling.
-
     Examples
     --------
     >>> from sklearn.feature_extraction.text import TfidfVectorizer
-- 
2.27.0
backport CVE-2024-5206 (cherry picked from commit 82ebfe6bffa5567446ce6af9ffe83343cef428ff) 2024-06-09 09:31:32 +08:00
			`From 70ca21f106b603b611da73012c9ade7cd8e438b8 Mon Sep 17 00:00:00 2001`
			`From: Olivier Grisel <olivier.grisel@ensta.org>`
			`Date: Mon, 22 Apr 2024 15:10:46 +0200`
			`Subject: [PATCH] FIX remove the computed stop_words_ attribute of text`
			`vectorizer (#28823)`

			`---`
			`doc/whats_new/v1.4.rst \| 18 ++++++++`
			`sklearn/feature_extraction/tests/test_text.py \| 42 -------------------`
			`sklearn/feature_extraction/text.py \| 36 +---------------`
			`3 files changed, 20 insertions(+), 76 deletions(-)`

			`diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst`
			`index ad3cc40..321db3b 100644`
			`--- a/doc/whats_new/v1.4.rst`
			`+++ b/doc/whats_new/v1.4.rst`
			`@@ -14,6 +14,24 @@ For a short description of the main highlights of the release, please refer to`

			`.. include:: changelog_legend.inc`

			`+Security`
			`+--------`
			`+`
			``+- \|Fix\| :class:`feature_extraction.text.CountVectorizer` and``
			``+ :class:`feature_extraction.text.TfidfVectorizer` no longer store discarded``
			``+ tokens from the training set in their `stop_words_` attribute. This attribute``
			``+ would hold too frequent (above `max_df`) but also too rare tokens (below``
			``+ `min_df`). This fixes a potential security issue (data leak) if the discarded``
			`+ rare tokens hold sensitive information from the training set without the`
			`+ model developer's knowledge.`
			`+`
			`+ Note: users of those classes are encouraged to either retrain their pipelines`
			`` + with the new scikit-learn version or to manually clear the `stop_words_` ``
			`+ attribute from previously trained instances of those transformers. This`
			`+ attribute was designed only for model inspection purposes and has no impact`
			`+ on the behavior of the transformers.`
			``+ :pr:`28823` by :user:`Olivier Grisel <ogrisel>`.``
			`+`
			`Changed models`
			`--------------`

			`diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py`
			`index 7c7cac8..b784716 100644`
			`--- a/sklearn/feature_extraction/tests/test_text.py`
			`+++ b/sklearn/feature_extraction/tests/test_text.py`
			`@@ -757,21 +757,11 @@ def test_feature_names():`
			`@pytest.mark.parametrize("Vectorizer", (CountVectorizer, TfidfVectorizer))`
			`def test_vectorizer_max_features(Vectorizer):`
			`expected_vocabulary = {"burger", "beer", "salad", "pizza"}`
			`- expected_stop_words = {`
			`- "celeri",`
			`- "tomato",`
			`- "copyright",`
			`- "coke",`
			`- "sparkling",`
			`- "water",`
			`- "the",`
			`- }`

			`# test bounded number of extracted features`
			`vectorizer = Vectorizer(max_df=0.6, max_features=4)`
			`vectorizer.fit(ALL_FOOD_DOCS)`
			`assert set(vectorizer.vocabulary_) == expected_vocabulary`
			`- assert vectorizer.stop_words_ == expected_stop_words`


			`def test_count_vectorizer_max_features():`
			`@@ -806,21 +796,16 @@ def test_vectorizer_max_df():`
			`vect.fit(test_data)`
			`assert "a" in vect.vocabulary_.keys()`
			`assert len(vect.vocabulary_.keys()) == 6`
			`- assert len(vect.stop_words_) == 0`

			`vect.max_df = 0.5 # 0.5 * 3 documents -> max_doc_count == 1.5`
			`vect.fit(test_data)`
			`assert "a" not in vect.vocabulary_.keys() # {ae} ignored`
			`assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain`
			`- assert "a" in vect.stop_words_`
			`- assert len(vect.stop_words_) == 2`

			`vect.max_df = 1`
			`vect.fit(test_data)`
			`assert "a" not in vect.vocabulary_.keys() # {ae} ignored`
			`assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain`
			`- assert "a" in vect.stop_words_`
			`- assert len(vect.stop_words_) == 2`


			`def test_vectorizer_min_df():`
			`@@ -829,21 +814,16 @@ def test_vectorizer_min_df():`
			`vect.fit(test_data)`
			`assert "a" in vect.vocabulary_.keys()`
			`assert len(vect.vocabulary_.keys()) == 6`
			`- assert len(vect.stop_words_) == 0`

			`vect.min_df = 2`
			`vect.fit(test_data)`
			`assert "c" not in vect.vocabulary_.keys() # {bcdt} ignored`
			`assert len(vect.vocabulary_.keys()) == 2 # {ae} remain`
			`- assert "c" in vect.stop_words_`
			`- assert len(vect.stop_words_) == 4`

			`vect.min_df = 0.8 # 0.8 * 3 documents -> min_doc_count == 2.4`
			`vect.fit(test_data)`
			`assert "c" not in vect.vocabulary_.keys() # {bcdet} ignored`
			`assert len(vect.vocabulary_.keys()) == 1 # {a} remains`
			`- assert "c" in vect.stop_words_`
			`- assert len(vect.stop_words_) == 5`


			`def test_count_binary_occurrences():`
			`@@ -1156,28 +1136,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():`
			`)`


			`-def test_stop_words_removal():`
			`- # Ensure that deleting the stop_words_ attribute doesn't affect transform`
			`-`
			`- fitted_vectorizers = (`
			`- TfidfVectorizer().fit(JUNK_FOOD_DOCS),`
			`- CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),`
			`- CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),`
			`- )`
			`-`
			`- for vect in fitted_vectorizers:`
			`- vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()`
			`-`
			`- vect.stop_words_ = None`
			`- stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()`
			`-`
			`- delattr(vect, "stop_words_")`
			`- stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()`
			`-`
			`- assert_array_equal(stop_None_transform, vect_transform)`
			`- assert_array_equal(stop_del_transform, vect_transform)`
			`-`
			`-`
			`def test_pickling_transformer():`
			`X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)`
			`orig = TfidfTransformer().fit(X)`
			`diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py`
			`index 29104c2..e9727ae 100644`
			`--- a/sklearn/feature_extraction/text.py`
			`+++ b/sklearn/feature_extraction/text.py`
			`@@ -1081,15 +1081,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):`
			`True if a fixed vocabulary of term to indices mapping`
			`is provided by the user.`

			`- stop_words_ : set`
			`- Terms that were ignored because they either:`
			`-`
			``- - occurred in too many documents (`max_df`)``
			``- - occurred in too few documents (`min_df`)``
			``- - were cut off by feature selection (`max_features`).``
			`-`
			`- This is only available if no vocabulary was given.`
			`-`
			`See Also`
			`--------`
			`HashingVectorizer : Convert a collection of text documents to a`
			`@@ -1098,12 +1089,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):`
			`TfidfVectorizer : Convert a collection of raw documents to a matrix`
			`of TF-IDF features.`

			`- Notes`
			`- -----`
			```- The ``stop_words_`` attribute can get large and increase the model size```
			`- when pickling. This attribute is provided only for introspection and can`
			`- be safely removed using delattr or set to None before pickling.`
			`-`
			`Examples`
			`--------`
			`>>> from sklearn.feature_extraction.text import CountVectorizer`
			`@@ -1242,19 +1227,17 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):`
			`mask = new_mask`

			`new_indices = np.cumsum(mask) - 1 # maps old indices to new`
			`- removed_terms = set()`
			`for term, old_index in list(vocabulary.items()):`
			`if mask[old_index]:`
			`vocabulary[term] = new_indices[old_index]`
			`else:`
			`del vocabulary[term]`
			`- removed_terms.add(term)`
			`kept_indices = np.where(mask)[0]`
			`if len(kept_indices) == 0:`
			`raise ValueError(`
			`"After pruning, no terms remain. Try a lower min_df or a higher max_df."`
			`)`
			`- return X[:, kept_indices], removed_terms`
			`+ return X[:, kept_indices]`

			`def _count_vocab(self, raw_documents, fixed_vocab):`
			`"""Create sparse feature matrix, and vocabulary where fixed_vocab=False"""`
			`@@ -1399,7 +1382,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):`
			`raise ValueError("max_df corresponds to < documents than min_df")`
			`if max_features is not None:`
			`X = self._sort_features(X, vocabulary)`
			`- X, self.stop_words_ = self._limit_features(`
			`+ X = self._limit_features(`
			`X, vocabulary, max_doc_count, min_doc_count, max_features`
			`)`
			`if max_features is None:`
			`@@ -1932,15 +1915,6 @@ class TfidfVectorizer(CountVectorizer):`
			`The inverse document frequency (IDF) vector; only defined`
			```if ``use_idf`` is True.```

			`- stop_words_ : set`
			`- Terms that were ignored because they either:`
			`-`
			``- - occurred in too many documents (`max_df`)``
			``- - occurred in too few documents (`min_df`)``
			``- - were cut off by feature selection (`max_features`).``
			`-`
			`- This is only available if no vocabulary was given.`
			`-`
			`See Also`
			`--------`
			`CountVectorizer : Transforms text into a sparse matrix of n-gram counts.`
			`@@ -1948,12 +1922,6 @@ class TfidfVectorizer(CountVectorizer):`
			`TfidfTransformer : Performs the TF-IDF transformation from a provided`
			`matrix of counts.`

			`- Notes`
			`- -----`
			```- The ``stop_words_`` attribute can get large and increase the model size```
			`- when pickling. This attribute is provided only for introspection and can`
			`- be safely removed using delattr or set to None before pickling.`
			`-`
			`Examples`
			`--------`
			`>>> from sklearn.feature_extraction.text import TfidfVectorizer`
			`--`
			`2.27.0`