From 70ca21f106b603b611da73012c9ade7cd8e438b8 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Mon, 22 Apr 2024 15:10:46 +0200
Subject: [PATCH] FIX remove the computed stop_words_ attribute of text
 vectorizer (#28823)

---
 doc/whats_new/v1.4.rst                        | 18 ++++++++
 sklearn/feature_extraction/tests/test_text.py | 42 -------------------
 sklearn/feature_extraction/text.py            | 36 +---------------
 3 files changed, 20 insertions(+), 76 deletions(-)

diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index ad3cc40..321db3b 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -14,6 +14,24 @@ For a short description of the main highlights of the release, please refer to
 
 .. include:: changelog_legend.inc
 
+Security
+--------
+
+- |Fix| :class:`feature_extraction.text.CountVectorizer` and
+  :class:`feature_extraction.text.TfidfVectorizer` no longer store discarded
+  tokens from the training set in their `stop_words_` attribute. This
+  attribute held tokens that were too frequent (above `max_df`) but also
+  tokens that were too rare (below `min_df`). This fixes a potential security
+  issue (data leak) if the discarded rare tokens hold sensitive information
+  from the training set without the model developer's knowledge.
+
+  Note: users of those classes are encouraged to either retrain their
+  pipelines with the new scikit-learn version or to manually clear the
+  `stop_words_` attribute from previously trained instances of those
+  transformers. This attribute was designed only for model inspection
+  purposes and has no impact on the behavior of the transformers.
+  :pr:`28823` by :user:`Olivier Grisel <ogrisel>`.
+
 Changed models
 --------------
 
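Reviewer note: the changelog entry above recommends manually clearing
`stop_words_` from previously trained instances. A minimal sketch of that
cleanup, assuming a fitted vectorizer was pickled to the hypothetical path
"model.pkl" (a vectorizer nested in a Pipeline works the same way, as long as
you reach the fitted step):

    import pickle

    with open("model.pkl", "rb") as f:
        vectorizer = pickle.load(f)

    # stop_words_ is only set when no fixed vocabulary was passed, hence the
    # hasattr() guard. Deleting it cannot change transform() output: only the
    # fitted vocabulary_ mapping is consulted at transform time.
    if hasattr(vectorizer, "stop_words_"):
        del vectorizer.stop_words_

    with open("model.pkl", "wb") as f:
        pickle.dump(vectorizer, f)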
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 7c7cac8..b784716 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -757,21 +757,11 @@ def test_feature_names():
 @pytest.mark.parametrize("Vectorizer", (CountVectorizer, TfidfVectorizer))
 def test_vectorizer_max_features(Vectorizer):
     expected_vocabulary = {"burger", "beer", "salad", "pizza"}
-    expected_stop_words = {
-        "celeri",
-        "tomato",
-        "copyright",
-        "coke",
-        "sparkling",
-        "water",
-        "the",
-    }
 
     # test bounded number of extracted features
     vectorizer = Vectorizer(max_df=0.6, max_features=4)
     vectorizer.fit(ALL_FOOD_DOCS)
     assert set(vectorizer.vocabulary_) == expected_vocabulary
-    assert vectorizer.stop_words_ == expected_stop_words
 
 
 def test_count_vectorizer_max_features():
@@ -806,21 +796,16 @@ def test_vectorizer_max_df():
     vect.fit(test_data)
     assert "a" in vect.vocabulary_.keys()
     assert len(vect.vocabulary_.keys()) == 6
-    assert len(vect.stop_words_) == 0
 
     vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
     vect.fit(test_data)
     assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
     assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
-    assert "a" in vect.stop_words_
-    assert len(vect.stop_words_) == 2
 
     vect.max_df = 1
     vect.fit(test_data)
     assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
     assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
-    assert "a" in vect.stop_words_
-    assert len(vect.stop_words_) == 2
 
 
 def test_vectorizer_min_df():
@@ -829,21 +814,16 @@ def test_vectorizer_min_df():
     vect.fit(test_data)
     assert "a" in vect.vocabulary_.keys()
     assert len(vect.vocabulary_.keys()) == 6
-    assert len(vect.stop_words_) == 0
 
     vect.min_df = 2
     vect.fit(test_data)
     assert "c" not in vect.vocabulary_.keys()  # {bcdt} ignored
     assert len(vect.vocabulary_.keys()) == 2  # {ae} remain
-    assert "c" in vect.stop_words_
-    assert len(vect.stop_words_) == 4
 
     vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
     vect.fit(test_data)
     assert "c" not in vect.vocabulary_.keys()  # {bcdet} ignored
     assert len(vect.vocabulary_.keys()) == 1  # {a} remains
-    assert "c" in vect.stop_words_
-    assert len(vect.stop_words_) == 5
 
 
 def test_count_binary_occurrences():
@@ -1156,28 +1136,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():
     )
 
 
-def test_stop_words_removal():
-    # Ensure that deleting the stop_words_ attribute doesn't affect transform
-
-    fitted_vectorizers = (
-        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
-        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
-        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
-    )
-
-    for vect in fitted_vectorizers:
-        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
-        vect.stop_words_ = None
-        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
-        delattr(vect, "stop_words_")
-        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
-        assert_array_equal(stop_None_transform, vect_transform)
-        assert_array_equal(stop_del_transform, vect_transform)
-
-
 def test_pickling_transformer():
     X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
     orig = TfidfTransformer().fit(X)
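Reviewer note: the deleted test_stop_words_removal above was the regression
test guaranteeing that transform() never reads stop_words_. With the attribute
gone the invariant is vacuous, but it can still be sanity-checked by hand
against older fitted models. A rough sketch using made-up documents (not the
JUNK_FOOD_DOCS fixture from the test suite):

    from numpy.testing import assert_array_equal
    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["the pizza burger beer", "the salad celeri coke"]

    vect = CountVectorizer(min_df=2).fit(docs)
    before = vect.transform(docs).toarray()

    # On pre-fix releases this attribute holds the pruned rare tokens; on
    # fixed releases hasattr() is simply False and nothing is deleted.
    if hasattr(vect, "stop_words_"):
        del vect.stop_words_

    assert_array_equal(vect.transform(docs).toarray(), before)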
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 29104c2..e9727ae 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1081,15 +1081,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
         True if a fixed vocabulary of term to indices mapping
         is provided by the user.
 
-    stop_words_ : set
-        Terms that were ignored because they either:
-
-          - occurred in too many documents (`max_df`)
-          - occurred in too few documents (`min_df`)
-          - were cut off by feature selection (`max_features`).
-
-        This is only available if no vocabulary was given.
-
     See Also
     --------
     HashingVectorizer : Convert a collection of text documents to a
@@ -1098,12 +1089,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
     TfidfVectorizer : Convert a collection of raw documents to a matrix
         of TF-IDF features.
 
-    Notes
-    -----
-    The ``stop_words_`` attribute can get large and increase the model size
-    when pickling. This attribute is provided only for introspection and can
-    be safely removed using delattr or set to None before pickling.
-
     Examples
     --------
     >>> from sklearn.feature_extraction.text import CountVectorizer
@@ -1242,19 +1227,17 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
             mask = new_mask
 
         new_indices = np.cumsum(mask) - 1  # maps old indices to new
-        removed_terms = set()
         for term, old_index in list(vocabulary.items()):
             if mask[old_index]:
                 vocabulary[term] = new_indices[old_index]
             else:
                 del vocabulary[term]
-                removed_terms.add(term)
         kept_indices = np.where(mask)[0]
         if len(kept_indices) == 0:
             raise ValueError(
                 "After pruning, no terms remain. Try a lower min_df or a higher max_df."
             )
-        return X[:, kept_indices], removed_terms
+        return X[:, kept_indices]
 
     def _count_vocab(self, raw_documents, fixed_vocab):
         """Create sparse feature matrix, and vocabulary where fixed_vocab=False"""
@@ -1399,7 +1382,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
                 raise ValueError("max_df corresponds to < documents than min_df")
             if max_features is not None:
                 X = self._sort_features(X, vocabulary)
-            X, self.stop_words_ = self._limit_features(
+            X = self._limit_features(
                 X, vocabulary, max_doc_count, min_doc_count, max_features
             )
             if max_features is None:
@@ -1932,15 +1915,6 @@ class TfidfVectorizer(CountVectorizer):
         The inverse document frequency (IDF) vector; only defined
         if ``use_idf`` is True.
 
-    stop_words_ : set
-        Terms that were ignored because they either:
-
-          - occurred in too many documents (`max_df`)
-          - occurred in too few documents (`min_df`)
-          - were cut off by feature selection (`max_features`).
-
-        This is only available if no vocabulary was given.
-
     See Also
     --------
     CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
@@ -1948,12 +1922,6 @@ class TfidfVectorizer(CountVectorizer):
     TfidfTransformer : Performs the TF-IDF transformation from a provided
         matrix of counts.
 
-    Notes
-    -----
-    The ``stop_words_`` attribute can get large and increase the model size
-    when pickling. This attribute is provided only for introspection and can
-    be safely removed using delattr or set to None before pickling.
-
     Examples
     --------
     >>> from sklearn.feature_extraction.text import TfidfVectorizer
-- 
2.27.0
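Reviewer note: pruning behavior is unchanged by this patch. _limit_features
still trims the vocabulary exactly as before; it just no longer returns the
set of discarded terms. A quick illustration of what a user observes on a
fixed build (the documents are made up, and the exact vocabulary depends on
the default token pattern):

    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = [
        "sparkling water and pizza",
        "pizza and beer",
        "water the beer the pizza",
    ]

    # max_df=0.67 drops "pizza" (present in all 3 documents); min_df=2 drops
    # "sparkling" and "the" (each present in only one document).
    vect = TfidfVectorizer(max_df=0.67, min_df=2).fit(docs)
    print(sorted(vect.vocabulary_))      # ['and', 'beer', 'water']
    print(hasattr(vect, "stop_words_"))  # False once this fix is applied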