backport CVE-2024-5206
(cherry picked from commit 82ebfe6bffa5567446ce6af9ffe83343cef428ff)
This commit is contained in:
parent
06f3a37618
commit
8bfdeda175
235
backport-CVE-2024-5206.patch
Normal file
235
backport-CVE-2024-5206.patch
Normal file
@ -0,0 +1,235 @@
|
||||
From 70ca21f106b603b611da73012c9ade7cd8e438b8 Mon Sep 17 00:00:00 2001
|
||||
From: Olivier Grisel <olivier.grisel@ensta.org>
|
||||
Date: Mon, 22 Apr 2024 15:10:46 +0200
|
||||
Subject: [PATCH] FIX remove the computed stop_words_ attribute of text
|
||||
vectorizer (#28823)
|
||||
|
||||
---
|
||||
doc/whats_new/v1.4.rst | 18 ++++++++
|
||||
sklearn/feature_extraction/tests/test_text.py | 42 -------------------
|
||||
sklearn/feature_extraction/text.py | 36 +---------------
|
||||
3 files changed, 20 insertions(+), 76 deletions(-)
|
||||
|
||||
diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
|
||||
index ad3cc40..321db3b 100644
|
||||
--- a/doc/whats_new/v1.4.rst
|
||||
+++ b/doc/whats_new/v1.4.rst
|
||||
@@ -14,6 +14,24 @@ For a short description of the main highlights of the release, please refer to
|
||||
|
||||
.. include:: changelog_legend.inc
|
||||
|
||||
+Security
|
||||
+--------
|
||||
+
|
||||
+- |Fix| :class:`feature_extraction.text.CountVectorizer` and
|
||||
+ :class:`feature_extraction.text.TfidfVectorizer` no longer store discarded
|
||||
+ tokens from the training set in their `stop_words_` attribute. This attribute
|
||||
+ would hold both too frequent tokens (above `max_df`) and too rare tokens (below
|
||||
+ `min_df`). This fixes a potential security issue (data leak) if the discarded
|
||||
+ rare tokens hold sensitive information from the training set without the
|
||||
+ model developer's knowledge.
|
||||
+
|
||||
+ Note: users of those classes are encouraged to either retrain their pipelines
|
||||
+ with the new scikit-learn version or to manually clear the `stop_words_`
|
||||
+ attribute from previously trained instances of those transformers. This
|
||||
+ attribute was designed only for model inspection purposes and has no impact
|
||||
+ on the behavior of the transformers.
|
||||
+ :pr:`28823` by :user:`Olivier Grisel <ogrisel>`.
|
||||
+
|
||||
Changed models
|
||||
--------------
|
||||
|
||||
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
|
||||
index 7c7cac8..b784716 100644
|
||||
--- a/sklearn/feature_extraction/tests/test_text.py
|
||||
+++ b/sklearn/feature_extraction/tests/test_text.py
|
||||
@@ -757,21 +757,11 @@ def test_feature_names():
|
||||
@pytest.mark.parametrize("Vectorizer", (CountVectorizer, TfidfVectorizer))
|
||||
def test_vectorizer_max_features(Vectorizer):
|
||||
expected_vocabulary = {"burger", "beer", "salad", "pizza"}
|
||||
- expected_stop_words = {
|
||||
- "celeri",
|
||||
- "tomato",
|
||||
- "copyright",
|
||||
- "coke",
|
||||
- "sparkling",
|
||||
- "water",
|
||||
- "the",
|
||||
- }
|
||||
|
||||
# test bounded number of extracted features
|
||||
vectorizer = Vectorizer(max_df=0.6, max_features=4)
|
||||
vectorizer.fit(ALL_FOOD_DOCS)
|
||||
assert set(vectorizer.vocabulary_) == expected_vocabulary
|
||||
- assert vectorizer.stop_words_ == expected_stop_words
|
||||
|
||||
|
||||
def test_count_vectorizer_max_features():
|
||||
@@ -806,21 +796,16 @@ def test_vectorizer_max_df():
|
||||
vect.fit(test_data)
|
||||
assert "a" in vect.vocabulary_.keys()
|
||||
assert len(vect.vocabulary_.keys()) == 6
|
||||
- assert len(vect.stop_words_) == 0
|
||||
|
||||
vect.max_df = 0.5 # 0.5 * 3 documents -> max_doc_count == 1.5
|
||||
vect.fit(test_data)
|
||||
assert "a" not in vect.vocabulary_.keys() # {ae} ignored
|
||||
assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain
|
||||
- assert "a" in vect.stop_words_
|
||||
- assert len(vect.stop_words_) == 2
|
||||
|
||||
vect.max_df = 1
|
||||
vect.fit(test_data)
|
||||
assert "a" not in vect.vocabulary_.keys() # {ae} ignored
|
||||
assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain
|
||||
- assert "a" in vect.stop_words_
|
||||
- assert len(vect.stop_words_) == 2
|
||||
|
||||
|
||||
def test_vectorizer_min_df():
|
||||
@@ -829,21 +814,16 @@ def test_vectorizer_min_df():
|
||||
vect.fit(test_data)
|
||||
assert "a" in vect.vocabulary_.keys()
|
||||
assert len(vect.vocabulary_.keys()) == 6
|
||||
- assert len(vect.stop_words_) == 0
|
||||
|
||||
vect.min_df = 2
|
||||
vect.fit(test_data)
|
||||
assert "c" not in vect.vocabulary_.keys() # {bcdt} ignored
|
||||
assert len(vect.vocabulary_.keys()) == 2 # {ae} remain
|
||||
- assert "c" in vect.stop_words_
|
||||
- assert len(vect.stop_words_) == 4
|
||||
|
||||
vect.min_df = 0.8 # 0.8 * 3 documents -> min_doc_count == 2.4
|
||||
vect.fit(test_data)
|
||||
assert "c" not in vect.vocabulary_.keys() # {bcdet} ignored
|
||||
assert len(vect.vocabulary_.keys()) == 1 # {a} remains
|
||||
- assert "c" in vect.stop_words_
|
||||
- assert len(vect.stop_words_) == 5
|
||||
|
||||
|
||||
def test_count_binary_occurrences():
|
||||
@@ -1156,28 +1136,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():
|
||||
)
|
||||
|
||||
|
||||
-def test_stop_words_removal():
|
||||
- # Ensure that deleting the stop_words_ attribute doesn't affect transform
|
||||
-
|
||||
- fitted_vectorizers = (
|
||||
- TfidfVectorizer().fit(JUNK_FOOD_DOCS),
|
||||
- CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
|
||||
- CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
|
||||
- )
|
||||
-
|
||||
- for vect in fitted_vectorizers:
|
||||
- vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
|
||||
-
|
||||
- vect.stop_words_ = None
|
||||
- stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
|
||||
-
|
||||
- delattr(vect, "stop_words_")
|
||||
- stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
|
||||
-
|
||||
- assert_array_equal(stop_None_transform, vect_transform)
|
||||
- assert_array_equal(stop_del_transform, vect_transform)
|
||||
-
|
||||
-
|
||||
def test_pickling_transformer():
|
||||
X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
|
||||
orig = TfidfTransformer().fit(X)
|
||||
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
|
||||
index 29104c2..e9727ae 100644
|
||||
--- a/sklearn/feature_extraction/text.py
|
||||
+++ b/sklearn/feature_extraction/text.py
|
||||
@@ -1081,15 +1081,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
|
||||
True if a fixed vocabulary of term to indices mapping
|
||||
is provided by the user.
|
||||
|
||||
- stop_words_ : set
|
||||
- Terms that were ignored because they either:
|
||||
-
|
||||
- - occurred in too many documents (`max_df`)
|
||||
- - occurred in too few documents (`min_df`)
|
||||
- - were cut off by feature selection (`max_features`).
|
||||
-
|
||||
- This is only available if no vocabulary was given.
|
||||
-
|
||||
See Also
|
||||
--------
|
||||
HashingVectorizer : Convert a collection of text documents to a
|
||||
@@ -1098,12 +1089,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
|
||||
TfidfVectorizer : Convert a collection of raw documents to a matrix
|
||||
of TF-IDF features.
|
||||
|
||||
- Notes
|
||||
- -----
|
||||
- The ``stop_words_`` attribute can get large and increase the model size
|
||||
- when pickling. This attribute is provided only for introspection and can
|
||||
- be safely removed using delattr or set to None before pickling.
|
||||
-
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.feature_extraction.text import CountVectorizer
|
||||
@@ -1242,19 +1227,17 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
|
||||
mask = new_mask
|
||||
|
||||
new_indices = np.cumsum(mask) - 1 # maps old indices to new
|
||||
- removed_terms = set()
|
||||
for term, old_index in list(vocabulary.items()):
|
||||
if mask[old_index]:
|
||||
vocabulary[term] = new_indices[old_index]
|
||||
else:
|
||||
del vocabulary[term]
|
||||
- removed_terms.add(term)
|
||||
kept_indices = np.where(mask)[0]
|
||||
if len(kept_indices) == 0:
|
||||
raise ValueError(
|
||||
"After pruning, no terms remain. Try a lower min_df or a higher max_df."
|
||||
)
|
||||
- return X[:, kept_indices], removed_terms
|
||||
+ return X[:, kept_indices]
|
||||
|
||||
def _count_vocab(self, raw_documents, fixed_vocab):
|
||||
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False"""
|
||||
@@ -1399,7 +1382,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
|
||||
raise ValueError("max_df corresponds to < documents than min_df")
|
||||
if max_features is not None:
|
||||
X = self._sort_features(X, vocabulary)
|
||||
- X, self.stop_words_ = self._limit_features(
|
||||
+ X = self._limit_features(
|
||||
X, vocabulary, max_doc_count, min_doc_count, max_features
|
||||
)
|
||||
if max_features is None:
|
||||
@@ -1932,15 +1915,6 @@ class TfidfVectorizer(CountVectorizer):
|
||||
The inverse document frequency (IDF) vector; only defined
|
||||
if ``use_idf`` is True.
|
||||
|
||||
- stop_words_ : set
|
||||
- Terms that were ignored because they either:
|
||||
-
|
||||
- - occurred in too many documents (`max_df`)
|
||||
- - occurred in too few documents (`min_df`)
|
||||
- - were cut off by feature selection (`max_features`).
|
||||
-
|
||||
- This is only available if no vocabulary was given.
|
||||
-
|
||||
See Also
|
||||
--------
|
||||
CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
|
||||
@@ -1948,12 +1922,6 @@ class TfidfVectorizer(CountVectorizer):
|
||||
TfidfTransformer : Performs the TF-IDF transformation from a provided
|
||||
matrix of counts.
|
||||
|
||||
- Notes
|
||||
- -----
|
||||
- The ``stop_words_`` attribute can get large and increase the model size
|
||||
- when pickling. This attribute is provided only for introspection and can
|
||||
- be safely removed using delattr or set to None before pickling.
|
||||
-
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -3,10 +3,11 @@
|
||||
Name: python-scikit-learn
|
||||
Summary: A Python module for machine learning built on top of SciPy
|
||||
Version: 1.4.0
|
||||
Release: 1
|
||||
Release: 2
|
||||
License: BSD
|
||||
URL: https://scikit-learn.org/stable/
|
||||
Source0: https://files.pythonhosted.org/packages/source/s/scikit-learn/scikit-learn-%{version}.tar.gz
|
||||
Patch3000: backport-CVE-2024-5206.patch
|
||||
|
||||
%global _description\
|
||||
scikit-learn is a Python module for machine learning built on top of SciPy\
|
||||
@ -43,6 +44,12 @@ CFLAGS="$RPM_OPT_FLAGS -s"
|
||||
%{python3_sitearch}/scikit_learn-%{version}.dist-info/
|
||||
|
||||
%changelog
|
||||
* Fri Jun 07 2024 xuchenchen <xuchenchen@kylinos.cn> - 1.4.0-2
|
||||
- Type:CVES
|
||||
- ID:CVE-2024-5206
|
||||
- SUG:NA
|
||||
- DESC:fix CVE-2024-5206 by removing the computed stop_words_ attribute of text vectorizers
|
||||
|
||||
* Fri Mar 08 2024 jiangxinyu <jiangxinyu@kylinos.cn> - 1.4.0-1
|
||||
- Update package to version 1.4.0
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user