diff --git a/README.md b/README.md
index 92dd339..884f9eb 100644
--- a/README.md
+++ b/README.md
@@ -53,17 +53,11 @@ To go beyond the basics, [comprehensive documentation is available](http://www.c
 
 # Note on Python 2 sunsetting
 
-Since 2012, Beautiful Soup has been developed as a Python 2 library
-which is automatically converted to Python 3 code as necessary. This
-makes it impossible to take advantage of some features of Python
-3.
-
-For this reason, I plan to discontinue Beautiful Soup's Python 2
-support at some point after December 31, 2020: one year after the
-sunset date for Python 2 itself. Beyond that point, new Beautiful Soup
-development will exclusively target Python 3. Of course, older
-releases of Beautiful Soup, which support both versions, will continue
-to be available.
+Beautiful Soup's support for Python 2 was discontinued on December 31,
+2020: one year after the sunset date for Python 2 itself. From this
+point onward, new Beautiful Soup development will exclusively target
+Python 3. The final release of Beautiful Soup 4 to support Python 2
+was 4.9.3.
 
 # Supporting the project
 
@@ -93,10 +87,5 @@ $ nosetests
 ```
 
 ```
-$ python -m unittest discover -s bs4
+$ python3 -m unittest discover -s bs4
 ```
-
-If you checked out the source tree, you should see a script in the
-home directory called test-all-versions. This script will run the unit
-tests under Python 2, then create a temporary Python 3 conversion of
-the source and run the unit tests again under Python 3.
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 8f78809..51ccc21 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
 provides methods and Pythonic idioms that make it easy to navigate,
 search, and modify the parse tree.
 
-Beautiful Soup works with Python 2.7 and up. It works better if lxml
+Beautiful Soup works with Python 3.5 and up. It works better if lxml
 and/or html5lib is installed.
 
 For more than you ever wanted to know about Beautiful Soup, see the
@@ -29,6 +29,11 @@ import sys
 import traceback
 import warnings
 
+# The very first thing we do is give a useful error if someone is
+# running this code under Python 2.
+if sys.version_info.major < 3:
+    raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
+
 from .builder import builder_registry, ParserRejectedMarkup
 from .dammit import UnicodeDammit
 from .element import (
@@ -49,10 +54,6 @@ from .element import (
     TemplateString,
     )
 
-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
-
 # Define some custom warnings.
 class GuessedAtParserWarning(UserWarning):
     """The warning issued when BeautifulSoup has to guess what parser to
@@ -100,7 +101,7 @@ class BeautifulSoup(Tag):
     # Since BeautifulSoup subclasses Tag, it's possible to treat it as
     # a Tag with a .name. This name makes it clear the BeautifulSoup
     # object isn't a real markup tag.
-    ROOT_TAG_NAME = u'[document]'
+    ROOT_TAG_NAME = '[document]'
 
     # If the end-user gives no indication which tree builder they
     # want, look for one with these features.
@@ -217,7 +218,7 @@ class BeautifulSoup(Tag):
         from_encoding = from_encoding or deprecated_argument(
             "fromEncoding", "from_encoding")
 
-        if from_encoding and isinstance(markup, unicode):
+        if from_encoding and isinstance(markup, str):
             warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
             from_encoding = None
 
@@ -234,7 +235,7 @@ class BeautifulSoup(Tag):
             builder_class = builder
             builder = None
         elif builder is None:
-            if isinstance(features, basestring):
+            if isinstance(features, str):
                 features = [features]
             if features is None or len(features) == 0:
                 features = self.DEFAULT_BUILDER_FEATURES
@@ -309,13 +310,13 @@ class BeautifulSoup(Tag):
             markup = markup.read()
         elif len(markup) <= 256 and (
                 (isinstance(markup, bytes) and not b'<' in markup)
-                or (isinstance(markup, unicode) and not u'<' in markup)
+                or (isinstance(markup, str) and not '<' in markup)
         ):
             # Print out warnings for a couple beginner problems
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,
             # just in case that's what the user really wants.
-            if (isinstance(markup, unicode)
+            if (isinstance(markup, str)
                 and not os.path.supports_unicode_filenames):
                 possible_filename = markup.encode("utf8")
             else:
@@ -323,7 +324,7 @@ class BeautifulSoup(Tag):
             is_file = False
             try:
                 is_file = os.path.exists(possible_filename)
-            except Exception, e:
+            except Exception as e:
                 # This is almost certainly a problem involving
                 # characters not valid in filenames on this
                 # system. Just let it go.
@@ -353,9 +354,9 @@ class BeautifulSoup(Tag):
                 pass
 
         if not success:
-            other_exceptions = [unicode(e) for e in rejections]
+            other_exceptions = [str(e) for e in rejections]
             raise ParserRejectedMarkup(
-                u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
+                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
             )
 
         # Clear out the markup and remove the builder's circular
@@ -406,9 +407,9 @@ class BeautifulSoup(Tag):
         if isinstance(markup, bytes):
             space = b' '
             cant_start_with = (b"http:", b"https:")
-        elif isinstance(markup, unicode):
-            space = u' '
-            cant_start_with = (u"http:", u"https:")
+        elif isinstance(markup, str):
+            space = ' '
+            cant_start_with = ("http:", "https:")
         else:
             return
 
@@ -545,7 +546,7 @@ class BeautifulSoup(Tag):
         containerClass = self.string_container(containerClass)
 
         if self.current_data:
-            current_data = u''.join(self.current_data)
+            current_data = ''.join(self.current_data)
             # If whitespace is not preserved, and this string contains
             # nothing but ASCII spaces, replace it with a single space
             # or newline.
@@ -748,9 +749,9 @@ class BeautifulSoup(Tag):
             eventual_encoding = None
         if eventual_encoding != None:
             encoding_part = ' encoding="%s"' % eventual_encoding
-            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
+            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
         else:
-            prefix = u''
+            prefix = ''
         if not pretty_print:
             indent_level = None
         else:
@@ -788,4 +789,4 @@ class FeatureNotFound(ValueError):
 if __name__ == '__main__':
     import sys
     soup = BeautifulSoup(sys.stdin)
-    print(soup.prettify())
+    print((soup.prettify()))
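The two hunks above swap the direction of the version guard. The old Python 2 code used the `<>` comparison operator, a deliberate syntax error under Python 3, so importing `bs4` on the wrong interpreter failed with a cryptic message; the new code raises an explicit `ImportError` instead. A minimal sketch of the same pattern for any Python 3-only package (the package name and message here are illustrative, not from bs4):

```python
# Import-time guard for a Python 3-only package. This must run before
# anything containing Python 3-only syntax is imported, or Python 2
# dies with a SyntaxError before reaching the friendly message.
import sys

if sys.version_info.major < 3:
    raise ImportError(
        'mypackage requires Python 3. '
        'The last release to support Python 2 was mypackage 1.x.'
    )
```

Note that the check moved above the `from .builder import ...` lines: those submodules now contain Python 3-only syntax, and importing them under Python 2 would fail before the guard ever ran.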
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 03da4c6..03fbd6a 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -300,13 +300,13 @@ class TreeBuilder(object):
             universal = self.cdata_list_attributes.get('*', [])
             tag_specific = self.cdata_list_attributes.get(
                 tag_name.lower(), None)
-            for attr in attrs.keys():
+            for attr in list(attrs.keys()):
                 if attr in universal or (tag_specific and attr in tag_specific):
                     # We have a "class"-type attribute whose string
                     # value is a whitespace-separated list of
                     # values. Split it into a list.
                     value = attrs[attr]
-                    if isinstance(value, basestring):
+                    if isinstance(value, str):
                         values = nonwhitespace_re.findall(value)
                     else:
                         # html5lib sometimes calls setAttributes twice
@@ -496,7 +496,7 @@ class ParserRejectedMarkup(Exception):
         """
         if isinstance(message_or_exception, Exception):
             e = message_or_exception
-            message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e))
+            message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
         super(ParserRejectedMarkup, self).__init__(message_or_exception)
 
 # Builders are registered in reverse order of priority, so that custom
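2to3 wraps `attrs.keys()` in `list(...)` because Python 3's `keys()` returns a live view rather than a list. Iterating a view is fine, but inserting or deleting keys mid-loop raises `RuntimeError`; snapshotting with `list()` keeps the loop safe. A small standalone demonstration (not bs4 code); note that 2to3 applies the wrapper mechanically, even in loops like the one above that only replace values and would have been safe anyway:

```python
attrs = {'class': 'a b', 'id': 'x'}

try:
    for key in attrs.keys():    # live view over the dict
        if key == 'id':
            del attrs[key]      # mutates the dict during iteration
except RuntimeError as e:
    print(e)                    # dictionary changed size during iteration

attrs = {'class': 'a b', 'id': 'x'}
for key in list(attrs.keys()):  # snapshot: safe to mutate inside the loop
    if key == 'id':
        del attrs[key]
```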
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index a1c6134..69aefd7 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -33,7 +33,7 @@ try:
     # Pre-0.99999999
     from html5lib.treebuilders import _base as treebuilder_base
     new_html5lib = False
-except ImportError, e:
+except ImportError as e:
     # 0.99999999 and up
     from html5lib.treebuilders import base as treebuilder_base
     new_html5lib = True
@@ -79,7 +79,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
         self.underlying_builder.parser = parser
         extra_kwargs = dict()
-        if not isinstance(markup, unicode):
+        if not isinstance(markup, str):
             if new_html5lib:
                 extra_kwargs['override_encoding'] = self.user_specified_encoding
             else:
@@ -87,13 +87,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         doc = parser.parse(markup, **extra_kwargs)
 
         # Set the character encoding detected by the tokenizer.
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # We need to special-case this because html5lib sets
             # charEncoding to UTF-8 if it gets Unicode input.
             doc.original_encoding = None
         else:
             original_encoding = parser.tokenizer.stream.charEncoding[0]
-            if not isinstance(original_encoding, basestring):
+            if not isinstance(original_encoding, str):
                 # In 0.99999999 and up, the encoding is an html5lib
                 # Encoding object. We want to use a string for compatibility
                 # with other tree builders.
@@ -110,7 +110,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
 
     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<html><head></head><body>%s</body></html>' % fragment
+        return '<html><head></head><body>%s</body></html>' % fragment
 
 
 class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
@@ -217,7 +217,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
             rv.append("|%s<%s>" % (' ' * indent, name))
             if element.attrs:
                 attributes = []
-                for name, value in element.attrs.items():
+                for name, value in list(element.attrs.items()):
                     if isinstance(name, NamespacedAttribute):
                         name = "%s %s" % (prefixes[name.namespace], name.name)
                     if isinstance(value, list):
@@ -272,7 +272,7 @@ class Element(treebuilder_base.Node):
 
     def appendChild(self, node):
         string_child = child = None
-        if isinstance(node, basestring):
+        if isinstance(node, str):
             # Some other piece of code decided to pass in a string
             # instead of creating a TextElement object to contain the
             # string.
@@ -289,7 +289,7 @@ class Element(treebuilder_base.Node):
             child = node.element
             node.parent = self
 
-        if not isinstance(child, basestring) and child.parent is not None:
+        if not isinstance(child, str) and child.parent is not None:
             node.element.extract()
 
         if (string_child is not None and self.element.contents
@@ -302,7 +302,7 @@ class Element(treebuilder_base.Node):
             old_element.replace_with(new_element)
             self.soup._most_recent_element = new_element
         else:
-            if isinstance(node, basestring):
+            if isinstance(node, str):
                 # Create a brand new NavigableString from this string.
                 child = self.soup.new_string(node)
 
@@ -340,7 +340,7 @@ class Element(treebuilder_base.Node):
 
             self.soup.builder._replace_cdata_list_attribute_values(
                 self.name, attributes)
-            for name, value in attributes.items():
+            for name, value in list(attributes.items()):
                 self.element[name] = value
 
             # The attributes may contain variables that need substitution.
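The repeated `unicode` → `str` and `basestring` → `str` substitutions in this file reflect Python 3's simpler string model: the 2.x trio of `str`/`unicode`/`basestring` collapses into `str` for text and `bytes` for raw data, and the two no longer mix or compare equal. A standalone illustration of why the builder branches on this distinction:

```python
markup_text = '<p>caf\xe9</p>'             # str: decoded text (2.x "unicode")
markup_bytes = markup_text.encode('utf8')  # bytes: needs an encoding to decode

print(isinstance(markup_text, str))    # True
print(isinstance(markup_bytes, str))   # False -- take the encoding-detection path
print(markup_text == markup_bytes)     # False: text and bytes never compare equal
```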
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 96a7b7d..88860a9 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -8,11 +8,11 @@ __all__ = [
     'HTMLParserTreeBuilder',
     ]
 
-from HTMLParser import HTMLParser
+from html.parser import HTMLParser
 
 try:
-    from HTMLParser import HTMLParseError
-except ImportError, e:
+    from html.parser import HTMLParseError
+except ImportError as e:
     # HTMLParseError is removed in Python 3.5. Since it can never be
     # thrown in 3.5, we can just define our own class as a placeholder.
     class HTMLParseError(Exception):
@@ -219,14 +219,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
                 continue
                 try:
                     data = bytearray([real_name]).decode(encoding)
-                except UnicodeDecodeError, e:
+                except UnicodeDecodeError as e:
                     pass
         if not data:
             try:
-                data = unichr(real_name)
-            except (ValueError, OverflowError), e:
+                data = chr(real_name)
+            except (ValueError, OverflowError) as e:
                 pass
-        data = data or u"\N{REPLACEMENT CHARACTER}"
+        data = data or "\N{REPLACEMENT CHARACTER}"
         self.handle_data(data)
 
     def handle_entityref(self, name):
@@ -353,7 +353,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
         """
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # Parse Unicode as-is.
             yield (markup, None, None, False)
             return
@@ -376,7 +376,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         try:
             parser.feed(markup)
             parser.close()
-        except HTMLParseError, e:
+        except HTMLParseError as e:
             warnings.warn(RuntimeWarning(
                 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
             raise e
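Three separate Python 3 renames land in this one file: the `HTMLParser` module became `html.parser`, `unichr()` was folded into `chr()`, and `except X, e` became `except X as e` (the comma form is a syntax error on Python 3). The first two in isolation, using only the standard library:

```python
from html.parser import HTMLParser  # Python 2 spelling: from HTMLParser import HTMLParser

# chr() now accepts the full Unicode range; there is no separate unichr().
print(chr(0x2603))   # '☃'
print(chr(233))      # 'é'
```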
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 1b44d75..432a2c8 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -8,11 +8,11 @@ __all__ = [
 
 try:
     from collections.abc import Callable # Python 3.6
-except ImportError , e:
+except ImportError as e:
     from collections import Callable
 
 from io import BytesIO
-from StringIO import StringIO
+from io import StringIO
 from lxml import etree
 from bs4.element import (
     Comment,
@@ -35,7 +35,7 @@ LXML = 'lxml'
 
 def _invert(d):
     "Invert a dictionary."
-    return dict((v,k) for k, v in d.items())
+    return dict((v,k) for k, v in list(d.items()))
 
 class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
@@ -81,7 +81,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
         :param mapping: A dictionary mapping namespace prefixes to URIs.
         """
-        for key, value in mapping.items():
+        for key, value in list(mapping.items()):
             if key and key not in self.soup._namespaces:
                 # Let the BeautifulSoup object know about a new namespace.
                 # If there are multiple namespaces defined with the same
@@ -169,12 +169,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         else:
             self.processing_instruction_class = XMLProcessingInstruction
 
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
             yield markup, None, document_declared_encoding, False
 
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # No, apparently not. Convert the Unicode to UTF-8 and
             # tell lxml to parse it as UTF-8.
             yield (markup.encode("utf8"), "utf8",
@@ -189,7 +189,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def feed(self, markup):
         if isinstance(markup, bytes):
             markup = BytesIO(markup)
-        elif isinstance(markup, unicode):
+        elif isinstance(markup, str):
             markup = StringIO(markup)
 
         # Call feed() at least once, even if the markup is empty,
@@ -204,7 +204,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             if len(data) != 0:
                 self.parser.feed(data)
             self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
             raise ParserRejectedMarkup(e)
 
     def close(self):
@@ -233,7 +233,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             # Also treat the namespace mapping as a set of attributes on the
             # tag, so we can recreate it later.
             attrs = attrs.copy()
-            for prefix, namespace in nsmap.items():
+            for prefix, namespace in list(nsmap.items()):
                 attribute = NamespacedAttribute(
                     "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                 attrs[attribute] = namespace
@@ -242,7 +242,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         # from lxml with namespaces attached to their names, and
         # turn then into NamespacedAttribute objects.
         new_attrs = {}
-        for attr, value in attrs.items():
+        for attr, value in list(attrs.items()):
             namespace, attr = self._getNsTag(attr)
             if namespace is None:
                 new_attrs[attr] = value
@@ -302,7 +302,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
 
 
 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@@ -323,10 +323,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
             self.parser = self.parser_for(encoding)
             self.parser.feed(markup)
             self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
             raise ParserRejectedMarkup(e)
 
 
     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<html><body>%s</body></html>' % fragment
+        return '<html><body>%s</body></html>' % fragment
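`StringIO.StringIO` is gone in Python 3; both in-memory file types live in `io`, split along the same text/bytes line as everything else. That is why `feed()` above wraps byte markup in a `BytesIO` and text markup in a `StringIO`:

```python
from io import BytesIO, StringIO

print(StringIO('<root/>').read())   # text in, text out
print(BytesIO(b'<root/>').read())   # bytes in, bytes out
```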
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 33f7b7d..ee3708f 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -10,7 +10,7 @@ XML or HTML to reflect a new encoding; that's the tree builder's job.
 __license__ = "MIT"
 
 import codecs
-from htmlentitydefs import codepoint2name
+from html.entities import codepoint2name
 import re
 import logging
 import string
@@ -22,7 +22,7 @@ try:
     # PyPI package: cchardet
     import cchardet
     def chardet_dammit(s):
-        if isinstance(s, unicode):
+        if isinstance(s, str):
             return None
         return cchardet.detect(s)['encoding']
 except ImportError:
@@ -32,7 +32,7 @@ except ImportError:
         # PyPI package: chardet
         import chardet
         def chardet_dammit(s):
-            if isinstance(s, unicode):
+            if isinstance(s, str):
                 return None
             return chardet.detect(s)['encoding']
         #import chardet.constants
@@ -53,14 +53,14 @@ except ImportError:
 
 # Build bytestring and Unicode versions of regular expressions for finding
 # a declared encoding inside an XML or HTML document.
-xml_encoding = u'^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
-html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
+xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
+html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
 encoding_res = dict()
 encoding_res[bytes] = {
     'html' : re.compile(html_meta.encode("ascii"), re.I),
     'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
 }
-encoding_res[unicode] = {
+encoding_res[str] = {
     'html' : re.compile(html_meta, re.I),
     'xml' : re.compile(xml_encoding, re.I)
 }
@@ -80,7 +80,7 @@ class EntitySubstitution(object):
         # entities, but that's a little tricky.
         extra = [(39, 'apos')]
         for codepoint, name in list(codepoint2name.items()) + extra:
-            character = unichr(codepoint)
+            character = chr(codepoint)
             if codepoint not in (34, 39):
                 # There's no point in turning the quotation mark into
                 # &quot; or the single quote into &apos;, unless it
@@ -323,7 +323,7 @@ class EncodingDetector:
         :return: A 2-tuple (modified data, implied encoding)
         """
         encoding = None
-        if isinstance(data, unicode):
+        if isinstance(data, str):
             # Unicode data cannot have a byte-order mark.
             return data, encoding
         if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
@@ -370,7 +370,7 @@ class EncodingDetector:
         if isinstance(markup, bytes):
             res = encoding_res[bytes]
         else:
-            res = encoding_res[unicode]
+            res = encoding_res[str]
 
         xml_re = res['xml']
         html_re = res['html']
@@ -431,9 +431,9 @@ class UnicodeDammit:
             markup, override_encodings, is_html, exclude_encodings)
 
         # Short-circuit if the data is in Unicode to begin with.
-        if isinstance(markup, unicode) or markup == '':
+        if isinstance(markup, str) or markup == '':
             self.markup = markup
-            self.unicode_markup = unicode(markup)
+            self.unicode_markup = str(markup)
             self.original_encoding = None
             return
 
@@ -523,7 +523,7 @@ class UnicodeDammit:
 
         :param encoding: The name of an encoding.
         """
-        return unicode(data, encoding, errors)
+        return str(data, encoding, errors)
 
     @property
     def declared_html_encoding(self):
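`htmlentitydefs` was renamed `html.entities` in Python 3; the table's contents are unchanged, so only the import needed touching. Together with `chr()` (replacing `unichr()`), this is everything `EntitySubstitution` needs to rebuild its entity lookup maps:

```python
from html.entities import codepoint2name

print(codepoint2name[0xe9])   # 'eacute'
print(chr(0xe9))              # 'é' -- chr() replaces Python 2's unichr()
```

Note also that `str(data, encoding, errors)` is the direct Python 3 spelling of Python 2's `unicode(data, encoding, errors)` decoding constructor; `bytes.decode()` would be the more idiomatic modern form, but the mechanical translation preserves behavior exactly.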
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index e4f2f47..500e92d 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -4,8 +4,8 @@
 __license__ = "MIT"
 
 import cProfile
-from StringIO import StringIO
-from HTMLParser import HTMLParser
+from io import StringIO
+from html.parser import HTMLParser
 import bs4
 from bs4 import BeautifulSoup, __version__
 from bs4.builder import builder_registry
@@ -25,8 +25,8 @@ def diagnose(data):
     :param data: A string containing markup that needs to be explained.
     :return: None; diagnostics are printed to standard output.
     """
-    print("Diagnostic running on Beautiful Soup %s" % __version__)
-    print("Python version %s" % sys.version)
+    print(("Diagnostic running on Beautiful Soup %s" % __version__))
+    print(("Python version %s" % sys.version))
 
     basic_parsers = ["html.parser", "html5lib", "lxml"]
     for name in basic_parsers:
@@ -35,16 +35,16 @@ def diagnose(data):
             break
         else:
             basic_parsers.remove(name)
-            print(
+            print((
                 "I noticed that %s is not installed. Installing it may help." %
-                name)
+                name))
 
     if 'lxml' in basic_parsers:
         basic_parsers.append("lxml-xml")
         try:
             from lxml import etree
-            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
-        except ImportError, e:
+            print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
+        except ImportError as e:
             print(
                 "lxml is not installed or couldn't be imported.")
 
@@ -52,21 +52,21 @@ def diagnose(data):
     if 'html5lib' in basic_parsers:
         try:
             import html5lib
-            print("Found html5lib version %s" % html5lib.__version__)
-        except ImportError, e:
+            print(("Found html5lib version %s" % html5lib.__version__))
+        except ImportError as e:
             print(
                 "html5lib is not installed or couldn't be imported.")
 
     if hasattr(data, 'read'):
         data = data.read()
     elif data.startswith("http:") or data.startswith("https:"):
-        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+        print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
         print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
         return
     else:
         try:
             if os.path.exists(data):
-                print('"%s" looks like a filename. Reading data from the file.' % data)
+                print(('"%s" looks like a filename. Reading data from the file.' % data))
                 with open(data) as fp:
                     data = fp.read()
         except ValueError:
@@ -76,19 +76,19 @@ def diagnose(data):
     print("")
 
     for parser in basic_parsers:
-        print("Trying to parse your markup with %s" % parser)
+        print(("Trying to parse your markup with %s" % parser))
         success = False
         try:
             soup = BeautifulSoup(data, features=parser)
             success = True
-        except Exception, e:
-            print("%s could not parse the markup." % parser)
+        except Exception as e:
+            print(("%s could not parse the markup." % parser))
             traceback.print_exc()
         if success:
-            print("Here's what %s did with the markup:" % parser)
-            print(soup.prettify())
+            print(("Here's what %s did with the markup:" % parser))
+            print((soup.prettify()))
 
-        print("-" * 80)
+        print(("-" * 80))
 
 def lxml_trace(data, html=True, **kwargs):
     """Print out the lxml events that occur during parsing.
@@ -104,7 +104,7 @@ def lxml_trace(data, html=True, **kwargs):
     """
     from lxml import etree
     for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
-        print("%s, %4s, %s" % (event, element.tag, element.text))
+        print(("%s, %4s, %s" % (event, element.tag, element.text)))
 
 class AnnouncingParser(HTMLParser):
     """Subclass of HTMLParser that announces parse events, without doing
@@ -193,9 +193,9 @@ def rdoc(num_elements=1000):
 
 def benchmark_parsers(num_elements=100000):
     """Very basic head-to-head performance benchmark."""
-    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
+    print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
     data = rdoc(num_elements)
-    print("Generated a large invalid HTML document (%d bytes)." % len(data))
+    print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
 
     for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
         success = False
@@ -204,24 +204,24 @@ def benchmark_parsers(num_elements=100000):
             soup = BeautifulSoup(data, parser)
             b = time.time()
             success = True
-        except Exception, e:
-            print("%s could not parse the markup." % parser)
+        except Exception as e:
+            print(("%s could not parse the markup." % parser))
             traceback.print_exc()
         if success:
-            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
+            print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
 
     from lxml import etree
     a = time.time()
     etree.HTML(data)
     b = time.time()
-    print("Raw lxml parsed the markup in %.2fs." % (b-a))
+    print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
 
     import html5lib
     parser = html5lib.HTMLParser()
     a = time.time()
     parser.parse(data)
     b = time.time()
-    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
+    print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
 
 def profile(num_elements=100000, parser="lxml"):
     """Use Python's profiler on a randomly generated document."""
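Nearly every change in diagnose.py is 2to3's mechanical print conversion, and it shows the tool's main cosmetic wart: Python 2's `print("x" % y)` parses as the print *statement* applied to a parenthesized expression, so 2to3 rewrites it as `print(("x" % y))`. The doubled parentheses are redundant but harmless, since a parenthesized expression is just the expression:

```python
version = '4.9.3'
print(("Diagnostic running on Beautiful Soup %s" % version))  # 2to3 output
print("Diagnostic running on Beautiful Soup %s" % version)    # equivalent
```

A later cleanup pass could drop the extra parentheses without changing behavior.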
diff --git a/bs4/element.py b/bs4/element.py
index 09a81d9..81d9db9 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -3,14 +3,14 @@ __license__ = "MIT"
 
 try:
     from collections.abc import Callable # Python 3.6
-except ImportError , e:
+except ImportError as e:
     from collections import Callable
 import re
 import sys
 import warnings
 try:
     import soupsieve
-except ImportError, e:
+except ImportError as e:
     soupsieve = None
     warnings.warn(
         'The soupsieve package is not installed. CSS selectors cannot be used.'
@@ -57,22 +57,22 @@ def _alias(attr):
 # Source:
 # https://docs.python.org/3/library/codecs.html#python-specific-encodings
 PYTHON_SPECIFIC_ENCODINGS = set([
-    u"idna",
-    u"mbcs",
-    u"oem",
-    u"palmos",
-    u"punycode",
-    u"raw_unicode_escape",
-    u"undefined",
-    u"unicode_escape",
-    u"raw-unicode-escape",
-    u"unicode-escape",
-    u"string-escape",
-    u"string_escape",
+    "idna",
+    "mbcs",
+    "oem",
+    "palmos",
+    "punycode",
+    "raw_unicode_escape",
+    "undefined",
+    "unicode_escape",
+    "raw-unicode-escape",
+    "unicode-escape",
+    "string-escape",
+    "string_escape",
 ])
 
 
-class NamespacedAttribute(unicode):
+class NamespacedAttribute(str):
     """A namespaced string (e.g. 'xml:lang') that remembers the namespace
     ('xml') and the name ('lang') that were used to create it.
     """
@@ -84,18 +84,18 @@ class NamespacedAttribute(unicode):
             name = None
 
         if name is None:
-            obj = unicode.__new__(cls, prefix)
+            obj = str.__new__(cls, prefix)
         elif prefix is None:
             # Not really namespaced.
-            obj = unicode.__new__(cls, name)
+            obj = str.__new__(cls, name)
         else:
-            obj = unicode.__new__(cls, prefix + ":" + name)
+            obj = str.__new__(cls, prefix + ":" + name)
         obj.prefix = prefix
         obj.name = name
         obj.namespace = namespace
         return obj
 
-class AttributeValueWithCharsetSubstitution(unicode):
+class AttributeValueWithCharsetSubstitution(str):
     """A stand-in object for a character encoding specified in HTML."""
 
 class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@@ -106,7 +106,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
     """
 
     def __new__(cls, original_value):
-        obj = unicode.__new__(cls, original_value)
+        obj = str.__new__(cls, original_value)
         obj.original_value = original_value
         return obj
 
@@ -134,9 +134,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
         match = cls.CHARSET_RE.search(original_value)
         if match is None:
             # No substitution necessary.
-            return unicode.__new__(unicode, original_value)
+            return str.__new__(str, original_value)
 
-        obj = unicode.__new__(cls, original_value)
+        obj = str.__new__(cls, original_value)
         obj.original_value = original_value
         return obj
 
@@ -376,7 +376,7 @@ class PageElement(object):
             raise ValueError("Cannot insert None into a tag.")
         if new_child is self:
             raise ValueError("Cannot insert a tag into itself.")
-        if (isinstance(new_child, basestring)
+        if (isinstance(new_child, str)
             and not isinstance(new_child, NavigableString)):
             new_child = NavigableString(new_child)
 
@@ -753,7 +753,7 @@ class PageElement(object):
             result = (element for element in generator
                       if isinstance(element, Tag))
             return ResultSet(strainer, result)
-        elif isinstance(name, basestring):
+        elif isinstance(name, str):
             # Optimization to find all tags with a given name.
             if name.count(':') == 1:
                 # This is a name with a prefix. If this is a namespace-aware document,
@@ -872,7 +872,7 @@ class PageElement(object):
         return self.parents
 
 
-class NavigableString(unicode, PageElement):
+class NavigableString(str, PageElement):
     """A Python Unicode string that is part of a parse tree.
 
     When Beautiful Soup parses the markup <b>penguin</b>, it will
@@ -895,10 +895,10 @@ class NavigableString(unicode, PageElement):
         passed in to the superclass's __new__ or the superclass won't know
         how to handle non-ASCII characters.
         """
-        if isinstance(value, unicode):
-            u = unicode.__new__(cls, value)
+        if isinstance(value, str):
+            u = str.__new__(cls, value)
         else:
-            u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+            u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
         u.setup()
         return u
 
@@ -909,7 +909,7 @@ class NavigableString(unicode, PageElement):
         return type(self)(self)
 
     def __getnewargs__(self):
-        return (unicode(self),)
+        return (str(self),)
 
     def __getattr__(self, attr):
         """text.string gives you text. This is for backwards
@@ -975,30 +975,30 @@ class PreformattedString(NavigableString):
 
 class CData(PreformattedString):
     """A CDATA block."""
-    PREFIX = u'<![CDATA['
-    SUFFIX = u']]>'
+    PREFIX = '<![CDATA['
+    SUFFIX = ']]>'
 
 class ProcessingInstruction(PreformattedString):
     """A SGML processing instruction."""
 
-    PREFIX = u'<?'
-    SUFFIX = u'>'
+    PREFIX = '<?'
+    SUFFIX = '>'
 
 class XMLProcessingInstruction(ProcessingInstruction):
     """An XML processing instruction."""
-    PREFIX = u'<?'
-    SUFFIX = u'?>'
+    PREFIX = '<?'
+    SUFFIX = '?>'
 
 class Comment(PreformattedString):
     """An HTML or XML comment."""
-    PREFIX = u'<!--'
-    SUFFIX = u'-->'
+    PREFIX = '<!--'
+    SUFFIX = '-->'
 
 
 class Declaration(PreformattedString):
     """An XML declaration."""
-    PREFIX = u'<?'
-    SUFFIX = u'?>'
+    PREFIX = '<?'
+    SUFFIX = '?>'
 
 
 class Doctype(PreformattedString):
@@ -1026,8 +1026,8 @@ class Doctype(PreformattedString):
 
         return Doctype(value)
 
-    PREFIX = u'<!DOCTYPE '
-    SUFFIX = u'>\n'
+    PREFIX = '<!DOCTYPE '
+    SUFFIX = '>\n'
 
 
 class Stylesheet(NavigableString):
@@ -1263,7 +1263,7 @@ class Tag(PageElement):
         for string in self._all_strings(True):
             yield string
 
-    def get_text(self, separator=u"", strip=False,
+    def get_text(self, separator="", strip=False,
                  types=(NavigableString, CData)):
         """Get all child strings, concatenated using the given separator.
 
@@ -1416,7 +1416,7 @@ class Tag(PageElement):
     def __contains__(self, x):
         return x in self.contents
 
-    def __nonzero__(self):
+    def __bool__(self):
         "A tag is non-None even if it has no contents."
         return True
 
@@ -1565,8 +1565,8 @@ class Tag(PageElement):
                 else:
                     if isinstance(val, list) or isinstance(val, tuple):
                         val = ' '.join(val)
-                    elif not isinstance(val, basestring):
-                        val = unicode(val)
+                    elif not isinstance(val, str):
+                        val = str(val)
                     elif (
                         isinstance(val, AttributeValueWithCharsetSubstitution)
                         and eventual_encoding is not None
@@ -1575,7 +1575,7 @@ class Tag(PageElement):
 
                 text = formatter.attribute_value(val)
                 decoded = (
-                    unicode(key) + '='
+                    str(key) + '='
                     + formatter.quoted_attribute_value(text))
                 attrs.append(decoded)
         close = ''
@@ -1934,7 +1934,7 @@ class SoupStrainer(object):
         else:
             attrs = kwargs
         normalized_attrs = {}
-        for key, value in attrs.items():
+        for key, value in list(attrs.items()):
             normalized_attrs[key] = self._normalize_search_value(value)
 
         self.attrs = normalized_attrs
@@ -1943,7 +1943,7 @@ class SoupStrainer(object):
     def _normalize_search_value(self, value):
         # Leave it alone if it's a Unicode string, a callable, a
         # regular expression, a boolean, or None.
-        if (isinstance(value, unicode) or isinstance(value, Callable) or hasattr(value, 'match')
+        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
             or isinstance(value, bool) or value is None):
             return value
 
@@ -1956,7 +1956,7 @@ class SoupStrainer(object):
             new_value = []
             for v in value:
                 if (hasattr(v, '__iter__') and not isinstance(v, bytes)
-                    and not isinstance(v, unicode)):
+                    and not isinstance(v, str)):
                     # This is almost certainly the user's mistake. In the
                     # interests of avoiding infinite loops, we'll let
                     # it through as-is rather than doing a recursive call.
@@ -1968,7 +1968,7 @@ class SoupStrainer(object):
         # Otherwise, convert it into a Unicode string.
         # The unicode(str()) thing is so this will do the same thing on Python 2
         # and Python 3.
-        return unicode(str(value))
+        return str(str(value))
 
     def __str__(self):
         """A human-readable representation of this SoupStrainer."""
@@ -1996,7 +1996,7 @@ class SoupStrainer(object):
             markup = markup_name
             markup_attrs = markup
 
-        if isinstance(self.name, basestring):
+        if isinstance(self.name, str):
             # Optimization for a very common case where the user is
             # searching for a tag with one specific name, and we're
             # looking at a tag with a different name.
@@ -2052,7 +2052,7 @@ class SoupStrainer(object):
         found = None
         # If given a list of items, scan it for a text element that
         # matches.
-        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
+        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
             for element in markup:
                 if isinstance(element, NavigableString) \
                        and self.search(element):
@@ -2065,7 +2065,7 @@ class SoupStrainer(object):
             found = self.search_tag(markup)
         # If it's text, make sure the text matches.
         elif isinstance(markup, NavigableString) or \
-                 isinstance(markup, basestring):
+                 isinstance(markup, str):
             if not self.name and not self.attrs and self._matches(markup, self.text):
                 found = markup
         else:
@@ -2110,7 +2110,7 @@ class SoupStrainer(object):
             return not match_against
 
         if (hasattr(match_against, '__iter__')
-            and not isinstance(match_against, basestring)):
+            and not isinstance(match_against, str)):
             # We're asked to match against an iterable of items.
             # The markup must be match at least one item in the
             # iterable. We'll try each one in turn.
@@ -2137,7 +2137,7 @@ class SoupStrainer(object):
         # the tag's name and once against its prefixed name.
         match = False
 
-        if not match and isinstance(match_against, unicode):
+        if not match and isinstance(match_against, str):
             # Exact string match
             match = markup == match_against
 
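Among the mechanical string fixes, element.py contains one genuinely behavioral rename: Python 3 consults `__bool__` where Python 2 consulted `__nonzero__`. A class that only defines `__nonzero__` falls back silently to default truthiness on Python 3, with no error, which is exactly the kind of port bug a test suite earns its keep on. A hypothetical illustration of the failure mode (these classes are not bs4's actual `Tag`):

```python
class Tag2:
    """Python 2 style: __nonzero__ is never consulted by Python 3."""
    def __init__(self):
        self.contents = []
    def __len__(self):
        return len(self.contents)
    def __nonzero__(self):   # ignored by Python 3
        return True

class Tag3:
    """Python 3 style: __bool__ overrides the __len__ fallback."""
    def __init__(self):
        self.contents = []
    def __len__(self):
        return len(self.contents)
    def __bool__(self):
        return True

print(bool(Tag2()))  # False on Python 3: falls back to __len__() == 0
print(bool(Tag3()))  # True: __bool__ wins
```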
diff --git a/bs4/formatter.py b/bs4/formatter.py
index 9a692ec..2cbab4c 100644
--- a/bs4/formatter.py
+++ b/bs4/formatter.py
@@ -83,7 +83,7 @@ class Formatter(EntitySubstitution):
         """
         if not self.entity_substitution:
             return ns
-        from element import NavigableString
+        from .element import NavigableString
         if (isinstance(ns, NavigableString)
             and ns.parent is not None
             and ns.parent.name in self.cdata_containing_tags):
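The one-line formatter.py change is an import-semantics fix rather than a string fix. Python 2 resolved the bare `from element import NavigableString` relative to the containing package (an "implicit relative import"); Python 3 treats it as an absolute import of a top-level `element` module and fails. Inside a package module the relative form must be spelled explicitly, as it would appear in a file like `bs4/formatter.py`:

```python
# Explicit relative import: the leading dot means "from this package".
# (Only valid inside a package module, not in a standalone script.)
from .element import NavigableString
```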
diff --git a/bs4/testing.py b/bs4/testing.py
index a2f83a1..9ca507b 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -25,7 +25,7 @@ from bs4.element import (
 from bs4.builder import HTMLParserTreeBuilder
 default_builder = HTMLParserTreeBuilder
 
-BAD_DOCUMENT = u"""A bare string
+BAD_DOCUMENT = """A bare string
 <!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
 <!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
 <div><![CDATA[A CDATA section where it doesn't belong]]></div>
@@ -94,7 +94,7 @@ class SoupTest(unittest.TestCase):
         # Verify that every tag that was opened was eventually closed.
 
         # There are no tags in the open tag counter.
-        assert all(v==0 for v in obj.open_tag_counter.values())
+        assert all(v==0 for v in list(obj.open_tag_counter.values()))
 
         # The only tag in the tag stack is the one for the root
         # document.
@@ -372,7 +372,7 @@ class HTMLTreeBuilderSmokeTest(object):
         # process_markup correctly sets processing_instruction_class
         # even when the markup is already Unicode and there is no
         # need to process anything.
-        markup = u"""<?PITarget PIContent?>"""
+        markup = """<?PITarget PIContent?>"""
         soup = self.soup(markup)
         self.assertEqual(markup, soup.decode())
 
@@ -544,14 +544,14 @@ Hello, world!
         # "&T" and "&p" look like incomplete character entities, but they are
         # not.
         self.assertSoupEquals(
-            u"<p>&bull; AT&T is in the s&p 500</p>",
-            u"<p>\u2022 AT&T is in the s&p 500</p>"
+            "<p>&bull; AT&T is in the s&p 500</p>",
+            "<p>\u2022 AT&T is in the s&p 500</p>"
         )
 
     def test_apos_entity(self):
         self.assertSoupEquals(
-            u"<p>Bob&apos;s Bar</p>",
-            u"<p>Bob's Bar</p>",
+            "<p>Bob&apos;s Bar</p>",
+            "<p>Bob's Bar</p>",
         )
 
     def test_entities_in_foreign_document_encoding(self):
@@ -564,17 +564,17 @@ Hello, world!
         # characters.
         markup = "<p>&#147;Hello&#148; -&#9731;</p>"
         soup = self.soup(markup)
-        self.assertEquals(u"“Hello” -☃", soup.p.string)
+        self.assertEqual("“Hello” -☃", soup.p.string)
 
     def test_entities_in_attributes_converted_to_unicode(self):
-        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
+        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
         self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
         self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
         self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
         self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
 
     def test_entities_in_text_converted_to_unicode(self):
-        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
+        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
         self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
         self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
         self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@@ -585,7 +585,7 @@ Hello, world!
             '<p>I said "good day!"</p>')
 
     def test_out_of_range_entity(self):
-        expect = u"\N{REPLACEMENT CHARACTER}"
+        expect = "\N{REPLACEMENT CHARACTER}"
         self.assertSoupEquals("�", expect)
         self.assertSoupEquals("�", expect)
         self.assertSoupEquals("�", expect)
@@ -663,9 +663,9 @@ Hello, world!
         # A seemingly innocuous document... but it's in Unicode! And
         # it contains characters that can't be represented in the
         # encoding found in the declaration! The horror!
-        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+        markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
         soup = self.soup(markup)
-        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+        self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
 
     def test_soupstrainer(self):
         """Parsers should be able to work with SoupStrainers."""
@@ -705,7 +705,7 @@ Hello, world!
         # Both XML and HTML entities are converted to Unicode characters
         # during parsing.
         text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
-        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
+        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
         self.assertSoupEquals(text, expected)
 
     def test_smart_quotes_converted_on_the_way_in(self):
@@ -715,15 +715,15 @@ Hello, world!
         soup = self.soup(quote)
         self.assertEqual(
             soup.p.string,
-            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
 
     def test_non_breaking_spaces_converted_on_the_way_in(self):
         soup = self.soup("<a>&nbsp;&nbsp;</a>")
-        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
 
     def test_entities_converted_on_the_way_out(self):
         text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
-        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
+        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
         soup = self.soup(text)
         self.assertEqual(soup.p.encode("utf-8"), expected)
 
@@ -732,7 +732,7 @@ Hello, world!
         # easy-to-understand document.
 
         # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
-        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
 
         # That's because we're going to encode it into ISO-Latin-1, and use
         # that to test.
@@ -848,8 +848,8 @@ Hello, world!
         soup = self.soup(markup)
         for encoding in PYTHON_SPECIFIC_ENCODINGS:
             if encoding in (
-                u'idna', u'mbcs', u'oem', u'undefined',
-                u'string_escape', u'string-escape'
+                'idna', 'mbcs', 'oem', 'undefined',
+                'string_escape', 'string-escape'
             ):
                 # For one reason or another, these will raise an
                 # exception if we actually try to use them, so don't
@@ -910,8 +910,8 @@ class XMLTreeBuilderSmokeTest(object):
         soup = self.soup(markup)
         for encoding in PYTHON_SPECIFIC_ENCODINGS:
             if encoding in (
-                u'idna', u'mbcs', u'oem', u'undefined',
-                u'string_escape', u'string-escape'
+                'idna', 'mbcs', 'oem', 'undefined',
+                'string_escape', 'string-escape'
             ):
                 # For one reason or another, these will raise an
                 # exception if we actually try to use them, so don't
@@ -962,15 +962,15 @@ class XMLTreeBuilderSmokeTest(object):
         self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
 
     def test_can_parse_unicode_document(self):
-        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
         soup = self.soup(markup)
-        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+        self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
 
     def test_popping_namespaced_tag(self):
         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
         soup = self.soup(markup)
         self.assertEqual(
-            unicode(soup.rss), markup)
+            str(soup.rss), markup)
 
     def test_docstring_includes_correct_encoding(self):
         soup = self.soup("<root/>")
@@ -1001,17 +1001,17 @@ class XMLTreeBuilderSmokeTest(object):
     def test_closing_namespaced_tag(self):
         markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
         soup = self.soup(markup)
-        self.assertEqual(unicode(soup.p), markup)
+        self.assertEqual(str(soup.p), markup)
 
     def test_namespaced_attributes(self):
         markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
         soup = self.soup(markup)
-        self.assertEqual(unicode(soup.foo), markup)
+        self.assertEqual(str(soup.foo), markup)
 
     def test_namespaced_attributes_xml_namespace(self):
         markup = '<foo xml:lang="fr">bar</foo>'
         soup = self.soup(markup)
-        self.assertEqual(unicode(soup.foo), markup)
+        self.assertEqual(str(soup.foo), markup)
 
     def test_find_by_prefixed_name(self):
         doc = """<?xml version="1.0" encoding="utf-8"?>
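Most of the churn in the test files is the removal of `u''` prefixes. Strictly speaking this is optional: the prefix was removed in Python 3.0 but reinstated as a no-op in Python 3.3 (PEP 414) precisely to ease ports like this one, so on any supported Python 3 the two spellings are identical:

```python
print(u"caf\xe9" == "caf\xe9")   # True on Python 3.3+: the u'' prefix is a no-op
```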
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 7b0a6d4..b77659b 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -5,7 +5,7 @@ import warnings
 try:
     from bs4.builder import HTML5TreeBuilder
     HTML5LIB_PRESENT = True
-except ImportError, e:
+except ImportError as e:
     HTML5LIB_PRESENT = False
 from bs4.element import SoupStrainer
 from bs4.testing import (
@@ -74,14 +74,14 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
     def test_reparented_markup(self):
         markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
         soup = self.soup(markup)
-        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
+        self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
         self.assertEqual(2, len(soup.find_all('p')))
 
 
     def test_reparented_markup_ends_with_whitespace(self):
         markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
         soup = self.soup(markup)
-        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
+        self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
         self.assertEqual(2, len(soup.find_all('p')))
 
     def test_reparented_markup_containing_identical_whitespace_nodes(self):
@@ -127,7 +127,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
     def test_foster_parenting(self):
         markup = b"""<table><td></tbody>A"""
         soup = self.soup(markup)
-        self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
+        self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
 
     def test_extraction(self):
         """
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 7ee91aa..aeff094 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -60,20 +60,20 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         # If you don't provide any particular value for
         # on_duplicate_attribute, later values replace earlier values.
         soup = self.soup(markup)
-        self.assertEquals("url3", soup.a['href'])
-        self.assertEquals(["cls"], soup.a['class'])
-        self.assertEquals("id", soup.a['id'])
+        self.assertEqual("url3", soup.a['href'])
+        self.assertEqual(["cls"], soup.a['class'])
+        self.assertEqual("id", soup.a['id'])
 
         # You can also get this behavior explicitly.
         def assert_attribute(on_duplicate_attribute, expected):
             soup = self.soup(
                 markup, on_duplicate_attribute=on_duplicate_attribute
             )
-            self.assertEquals(expected, soup.a['href'])
+            self.assertEqual(expected, soup.a['href'])
 
             # Verify that non-duplicate attributes are treated normally.
-            self.assertEquals(["cls"], soup.a['class'])
-            self.assertEquals("id", soup.a['id'])
+            self.assertEqual(["cls"], soup.a['class'])
+            self.assertEqual("id", soup.a['id'])
         assert_attribute(None, "url3")
         assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
 
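`assertEquals` (with the trailing s) is a deprecated alias of `assertEqual`: it has warned since Python 3.2 and was removed entirely in Python 3.12, so the tests switch to the canonical name. The same shape, minus the warning:

```python
import unittest

class DuplicateAttributeExample(unittest.TestCase):
    def test_equal(self):
        # assertEqual is the canonical name; assertEquals is a
        # deprecated alias (removed in Python 3.12).
        self.assertEqual("url3", "url" + "3")

if __name__ == '__main__':
    unittest.main()
```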
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index f96e4ae..3d0c75f 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -7,7 +7,7 @@ try:
     import lxml.etree
     LXML_PRESENT = True
    LXML_VERSION = lxml.etree.LXML_VERSION
-except ImportError, e:
+except ImportError as e:
     LXML_PRESENT = False
     LXML_VERSION = (0,)
 
@@ -68,7 +68,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         # if one is installed.
         with warnings.catch_warnings(record=True) as w:
             soup = BeautifulStoneSoup("<b />")
-        self.assertEqual(u"<b/>", unicode(soup.b))
+        self.assertEqual("<b/>", str(soup.b))
         self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
 
     def test_tracking_line_numbers(self):
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
|
||
index 857eb41..e1035ea 100644
|
||
--- a/bs4/tests/test_soup.py
|
||
+++ b/bs4/tests/test_soup.py
|
||
@@ -51,17 +51,17 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
|
||
class TestConstructor(SoupTest):
|
||
|
||
     def test_short_unicode_input(self):
-        data = u"<h1>éé</h1>"
+        data = "<h1>éé</h1>"
         soup = self.soup(data)
-        self.assertEqual(u"éé", soup.h1.string)
+        self.assertEqual("éé", soup.h1.string)

     def test_embedded_null(self):
-        data = u"<h1>foo\0bar</h1>"
+        data = "<h1>foo\0bar</h1>"
         soup = self.soup(data)
-        self.assertEqual(u"foo\0bar", soup.h1.string)
+        self.assertEqual("foo\0bar", soup.h1.string)

     def test_exclude_encodings(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
+        utf8_data = "Räksmörgås".encode("utf-8")
         soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
         self.assertEqual("windows-1252", soup.original_encoding)

@@ -127,7 +127,7 @@ class TestConstructor(SoupTest):
                 yield markup, None, None, False

         import re
-        self.assertRaisesRegexp(
+        self.assertRaisesRegex(
            ParserRejectedMarkup,
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
            BeautifulSoup, '', builder=Mock,
@@ -303,7 +303,7 @@ class TestWarnings(SoupTest):
         with warnings.catch_warnings(record=True) as warning_list:
             # note - this url must differ from the bytes one otherwise
             # python's warnings system swallows the second warning
-            soup = self.soup(u"http://www.crummyunicode.com/")
+            soup = self.soup("http://www.crummyunicode.com/")
         warning = self._assert_warning(
             warning_list, MarkupResemblesLocatorWarning
         )
@@ -319,7 +319,7 @@ class TestWarnings(SoupTest):

     def test_url_warning_with_unicode_and_space(self):
         with warnings.catch_warnings(record=True) as warning_list:
-            soup = self.soup(u"http://www.crummyuncode.com/ is great")
+            soup = self.soup("http://www.crummyuncode.com/ is great")
         self.assertFalse(any("looks like a URL" in str(w.message)
                              for w in warning_list))

@@ -341,9 +341,9 @@ class TestEntitySubstitution(unittest.TestCase):
     def test_simple_html_substitution(self):
         # Unicode characters corresponding to named HTML entites
         # are substituted, and no others.
-        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
+        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
         self.assertEqual(self.sub.substitute_html(s),
-                         u"foo&forall;\N{SNOWMAN}&otilde;bar")
+                         "foo&forall;\N{SNOWMAN}&otilde;bar")

     def test_smart_quote_substitution(self):
         # MS smart quotes are a common source of frustration, so we
@@ -408,7 +408,7 @@ class TestEncodingConversion(SoupTest):

     def setUp(self):
         super(TestEncodingConversion, self).setUp()
-        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
+        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
         self.utf8_data = self.unicode_data.encode("utf-8")
         # Just so you know what it looks like.
         self.assertEqual(
@@ -428,7 +428,7 @@ class TestEncodingConversion(SoupTest):
             ascii = b"<foo>a</foo>"
             soup_from_ascii = self.soup(ascii)
             unicode_output = soup_from_ascii.decode()
-            self.assertTrue(isinstance(unicode_output, unicode))
+            self.assertTrue(isinstance(unicode_output, str))
             self.assertEqual(unicode_output, self.document_for(ascii.decode()))
             self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
         finally:
@@ -440,7 +440,7 @@ class TestEncodingConversion(SoupTest):
         # is not set.
         soup_from_unicode = self.soup(self.unicode_data)
         self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
-        self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
+        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
         self.assertEqual(soup_from_unicode.original_encoding, None)

     def test_utf8_in_unicode_out(self):
@@ -448,7 +448,7 @@ class TestEncodingConversion(SoupTest):
         # attribute is set.
         soup_from_utf8 = self.soup(self.utf8_data)
         self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
-        self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
+        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')

     def test_utf8_out(self):
         # The internal data structures can be encoded as UTF-8.
@@ -459,7 +459,7 @@ class TestEncodingConversion(SoupTest):
         PYTHON_3_PRE_3_2,
         "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
     def test_attribute_name_containing_unicode_characters(self):
-        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
+        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
         self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))

 class TestUnicodeDammit(unittest.TestCase):
@@ -526,7 +526,7 @@ class TestUnicodeDammit(unittest.TestCase):

     def test_exclude_encodings(self):
         # This is UTF-8.
-        utf8_data = u"Räksmörgås".encode("utf-8")
+        utf8_data = "Räksmörgås".encode("utf-8")

         # But if we exclude UTF-8 from consideration, the guess is
         # Windows-1252.
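
For context on the test_soup.py hunks above (an annotation, not part of the patch): assertRaisesRegexp and assertEquals are deprecated unittest aliases, renamed to assertRaisesRegex and assertEqual in Python 3.2; the old spellings emit DeprecationWarning on modern interpreters. A minimal, self-contained sketch of the modern spellings:

    # Sketch only: the unittest spellings the hunks above migrate to.
    import unittest

    class RenamedAssertions(unittest.TestCase):
        def test_modern_spellings(self):
            self.assertEqual(2 + 2, 4)  # formerly assertEquals
            # formerly assertRaisesRegexp
            with self.assertRaisesRegex(ValueError, "invalid literal"):
                int("soup")

    if __name__ == "__main__":
        unittest.main()
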
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 2246346..b4f2a86 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -75,13 +75,13 @@ class TestFind(TreeTest):
         self.assertEqual(soup.find("b").string, "2")

     def test_unicode_text_find(self):
-        soup = self.soup(u'<h1>Räksmörgås</h1>')
-        self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås')
+        soup = self.soup('<h1>Räksmörgås</h1>')
+        self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')

     def test_unicode_attribute_find(self):
-        soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>')
+        soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
         str(soup)
-        self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text)
+        self.assertEqual("here it is", soup.find(id='Räksmörgås').text)


     def test_find_everything(self):
@@ -101,17 +101,17 @@ class TestFindAll(TreeTest):
         """You can search the tree for text nodes."""
         soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
         # Exact match.
-        self.assertEqual(soup.find_all(string="bar"), [u"bar"])
-        self.assertEqual(soup.find_all(text="bar"), [u"bar"])
+        self.assertEqual(soup.find_all(string="bar"), ["bar"])
+        self.assertEqual(soup.find_all(text="bar"), ["bar"])
         # Match any of a number of strings.
         self.assertEqual(
-            soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
+            soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
         # Match a regular expression.
         self.assertEqual(soup.find_all(text=re.compile('.*')),
-                         [u"Foo", u"bar", u'\xbb'])
+                         ["Foo", "bar", '\xbb'])
         # Match anything.
         self.assertEqual(soup.find_all(text=True),
-                         [u"Foo", u"bar", u'\xbb'])
+                         ["Foo", "bar", '\xbb'])

     def test_find_all_limit(self):
         """You can limit the number of items returned by find_all."""
@@ -254,8 +254,8 @@ class TestFindAllByAttribute(TreeTest):
             ["Matching a.", "Matching b."])

     def test_find_all_by_utf8_attribute_value(self):
-        peace = u"םולש".encode("utf8")
-        data = u'<a title="םולש"></a>'.encode("utf8")
+        peace = "םולש".encode("utf8")
+        data = '<a title="םולש"></a>'.encode("utf8")
         soup = self.soup(data)
         self.assertEqual([soup.a], soup.find_all(title=peace))
         self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
@@ -444,7 +444,7 @@ class TestSmooth(TreeTest):
         # output.

         # Since the <span> tag has two children, its .string is None.
-        self.assertEquals(None, div.span.string)
+        self.assertEqual(None, div.span.string)

         self.assertEqual(7, len(div.contents))
         div.smooth()
@@ -755,18 +755,18 @@ class TestTag(SoupTest):

         # No list of whitespace-preserving tags -> pretty-print
         tag._preserve_whitespace_tags = None
-        self.assertEquals(True, tag._should_pretty_print(0))
+        self.assertEqual(True, tag._should_pretty_print(0))

         # List exists but tag is not on the list -> pretty-print
         tag.preserve_whitespace_tags = ["some_other_tag"]
-        self.assertEquals(True, tag._should_pretty_print(1))
+        self.assertEqual(True, tag._should_pretty_print(1))

         # Indent level is None -> don't pretty-print
-        self.assertEquals(False, tag._should_pretty_print(None))
+        self.assertEqual(False, tag._should_pretty_print(None))

         # Tag is on the whitespace-preserving list -> don't pretty-print
         tag.preserve_whitespace_tags = ["some_other_tag", "a_tag"]
-        self.assertEquals(False, tag._should_pretty_print(1))
+        self.assertEqual(False, tag._should_pretty_print(1))


 class TestTagCreation(SoupTest):
@@ -905,10 +905,10 @@ class TestTreeModification(SoupTest):
             assert not isinstance(i, BeautifulSoup)

         p1, p2, p3, p4 = list(soup.children)
-        self.assertEquals("And now, a word:", p1.string)
-        self.assertEquals("p2", p2.string)
-        self.assertEquals("p3", p3.string)
-        self.assertEquals("And we're back.", p4.string)
+        self.assertEqual("And now, a word:", p1.string)
+        self.assertEqual("p2", p2.string)
+        self.assertEqual("p3", p3.string)
+        self.assertEqual("And we're back.", p4.string)


     def test_replace_with_maintains_next_element_throughout(self):
@@ -1015,8 +1015,8 @@ class TestTreeModification(SoupTest):
         d1 = soup.find('div', id='d1')
         d2 = soup.find('div', id='d2')
         d2.extend(d1)
-        self.assertEqual(u'<div id="d1"></div>', d1.decode())
-        self.assertEqual(u'<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>', d2.decode())
+        self.assertEqual('<div id="d1"></div>', d1.decode())
+        self.assertEqual('<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>', d2.decode())

     def test_move_tag_to_beginning_of_parent(self):
         data = "<a><b></b><c></c><d></d></a>"
@@ -1262,7 +1262,7 @@ class TestTreeModification(SoupTest):
 <script>baz</script>
 </html>""")
         [soup.script.extract() for i in soup.find_all("script")]
-        self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body))
+        self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))


     def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
@@ -1524,7 +1524,7 @@ class TestPersistence(SoupTest):
         soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
         encoding = soup.original_encoding
         copy = soup.__copy__()
-        self.assertEqual(u"<p>\xa0</p>", unicode(copy))
+        self.assertEqual("<p>\xa0</p>", str(copy))
         self.assertEqual(encoding, copy.original_encoding)

     def test_copy_preserves_builder_information(self):
@@ -1554,14 +1554,14 @@ class TestPersistence(SoupTest):

     def test_unicode_pickle(self):
         # A tree containing Unicode characters can be pickled.
-        html = u"<b>\N{SNOWMAN}</b>"
+        html = "<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
         dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
         loaded = pickle.loads(dumped)
         self.assertEqual(loaded.decode(), soup.decode())

     def test_copy_navigablestring_is_not_attached_to_tree(self):
-        html = u"<b>Foo<a></a></b><b>Bar</b>"
+        html = "<b>Foo<a></a></b><b>Bar</b>"
         soup = self.soup(html)
         s1 = soup.find(string="Foo")
         s2 = copy.copy(s1)
@@ -1573,7 +1573,7 @@ class TestPersistence(SoupTest):
         self.assertEqual(None, s2.previous_element)

     def test_copy_navigablestring_subclass_has_same_type(self):
-        html = u"<b><!--Foo--></b>"
+        html = "<b><!--Foo--></b>"
         soup = self.soup(html)
         s1 = soup.string
         s2 = copy.copy(s1)
@@ -1581,19 +1581,19 @@ class TestPersistence(SoupTest):
         self.assertTrue(isinstance(s2, Comment))

     def test_copy_entire_soup(self):
-        html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
         soup = self.soup(html)
         soup_copy = copy.copy(soup)
         self.assertEqual(soup, soup_copy)

     def test_copy_tag_copies_contents(self):
-        html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
         soup = self.soup(html)
         div = soup.div
         div_copy = copy.copy(div)

         # The two tags look the same, and evaluate to equal.
-        self.assertEqual(unicode(div), unicode(div_copy))
+        self.assertEqual(str(div), str(div_copy))
         self.assertEqual(div, div_copy)

         # But they're not the same object.
@@ -1609,17 +1609,17 @@ class TestPersistence(SoupTest):
 class TestSubstitutions(SoupTest):

     def test_default_formatter_is_minimal(self):
-        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
         soup = self.soup(markup)
         decoded = soup.decode(formatter="minimal")
         # The < is converted back into &lt; but the e-with-acute is left alone.
         self.assertEqual(
             decoded,
             self.document_for(
-                u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

     def test_formatter_html(self):
-        markup = u"<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
         soup = self.soup(markup)
         decoded = soup.decode(formatter="html")
         self.assertEqual(
@@ -1627,7 +1627,7 @@ class TestSubstitutions(SoupTest):
             self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

     def test_formatter_html5(self):
-        markup = u"<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
         soup = self.soup(markup)
         decoded = soup.decode(formatter="html5")
         self.assertEqual(
@@ -1635,49 +1635,49 @@ class TestSubstitutions(SoupTest):
             self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

     def test_formatter_minimal(self):
-        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
         soup = self.soup(markup)
         decoded = soup.decode(formatter="minimal")
         # The < is converted back into &lt; but the e-with-acute is left alone.
         self.assertEqual(
             decoded,
             self.document_for(
-                u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

     def test_formatter_null(self):
-        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
         soup = self.soup(markup)
         decoded = soup.decode(formatter=None)
         # Neither the angle brackets nor the e-with-acute are converted.
         # This is not valid HTML, but it's what the user wanted.
         self.assertEqual(decoded,
-                         self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
+                         self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))

     def test_formatter_custom(self):
-        markup = u"<b>&lt;foo&gt;</b><b>bar</b><br/>"
+        markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
         soup = self.soup(markup)
         decoded = soup.decode(formatter = lambda x: x.upper())
         # Instead of normal entity conversion code, the custom
         # callable is called on every string.
         self.assertEqual(
             decoded,
-            self.document_for(u"<b><FOO></b><b>BAR</b><br/>"))
+            self.document_for("<b><FOO></b><b>BAR</b><br/>"))

     def test_formatter_is_run_on_attribute_values(self):
-        markup = u'<a href="http://a.com?a=b&c=é">e</a>'
+        markup = '<a href="http://a.com?a=b&c=é">e</a>'
         soup = self.soup(markup)
         a = soup.a

-        expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'
+        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'

         self.assertEqual(expect_minimal, a.decode())
         self.assertEqual(expect_minimal, a.decode(formatter="minimal"))

-        expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
         self.assertEqual(expect_html, a.decode(formatter="html"))

         self.assertEqual(markup, a.decode(formatter=None))
-        expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
+        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
         self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))

     def test_formatter_skips_script_tag_for_html_documents(self):
@@ -1703,7 +1703,7 @@ class TestSubstitutions(SoupTest):
         # Everything outside the <pre> tag is reformatted, but everything
         # inside is left alone.
         self.assertEqual(
-            u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
+            '<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
             soup.div.prettify())

     def test_prettify_accepts_formatter_function(self):
@@ -1713,14 +1713,14 @@ class TestSubstitutions(SoupTest):

     def test_prettify_outputs_unicode_by_default(self):
         soup = self.soup("<a></a>")
-        self.assertEqual(unicode, type(soup.prettify()))
+        self.assertEqual(str, type(soup.prettify()))

     def test_prettify_can_encode_data(self):
         soup = self.soup("<a></a>")
         self.assertEqual(bytes, type(soup.prettify("utf-8")))

     def test_html_entity_substitution_off_by_default(self):
-        markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
         soup = self.soup(markup)
         encoded = soup.b.encode("utf-8")
         self.assertEqual(encoded, markup.encode('utf-8'))
@@ -1764,48 +1764,48 @@ class TestEncoding(SoupTest):
     """Test the ability to encode objects into strings."""

     def test_unicode_string_can_be_encoded(self):
-        html = u"<b>\N{SNOWMAN}</b>"
+        html = "<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
         self.assertEqual(soup.b.string.encode("utf-8"),
-                         u"\N{SNOWMAN}".encode("utf-8"))
+                         "\N{SNOWMAN}".encode("utf-8"))

     def test_tag_containing_unicode_string_can_be_encoded(self):
-        html = u"<b>\N{SNOWMAN}</b>"
+        html = "<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
         self.assertEqual(
             soup.b.encode("utf-8"), html.encode("utf-8"))

     def test_encoding_substitutes_unrecognized_characters_by_default(self):
-        html = u"<b>\N{SNOWMAN}</b>"
+        html = "<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
         self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")

     def test_encoding_can_be_made_strict(self):
-        html = u"<b>\N{SNOWMAN}</b>"
+        html = "<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
         self.assertRaises(
             UnicodeEncodeError, soup.encode, "ascii", errors="strict")

     def test_decode_contents(self):
-        html = u"<b>\N{SNOWMAN}</b>"
+        html = "<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
-        self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents())
+        self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())

     def test_encode_contents(self):
-        html = u"<b>\N{SNOWMAN}</b>"
+        html = "<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
         self.assertEqual(
-            u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
+            "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
                 encoding="utf8"))

     def test_deprecated_renderContents(self):
-        html = u"<b>\N{SNOWMAN}</b>"
+        html = "<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
         self.assertEqual(
-            u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
+            "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())

     def test_repr(self):
-        html = u"<b>\N{SNOWMAN}</b>"
+        html = "<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
         if PY3K:
             self.assertEqual(html, repr(soup))
@@ -1993,7 +1993,7 @@ class TestSoupSelector(TreeTest):
         els = self.soup.select('title')
         self.assertEqual(len(els), 1)
         self.assertEqual(els[0].name, 'title')
-        self.assertEqual(els[0].contents, [u'The title'])
+        self.assertEqual(els[0].contents, ['The title'])

     def test_one_tag_many(self):
         els = self.soup.select('div')
@@ -2039,7 +2039,7 @@ class TestSoupSelector(TreeTest):
         self.assertEqual(dashed[0]['id'], 'dash2')

     def test_dashed_tag_text(self):
-        self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.')
+        self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')

     def test_select_dashed_matches_find_all(self):
         self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
@@ -2225,12 +2225,12 @@ class TestSoupSelector(TreeTest):
         # Try to select first paragraph
         els = self.soup.select('div#inner p:nth-of-type(1)')
         self.assertEqual(len(els), 1)
-        self.assertEqual(els[0].string, u'Some text')
+        self.assertEqual(els[0].string, 'Some text')

         # Try to select third paragraph
         els = self.soup.select('div#inner p:nth-of-type(3)')
         self.assertEqual(len(els), 1)
-        self.assertEqual(els[0].string, u'Another')
+        self.assertEqual(els[0].string, 'Another')

         # Try to select (non-existent!) fourth paragraph
         els = self.soup.select('div#inner p:nth-of-type(4)')
@@ -2243,7 +2243,7 @@ class TestSoupSelector(TreeTest):
     def test_nth_of_type_direct_descendant(self):
         els = self.soup.select('div#inner > p:nth-of-type(1)')
         self.assertEqual(len(els), 1)
-        self.assertEqual(els[0].string, u'Some text')
+        self.assertEqual(els[0].string, 'Some text')

     def test_id_child_selector_nth_of_type(self):
         self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
@@ -2324,7 +2324,7 @@ class TestSoupSelector(TreeTest):
         markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
         soup = BeautifulSoup(markup, 'html.parser')
         selected = soup.select(".c1, .c2")
-        self.assertEquals(3, len(selected))
+        self.assertEqual(3, len(selected))

         # Verify that find_all finds the same elements, though because
         # of an implementation detail it finds them in a different
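
A note on the unicode-to-str changes in test_tree.py above (an annotation, not part of the patch): Python 3 has no unicode builtin; str is the Unicode text type, and calling str() on a tag renders the same markup that Tag.decode() returns. A small sketch:

    # Sketch only: str() and Tag.decode() agree under Python 3.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p>",
                         "html.parser")
    assert str(soup.p) == soup.p.decode()
    assert isinstance(str(soup.p), str)  # no separate unicode type to test for
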
diff --git a/convert-py3k b/convert-py3k
deleted file mode 100755
index 05fab53..0000000
--- a/convert-py3k
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/sh
-#
-# The Python 2 source is the definitive source. This script uses 2to3-3.2 to
-# create a new python3/bs4 source tree that works under Python 3.
-#
-# See README.txt to see how to run the test suite after conversion.
-echo "About to destroy and rebuild the py3k/bs4 directory."
-echo "If you've got stuff in there, Ctrl-C out of this script or answer 'n'."
-mkdir -p py3k
-rm -rfI py3k/bs4
-cp -r bs4/ py3k/
-2to3 -w py3k
-echo ""
-echo "OK, conversion is done."
-echo "Now running the unit tests."
-(cd py3k && python3 -m unittest discover -s bs4)
\ No newline at end of file
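
With convert-py3k deleted there is no intermediate py3k/ tree to build: the bs4 source now is the Python 3 source, and the suite runs directly. A rough programmatic equivalent of the one remaining command, python3 -m unittest discover -s bs4 (illustrative only; assumes it runs from a source checkout containing bs4):

    # Sketch only: discover and run the bs4 test suite in-process.
    import unittest

    suite = unittest.defaultTestLoader.discover("bs4")
    unittest.TextTestRunner(verbosity=1).run(suite)
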
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 34ec7cf..b8ca011 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -170,16 +170,13 @@ Installing Beautiful Soup
 If you're using a recent version of Debian or Ubuntu Linux, you can
 install Beautiful Soup with the system package manager:

-:kbd:`$ apt-get install python-bs4` (for Python 2)
-
-:kbd:`$ apt-get install python3-bs4` (for Python 3)
+:kbd:`$ apt-get install python3-bs4`

 Beautiful Soup 4 is published through PyPi, so if you can't install it
 with the system packager, you can install it with ``easy_install`` or
-``pip``. The package name is ``beautifulsoup4``, and the same package
-works on Python 2 and Python 3. Make sure you use the right version of
-``pip`` or ``easy_install`` for your Python version (these may be named
-``pip3`` and ``easy_install3`` respectively if you're using Python 3).
+``pip``. The package name is ``beautifulsoup4``. Make sure you use the
+right version of ``pip`` or ``easy_install`` for your Python version
+(these may be named ``pip3`` and ``easy_install3`` respectively).

 :kbd:`$ easy_install beautifulsoup4`

@@ -202,40 +199,8 @@ package the entire library with your application. You can download the
 tarball, copy its ``bs4`` directory into your application's codebase,
 and use Beautiful Soup without installing it at all.

-I use Python 2.7 and Python 3.8 to develop Beautiful Soup, but it
-should work with other recent versions.
-
-Problems after installation
----------------------------
-
-Beautiful Soup is packaged as Python 2 code. When you install it for
-use with Python 3, it's automatically converted to Python 3 code. If
-you don't install the package, the code won't be converted. There have
-also been reports on Windows machines of the wrong version being
-installed.
-
-If you get the ``ImportError`` "No module named HTMLParser", your
-problem is that you're running the Python 2 version of the code under
-Python 3.
-
-If you get the ``ImportError`` "No module named html.parser", your
-problem is that you're running the Python 3 version of the code under
-Python 2.
-
-In both cases, your best bet is to completely remove the Beautiful
-Soup installation from your system (including any directory created
-when you unzipped the tarball) and try the installation again.
-
-If you get the ``SyntaxError`` "Invalid syntax" on the line
-``ROOT_TAG_NAME = u'[document]'``, you need to convert the Python 2
-code to Python 3. You can do this either by installing the package:
-
-:kbd:`$ python3 setup.py install`
-
-or by manually running Python's ``2to3`` conversion script on the
-``bs4`` directory:
-
-:kbd:`$ 2to3-3.2 -w bs4`
+I use Python 3.8 to develop Beautiful Soup, but it should work with
+other recent versions.

 .. _parser-installation:

@@ -272,8 +237,7 @@ This table summarizes the advantages and disadvantages of each parser library:
 +----------------------+--------------------------------------------+--------------------------------+--------------------------+
 | Python's html.parser | ``BeautifulSoup(markup, "html.parser")``   | * Batteries included           | * Not as fast as lxml,   |
 |                      |                                            | * Decent speed                 |   less lenient than      |
-|                      |                                            | * Lenient (As of Python 2.7.3  |   html5lib.              |
-|                      |                                            |   and 3.2.)                    |                          |
+|                      |                                            | * Lenient (As of Python 3.2)   |   html5lib.              |
 +----------------------+--------------------------------------------+--------------------------------+--------------------------+
 | lxml's HTML parser   | ``BeautifulSoup(markup, "lxml")``          | * Very fast                    | * External C dependency  |
 |                      |                                            | * Lenient                      |                          |
@@ -289,9 +253,9 @@ This table summarizes the advantages and disadvantages of each parser library:
 +----------------------+--------------------------------------------+--------------------------------+--------------------------+

 If you can, I recommend you install and use lxml for speed. If you're
-using a very old version of Python -- earlier than 2.7.3 or 3.2.2 --
-it's `essential` that you install lxml or html5lib. Python's built-in
-HTML parser is just not very good in those old versions.
+using a very old version of Python -- earlier than 3.2.2 -- it's
+`essential` that you install lxml or html5lib. Python's built-in HTML
+parser is just not very good in those old versions.

 Note that if a document is invalid, different parsers will generate
 different Beautiful Soup trees for it. See `Differences
@@ -481,8 +445,7 @@ uses the ``NavigableString`` class to contain these bits of text::
 A ``NavigableString`` is just like a Python Unicode string, except
 that it also supports some of the features described in `Navigating
 the tree`_ and `Searching the tree`_. You can convert a
-``NavigableString`` to a Unicode string with ``unicode()`` (in
-Python 2) or ``str`` (in Python 3)::
+``NavigableString`` to a Unicode string with ``str``::

     unicode_string = str(tag.string)
     unicode_string
@@ -2230,8 +2193,7 @@ Non-pretty printing
 -------------------

 If you just want a string, with no fancy formatting, you can call
-``str()`` on a ``BeautifulSoup`` object (``unicode()`` in Python 2),
-or on a ``Tag`` within it::
+``str()`` on a ``BeautifulSoup`` object, or on a ``Tag`` within it::

     str(soup)
     # '<html><head></head><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>'
@@ -3139,10 +3101,10 @@ Version mismatch problems
 -------------------------

 * ``SyntaxError: Invalid syntax`` (on the line ``ROOT_TAG_NAME =
-  '[document]'``): Caused by running the Python 2 version of
+  '[document]'``): Caused by running an old Python 2 version of
   Beautiful Soup under Python 3, without converting the code.

-* ``ImportError: No module named HTMLParser`` - Caused by running the
+* ``ImportError: No module named HTMLParser`` - Caused by running an old
   Python 2 version of Beautiful Soup under Python 3.

 * ``ImportError: No module named html.parser`` - Caused by running the
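
The documentation hunks above drop the Python 2 unicode() conversion path, leaving only str. A short sketch of the surviving idiom (illustrative only, not taken verbatim from the docs):

    # Sketch only: converting a NavigableString to a plain Python 3 string.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<b>Extremely bold</b>", "html.parser")
    unicode_string = str(soup.b.string)  # NavigableString -> str
    assert isinstance(unicode_string, str)
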
diff --git a/setup.py b/setup.py
index 7b4b393..b9b4ed2 100644
--- a/setup.py
+++ b/setup.py
@@ -4,23 +4,22 @@ from setuptools import (
 )
 import sys

+from bs4 import __version__
+
 with open("README.md", "r") as fh:
     long_description = fh.read()

 setup(
     name="beautifulsoup4",
-    # NOTE: We can't import __version__ from bs4 because bs4/__init__.py is Python 2 code,
-    # and converting it to Python 3 means going through this code to run 2to3.
-    # So we have to specify it twice for the time being.
-    version = '4.9.3',
+    version = __version__,
     author="Leonard Richardson",
     author_email='leonardr@segfault.org',
     url="http://www.crummy.com/software/BeautifulSoup/bs4/",
     download_url = "http://www.crummy.com/software/BeautifulSoup/bs4/download/",
     description="Screen-scraping library",
+    python_requires='>3.0.0',
     install_requires=[
-        "soupsieve >1.2; python_version>='3.0'",
-        "soupsieve >1.2, <2.0; python_version<'3.0'",
+        "soupsieve >1.2",
     ],
     long_description=long_description,
     long_description_content_type="text/markdown",
@@ -30,12 +29,10 @@ setup(
         'lxml' : [ 'lxml'],
         'html5lib' : ['html5lib'],
     },
-    use_2to3 = True,
     classifiers=["Development Status :: 5 - Production/Stable",
                  "Intended Audience :: Developers",
                  "License :: OSI Approved :: MIT License",
                  "Programming Language :: Python",
-                 "Programming Language :: Python :: 2.7",
                  'Programming Language :: Python :: 3',
                  "Topic :: Text Processing :: Markup :: HTML",
                  "Topic :: Text Processing :: Markup :: XML",
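
Two packaging effects of the setup.py hunks above, noted for context (not part of the patch): the release number is single-sourced from bs4.__version__, and python_requires='>3.0.0' lets pip (9 and later) skip this release entirely on Python 2 interpreters. A sketch of how an installer evaluates that specifier, using the third-party packaging library that pip vendors:

    # Sketch only: how python_requires='>3.0.0' is interpreted.
    from packaging.specifiers import SpecifierSet

    requires = SpecifierSet(">3.0.0")
    assert "3.8" in requires       # Python 3 interpreters qualify
    assert "2.7" not in requires   # pip on Python 2 skips the release
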
diff --git a/test-all-versions b/test-all-versions
index 01e436b..fe7758a 100755
--- a/test-all-versions
+++ b/test-all-versions
@@ -1 +1 @@
-python2.7 -m unittest discover -s bs4 && ./convert-py3k
+python3 -m unittest discover -s bs4
-- 
2.13.7