From 91827993fb8bece431f1d8831468aa43b6f3dfef Mon Sep 17 00:00:00 2001 From: shixuantong Date: Mon, 10 Jan 2022 17:58:01 +0800 Subject: [PATCH] converts the code base to Python 3, and removes the use_2to3 reference in setup.py. --- ...e-base-to-Python-3-and-removes-the-u.patch | 2062 +++++++++++++++++ python-beautifulsoup4.spec | 14 +- 2 files changed, 2070 insertions(+), 6 deletions(-) create mode 100644 backport-converts-the-code-base-to-Python-3-and-removes-the-u.patch diff --git a/backport-converts-the-code-base-to-Python-3-and-removes-the-u.patch b/backport-converts-the-code-base-to-Python-3-and-removes-the-u.patch new file mode 100644 index 0000000..1cab095 --- /dev/null +++ b/backport-converts-the-code-base-to-Python-3-and-removes-the-u.patch @@ -0,0 +1,2062 @@ +diff --git a/README.md b/README.md +index 92dd339..884f9eb 100644 +--- a/README.md ++++ b/README.md +@@ -53,17 +53,11 @@ To go beyond the basics, [comprehensive documentation is available](http://www.c + + # Note on Python 2 sunsetting + +-Since 2012, Beautiful Soup has been developed as a Python 2 library +-which is automatically converted to Python 3 code as necessary. This +-makes it impossible to take advantage of some features of Python +-3. +- +-For this reason, I plan to discontinue Beautiful Soup's Python 2 +-support at some point after December 31, 2020: one year after the +-sunset date for Python 2 itself. Beyond that point, new Beautiful Soup +-development will exclusively target Python 3. Of course, older +-releases of Beautiful Soup, which support both versions, will continue +-to be available. ++Beautiful Soup's support for Python 2 was discontinued on December 31, ++2020: one year after the sunset date for Python 2 itself. From this ++point onward, new Beautiful Soup development will exclusively target ++Python 3. The final release of Beautiful Soup 4 to support Python 2 ++was 4.9.3. + + # Supporting the project + +@@ -93,10 +87,5 @@ $ nosetests + ``` + + ``` +-$ python -m unittest discover -s bs4 ++$ python3 -m unittest discover -s bs4 + ``` +- +-If you checked out the source tree, you should see a script in the +-home directory called test-all-versions. This script will run the unit +-tests under Python 2, then create a temporary Python 3 conversion of +-the source and run the unit tests again under Python 3. +diff --git a/bs4/__init__.py b/bs4/__init__.py +index 8f78809..51ccc21 100644 +--- a/bs4/__init__.py ++++ b/bs4/__init__.py +@@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a + provides methods and Pythonic idioms that make it easy to navigate, + search, and modify the parse tree. + +-Beautiful Soup works with Python 2.7 and up. It works better if lxml ++Beautiful Soup works with Python 3.5 and up. It works better if lxml + and/or html5lib is installed. + + For more than you ever wanted to know about Beautiful Soup, see the +@@ -29,6 +29,11 @@ import sys + import traceback + import warnings + ++# The very first thing we do is give a useful error if someone is ++# running this code under Python 2. ++if sys.version_info.major < 3: ++ raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. 
The final version of Beautiful Soup to support Python 2 was 4.9.3.') ++ + from .builder import builder_registry, ParserRejectedMarkup + from .dammit import UnicodeDammit + from .element import ( +@@ -49,10 +54,6 @@ from .element import ( + TemplateString, + ) + +-# The very first thing we do is give a useful error if someone is +-# running this code under Python 3 without converting it. +-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +- + # Define some custom warnings. + class GuessedAtParserWarning(UserWarning): + """The warning issued when BeautifulSoup has to guess what parser to +@@ -100,7 +101,7 @@ class BeautifulSoup(Tag): + # Since BeautifulSoup subclasses Tag, it's possible to treat it as + # a Tag with a .name. This name makes it clear the BeautifulSoup + # object isn't a real markup tag. +- ROOT_TAG_NAME = u'[document]' ++ ROOT_TAG_NAME = '[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. +@@ -217,7 +218,7 @@ class BeautifulSoup(Tag): + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + +- if from_encoding and isinstance(markup, unicode): ++ if from_encoding and isinstance(markup, str): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + +@@ -234,7 +235,7 @@ class BeautifulSoup(Tag): + builder_class = builder + builder = None + elif builder is None: +- if isinstance(features, basestring): ++ if isinstance(features, str): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES +@@ -309,13 +310,13 @@ class BeautifulSoup(Tag): + markup = markup.read() + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) +- or (isinstance(markup, unicode) and not u'<' in markup) ++ or (isinstance(markup, str) and not '<' in markup) + ): + # Print out warnings for a couple beginner problems + # involving passing non-markup to Beautiful Soup. + # Beautiful Soup will still parse the input as markup, + # just in case that's what the user really wants. +- if (isinstance(markup, unicode) ++ if (isinstance(markup, str) + and not os.path.supports_unicode_filenames): + possible_filename = markup.encode("utf8") + else: +@@ -323,7 +324,7 @@ class BeautifulSoup(Tag): + is_file = False + try: + is_file = os.path.exists(possible_filename) +- except Exception, e: ++ except Exception as e: + # This is almost certainly a problem involving + # characters not valid in filenames on this + # system. Just let it go. +@@ -353,9 +354,9 @@ class BeautifulSoup(Tag): + pass + + if not success: +- other_exceptions = [unicode(e) for e in rejections] ++ other_exceptions = [str(e) for e in rejections] + raise ParserRejectedMarkup( +- u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) ++ "The markup you provided was rejected by the parser. 
Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) + ) + + # Clear out the markup and remove the builder's circular +@@ -406,9 +407,9 @@ class BeautifulSoup(Tag): + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") +- elif isinstance(markup, unicode): +- space = u' ' +- cant_start_with = (u"http:", u"https:") ++ elif isinstance(markup, str): ++ space = ' ' ++ cant_start_with = ("http:", "https:") + else: + return + +@@ -545,7 +546,7 @@ class BeautifulSoup(Tag): + containerClass = self.string_container(containerClass) + + if self.current_data: +- current_data = u''.join(self.current_data) ++ current_data = ''.join(self.current_data) + # If whitespace is not preserved, and this string contains + # nothing but ASCII spaces, replace it with a single space + # or newline. +@@ -748,9 +749,9 @@ class BeautifulSoup(Tag): + eventual_encoding = None + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding +- prefix = u'\n' % encoding_part ++ prefix = '\n' % encoding_part + else: +- prefix = u'' ++ prefix = '' + if not pretty_print: + indent_level = None + else: +@@ -788,4 +789,4 @@ class FeatureNotFound(ValueError): + if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) +- print(soup.prettify()) ++ print((soup.prettify())) +diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py +index 03da4c6..03fbd6a 100644 +--- a/bs4/builder/__init__.py ++++ b/bs4/builder/__init__.py +@@ -300,13 +300,13 @@ class TreeBuilder(object): + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), None) +- for attr in attrs.keys(): ++ for attr in list(attrs.keys()): + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. 
+ value = attrs[attr] +- if isinstance(value, basestring): ++ if isinstance(value, str): + values = nonwhitespace_re.findall(value) + else: + # html5lib sometimes calls setAttributes twice +@@ -496,7 +496,7 @@ class ParserRejectedMarkup(Exception): + """ + if isinstance(message_or_exception, Exception): + e = message_or_exception +- message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e)) ++ message_or_exception = "%s: %s" % (e.__class__.__name__, str(e)) + super(ParserRejectedMarkup, self).__init__(message_or_exception) + + # Builders are registered in reverse order of priority, so that custom +diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py +index a1c6134..69aefd7 100644 +--- a/bs4/builder/_html5lib.py ++++ b/bs4/builder/_html5lib.py +@@ -33,7 +33,7 @@ try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +-except ImportError, e: ++except ImportError as e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True +@@ -79,7 +79,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + self.underlying_builder.parser = parser + extra_kwargs = dict() +- if not isinstance(markup, unicode): ++ if not isinstance(markup, str): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: +@@ -87,13 +87,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): + doc = parser.parse(markup, **extra_kwargs) + + # Set the character encoding detected by the tokenizer. +- if isinstance(markup, unicode): ++ if isinstance(markup, str): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + original_encoding = parser.tokenizer.stream.charEncoding[0] +- if not isinstance(original_encoding, basestring): ++ if not isinstance(original_encoding, str): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. +@@ -110,7 +110,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" +- return u'%s' % fragment ++ return '%s' % fragment + + + class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): +@@ -217,7 +217,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] +- for name, value in element.attrs.items(): ++ for name, value in list(element.attrs.items()): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): +@@ -272,7 +272,7 @@ class Element(treebuilder_base.Node): + + def appendChild(self, node): + string_child = child = None +- if isinstance(node, basestring): ++ if isinstance(node, str): + # Some other piece of code decided to pass in a string + # instead of creating a TextElement object to contain the + # string. 
+@@ -289,7 +289,7 @@ class Element(treebuilder_base.Node): + child = node.element + node.parent = self + +- if not isinstance(child, basestring) and child.parent is not None: ++ if not isinstance(child, str) and child.parent is not None: + node.element.extract() + + if (string_child is not None and self.element.contents +@@ -302,7 +302,7 @@ class Element(treebuilder_base.Node): + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: +- if isinstance(node, basestring): ++ if isinstance(node, str): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + +@@ -340,7 +340,7 @@ class Element(treebuilder_base.Node): + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) +- for name, value in attributes.items(): ++ for name, value in list(attributes.items()): + self.element[name] = value + + # The attributes may contain variables that need substitution. +diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py +index 96a7b7d..88860a9 100644 +--- a/bs4/builder/_htmlparser.py ++++ b/bs4/builder/_htmlparser.py +@@ -8,11 +8,11 @@ __all__ = [ + 'HTMLParserTreeBuilder', + ] + +-from HTMLParser import HTMLParser ++from html.parser import HTMLParser + + try: +- from HTMLParser import HTMLParseError +-except ImportError, e: ++ from html.parser import HTMLParseError ++except ImportError as e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): +@@ -219,14 +219,14 @@ class BeautifulSoupHTMLParser(HTMLParser): + continue + try: + data = bytearray([real_name]).decode(encoding) +- except UnicodeDecodeError, e: ++ except UnicodeDecodeError as e: + pass + if not data: + try: +- data = unichr(real_name) +- except (ValueError, OverflowError), e: ++ data = chr(real_name) ++ except (ValueError, OverflowError) as e: + pass +- data = data or u"\N{REPLACEMENT CHARACTER}" ++ data = data or "\N{REPLACEMENT CHARACTER}" + self.handle_data(data) + + def handle_entityref(self, name): +@@ -353,7 +353,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): + document to Unicode and parsing it. Each strategy will be tried + in turn. + """ +- if isinstance(markup, unicode): ++ if isinstance(markup, str): + # Parse Unicode as-is. + yield (markup, None, None, False) + return +@@ -376,7 +376,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): + try: + parser.feed(markup) + parser.close() +- except HTMLParseError, e: ++ except HTMLParseError as e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e +diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py +index 1b44d75..432a2c8 100644 +--- a/bs4/builder/_lxml.py ++++ b/bs4/builder/_lxml.py +@@ -8,11 +8,11 @@ __all__ = [ + + try: + from collections.abc import Callable # Python 3.6 +-except ImportError , e: ++except ImportError as e: + from collections import Callable + + from io import BytesIO +-from StringIO import StringIO ++from io import StringIO + from lxml import etree + from bs4.element import ( + Comment, +@@ -35,7 +35,7 @@ LXML = 'lxml' + + def _invert(d): + "Invert a dictionary." 
+- return dict((v,k) for k, v in d.items()) ++ return dict((v,k) for k, v in list(d.items())) + + class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser +@@ -81,7 +81,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + + :param mapping: A dictionary mapping namespace prefixes to URIs. + """ +- for key, value in mapping.items(): ++ for key, value in list(mapping.items()): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same +@@ -169,12 +169,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): + else: + self.processing_instruction_class = XMLProcessingInstruction + +- if isinstance(markup, unicode): ++ if isinstance(markup, str): + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + +- if isinstance(markup, unicode): ++ if isinstance(markup, str): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. + yield (markup.encode("utf8"), "utf8", +@@ -189,7 +189,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + def feed(self, markup): + if isinstance(markup, bytes): + markup = BytesIO(markup) +- elif isinstance(markup, unicode): ++ elif isinstance(markup, str): + markup = StringIO(markup) + + # Call feed() at least once, even if the markup is empty, +@@ -204,7 +204,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + if len(data) != 0: + self.parser.feed(data) + self.parser.close() +- except (UnicodeDecodeError, LookupError, etree.ParserError), e: ++ except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(e) + + def close(self): +@@ -233,7 +233,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() +- for prefix, namespace in nsmap.items(): ++ for prefix, namespace in list(nsmap.items()): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace +@@ -242,7 +242,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} +- for attr, value in attrs.items(): ++ for attr, value in list(attrs.items()): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value +@@ -302,7 +302,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" +- return u'\n%s' % fragment ++ return '\n%s' % fragment + + + class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): +@@ -323,10 +323,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() +- except (UnicodeDecodeError, LookupError, etree.ParserError), e: ++ except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(e) + + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" +- return u'%s' % fragment ++ return '%s' % fragment +diff --git a/bs4/dammit.py b/bs4/dammit.py +index 33f7b7d..ee3708f 100644 +--- a/bs4/dammit.py ++++ b/bs4/dammit.py +@@ -10,7 +10,7 @@ XML or HTML to reflect a new encoding; that's the tree builder's job. 
+ __license__ = "MIT" + + import codecs +-from htmlentitydefs import codepoint2name ++from html.entities import codepoint2name + import re + import logging + import string +@@ -22,7 +22,7 @@ try: + # PyPI package: cchardet + import cchardet + def chardet_dammit(s): +- if isinstance(s, unicode): ++ if isinstance(s, str): + return None + return cchardet.detect(s)['encoding'] + except ImportError: +@@ -32,7 +32,7 @@ except ImportError: + # PyPI package: chardet + import chardet + def chardet_dammit(s): +- if isinstance(s, unicode): ++ if isinstance(s, str): + return None + return chardet.detect(s)['encoding'] + #import chardet.constants +@@ -53,14 +53,14 @@ except ImportError: + + # Build bytestring and Unicode versions of regular expressions for finding + # a declared encoding inside an XML or HTML document. +-xml_encoding = u'^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' +-html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' ++xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' ++html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' + encoding_res = dict() + encoding_res[bytes] = { + 'html' : re.compile(html_meta.encode("ascii"), re.I), + 'xml' : re.compile(xml_encoding.encode("ascii"), re.I), + } +-encoding_res[unicode] = { ++encoding_res[str] = { + 'html' : re.compile(html_meta, re.I), + 'xml' : re.compile(xml_encoding, re.I) + } +@@ -80,7 +80,7 @@ class EntitySubstitution(object): + # entities, but that's a little tricky. + extra = [(39, 'apos')] + for codepoint, name in list(codepoint2name.items()) + extra: +- character = unichr(codepoint) ++ character = chr(codepoint) + if codepoint not in (34, 39): + # There's no point in turning the quotation mark into + # " or the single quote into ', unless it +@@ -323,7 +323,7 @@ class EncodingDetector: + :return: A 2-tuple (modified data, implied encoding) + """ + encoding = None +- if isinstance(data, unicode): ++ if isinstance(data, str): + # Unicode data cannot have a byte-order mark. + return data, encoding + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ +@@ -370,7 +370,7 @@ class EncodingDetector: + if isinstance(markup, bytes): + res = encoding_res[bytes] + else: +- res = encoding_res[unicode] ++ res = encoding_res[str] + + xml_re = res['xml'] + html_re = res['html'] +@@ -431,9 +431,9 @@ class UnicodeDammit: + markup, override_encodings, is_html, exclude_encodings) + + # Short-circuit if the data is in Unicode to begin with. +- if isinstance(markup, unicode) or markup == '': ++ if isinstance(markup, str) or markup == '': + self.markup = markup +- self.unicode_markup = unicode(markup) ++ self.unicode_markup = str(markup) + self.original_encoding = None + return + +@@ -523,7 +523,7 @@ class UnicodeDammit: + + :param encoding: The name of an encoding. + """ +- return unicode(data, encoding, errors) ++ return str(data, encoding, errors) + + @property + def declared_html_encoding(self): +diff --git a/bs4/diagnose.py b/bs4/diagnose.py +index e4f2f47..500e92d 100644 +--- a/bs4/diagnose.py ++++ b/bs4/diagnose.py +@@ -4,8 +4,8 @@ + __license__ = "MIT" + + import cProfile +-from StringIO import StringIO +-from HTMLParser import HTMLParser ++from io import StringIO ++from html.parser import HTMLParser + import bs4 + from bs4 import BeautifulSoup, __version__ + from bs4.builder import builder_registry +@@ -25,8 +25,8 @@ def diagnose(data): + :param data: A string containing markup that needs to be explained. + :return: None; diagnostics are printed to standard output. 
+ """ +- print("Diagnostic running on Beautiful Soup %s" % __version__) +- print("Python version %s" % sys.version) ++ print(("Diagnostic running on Beautiful Soup %s" % __version__)) ++ print(("Python version %s" % sys.version)) + + basic_parsers = ["html.parser", "html5lib", "lxml"] + for name in basic_parsers: +@@ -35,16 +35,16 @@ def diagnose(data): + break + else: + basic_parsers.remove(name) +- print( ++ print(( + "I noticed that %s is not installed. Installing it may help." % +- name) ++ name)) + + if 'lxml' in basic_parsers: + basic_parsers.append("lxml-xml") + try: + from lxml import etree +- print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) +- except ImportError, e: ++ print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))) ++ except ImportError as e: + print( + "lxml is not installed or couldn't be imported.") + +@@ -52,21 +52,21 @@ def diagnose(data): + if 'html5lib' in basic_parsers: + try: + import html5lib +- print("Found html5lib version %s" % html5lib.__version__) +- except ImportError, e: ++ print(("Found html5lib version %s" % html5lib.__version__)) ++ except ImportError as e: + print( + "html5lib is not installed or couldn't be imported.") + + if hasattr(data, 'read'): + data = data.read() + elif data.startswith("http:") or data.startswith("https:"): +- print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) ++ print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") + return + else: + try: + if os.path.exists(data): +- print('"%s" looks like a filename. Reading data from the file.' % data) ++ print(('"%s" looks like a filename. Reading data from the file.' % data)) + with open(data) as fp: + data = fp.read() + except ValueError: +@@ -76,19 +76,19 @@ def diagnose(data): + print("") + + for parser in basic_parsers: +- print("Trying to parse your markup with %s" % parser) ++ print(("Trying to parse your markup with %s" % parser)) + success = False + try: + soup = BeautifulSoup(data, features=parser) + success = True +- except Exception, e: +- print("%s could not parse the markup." % parser) ++ except Exception as e: ++ print(("%s could not parse the markup." % parser)) + traceback.print_exc() + if success: +- print("Here's what %s did with the markup:" % parser) +- print(soup.prettify()) ++ print(("Here's what %s did with the markup:" % parser)) ++ print((soup.prettify())) + +- print("-" * 80) ++ print(("-" * 80)) + + def lxml_trace(data, html=True, **kwargs): + """Print out the lxml events that occur during parsing. +@@ -104,7 +104,7 @@ def lxml_trace(data, html=True, **kwargs): + """ + from lxml import etree + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): +- print("%s, %4s, %s" % (event, element.tag, element.text)) ++ print(("%s, %4s, %s" % (event, element.tag, element.text))) + + class AnnouncingParser(HTMLParser): + """Subclass of HTMLParser that announces parse events, without doing +@@ -193,9 +193,9 @@ def rdoc(num_elements=1000): + + def benchmark_parsers(num_elements=100000): + """Very basic head-to-head performance benchmark.""" +- print("Comparative parser benchmark on Beautiful Soup %s" % __version__) ++ print(("Comparative parser benchmark on Beautiful Soup %s" % __version__)) + data = rdoc(num_elements) +- print("Generated a large invalid HTML document (%d bytes)." 
% len(data)) ++ print(("Generated a large invalid HTML document (%d bytes)." % len(data))) + + for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: + success = False +@@ -204,24 +204,24 @@ def benchmark_parsers(num_elements=100000): + soup = BeautifulSoup(data, parser) + b = time.time() + success = True +- except Exception, e: +- print("%s could not parse the markup." % parser) ++ except Exception as e: ++ print(("%s could not parse the markup." % parser)) + traceback.print_exc() + if success: +- print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) ++ print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a))) + + from lxml import etree + a = time.time() + etree.HTML(data) + b = time.time() +- print("Raw lxml parsed the markup in %.2fs." % (b-a)) ++ print(("Raw lxml parsed the markup in %.2fs." % (b-a))) + + import html5lib + parser = html5lib.HTMLParser() + a = time.time() + parser.parse(data) + b = time.time() +- print("Raw html5lib parsed the markup in %.2fs." % (b-a)) ++ print(("Raw html5lib parsed the markup in %.2fs." % (b-a))) + + def profile(num_elements=100000, parser="lxml"): + """Use Python's profiler on a randomly generated document.""" +diff --git a/bs4/element.py b/bs4/element.py +index 09a81d9..81d9db9 100644 +--- a/bs4/element.py ++++ b/bs4/element.py +@@ -3,14 +3,14 @@ __license__ = "MIT" + + try: + from collections.abc import Callable # Python 3.6 +-except ImportError , e: ++except ImportError as e: + from collections import Callable + import re + import sys + import warnings + try: + import soupsieve +-except ImportError, e: ++except ImportError as e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' +@@ -57,22 +57,22 @@ def _alias(attr): + # Source: + # https://docs.python.org/3/library/codecs.html#python-specific-encodings + PYTHON_SPECIFIC_ENCODINGS = set([ +- u"idna", +- u"mbcs", +- u"oem", +- u"palmos", +- u"punycode", +- u"raw_unicode_escape", +- u"undefined", +- u"unicode_escape", +- u"raw-unicode-escape", +- u"unicode-escape", +- u"string-escape", +- u"string_escape", ++ "idna", ++ "mbcs", ++ "oem", ++ "palmos", ++ "punycode", ++ "raw_unicode_escape", ++ "undefined", ++ "unicode_escape", ++ "raw-unicode-escape", ++ "unicode-escape", ++ "string-escape", ++ "string_escape", + ]) + + +-class NamespacedAttribute(unicode): ++class NamespacedAttribute(str): + """A namespaced string (e.g. 'xml:lang') that remembers the namespace + ('xml') and the name ('lang') that were used to create it. + """ +@@ -84,18 +84,18 @@ class NamespacedAttribute(unicode): + name = None + + if name is None: +- obj = unicode.__new__(cls, prefix) ++ obj = str.__new__(cls, prefix) + elif prefix is None: + # Not really namespaced. 
+- obj = unicode.__new__(cls, name) ++ obj = str.__new__(cls, name) + else: +- obj = unicode.__new__(cls, prefix + ":" + name) ++ obj = str.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + +-class AttributeValueWithCharsetSubstitution(unicode): ++class AttributeValueWithCharsetSubstitution(str): + """A stand-in object for a character encoding specified in HTML.""" + + class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): +@@ -106,7 +106,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """ + + def __new__(cls, original_value): +- obj = unicode.__new__(cls, original_value) ++ obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + +@@ -134,9 +134,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. +- return unicode.__new__(unicode, original_value) ++ return str.__new__(str, original_value) + +- obj = unicode.__new__(cls, original_value) ++ obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + +@@ -376,7 +376,7 @@ class PageElement(object): + raise ValueError("Cannot insert None into a tag.") + if new_child is self: + raise ValueError("Cannot insert a tag into itself.") +- if (isinstance(new_child, basestring) ++ if (isinstance(new_child, str) + and not isinstance(new_child, NavigableString)): + new_child = NavigableString(new_child) + +@@ -753,7 +753,7 @@ class PageElement(object): + result = (element for element in generator + if isinstance(element, Tag)) + return ResultSet(strainer, result) +- elif isinstance(name, basestring): ++ elif isinstance(name, str): + # Optimization to find all tags with a given name. + if name.count(':') == 1: + # This is a name with a prefix. If this is a namespace-aware document, +@@ -872,7 +872,7 @@ class PageElement(object): + return self.parents + + +-class NavigableString(unicode, PageElement): ++class NavigableString(str, PageElement): + """A Python Unicode string that is part of a parse tree. + + When Beautiful Soup parses the markup penguin, it will +@@ -895,10 +895,10 @@ class NavigableString(unicode, PageElement): + passed in to the superclass's __new__ or the superclass won't know + how to handle non-ASCII characters. + """ +- if isinstance(value, unicode): +- u = unicode.__new__(cls, value) ++ if isinstance(value, str): ++ u = str.__new__(cls, value) + else: +- u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) ++ u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u.setup() + return u + +@@ -909,7 +909,7 @@ class NavigableString(unicode, PageElement): + return type(self)(self) + + def __getnewargs__(self): +- return (unicode(self),) ++ return (str(self),) + + def __getattr__(self, attr): + """text.string gives you text. 
This is for backwards +@@ -975,30 +975,30 @@ class PreformattedString(NavigableString): + + class CData(PreformattedString): + """A CDATA block.""" +- PREFIX = u'' ++ PREFIX = '' + + class ProcessingInstruction(PreformattedString): + """A SGML processing instruction.""" + +- PREFIX = u'' ++ PREFIX = '' + + class XMLProcessingInstruction(ProcessingInstruction): + """An XML processing instruction.""" +- PREFIX = u'' ++ PREFIX = '' + + class Comment(PreformattedString): + """An HTML or XML comment.""" +- PREFIX = u'' ++ PREFIX = '' + + + class Declaration(PreformattedString): + """An XML declaration.""" +- PREFIX = u'' ++ PREFIX = '' + + + class Doctype(PreformattedString): +@@ -1026,8 +1026,8 @@ class Doctype(PreformattedString): + + return Doctype(value) + +- PREFIX = u'\n' ++ PREFIX = '\n' + + + class Stylesheet(NavigableString): +@@ -1263,7 +1263,7 @@ class Tag(PageElement): + for string in self._all_strings(True): + yield string + +- def get_text(self, separator=u"", strip=False, ++ def get_text(self, separator="", strip=False, + types=(NavigableString, CData)): + """Get all child strings, concatenated using the given separator. + +@@ -1416,7 +1416,7 @@ class Tag(PageElement): + def __contains__(self, x): + return x in self.contents + +- def __nonzero__(self): ++ def __bool__(self): + "A tag is non-None even if it has no contents." + return True + +@@ -1565,8 +1565,8 @@ class Tag(PageElement): + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) +- elif not isinstance(val, basestring): +- val = unicode(val) ++ elif not isinstance(val, str): ++ val = str(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None +@@ -1575,7 +1575,7 @@ class Tag(PageElement): + + text = formatter.attribute_value(val) + decoded = ( +- unicode(key) + '=' ++ str(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) + close = '' +@@ -1934,7 +1934,7 @@ class SoupStrainer(object): + else: + attrs = kwargs + normalized_attrs = {} +- for key, value in attrs.items(): ++ for key, value in list(attrs.items()): + normalized_attrs[key] = self._normalize_search_value(value) + + self.attrs = normalized_attrs +@@ -1943,7 +1943,7 @@ class SoupStrainer(object): + def _normalize_search_value(self, value): + # Leave it alone if it's a Unicode string, a callable, a + # regular expression, a boolean, or None. +- if (isinstance(value, unicode) or isinstance(value, Callable) or hasattr(value, 'match') ++ if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') + or isinstance(value, bool) or value is None): + return value + +@@ -1956,7 +1956,7 @@ class SoupStrainer(object): + new_value = [] + for v in value: + if (hasattr(v, '__iter__') and not isinstance(v, bytes) +- and not isinstance(v, unicode)): ++ and not isinstance(v, str)): + # This is almost certainly the user's mistake. In the + # interests of avoiding infinite loops, we'll let + # it through as-is rather than doing a recursive call. +@@ -1968,7 +1968,7 @@ class SoupStrainer(object): + # Otherwise, convert it into a Unicode string. + # The unicode(str()) thing is so this will do the same thing on Python 2 + # and Python 3. 
+- return unicode(str(value)) ++ return str(str(value)) + + def __str__(self): + """A human-readable representation of this SoupStrainer.""" +@@ -1996,7 +1996,7 @@ class SoupStrainer(object): + markup = markup_name + markup_attrs = markup + +- if isinstance(self.name, basestring): ++ if isinstance(self.name, str): + # Optimization for a very common case where the user is + # searching for a tag with one specific name, and we're + # looking at a tag with a different name. +@@ -2052,7 +2052,7 @@ class SoupStrainer(object): + found = None + # If given a list of items, scan it for a text element that + # matches. +- if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): ++ if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): +@@ -2065,7 +2065,7 @@ class SoupStrainer(object): + found = self.search_tag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ +- isinstance(markup, basestring): ++ isinstance(markup, str): + if not self.name and not self.attrs and self._matches(markup, self.text): + found = markup + else: +@@ -2110,7 +2110,7 @@ class SoupStrainer(object): + return not match_against + + if (hasattr(match_against, '__iter__') +- and not isinstance(match_against, basestring)): ++ and not isinstance(match_against, str)): + # We're asked to match against an iterable of items. + # The markup must be match at least one item in the + # iterable. We'll try each one in turn. +@@ -2137,7 +2137,7 @@ class SoupStrainer(object): + # the tag's name and once against its prefixed name. + match = False + +- if not match and isinstance(match_against, unicode): ++ if not match and isinstance(match_against, str): + # Exact string match + match = markup == match_against + +diff --git a/bs4/formatter.py b/bs4/formatter.py +index 9a692ec..2cbab4c 100644 +--- a/bs4/formatter.py ++++ b/bs4/formatter.py +@@ -83,7 +83,7 @@ class Formatter(EntitySubstitution): + """ + if not self.entity_substitution: + return ns +- from element import NavigableString ++ from .element import NavigableString + if (isinstance(ns, NavigableString) + and ns.parent is not None + and ns.parent.name in self.cdata_containing_tags): +diff --git a/bs4/testing.py b/bs4/testing.py +index a2f83a1..9ca507b 100644 +--- a/bs4/testing.py ++++ b/bs4/testing.py +@@ -25,7 +25,7 @@ from bs4.element import ( + from bs4.builder import HTMLParserTreeBuilder + default_builder = HTMLParserTreeBuilder + +-BAD_DOCUMENT = u"""A bare string ++BAD_DOCUMENT = """A bare string + + +
+@@ -94,7 +94,7 @@ class SoupTest(unittest.TestCase): + # Verify that every tag that was opened was eventually closed. + + # There are no tags in the open tag counter. +- assert all(v==0 for v in obj.open_tag_counter.values()) ++ assert all(v==0 for v in list(obj.open_tag_counter.values())) + + # The only tag in the tag stack is the one for the root + # document. +@@ -372,7 +372,7 @@ class HTMLTreeBuilderSmokeTest(object): + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. +- markup = u"""""" ++ markup = """""" + soup = self.soup(markup) + self.assertEqual(markup, soup.decode()) + +@@ -544,14 +544,14 @@ Hello, world! + # "&T" and "&p" look like incomplete character entities, but they are + # not. + self.assertSoupEquals( +- u"

<p>&bull; AT&T is in the s&p 500</p>",
+- u"<p>\u2022 AT&T is in the s&p 500</p>"
++ "<p>&bull; AT&T is in the s&p 500</p>",
++ "<p>\u2022 AT&T is in the s&p 500</p>
" + ) + + def test_apos_entity(self): + self.assertSoupEquals( +- u"

<p>Bob&apos;s Bar</p>",
+- u"<p>Bob's Bar</p>",
++ "<p>Bob&apos;s Bar</p>",
++ "<p>Bob's Bar</p>
", + ) + + def test_entities_in_foreign_document_encoding(self): +@@ -564,17 +564,17 @@ Hello, world! + # characters. + markup = "

<p>&#147;Hello&#148; &#45;&#9731;</p>
" + soup = self.soup(markup) +- self.assertEquals(u"“Hello” -☃", soup.p.string) ++ self.assertEqual("“Hello” -☃", soup.p.string) + + def test_entities_in_attributes_converted_to_unicode(self): +- expect = u'

<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
++ expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
+ self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
+ self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
+ self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
+ self.assertSoupEquals('<p id="pi&ntilde;ata"></p>
', expect) + + def test_entities_in_text_converted_to_unicode(self): +- expect = u'

<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
++ expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
+ self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
+ self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
+ self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
+@@ -585,7 +585,7 @@ Hello, world!
+ '<p>I said "good day!"</p>')
+
+ def test_out_of_range_entity(self):
+- expect = u"\N{REPLACEMENT CHARACTER}"
++ expect = "\N{REPLACEMENT CHARACTER}"
+ self.assertSoupEquals("&#10000000000000;", expect)
+ self.assertSoupEquals("&#x10000000000000;", expect)
+ self.assertSoupEquals("&#1000000000;", expect)
+@@ -663,9 +663,9 @@ Hello, world!
+ # A seemingly innocuous document... but it's in Unicode! And
+ # it contains characters that can't be represented in the
+ # encoding found in the declaration! The horror!
+- markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body></html>'
++ markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body></html>'
+ soup = self.soup(markup)
+- self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
++ self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
+
+ def test_soupstrainer(self):
+ """Parsers should be able to work with SoupStrainers."""
+@@ -705,7 +705,7 @@ Hello, world!
+ # Both XML and HTML entities are converted to Unicode characters
+ # during parsing.
+ text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
+- expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
++ expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
+ self.assertSoupEquals(text, expected)
+
+ def test_smart_quotes_converted_on_the_way_in(self):
+@@ -715,15 +715,15 @@ Hello, world!
+ soup = self.soup(quote)
+ self.assertEqual(
+ soup.p.string,
+- u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
++ "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+
+ def test_non_breaking_spaces_converted_on_the_way_in(self):
+ soup = self.soup("<a>&nbsp;&nbsp;</a>")
+- self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
++ self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
+
+ def test_entities_converted_on_the_way_out(self):
+ text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
+- expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
++ expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>
".encode("utf-8") + soup = self.soup(text) + self.assertEqual(soup.p.encode("utf-8"), expected) + +@@ -732,7 +732,7 @@ Hello, world! + # easy-to-understand document. + + # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. +- unicode_html = u'

<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
++ unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+
+ # That's because we're going to encode it into ISO-Latin-1, and use
+ # that to test.
+@@ -848,8 +848,8 @@ Hello, world!
+ soup = self.soup(markup)
+ for encoding in PYTHON_SPECIFIC_ENCODINGS:
+ if encoding in (
+- u'idna', u'mbcs', u'oem', u'undefined',
+- u'string_escape', u'string-escape'
++ 'idna', 'mbcs', 'oem', 'undefined',
++ 'string_escape', 'string-escape'
+ ):
+ # For one reason or another, these will raise an
+ # exception if we actually try to use them, so don't
+@@ -910,8 +910,8 @@ class XMLTreeBuilderSmokeTest(object):
+ soup = self.soup(markup)
+ for encoding in PYTHON_SPECIFIC_ENCODINGS:
+ if encoding in (
+- u'idna', u'mbcs', u'oem', u'undefined',
+- u'string_escape', u'string-escape'
++ 'idna', 'mbcs', 'oem', 'undefined',
++ 'string_escape', 'string-escape'
+ ):
+ # For one reason or another, these will raise an
+ # exception if we actually try to use them, so don't
+@@ -962,15 +962,15 @@ class XMLTreeBuilderSmokeTest(object):
+ self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
+
+ def test_can_parse_unicode_document(self):
+- markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
++ markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+ soup = self.soup(markup)
+- self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
++ self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
+
+ def test_popping_namespaced_tag(self):
+ markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:description>c</dc:description>d</rss>'
+ soup = self.soup(markup)
+ self.assertEqual(
+- unicode(soup.rss), markup)
++ str(soup.rss), markup)
+
+ def test_docstring_includes_correct_encoding(self):
+ soup = self.soup("<root/>")
+@@ -1001,17 +1001,17 @@ class XMLTreeBuilderSmokeTest(object):
+ def test_closing_namespaced_tag(self):
+ markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
+ soup = self.soup(markup)
+- self.assertEqual(unicode(soup.p), markup)
++ self.assertEqual(str(soup.p), markup)
+
+ def test_namespaced_attributes(self):
+ markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
+ soup = self.soup(markup)
+- self.assertEqual(unicode(soup.foo), markup)
++ self.assertEqual(str(soup.foo), markup)
+
+ def test_namespaced_attributes_xml_namespace(self):
+ markup = '<foo xml:lang="fr">bar</foo>'
+ soup = self.soup(markup)
+- self.assertEqual(unicode(soup.foo), markup)
++ self.assertEqual(str(soup.foo), markup)
+
+ def test_find_by_prefixed_name(self):
+ doc = """
+diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
+index 7b0a6d4..b77659b 100644
+--- a/bs4/tests/test_html5lib.py
++++ b/bs4/tests/test_html5lib.py
+@@ -5,7 +5,7 @@ import warnings
+ try:
+ from bs4.builder import HTML5TreeBuilder
+ HTML5LIB_PRESENT = True
+-except ImportError, e:
++except ImportError as e:
+ HTML5LIB_PRESENT = False
+ from bs4.element import SoupStrainer
+ from bs4.testing import (
+@@ -74,14 +74,14 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
+ def test_reparented_markup(self):
+ markup = '

<p><em>foo</p>\n<p>bar<a></a></em></p>'
+ soup = self.soup(markup)
+- self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
++ self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
+ self.assertEqual(2, len(soup.find_all('p')))
+
+
+ def test_reparented_markup_ends_with_whitespace(self):
+ markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
+ soup = self.soup(markup)
+- self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
++ self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
+ self.assertEqual(2, len(soup.find_all('p')))
+
+ def test_reparented_markup_containing_identical_whitespace_nodes(self):
+@@ -127,7 +127,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
+ def test_foster_parenting(self):
+ markup = b"""<table><td></tbody>A"""
+ soup = self.soup(markup)
+- self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
++ self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>
", soup.body.decode()) + + def test_extraction(self): + """ +diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py +index 7ee91aa..aeff094 100644 +--- a/bs4/tests/test_htmlparser.py ++++ b/bs4/tests/test_htmlparser.py +@@ -60,20 +60,20 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): + # If you don't provide any particular value for + # on_duplicate_attribute, later values replace earlier values. + soup = self.soup(markup) +- self.assertEquals("url3", soup.a['href']) +- self.assertEquals(["cls"], soup.a['class']) +- self.assertEquals("id", soup.a['id']) ++ self.assertEqual("url3", soup.a['href']) ++ self.assertEqual(["cls"], soup.a['class']) ++ self.assertEqual("id", soup.a['id']) + + # You can also get this behavior explicitly. + def assert_attribute(on_duplicate_attribute, expected): + soup = self.soup( + markup, on_duplicate_attribute=on_duplicate_attribute + ) +- self.assertEquals(expected, soup.a['href']) ++ self.assertEqual(expected, soup.a['href']) + + # Verify that non-duplicate attributes are treated normally. +- self.assertEquals(["cls"], soup.a['class']) +- self.assertEquals("id", soup.a['id']) ++ self.assertEqual(["cls"], soup.a['class']) ++ self.assertEqual("id", soup.a['id']) + assert_attribute(None, "url3") + assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3") + +diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py +index f96e4ae..3d0c75f 100644 +--- a/bs4/tests/test_lxml.py ++++ b/bs4/tests/test_lxml.py +@@ -7,7 +7,7 @@ try: + import lxml.etree + LXML_PRESENT = True + LXML_VERSION = lxml.etree.LXML_VERSION +-except ImportError, e: ++except ImportError as e: + LXML_PRESENT = False + LXML_VERSION = (0,) + +@@ -68,7 +68,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): + # if one is installed. + with warnings.catch_warnings(record=True) as w: + soup = BeautifulStoneSoup("") +- self.assertEqual(u"", unicode(soup.b)) ++ self.assertEqual("", str(soup.b)) + self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) + + def test_tracking_line_numbers(self): +diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py +index 857eb41..e1035ea 100644 +--- a/bs4/tests/test_soup.py ++++ b/bs4/tests/test_soup.py +@@ -51,17 +51,17 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) + class TestConstructor(SoupTest): + + def test_short_unicode_input(self): +- data = u"

<h1>éé</h1>"
++ data = "<h1>éé</h1>"
+ soup = self.soup(data)
+- self.assertEqual(u"éé", soup.h1.string)
++ self.assertEqual("éé", soup.h1.string)
+
+ def test_embedded_null(self):
+- data = u"<h1>foo\0bar</h1>"
++ data = "<h1>foo\0bar</h1>
" + soup = self.soup(data) +- self.assertEqual(u"foo\0bar", soup.h1.string) ++ self.assertEqual("foo\0bar", soup.h1.string) + + def test_exclude_encodings(self): +- utf8_data = u"Räksmörgås".encode("utf-8") ++ utf8_data = "Räksmörgås".encode("utf-8") + soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual("windows-1252", soup.original_encoding) + +@@ -127,7 +127,7 @@ class TestConstructor(SoupTest): + yield markup, None, None, False + + import re +- self.assertRaisesRegexp( ++ self.assertRaisesRegex( + ParserRejectedMarkup, + "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.", + BeautifulSoup, '', builder=Mock, +@@ -303,7 +303,7 @@ class TestWarnings(SoupTest): + with warnings.catch_warnings(record=True) as warning_list: + # note - this url must differ from the bytes one otherwise + # python's warnings system swallows the second warning +- soup = self.soup(u"http://www.crummyunicode.com/") ++ soup = self.soup("http://www.crummyunicode.com/") + warning = self._assert_warning( + warning_list, MarkupResemblesLocatorWarning + ) +@@ -319,7 +319,7 @@ class TestWarnings(SoupTest): + + def test_url_warning_with_unicode_and_space(self): + with warnings.catch_warnings(record=True) as warning_list: +- soup = self.soup(u"http://www.crummyuncode.com/ is great") ++ soup = self.soup("http://www.crummyuncode.com/ is great") + self.assertFalse(any("looks like a URL" in str(w.message) + for w in warning_list)) + +@@ -341,9 +341,9 @@ class TestEntitySubstitution(unittest.TestCase): + def test_simple_html_substitution(self): + # Unicode characters corresponding to named HTML entites + # are substituted, and no others. +- s = u"foo\u2200\N{SNOWMAN}\u00f5bar" ++ s = "foo\u2200\N{SNOWMAN}\u00f5bar" + self.assertEqual(self.sub.substitute_html(s), +- u"foo∀\N{SNOWMAN}õbar") ++ "foo∀\N{SNOWMAN}õbar") + + def test_smart_quote_substitution(self): + # MS smart quotes are a common source of frustration, so we +@@ -408,7 +408,7 @@ class TestEncodingConversion(SoupTest): + + def setUp(self): + super(TestEncodingConversion, self).setUp() +- self.unicode_data = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' ++ self.unicode_data = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' + self.utf8_data = self.unicode_data.encode("utf-8") + # Just so you know what it looks like. + self.assertEqual( +@@ -428,7 +428,7 @@ class TestEncodingConversion(SoupTest): + ascii = b"a" + soup_from_ascii = self.soup(ascii) + unicode_output = soup_from_ascii.decode() +- self.assertTrue(isinstance(unicode_output, unicode)) ++ self.assertTrue(isinstance(unicode_output, str)) + self.assertEqual(unicode_output, self.document_for(ascii.decode())) + self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") + finally: +@@ -440,7 +440,7 @@ class TestEncodingConversion(SoupTest): + # is not set. + soup_from_unicode = self.soup(self.unicode_data) + self.assertEqual(soup_from_unicode.decode(), self.unicode_data) +- self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') ++ self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') + self.assertEqual(soup_from_unicode.original_encoding, None) + + def test_utf8_in_unicode_out(self): +@@ -448,7 +448,7 @@ class TestEncodingConversion(SoupTest): + # attribute is set. 
+ soup_from_utf8 = self.soup(self.utf8_data) + self.assertEqual(soup_from_utf8.decode(), self.unicode_data) +- self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') ++ self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') + + def test_utf8_out(self): + # The internal data structures can be encoded as UTF-8. +@@ -459,7 +459,7 @@ class TestEncodingConversion(SoupTest): + PYTHON_3_PRE_3_2, + "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") + def test_attribute_name_containing_unicode_characters(self): +- markup = u'
<div><a \N{CYRILLIC SMALL LETTER A}\N{CYRILLIC SMALL LETTER BE}\N{CYRILLIC SMALL LETTER VE}="value"/></div>'
++ markup = '<div><a \N{CYRILLIC SMALL LETTER A}\N{CYRILLIC SMALL LETTER BE}\N{CYRILLIC SMALL LETTER VE}="value"/></div>
' + self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) + + class TestUnicodeDammit(unittest.TestCase): +@@ -526,7 +526,7 @@ class TestUnicodeDammit(unittest.TestCase): + + def test_exclude_encodings(self): + # This is UTF-8. +- utf8_data = u"Räksmörgås".encode("utf-8") ++ utf8_data = "Räksmörgås".encode("utf-8") + + # But if we exclude UTF-8 from consideration, the guess is + # Windows-1252. +diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py +index 2246346..b4f2a86 100644 +--- a/bs4/tests/test_tree.py ++++ b/bs4/tests/test_tree.py +@@ -75,13 +75,13 @@ class TestFind(TreeTest): + self.assertEqual(soup.find("b").string, "2") + + def test_unicode_text_find(self): +- soup = self.soup(u'

<h1>Räksmörgås</h1>')
+- self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås')
++ soup = self.soup('<h1>Räksmörgås</h1>')
++ self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
+
+ def test_unicode_attribute_find(self):
+- soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>')
++ soup = self.soup('<h1 id="Räksmörgås">here it is</h1>
') + str(soup) +- self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text) ++ self.assertEqual("here it is", soup.find(id='Räksmörgås').text) + + + def test_find_everything(self): +@@ -101,17 +101,17 @@ class TestFindAll(TreeTest): + """You can search the tree for text nodes.""" + soup = self.soup("Foobar\xbb") + # Exact match. +- self.assertEqual(soup.find_all(string="bar"), [u"bar"]) +- self.assertEqual(soup.find_all(text="bar"), [u"bar"]) ++ self.assertEqual(soup.find_all(string="bar"), ["bar"]) ++ self.assertEqual(soup.find_all(text="bar"), ["bar"]) + # Match any of a number of strings. + self.assertEqual( +- soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) ++ soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"]) + # Match a regular expression. + self.assertEqual(soup.find_all(text=re.compile('.*')), +- [u"Foo", u"bar", u'\xbb']) ++ ["Foo", "bar", '\xbb']) + # Match anything. + self.assertEqual(soup.find_all(text=True), +- [u"Foo", u"bar", u'\xbb']) ++ ["Foo", "bar", '\xbb']) + + def test_find_all_limit(self): + """You can limit the number of items returned by find_all.""" +@@ -254,8 +254,8 @@ class TestFindAllByAttribute(TreeTest): + ["Matching a.", "Matching b."]) + + def test_find_all_by_utf8_attribute_value(self): +- peace = u"םולש".encode("utf8") +- data = u''.encode("utf8") ++ peace = "םולש".encode("utf8") ++ data = ''.encode("utf8") + soup = self.soup(data) + self.assertEqual([soup.a], soup.find_all(title=peace)) + self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) +@@ -444,7 +444,7 @@ class TestSmooth(TreeTest): + # output. + + # Since the tag has two children, its .string is None. +- self.assertEquals(None, div.span.string) ++ self.assertEqual(None, div.span.string) + + self.assertEqual(7, len(div.contents)) + div.smooth() +@@ -755,18 +755,18 @@ class TestTag(SoupTest): + + # No list of whitespace-preserving tags -> pretty-print + tag._preserve_whitespace_tags = None +- self.assertEquals(True, tag._should_pretty_print(0)) ++ self.assertEqual(True, tag._should_pretty_print(0)) + + # List exists but tag is not on the list -> pretty-print + tag.preserve_whitespace_tags = ["some_other_tag"] +- self.assertEquals(True, tag._should_pretty_print(1)) ++ self.assertEqual(True, tag._should_pretty_print(1)) + + # Indent level is None -> don't pretty-print +- self.assertEquals(False, tag._should_pretty_print(None)) ++ self.assertEqual(False, tag._should_pretty_print(None)) + + # Tag is on the whitespace-preserving list -> don't pretty-print + tag.preserve_whitespace_tags = ["some_other_tag", "a_tag"] +- self.assertEquals(False, tag._should_pretty_print(1)) ++ self.assertEqual(False, tag._should_pretty_print(1)) + + + class TestTagCreation(SoupTest): +@@ -905,10 +905,10 @@ class TestTreeModification(SoupTest): + assert not isinstance(i, BeautifulSoup) + + p1, p2, p3, p4 = list(soup.children) +- self.assertEquals("And now, a word:", p1.string) +- self.assertEquals("p2", p2.string) +- self.assertEquals("p3", p3.string) +- self.assertEquals("And we're back.", p4.string) ++ self.assertEqual("And now, a word:", p1.string) ++ self.assertEqual("p2", p2.string) ++ self.assertEqual("p3", p3.string) ++ self.assertEqual("And we're back.", p4.string) + + + def test_replace_with_maintains_next_element_throughout(self): +@@ -1015,8 +1015,8 @@ class TestTreeModification(SoupTest): + d1 = soup.find('div', id='d1') + d2 = soup.find('div', id='d2') + d2.extend(d1) +- self.assertEqual(u'
<div id="d1"></div>', d1.decode())
+- self.assertEqual(u'<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>', d2.decode())
++ self.assertEqual('<div id="d1"></div>
', d1.decode()) ++ self.assertEqual('', d2.decode()) + + def test_move_tag_to_beginning_of_parent(self): + data = "" +@@ -1262,7 +1262,7 @@ class TestTreeModification(SoupTest): + + """) + [soup.script.extract() for i in soup.find_all("script")] +- self.assertEqual("\n\n\n", unicode(soup.body)) ++ self.assertEqual("\n\n\n", str(soup.body)) + + + def test_extract_works_when_element_is_surrounded_by_identical_strings(self): +@@ -1524,7 +1524,7 @@ class TestPersistence(SoupTest): + soup = BeautifulSoup(b'
<p>&nbsp;</p>', 'html.parser')
+ encoding = soup.original_encoding
+ copy = soup.__copy__()
+- self.assertEqual(u"<p> </p>", unicode(copy))
++ self.assertEqual("<p> </p>
", str(copy)) + self.assertEqual(encoding, copy.original_encoding) + + def test_copy_preserves_builder_information(self): +@@ -1554,14 +1554,14 @@ class TestPersistence(SoupTest): + + def test_unicode_pickle(self): + # A tree containing Unicode characters can be pickled. +- html = u"\N{SNOWMAN}" ++ html = "\N{SNOWMAN}" + soup = self.soup(html) + dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.decode(), soup.decode()) + + def test_copy_navigablestring_is_not_attached_to_tree(self): +- html = u"FooBar" ++ html = "FooBar" + soup = self.soup(html) + s1 = soup.find(string="Foo") + s2 = copy.copy(s1) +@@ -1573,7 +1573,7 @@ class TestPersistence(SoupTest): + self.assertEqual(None, s2.previous_element) + + def test_copy_navigablestring_subclass_has_same_type(self): +- html = u"" ++ html = "" + soup = self.soup(html) + s1 = soup.string + s2 = copy.copy(s1) +@@ -1581,19 +1581,19 @@ class TestPersistence(SoupTest): + self.assertTrue(isinstance(s2, Comment)) + + def test_copy_entire_soup(self): +- html = u"
FooBar
end" ++ html = "
FooBar
end" + soup = self.soup(html) + soup_copy = copy.copy(soup) + self.assertEqual(soup, soup_copy) + + def test_copy_tag_copies_contents(self): +- html = u"
FooBar
end" ++ html = "
FooBar
end" + soup = self.soup(html) + div = soup.div + div_copy = copy.copy(div) + + # The two tags look the same, and evaluate to equal. +- self.assertEqual(unicode(div), unicode(div_copy)) ++ self.assertEqual(str(div), str(div_copy)) + self.assertEqual(div, div_copy) + + # But they're not the same object. +@@ -1609,17 +1609,17 @@ class TestPersistence(SoupTest): + class TestSubstitutions(SoupTest): + + def test_default_formatter_is_minimal(self): +- markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ++ markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + self.assertEqual( + decoded, + self.document_for( +- u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) ++ "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) + + def test_formatter_html(self): +- markup = u"
<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ++ markup = "
<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html") + self.assertEqual( +@@ -1627,7 +1627,7 @@ class TestSubstitutions(SoupTest): + self.document_for("
<<Sacré bleu!>>")) + + def test_formatter_html5(self): +- markup = u"
<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ++ markup = "
<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html5") + self.assertEqual( +@@ -1635,49 +1635,49 @@ class TestSubstitutions(SoupTest): + self.document_for("
<<Sacré bleu!>>")) + + def test_formatter_minimal(self): +- markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ++ markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + self.assertEqual( + decoded, + self.document_for( +- u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) ++ "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) + + def test_formatter_null(self): +- markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ++ markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter=None) + # Neither the angle brackets nor the e-with-acute are converted. + # This is not valid HTML, but it's what the user wanted. + self.assertEqual(decoded, +- self.document_for(u"<>")) ++ self.document_for("<>")) + + def test_formatter_custom(self): +- markup = u"<foo>bar
" ++ markup = "<foo>bar
" + soup = self.soup(markup) + decoded = soup.decode(formatter = lambda x: x.upper()) + # Instead of normal entity conversion code, the custom + # callable is called on every string. + self.assertEqual( + decoded, +- self.document_for(u"BAR
")) ++ self.document_for("BAR
")) + + def test_formatter_is_run_on_attribute_values(self): +- markup = u'e' ++ markup = 'e' + soup = self.soup(markup) + a = soup.a + +- expect_minimal = u'e' ++ expect_minimal = 'e' + + self.assertEqual(expect_minimal, a.decode()) + self.assertEqual(expect_minimal, a.decode(formatter="minimal")) + +- expect_html = u'e' ++ expect_html = 'e' + self.assertEqual(expect_html, a.decode(formatter="html")) + + self.assertEqual(markup, a.decode(formatter=None)) +- expect_upper = u'E' ++ expect_upper = 'E' + self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) + + def test_formatter_skips_script_tag_for_html_documents(self): +@@ -1703,7 +1703,7 @@ class TestSubstitutions(SoupTest): + # Everything outside the
 tag is reformatted, but everything
+         # inside is left alone.
+         self.assertEqual(
+-            u'
\n foo\n
  \tbar\n  \n  
\n baz\n \n
', ++ '
\n foo\n
  \tbar\n  \n  
\n baz\n \n
', + soup.div.prettify()) + + def test_prettify_accepts_formatter_function(self): +@@ -1713,14 +1713,14 @@ class TestSubstitutions(SoupTest): + + def test_prettify_outputs_unicode_by_default(self): + soup = self.soup("") +- self.assertEqual(unicode, type(soup.prettify())) ++ self.assertEqual(str, type(soup.prettify())) + + def test_prettify_can_encode_data(self): + soup = self.soup("") + self.assertEqual(bytes, type(soup.prettify("utf-8"))) + + def test_html_entity_substitution_off_by_default(self): +- markup = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" ++ markup = "Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" + soup = self.soup(markup) + encoded = soup.b.encode("utf-8") + self.assertEqual(encoded, markup.encode('utf-8')) +@@ -1764,48 +1764,48 @@ class TestEncoding(SoupTest): + """Test the ability to encode objects into strings.""" + + def test_unicode_string_can_be_encoded(self): +- html = u"\N{SNOWMAN}" ++ html = "\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual(soup.b.string.encode("utf-8"), +- u"\N{SNOWMAN}".encode("utf-8")) ++ "\N{SNOWMAN}".encode("utf-8")) + + def test_tag_containing_unicode_string_can_be_encoded(self): +- html = u"\N{SNOWMAN}" ++ html = "\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual( + soup.b.encode("utf-8"), html.encode("utf-8")) + + def test_encoding_substitutes_unrecognized_characters_by_default(self): +- html = u"\N{SNOWMAN}" ++ html = "\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual(soup.b.encode("ascii"), b"") + + def test_encoding_can_be_made_strict(self): +- html = u"\N{SNOWMAN}" ++ html = "\N{SNOWMAN}" + soup = self.soup(html) + self.assertRaises( + UnicodeEncodeError, soup.encode, "ascii", errors="strict") + + def test_decode_contents(self): +- html = u"\N{SNOWMAN}" ++ html = "\N{SNOWMAN}" + soup = self.soup(html) +- self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) ++ self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents()) + + def test_encode_contents(self): +- html = u"\N{SNOWMAN}" ++ html = "\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual( +- u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( ++ "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( + encoding="utf8")) + + def test_deprecated_renderContents(self): +- html = u"\N{SNOWMAN}" ++ html = "\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual( +- u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) ++ "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) + + def test_repr(self): +- html = u"\N{SNOWMAN}" ++ html = "\N{SNOWMAN}" + soup = self.soup(html) + if PY3K: + self.assertEqual(html, repr(soup)) +@@ -1993,7 +1993,7 @@ class TestSoupSelector(TreeTest): + els = self.soup.select('title') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'title') +- self.assertEqual(els[0].contents, [u'The title']) ++ self.assertEqual(els[0].contents, ['The title']) + + def test_one_tag_many(self): + els = self.soup.select('div') +@@ -2039,7 +2039,7 @@ class TestSoupSelector(TreeTest): + self.assertEqual(dashed[0]['id'], 'dash2') + + def test_dashed_tag_text(self): +- self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.') ++ self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.') + + def test_select_dashed_matches_find_all(self): + self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag')) +@@ -2225,12 +2225,12 @@ class TestSoupSelector(TreeTest): + # Try to select first paragraph + els = self.soup.select('div#inner 
p:nth-of-type(1)') + self.assertEqual(len(els), 1) +- self.assertEqual(els[0].string, u'Some text') ++ self.assertEqual(els[0].string, 'Some text') + + # Try to select third paragraph + els = self.soup.select('div#inner p:nth-of-type(3)') + self.assertEqual(len(els), 1) +- self.assertEqual(els[0].string, u'Another') ++ self.assertEqual(els[0].string, 'Another') + + # Try to select (non-existent!) fourth paragraph + els = self.soup.select('div#inner p:nth-of-type(4)') +@@ -2243,7 +2243,7 @@ class TestSoupSelector(TreeTest): + def test_nth_of_type_direct_descendant(self): + els = self.soup.select('div#inner > p:nth-of-type(1)') + self.assertEqual(len(els), 1) +- self.assertEqual(els[0].string, u'Some text') ++ self.assertEqual(els[0].string, 'Some text') + + def test_id_child_selector_nth_of_type(self): + self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) +@@ -2324,7 +2324,7 @@ class TestSoupSelector(TreeTest): + markup = '
'
+         soup = BeautifulSoup(markup, 'html.parser')
+         selected = soup.select(".c1, .c2")
+-        self.assertEquals(3, len(selected))
++        self.assertEqual(3, len(selected))
+ 
+         # Verify that find_all finds the same elements, though because
+         # of an implementation detail it finds them in a different
+diff --git a/convert-py3k b/convert-py3k
+deleted file mode 100755
+index 05fab53..0000000
+--- a/convert-py3k
++++ /dev/null
+@@ -1,16 +0,0 @@
+-#!/bin/sh
+-#
+-# The Python 2 source is the definitive source. This script uses 2to3-3.2 to
+-# create a new python3/bs4 source tree that works under Python 3.
+-#
+-# See README.txt to see how to run the test suite after conversion.
+-echo "About to destroy and rebuild the py3k/bs4 directory."
+-echo "If you've got stuff in there, Ctrl-C out of this script or answer 'n'."
+-mkdir -p py3k
+-rm -rfI py3k/bs4
+-cp -r bs4/ py3k/
+-2to3 -w py3k
+-echo ""
+-echo "OK, conversion is done."
+-echo "Now running the unit tests."
+-(cd py3k && python3 -m unittest discover -s bs4)
+\ No newline at end of file
+diff --git a/doc/source/index.rst b/doc/source/index.rst
+index 34ec7cf..b8ca011 100644
+--- a/doc/source/index.rst
++++ b/doc/source/index.rst
+@@ -170,16 +170,13 @@ Installing Beautiful Soup
+ If you're using a recent version of Debian or Ubuntu Linux, you can
+ install Beautiful Soup with the system package manager:
+ 
+-:kbd:`$ apt-get install python-bs4` (for Python 2)
+-
+-:kbd:`$ apt-get install python3-bs4` (for Python 3)
++:kbd:`$ apt-get install python3-bs4`
+ 
+ Beautiful Soup 4 is published through PyPi, so if you can't install it
+ with the system packager, you can install it with ``easy_install`` or
+-``pip``. The package name is ``beautifulsoup4``, and the same package
+-works on Python 2 and Python 3. Make sure you use the right version of
+-``pip`` or ``easy_install`` for your Python version (these may be named
+-``pip3`` and ``easy_install3`` respectively if you're using Python 3).
++``pip``. The package name is ``beautifulsoup4``. Make sure you use the
++right version of ``pip`` or ``easy_install`` for your Python version
++(these may be named ``pip3`` and ``easy_install3`` respectively).
+ 
+ :kbd:`$ easy_install beautifulsoup4`
+ 
+@@ -202,40 +199,8 @@ package the entire library with your application. You can download the
+ tarball, copy its ``bs4`` directory into your application's codebase,
+ and use Beautiful Soup without installing it at all.
+ 
+-I use Python 2.7 and Python 3.8 to develop Beautiful Soup, but it
+-should work with other recent versions.
+-
+-Problems after installation
+----------------------------
+-
+-Beautiful Soup is packaged as Python 2 code. When you install it for
+-use with Python 3, it's automatically converted to Python 3 code. If
+-you don't install the package, the code won't be converted. There have
+-also been reports on Windows machines of the wrong version being
+-installed.
+-
+-If you get the ``ImportError`` "No module named HTMLParser", your
+-problem is that you're running the Python 2 version of the code under
+-Python 3.
+-
+-If you get the ``ImportError`` "No module named html.parser", your
+-problem is that you're running the Python 3 version of the code under
+-Python 2.
+-
+-In both cases, your best bet is to completely remove the Beautiful
+-Soup installation from your system (including any directory created
+-when you unzipped the tarball) and try the installation again.
+-
+-If you get the ``SyntaxError`` "Invalid syntax" on the line
+-``ROOT_TAG_NAME = u'[document]'``, you need to convert the Python 2
+-code to Python 3. You can do this either by installing the package:
+-
+-:kbd:`$ python3 setup.py install`
+-
+-or by manually running Python's ``2to3`` conversion script on the
+-``bs4`` directory:
+-
+-:kbd:`$ 2to3-3.2 -w bs4`
++I use Python 3.8 to develop Beautiful Soup, but it should work with
++other recent versions.
+ 
+ .. _parser-installation:
+ 
+@@ -272,8 +237,7 @@ This table summarizes the advantages and disadvantages of each parser library:
+ +----------------------+--------------------------------------------+--------------------------------+--------------------------+
+ | Python's html.parser | ``BeautifulSoup(markup, "html.parser")``   | * Batteries included           | * Not as fast as lxml,   |
+ |                      |                                            | * Decent speed                 |   less lenient than      |
+-|                      |                                            | * Lenient (As of Python 2.7.3  |   html5lib.              |
+-|                      |                                            |   and 3.2.)                    |                          |
++|                      |                                            | * Lenient (As of Python 3.2)   |   html5lib.              |
+ +----------------------+--------------------------------------------+--------------------------------+--------------------------+
+ | lxml's HTML parser   | ``BeautifulSoup(markup, "lxml")``          | * Very fast                    | * External C dependency  |
+ |                      |                                            | * Lenient                      |                          |
+@@ -289,9 +253,9 @@ This table summarizes the advantages and disadvantages of each parser library:
+ +----------------------+--------------------------------------------+--------------------------------+--------------------------+
+ 
+ If you can, I recommend you install and use lxml for speed. If you're
+-using a very old version of Python -- earlier than 2.7.3 or 3.2.2 --
+-it's `essential` that you install lxml or html5lib. Python's built-in
+-HTML parser is just not very good in those old versions.
++using a very old version of Python -- earlier than 3.2.2 -- it's
++`essential` that you install lxml or html5lib. Python's built-in HTML
++parser is just not very good in those old versions.
+ 
+ Note that if a document is invalid, different parsers will generate
+ different Beautiful Soup trees for it. See `Differences
+@@ -481,8 +445,7 @@ uses the ``NavigableString`` class to contain these bits of text::
+ 
+ A ``NavigableString`` is just like a Python Unicode string, except
+ that it also supports some of the features described in `Navigating
+ the tree`_ and `Searching the tree`_. You can convert a
+-``NavigableString`` to a Unicode string with ``unicode()`` (in
+-Python 2) or ``str`` (in Python 3)::
++``NavigableString`` to a Unicode string with ``str``::
+ 
+     unicode_string = str(tag.string)
+     unicode_string
+@@ -2230,8 +2193,7 @@ Non-pretty printing
+ -------------------
+ 
+ If you just want a string, with no fancy formatting, you can call
+-``str()`` on a ``BeautifulSoup`` object (``unicode()`` in Python 2),
+-or on a ``Tag`` within it::
++``str()`` on a ``BeautifulSoup`` object, or on a ``Tag`` within it::
+ 
+     str(soup)
+     # 'I linked to example.com'
+@@ -3139,10 +3101,10 @@ Version mismatch problems
+ -------------------------
+ 
+ * ``SyntaxError: Invalid syntax`` (on the line ``ROOT_TAG_NAME =
+-  '[document]'``): Caused by running the Python 2 version of
++  '[document]'``): Caused by running an old Python 2 version of
+   Beautiful Soup under Python 3, without converting the code.
+ 
+-* ``ImportError: No module named HTMLParser`` - Caused by running the
++* ``ImportError: No module named HTMLParser`` - Caused by running an old
+   Python 2 version of Beautiful Soup under Python 3.
+ + * ``ImportError: No module named html.parser`` - Caused by running the +diff --git a/setup.py b/setup.py +index 7b4b393..b9b4ed2 100644 +--- a/setup.py ++++ b/setup.py +@@ -4,23 +4,22 @@ from setuptools import ( + ) + import sys + ++from bs4 import __version__ ++ + with open("README.md", "r") as fh: + long_description = fh.read() + + setup( + name="beautifulsoup4", +- # NOTE: We can't import __version__ from bs4 because bs4/__init__.py is Python 2 code, +- # and converting it to Python 3 means going through this code to run 2to3. +- # So we have to specify it twice for the time being. +- version = '4.9.3', ++ version = __version__, + author="Leonard Richardson", + author_email='leonardr@segfault.org', + url="http://www.crummy.com/software/BeautifulSoup/bs4/", + download_url = "http://www.crummy.com/software/BeautifulSoup/bs4/download/", + description="Screen-scraping library", ++ python_requires='>3.0.0', + install_requires=[ +- "soupsieve >1.2; python_version>='3.0'", +- "soupsieve >1.2, <2.0; python_version<'3.0'", ++ "soupsieve >1.2", + ], + long_description=long_description, + long_description_content_type="text/markdown", +@@ -30,12 +29,10 @@ setup( + 'lxml' : [ 'lxml'], + 'html5lib' : ['html5lib'], + }, +- use_2to3 = True, + classifiers=["Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python", +- "Programming Language :: Python :: 2.7", + 'Programming Language :: Python :: 3', + "Topic :: Text Processing :: Markup :: HTML", + "Topic :: Text Processing :: Markup :: XML", +diff --git a/test-all-versions b/test-all-versions +index 01e436b..fe7758a 100755 +--- a/test-all-versions ++++ b/test-all-versions +@@ -1 +1 @@ +-python2.7 -m unittest discover -s bs4 && ./convert-py3k ++python3 -m unittest discover -s bs4 +-- +2.13.7 + diff --git a/python-beautifulsoup4.spec b/python-beautifulsoup4.spec index 976b1f9..21656d9 100644 --- a/python-beautifulsoup4.spec +++ b/python-beautifulsoup4.spec @@ -1,12 +1,15 @@ Name: python-beautifulsoup4 Version: 4.9.3 -Release: 1 +Release: 2 Summary: HTML/XML parser for quick-turnaround projects License: MIT URL: http://www.crummy.com/software/BeautifulSoup/ Source0: https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-%{version}.tar.gz BuildArch: noarch BuildRequires: python3-devel python3-html5lib python3-setuptools python3-lxml +BuildRequires: python3-soupsieve + +Patch6000: backport-converts-the-code-base-to-Python-3-and-removes-the-u.patch %global _description\ This package provides a python library which is designed for quick\ @@ -24,19 +27,15 @@ Requires: python3-lxml %prep %setup -q -n beautifulsoup4-%{version} -rm -rf %{py3dir} && cp -a . %{py3dir} +%patch6000 -p1 %build -pushd %{py3dir} -2to3 --write --nobackups . %{py3_build} %install -pushd %{py3dir} %{py3_install} %check -pushd %{py3dir} %{__python3} -m unittest discover -s bs4 || : %files -n python3-beautifulsoup4 @@ -46,6 +45,9 @@ pushd %{py3dir} %{python3_sitelib}/bs4 %changelog +* Mon Jan 10 2022 shixuantong - 4.9.3-2 +- converts the code base to Python 3, and removes the use_2to3 reference in setup.py. + * Mon Jul 26 2021 liusheng - 4.9.3-1 - Upgrade to version 4.9.3
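A quick way to sanity-check the rebuilt package outside of the spec's %check
section is a small import-and-parse script. The following is a minimal
sketch, not part of the upstream test suite; it assumes the patched
python3-beautifulsoup4 package is installed:

#!/usr/bin/python3
# Minimal smoke test for the rebuilt package (a sketch, not part of the
# upstream suite). Assumes python3-beautifulsoup4 is installed; with this
# patch applied, importing bs4 under Python 2 raises ImportError instead.
import bs4
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>Hello</p>", "html.parser")
assert soup.p.string == "Hello"
print("bs4", bs4.__version__, "imports and parses under Python 3")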