diff --git a/backport-converts-the-code-base-to-Python-3-and-removes-the-u.patch b/backport-converts-the-code-base-to-Python-3-and-removes-the-u.patch new file mode 100644 index 0000000..1cab095 --- /dev/null +++ b/backport-converts-the-code-base-to-Python-3-and-removes-the-u.patch @@ -0,0 +1,2062 @@ +diff --git a/README.md b/README.md +index 92dd339..884f9eb 100644 +--- a/README.md ++++ b/README.md +@@ -53,17 +53,11 @@ To go beyond the basics, [comprehensive documentation is available](http://www.c + + # Note on Python 2 sunsetting + +-Since 2012, Beautiful Soup has been developed as a Python 2 library +-which is automatically converted to Python 3 code as necessary. This +-makes it impossible to take advantage of some features of Python +-3. +- +-For this reason, I plan to discontinue Beautiful Soup's Python 2 +-support at some point after December 31, 2020: one year after the +-sunset date for Python 2 itself. Beyond that point, new Beautiful Soup +-development will exclusively target Python 3. Of course, older +-releases of Beautiful Soup, which support both versions, will continue +-to be available. ++Beautiful Soup's support for Python 2 was discontinued on December 31, ++2020: one year after the sunset date for Python 2 itself. From this ++point onward, new Beautiful Soup development will exclusively target ++Python 3. The final release of Beautiful Soup 4 to support Python 2 ++was 4.9.3. + + # Supporting the project + +@@ -93,10 +87,5 @@ $ nosetests + ``` + + ``` +-$ python -m unittest discover -s bs4 ++$ python3 -m unittest discover -s bs4 + ``` +- +-If you checked out the source tree, you should see a script in the +-home directory called test-all-versions. This script will run the unit +-tests under Python 2, then create a temporary Python 3 conversion of +-the source and run the unit tests again under Python 3. +diff --git a/bs4/__init__.py b/bs4/__init__.py +index 8f78809..51ccc21 100644 +--- a/bs4/__init__.py ++++ b/bs4/__init__.py +@@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a + provides methods and Pythonic idioms that make it easy to navigate, + search, and modify the parse tree. + +-Beautiful Soup works with Python 2.7 and up. It works better if lxml ++Beautiful Soup works with Python 3.5 and up. It works better if lxml + and/or html5lib is installed. + + For more than you ever wanted to know about Beautiful Soup, see the +@@ -29,6 +29,11 @@ import sys + import traceback + import warnings + ++# The very first thing we do is give a useful error if someone is ++# running this code under Python 2. ++if sys.version_info.major < 3: ++ raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.') ++ + from .builder import builder_registry, ParserRejectedMarkup + from .dammit import UnicodeDammit + from .element import ( +@@ -49,10 +54,6 @@ from .element import ( + TemplateString, + ) + +-# The very first thing we do is give a useful error if someone is +-# running this code under Python 3 without converting it. +-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +- + # Define some custom warnings. + class GuessedAtParserWarning(UserWarning): + """The warning issued when BeautifulSoup has to guess what parser to +@@ -100,7 +101,7 @@ class BeautifulSoup(Tag): + # Since BeautifulSoup subclasses Tag, it's possible to treat it as + # a Tag with a .name. This name makes it clear the BeautifulSoup + # object isn't a real markup tag. +- ROOT_TAG_NAME = u'[document]' ++ ROOT_TAG_NAME = '[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. +@@ -217,7 +218,7 @@ class BeautifulSoup(Tag): + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + +- if from_encoding and isinstance(markup, unicode): ++ if from_encoding and isinstance(markup, str): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + +@@ -234,7 +235,7 @@ class BeautifulSoup(Tag): + builder_class = builder + builder = None + elif builder is None: +- if isinstance(features, basestring): ++ if isinstance(features, str): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES +@@ -309,13 +310,13 @@ class BeautifulSoup(Tag): + markup = markup.read() + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) +- or (isinstance(markup, unicode) and not u'<' in markup) ++ or (isinstance(markup, str) and not '<' in markup) + ): + # Print out warnings for a couple beginner problems + # involving passing non-markup to Beautiful Soup. + # Beautiful Soup will still parse the input as markup, + # just in case that's what the user really wants. +- if (isinstance(markup, unicode) ++ if (isinstance(markup, str) + and not os.path.supports_unicode_filenames): + possible_filename = markup.encode("utf8") + else: +@@ -323,7 +324,7 @@ class BeautifulSoup(Tag): + is_file = False + try: + is_file = os.path.exists(possible_filename) +- except Exception, e: ++ except Exception as e: + # This is almost certainly a problem involving + # characters not valid in filenames on this + # system. Just let it go. +@@ -353,9 +354,9 @@ class BeautifulSoup(Tag): + pass + + if not success: +- other_exceptions = [unicode(e) for e in rejections] ++ other_exceptions = [str(e) for e in rejections] + raise ParserRejectedMarkup( +- u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) ++ "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) + ) + + # Clear out the markup and remove the builder's circular +@@ -406,9 +407,9 @@ class BeautifulSoup(Tag): + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") +- elif isinstance(markup, unicode): +- space = u' ' +- cant_start_with = (u"http:", u"https:") ++ elif isinstance(markup, str): ++ space = ' ' ++ cant_start_with = ("http:", "https:") + else: + return + +@@ -545,7 +546,7 @@ class BeautifulSoup(Tag): + containerClass = self.string_container(containerClass) + + if self.current_data: +- current_data = u''.join(self.current_data) ++ current_data = ''.join(self.current_data) + # If whitespace is not preserved, and this string contains + # nothing but ASCII spaces, replace it with a single space + # or newline. +@@ -748,9 +749,9 @@ class BeautifulSoup(Tag): + eventual_encoding = None + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding +- prefix = u'\n' % encoding_part ++ prefix = '\n' % encoding_part + else: +- prefix = u'' ++ prefix = '' + if not pretty_print: + indent_level = None + else: +@@ -788,4 +789,4 @@ class FeatureNotFound(ValueError): + if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) +- print(soup.prettify()) ++ print((soup.prettify())) +diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py +index 03da4c6..03fbd6a 100644 +--- a/bs4/builder/__init__.py ++++ b/bs4/builder/__init__.py +@@ -300,13 +300,13 @@ class TreeBuilder(object): + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), None) +- for attr in attrs.keys(): ++ for attr in list(attrs.keys()): + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. + value = attrs[attr] +- if isinstance(value, basestring): ++ if isinstance(value, str): + values = nonwhitespace_re.findall(value) + else: + # html5lib sometimes calls setAttributes twice +@@ -496,7 +496,7 @@ class ParserRejectedMarkup(Exception): + """ + if isinstance(message_or_exception, Exception): + e = message_or_exception +- message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e)) ++ message_or_exception = "%s: %s" % (e.__class__.__name__, str(e)) + super(ParserRejectedMarkup, self).__init__(message_or_exception) + + # Builders are registered in reverse order of priority, so that custom +diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py +index a1c6134..69aefd7 100644 +--- a/bs4/builder/_html5lib.py ++++ b/bs4/builder/_html5lib.py +@@ -33,7 +33,7 @@ try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +-except ImportError, e: ++except ImportError as e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True +@@ -79,7 +79,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + self.underlying_builder.parser = parser + extra_kwargs = dict() +- if not isinstance(markup, unicode): ++ if not isinstance(markup, str): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: +@@ -87,13 +87,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): + doc = parser.parse(markup, **extra_kwargs) + + # Set the character encoding detected by the tokenizer. +- if isinstance(markup, unicode): ++ if isinstance(markup, str): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + original_encoding = parser.tokenizer.stream.charEncoding[0] +- if not isinstance(original_encoding, basestring): ++ if not isinstance(original_encoding, str): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. +@@ -110,7 +110,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" +- return u'
%s' % fragment ++ return '%s' % fragment + + + class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): +@@ -217,7 +217,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] +- for name, value in element.attrs.items(): ++ for name, value in list(element.attrs.items()): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): +@@ -272,7 +272,7 @@ class Element(treebuilder_base.Node): + + def appendChild(self, node): + string_child = child = None +- if isinstance(node, basestring): ++ if isinstance(node, str): + # Some other piece of code decided to pass in a string + # instead of creating a TextElement object to contain the + # string. +@@ -289,7 +289,7 @@ class Element(treebuilder_base.Node): + child = node.element + node.parent = self + +- if not isinstance(child, basestring) and child.parent is not None: ++ if not isinstance(child, str) and child.parent is not None: + node.element.extract() + + if (string_child is not None and self.element.contents +@@ -302,7 +302,7 @@ class Element(treebuilder_base.Node): + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: +- if isinstance(node, basestring): ++ if isinstance(node, str): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + +@@ -340,7 +340,7 @@ class Element(treebuilder_base.Node): + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) +- for name, value in attributes.items(): ++ for name, value in list(attributes.items()): + self.element[name] = value + + # The attributes may contain variables that need substitution. +diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py +index 96a7b7d..88860a9 100644 +--- a/bs4/builder/_htmlparser.py ++++ b/bs4/builder/_htmlparser.py +@@ -8,11 +8,11 @@ __all__ = [ + 'HTMLParserTreeBuilder', + ] + +-from HTMLParser import HTMLParser ++from html.parser import HTMLParser + + try: +- from HTMLParser import HTMLParseError +-except ImportError, e: ++ from html.parser import HTMLParseError ++except ImportError as e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): +@@ -219,14 +219,14 @@ class BeautifulSoupHTMLParser(HTMLParser): + continue + try: + data = bytearray([real_name]).decode(encoding) +- except UnicodeDecodeError, e: ++ except UnicodeDecodeError as e: + pass + if not data: + try: +- data = unichr(real_name) +- except (ValueError, OverflowError), e: ++ data = chr(real_name) ++ except (ValueError, OverflowError) as e: + pass +- data = data or u"\N{REPLACEMENT CHARACTER}" ++ data = data or "\N{REPLACEMENT CHARACTER}" + self.handle_data(data) + + def handle_entityref(self, name): +@@ -353,7 +353,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): + document to Unicode and parsing it. Each strategy will be tried + in turn. + """ +- if isinstance(markup, unicode): ++ if isinstance(markup, str): + # Parse Unicode as-is. + yield (markup, None, None, False) + return +@@ -376,7 +376,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): + try: + parser.feed(markup) + parser.close() +- except HTMLParseError, e: ++ except HTMLParseError as e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e +diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py +index 1b44d75..432a2c8 100644 +--- a/bs4/builder/_lxml.py ++++ b/bs4/builder/_lxml.py +@@ -8,11 +8,11 @@ __all__ = [ + + try: + from collections.abc import Callable # Python 3.6 +-except ImportError , e: ++except ImportError as e: + from collections import Callable + + from io import BytesIO +-from StringIO import StringIO ++from io import StringIO + from lxml import etree + from bs4.element import ( + Comment, +@@ -35,7 +35,7 @@ LXML = 'lxml' + + def _invert(d): + "Invert a dictionary." +- return dict((v,k) for k, v in d.items()) ++ return dict((v,k) for k, v in list(d.items())) + + class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser +@@ -81,7 +81,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + + :param mapping: A dictionary mapping namespace prefixes to URIs. + """ +- for key, value in mapping.items(): ++ for key, value in list(mapping.items()): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same +@@ -169,12 +169,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): + else: + self.processing_instruction_class = XMLProcessingInstruction + +- if isinstance(markup, unicode): ++ if isinstance(markup, str): + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + +- if isinstance(markup, unicode): ++ if isinstance(markup, str): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. + yield (markup.encode("utf8"), "utf8", +@@ -189,7 +189,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + def feed(self, markup): + if isinstance(markup, bytes): + markup = BytesIO(markup) +- elif isinstance(markup, unicode): ++ elif isinstance(markup, str): + markup = StringIO(markup) + + # Call feed() at least once, even if the markup is empty, +@@ -204,7 +204,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + if len(data) != 0: + self.parser.feed(data) + self.parser.close() +- except (UnicodeDecodeError, LookupError, etree.ParserError), e: ++ except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(e) + + def close(self): +@@ -233,7 +233,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() +- for prefix, namespace in nsmap.items(): ++ for prefix, namespace in list(nsmap.items()): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace +@@ -242,7 +242,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} +- for attr, value in attrs.items(): ++ for attr, value in list(attrs.items()): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value +@@ -302,7 +302,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" +- return u'\n%s' % fragment ++ return '\n%s' % fragment + + + class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): +@@ -323,10 +323,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() +- except (UnicodeDecodeError, LookupError, etree.ParserError), e: ++ except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(e) + + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" +- return u'%s' % fragment ++ return '%s' % fragment +diff --git a/bs4/dammit.py b/bs4/dammit.py +index 33f7b7d..ee3708f 100644 +--- a/bs4/dammit.py ++++ b/bs4/dammit.py +@@ -10,7 +10,7 @@ XML or HTML to reflect a new encoding; that's the tree builder's job. + __license__ = "MIT" + + import codecs +-from htmlentitydefs import codepoint2name ++from html.entities import codepoint2name + import re + import logging + import string +@@ -22,7 +22,7 @@ try: + # PyPI package: cchardet + import cchardet + def chardet_dammit(s): +- if isinstance(s, unicode): ++ if isinstance(s, str): + return None + return cchardet.detect(s)['encoding'] + except ImportError: +@@ -32,7 +32,7 @@ except ImportError: + # PyPI package: chardet + import chardet + def chardet_dammit(s): +- if isinstance(s, unicode): ++ if isinstance(s, str): + return None + return chardet.detect(s)['encoding'] + #import chardet.constants +@@ -53,14 +53,14 @@ except ImportError: + + # Build bytestring and Unicode versions of regular expressions for finding + # a declared encoding inside an XML or HTML document. +-xml_encoding = u'^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' +-html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' ++xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' ++html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' + encoding_res = dict() + encoding_res[bytes] = { + 'html' : re.compile(html_meta.encode("ascii"), re.I), + 'xml' : re.compile(xml_encoding.encode("ascii"), re.I), + } +-encoding_res[unicode] = { ++encoding_res[str] = { + 'html' : re.compile(html_meta, re.I), + 'xml' : re.compile(xml_encoding, re.I) + } +@@ -80,7 +80,7 @@ class EntitySubstitution(object): + # entities, but that's a little tricky. + extra = [(39, 'apos')] + for codepoint, name in list(codepoint2name.items()) + extra: +- character = unichr(codepoint) ++ character = chr(codepoint) + if codepoint not in (34, 39): + # There's no point in turning the quotation mark into + # " or the single quote into ', unless it +@@ -323,7 +323,7 @@ class EncodingDetector: + :return: A 2-tuple (modified data, implied encoding) + """ + encoding = None +- if isinstance(data, unicode): ++ if isinstance(data, str): + # Unicode data cannot have a byte-order mark. + return data, encoding + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ +@@ -370,7 +370,7 @@ class EncodingDetector: + if isinstance(markup, bytes): + res = encoding_res[bytes] + else: +- res = encoding_res[unicode] ++ res = encoding_res[str] + + xml_re = res['xml'] + html_re = res['html'] +@@ -431,9 +431,9 @@ class UnicodeDammit: + markup, override_encodings, is_html, exclude_encodings) + + # Short-circuit if the data is in Unicode to begin with. +- if isinstance(markup, unicode) or markup == '': ++ if isinstance(markup, str) or markup == '': + self.markup = markup +- self.unicode_markup = unicode(markup) ++ self.unicode_markup = str(markup) + self.original_encoding = None + return + +@@ -523,7 +523,7 @@ class UnicodeDammit: + + :param encoding: The name of an encoding. + """ +- return unicode(data, encoding, errors) ++ return str(data, encoding, errors) + + @property + def declared_html_encoding(self): +diff --git a/bs4/diagnose.py b/bs4/diagnose.py +index e4f2f47..500e92d 100644 +--- a/bs4/diagnose.py ++++ b/bs4/diagnose.py +@@ -4,8 +4,8 @@ + __license__ = "MIT" + + import cProfile +-from StringIO import StringIO +-from HTMLParser import HTMLParser ++from io import StringIO ++from html.parser import HTMLParser + import bs4 + from bs4 import BeautifulSoup, __version__ + from bs4.builder import builder_registry +@@ -25,8 +25,8 @@ def diagnose(data): + :param data: A string containing markup that needs to be explained. + :return: None; diagnostics are printed to standard output. + """ +- print("Diagnostic running on Beautiful Soup %s" % __version__) +- print("Python version %s" % sys.version) ++ print(("Diagnostic running on Beautiful Soup %s" % __version__)) ++ print(("Python version %s" % sys.version)) + + basic_parsers = ["html.parser", "html5lib", "lxml"] + for name in basic_parsers: +@@ -35,16 +35,16 @@ def diagnose(data): + break + else: + basic_parsers.remove(name) +- print( ++ print(( + "I noticed that %s is not installed. Installing it may help." % +- name) ++ name)) + + if 'lxml' in basic_parsers: + basic_parsers.append("lxml-xml") + try: + from lxml import etree +- print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) +- except ImportError, e: ++ print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))) ++ except ImportError as e: + print( + "lxml is not installed or couldn't be imported.") + +@@ -52,21 +52,21 @@ def diagnose(data): + if 'html5lib' in basic_parsers: + try: + import html5lib +- print("Found html5lib version %s" % html5lib.__version__) +- except ImportError, e: ++ print(("Found html5lib version %s" % html5lib.__version__)) ++ except ImportError as e: + print( + "html5lib is not installed or couldn't be imported.") + + if hasattr(data, 'read'): + data = data.read() + elif data.startswith("http:") or data.startswith("https:"): +- print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) ++ print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") + return + else: + try: + if os.path.exists(data): +- print('"%s" looks like a filename. Reading data from the file.' % data) ++ print(('"%s" looks like a filename. Reading data from the file.' % data)) + with open(data) as fp: + data = fp.read() + except ValueError: +@@ -76,19 +76,19 @@ def diagnose(data): + print("") + + for parser in basic_parsers: +- print("Trying to parse your markup with %s" % parser) ++ print(("Trying to parse your markup with %s" % parser)) + success = False + try: + soup = BeautifulSoup(data, features=parser) + success = True +- except Exception, e: +- print("%s could not parse the markup." % parser) ++ except Exception as e: ++ print(("%s could not parse the markup." % parser)) + traceback.print_exc() + if success: +- print("Here's what %s did with the markup:" % parser) +- print(soup.prettify()) ++ print(("Here's what %s did with the markup:" % parser)) ++ print((soup.prettify())) + +- print("-" * 80) ++ print(("-" * 80)) + + def lxml_trace(data, html=True, **kwargs): + """Print out the lxml events that occur during parsing. +@@ -104,7 +104,7 @@ def lxml_trace(data, html=True, **kwargs): + """ + from lxml import etree + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): +- print("%s, %4s, %s" % (event, element.tag, element.text)) ++ print(("%s, %4s, %s" % (event, element.tag, element.text))) + + class AnnouncingParser(HTMLParser): + """Subclass of HTMLParser that announces parse events, without doing +@@ -193,9 +193,9 @@ def rdoc(num_elements=1000): + + def benchmark_parsers(num_elements=100000): + """Very basic head-to-head performance benchmark.""" +- print("Comparative parser benchmark on Beautiful Soup %s" % __version__) ++ print(("Comparative parser benchmark on Beautiful Soup %s" % __version__)) + data = rdoc(num_elements) +- print("Generated a large invalid HTML document (%d bytes)." % len(data)) ++ print(("Generated a large invalid HTML document (%d bytes)." % len(data))) + + for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: + success = False +@@ -204,24 +204,24 @@ def benchmark_parsers(num_elements=100000): + soup = BeautifulSoup(data, parser) + b = time.time() + success = True +- except Exception, e: +- print("%s could not parse the markup." % parser) ++ except Exception as e: ++ print(("%s could not parse the markup." % parser)) + traceback.print_exc() + if success: +- print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) ++ print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a))) + + from lxml import etree + a = time.time() + etree.HTML(data) + b = time.time() +- print("Raw lxml parsed the markup in %.2fs." % (b-a)) ++ print(("Raw lxml parsed the markup in %.2fs." % (b-a))) + + import html5lib + parser = html5lib.HTMLParser() + a = time.time() + parser.parse(data) + b = time.time() +- print("Raw html5lib parsed the markup in %.2fs." % (b-a)) ++ print(("Raw html5lib parsed the markup in %.2fs." % (b-a))) + + def profile(num_elements=100000, parser="lxml"): + """Use Python's profiler on a randomly generated document.""" +diff --git a/bs4/element.py b/bs4/element.py +index 09a81d9..81d9db9 100644 +--- a/bs4/element.py ++++ b/bs4/element.py +@@ -3,14 +3,14 @@ __license__ = "MIT" + + try: + from collections.abc import Callable # Python 3.6 +-except ImportError , e: ++except ImportError as e: + from collections import Callable + import re + import sys + import warnings + try: + import soupsieve +-except ImportError, e: ++except ImportError as e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' +@@ -57,22 +57,22 @@ def _alias(attr): + # Source: + # https://docs.python.org/3/library/codecs.html#python-specific-encodings + PYTHON_SPECIFIC_ENCODINGS = set([ +- u"idna", +- u"mbcs", +- u"oem", +- u"palmos", +- u"punycode", +- u"raw_unicode_escape", +- u"undefined", +- u"unicode_escape", +- u"raw-unicode-escape", +- u"unicode-escape", +- u"string-escape", +- u"string_escape", ++ "idna", ++ "mbcs", ++ "oem", ++ "palmos", ++ "punycode", ++ "raw_unicode_escape", ++ "undefined", ++ "unicode_escape", ++ "raw-unicode-escape", ++ "unicode-escape", ++ "string-escape", ++ "string_escape", + ]) + + +-class NamespacedAttribute(unicode): ++class NamespacedAttribute(str): + """A namespaced string (e.g. 'xml:lang') that remembers the namespace + ('xml') and the name ('lang') that were used to create it. + """ +@@ -84,18 +84,18 @@ class NamespacedAttribute(unicode): + name = None + + if name is None: +- obj = unicode.__new__(cls, prefix) ++ obj = str.__new__(cls, prefix) + elif prefix is None: + # Not really namespaced. +- obj = unicode.__new__(cls, name) ++ obj = str.__new__(cls, name) + else: +- obj = unicode.__new__(cls, prefix + ":" + name) ++ obj = str.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + +-class AttributeValueWithCharsetSubstitution(unicode): ++class AttributeValueWithCharsetSubstitution(str): + """A stand-in object for a character encoding specified in HTML.""" + + class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): +@@ -106,7 +106,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """ + + def __new__(cls, original_value): +- obj = unicode.__new__(cls, original_value) ++ obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + +@@ -134,9 +134,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. +- return unicode.__new__(unicode, original_value) ++ return str.__new__(str, original_value) + +- obj = unicode.__new__(cls, original_value) ++ obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + +@@ -376,7 +376,7 @@ class PageElement(object): + raise ValueError("Cannot insert None into a tag.") + if new_child is self: + raise ValueError("Cannot insert a tag into itself.") +- if (isinstance(new_child, basestring) ++ if (isinstance(new_child, str) + and not isinstance(new_child, NavigableString)): + new_child = NavigableString(new_child) + +@@ -753,7 +753,7 @@ class PageElement(object): + result = (element for element in generator + if isinstance(element, Tag)) + return ResultSet(strainer, result) +- elif isinstance(name, basestring): ++ elif isinstance(name, str): + # Optimization to find all tags with a given name. + if name.count(':') == 1: + # This is a name with a prefix. If this is a namespace-aware document, +@@ -872,7 +872,7 @@ class PageElement(object): + return self.parents + + +-class NavigableString(unicode, PageElement): ++class NavigableString(str, PageElement): + """A Python Unicode string that is part of a parse tree. + + When Beautiful Soup parses the markup penguin, it will +@@ -895,10 +895,10 @@ class NavigableString(unicode, PageElement): + passed in to the superclass's __new__ or the superclass won't know + how to handle non-ASCII characters. + """ +- if isinstance(value, unicode): +- u = unicode.__new__(cls, value) ++ if isinstance(value, str): ++ u = str.__new__(cls, value) + else: +- u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) ++ u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u.setup() + return u + +@@ -909,7 +909,7 @@ class NavigableString(unicode, PageElement): + return type(self)(self) + + def __getnewargs__(self): +- return (unicode(self),) ++ return (str(self),) + + def __getattr__(self, attr): + """text.string gives you text. This is for backwards +@@ -975,30 +975,30 @@ class PreformattedString(NavigableString): + + class CData(PreformattedString): + """A CDATA block.""" +- PREFIX = u'' ++ PREFIX = '' + + class ProcessingInstruction(PreformattedString): + """A SGML processing instruction.""" + +- PREFIX = u'' +- SUFFIX = u'>' ++ PREFIX = '' ++ SUFFIX = '>' + + class XMLProcessingInstruction(ProcessingInstruction): + """An XML processing instruction.""" +- PREFIX = u'' +- SUFFIX = u'?>' ++ PREFIX = '' ++ SUFFIX = '?>' + + class Comment(PreformattedString): + """An HTML or XML comment.""" +- PREFIX = u'' ++ PREFIX = '' + + + class Declaration(PreformattedString): + """An XML declaration.""" +- PREFIX = u'' +- SUFFIX = u'?>' ++ PREFIX = '' ++ SUFFIX = '?>' + + + class Doctype(PreformattedString): +@@ -1026,8 +1026,8 @@ class Doctype(PreformattedString): + + return Doctype(value) + +- PREFIX = u'\n' ++ PREFIX = '\n' + + + class Stylesheet(NavigableString): +@@ -1263,7 +1263,7 @@ class Tag(PageElement): + for string in self._all_strings(True): + yield string + +- def get_text(self, separator=u"", strip=False, ++ def get_text(self, separator="", strip=False, + types=(NavigableString, CData)): + """Get all child strings, concatenated using the given separator. + +@@ -1416,7 +1416,7 @@ class Tag(PageElement): + def __contains__(self, x): + return x in self.contents + +- def __nonzero__(self): ++ def __bool__(self): + "A tag is non-None even if it has no contents." + return True + +@@ -1565,8 +1565,8 @@ class Tag(PageElement): + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) +- elif not isinstance(val, basestring): +- val = unicode(val) ++ elif not isinstance(val, str): ++ val = str(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None +@@ -1575,7 +1575,7 @@ class Tag(PageElement): + + text = formatter.attribute_value(val) + decoded = ( +- unicode(key) + '=' ++ str(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) + close = '' +@@ -1934,7 +1934,7 @@ class SoupStrainer(object): + else: + attrs = kwargs + normalized_attrs = {} +- for key, value in attrs.items(): ++ for key, value in list(attrs.items()): + normalized_attrs[key] = self._normalize_search_value(value) + + self.attrs = normalized_attrs +@@ -1943,7 +1943,7 @@ class SoupStrainer(object): + def _normalize_search_value(self, value): + # Leave it alone if it's a Unicode string, a callable, a + # regular expression, a boolean, or None. +- if (isinstance(value, unicode) or isinstance(value, Callable) or hasattr(value, 'match') ++ if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') + or isinstance(value, bool) or value is None): + return value + +@@ -1956,7 +1956,7 @@ class SoupStrainer(object): + new_value = [] + for v in value: + if (hasattr(v, '__iter__') and not isinstance(v, bytes) +- and not isinstance(v, unicode)): ++ and not isinstance(v, str)): + # This is almost certainly the user's mistake. In the + # interests of avoiding infinite loops, we'll let + # it through as-is rather than doing a recursive call. +@@ -1968,7 +1968,7 @@ class SoupStrainer(object): + # Otherwise, convert it into a Unicode string. + # The unicode(str()) thing is so this will do the same thing on Python 2 + # and Python 3. +- return unicode(str(value)) ++ return str(str(value)) + + def __str__(self): + """A human-readable representation of this SoupStrainer.""" +@@ -1996,7 +1996,7 @@ class SoupStrainer(object): + markup = markup_name + markup_attrs = markup + +- if isinstance(self.name, basestring): ++ if isinstance(self.name, str): + # Optimization for a very common case where the user is + # searching for a tag with one specific name, and we're + # looking at a tag with a different name. +@@ -2052,7 +2052,7 @@ class SoupStrainer(object): + found = None + # If given a list of items, scan it for a text element that + # matches. +- if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): ++ if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): +@@ -2065,7 +2065,7 @@ class SoupStrainer(object): + found = self.search_tag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ +- isinstance(markup, basestring): ++ isinstance(markup, str): + if not self.name and not self.attrs and self._matches(markup, self.text): + found = markup + else: +@@ -2110,7 +2110,7 @@ class SoupStrainer(object): + return not match_against + + if (hasattr(match_against, '__iter__') +- and not isinstance(match_against, basestring)): ++ and not isinstance(match_against, str)): + # We're asked to match against an iterable of items. + # The markup must be match at least one item in the + # iterable. We'll try each one in turn. +@@ -2137,7 +2137,7 @@ class SoupStrainer(object): + # the tag's name and once against its prefixed name. + match = False + +- if not match and isinstance(match_against, unicode): ++ if not match and isinstance(match_against, str): + # Exact string match + match = markup == match_against + +diff --git a/bs4/formatter.py b/bs4/formatter.py +index 9a692ec..2cbab4c 100644 +--- a/bs4/formatter.py ++++ b/bs4/formatter.py +@@ -83,7 +83,7 @@ class Formatter(EntitySubstitution): + """ + if not self.entity_substitution: + return ns +- from element import NavigableString ++ from .element import NavigableString + if (isinstance(ns, NavigableString) + and ns.parent is not None + and ns.parent.name in self.cdata_containing_tags): +diff --git a/bs4/testing.py b/bs4/testing.py +index a2f83a1..9ca507b 100644 +--- a/bs4/testing.py ++++ b/bs4/testing.py +@@ -25,7 +25,7 @@ from bs4.element import ( + from bs4.builder import HTMLParserTreeBuilder + default_builder = HTMLParserTreeBuilder + +-BAD_DOCUMENT = u"""A bare string ++BAD_DOCUMENT = """A bare string + + + +@@ -94,7 +94,7 @@ class SoupTest(unittest.TestCase): + # Verify that every tag that was opened was eventually closed. + + # There are no tags in the open tag counter. +- assert all(v==0 for v in obj.open_tag_counter.values()) ++ assert all(v==0 for v in list(obj.open_tag_counter.values())) + + # The only tag in the tag stack is the one for the root + # document. +@@ -372,7 +372,7 @@ class HTMLTreeBuilderSmokeTest(object): + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. +- markup = u"""""" ++ markup = """""" + soup = self.soup(markup) + self.assertEqual(markup, soup.decode()) + +@@ -544,14 +544,14 @@ Hello, world! + # "&T" and "&p" look like incomplete character entities, but they are + # not. + self.assertSoupEquals( +- u"• AT&T is in the s&p 500
", +- u"\u2022 AT&T is in the s&p 500
" ++ "• AT&T is in the s&p 500
", ++ "\u2022 AT&T is in the s&p 500
" + ) + + def test_apos_entity(self): + self.assertSoupEquals( +- u"Bob's Bar
", +- u"Bob's Bar
", ++ "Bob's Bar
", ++ "Bob's Bar
", + ) + + def test_entities_in_foreign_document_encoding(self): +@@ -564,17 +564,17 @@ Hello, world! + # characters. + markup = "Hello -☃
" + soup = self.soup(markup) +- self.assertEquals(u"“Hello” -☃", soup.p.string) ++ self.assertEqual("“Hello” -☃", soup.p.string) + + def test_entities_in_attributes_converted_to_unicode(self): +- expect = u'' ++ expect = '' + self.assertSoupEquals('', expect) + self.assertSoupEquals('', expect) + self.assertSoupEquals('', expect) + self.assertSoupEquals('', expect) + + def test_entities_in_text_converted_to_unicode(self): +- expect = u'pi\N{LATIN SMALL LETTER N WITH TILDE}ata
' ++ expect = 'pi\N{LATIN SMALL LETTER N WITH TILDE}ata
' + self.assertSoupEquals("piñata
", expect) + self.assertSoupEquals("piñata
", expect) + self.assertSoupEquals("piñata
", expect) +@@ -585,7 +585,7 @@ Hello, world! + 'I said "good day!"
') + + def test_out_of_range_entity(self): +- expect = u"\N{REPLACEMENT CHARACTER}" ++ expect = "\N{REPLACEMENT CHARACTER}" + self.assertSoupEquals("", expect) + self.assertSoupEquals("", expect) + self.assertSoupEquals("", expect) +@@ -663,9 +663,9 @@ Hello, world! + # A seemingly innocuous document... but it's in Unicode! And + # it contains characters that can't be represented in the + # encoding found in the declaration! The horror! +- markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' ++ markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' + soup = self.soup(markup) +- self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) ++ self.assertEqual('Sacr\xe9 bleu!', soup.body.string) + + def test_soupstrainer(self): + """Parsers should be able to work with SoupStrainers.""" +@@ -705,7 +705,7 @@ Hello, world! + # Both XML and HTML entities are converted to Unicode characters + # during parsing. + text = "<<sacré bleu!>>
" +- expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
" ++ expected = "<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
" + self.assertSoupEquals(text, expected) + + def test_smart_quotes_converted_on_the_way_in(self): +@@ -715,15 +715,15 @@ Hello, world! + soup = self.soup(quote) + self.assertEqual( + soup.p.string, +- u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") ++ "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + + def test_non_breaking_spaces_converted_on_the_way_in(self): + soup = self.soup(" ") +- self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) ++ self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) + + def test_entities_converted_on_the_way_out(self): + text = "<<sacré bleu!>>
" +- expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
".encode("utf-8") ++ expected = "<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
".encode("utf-8") + soup = self.soup(text) + self.assertEqual(soup.p.encode("utf-8"), expected) + +@@ -732,7 +732,7 @@ Hello, world! + # easy-to-understand document. + + # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. +- unicode_html = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!
' ++ unicode_html = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!
' + + # That's because we're going to encode it into ISO-Latin-1, and use + # that to test. +@@ -848,8 +848,8 @@ Hello, world! + soup = self.soup(markup) + for encoding in PYTHON_SPECIFIC_ENCODINGS: + if encoding in ( +- u'idna', u'mbcs', u'oem', u'undefined', +- u'string_escape', u'string-escape' ++ 'idna', 'mbcs', 'oem', 'undefined', ++ 'string_escape', 'string-escape' + ): + # For one reason or another, these will raise an + # exception if we actually try to use them, so don't +@@ -910,8 +910,8 @@ class XMLTreeBuilderSmokeTest(object): + soup = self.soup(markup) + for encoding in PYTHON_SPECIFIC_ENCODINGS: + if encoding in ( +- u'idna', u'mbcs', u'oem', u'undefined', +- u'string_escape', u'string-escape' ++ 'idna', 'mbcs', 'oem', 'undefined', ++ 'string_escape', 'string-escape' + ): + # For one reason or another, these will raise an + # exception if we actually try to use them, so don't +@@ -962,15 +962,15 @@ class XMLTreeBuilderSmokeTest(object): + self.assertTrue(b"< < hey > >" in encoded) + + def test_can_parse_unicode_document(self): +- markup = u'foo
\n' + soup = self.soup(markup) +- self.assertEqual(u"foo
\n", soup.body.decode()) ++ self.assertEqual("foo
\n", soup.body.decode()) + self.assertEqual(2, len(soup.find_all('p'))) + + + def test_reparented_markup_ends_with_whitespace(self): + markup = 'foo
\n\n' + soup = self.soup(markup) +- self.assertEqual(u"foo
\n\n", soup.body.decode()) ++ self.assertEqual("foo
\n\n", soup.body.decode()) + self.assertEqual(2, len(soup.find_all('p'))) + + def test_reparented_markup_containing_identical_whitespace_nodes(self): +@@ -127,7 +127,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): + def test_foster_parenting(self): + markup = b"""A"""
+ soup = self.soup(markup)
+- self.assertEqual(u"Aéé" ++ data = "éé" + soup = self.soup(data) +- self.assertEqual(u"éé", soup.h1.string) ++ self.assertEqual("éé", soup.h1.string) + + def test_embedded_null(self): +- data = u"foo\0bar" ++ data = "foo\0bar" + soup = self.soup(data) +- self.assertEqual(u"foo\0bar", soup.h1.string) ++ self.assertEqual("foo\0bar", soup.h1.string) + + def test_exclude_encodings(self): +- utf8_data = u"Räksmörgås".encode("utf-8") ++ utf8_data = "Räksmörgås".encode("utf-8") + soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual("windows-1252", soup.original_encoding) + +@@ -127,7 +127,7 @@ class TestConstructor(SoupTest): + yield markup, None, None, False + + import re +- self.assertRaisesRegexp( ++ self.assertRaisesRegex( + ParserRejectedMarkup, + "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.", + BeautifulSoup, '', builder=Mock, +@@ -303,7 +303,7 @@ class TestWarnings(SoupTest): + with warnings.catch_warnings(record=True) as warning_list: + # note - this url must differ from the bytes one otherwise + # python's warnings system swallows the second warning +- soup = self.soup(u"http://www.crummyunicode.com/") ++ soup = self.soup("http://www.crummyunicode.com/") + warning = self._assert_warning( + warning_list, MarkupResemblesLocatorWarning + ) +@@ -319,7 +319,7 @@ class TestWarnings(SoupTest): + + def test_url_warning_with_unicode_and_space(self): + with warnings.catch_warnings(record=True) as warning_list: +- soup = self.soup(u"http://www.crummyuncode.com/ is great") ++ soup = self.soup("http://www.crummyuncode.com/ is great") + self.assertFalse(any("looks like a URL" in str(w.message) + for w in warning_list)) + +@@ -341,9 +341,9 @@ class TestEntitySubstitution(unittest.TestCase): + def test_simple_html_substitution(self): + # Unicode characters corresponding to named HTML entites + # are substituted, and no others. +- s = u"foo\u2200\N{SNOWMAN}\u00f5bar" ++ s = "foo\u2200\N{SNOWMAN}\u00f5bar" + self.assertEqual(self.sub.substitute_html(s), +- u"foo∀\N{SNOWMAN}õbar") ++ "foo∀\N{SNOWMAN}õbar") + + def test_smart_quote_substitution(self): + # MS smart quotes are a common source of frustration, so we +@@ -408,7 +408,7 @@ class TestEncodingConversion(SoupTest): + + def setUp(self): + super(TestEncodingConversion, self).setUp() +- self.unicode_data = u'Räksmörgås') +- self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås') ++ soup = self.soup('Räksmörgås') ++ self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås') + + def test_unicode_attribute_find(self): +- soup = self.soup(u'here it is') ++ soup = self.soup('here it is') + str(soup) +- self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text) ++ self.assertEqual("here it is", soup.find(id='Räksmörgås').text) + + + def test_find_everything(self): +@@ -101,17 +101,17 @@ class TestFindAll(TreeTest): + """You can search the tree for text nodes.""" + soup = self.soup("Foobar\xbb") + # Exact match. +- self.assertEqual(soup.find_all(string="bar"), [u"bar"]) +- self.assertEqual(soup.find_all(text="bar"), [u"bar"]) ++ self.assertEqual(soup.find_all(string="bar"), ["bar"]) ++ self.assertEqual(soup.find_all(text="bar"), ["bar"]) + # Match any of a number of strings. + self.assertEqual( +- soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) ++ soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"]) + # Match a regular expression. + self.assertEqual(soup.find_all(text=re.compile('.*')), +- [u"Foo", u"bar", u'\xbb']) ++ ["Foo", "bar", '\xbb']) + # Match anything. + self.assertEqual(soup.find_all(text=True), +- [u"Foo", u"bar", u'\xbb']) ++ ["Foo", "bar", '\xbb']) + + def test_find_all_limit(self): + """You can limit the number of items returned by find_all.""" +@@ -254,8 +254,8 @@ class TestFindAllByAttribute(TreeTest): + ["Matching a.", "Matching b."]) + + def test_find_all_by_utf8_attribute_value(self): +- peace = u"םולש".encode("utf8") +- data = u''.encode("utf8") ++ peace = "םולש".encode("utf8") ++ data = ''.encode("utf8") + soup = self.soup(data) + self.assertEqual([soup.a], soup.find_all(title=peace)) + self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) +@@ -444,7 +444,7 @@ class TestSmooth(TreeTest): + # output. + + # Since the tag has two children, its .string is None. +- self.assertEquals(None, div.span.string) ++ self.assertEqual(None, div.span.string) + + self.assertEqual(7, len(div.contents)) + div.smooth() +@@ -755,18 +755,18 @@ class TestTag(SoupTest): + + # No list of whitespace-preserving tags -> pretty-print + tag._preserve_whitespace_tags = None +- self.assertEquals(True, tag._should_pretty_print(0)) ++ self.assertEqual(True, tag._should_pretty_print(0)) + + # List exists but tag is not on the list -> pretty-print + tag.preserve_whitespace_tags = ["some_other_tag"] +- self.assertEquals(True, tag._should_pretty_print(1)) ++ self.assertEqual(True, tag._should_pretty_print(1)) + + # Indent level is None -> don't pretty-print +- self.assertEquals(False, tag._should_pretty_print(None)) ++ self.assertEqual(False, tag._should_pretty_print(None)) + + # Tag is on the whitespace-preserving list -> don't pretty-print + tag.preserve_whitespace_tags = ["some_other_tag", "a_tag"] +- self.assertEquals(False, tag._should_pretty_print(1)) ++ self.assertEqual(False, tag._should_pretty_print(1)) + + + class TestTagCreation(SoupTest): +@@ -905,10 +905,10 @@ class TestTreeModification(SoupTest): + assert not isinstance(i, BeautifulSoup) + + p1, p2, p3, p4 = list(soup.children) +- self.assertEquals("And now, a word:", p1.string) +- self.assertEquals("p2", p2.string) +- self.assertEquals("p3", p3.string) +- self.assertEquals("And we're back.", p4.string) ++ self.assertEqual("And now, a word:", p1.string) ++ self.assertEqual("p2", p2.string) ++ self.assertEqual("p3", p3.string) ++ self.assertEqual("And we're back.", p4.string) + + + def test_replace_with_maintains_next_element_throughout(self): +@@ -1015,8 +1015,8 @@ class TestTreeModification(SoupTest): + d1 = soup.find('div', id='d1') + d2 = soup.find('div', id='d2') + d2.extend(d1) +- self.assertEqual(u'', d1.decode()) +- self.assertEqual(u'', d2.decode()) ++ self.assertEqual('', d1.decode()) ++ self.assertEqual('', d2.decode()) + + def test_move_tag_to_beginning_of_parent(self): + data = "', 'html.parser') + encoding = soup.original_encoding + copy = soup.__copy__() +- self.assertEqual(u" ", unicode(copy)) ++ self.assertEqual(" ", str(copy)) + self.assertEqual(encoding, copy.original_encoding) + + def test_copy_preserves_builder_information(self): +@@ -1554,14 +1554,14 @@ class TestPersistence(SoupTest): + + def test_unicode_pickle(self): + # A tree containing Unicode characters can be pickled. +- html = u"\N{SNOWMAN}" ++ html = "\N{SNOWMAN}" + soup = self.soup(html) + dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.decode(), soup.decode()) + + def test_copy_navigablestring_is_not_attached_to_tree(self): +- html = u"FooBar" ++ html = "FooBar" + soup = self.soup(html) + s1 = soup.find(string="Foo") + s2 = copy.copy(s1) +@@ -1573,7 +1573,7 @@ class TestPersistence(SoupTest): + self.assertEqual(None, s2.previous_element) + + def test_copy_navigablestring_subclass_has_same_type(self): +- html = u"" ++ html = "" + soup = self.soup(html) + s1 = soup.string + s2 = copy.copy(s1) +@@ -1581,19 +1581,19 @@ class TestPersistence(SoupTest): + self.assertTrue(isinstance(s2, Comment)) + + def test_copy_entire_soup(self): +- html = u"end" ++ html = "end" + soup = self.soup(html) + soup_copy = copy.copy(soup) + self.assertEqual(soup, soup_copy) + + def test_copy_tag_copies_contents(self): +- html = u"end" ++ html = "end" + soup = self.soup(html) + div = soup.div + div_copy = copy.copy(div) + + # The two tags look the same, and evaluate to equal. +- self.assertEqual(unicode(div), unicode(div_copy)) ++ self.assertEqual(str(div), str(div_copy)) + self.assertEqual(div, div_copy) + + # But they're not the same object. +@@ -1609,17 +1609,17 @@ class TestPersistence(SoupTest): + class TestSubstitutions(SoupTest): + + def test_default_formatter_is_minimal(self): +- markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ++ markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + self.assertEqual( + decoded, + self.document_for( +- u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) ++ "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) + + def test_formatter_html(self): +- markup = u" <<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ++ markup = " <<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html") + self.assertEqual( +@@ -1627,7 +1627,7 @@ class TestSubstitutions(SoupTest): + self.document_for(" <<Sacré bleu!>>")) + + def test_formatter_html5(self): +- markup = u" <<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ++ markup = " <<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html5") + self.assertEqual( +@@ -1635,49 +1635,49 @@ class TestSubstitutions(SoupTest): + self.document_for(" <<Sacré bleu!>>")) + + def test_formatter_minimal(self): +- markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ++ markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + self.assertEqual( + decoded, + self.document_for( +- u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) ++ "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) + + def test_formatter_null(self): +- markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ++ markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter=None) + # Neither the angle brackets nor the e-with-acute are converted. + # This is not valid HTML, but it's what the user wanted. + self.assertEqual(decoded, +- self.document_for(u"< " ++ markup = "<foo>bar " + soup = self.soup(markup) + decoded = soup.decode(formatter = lambda x: x.upper()) + # Instead of normal entity conversion code, the custom + # callable is called on every string. + self.assertEqual( + decoded, +- self.document_for(u" ")) ++ self.document_for(" ")) + + def test_formatter_is_run_on_attribute_values(self): +- markup = u'e' ++ markup = 'e' + soup = self.soup(markup) + a = soup.a + +- expect_minimal = u'e' ++ expect_minimal = 'e' + + self.assertEqual(expect_minimal, a.decode()) + self.assertEqual(expect_minimal, a.decode(formatter="minimal")) + +- expect_html = u'e' ++ expect_html = 'e' + self.assertEqual(expect_html, a.decode(formatter="html")) + + self.assertEqual(markup, a.decode(formatter=None)) +- expect_upper = u'E' ++ expect_upper = 'E' + self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) + + def test_formatter_skips_script_tag_for_html_documents(self): +@@ -1703,7 +1703,7 @@ class TestSubstitutions(SoupTest): + # Everything outside the tag is reformatted, but everything + # inside is left alone. + self.assertEqual( +- u' |