162 lines
5.0 KiB
Diff
162 lines
5.0 KiB
Diff
From 838f53a97ce44ea0f8f4d361afcb62a441f8633f Mon Sep 17 00:00:00 2001
|
|
From: Adam Williamson <awilliam@redhat.com>
|
|
Date: Mon, 26 Jul 2021 13:11:17 -0700
|
|
Subject: [PATCH] Considerably simplify html_util for Python 3.10
|
|
compatibility (#58)
|
|
|
|
As reported in #58 and RHBZ #1972391, `formatter` was removed
|
|
from the Python standard library in Python 3.10. This heavily
|
|
simplifies `html_util.html_to_text()` by using the stdlib
|
|
`HTMLParser` class, which avoids the use of `formatter`.
|
|
|
|
Signed-off-by: Adam Williamson <awilliam@redhat.com>
|
|
---
|
|
src/setroubleshoot/html_util.py | 110 ++++----------------------------
|
|
1 file changed, 12 insertions(+), 98 deletions(-)
|
|
|
|
diff --git a/src/setroubleshoot/html_util.py b/src/setroubleshoot/html_util.py
|
|
index 5c6d07a..095eaeb 100644
|
|
--- a/src/setroubleshoot/html_util.py
|
|
+++ b/src/setroubleshoot/html_util.py
|
|
@@ -28,110 +28,29 @@ __all__ = [
|
|
|
|
import syslog
|
|
import sys
|
|
+import textwrap
|
|
if sys.version_info > (3,):
|
|
import html
|
|
- import html.parser
|
|
import html.entities
|
|
- from io import StringIO
|
|
+ from html.parser import HTMLParser
|
|
else:
|
|
import htmllib
|
|
- from StringIO import StringIO
|
|
-import formatter as Formatter
|
|
+ from HTMLParser import HTMLParser
|
|
import string
|
|
from types import *
|
|
|
|
#------------------------------------------------------------------------------
|
|
|
|
+class HTMLFilter(HTMLParser):
|
|
+ def __init__(self):
|
|
+ HTMLParser.__init__(self)
|
|
+ self.text = ""
|
|
|
|
-class TextWriter(Formatter.DumbWriter):
|
|
-
|
|
- def __init__(self, file=None, maxcol=80, indent_width=4):
|
|
- Formatter.DumbWriter.__init__(self, file, maxcol)
|
|
- self.indent_level = 0
|
|
- self.indent_width = indent_width
|
|
- self._set_indent()
|
|
-
|
|
- def _set_indent(self):
|
|
- self.indent_col = self.indent_level * self.indent_width
|
|
- self.indent = ' ' * self.indent_col
|
|
-
|
|
- def new_margin(self, margin, level):
|
|
- self.indent_level = level
|
|
- self._set_indent()
|
|
-
|
|
- def send_label_data(self, data):
|
|
- data = data + ' '
|
|
- if len(data) > self.indent_col:
|
|
- self.send_literal_data(data)
|
|
- else:
|
|
- offset = self.indent_col - len(data)
|
|
- self.send_literal_data(' ' * offset + data)
|
|
-
|
|
- def send_flowing_data(self, data):
|
|
- if not data:
|
|
- return
|
|
- atbreak = self.atbreak or data[0] in string.whitespace
|
|
- col = self.col
|
|
- maxcol = self.maxcol
|
|
- write = self.file.write
|
|
- col = self.col
|
|
- if col == 0:
|
|
- write(self.indent)
|
|
- col = self.indent_col
|
|
- for word in data.split():
|
|
- if atbreak:
|
|
- if col + len(word) >= maxcol:
|
|
- write('\n' + self.indent)
|
|
- col = self.indent_col
|
|
- else:
|
|
- write(' ')
|
|
- col = col + 1
|
|
- write(word)
|
|
- col = col + len(word)
|
|
- atbreak = 1
|
|
- self.col = col
|
|
- self.atbreak = data[-1] in string.whitespace
|
|
-
|
|
-if sys.version_info > (3,):
|
|
- class HTMLParserAnchor(html.parser.HTMLParser):
|
|
-
|
|
- def __init__(self, formatter, strict=False, convert_charrefs=False):
|
|
- super(HTMLParserAnchor, self).__init__()
|
|
- self.formatter = formatter
|
|
- self.anchor_href = None
|
|
-
|
|
- def handle_starttag(self, tag, attrs):
|
|
- if tag == 'a':
|
|
- for key, value in attrs:
|
|
- if key == 'href':
|
|
- self.anchor_href = value
|
|
-
|
|
- def handle_endtag(self, tag):
|
|
- if tag == 'a':
|
|
- if self.anchor_href != None:
|
|
- self.formatter.writer.send_flowing_data('(' + self.anchor_href + ')')
|
|
- self.anchor_href = None
|
|
-
|
|
- def handle_data(self, data):
|
|
- self.formatter.writer.send_flowing_data(data)
|
|
-
|
|
-else:
|
|
- class HTMLParserAnchor(htmllib.HTMLParser):
|
|
-
|
|
- def __init__(self, formatter, verbose=0):
|
|
- htmllib.HTMLParser.__init__(self, formatter, verbose)
|
|
-
|
|
- def anchor_bgn(self, href, name, type):
|
|
- self.anchor = href
|
|
-
|
|
- def anchor_end(self):
|
|
- if self.anchor:
|
|
- self.handle_data(' (%s) ' % self.anchor)
|
|
- self.anchor = None
|
|
+ def handle_data(self, data):
|
|
+ self.text += data
|
|
|
|
#------------------------------------------------------------------------------
|
|
|
|
-
|
|
def escape_html(s):
|
|
if s is None:
|
|
return None
|
|
@@ -161,14 +80,9 @@ def unescape_html(s):
|
|
|
|
def html_to_text(html, maxcol=80):
|
|
try:
|
|
- buffer = StringIO()
|
|
- formatter = Formatter.AbstractFormatter(TextWriter(buffer, maxcol))
|
|
- parser = HTMLParserAnchor(formatter)
|
|
- parser.feed(html)
|
|
- parser.close()
|
|
- text = buffer.getvalue()
|
|
- buffer.close()
|
|
- return text
|
|
+ filter = HTMLFilter()
|
|
+ filter.feed(html)
|
|
+ return textwrap.fill(filter.text, width=maxcol)
|
|
except Exception as e:
|
|
syslog.syslog(syslog.LOG_ERR, 'cannot convert html to text: %s' % e)
|
|
return None
|
|
--
|
|
2.27.0
|
|
|