0
0
Fork 0
mirror of https://github.com/healthchecks/healthchecks.git synced 2025-04-10 23:40:11 +00:00
healthchecks_healthchecks/hc/lib/html.py
2022-08-22 11:56:02 +03:00

34 lines
832 B
Python

from html.parser import HTMLParser
class TextOnlyParser(HTMLParser):
    """HTML parser that collects text content and ignores selected elements.

    Text found inside any element whose tag name is in ``skiplist``
    (``script`` and ``style`` by default) is dropped. Everything else is
    appended to ``buf`` and can be retrieved, whitespace-normalized, with
    ``get_text()``.

    Attributes:
        active: True while text is being captured, False while the parser
            is inside at least one skipped element.
        buf: The captured text fragments, in document order.
        skiplist: A mutable set of tag names whose content is ignored.
            Callers may add tag names to it before feeding data.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.active = True
        self.buf = []
        self.skiplist = {"script", "style"}
        # Number of currently open skipped elements. Tracking the depth
        # (rather than a single flag) keeps capture disabled until the
        # outermost skipped element is closed.
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        """Stop capturing text when a skipped element is opened."""
        if tag not in self.skiplist:
            return

        self._skip_depth += 1
        self.active = False

    def handle_endtag(self, tag):
        """Resume capturing text once every skipped element is closed."""
        # Ignore stray closing tags that have no matching opening tag,
        # so the depth counter never goes negative.
        if tag not in self.skiplist or not self._skip_depth:
            return

        self._skip_depth -= 1
        self.active = self._skip_depth == 0

    def handle_data(self, data):
        """Store a text fragment unless it is inside a skipped element."""
        if self.active and data:
            self.buf.append(data)

    def get_text(self):
        """Return the captured text with whitespace runs collapsed.

        Fragments are concatenated without a separator, then every run of
        whitespace is replaced by a single space and leading/trailing
        whitespace is removed.
        """
        return " ".join("".join(self.buf).split())
def html2text(html, skip_pre=False):
    """Return the text content of an HTML string.

    The content of ``script`` and ``style`` elements is always dropped.
    Whitespace runs in the result are collapsed to single spaces.

    Args:
        html: The HTML markup to convert.
        skip_pre: When true, the content of ``pre`` elements is dropped too.

    Returns:
        The extracted, whitespace-normalized text.
    """
    parser = TextOnlyParser()
    if skip_pre:
        parser.skiplist.add("pre")

    parser.feed(html)
    # feed() may hold back trailing text that could still turn out to be an
    # unfinished construct (for example input ending in "R&D", where "&D"
    # might be the start of a character reference). close() tells the parser
    # that no more input is coming, so the held-back text is processed.
    parser.close()
    return parser.get_text()