menu arrow_back 湛蓝安全空间 |狂野湛蓝,暴躁每天 chevron_right ... chevron_right builder chevron_right _htmlparser.py
  • home 首页
  • brightness_4 暗黑模式
  • cloud
    xLIYhHS7e34ez7Ma
    cloud
    湛蓝安全
    code
    Github
    _htmlparser.py
    8.63 KB / 2021-07-04 19:32:24
        """Use the HTMLParser library to parse HTML files that aren't too bad."""
    
    __all__ = [
        'HTMLParserTreeBuilder',
        ]
    
    from HTMLParser import (
        HTMLParser,
        HTMLParseError,
        )
    import sys
    import warnings
    
    # The HTMLParser constructor accepts a 'strict' argument from Python
    # 3.2 onwards, and we would like to pass strict=False. Because of
    # http://bugs.python.org/issue13273, strict=True is the better choice
    # on releases older than Python 3.2.3.
    #
    # The monkeypatch at the end of this file makes strict=True work well
    # on Python 3.2.2.
    major, minor, release = sys.version_info[:3]
    # True on Python 3.2.3 and on every later release.
    CONSTRUCTOR_TAKES_STRICT = (major, minor, release) >= (3, 2, 3)
    
    from bs4.element import (
        CData,
        Comment,
        Declaration,
        Doctype,
        ProcessingInstruction,
        )
    from bs4.dammit import EntitySubstitution, UnicodeDammit
    
    from bs4.builder import (
        HTML,
        HTMLTreeBuilder,
        STRICT,
        )
    
    
    HTMLPARSER = 'html.parser'
    
    class BeautifulSoupHTMLParser(HTMLParser):
        """HTMLParser subclass that forwards every parse event to ``self.soup``.

        ``soup`` is not assigned anywhere in this class; it must be set on
        the instance before markup is fed (HTMLParserTreeBuilder.feed does
        this).

        Comments, declarations and processing instructions all use the same
        three-step sequence on the soup: ``endData()``, ``handle_data(text)``,
        ``endData(NodeClass)``. The first call presumably flushes text
        collected so far -- confirm against the soup object's implementation.
        """

        def handle_starttag(self, name, attrs):
            """Report a start tag and its attributes to the soup.

            :param name: Tag name.
            :param attrs: Iterable of (attribute name, value) pairs. A value
             of None is stored as the empty string. When a name occurs more
             than once, the last value wins.
            """
            # XXX namespace
            attr_dict = {}
            for key, value in attrs:
                # Change None attribute values to the empty string
                # for consistency with the other tree builders.
                attr_dict[key] = '' if value is None else value
            self.soup.handle_starttag(name, None, None, attr_dict)

        def handle_endtag(self, name):
            """Report an end tag to the soup."""
            self.soup.handle_endtag(name)

        def handle_data(self, data):
            """Pass character data to the soup."""
            self.soup.handle_data(data)

        def handle_charref(self, name):
            """Turn a numeric character reference into text and emit it as data.

            :param name: The reference body: decimal digits, or 'x'/'X'
             followed by hexadecimal digits.
            """
            # XXX workaround for a bug in HTMLParser. Remove this once
            # it's fixed.
            if name.startswith(('x', 'X')):
                real_name = int(name.lstrip('xX'), 16)
            else:
                real_name = int(name)

            try:
                data = unichr(real_name)
            except (ValueError, OverflowError):
                # unichr() rejected the number; substitute U+FFFD.
                data = u"\N{REPLACEMENT CHARACTER}"

            self.handle_data(data)

        def handle_entityref(self, name):
            """Emit the character for a named entity as data.

            Names missing from EntitySubstitution.HTML_ENTITY_TO_CHARACTER
            are emitted as the literal text "&name;".
            """
            character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
            if character is None:
                character = "&%s;" % name
            self.handle_data(character)

        def handle_comment(self, data):
            """Hand the comment text to the soup, closed out as a Comment."""
            self.soup.endData()
            self.soup.handle_data(data)
            self.soup.endData(Comment)

        @staticmethod
        def _strip_doctype_keyword(data):
            """Return *data* without its leading DOCTYPE keyword.

            The keyword is matched case-insensitively, because the base
            parser routes declarations here based on a lower-cased name, so
            'doctype html' arrives just like 'DOCTYPE html'. Both give
            'html'. A bare keyword (i.e. "<!DOCTYPE>") gives ''. Any other
            input is returned unchanged.
            """
            keyword = "DOCTYPE"
            # One character more than the keyword, so that the separating
            # space (if any) is part of the comparison.
            head = data[:len(keyword) + 1].upper()
            if head == keyword + " ":
                return data[len(keyword) + 1:]
            if head == keyword:
                # Only possible when data is exactly the keyword.
                return ''
            return data

        def handle_decl(self, data):
            """Hand a doctype declaration to the soup, closed out as a Doctype.

            :param data: Declaration text, normally starting with the DOCTYPE
             keyword (in any letter case), which is removed.
            """
            self.soup.endData()
            self.soup.handle_data(self._strip_doctype_keyword(data))
            self.soup.endData(Doctype)

        def unknown_decl(self, data):
            """Hand any other declaration to the soup.

            Text starting with 'CDATA[' (case-insensitive) loses that prefix
            and is closed out as CData; everything else is closed out as a
            Declaration with its text unchanged.
            """
            if data.upper().startswith('CDATA['):
                cls = CData
                data = data[len('CDATA['):]
            else:
                cls = Declaration
            self.soup.endData()
            self.soup.handle_data(data)
            self.soup.endData(cls)

        def handle_pi(self, data):
            """Hand a processing instruction to the soup."""
            self.soup.endData()
            if data.endswith("?") and data.lower().startswith("xml"):
                # "An XHTML processing instruction using the trailing '?'
                # will cause the '?' to be included in data." - HTMLParser
                # docs.
                #
                # Strip the question mark so we don't end up with two
                # question marks.
                data = data[:-1]
            self.soup.handle_data(data)
            self.soup.endData(ProcessingInstruction)
    
    
    class HTMLParserTreeBuilder(HTMLTreeBuilder):
        """Tree builder that parses markup with BeautifulSoupHTMLParser."""

        is_xml = False
        features = [HTML, STRICT, HTMLPARSER]

        def __init__(self, *args, **kwargs):
            """Store the arguments used to construct each parser.

            Positional and keyword arguments are kept as given and passed to
            the BeautifulSoupHTMLParser constructor on every feed() call.
            When CONSTRUCTOR_TAKES_STRICT is set, ``strict=False`` is forced
            into the keyword arguments.
            """
            if CONSTRUCTOR_TAKES_STRICT:
                kwargs['strict'] = False
            self.parser_args = (args, kwargs)

        def prepare_markup(self, markup, user_specified_encoding=None,
                           document_declared_encoding=None):
            """Yield the markup in the form to hand to feed().

            This is a generator that yields exactly once. Unicode input is
            yielded unchanged as (markup, None, None, False). Any other input
            goes through UnicodeDammit, which is given
            [user_specified_encoding, document_declared_encoding] as the
            encodings to try.

            :return: A generator yielding one 4-tuple (markup, original
             encoding, encoding declared within markup, whether any
             characters had to be replaced with REPLACEMENT CHARACTER).
            """
            if isinstance(markup, unicode):
                yield (markup, None, None, False)
                return

            try_encodings = [user_specified_encoding, document_declared_encoding]
            dammit = UnicodeDammit(markup, try_encodings, is_html=True)
            yield (dammit.markup, dammit.original_encoding,
                   dammit.declared_html_encoding,
                   dammit.contains_replacement_characters)

        def feed(self, markup):
            """Parse markup with a fresh BeautifulSoupHTMLParser.

            ``self.soup`` is read here but not assigned in this class; it
            must already be set by the caller.

            :raises HTMLParseError: Re-raised unchanged, after a
             RuntimeWarning recommending an external parser has been issued.
            """
            args, kwargs = self.parser_args
            parser = BeautifulSoupHTMLParser(*args, **kwargs)
            parser.soup = self.soup
            # NOTE(review): parser.close() is never called, so input that
            # HTMLParser is still buffering at the end of the markup may not
            # be processed -- confirm whether that is intended.
            try:
                parser.feed(markup)
            except HTMLParseError:
                warnings.warn(RuntimeWarning(
                    "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
                # A bare raise keeps the original traceback; 'raise e' would
                # restart it at this line on Python 2.
                raise
    
    # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
    # 3.2.3 code. This ensures they don't treat markup like <p></p> as a
    # string.
    #
    # The guard below is true only on Python 3.2.0 - 3.2.2: on 3.2.3 and
    # later CONSTRUCTOR_TAKES_STRICT is already True.
    #
    # XXX This code can be removed once most Python 3 users are on 3.2.3.
    if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
        import re
        # Attribute matcher used by parse_starttag() below whenever the
        # parser is not in strict mode.
        attrfind_tolerant = re.compile(
            r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
            r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
        # NOTE(review): this is stored on the tree builder, whereas the next
        # regex is stored on the parser class. parse_starttag() below reads
        # the enclosing-scope name directly, not this attribute.
        HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

        # Start-tag locator installed on the parser class.
        locatestarttagend = re.compile(r"""
      <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
      (?:\s+                             # whitespace before attribute name
        (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
          (?:\s*=\s*                     # value indicator
            (?:'[^']*'                   # LITA-enclosed value
              |\"[^\"]*\"                # LIT-enclosed value
              |[^'\">\s]+                # bare value
             )
           )?
         )
       )*
      \s*                                # trailing whitespace
    """, re.VERBOSE)
        BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

        from html.parser import tagfind, attrfind

        def parse_starttag(self, i):
            """Parse the start tag that begins at self.rawdata[i].

            Dispatches to handle_startendtag() for a tag ending in '/>',
            to handle_starttag() for a normal tag, or to handle_data() with
            the raw tag text when the tag does not end in '>' or '/>'.

            :param i: Index of the tag's opening '<' in self.rawdata.
            :return: The end position reported by
             check_for_whole_start_tag(); returned as-is when negative.
            """
            # This function is defined outside any class body, so the name
            # below is NOT mangled: the attribute is literally called
            # '__starttag_text'.
            # NOTE(review): methods defined inside HTMLParser presumably use
            # the mangled '_HTMLParser__starttag_text' -- confirm whether
            # get_starttag_text() is expected to see the value set here.
            self.__starttag_text = None
            endpos = self.check_for_whole_start_tag(i)
            if endpos < 0:
                return endpos
            rawdata = self.rawdata
            self.__starttag_text = rawdata[i:endpos]

            # Now parse the data between i+1 and j into a tag and attrs
            attrs = []
            match = tagfind.match(rawdata, i+1)
            assert match, 'unexpected call to parse_starttag()'
            k = match.end()
            self.lasttag = tag = rawdata[i+1:k].lower()
            # Consume attributes one at a time until none matches or the end
            # of the tag is reached.
            while k < endpos:
                if self.strict:
                    m = attrfind.match(rawdata, k)
                else:
                    m = attrfind_tolerant.match(rawdata, k)
                if not m:
                    break
                attrname, rest, attrvalue = m.group(1, 2, 3)
                if not rest:
                    # Attribute without '=value'.
                    attrvalue = None
                elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                     attrvalue[:1] == '"' == attrvalue[-1:]:
                    # Drop the matching pair of surrounding quotes.
                    attrvalue = attrvalue[1:-1]
                if attrvalue:
                    attrvalue = self.unescape(attrvalue)
                attrs.append((attrname.lower(), attrvalue))
                k = m.end()

            end = rawdata[k:endpos].strip()
            if end not in (">", "/>"):
                # NOTE(review): lineno and offset are computed here but are
                # not used anywhere later in this function.
                lineno, offset = self.getpos()
                if "\n" in self.__starttag_text:
                    lineno = lineno + self.__starttag_text.count("\n")
                    offset = len(self.__starttag_text) \
                             - self.__starttag_text.rfind("\n")
                else:
                    offset = offset + len(self.__starttag_text)
                if self.strict:
                    self.error("junk characters in start tag: %r"
                               % (rawdata[k:endpos][:20],))
                # Pass the raw text of the malformed tag through as data.
                self.handle_data(rawdata[i:endpos])
                return endpos
            if end.endswith('/>'):
                # XHTML-style empty tag: <span attr="value" />
                self.handle_startendtag(tag, attrs)
            else:
                self.handle_starttag(tag, attrs)
                if tag in self.CDATA_CONTENT_ELEMENTS:
                    self.set_cdata_mode(tag)
            return endpos

        def set_cdata_mode(self, elem):
            """Record elem (lower-cased) as the current CDATA element.

            Also sets self.interesting to a case-insensitive regex matching
            that element's closing tag.
            """
            self.cdata_elem = elem.lower()
            self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

        BeautifulSoupHTMLParser.parse_starttag = parse_starttag
        BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

        # With the patches installed, flip the flag so that
        # HTMLParserTreeBuilder.__init__ passes strict=False from now on.
        CONSTRUCTOR_TAKES_STRICT = True
    
    
    links
    file_download