Adding Quotes to Attribute Values in HTML Documents : Parse HTML « Network « Python Tutorial






import HTMLParser
import urllib
import sys

class parseAttrs(HTMLParser.HTMLParser):
    """Re-emit an HTML document with every attribute value double-quoted.

    Each handler appends the text of the construct it receives to
    ``self.pieces``; ``parsed()`` joins the pieces into the rebuilt
    document.
    """

    def __init__(self, *args, **kwargs):
        """Create the parser with an empty output buffer.

        Any arguments are passed through to ``HTMLParser.HTMLParser``.
        """
        # Explicit base-class call (rather than super()) keeps this working
        # when the base parser is an old-style class.
        HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
        # Guarantee ``pieces`` exists even if init_parser() is never called.
        self.init_parser()

    def init_parser(self):
        """Discard any collected output and start with an empty buffer."""
        self.pieces = []

    def _format_attrs(self, attrs):
        """Return ``attrs`` as tag text, each attribute preceded by a space.

        ``attrs`` is the list of ``(name, value)`` pairs given to the tag
        handlers. A value of ``None`` marks an attribute written without a
        value; it is emitted as the bare name instead of ``name="None"``.
        """
        rendered = []
        for name, value in attrs:
            if value is None:
                rendered.append(" %s" % (name,))
                continue
            # The value is about to be wrapped in double quotes, so a raw
            # '"' would end it early and a raw '&' could start an entity.
            # '&' must be replaced first so the '&' of '&quot;' survives.
            quoted = value.replace("&", "&amp;").replace('"', "&quot;")
            rendered.append(' %s="%s"' % (name, quoted))
        return "".join(rendered)

    def handle_starttag(self, tag, attrs):
        """Append the start tag with all attribute values quoted."""
        self.pieces.append("<%s%s>" % (tag, self._format_attrs(attrs)))

    def handle_startendtag(self, tag, attrs):
        """Append a self-closing tag (e.g. ``<br/>``) as one element.

        Without this override the base class reports such a tag as a start
        tag followed by an end tag, which would emit ``<br></br>``.
        """
        self.pieces.append("<%s%s />" % (tag, self._format_attrs(attrs)))

    def handle_charref(self, name):
        """Append a numeric character reference such as ``&#62;``."""
        self.pieces.append("&#%s;" % (name,))

    def handle_endtag(self, tag):
        """Append the closing tag."""
        self.pieces.append("</%s>" % (tag,))

    def handle_entityref(self, ref):
        """Append a named entity reference such as ``&amp;``."""
        # The terminating ';' is part of the reference and must be restored.
        self.pieces.append("&%s;" % (ref,))

    def handle_data(self, text):
        """Append character data unchanged."""
        self.pieces.append(text)

    def handle_comment(self, text):
        """Append a comment, restoring its ``<!--`` / ``-->`` delimiters."""
        self.pieces.append("<!--%s-->" % (text,))

    def handle_pi(self, text):
        """Append a processing instruction, restoring ``<?`` and ``>``."""
        self.pieces.append("<?%s>" % (text,))

    def handle_decl(self, text):
        """Append a declaration (e.g. a doctype), restoring ``<!`` and ``>``."""
        self.pieces.append("<!%s>" % (text,))

    def parsed(self):
        """Return everything collected so far as a single string."""
        return "".join(self.pieces)

attrParser = parseAttrs()
attrParser.init_parser()
attrParser.feed(urllib.urlopen("test2.html").read())
print open("test2.html").read()
print attrParser.parsed()
attrParser.close()








21.21.Parse HTML
21.21.1.Extract list of URLs in a web page
21.21.2.Opening HTML Documents
21.21.3.Retrieving Links from HTML Documents
21.21.4.Retrieving Images from HTML Documents
21.21.5.Retrieving Text from HTML Documents
21.21.6.Retrieving Cookies in HTML Documents
21.21.7.Adding Quotes to Attribute Values in HTML Documents
21.21.8.Basic HTML Title Retriever
21.21.9.HTML Title Retriever With Entity Support