HTML Title Retriever With Entity Support : Parse HTML « Network « Python Tutorial






from htmlentitydefs import entitydefs
from HTMLParser import HTMLParser
import sys

class TitleParser(HTMLParser):
    """Collect the text found inside ``<title>`` elements of an HTML document.

    Feed markup with ``feed()`` and read the result with ``gettitle()``.
    Named entities and numeric character references reported by the base
    parser through ``handle_entityref``/``handle_charref`` are decoded and
    appended to the title like ordinary text. Text from every ``<title>``
    element encountered is appended to the same string.
    """

    def __init__(self):
        """Start with an empty title and the "inside <title>" flag cleared."""
        self.title = ''
        # Truthy only between a <title> start tag and its end tag.
        self.readingtitle = 0
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Begin collecting text when a ``<title>`` start tag is seen."""
        if tag == 'title':
            self.readingtitle = 1

    def handle_data(self, data):
        """Append ``data`` to the title, but only while inside ``<title>``."""
        if self.readingtitle:
            self.title += data

    def handle_endtag(self, tag):
        """Stop collecting text when the ``</title>`` end tag is seen."""
        if tag == 'title':
            self.readingtitle = 0

    def handle_entityref(self, name):
        """Append the replacement text for the named entity ``&name;``.

        Names missing from ``entitydefs`` are re-emitted verbatim as
        ``&name;`` so that unknown entities are not lost.
        """
        self.handle_data(entitydefs.get(name, '&' + name + ';'))

    def handle_charref(self, name):
        """Append the character for a numeric reference ``&#name;``.

        ``name`` is the text between ``&#`` and ``;``: decimal digits, or
        ``x``/``X`` followed by hexadecimal digits. References that cannot be
        parsed, or whose code point falls outside 1..255, are ignored.
        """
        try:
            if name[:1] in ('x', 'X'):
                # Hexadecimal form, e.g. "x41" for "&#x41;".
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return

        # Only code points 1..255 are accepted; anything else is dropped.
        if charnum < 1 or charnum > 255:
            return

        self.handle_data(chr(charnum))

    def gettitle(self):
        """Return the title text collected so far."""
        return self.title

# Script entry: parse the HTML file named by the first command-line argument
# and print the title text it contains.
if len(sys.argv) < 2:
    sys.exit("usage: %s FILE" % sys.argv[0])

tp = TitleParser()
# The ``with`` block closes the file even if reading or parsing fails.
with open(sys.argv[1]) as fd:
    tp.feed(fd.read())
# A single parenthesised argument prints the same text whether ``print`` is
# the Python 2 statement or the Python 3 function.
print("Title is: " + tp.gettitle())








21.21.Parse HTML
21.21.1.Extract list of URLs in a web page
21.21.2.Opening HTML Documents
21.21.3.Retrieving Links from HTML Documents
21.21.4.Retrieving Images from HTML Documents
21.21.5.Retrieving Text from HTML Documents
21.21.6.Retrieving Cookies in HTML Documents
21.21.7.Adding Quotes to Attribute Values in HTML Documents
21.21.8.Basic HTML Title Retriever
21.21.9.HTML Title Retriever With Entity Support