"""Extract list of URLs in a web pageThis program is part of "Dive Into Python", a free Python book forexperienced programmers. Visit http://diveintopython.org/ for thelatest version."""__author__ = "Mark Pilgrim (mark@diveintopython.org)"__version__ = "$Revision: 1.2 $"__date__ = "$Date: 2004/05/05 21:57:19 $"__copyright__ = "Copyright (c) 2001 Mark Pilgrim"__license__ = "Python"
#声明四个字符串,并赋值
from sgmllib import SGMLParser
# SGMLParser (imported above) is the base class of the URL lister below.
class URLLister(SGMLParser):
    """SGML parser that collects the href values of <a> start tags.

    The collected values are stored in the ``urls`` list, in the order in
    which ``start_a`` received them.
    """

    def reset(self):
        """Reset the parser instance and start over with an empty ``urls``."""
        # Let the base class reset its own parsing state first.
        SGMLParser.reset(self)
        # Initialise the list that start_a() appends to.
        self.urls = []

    def start_a(self, attrs):
        """Record the href attribute value(s) of one <a> start tag.

        attrs is an iterable of (name, value) pairs; every value whose
        name is exactly 'href' is appended to ``self.urls``.  Tags without
        an href leave ``self.urls`` unchanged.
        """
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
# URLLister.start_a only adds to urls when the tag actually carries an href.
if __name__ == "__main__":
    # Fetch the Dive Into Python home page and print every link found in it.
    import urllib

    # Open the URL; usock is a file-like object for the response body.
    usock = urllib.urlopen("http://diveintopython.org/")
    try:
        # Create a URLLister instance and feed it everything read from usock.
        parser = URLLister()
        parser.feed(usock.read())
        # Close the parser once the whole page has been fed.
        parser.close()
    finally:
        # Close usock even if reading or parsing raised, so it never leaks.
        usock.close()

    # Print every URL the parser collected, one per line.
    for url in parser.urls:
        print(url)
