# 今天课堂笔记
"""Download the kgc.cn course-list pages and extract one line per course.

Stage 1 saves every list page into the current directory as
``list-<page>-6-9-9-0.shtml``.  Stage 2 re-reads those saved files, pulls three
fields per course out of the HTML with regular expressions and writes them to
``data-<page>.txt``, one course per line.
"""
import re
import sys
from urllib import request

BASE_URL = "http://www.kgc.cn/coding/"

# The original note states that the listing has 25 pages in total.  range()
# excludes its stop value, so the stop must be TOTAL_PAGES + 1 for the last
# page to be fetched and parsed as well.
TOTAL_PAGES = 25
PAGE_NUMBERS = range(1, TOTAL_PAGES + 1)

# Compiled once instead of on every loop iteration.  Patterns and flags are
# exactly the ones the script has always used.
# NOTE(review): every `.+` is greedy and `.` does not cross newlines, so each
# pattern assumes at most one matching tag per line of HTML - confirm against
# the real page markup.
_FLAGS = re.I | re.M
TITLE_RE = re.compile(
    r'<a href=".+" class="yui3-u course-title-a" target="_blank" alt=".+">(.+)</a>',
    _FLAGS,
)
# Digits inside the "course-pepo" span (presumably a learner count - verify).
PEPO_RE = re.compile(r'<span class="course-pepo">(\d+)</span>', _FLAGS)
# Text inside the "view0-old" span (meaning not evident from this file).
VIEW0_OLD_RE = re.compile(r'<span class="view0-old">(.+)</span>', _FLAGS)


def page_filename(page):
    """Return the remote/local file name of list page number *page*."""
    return 'list-{0}-6-9-9-0.shtml'.format(page)


def download_pages(pages=PAGE_NUMBERS, base_url=BASE_URL):
    """Fetch each list page and save it under its own name in the cwd.

    Args:
        pages: iterable of page numbers to fetch.
        base_url: URL prefix the page file name is appended to.

    Raises:
        urllib.error.URLError: if a page cannot be fetched.
        UnicodeDecodeError: if a page body is not valid UTF-8.
    """
    for page in pages:
        fname = page_filename(page)
        # Context managers close the connection and the file even on error.
        with request.urlopen(base_url + fname) as response:
            print('开始保存:{}'.format(fname))
            html = response.read().decode("utf-8")
        with open(fname, 'w', encoding='utf-8') as saved:
            # write(), not writelines(): writelines() on a str walks it one
            # character at a time.
            saved.write(html)


def extract_courses(html, source="<string>"):
    """Return ``(title, pepo, view0_old)`` string tuples found in *html*.

    The three patterns are matched independently over the whole document and
    paired up by position.  If they do not match the same number of times the
    result is truncated to the shortest list and a warning naming *source* is
    written to stderr, instead of failing with IndexError part-way through.

    Args:
        html: page markup to scan.
        source: label used only in the mismatch warning.

    Returns:
        A list of 3-tuples of strings; empty when nothing matches.
    """
    titles = TITLE_RE.findall(html)
    pepo_values = PEPO_RE.findall(html)
    view0_old_values = VIEW0_OLD_RE.findall(html)
    if not len(titles) == len(pepo_values) == len(view0_old_values):
        print(
            "warning: {0}: field counts differ (title={1}, course-pepo={2}, "
            "view0-old={3}); rows may be misaligned".format(
                source, len(titles), len(pepo_values), len(view0_old_values)
            ),
            file=sys.stderr,
        )
    return list(zip(titles, pepo_values, view0_old_values))


def parse_pages(pages=PAGE_NUMBERS):
    """Parse every saved list page into ``data-<page>.txt`` in the cwd.

    Each output line is ``"<title> <pepo> <view0_old>"``.

    Args:
        pages: iterable of page numbers whose saved files should be parsed.

    Raises:
        FileNotFoundError: if a saved page is missing.
    """
    for page in pages:
        fname = page_filename(page)
        with open(fname, 'r', encoding='UTF-8') as saved, \
                open('data-{0}.txt'.format(page), 'w', encoding='utf-8') as out:
            for title, pepo, view0_old in extract_courses(saved.read(), fname):
                out.write("{0} {1} {2}".format(title, pepo, view0_old))
                out.write('\n')


def main():
    """Run both stages: download all pages, then parse the saved copies."""
    download_pages()
    parse_pages()


if __name__ == "__main__":
    main()