import re
import urllib
import urllib.error
import urllib.request
from urllib import request
# Helper class that cleans HTML tags out of page content
class Tool:
    """Turn a fragment of post HTML into plain text.

    Each class attribute is a compiled pattern for one clean-up step;
    ``replace`` applies them in a fixed order and strips the result.
    """

    # <img> tags and runs of seven consecutive spaces are dropped.
    # (The original pattern ended with a dangling "|", i.e. an empty
    # alternative that matched the empty string at every position.)
    removeImg = re.compile(r'<img.*?>| {7}')
    # Hyperlink tags are dropped; the link text between them is kept.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that are turned into a newline.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells are turned into a tab.
    replaceTD = re.compile(r'<td>')
    # Paragraph openers are turned into a newline plus indentation.
    replacePara = re.compile(r'<p.*?>')
    # A single or doubled <br> is turned into one newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever tag is still left at the end is dropped.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with HTML tags removed or converted to whitespace.

        Args:
            x: HTML fragment (``str``) to clean.

        Returns:
            The cleaned text with leading and trailing whitespace stripped.
        """
        # Order matters: the specific tags must be rewritten before the
        # catch-all ``removeExtraTag`` deletes every remaining tag.
        steps = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, replacement in steps:
            x = pattern.sub(replacement, x)
        # strip() removes the surplus whitespace at both ends.
        return x.strip()
class BDTB:
    """Download the posts of a Baidu Tieba thread and append them to ``tieba.txt``."""

    def __init__(self, baseUrl, seeLZ, floorTag):
        """Store the thread settings.

        Args:
            baseUrl: URL of the thread, without query string.
            seeLZ: Value sent as the ``see_lz`` query parameter
                (the script prompt uses 1 for "original poster only", 0 otherwise).
            floorTag: ``'1'`` (or ``1``) to write a separator line before each post.
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()
        self.file = None
        # Floor (post) counter, starts at 1.
        self.floor = 1
        # Default title, meant as a fallback when no title could be fetched.
        self.defaultTitle = u"百度贴吧"
        # Flag deciding whether floor separators are written.
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch one page of the thread.

        Args:
            pageNum: Page number sent as the ``pn`` query parameter.

        Returns:
            The page body decoded as UTF-8, or ``None`` when the request
            failed with ``urllib.error.URLError``.
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            req = urllib.request.Request(url)
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(req) as response:
                return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            if hasattr(e, "reason"):
                print(u"连接百度贴吧失败,错误原因", e.reason)
            return None

    def getTitle(self, page):
        """Return the thread title found in ``page``, or ``None`` if absent."""
        pattern = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        # str() lets a failed download (None) fall through to "no match".
        result = pattern.search(str(page))
        return result.group(1).strip() if result else None

    def getPageNum(self, indexPage):
        """Return the page count found in ``indexPage`` as a string.

        Returns ``None`` when ``indexPage`` is ``None`` (download failed)
        or does not contain the expected ``<span class="red">`` element.
        """
        if indexPage is None:
            return None
        pattern = re.compile(r'<span class="red">(.*?)</span>', re.S)
        result = pattern.search(indexPage)
        return result.group(1).strip() if result else None

    def getContent(self, page):
        """Extract the text of every post on ``page``.

        Returns:
            A list with one cleaned string per post, each wrapped in
            newlines; an empty list when ``page`` is ``None``.
        """
        if page is None:
            return []
        pattern = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)
        return ['\n' + self.tool.replace(item) + '\n'
                for item in pattern.findall(page)]

    def write(self, contents):
        """Append ``contents`` to ``tieba.txt`` and advance the floor counter.

        Args:
            contents: Iterable of post texts; an empty or ``None`` value
                writes nothing and does not create the file.
        """
        if not contents:
            return
        # Accept both '1' (from input()) and the integer 1.
        with_separator = str(self.floorTag) == '1'
        # One open per call instead of one per post; explicit UTF-8 so the
        # text does not depend on the platform's default encoding.
        with open('tieba.txt', 'a', encoding='utf-8') as file_object:
            for item in contents:
                if with_separator:
                    # Separator line between floors.
                    floorLine = "\n" + str(self.floor) + u"-----------------------------------------------------------------------------------------\n"
                    file_object.write(floorLine)
                file_object.write(str(item))
                self.floor += 1

    def start(self):
        """Download every page of the thread and write the posts to disk.

        Prints progress messages; returns early when the page count
        cannot be determined from the first page.
        """
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        if pageNum is None:
            print('URL已失效,请重试!!')
            return
        try:
            print("该帖子共有" + str(pageNum) + '页')
            for i in range(1, int(pageNum) + 1):
                print('正在写入第' + str(i) + '页数据')
                page = self.getPage(i)
                contents = self.getContent(page)
                self.write(contents)
        except IOError as e:
            # Python 3 exceptions have no ``.message``; use str(e).
            print("写入异常,原因" + str(e))
        finally:
            print("写入任务完成")
# batb = BDTB(baseURL, 1,1)
# print(batb.getPage(1))
# print(u'请输入帖子网址')
# baseurl = input(u'请输入帖子网址:')
# seeLZ = input("是否只获取楼主发言,是输入1,否输入0\n")
# floorTag = input("是否写入楼层信息,是输入1,否输入0\n")
# bdtb = BDTB(baseurl, seeLZ)
# bdtb.start()
#baseURL = 'https://tieba.baidu.com/p/' + str(input(u'https://tieba.baidu.com/p/'))
# Script entry point: ask for the two options, then scrape the thread.
# Guarded so that importing this module does not prompt or hit the network.
if __name__ == "__main__":
    baseURL = 'https://tieba.baidu.com/p/3138733512'
    # Prompt: only fetch the original poster's posts? 1 = yes, 0 = no.
    seeLZ = input("是否只获取楼主发言,是输入1,否输入0\n")
    # Prompt: write floor separators? 1 = yes, 0 = no.
    floorTag = input("是否写入楼层信息,是输入1,否输入0\n")
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()