爬虫基本步骤
扒取数据、处理数据、进一步处理数据
爬虫代码与解析
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def main():
    """Print the headline of every article linked from the Sohu NBA index.

    Downloads ``http://sports.sohu.com/nba_a.shtml``, decodes the body as
    GBK, and selects every anchor matching the CSS selector ``a[test=a]``.
    Each linked page is then fetched and the text of its first ``<h1>`` is
    printed with carriage returns and line feeds removed.

    Anchors without an ``href`` and pages without an ``<h1>`` are skipped
    instead of aborting the whole run.

    Raises:
        requests.RequestException: If a request fails or times out.
        UnicodeDecodeError: If the index page body is not valid GBK.
    """
    index_url = 'http://sports.sohu.com/nba_a.shtml'
    # requests waits indefinitely unless a timeout is given explicitly.
    timeout = 10

    resp = requests.get(index_url, timeout=timeout)
    html = resp.content.decode('gbk')
    bs = BeautifulSoup(html, 'lxml')

    for elem in bs.select('a[test=a]'):
        href = elem.attrs.get('href')
        if not href:
            # An anchor with no target cannot be followed.
            continue

        # Resolve relative and protocol-relative links against the index
        # page; absolute URLs pass through unchanged.
        link_url = urljoin(index_url, href)
        sub_resp = requests.get(link_url, timeout=timeout)
        bs_sub = BeautifulSoup(sub_resp.text, 'lxml')

        heading = bs_sub.select_one('h1')
        if heading is None:
            # select_one returns None when the page has no <h1>.
            continue

        print(re.sub(r'[\r\n]', '', heading.text))
# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()