Scraping the "Python 100 Examples" exercises and their code | Crawler practice

  • Code
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

url = 'http://www.runoob.com/python/python-100-examples.html'

# Spoof a browser User-Agent to get past basic anti-scraping checks (not really needed here)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}

r = requests.get(url, headers=headers).content.decode('utf-8')
soup = BeautifulSoup(r, 'lxml')

# The index page lists all 100 exercises as <a> tags inside <div id="content"><ul>
re_a = soup.find(id='content').ul.find_all('a')

links = [a['href'] for a in re_a]  # collect the 100 exercise URLs (relative paths)

data = []
for href in links:
    item = {}
    page = requests.get(urljoin(url, href), headers=headers).content.decode('utf-8')
    soup_item = BeautifulSoup(page, 'html.parser')
    content = soup_item.find(id='content')
    item['title'] = content.h1.text                  # follows each exercise page's HTML structure
    item['problem'] = content.find_all('p')[1].text
    item['analyse'] = content.find_all('p')[2].text
    try:
        item['code'] = soup_item.find(class_='hl-main').text  # highlighted code block
    except AttributeError:
        item['code'] = soup_item.find('pre').text             # fall back to the plain <pre> block
    data.append(item)
    with open('100-py.csv', 'a+', encoding='utf-8') as file:  # plain-text dump (see the CSV sketch at the end)
        file.write(item['title'] + '\n')
        file.write(item['problem'] + '\n')
        file.write(item['analyse'] + '\n')
        file.write(item['code'] + '\n')
        file.write('-' * 50 + '\n')
        file.write('\n')
    print(item['title'])
  • Scraped results
    (screenshot of the scraped output file)
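
Note: the loop above appends plain text blocks to a file named 100-py.csv, so the output is not actually CSV. A minimal sketch of writing a proper CSV with the standard library's csv.DictWriter, assuming the data list built in the loop above, could look like this:

import csv

# Sketch only: write the collected dicts as a real CSV, assuming `data`
# holds the title/problem/analyse/code entries gathered by the loop above.
with open('100-py.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'problem', 'analyse', 'code'])
    writer.writeheader()
    writer.writerows(data)

Passing newline='' is the csv module's recommended way to avoid blank lines between rows on Windows.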