import requests
import lxml.etree as etree
import time
# Doulists to crawl; append additional doulist URLs here.
urls_all = [
    "https://www.douban.com/doulist/40421457/?dt_platform=mobile_qq&dt_dapp=1",
]
doulist_name = ["涂绘学院"]
# Request headers; swap in another browser's User-Agent string if desired.
# NOTE(review): name is misspelled ("hearders") but is referenced elsewhere
# in this file, so it is kept as-is.
hearders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}
books_all = []
def paqushuju(url):
    """Scrape one page of a Douban doulist.

    For each book on the page, read the title, author, publisher and
    publish date from the list page, then fetch the book's detail page
    to extract its ISBN.

    Parameters:
        url: the doulist page URL to scrape.

    Returns:
        (books, next_url): ``books`` is a list of dicts with keys
        ``book_name``, ``author``, ``publisher``, ``publish_time`` and
        ``isbn`` (missing fields become the string ``'None'``);
        ``next_url`` is the next page's URL, or ``None`` on the last page.
    """
    books = []
    response = requests.get(url, headers=hearders)
    html = etree.HTML(response.text)
    # Each book lives in a <div class="doulist-item">.
    for book in html.xpath('//div[@class="doulist-item"]'):
        titles = book.xpath('.//div[@class="title"]/a/text()')
        book_name = titles[0].strip() if titles else 'None'
        # The abstract block holds lines like "作者: ...", "出版社: ...",
        # "出版年: ..." in order; the leading label is sliced off below.
        # Hoisted: the original evaluated this same xpath four times.
        abstract = book.xpath('.//div[@class="abstract"]/text()')
        author = abstract[0].strip()[4:] if abstract else 'None'
        publisher = abstract[1].strip()[5:] if len(abstract) >= 2 else 'None'
        publish_time = abstract[2].strip()[5:] if len(abstract) == 3 else 'None'
        # The ISBN only appears on the book's detail page.
        isbn = 'None'
        hrefs = book.xpath('.//div[@class="title"]/a/@href')
        # Guard: the original indexed [0] unconditionally and raised
        # IndexError whenever a book item had no title link.
        if hrefs:
            time.sleep(15)  # throttle requests to avoid being blocked
            book_response = requests.get(hrefs[0], headers=hearders)
            book_html = etree.HTML(book_response.text)
            # Scan the <span class="pl"> field labels inside #info for "ISBN:".
            for span in book_html.xpath('//*[@id="info"]/span[@class="pl"]'):
                if span.xpath('./text()')[0].strip() == 'ISBN:':
                    isbn = span.xpath('./following-sibling::text()')[0].strip()
        books.append({
            'book_name': book_name,
            'author': author,
            'publisher': publisher,
            'publish_time': publish_time,
            'isbn': isbn,
        })
    # Follow the "next page" link if one exists.
    next_page = html.xpath('//span[@class="next"]/a/@href')
    return books, (next_page[0] if next_page else None)
# 创建一个打开books.csv文件,并在第一行写入表头的函数
def create_csv(csv_name):
    """Create ``<csv_name>.csv`` (UTF-8 with BOM) containing only the header row."""
    header_row = '书名,作者,出版社,出版时间,ISBN号' + '\n'
    with open(f'{csv_name}.csv', 'w', encoding='utf-8-sig') as f:
        f.write(header_row)
# Driver: crawl every configured doulist and dump each one's books to CSV.
for url, csv_name in zip(urls_all, doulist_name):
    # Write the header row first; data rows are appended below.
    create_csv(csv_name)
    books, next_url = paqushuju(url)
    # Keep following the "next page" link until the last page.
    while next_url is not None:
        books_next, next_url = paqushuju(next_url)
        books.extend(books_next)
    # BUGFIX: the original appended rows to f'{csv_name}1.csv', so the
    # header (written to f'{csv_name}.csv') and the data ended up in two
    # different files.  Append to the same file the header went to.
    # NOTE(review): fields containing commas are not quoted; switch to the
    # csv module if titles/authors may contain commas.
    with open(f'{csv_name}.csv', 'a', encoding='utf-8-sig') as f:
        for book in books:
            row = ','.join([book['book_name'], book['author'],
                            book['publisher'], book['publish_time'],
                            book['isbn']])
            f.write(row + '\n')
# python实现爬取豆瓣豆列中图书的作者,出版社,出版日期,ISBN号
# 标题:python实现爬取豆瓣豆列中图书的作者,出版社,出版日期,ISBN号
# 作者:abandon
# 地址:https://www.songsci.com/articles/2023/03/29/1680064712161.html