文章 35
评论 44
浏览 92749
python实现爬取豆瓣豆列中图书的作者,出版社,出版日期,ISBN号

python实现爬取豆瓣豆列中图书的作者,出版社,出版日期,ISBN号

import csv
import time

import requests
import lxml.etree as etree
# Additional doulist URLs can be appended here to crawl more lists.
urls_all = ["https://www.douban.com/doulist/40421457/?dt_platform=mobile_qq&dt_dapp=1"
            ]
# Output CSV base name for each URL above (parallel list, same order).
doulist_name = ["涂绘学院"]
# Swap in another browser's headers if this User-Agent gets blocked.
# NOTE(review): "hearders" is a misspelling of "headers", but the name is
# referenced inside paqushuju(), so it must stay as-is.
hearders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}
# Accumulator for all scraped books — never written to in the visible code;
# presumably a leftover. TODO confirm it can be removed.
books_all = []

def paqushuju(url):
    """Scrape one page of a Douban doulist.

    For every book on the page, collects title, author, publisher,
    publish date and ISBN (the ISBN requires fetching each book's
    detail page).

    Args:
        url: URL of a doulist page.

    Returns:
        A tuple ``(books, next_url)`` where ``books`` is a list of dicts
        with keys book_name/author/publisher/publish_time/isbn (missing
        fields are the string 'None'), and ``next_url`` is the URL of the
        next page, or None when this is the last page.
    """
    books = []
    response = requests.get(url, headers=hearders)
    html = etree.HTML(response.text)
    # Each book entry on the list page is a div with class "doulist-item".
    book_list = html.xpath('//div[@class="doulist-item"]')

    for book in book_list:
        book_item = {}
        # Title may be absent for removed/blocked entries; fall back to 'None'.
        title_texts = book.xpath('.//div[@class="title"]/a/text()')
        book_name = title_texts[0].strip() if title_texts else 'None'

        # The abstract div holds up to three text lines (author, publisher,
        # publish date).  Query it ONCE instead of re-running the same XPath
        # for every field, as the original did.
        abstract = book.xpath('.//div[@class="abstract"]/text()')
        # The slices strip the Chinese field labels (e.g. "作者: ").
        author = abstract[0].strip()[4:] if len(abstract) >= 1 else 'None'
        publisher = abstract[1].strip()[5:] if len(abstract) >= 2 else 'None'
        # Bug fix: the original tested `== 3`, silently dropping the date
        # whenever the abstract had MORE than three lines; `>= 3` keeps the
        # common 3-line behaviour and handles the longer ones.
        publish_time = abstract[2].strip()[5:] if len(abstract) >= 3 else 'None'

        # ISBN only appears on the book's detail page.
        book_url = book.xpath('.//div[@class="title"]/a/@href')[0]
        time.sleep(15)  # be polite: Douban rate-limits aggressive crawlers
        book_response = requests.get(book_url, headers=hearders)
        book_html = etree.HTML(book_response.text)
        # Inside #info, each field label is a <span class="pl">; the value is
        # the text node immediately following the matching label.
        pl_list = book_html.xpath('//*[@id="info"]/span[@class="pl"]')
        isbn = 'None'
        for span in pl_list:
            label_texts = span.xpath('./text()')
            # Guard against empty label spans (the original indexed [0]
            # unconditionally and could raise IndexError).
            if label_texts and label_texts[0].strip() == 'ISBN:':
                isbn = span.xpath('./following-sibling::text()')[0].strip()
        book_item['book_name'] = book_name
        book_item['author'] = author
        book_item['publisher'] = publisher
        book_item['publish_time'] = publish_time
        book_item['isbn'] = isbn
        books.append(book_item)

    # Pagination: the "next" span contains a link unless this is the last page.
    next_page = html.xpath('//span[@class="next"]/a/@href')
    next_url = next_page[0] if next_page else None

    return books, next_url

# Initialize the output file for one doulist: create/truncate it and write
# the header row.
def create_csv(csv_name):
    """Create (or truncate) ``<csv_name>.csv`` and write the CSV header row."""
    header_row = '书名,作者,出版社,出版时间,ISBN号' + '\n'
    # utf-8-sig adds a BOM so Excel detects the encoding correctly.
    with open(f'{csv_name}.csv', 'w', encoding='utf-8-sig') as out:
        out.write(header_row)

# Crawl each configured doulist and write its books to a CSV file.
# zip() pairs each URL with its output name directly — the original used
# urls_all.index(url), which returns the FIRST match and breaks if the same
# URL appears twice in the list.
for url, csv_name in zip(urls_all, doulist_name):
    create_csv(csv_name)
    # Fetch the first page, then follow "next" links until exhausted.
    books, next_url = paqushuju(url)
    while next_url is not None:
        books_next, next_url = paqushuju(next_url)
        books.extend(books_next)
    # Bug fix: append to the SAME file create_csv wrote the header into.
    # The original appended data to f'{csv_name}1.csv', leaving the header
    # alone in f'{csv_name}.csv'.
    # newline='' is required so csv.writer controls line endings itself.
    with open(f'{csv_name}.csv', 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        for book in books:
            # csv.writer quotes fields that contain commas; the original's
            # raw string concatenation silently corrupted such rows.
            writer.writerow([book['book_name'], book['author'],
                             book['publisher'], book['publish_time'],
                             book['isbn']])

image.png


标题:python实现爬取豆瓣豆列中图书的作者,出版社,出版日期,ISBN号
作者:abandon
地址:https://www.songsci.com/articles/2023/03/29/1680064712161.html

Life Is Like A Boat

取消