Web Novel Scrapers
Site-specific scrapers, each written for a particular website. For research and learning purposes only.
爱下电子书 (ixdzs8.com)
# ixdzs8NovelDowner.py
"""
ixdzs8.com小说下载脚本
run example : python .\ixdzs8NovelDowner.py -BookUrl https://ixdzs8.com/read/293674/ -o 迪迦的传说.txt
命令行参数:
-BookUrl (str): 书籍详情页的网址
-o (str) : 输出txt文件名
-i (int): 从第i章节开始下载
"""
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import argparse
if __name__ == '__main__':
    # Build the command-line parser
    parser = argparse.ArgumentParser()
    parser.add_argument("-BookUrl", type=str, required=True, help="Book URL on 爱下电子书, e.g. https://ixdzs8.com/read/544947/")
    parser.add_argument("-o", type=str, help="Output txt file name")
    parser.add_argument("-i", type=int, help="Start downloading from chapter i")
    # Parse the command-line arguments
    args = parser.parse_args()
    # Request headers: a desktop browser User-Agent so the site serves normal pages
    agent = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
    # Output file name
    txtfile = args.o if args.o else '小说.txt'
    # HTTP OK status code
    StatusCodeOK = 200
    # Open the output file and build the URL of the first chapter to fetch
    fp = open(txtfile, 'w', encoding='utf-8')
    i = args.i if args.i else 1
    url = args.BookUrl.rstrip('/') + f'/p{i}.html'
    # Download loop
    while True:
        # Progress report every 100 chapters
        if i % 100 == 0:
            print(f"Downloading chapter {i}...")
response = requests.get(url, headers=agent)
if response.status_code == StatusCodeOK:
soup = BeautifulSoup(response.text, 'html.parser')
            # Chapter title
            chapter_name_tag = soup.find('h1', class_='page-d-name')
            chapter_name = chapter_name_tag.text.strip() if chapter_name_tag else f"第{i}章"
            # Chapter body: join all non-empty <p> paragraphs on the page
            content_tags = soup.find_all('p')
            content_text = '\n'.join([p.text.strip() for p in content_tags if p.text.strip()])
            # Append to the txt file
            fp.write(chapter_name + '\n\n')
            fp.write(content_text + '\n\n')
            # Find the "next chapter" link
            next_button = soup.find('a', class_='chapter-paging chapter-next')
            if next_button and 'href' in next_button.attrs:
                url = 'https://ixdzs8.com' + next_button['href']
                # The final chapter links to an ".../end" page: stop there
                if 'end' in url:
                    break
            else:
                break
        else:
            print(f"Failed to fetch {url}, status code: {response.status_code}")
            break
i += 1
fp.close()
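The script fetches pages back-to-back with no delay and gives up on the first failed request. Below is a minimal sketch of a politer fetch helper that the bare requests.get call could be swapped for; the retry count, delay, and timeout values are arbitrary assumptions, not anything the site requires:
import time
import requests

def polite_get(url, headers=None, retries=3, delay=1.0):
    """Fetch url, sleeping `delay` seconds after each failed attempt;
    returns the Response on HTTP 200, or None once retries are exhausted."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass  # network error: fall through and retry
        time.sleep(delay)
    return None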
笔趣阁(22biqu)
# 22biquNovelDwner.py
"""
这是一个下载www.22biquge.com小说的爬虫脚本。脚本通过命令行使用,命令行参数使用argsparse库解析;
命令行参数如下:
-ChapterUrl (str): 小说起始下载章节的网址
-o (str) : 输出txt文件名
-n (int) : 下载的章节数
for example:
python ./22biqugeNovelDwner.py -ChapterUrl https://www.22biqu.com/biqu61686/31180568.html -o 斗罗之双枪绝世.txt
"""
import argparse
import requests
from bs4 import BeautifulSoup
def get_chapter(url):
    """Fetch one page: return (title, body text, next URL, chapter_end).
    chapter_end is True when this page is the last page of a chapter."""
    resp = requests.get(url)
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Chapter title
    title = soup.find('h1', class_='title').get_text(strip=True)
    # Body text
    content_div = soup.find('div', id='content')
    paras = [p.get_text(strip=True) for p in content_div.find_all('p')]
    content = '\n'.join(paras)
    # Next-page link: "下一页" continues the current chapter,
    # "下一章" starts a new one, "没有了" marks the end of the book
    next_a = soup.find('a', id='next_url')
    if not next_a or '没有了' in next_a.get_text():
        # End of the book: an empty next URL stops the main loop
        return title, content, '', True
    next_url = next_a['href']
    if not next_url.startswith('http'):
        next_url = 'https://www.22biqu.com' + next_url
    chapter_end = '下一页' not in next_a.get_text()
    return title, content, next_url, chapter_end
def main():
    parser = argparse.ArgumentParser(description='Download novel chapters from www.22biqu.com')
    parser.add_argument('-ChapterUrl', type=str, required=True, help='URL of the first chapter to download')
    parser.add_argument('-o', type=str, required=True, help='Output txt file name')
    parser.add_argument('-n', type=int, default=9999, help='Number of chapters to download')
    args = parser.parse_args()
    url = args.ChapterUrl
    out_file = args.o
    max_chapters = args.n
    # Append mode, so an interrupted download can be resumed by
    # re-running with the URL where it stopped
    with open(out_file, 'a', encoding='utf-8') as f:
        chapter_count = 0
        at_chapter_start = True
        while url and chapter_count < max_chapters:
            title, content, next_url, chapter_end = get_chapter(url)
            # Write the title once per chapter, unless the page body
            # already repeats it as its first line
            if at_chapter_start and title not in content:
                f.write(title + '\n\n')
            f.write(content + '\n')
            if chapter_end:
                f.write('\n')
                chapter_count += 1
                # Progress report every 50 chapters
                if chapter_count % 50 == 0:
                    print(f'Downloaded {chapter_count} chapters, latest: {title}')
            at_chapter_start = chapter_end
            url = next_url
if __name__ == '__main__':
main()
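Both scripts write the whole novel into a single txt file. If one file per chapter is preferred, a small post-processing sketch along these lines can split the output afterwards; it assumes chapter headings match the common 第...章 pattern, which not every novel follows:
import os
import re

def split_chapters(txt_path, out_dir='chapters'):
    """Split a combined novel txt into numbered per-chapter files,
    cutting before each line that looks like a '第...章' heading."""
    os.makedirs(out_dir, exist_ok=True)
    heading = re.compile(r'^第[0-9零一二三四五六七八九十百千万]+章')
    chunks, current = [], []
    with open(txt_path, encoding='utf-8') as f:
        for line in f:
            if heading.match(line.strip()) and current:
                chunks.append(current)
                current = []
            current.append(line)
    if current:
        chunks.append(current)
    for n, chunk in enumerate(chunks, 1):
        with open(os.path.join(out_dir, f'{n:04d}.txt'), 'w', encoding='utf-8') as out:
            out.writelines(chunk)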