Python script: save CSDN articles as PDF/HTML/MD

import requests
from bs4 import BeautifulSoup
import random
import html2text
import os
import re
import pdfkit
import time
import logging
import json
from urllib.parse import urlparse

# User-agent pool: a random user agent is picked for each session so that repeated visits are less likely to be blocked
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
    # ... 其他 user agent ...
]

class CSDNSpider():
    def __init__(self):
        self.session = requests.Session()
        self.headers = {
            'User-Agent': random.choice(USER_AGENT_LIST),
            'Referer': 'https://blog.csdn.net/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cookie': 'uuid_tt_dd=10_28867322640-1634638612434-763342; dc_session_id=10_1634638612434.351143; c_first_ref=www.baidu.com; c_first_page=https%3A//blog.csdn.net/; dc_sid=96044c0a6a9786eff8e99a4b7542f67b'
        }
        self.md_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN爬取\MD'
        self.pdf_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN爬取\PDF'
        self.html_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN爬取\HTML'
        
        # Make sure all output directories exist
        for dir_path in [self.md_dir, self.pdf_dir, self.html_dir]:
            os.makedirs(dir_path, exist_ok=True)

        # Fetch and parse robots.txt
        self.robots_rules = self.read_robots_txt()

    def read_robots_txt(self):
        robots_url = "https://blog.csdn.net/robots.txt"
        response = self.session.get(robots_url, headers=self.headers)
        if response.status_code == 200:
            print("成功读取robots.txt")
            return response.text
        else:
            print("无法读取robots.txt")
            return ""

    def is_allowed_by_robots(self, url):
        # Roughly parse robots.txt and check whether the given URL is allowed.
        # Disallow rules refer to URL paths, so match them against the path only.
        disallowed_paths = []
        for line in self.robots_rules.splitlines():
            if line.startswith("Disallow:"):
                path = line.split(":", 1)[1].strip()
                if path:
                    disallowed_paths.append(path)

        url_path = urlparse(url).path
        for path in disallowed_paths:
            pattern = re.escape(path).replace(r"\*", ".*")
            if re.match(pattern, url_path):
                print(f"URL '{url}' is disallowed by robots.txt")
                return False
        return True

    def fetch_article(self, url):
        # Remove a leading @ from the URL if present
        if url.startswith('@'):
            url = url[1:]
        
        if not self.is_allowed_by_robots(url):
            print("根据robots.txt规则,无法访问此URL")
            return None

        match = re.match(r'https://blog\.csdn\.net/([^/]+)/article/details/(\d+)', url)
        if not match:
            print("无效的CSDN文章URL")
            return None
        
        author, article_id = match.groups()
        standard_url = f"https://blog.csdn.net/{author}/article/details/{article_id}"
        
        print(f"正在爬取文章: {standard_url}")
        
        time.sleep(random.uniform(1, 3))
        
        response = self.session.get(url=standard_url, headers=self.headers)
        response.encoding = "utf-8"
        if response.status_code == 200:
            print("成功获取页面")
            return self.parse_article(response.text)
        else:
            print(f"获取页面失败,状态码: {response.status_code}")
            return None

    def parse_article(self, html):
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find('h1', class_='title-article')
        if title:
            title = title.text.strip()
            print(f"找到文章标题: {title}")
        else:
            print("无法找到文章标题")
            return None, None, None

        content = soup.find('div', id="content_views")
        if content:
            print("找到文章内容")
            h = html2text.HTML2Text()
            h.ignore_links = False
            markdown_content = h.handle(str(content))
            return title, markdown_content, str(content)
        else:
            print("无法找到文章内容")
            return None, None, None

    def save_markdown(self, title, content):
        clean_title = re.sub(r'[\\/*?:"<>|]', "", title)
        filename = os.path.join(self.md_dir, f"{clean_title}.md")
        
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"# {title}\n\n{content}")
        
        return filename

    def save_html(self, title, content):
        clean_title = re.sub(r'[\\/*?:"<>|]', "", title)
        filename = os.path.join(self.html_dir, f"{clean_title}.html")
        
        # Wrap the article body in a minimal standalone HTML page so it renders
        # correctly in a browser and in wkhtmltopdf.
        html_template = (
            '<!DOCTYPE html>\n<html>\n<head>\n<meta charset="utf-8">\n'
            '<title>{title}</title>\n</head>\n<body>\n{article}\n</body>\n</html>'
        )
        full_html = html_template.format(title=clean_title, article=content)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(full_html)
        
        return filename

    def save_pdf(self, title, html_filename):
        clean_title = re.sub(r'[\\/*?:"<>|]', "", title)
        filename = os.path.join(self.pdf_dir, f"{clean_title}.pdf")
        
        config = pdfkit.configuration(wkhtmltopdf=r'D:\EdgeDownload\VSCode\wkhtmltopdf\bin\wkhtmltopdf.exe')
        pdfkit.from_file(html_filename, filename, configuration=config)
        
        return filename

def main(url):
    spider = CSDNSpider()
    result = spider.fetch_article(url)
    if result:
        title, markdown_content, html_content = result
        md_filename = spider.save_markdown(title, markdown_content)
        html_filename = spider.save_html(title, html_content)
        pdf_filename = spider.save_pdf(title, html_filename)
        print(f"文章 '{title}' 已成功爬取并保存。")
        print(f"Markdown文件: {md_filename}")
        print(f"HTML文件: {html_filename}")
        print(f"PDF文件: {pdf_filename}")
    else:
        print("无法获取文章内容。请检查URL是否正确或者文章是否可访问。")

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    
    article_url = input("Enter the CSDN article URL: ")
    main(article_url)

Things you need to change:

  • Output directories (a more portable sketch follows after this list)
self.md_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN爬取\MD'
self.pdf_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN爬取\PDF'
self.html_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN爬取\HTML'
  • HTML-to-PDF conversion: point this at your own wkhtmltopdf executable (see the sketch after this list)
config = pdfkit.configuration(wkhtmltopdf=r'D:\EdgeDownload\VSCode\wkhtmltopdf\bin\wkhtmltopdf.exe')
  • User agent (changing it is optional)
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
    # ... 其他 user agent ...
]
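
If you would rather not hard-code an absolute Windows path, here is a minimal sketch of replacement lines for __init__, assuming the output should live in a csdn_output folder next to the script (the folder name is only an example):

from pathlib import Path  # add to the imports at the top

# Keep all output under a folder next to the script instead of a fixed absolute path.
base_dir = Path(__file__).resolve().parent / "csdn_output"
self.md_dir = str(base_dir / "MD")
self.pdf_dir = str(base_dir / "PDF")
self.html_dir = str(base_dir / "HTML")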
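
For the wkhtmltopdf path, a small sketch that first looks for the executable on PATH and only falls back to an explicit location (adjust the fallback to wherever you installed it):

import shutil  # add to the imports at the top

# Prefer a wkhtmltopdf found on PATH; otherwise fall back to an explicit install path.
wkhtmltopdf_path = shutil.which("wkhtmltopdf") or r'D:\EdgeDownload\VSCode\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)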

See a user-agent library for reference (a sketch follows below).
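
One option, assuming the third-party fake-useragent package is installed (pip install fake-useragent), is to generate the value on the fly instead of maintaining USER_AGENT_LIST by hand:

from fake_useragent import UserAgent

ua = UserAgent()
# In __init__, this can stand in for random.choice(USER_AGENT_LIST):
user_agent = ua.random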

Notes:

  • This only fetches articles that are already accessible and saves them in other formats. Fetch paid articles? I wish I could too :grinning_face:
  • Errors when running it straight away are usually due to missing libraries/dependencies; paste the error into an AI and it will tell you the install command (the imports sit at the top of the script, and a likely command is sketched below)
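
Judging by the imports at the top of the script, the Python dependencies can most likely be installed with the command below (wkhtmltopdf itself is a standalone binary installed separately, not a pip package):

pip install requests beautifulsoup4 html2text pdfkit lxml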