cheerio 爬虫示例

cheerio.load

load 加载 html 字符串

const { data: html } = await axios({
  method: "get",
  maxBodyLength: Infinity,
  url: pageURL,
  headers: {
    host: "acggw.me",
  },
})
const $ = cheerio.load(html)

获取 html 内特定标签内容

const results = []
$(".list-content").each(async (_, element) => {
  const el = $(element)

  const data = {
    href: "",
    dimensions: "",
  }

  const a = el.find("h2 a")
  data.href = a.attr("href")
  data.title = a.html().trim()

  data.dimensions = el.find(".list-footer a").html()

  if (data.href && !crawledLinks.has(data.href)) {
    // 记录已爬取的链接
    results.push(data)
    crawledLinks.add(data.href)
  }
})

案例

import axios from "axios";
import * as cheerio from "cheerio";
import fs from "fs";
import path from "path";

// 配置
const baseURL = "https://www.xxxx.com/page/"; // 分页 URL 模板
const outputFile = path.resolve(process.cwd(), "paged_data.json"); // 数据存储文件
const progressFile = path.resolve(process.cwd(), "progress.json"); // 进度存储文件
const maxPages = 300; // 最大页数限制（根据需求调整）

// 加载已保存的数据
const loadData = (filePath, defaultValue = []) => {
  if (fs.existsSync(filePath)) {
    return JSON.parse(fs.readFileSync(filePath, "utf-8"));
  }
  return defaultValue;
};

// 保存数据到文件
const saveData = (filePath, data) => {
  fs.writeFileSync(filePath, JSON.stringify(data, null, 2), "utf-8");
};

const sleep = (timer = 100) =>
  new Promise((res) => {
    setTimeout(() => {
      res(1000);
    }, timer);
  });

// 抓取单页内容
const crawlPage = async (pageNumber, crawledLinks) => {
  const pageURL = `${baseURL}${pageNumber}`;
  console.log(`正在获取: ${pageURL}`);

  try {
    const { data: html } = await axios({
      method: "get",
      maxBodyLength: Infinity,
      url: pageURL,
      headers: {
        host: "xxx.com",
      },
    });
    const $ = cheerio.load(html);

    const results = [];
    $(".list-content").each(async (_, element) => {
      const el = $(element);

      const data = {
        href: "",
        dimensions: "",
      };

      const a = el.find("h2 a");
      data.href = a.attr("href");
      data.title = a.html().trim();

      data.dimensions = el.find(".list-footer a").html();

      if (data.href && !crawledLinks.has(data.href)) {
        // 记录已爬取的链接
        results.push(data);
        crawledLinks.add(data.href);
      }
    });

    try {
      for (let data of results) {
        // await sleep();
        console.log("开始获取" + data.title);
        const { data: html } = await axios.request({
          method: "get",
          url: data.href,
          headers: {
            host: "xxx.com",
          },
        });
        const $ = cheerio.load(html);

        data.content = $(".panel-body").html().trim();

        data.tags = [];
        $(".post-tags a").each((_, el) => {
          data.tags.push($(el).text().trim());
        });

        $('[href*="屏幕截图.jpg"]').remove();
      }
    } catch (e) {
      console.log("err");
    }

    return results;
  } catch (error) {
    console.error(`错误链接 ${pageURL}:` /* error.message */);
    return [];
  }
};

// 主程序
const startCrawling = async () => {
  // 加载爬取进度和已爬取数据
  const crawledData = loadData(outputFile);
  const crawledLinks = new Set(crawledData.map(({ href }) => href));
  const progress = loadData(progressFile, { currentPage: 1 });

  let { currentPage } = progress;

  for (; currentPage <= maxPages; currentPage++) {
    // 抓取当前页内容
    const pageResults = await crawlPage(currentPage, crawledLinks);

    // 如果当前页无有效数据，提前停止爬取
    if (pageResults.length === 0) {
      console.log(`没有找到数据 ==> 第 ${currentPage} 页`);
      break;
    }

    // 保存新数据
    crawledData.push(...pageResults);
    saveData(outputFile, crawledData);

    // 更新进度并保存
    saveData(progressFile, { currentPage: currentPage + 1 });

    console.log(`第 ${currentPage} 页已保存:`, pageResults.length, "项");
  }

  console.log("爬取结束");
};

// 启动爬虫
startCrawling();

cheerio 爬虫示例 ​

cheerio.load ​

获取 html 内特定标签内容 ​

案例 ​

cheerio 爬虫示例

cheerio.load

获取 html 内特定标签内容

案例