返回 Skill 列表
extension
分类: 数据与分析 · 无需 API Key

Browser Automation

使用 Puppeteer 进行网页抓取和浏览器自动化。适用于需要从网站提取数据、爬取页面或抓取由 JavaScript 渲染的动态内容的场景。

person作者: fasjdashubclawhub

Browser Automation

Web scraping and browser automation powered by Puppeteer.

When to Use

USE this skill when:

  • "Scrape data from [URL]"
  • "Extract all [products/listings/items] from [website]"
  • "Take a screenshot of [page]"
  • "Crawl [website] and collect [info]"
  • "Fill and submit [form]"
  • Any JavaScript-rendered content that won't load without a browser

DON'T use this skill when:

  • Simple static pages → use web_fetch instead
  • APIs available → fetch API directly
  • Rate-limited sites → respect robots.txt

Quick Start

# Install Puppeteer (required by every script below)
npm install puppeteer

# Basic scraping: prints the text content of the page <body> as JSON
node scripts/scrape.js https://example.com

Core Patterns

Launch Browser

const puppeteer = require('puppeteer');

/**
 * Opens `url` in a headless browser, waits for the page to settle, and
 * always closes the browser afterwards.
 *
 * @param {string} url - Address of the page to load.
 * @returns {Promise<void>} Resolves once the browser has been closed.
 * @throws Propagates any error raised while opening the page or navigating;
 *   the browser is still closed in that case.
 */
async function scrape(url) {
  const browser = await puppeteer.launch({
    headless: 'new',
    // Disables the Chromium sandbox. NOTE(review): presumably needed for
    // root/container environments — confirm before using on untrusted pages.
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });

    // ... extract data ...
  } finally {
    // Release the browser even when navigation or extraction throws
    await browser.close();
  }
}

Extract Text Content

// Collect the trimmed text of every <h2> on the page
const titles = await page.$$eval('h2', (headings) =>
  Array.from(headings, (heading) => heading.textContent.trim())
);

// Read the trimmed text of the element matched by .price
const price = await page.$eval('.price', (node) => {
  return node.textContent.trim();
});

Extract HTML

// Serialize the markup inside the .product-list element
const html = await page.$eval('.product-list', (container) => {
  return container.innerHTML;
});

Extract Attributes

// Build a { text, href } record per anchor; href is the attribute value as
// written in the markup (getAttribute), not a resolved URL
const links = await page.$$eval('a', (anchors) =>
  anchors.map((anchor) => {
    const text = anchor.textContent.trim();
    const href = anchor.getAttribute('href');
    return { text, href };
  })
);

Wait for Content

// Wait until .results is present, with a timeout of 10000 ms
await page.waitForSelector('.results', { timeout: 10000 });

// Navigate and wait according to the networkidle2 setting
await page.goto(url, { waitUntil: 'networkidle2' });

// Wait until the page contains more than 10 .item elements
await page.waitForFunction(() => {
  const items = document.querySelectorAll('.item');
  return items.length > 10;
});

Pagination

/**
 * Visits `baseUrl` with the `page` query parameter set to 1, 2, ... and
 * collects the trimmed text of every `.item` element on each page.
 *
 * Stops early at the first page that yields no `.item` elements.
 *
 * @param {string} baseUrl - Absolute URL of the listing; an existing query
 *   string is preserved and its `page` parameter (if any) is overwritten.
 * @param {number} [maxPages=5] - Upper bound on the number of pages visited.
 * @returns {Promise<string[]>} Item texts in visiting order.
 * @throws Propagates navigation/extraction errors and the TypeError raised by
 *   `new URL` for an invalid `baseUrl`; the browser is closed in every case.
 */
async function scrapeWithPagination(baseUrl, maxPages = 5) {
  const browser = await puppeteer.launch({ headless: 'new' });
  const results = [];

  try {
    const page = await browser.newPage();

    for (let i = 1; i <= maxPages; i++) {
      // Build the URL structurally so an existing "?query" or "#fragment"
      // in baseUrl does not produce a malformed address
      const pageUrl = new URL(baseUrl);
      pageUrl.searchParams.set('page', String(i));
      await page.goto(pageUrl.href, { waitUntil: 'networkidle2' });

      const items = await page.$$eval('.item', (els) =>
        els.map((el) => el.textContent.trim())
      );

      // An empty page is treated as the end of the listing
      if (items.length === 0) break;
      results.push(...items);
    }
  } finally {
    // Release the browser even when a page fails to load
    await browser.close();
  }

  return results;
}

Screenshots

// Full page screenshot
await page.screenshot({ path: 'screenshot.png', fullPage: true });

// Element screenshot
const element = await page.$('.chart');
if (element === null) {
  // page.$ resolves to null when nothing matches; fail with a clear message
  // instead of a TypeError on element.screenshot
  throw new Error('No element matches selector ".chart"');
}
await element.screenshot({ path: 'chart.png' });

Block Resources (Speed Up)

// Intercept every request so heavy, non-essential resources can be dropped
await page.setRequestInterception(true);
page.on('request', (request) => {
  const type = request.resourceType();
  const isBlocked = type === 'image' || type === 'stylesheet' || type === 'font';

  if (isBlocked) {
    request.abort();
    return;
  }
  request.continue();
});

Scripts

scrape.js — Basic Scraping

// Usage: node scripts/scrape.js <url> [selector]
//
// Prints the trimmed text content of every element matching [selector]
// (default: 'body') to stdout as a JSON array.
const puppeteer = require('puppeteer');

const url = process.argv[2];
const selector = process.argv[3] || 'body';

if (!url) {
  console.error('Usage: node scrape.js <url> [selector]');
  process.exit(1);
}

(async () => {
  const browser = await puppeteer.launch({ headless: 'new' });

  try {
    const page = await browser.newPage();

    await page.goto(url, { waitUntil: 'networkidle2' });

    const content = await page.$$eval(selector, els =>
      els.map(el => el.textContent.trim())
    );

    console.log(JSON.stringify(content, null, 2));
  } finally {
    // Always release the browser, even when navigation or extraction throws
    await browser.close();
  }
})().catch((err) => {
  // Report the failure explicitly and signal it through the exit code
  console.error(err);
  process.exitCode = 1;
});

screenshot.js — Page Screenshots

// Usage: node scripts/screenshot.js <url> [output.png]
//
// Saves a full-page screenshot of <url> to [output.png]
// (default: 'screenshot.png').
const puppeteer = require('puppeteer');

const url = process.argv[2];
const output = process.argv[3] || 'screenshot.png';

if (!url) {
  console.error('Usage: node screenshot.js <url> [output.png]');
  process.exit(1);
}

(async () => {
  const browser = await puppeteer.launch({ headless: 'new' });

  try {
    const page = await browser.newPage();

    await page.goto(url, { waitUntil: 'networkidle2' });
    await page.screenshot({ path: output, fullPage: true });

    console.log(`Screenshot saved to ${output}`);
  } finally {
    // Always release the browser, even when navigation or capture throws
    await browser.close();
  }
})().catch((err) => {
  // Report the failure explicitly and signal it through the exit code
  console.error(err);
  process.exitCode = 1;
});

crawl.js — Multi-Page Crawler

// Usage: node scripts/crawl.js <url> <selector> [maxPages]
//
// Visits <url> with the `page` query parameter set to 1..maxPages
// (default: 10), collects the trimmed text of every element matching
// <selector>, and prints the combined list to stdout as a JSON array.
// Stops early at the first page with no matches.
const puppeteer = require('puppeteer');

const url = process.argv[2];
const selector = process.argv[3];
// Explicit radix so the argument is always read as decimal; a missing,
// non-numeric, or zero value falls back to 10
const maxPages = Number.parseInt(process.argv[4], 10) || 10;

if (!url || !selector) {
  console.error('Usage: node crawl.js <url> <selector> [maxPages]');
  process.exit(1);
}

(async () => {
  const browser = await puppeteer.launch({ headless: 'new' });

  try {
    const page = await browser.newPage();

    const allData = [];

    for (let i = 1; i <= maxPages; i++) {
      // Build the URL structurally so existing query parameters and
      // fragments are preserved and `page` is never duplicated
      const target = new URL(url);
      target.searchParams.set('page', String(i));
      const pageUrl = target.href;

      // Progress goes to stderr so stdout carries only the JSON result
      console.error(`Crawling: ${pageUrl}`);

      await page.goto(pageUrl, { waitUntil: 'networkidle2' });

      const data = await page.$$eval(selector, els =>
        els.map(el => el.textContent.trim())
      );

      // An empty page is treated as the end of the listing
      if (data.length === 0) break;
      allData.push(...data);
    }

    console.log(JSON.stringify(allData, null, 2));
  } finally {
    // Always release the browser, even when a page fails to load
    await browser.close();
  }
})().catch((err) => {
  // Report the failure explicitly and signal it through the exit code
  console.error(err);
  process.exitCode = 1;
});

Common Selectors

| Target | Selector |
|--------|----------|
| All links | a |
| All images | img |
| Headings | h1, h2, h3 |
| Lists | ul li, ol li |
| Tables | table tr |
| Cards/Items | .item, .card, .product |
| Prices | .price, [class*="price"] |
| Descriptions | .description, .summary |

Tips

  • Check robots.txt before scraping: curl example.com/robots.txt
  • Add delays between requests to avoid bans: await new Promise(r => setTimeout(r, 2000))
  • Use networkidle2 for SPAs (Single Page Apps)
  • Debug with screenshots when selectors fail
  • Set user agent for sites that block bots:
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
    

Reference

For the detailed Puppeteer API reference, see puppeteer/docs/api.md.