Playwright
[Playwright] Crawling a web page and creating JSON data
DingCoDing
2023. 1. 7. 22:04
Reference: https://blog.apify.com/how-to-scrape-the-web-with-playwright-ece1ced75f73/
import { chromium } from "playwright";

// Launch a visible (headed) browser so the crawl can be watched.
const browser = await chromium.launch({
  headless: false,
});
const context = await browser.newContext({
  bypassCSP: true,
});
const page = await context.newPage();

await page.goto("https://github.com/topics/javascript");

// Click "Load more" nine times to load additional repository cards.
for (let i = 1; i < 10; i++) {
  await page.click('text=Load more');
}

// Extract a plain object from each repository card on the page.
const repos = await page.$$eval('article.border', (repoCards) => {
  return repoCards.map((card) => {
    const [user, repo] = card.querySelectorAll('h3 a');
    const stars = card.querySelector('#repo-stars-counter-star')
      .getAttribute('title');
    const description = card.querySelector('div.px-3 > p');
    const topics = card.querySelectorAll('a.topic-tag');

    const toText = (element) => element && element.innerText.trim();
    const parseNumber = (text) => Number(text.replace(/,/g, ''));

    return {
      user: toText(user),
      repo: toText(repo),
      url: repo.href,
      stars: parseNumber(stars),
      description: toText(description),
      topics: Array.from(topics).map((t) => toText(t)),
    };
  });
});

console.log(`We extracted ${repos.length} repositories.`);
// console.dir(repos);

// Pause for inspection, wait a moment, then close the browser.
await page.pause();
await page.waitForTimeout(10000);
await browser.close();
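The script above only prints how many repositories were extracted. Since the point of this post is to end up with JSON data, a minimal follow-up sketch (using Node's built-in fs/promises module; the repositories.json path is just an example) could write the array to disk before closing the browser:

import { writeFile } from "node:fs/promises";

// Serialize the extracted array as pretty-printed JSON (example output path).
await writeFile("repositories.json", JSON.stringify(repos, null, 2), "utf-8");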
Using Crawlee
import { Configuration, PlaywrightCrawler } from "crawlee";

// Run the underlying Playwright browser in headed mode.
Configuration.set('headless', false);

const crawler = new PlaywrightCrawler({
  requestHandler: async ({ page, infiniteScroll }) => {
    const title = await page.title();
    console.log(title);

    // Keep clicking "Load more" until at least 100 repository cards are loaded.
    await infiniteScroll({
      buttonSelector: 'text=Load more',
      stopScrollCallback: async () => {
        const repos = await page.$$('article.border');
        return repos.length >= 100;
      },
    });

    // Same card extraction as the plain Playwright version above.
    const repos = await page.$$eval('article.border', (repoCards) => {
      return repoCards.map((card) => {
        const [user, repo] = card.querySelectorAll('h3 a');
        const stars = card.querySelector('#repo-stars-counter-star')
          .getAttribute('title');
        const description = card.querySelector('div.px-3 > p');
        const topics = card.querySelectorAll('a.topic-tag');

        const toText = (element) => element && element.innerText.trim();
        const parseNumber = (text) => Number(text.replace(/,/g, ''));

        return {
          user: toText(user),
          repo: toText(repo),
          url: repo.href,
          stars: parseNumber(stars),
          description: toText(description),
          topics: Array.from(topics).map((t) => toText(t)),
        };
      });
    });

    console.log('Repository count:', repos.length);
    console.dir(repos);

    await page.waitForTimeout(10000);
  },
});

await crawler.run(['https://github.com/topics/javascript']);
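Instead of dumping the array with console.dir, the same requestHandler could hand the results to Crawlee's Dataset, which is what the router version below does per repository. A minimal sketch, assuming the Dataset API from crawlee:

import { Dataset } from 'crawlee';

// Inside the requestHandler, after `repos` has been extracted:
await Dataset.pushData(repos); // each repository object becomes one dataset item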
Writing cleaner code with a router
// crawlee.js
import { Dataset, PlaywrightCrawler } from 'crawlee';
import { router } from './router.js';

// All request handling is delegated to the router defined in router.js.
const crawler = new PlaywrightCrawler({
  requestHandler: router,
});

await crawler.run(['https://github.com/topics/javascript']);

// Export every item the handlers pushed into the dataset as a CSV file.
await Dataset.exportToCSV('repositories');
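Since the post's goal is JSON output, the export presumably works the same way with exportToJSON in place of exportToCSV (an assumption based on Crawlee's Dataset API, not shown in the original tutorial):

// Assumption: Crawlee's Dataset also offers a JSON export, mirroring exportToCSV above.
await Dataset.exportToJSON('repositories');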
// router.js
import { createPlaywrightRouter, Dataset, Request } from 'crawlee';

export const router = createPlaywrightRouter();

const REPO_COUNT = 20;

// Middleware that runs before every handler: log the page title.
router.use(async ({ page }) => {
  const title = await page.title();
  console.log(title);
});

// Handler for individual repository pages: read the commit count and store the item.
router.addHandler('repository', async ({ page, request }) => {
  const commitText = await page
    .getByRole('listitem')
    .filter({ hasText: 'commits' })
    .textContent();

  // "1,234 commits" -> 1234
  const numberStrings = commitText.match(/\d+/g);
  const commitCount = Number(numberStrings.join(''));

  await Dataset.pushData({
    ...request.userData,
    commitCount,
  });
});

// Default handler for the topic listing page: collect repository cards
// and enqueue each repository page under the 'repository' label.
router.addDefaultHandler(async ({ page, infiniteScroll, crawler }) => {
  await infiniteScroll({
    buttonSelector: 'text=Load more',
    stopScrollCallback: async () => {
      const repos = await page.$$('article.border');
      return repos.length >= REPO_COUNT;
    },
  });

  const repos = await page.$$eval('article.border', (repoCards) => {
    return repoCards.map((card) => {
      const [user, repo] = card.querySelectorAll('h3 a');
      const stars = card.querySelector('#repo-stars-counter-star')
        .getAttribute('title');
      const description = card.querySelector('div.px-3 > p');
      const topics = card.querySelectorAll('a.topic-tag');

      const toText = (element) => element && element.innerText.trim();
      const parseNumber = (text) => Number(text.replace(/,/g, ''));

      return {
        user: toText(user),
        repo: toText(repo),
        url: repo.href,
        stars: parseNumber(stars),
        description: toText(description),
        topics: Array.from(topics).map((t) => toText(t)),
      };
    });
  });

  console.log('Repository count:', repos.length);

  // Turn each extracted card into a request for its repository page,
  // carrying the already-extracted fields along as userData.
  const requests = repos.map((repo) => new Request({
    url: repo.url,
    label: 'repository',
    userData: repo,
  }));

  await crawler.addRequests(requests);
});
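After a run, each stored item combines the listing-page fields with the commit count added by the 'repository' handler, roughly in this shape (field names come from the handlers above; the values are placeholders for illustration):

{
  user: 'octocat',          // hypothetical values
  repo: 'hello-world',
  url: 'https://github.com/octocat/hello-world',
  stars: 1234,
  description: 'Example repository description',
  topics: ['javascript'],
  commitCount: 56           // added by the 'repository' handler
}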