Playwright

[Playwright] 웹페이지 크롤링 및 JSON 데이터 만들기

DingCoDing 2023. 1. 7. 22:04
반응형

https://blog.apify.com/how-to-scrape-the-web-with-playwright-ece1ced75f73/

 

import { chromium } from "playwright";

// Launch Chromium with a visible window (headless mode switched off).
const launchOptions = { headless: false };
const browser = await chromium.launch(launchOptions);

// Create the browser context with the bypassCSP option enabled.
const contextOptions = { bypassCSP: true };
const context = await browser.newContext(contextOptions);

// Open a tab and navigate to the GitHub "javascript" topic listing.
const page = await context.newPage();
await page.goto("https://github.com/topics/javascript");

// Click the "Load more" button nine times in sequence; each click is awaited
// before the next one is issued.
const LOAD_MORE_CLICKS = 9;
for (let clicked = 0; clicked < LOAD_MORE_CLICKS; clicked += 1) {
    await page.click('text=Load more');
}

// Scrape every repository card, report the count, then keep the window around
// for inspection. The finally block guarantees the browser is closed even when
// scraping throws, so a failed run does not leave a headed browser behind.
try {
    // The callback is executed inside the page, so it must be self-contained:
    // every helper it needs is defined within it.
    const repos = await page.$$eval('article.border', (repoCards) => {
        // Trimmed visible text of an element; a missing element is passed through as-is.
        const toText = (element) => element && element.innerText.trim();
        // "12,345" -> 12345; null when the source text is missing.
        const parseNumber = (text) => (text == null ? null : Number(text.replace(/,/g, '')));

        return repoCards.map((card) => {
            // The first two links in the card heading are read as owner and repository.
            const [user, repo] = card.querySelectorAll('h3 a');
            // A card without a star counter yields null instead of throwing and
            // aborting the extraction of every other card.
            const stars = card
                .querySelector('#repo-stars-counter-star')
                ?.getAttribute('title') ?? null;
            const description = card.querySelector('div.px-3 > p');
            const topics = card.querySelectorAll('a.topic-tag');

            return {
                user: toText(user),
                repo: toText(repo),
                // Guarded like toText(repo): a card without a repo link gets a null url.
                url: repo?.href ?? null,
                stars: parseNumber(stars),
                description: toText(description),
                topics: Array.from(topics).map((t) => toText(t)),
            };
        });
    });

    console.log(`We extracted ${repos.length} repositories.`);
    // console.dir(repos);

    // Debugging aids: pause the script, then wait another 10 seconds before closing.
    await page.pause();
    await page.waitForTimeout(10000);
} finally {
    await browser.close();
}

 

Crawlee 활용

import { Configuration, PlaywrightCrawler } from "crawlee";

// Turn headless mode off through the Crawlee configuration.
Configuration.set('headless', false);

// Crawler whose single request handler logs the page title, loads more
// repository cards, extracts them and prints the result.
const crawler = new PlaywrightCrawler({
    requestHandler: async ({ page, infiniteScroll }) => {
        const title = await page.title();
        console.log(title);

        // Keep scrolling / pressing "Load more" until at least 100 cards are present.
        await infiniteScroll({
            buttonSelector: 'text=Load more',

            stopScrollCallback: async () => {
                const repos = await page.$$('article.border');
                return repos.length >= 100;
            },
        });

        // The callback is executed inside the page, so it must be self-contained:
        // every helper it needs is defined within it.
        const repos = await page.$$eval('article.border', (repoCards) => {
            // Trimmed visible text of an element; a missing element is passed through as-is.
            const toText = (element) => element && element.innerText.trim();
            // "12,345" -> 12345; null when the source text is missing.
            const parseNumber = (text) => (text == null ? null : Number(text.replace(/,/g, '')));

            return repoCards.map((card) => {
                // The first two links in the card heading are read as owner and repository.
                const [user, repo] = card.querySelectorAll('h3 a');
                // A card without a star counter yields null instead of throwing and
                // aborting the extraction of every other card.
                const stars = card
                    .querySelector('#repo-stars-counter-star')
                    ?.getAttribute('title') ?? null;
                const description = card.querySelector('div.px-3 > p');
                const topics = card.querySelectorAll('a.topic-tag');

                return {
                    user: toText(user),
                    repo: toText(repo),
                    // Guarded like toText(repo): a card without a repo link gets a null url.
                    url: repo?.href ?? null,
                    stars: parseNumber(stars),
                    description: toText(description),
                    topics: Array.from(topics).map((t) => toText(t)),
                };
            });
        });
        console.log('Repository count:', repos.length);
        console.dir(repos);

        // Wait 10 seconds before the handler returns.
        await page.waitForTimeout(10000);
    },
});

await crawler.run(['https://github.com/topics/javascript']);

 

 

router를 활용하여 깔끔하게 코드 작성

// crawlee.js
import { Dataset, PlaywrightCrawler } from 'crawlee';
import { router } from './router.js';

// Every request is handled by the router imported from ./router.js.
const crawlerOptions = { requestHandler: router };
const crawler = new PlaywrightCrawler(crawlerOptions);

// Start from the GitHub "javascript" topic page; once the crawl has finished,
// export the dataset as CSV using the key 'repositories'.
const startUrls = ['https://github.com/topics/javascript'];
await crawler.run(startUrls);
await Dataset.exportToCSV('repositories');

 

 

 

// router.js
import { createPlaywrightRouter, Dataset, Request } from 'crawlee';

// Router instance exported for use as the crawler's request handler.
export const router = createPlaywrightRouter();

// Number of repository cards to have on the page before scrolling stops.
const REPO_COUNT = 20;

// Registered through router.use: prints the title of the page it receives.
router.use(async (context) => {
    const pageTitle = await context.page.title();
    console.log(pageTitle);
});

// Handler for requests labelled 'repository': reads the commit count from the
// repository page and stores it together with the data carried in
// request.userData.
router.addHandler('repository', async ({ page, request }) => {
    // Text of the list item that mentions "commits".
    const commitText = await page
        .getByRole('listitem')
        .filter({ hasText: 'commits' })
        .textContent();

    // textContent() may resolve to null and match() returns null when the text
    // holds no digits; fall back to an empty list instead of crashing on join().
    const numberStrings = commitText?.match(/\d+/g) ?? [];

    // Digit groups are concatenated so "1,234" becomes 1234. When no number
    // could be read, the record is still saved, with a null commit count.
    const commitCount = numberStrings.length > 0
        ? Number(numberStrings.join(''))
        : null;
    if (commitCount === null) {
        console.warn(`Could not read the commit count for ${request.url}`);
    }

    await Dataset.pushData({
        ...request.userData,
        commitCount,
    });
});

// Default handler: loads repository cards until REPO_COUNT are present,
// extracts one record per card, and enqueues each repository page under the
// 'repository' label with the extracted record attached as userData.
router.addDefaultHandler(async ({ page, infiniteScroll, crawler }) => {
    await infiniteScroll({
        buttonSelector: 'text=Load more',
        stopScrollCallback: async () => {
            const repos = await page.$$('article.border');
            return repos.length >= REPO_COUNT;
        },
    });

    // The callback is executed inside the page, so it must be self-contained:
    // every helper it needs is defined within it.
    const repos = await page.$$eval('article.border', (repoCards) => {
        // Trimmed visible text of an element; a missing element is passed through as-is.
        const toText = (element) => element && element.innerText.trim();
        // "12,345" -> 12345; null when the source text is missing.
        const parseNumber = (text) => (text == null ? null : Number(text.replace(/,/g, '')));

        return repoCards.map((card) => {
            // The first two links in the card heading are read as owner and repository.
            const [user, repo] = card.querySelectorAll('h3 a');
            // A card without a star counter yields null instead of throwing and
            // aborting the extraction of every other card.
            const stars = card
                .querySelector('#repo-stars-counter-star')
                ?.getAttribute('title') ?? null;
            const description = card.querySelector('div.px-3 > p');
            const topics = card.querySelectorAll('a.topic-tag');

            return {
                user: toText(user),
                repo: toText(repo),
                // Guarded like toText(repo): a card without a repo link gets a null url.
                url: repo?.href ?? null,
                stars: parseNumber(stars),
                description: toText(description),
                topics: Array.from(topics).map((t) => toText(t)),
            };
        });
    });

    console.log('Repository count:', repos.length);

    // Cards without a URL cannot be turned into requests, so they are skipped.
    const requests = repos
        .filter((repo) => repo.url)
        .map((repo) => new Request({
            url: repo.url,
            label: 'repository',
            userData: repo,
        }));

    await crawler.addRequests(requests);
});
반응형