Scrape Paged

Minimal playground version: v1.0.0

Iterate over all pages and scrape data

DECLARE @jobConfig wds.JobConfig = 'JobName: CrawlAllProducts; Server: wds://localhost:2807; StartUrls: http://playground.svc';
DECLARE @pages TABLE (Task wds.DownloadTask);

-- Gathering categories' first pages
INSERT INTO @pages (Task)
SELECT nav.Task
FROM wds.Start(@jobConfig) root
    OUTER APPLY wds.Crawl(root.Task, 'css: ul.nav a:not([href="/"])', null) nav

-- Gathering the other pages of categories
WHILE @@ROWCOUNT > 0
BEGIN
    INSERT INTO @pages (Task)
    SELECT newPages.Task
    FROM @pages curPages
        CROSS APPLY wds.Crawl(curPages.Task, 'css: ul.pagination li:not(.disabled) a', null) newPages
    WHERE NOT EXISTS (SELECT NULL FROM @pages p WHERE p.Task = newPages.Task)
END

-- Iterating through pages, visiting product pages, and scraping data from them
SELECT 
    products.Task.Url ProductUrl,
    wds.ScrapeFirst(products.Task, 'css: h1', null) AS ProductName,
    wds.ScrapeFirst(products.Task, 'css: .price span', null) AS ProductPrice
FROM @pages pages
    CROSS APPLY wds.Crawl(pages.Task, 'css: .table a', null) products

Iterate over all pages and scrape data using the ScrapeMultiple function. This approach is a bit faster because fewer requests to the WDS API Server are required.

DECLARE @jobConfig wds.JobConfig = 'JobName: CrawlAllProductsBatch; Server: wds://localhost:2807; StartUrls: http://playground.svc';
DECLARE @pages TABLE (Task wds.DownloadTask);

-- Gathering categories' first pages
INSERT INTO @pages
    (Task)
SELECT nav.Task
FROM wds.Start(@jobConfig) root
    OUTER APPLY wds.Crawl(root.Task, 'css: ul.nav a:not([href="/"])', null) nav

-- Gathering the other pages of categories
WHILE @@ROWCOUNT > 0
BEGIN
    INSERT INTO @pages
        (Task)
    SELECT newPages.Task
    FROM @pages curPages
        CROSS APPLY wds.Crawl(curPages.Task, 'css: ul.pagination li:not(.disabled) a', null) newPages
    WHERE NOT EXISTS (SELECT NULL FROM @pages p WHERE p.Task = newPages.Task)
END

-- Iterating through pages, visiting product pages, and scraping data from them
SELECT
    products.Task.Url ProductUrl,
    product.ScrapeResult.GetFirst('ProductName') AS ProductName,
    product.ScrapeResult.GetFirst('ProductPrice') AS ProductPrice
FROM @pages pages
    CROSS APPLY wds.Crawl(pages.Task, 'css: .table a', null) products
    CROSS APPLY (
        SELECT wds.ScrapeMultiple(products.Task)
                .AddScrapeParams('ProductName', 'css: h1', null)
                .AddScrapeParams('ProductPrice', 'css: .price span', null) AS ScrapeResult
    ) product

Please rotate your device to landscape mode

This documentation is specifically designed with a wider layout to provide a better reading experience for code examples, tables, and diagrams.
Rotating your device horizontally ensures you can see everything clearly without excessive scrolling or resizing.

Return to Web Data Source Home