Scrape Paged
Minimal playground version: v1.0.0
Iterate over all pages and scrape data
DECLARE @jobConfig wds.JobConfig = 'JobName: CrawlAllProducts; Server: wds://localhost:2807; StartUrls: http://playground.svc';
DECLARE @pages TABLE (Task wds.DownloadTask);
-- Gathering categories' first pages
INSERT INTO @pages (Task)
SELECT nav.Task
FROM wds.Start(@jobConfig) root
OUTER APPLY wds.Crawl(root.Task, 'css: ul.nav a:not([href="/"])', null) nav
-- Gathering the other pages of categories
WHILE @@ROWCOUNT > 0
BEGIN
INSERT INTO @pages (Task)
SELECT newPages.Task
FROM @pages curPages
CROSS APPLY wds.Crawl(curPages.Task, 'css: ul.pagination li:not(.disabled) a', null) newPages
WHERE NOT EXISTS (SELECT NULL FROM @pages p WHERE p.Task = newPages.Task)
END
-- Iterating through pages, visiting product pages, and scraping data from them
SELECT
products.Task.Url ProductUrl,
wds.ScrapeFirst(products.Task, 'css: h1', null) AS ProductName,
wds.ScrapeFirst(products.Task, 'css: .price span', null) AS ProductPrice
FROM @pages pages
CROSS APPLY wds.Crawl(pages.Task, 'css: .table a', null) products
Iterate over all pages and scrape data using the ScrapeMultiple function. This approach is a bit faster because fewer requests to the WDS API Server are required.
DECLARE @jobConfig wds.JobConfig = 'JobName: CrawlAllProductsBatch; Server: wds://localhost:2807; StartUrls: http://playground.svc';
DECLARE @pages TABLE (Task wds.DownloadTask);
-- Gathering categories' first pages
INSERT INTO @pages
(Task)
SELECT nav.Task
FROM wds.Start(@jobConfig) root
OUTER APPLY wds.Crawl(root.Task, 'css: ul.nav a:not([href="/"])', null) nav
-- Gathering the other pages of categories
WHILE @@ROWCOUNT > 0
BEGIN
INSERT INTO @pages
(Task)
SELECT newPages.Task
FROM @pages curPages
CROSS APPLY wds.Crawl(curPages.Task, 'css: ul.pagination li:not(.disabled) a', null) newPages
WHERE NOT EXISTS (SELECT NULL FROM @pages p WHERE p.Task = newPages.Task)
END
-- Iterating through pages, visiting product pages, and scraping data from them
SELECT
products.Task.Url ProductUrl,
product.ScrapeResult.GetFirst('ProductName') AS ProductName,
product.ScrapeResult.GetFirst('ProductPrice') AS ProductPrice
FROM @pages pages
CROSS APPLY wds.Crawl(pages.Task, 'css: .table a', null) products
CROSS APPLY (
SELECT wds.ScrapeMultiple(products.Task)
.AddScrapeParams('ProductName', 'css: h1', null)
.AddScrapeParams('ProductPrice', 'css: .price span', null) AS ScrapeResult
) product