Examples
WDS MS SQL CLR Functions Cheat Sheet
By default, the JobRestartMode is Continue, so the first runs would take some time to grab real data from the playground, but the rest will work on the cache, which is much faster. If you don’t like this, change the JobRestartMode
Iterate over all pages and scrape data
DECLARE @jobConfig wds.JobConfig = 'JobName: CrawlAllProducts; Server: wds://localhost:2807; StartUrls: http://playground.svc';
DECLARE @pages TABLE (Task wds.DownloadTask);
-- Gathering categories' first pages
INSERT INTO @pages (Task)
SELECT nav.Task
FROM wds.Start(@jobConfig) root
OUTER APPLY wds.Crawl(root.Task, 'css: ul.nav a:not([href="/"])') nav
-- Gathering the other pages of categories
WHILE @@ROWCOUNT > 0
BEGIN
INSERT INTO @pages (Task)
SELECT newPages.Task
FROM @pages curPages
CROSS APPLY wds.Crawl(curPages.Task, 'css: ul.pagination li:not(.disabled) a') newPages
WHERE NOT EXISTS (SELECT NULL FROM @pages p WHERE p.Task = newPages.Task)
END
-- Iterating through pages, visiting product pages, and scraping data from them
SELECT
products.Task.Url ProductUrl,
wds.ScrapeFirst(products.Task, 'css: h1', DEFAULT) AS ProductName,
wds.ScrapeFirst(products.Task, 'css: .price span', DEFAULT) AS ProductPrice
FROM @pages pages
CROSS APPLY wds.Crawl(pages.Task, 'css: .table a') products
Iterate over all pages and scrape data using the batch approach (faster)
DECLARE @jobConfig wds.JobConfig = 'JobName: CrawlAllProductsBatch; Server: wds://localhost:2807; StartUrls: http://playground.svc';
DECLARE @pages TABLE (Task wds.DownloadTask);
-- Gathering categories' first pages
INSERT INTO @pages
(Task)
SELECT nav.Task
FROM wds.Start(@jobConfig) root
OUTER APPLY wds.Crawl(root.Task, 'css: ul.nav a:not([href="/"])') nav
-- Gathering the other pages of categories
WHILE @@ROWCOUNT > 0
BEGIN
INSERT INTO @pages
(Task)
SELECT newPages.Task
FROM @pages curPages
CROSS APPLY wds.Crawl(curPages.Task, 'css: ul.pagination li:not(.disabled) a') newPages
WHERE NOT EXISTS (SELECT NULL FROM @pages p WHERE p.Task = newPages.Task)
END
-- Iterating through pages, visiting product pages, and scraping data from them
SELECT
products.Task.Url ProductUrl,
product.ScrapeResult.GetFirst('ProductName') AS ProductName,
product.ScrapeResult.GetFirst('ProductPrice') AS ProductPrice
FROM @pages pages
CROSS APPLY wds.Crawl(pages.Task, 'css: .table a') products
CROSS APPLY (
SELECT wds.ScrapeMultiple(products.Task)
.AddScrapeParams('ProductName', 'css: h1', '')
.AddScrapeParams('ProductPrice', 'css: .price span', '') AS ScrapeResult
) product