Justmetrically Editorial
Insights for buyers of recurring public-web data workflows
December 26, 2024

python
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class MySpider(CrawlSpider):
    """Crawl example.com and emit one record per page reached via links.

    Every link extracted from a fetched page is followed, and each
    response reached that way is handed to :meth:`parse_item`.
    """

    name = 'myspider'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com']

    # A LinkExtractor with no arguments applies no allow/deny filter of
    # its own; follow=True keeps extracting links from the pages it finds.
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Build the scraped record for a single crawled page.

        Args:
            response: The response object passed in by the crawl rule.

        Returns:
            A dict with the keys ``url``, ``title`` and ``content``.
            ``title`` and ``content`` are ``None`` when the corresponding
            XPath matches nothing.
        """
        # A bare scrapy.Item() declares no fields, so assigning
        # item['url'] raises KeyError. Scrapy accepts plain dicts as
        # items, so return one instead of an undeclared Item.
        return {
            'url': response.url,
            # NOTE(review): '//title' selects the element itself, so the
            # value includes the <title> markup; use '//title/text()' if
            # only the text is wanted - confirm intent.
            'title': response.xpath('//title').get(),
            # NOTE(review): 'content' is not a standard HTML element;
            # presumably a placeholder selector - verify against the
            # target pages.
            'content': response.xpath('//content').get(),
        }