I think I have found the simplest solution that still fits my purpose, compared with the existing code written to scrape Project Euler:
# -*- coding: utf-8 -*-
import scrapy
from eulerscraper.items import Problem
from scrapy.loader import ItemLoader
class EulerSpider(scrapy.Spider):
    """Spider that walks the Project Euler archive and scrapes every problem.

    Starting from the archive page, it follows each problem link found in
    ``table#problems_table`` and schedules the remaining archive pages, whose
    problem links are followed in the same way.
    """

    name = "euler"
    allowed_domains = ['projecteuler.net']
    start_urls = ["https://projecteuler.net/archives"]

    def parse(self, response):
        """Handle the first archive page.

        Yields one request per problem linked from this page, then one
        request per additional archive page (pages 2..last).
        """
        page_labels = response.css("div.pagination a[href]::text").extract()
        # The text of the last pagination link is taken as the highest page
        # number; when no pagination links are present only this page exists.
        last_page = int(page_labels[-1]) if page_labels else 1

        # The problem links on the first page are handled exactly like the
        # ones on every later page.
        yield from self.parse_next(response)

        for page in range(2, last_page + 1):
            next_page = "https://projecteuler.net/archives;page=" + str(page)
            yield response.follow(next_page, self.parse_next)

    def parse_next(self, response):
        """Yield a request for every problem linked from an archive page."""
        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

    def parse_problems(self, response):
        """Build a ``Problem`` item from a single problem page.

        The title is read from ``h2``, the id from ``#problem_info`` and the
        content from ``.problem_content``.
        """
        loader = ItemLoader(item=Problem(), response=response)
        loader.add_css("title", "h2")
        loader.add_css("id", "#problem_info")
        loader.add_css("content", ".problem_content")
        yield loader.load_item()
From the start page (archives) I follow every single link to a problem, scraping the data that I need with parse_problems.
Then I launch the scraper on the other pages of the site, applying the same procedure to every list of links.
The Item definition, with its input and output processors, is also very clean:
import re
import scrapy
from scrapy.loader.processors import MapCompose, Compose
from w3lib.html import remove_tags
def extract_first_number(text):
    """Return the first run of decimal digits found in ``text`` as an int.

    Raises:
        ValueError: if ``text`` contains no digits.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', text)
    if match is None:
        raise ValueError("no number found in %r" % (text,))
    return int(match.group(0))
def array_to_value(element):
    """Collapse a sequence to a single value by returning its first entry."""
    first = element[0]
    return first
class Problem(scrapy.Item):
    """Scraped Project Euler problem with ``id``, ``title`` and ``content``."""

    # Each extracted value has its tags removed and is converted with
    # extract_first_number; the collected list is then reduced to its first
    # entry by array_to_value.
    id = scrapy.Field(input_processor=MapCompose(remove_tags, extract_first_number),
                      output_processor=Compose(array_to_value))

    # Each extracted value has its tags removed; no output processor is set.
    title = scrapy.Field(
        input_processor=MapCompose(remove_tags),
    )

    # Stored as extracted, with no processors attached.
    content = scrapy.Field()
I launch this with the command scrapy crawl euler -o euler.json
and it outputs an array of unordered JSON objects, each one corresponding to a single problem. This is fine for me because I’m going to process it with JavaScript, although I think solving the ordering problem in Scrapy can be very simple.
EDIT: in fact it is simple, using this pipeline:
import json
class JsonWriterPipeline(object):
    """Pipeline that collects every item and writes them to ``euler.json``.

    Items are buffered while the spider runs and written as a single JSON
    array, ordered by their ``id`` field, when the spider closes.
    """

    def open_spider(self, spider):
        """Reset the item buffer and open the output file."""
        self.list_items = []
        self.file = open('euler.json', 'w')

    def close_spider(self, spider):
        """Write the buffered items as a JSON array sorted by ``id``."""
        try:
            # Sorting, rather than placing each item at index id - 1, keeps
            # the output valid when ids are missing or not contiguous.
            ordered = sorted(self.list_items, key=lambda item: int(item['id']))
            # Joining puts a comma only between elements; a comma after the
            # last element would make the file invalid JSON.
            body = ",\n".join(json.dumps(dict(item)) for item in ordered)
            self.file.write("[\n" + body + "\n]\n")
        finally:
            # Close the file even if serialization fails.
            self.file.close()

    def process_item(self, item, spider):
        """Buffer ``item`` for the final write and pass it on unchanged."""
        self.list_items.append(item)
        return item
though the best solution may be to create a custom exporter:
from scrapy.exporters import JsonItemExporter
from scrapy.utils.python import to_bytes
class OrderedJsonItemExporter(JsonItemExporter):
    """JSON exporter that buffers items and writes them sorted by ``id``.

    Items are collected in memory while exporting and only serialized in
    ``finish_exporting``, once all of them are known.
    """

    def __init__(self, file, **kwargs):
        # Forward the keyword options to JsonItemExporter's constructor so
        # that they are applied instead of being silently dropped.
        super().__init__(file, **kwargs)
        self.list_items = []

    def export_item(self, item):
        """Buffer ``item``; nothing is written until ``finish_exporting``."""
        self.list_items.append(item)

    def finish_exporting(self):
        """Write all buffered items in ascending ``id`` order and close the array."""
        # Sorting, rather than placing each item at index id - 1, copes with
        # missing or non-contiguous ids, which would otherwise raise
        # IndexError or leave None entries that cannot be serialized.
        for item in sorted(self.list_items, key=lambda entry: int(entry['id'])):
            # The parent writes the separator and the encoded item.
            super().export_item(item)
        # The parent writes the closing bracket.
        super().finish_exporting()
Then configure it in the project settings so that it is used for the json feed format:
# Maps the 'json' feed format to the custom ordered exporter.
FEED_EXPORTERS = {'json': 'eulerscraper.exporters.OrderedJsonItemExporter'}
solved Scraping Project Euler site with scrapy [closed]