1234567891011121314151617181920212223242526272829303132333435363738394041 |
- import time
- import scrapy
- from electric.items import ElectricItem
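- # Wind energy industry network  -- NOTE(review): this label looks stale; the spider below is restricted to paper.people.com.cn. Confirm and update.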
class CweeaSpider(scrapy.Spider):
    """Spider that walks newspaper index pages and yields one item per article.

    The crawl has three hops:

    1. every entry of ``start_urls`` is fetched and handed to ``parse_blocks``;
    2. ``parse_blocks`` follows each ``a[id="pageLink"]`` href to ``parse_pages``;
    3. ``parse_pages`` follows each ``.news-list a`` href to ``parse_item``,
       which builds the ``ElectricItem``.

    Every request is created with ``errback_httpbin`` as its errback, so
    failures are logged by this spider.
    """

    name = 'ppcc'
    allowed_domains = ['paper.people.com.cn']
    start_urls = [
        'http://paper.people.com.cn/rmrb/index.html',
        'http://paper.people.com.cn/zgnyb/paperindex.htm',
    ]
    # Per-spider override of Scrapy's DOWNLOAD_DELAY setting.
    custom_settings = {'DOWNLOAD_DELAY': 20}

    def start_requests(self):
        """Yield the initial request for each start URL.

        Overridden so that the seed requests carry the spider's errback in
        addition to the ``parse_blocks`` callback.
        """
        yield from (
            scrapy.Request(
                url=seed,
                callback=self.parse_blocks,
                errback=self.errback_httpbin,
            )
            for seed in self.start_urls
        )

    def _follow_links(self, response, selector, callback):
        """Yield a follow-up request for every href extracted by *selector*.

        ``selector`` is a CSS expression that must already end in
        ``::attr(href)``; each extracted value is resolved against
        ``response`` via ``response.follow``.
        """
        hrefs = response.css(selector).getall()
        for href in hrefs:
            yield response.follow(
                url=href,
                callback=callback,
                errback=self.errback_httpbin,
            )

    def parse_blocks(self, response):
        """Follow each ``a[id="pageLink"]`` link of an index page."""
        yield from self._follow_links(
            response, 'a[id="pageLink"]::attr(href)', self.parse_pages
        )

    def parse_pages(self, response):
        """Follow each link found inside ``.news-list`` to its article."""
        yield from self._follow_links(
            response, '.news-list a::attr(href)', self.parse_item
        )

    def parse_item(self, response):
        """Build and yield the ``ElectricItem`` for one article page.

        ``title`` is the text of the ``<title>`` element (``None`` when the
        selector matches nothing). ``content`` is the text of all
        ``#ozoom p`` nodes with every whitespace character removed.
        ``date`` is the epoch timestamp taken at parse time, not a value
        read from the page. ``description`` and ``column`` are left empty.
        """
        page_title = response.css('title::text').get()
        paragraphs = response.css('#ozoom p::text').getall()
        # Join, split on whitespace, re-join: strips all whitespace,
        # including whitespace inside paragraphs.
        body_text = ''.join(''.join(paragraphs).split())
        yield ElectricItem(
            url=response.url,
            title=page_title,
            source='人民日报',
            description='',
            content=body_text,
            date=time.time(),
            column='',
        )

    def errback_httpbin(self, failure):
        """Log the ``repr`` of a failed request at ERROR level."""
        details = repr(failure)
        self.logger.error(details)
|