# ppcc.py
import time

import scrapy

from electric.items import ElectricItem

# Spider for People's Daily newspaper pages (paper.people.com.cn).
# NOTE(review): the original comment said "风能产业网" (Wind Energy Industry
# Network) and the class is named CweeaSpider — both look copied from a
# wind-energy spider template; this spider actually crawls People's Daily.
  5. class CweeaSpider(scrapy.Spider):
  6. name = 'ppcc'
  7. allowed_domains = ['paper.people.com.cn']
  8. start_urls = ['http://paper.people.com.cn/rmrb/index.html',
  9. 'http://paper.people.com.cn/zgnyb/paperindex.htm']
  10. custom_settings = {
  11. 'DOWNLOAD_DELAY': 20,
  12. }
  13. def start_requests(self):
  14. for url in self.start_urls:
  15. yield scrapy.Request(url=url, callback=self.parse_blocks, errback=self.errback_httpbin)
  16. def parse_blocks(self, response):
  17. for url in response.css('a[id="pageLink"]::attr(href)').getall():
  18. yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
  19. def parse_pages(self, response):
  20. for url in response.css('.news-list a::attr(href)').getall():
  21. yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
  22. def parse_item(self, response):
  23. url = response.url
  24. title = response.css('title::text').get()
  25. source = '人民日报'
  26. description = ''
  27. content = ''.join(''.join(response.css('#ozoom p::text').getall()).split())
  28. date = time.time()
  29. column = ''
  30. yield ElectricItem(url=url, title=title, source=source,
  31. description=description, content=content,
  32. date=date, column=column)
  33. def errback_httpbin(self, failure):
  34. self.logger.error(repr(failure))