solarenpv.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. import time
  2. import scrapy
  3. from electric.items import ElectricItem
  4. # 光伏产业网
  5. class SolapvSpider(scrapy.Spider):
  6. name = 'solarenpv'
  7. allowed_domains = ['solarenpv.com']
  8. start_urls = [
  9. 'http://www.solarenpv.com/',
  10. ]
  11. custom_settings = {
  12. 'DOWNLOAD_DELAY': 10,
  13. }
  14. def start_requests(self):
  15. for url in self.start_urls:
  16. yield scrapy.Request(url=url, callback=self.parse_classes, errback=self.errback_httpbin, dont_filter=True)
  17. def parse_classes(self, response):
  18. for url in response.css('.m_r .ibox_head a::attr(href)').getall():
  19. yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
  20. def parse_pages(self, response):
  21. for url in response.css(".catlist_li a::attr(href)").getall():
  22. yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
  23. try:
  24. url = response.css('.pages a:nth-of-type(10)::attr(href)').get()
  25. yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
  26. except:
  27. self.logger.info('Last page')
  28. def parse_item(self, response):
  29. url = response.url
  30. title = response.css('title::text').get()
  31. source = '光伏产业网'
  32. description = response.css('meta[name=description]::attr(content)').get()
  33. content = ''.join(''.join(response.xpath('//div[@id="article"]//text()').getall()).split())
  34. date = time.time()
  35. column = '光伏'
  36. yield ElectricItem(url=url, title=title, source=source,
  37. description=description, content=content,
  38. date=date, column=column)
  39. def errback_httpbin(self, failure):
  40. self.logger.error(repr(failure))