# zzsolar.py
import time
import scrapy
from electric.items import ElectricItem
## Zhengzhou International Solar Photovoltaic Exhibition (郑州国际太阳能光伏展览会)
  5. class ZzsolarSpider(scrapy.Spider):
  6. name = 'zzsolar'
  7. allowed_domains = ['zzsolar.com.cn']
  8. start_urls = [
  9. ('https://zzsolar.com.cn/a/news/hyxw/list_13_{}.html', 122),
  10. ]
  11. custom_settings = {
  12. 'DOWNLOAD_DELAY': 10,
  13. }
  14. def start_requests(self):
  15. for baseurl, maxpage in self.start_urls:
  16. url = baseurl.format(1)
  17. yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, meta={"url": baseurl, "nextpage": 2, 'maxpage': maxpage}, dont_filter=True)
  18. def parse_pages(self, response):
  19. for item in response.xpath('//div[@class="cont thumblist1"]/ul/li'):
  20. url = item.css('a.fl').attrib['href']
  21. self.logger.info('parse item: {}'.format(url))
  22. yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
  23. if response.meta['nextpage'] <= response.meta['maxpage']:
  24. nextpage = response.meta['url'].format(response.meta['nextpage'])
  25. self.logger.info('next page: {}'.format(nextpage))
  26. yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, meta={'url': response.meta['url'], "nextpage": response.meta['nextpage'] + 1, 'maxpage': response.meta['maxpage']})
  27. def parse_item(self, response):
  28. url = response.url
  29. title = response.css('h1::text').get()
  30. source = '郑州国际太阳能光伏展览会'
  31. description = ''
  32. content = response.xpath('//div[@class="showContxt"]//text()').getall()
  33. date = time.time()
  34. column = '光伏'
  35. yield ElectricItem(url=url, title=title, source=source,
  36. description=description, content=content,
  37. date=date, column=column)
  38. def errback_httpbin(self, failure):
  39. self.logger.error(repr(failure))