nengyuanjie.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. import time
  2. import scrapy
  3. from electric.items import ElectricItem
  4. # 能源界
  class NyjieSpider(scrapy.Spider):
      """Spider for the article series listings on nengyuanjie.net.

      Each configured series listing is walked page by page. Every article
      link found on a listing page is followed, and one ``ElectricItem`` is
      emitted per article page.
      """

      name = 'nengyuanjie'
      allowed_domains = ['nengyuanjie.net']
      # NOTE: unlike a stock Scrapy spider, the entries here are
      # (listing URL, last page number to request, column label) tuples,
      # not plain URL strings. start_requests() unpacks them.
      start_urls = [
          ('http://www.nengyuanjie.net/series/chuneng.html', 188, '储能'),
          ('http://www.nengyuanjie.net/series/hedian.html', 180, '核电'),
          ('http://www.nengyuanjie.net/series/fengdian.html', 188, '风电'),
          ('http://www.nengyuanjie.net/series/guangfu.html', 257, '光伏'),
          ('http://www.nengyuanjie.net/series/nengyuanhulianwang.html', 43, '计算机'),
          ('http://www.nengyuanjie.net/series/qingneng.html', 90, '氢能'),
      ]
      custom_settings = {
          'DOWNLOAD_DELAY': 10,
          'DOWNLOAD_TIMEOUT': 1800,
      }

      def start_requests(self):
          """Yield the first listing request for every configured series.

          The pagination state (base URL, next page number, last page number)
          and the column label travel with the request in ``meta``.
          """
          for url, maxpage, column in self.start_urls:
              yield scrapy.Request(
                  url=url,
                  callback=self.parse_pages,
                  errback=self.errback_httpbin,
                  meta={"url": url, "nextpage": 2, 'maxpage': maxpage, 'column': column},
                  dont_filter=True,
              )

      def parse_pages(self, response):
          """Follow every article on a listing page, then request the next page.

          Reads ``url``, ``nextpage``, ``maxpage`` and ``column`` from
          ``response.meta``. Yields one request per article link (handled by
          ``parse_item``) and, while ``nextpage <= maxpage``, one request for
          the following listing page (handled by this method again).
          """
          meta = response.meta
          column = meta['column']

          for entry in response.xpath('//div[@class="lists"]/div[@class="li"]'):
              # Use ::attr(href) + get() instead of .attrib['href']: the latter
              # raises KeyError for an entry without a link, which would abort
              # this generator and silently drop the remaining articles AND
              # the next-page request below.
              url = entry.css('a::attr(href)').get()
              if not url:
                  self.logger.warning(
                      'skip list entry without link on {}'.format(response.url))
                  continue
              self.logger.info('parse item: {}'.format(url))
              yield response.follow(
                  url=url,
                  callback=self.parse_item,
                  errback=self.errback_httpbin,
                  meta={'column': column},
              )

          if meta['nextpage'] <= meta['maxpage']:
              # Listing pages are addressed as "<base url>?page=<n>".
              nextpage = meta['url'] + '?page={}'.format(meta['nextpage'])
              self.logger.info('next page: {}'.format(nextpage))
              yield response.follow(
                  url=nextpage,
                  callback=self.parse_pages,
                  errback=self.errback_httpbin,
                  meta={
                      'url': meta['url'],
                      "nextpage": meta['nextpage'] + 1,
                      'maxpage': meta['maxpage'],
                      'column': column,
                  },
                  dont_filter=True,
              )

      def parse_item(self, response):
          """Build an ``ElectricItem`` from a single article page.

          ``title`` and ``description`` are ``None`` when the page lacks the
          corresponding ``<title>`` / ``<meta name=description>`` element.
          """
          # Concatenate every text node under the content div and strip ALL
          # whitespace (spaces, newlines, tabs) from the result.
          raw_text = ''.join(response.xpath('//div[@class="content"]//text()').getall())
          content = ''.join(raw_text.split())

          yield ElectricItem(
              url=response.url,
              title=response.css('title::text').get(),
              source='能源界',
              description=response.css('meta[name=description]::attr(content)').get(),
              content=content,
              # Crawl time as a Unix timestamp; this is not parsed from the page.
              date=time.time(),
              column=response.meta['column'],
          )

      def errback_httpbin(self, failure):
          """Log a failed request; the failure is not retried or re-raised here."""
          self.logger.error(repr(failure))