gxepa.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. import time
  2. import scrapy
  3. from electric.items import ElectricItem
  4. # 广西电力行业协会
  class GxepaSpider(scrapy.Spider):
      """Crawl news articles from gxepa.org.cn (Guangxi Electric Power Industry Association).

      Each entry of ``start_urls`` is a ``(listing_url, maxpage)`` pair. The
      spider requests the listing URL, follows every article link found in
      the ``list-news`` block, and then requests ``listing_url?pageNo=N`` for
      N = 2 .. maxpage, one page at a time. Every article page produces one
      ``ElectricItem``.
      """

      name = 'gxepa'
      allowed_domains = ['gxepa.org.cn']
      # (listing URL, last page number to request) pairs; consumed by
      # start_requests(), so the non-standard tuple shape is intentional.
      start_urls = [
          ('http://www.gxepa.org.cn/news_rdjj', 102),
          ('http://www.gxepa.org.cn/news_xhyw', 31),
          ('http://www.gxepa.org.cn/news_hyyw', 81),
      ]
      custom_settings = {
          'DOWNLOAD_DELAY': 10,
          'DOWNLOAD_TIMEOUT': 1800,
      }

      def start_requests(self):
          """Yield the first listing-page request for every configured section.

          The pagination state travels in ``meta``: the section base URL, the
          number of the next page to fetch (2, because the base URL is used as
          the first page) and the last page number.
          """
          for url, maxpage in self.start_urls:
              yield scrapy.Request(
                  url=url,
                  callback=self.parse_pages,
                  errback=self.errback_httpbin,
                  meta={'url': url, 'nextpage': 2, 'maxpage': maxpage},
                  dont_filter=True,
              )

      def parse_pages(self, response):
          """Follow every article link on a listing page, then the next page.

          Yields one request per article (handled by ``parse_item``) and, while
          ``meta['nextpage'] <= meta['maxpage']``, one request for the next
          listing page.
          """
          for entry in response.xpath('//div[@class="list-news"]/ul/li'):
              # ``.attrib['href']`` raises KeyError for an entry without an
              # anchor/href, which would abort this generator and drop both
              # the remaining articles and the next-page request. Read the
              # attribute defensively and skip such entries instead.
              url = entry.css('a::attr(href)').get()
              if not url:
                  self.logger.warning('list entry without link on %s', response.url)
                  continue
              self.logger.info('parse item: %s', url)
              yield response.follow(url=url, callback=self.parse_item,
                                    errback=self.errback_httpbin)

          base_url = response.meta['url']
          page_no = response.meta['nextpage']
          maxpage = response.meta['maxpage']
          if page_no <= maxpage:
              nextpage = base_url + '?pageNo={}'.format(page_no)
              self.logger.info('next page: %s', nextpage)
              yield response.follow(
                  url=nextpage,
                  callback=self.parse_pages,
                  errback=self.errback_httpbin,
                  meta={'url': base_url, 'nextpage': page_no + 1, 'maxpage': maxpage},
                  dont_filter=True,
              )

      def parse_item(self, response):
          """Build an ``ElectricItem`` from a single article page.

          The content is the text under ``div#ArtText`` with all whitespace
          removed; ``description`` and ``column`` are left empty.
          """
          url = response.url
          title = response.css('title::text').get()
          source = '广西电力行业协会'
          description = ''
          # Concatenate all text nodes, then drop every whitespace run.
          content = ''.join(''.join(response.xpath('//div[@id="ArtText"]//text()').getall()).split())
          # NOTE(review): this is the crawl timestamp, not a date taken from
          # the article page -- confirm that is what consumers expect.
          date = time.time()
          column = ""
          self.logger.info('title: %s', title)
          yield ElectricItem(url=url, title=title, source=source,
                             description=description, content=content,
                             date=date, column=column)

      def errback_httpbin(self, failure):
          """Log a failed request; the failure is not retried or re-raised here."""
          self.logger.error(repr(failure))